├── .gitignore ├── LICENSE ├── README.md ├── README_CN.md ├── docs ├── README.md ├── data │ ├── load_data.md │ └── split_data.md └── experiment.md ├── example ├── m_attrec_demo.py ├── m_bpr_demo.py ├── m_caser_demo.py ├── m_dssm_demo.py ├── m_fissa_demo.py ├── m_gru4rec_demo.py ├── m_mind_demo.py ├── m_ncf_demo.py ├── m_poprec_demo.py ├── m_sasrec_demo.py ├── m_youtubednn_demo.py ├── r_afm_demo.py ├── r_dcn_demo.py ├── r_deep_crossing_demo.py ├── r_deepfm_demo.py ├── r_fm_demo.py ├── r_nfm_demo.py ├── r_pnn_demo.py ├── r_wdl_demo.py ├── r_xdeepfm_demo.py └── train_small_criteo_demo.py ├── reclearn ├── __init__.py ├── data │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-38.pyc │ ├── datasets │ │ ├── __init__.py │ │ ├── beauty.py │ │ ├── criteo.py │ │ ├── games.py │ │ ├── movielens.py │ │ └── steam.py │ ├── feature_column.py │ └── utils.py ├── evaluator │ ├── __init__.py │ ├── evaluator.py │ └── metrics.py ├── layers │ ├── __init__.py │ ├── core.py │ └── utils.py └── models │ ├── __init__.py │ ├── losses.py │ ├── matching │ ├── __init__.py │ ├── attrec.py │ ├── bpr.py │ ├── caser.py │ ├── dssm.py │ ├── fissa.py │ ├── gru4rec.py │ ├── mind.py │ ├── ncf.py │ ├── poprec.py │ ├── sasrec.py │ └── youtubednn.py │ └── ranking │ ├── __init__.py │ ├── afm.py │ ├── dcn.py │ ├── deep_crossing.py │ ├── deepfm.py │ ├── ffm.py │ ├── fm.py │ ├── nfm.py │ ├── pnn.py │ ├── wdl.py │ └── xdeepfm.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | /example/data/* 3 | .idea/* 4 | /dist/* 5 | /reclearn.egg-info/* 6 | /build/* 7 | */__pycache__ 8 | **/__pycache__ 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Ziyao Geng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Reclearn Documentation Guide 2 | 3 | This documentation is intended to help users use and understand the Reclearn project. 4 | 5 | -------------------------------------------------------------------------------- /docs/data/load_data.md: -------------------------------------------------------------------------------- 1 | ## Loading Data 2 | 3 | Loading data is the final step of the data pipeline. Here we specify which features to read, the number of negative samples (for Top-K recommendation), and any feature engineering. 4 | 5 | 6 | 7 | ## Movielens and Similar Datasets 8 | 9 | Corresponding to the `split_data` method, `data/datasets/movielens.py` provides `load_data`: 10 | 11 | ```python 12 | def load_data(file_path, neg_num, max_item_num): 13 | """load movielens dataset. 14 | Args: 15 | :param file_path: A string. The file path. 16 | :param neg_num: A scalar(int). The negative num of one sample. 17 | :param max_item_num: A scalar(int). The max index of item. 18 | :return: A dict. data. 19 | """ 20 | data = np.array(pd.read_csv(file_path, delimiter='\t')) 21 | np.random.shuffle(data) 22 | neg_items = [] 23 | for i in tqdm(range(len(data))): 24 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 25 | neg_items.append(neg_item) 26 | return {'user': data[:, 0].astype(int), 'pos_item': data[:, 1].astype(int), 'neg_item': np.array(neg_items)} 27 | ``` 28 | 29 | The method takes a dataset path (train/val/test), the number of negative samples, and the maximum item index (which bounds the range of negative sampling); the snippet relies on `numpy as np`, `pandas as pd`, `random` and `tqdm`, which are imported at the top of `movielens.py`. It does not perform any feature engineering and only uses the two features `user id` and `item id`. -------------------------------------------------------------------------------- /docs/experiment.md: -------------------------------------------------------------------------------- 1 | # Experiment Guide 2 | 3 | This document describes in detail how to set up concise experiments; working cases can be found in [example](../example). Reclearn experiments fall into two major modules, **Top-K recommendation** and **CTR prediction**, so the discussion is organized around these two parts. 4 | 5 | 6 | 7 | ## Top-K Recommendation 8 | 9 | Top-K recommendation means that, for a single user, the model recommends the K items from the item pool that the user is most likely to interact with; the model is then evaluated with offline ranking metrics such as HR@K, NDCG@K, MRR@K and MAP@K. In industry these models are also called recall (matching) models and are applied in the retrieval stage. The **most distinctive property** of a recall model is that the user-side user embedding and the item embedding almost never interact through feature crosses, apart from the dot product at the final stage. 10 | 11 | The Reclearn project contains both academic models (SASRec, AttRec, etc.) and industrial models (DSSM, YoutubeDNN), but the final experiment pipeline needs to be unified, so this guide defines a commonly used procedure for dataset splitting, negative sampling and offline metric evaluation. Users who have their own experimental approach can also formalize it themselves. 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /example/m_attrec_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 20, 2021 3 | Updated on Apr 23, 2022 4 | train AttRec demo 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import os 8 | from absl import flags, app 9 | from time import time 10 | from tensorflow.keras.optimizers import Adam 11 | 12 | from reclearn.models.matching import AttRec 13 | from reclearn.data.datasets import movielens as ml 14 | from reclearn.evaluator import eval_pos_neg 15 | 16 | FLAGS = flags.FLAGS 17 | 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 19 | # os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | # Setting training parameters 22 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 23 | flags.DEFINE_string("train_path", "data/ml-1m/ml_seq_train.txt", "train path. 
If set to None, the program will split the dataset.") 24 | flags.DEFINE_string("val_path", "data/ml-1m/ml_seq_val.txt", "val path.") 25 | flags.DEFINE_string("test_path", "data/ml-1m/ml_seq_test.txt", "test path.") 26 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_seq_meta.txt", "meta path.") 27 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 28 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 29 | flags.DEFINE_string("mode", "inner", "inner or dist.") 30 | flags.DEFINE_float("w", 0.3, "The weight of short interest.") 31 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 32 | flags.DEFINE_string("loss_name", "hinge_loss", "Loss Name.") 33 | flags.DEFINE_float("gamma", 0.5, "If hinge_loss is selected as the loss function, you can specify the margin.") 34 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 35 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 36 | flags.DEFINE_integer("seq_len", 100, "The length of user's behavior sequence.") 37 | flags.DEFINE_integer("epochs", 20, "train steps.") 38 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 39 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 40 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 41 | 42 | 43 | def main(argv): 44 | # TODO: 1. Split Data 45 | if FLAGS.train_path == "None": 46 | train_path, val_path, test_path, meta_path = ml.split_seq_data(file_path=FLAGS.file_path) 47 | else: 48 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 49 | with open(meta_path) as f: 50 | max_user_num, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 51 | # TODO: 2. Load Sequence Data 52 | train_data = ml.load_seq_data(train_path, "train", FLAGS.seq_len, FLAGS.neg_num, max_item_num, contain_user=True) 53 | val_data = ml.load_seq_data(val_path, "val", FLAGS.seq_len, FLAGS.neg_num, max_item_num, contain_user=True) 54 | test_data = ml.load_seq_data(test_path, "test", FLAGS.seq_len, FLAGS.test_neg_num, max_item_num, contain_user=True) 55 | # TODO: 3. Set Model Hyper Parameters. 56 | model_params = { 57 | 'user_num': max_user_num + 1, 58 | 'item_num': max_item_num + 1, 59 | 'embed_dim': FLAGS.embed_dim, 60 | 'mode': FLAGS.mode, 61 | 'w': FLAGS.w, 62 | 'use_l2norm': FLAGS.use_l2norm, 63 | 'loss_name': FLAGS.loss_name, 64 | 'gamma': FLAGS.gamma, 65 | 'embed_reg': FLAGS.embed_reg 66 | } 67 | # TODO: 4. Build Model 68 | model = AttRec(**model_params) 69 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 70 | # TODO: 5. 
Fit Model 71 | for epoch in range(1, FLAGS.epochs + 1): 72 | t1 = time() 73 | model.fit( 74 | x=train_data, 75 | epochs=1, 76 | validation_data=val_data, 77 | batch_size=FLAGS.batch_size 78 | ) 79 | t2 = time() 80 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 81 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f' 82 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 83 | 84 | 85 | if __name__ == '__main__': 86 | app.run(main) -------------------------------------------------------------------------------- /example/m_bpr_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 19, 2021 3 | train BPR demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import os 7 | from absl import flags, app 8 | from time import time 9 | from tensorflow.keras.optimizers import Adam 10 | 11 | from reclearn.models.matching import BPR 12 | from reclearn.data.datasets import movielens as ml 13 | from reclearn.evaluator import eval_pos_neg 14 | 15 | FLAGS = flags.FLAGS 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # os.environ['CUDA_VISIBLE_DEVICES'] = '6' 19 | 20 | # Setting training parameters 21 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 22 | flags.DEFINE_string("train_path", "data/ml-1m/ml_train.txt", "train path. If set to None, the program will split the dataset.") 23 | flags.DEFINE_string("val_path", "data/ml-1m/ml_val.txt", "val path.") 24 | flags.DEFINE_string("test_path", "data/ml-1m/ml_test.txt", "test path.") 25 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_meta.txt", "meta path.") 26 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 27 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 28 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 29 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 30 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 31 | flags.DEFINE_integer("epochs", 20, "train steps.") 32 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 33 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 34 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 35 | 36 | 37 | def main(argv): 38 | # TODO: 1. Split Data 39 | if FLAGS.train_path == "None": 40 | train_path, val_path, test_path, meta_path = ml.split_data(file_path=FLAGS.file_path) 41 | else: 42 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 43 | with open(meta_path) as f: 44 | max_user_num, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 45 | # TODO: 2. Load Data 46 | train_data = ml.load_data(train_path, FLAGS.neg_num, max_item_num) 47 | val_data = ml.load_data(val_path, FLAGS.neg_num, max_item_num) 48 | test_data = ml.load_data(test_path, FLAGS.test_neg_num, max_item_num) 49 | # TODO: 3. Set Model Hyper Parameters. 50 | model_params = { 51 | 'user_num': max_user_num + 1, 52 | 'item_num': max_item_num + 1, 53 | 'embed_dim': FLAGS.embed_dim, 54 | 'use_l2norm': FLAGS.use_l2norm, 55 | 'embed_reg': FLAGS.embed_reg 56 | } 57 | # TODO: 4. Build Model 58 | model = BPR(**model_params) 59 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 60 | # TODO: 5. 
Fit Model 61 | for epoch in range(1, FLAGS.epochs + 1): 62 | t1 = time() 63 | model.fit( 64 | x=train_data, 65 | epochs=1, 66 | validation_data=val_data, 67 | batch_size=FLAGS.batch_size 68 | ) 69 | t2 = time() 70 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 71 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f, ' 72 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 73 | 74 | 75 | if __name__ == '__main__': 76 | app.run(main) -------------------------------------------------------------------------------- /example/m_caser_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 20, 2021 3 | Updated on Apr 23, 2022 4 | train Caser demo 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import os 8 | from absl import flags, app 9 | from time import time 10 | from tensorflow.keras.optimizers import Adam 11 | 12 | from reclearn.models.matching import Caser 13 | from reclearn.data.datasets import movielens as ml 14 | from reclearn.evaluator import eval_pos_neg 15 | 16 | FLAGS = flags.FLAGS 17 | 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 19 | # os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | # Setting training parameters 22 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 23 | flags.DEFINE_string("train_path", "data/ml-1m/ml_seq_train.txt", "train path. If set to None, the program will split the dataset.") 24 | flags.DEFINE_string("val_path", "data/ml-1m/ml_seq_val.txt", "val path.") 25 | flags.DEFINE_string("test_path", "data/ml-1m/ml_seq_test.txt", "test path.") 26 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_seq_meta.txt", "meta path.") 27 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 28 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 29 | flags.DEFINE_integer("hor_n", 8, "The number of horizontal filters.") 30 | flags.DEFINE_integer("hor_h", 2, "Height of horizontal filters.") 31 | flags.DEFINE_integer("ver_n", 4, "The number of vertical filters.") 32 | flags.DEFINE_string("activation", "relu", "Activation Name.") 33 | flags.DEFINE_float("dnn_dropout", 0., "Float between 0 and 1. Dropout of user and item MLP layer.") 34 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 35 | flags.DEFINE_string("loss_name", "binary_cross_entropy_loss", "Loss Name.") 36 | flags.DEFINE_float("gamma", 0.5, "If hinge_loss is selected as the loss function, you can specify the margin.") 37 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 38 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 39 | flags.DEFINE_integer("seq_len", 100, "The length of user's behavior sequence.") 40 | flags.DEFINE_integer("epochs", 20, "train steps.") 41 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 42 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 43 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 44 | 45 | 46 | def main(argv): 47 | # TODO: 1. 
Split Data 48 | if FLAGS.train_path == "None": 49 | train_path, val_path, test_path, meta_path = ml.split_seq_data(file_path=FLAGS.file_path) 50 | else: 51 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 52 | with open(meta_path) as f: 53 | max_user_num, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 54 | # TODO: 2. Load Sequence Data 55 | train_data = ml.load_seq_data(train_path, "train", FLAGS.seq_len, FLAGS.neg_num, max_item_num, contain_user=True) 56 | val_data = ml.load_seq_data(val_path, "val", FLAGS.seq_len, FLAGS.neg_num, max_item_num, contain_user=True) 57 | test_data = ml.load_seq_data(test_path, "test", FLAGS.seq_len, FLAGS.test_neg_num, max_item_num, contain_user=True) 58 | # TODO: 3. Set Model Hyper Parameters. 59 | model_params = { 60 | 'user_num': max_user_num + 1, 61 | 'item_num': max_item_num + 1, 62 | 'embed_dim': FLAGS.embed_dim, 63 | 'seq_len': FLAGS.seq_len, 64 | 'hor_n': FLAGS.hor_n, 65 | 'hor_h': FLAGS.hor_h, 66 | 'ver_n': FLAGS.ver_n, 67 | 'activation': FLAGS.activation, 68 | 'dnn_dropout': FLAGS.dnn_dropout, 69 | 'use_l2norm': FLAGS.use_l2norm, 70 | 'loss_name': FLAGS.loss_name, 71 | 'gamma': FLAGS.gamma, 72 | 'embed_reg': FLAGS.embed_reg 73 | } 74 | # TODO: 4. Build Model 75 | model = Caser(**model_params) 76 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 77 | # TODO: 5. Fit Model 78 | for epoch in range(1, FLAGS.epochs + 1): 79 | t1 = time() 80 | model.fit( 81 | x=train_data, 82 | epochs=1, 83 | validation_data=val_data, 84 | batch_size=FLAGS.batch_size 85 | ) 86 | t2 = time() 87 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 88 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f' 89 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 90 | 91 | 92 | if __name__ == '__main__': 93 | app.run(main) -------------------------------------------------------------------------------- /example/m_dssm_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Apr 1, 2022 3 | train DSSM demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import os 7 | from absl import flags, app 8 | from time import time 9 | from tensorflow.keras.optimizers import Adam 10 | 11 | from reclearn.models.matching import DSSM 12 | from reclearn.data.datasets import movielens as ml 13 | from reclearn.evaluator import eval_pos_neg 14 | 15 | FLAGS = flags.FLAGS 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # os.environ['CUDA_VISIBLE_DEVICES'] = '0' 19 | 20 | # Setting training parameters 21 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 22 | flags.DEFINE_string("train_path", "data/ml-1m/ml_train.txt", "train path. 
If set to None, the program will split the dataset.") 23 | flags.DEFINE_string("val_path", "data/ml-1m/ml_val.txt", "val path.") 24 | flags.DEFINE_string("test_path", "data/ml-1m/ml_test.txt", "test path.") 25 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_meta.txt", "meta path.") 26 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 27 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 28 | flags.DEFINE_list("user_mlp", [128], "A list of user MLP hidden units.") 29 | flags.DEFINE_list("item_mlp", [128], "A list of item MLP hidden units") 30 | flags.DEFINE_string("activation", "relu", "Activation Name.") 31 | flags.DEFINE_float("dnn_dropout", 0., "Float between 0 and 1. Dropout of user and item MLP layer.") 32 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 33 | flags.DEFINE_string("loss_name", "binary_cross_entropy_loss", "Loss Name.") 34 | flags.DEFINE_float("gamma", 0.5, "If hinge_loss is selected as the loss function, you can specify the margin.") 35 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 36 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 37 | flags.DEFINE_integer("epochs", 20, "train steps.") 38 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 39 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 40 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 41 | 42 | 43 | def main(argv): 44 | # TODO: 1. Split Data 45 | if FLAGS.train_path == "None": 46 | train_path, val_path, test_path, meta_path = ml.split_data(file_path=FLAGS.file_path) 47 | else: 48 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 49 | with open(meta_path) as f: 50 | max_user_num, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 51 | # TODO: 2. Load Data 52 | train_data = ml.load_data(train_path, FLAGS.neg_num, max_item_num) 53 | val_data = ml.load_data(val_path, FLAGS.neg_num, max_item_num) 54 | test_data = ml.load_data(test_path, FLAGS.test_neg_num, max_item_num) 55 | # TODO: 3. Set Model Hyper Parameters. 56 | model_params = { 57 | 'user_num': max_user_num + 1, 58 | 'item_num': max_item_num + 1, 59 | 'embed_dim': FLAGS.embed_dim, 60 | 'user_mlp': FLAGS.user_mlp, 61 | 'item_mlp': FLAGS.item_mlp, 62 | 'activation': FLAGS.activation, 63 | 'dnn_dropout': FLAGS.dnn_dropout, 64 | 'use_l2norm': FLAGS.use_l2norm, 65 | 'loss_name': FLAGS.loss_name, 66 | 'gamma': FLAGS.gamma, 67 | 'embed_reg': FLAGS.embed_reg 68 | } 69 | # TODO: 4. Build Model 70 | model = DSSM(**model_params) 71 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 72 | # TODO: 5. 
Fit Model 73 | for epoch in range(1, FLAGS.epochs + 1): 74 | t1 = time() 75 | model.fit( 76 | x=train_data, 77 | epochs=1, 78 | validation_data=val_data, 79 | batch_size=FLAGS.batch_size 80 | ) 81 | t2 = time() 82 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 83 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f' 84 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 85 | 86 | 87 | if __name__ == '__main__': 88 | app.run(main) -------------------------------------------------------------------------------- /example/m_fissa_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 21, 2021 3 | Updated on Apr 23, 2022 4 | train FISSA demo 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import os 8 | from absl import flags, app 9 | from time import time 10 | from tensorflow.keras.optimizers import Adam 11 | 12 | from reclearn.models.matching import FISSA 13 | from reclearn.data.datasets import movielens as ml 14 | from reclearn.evaluator import eval_pos_neg 15 | 16 | FLAGS = flags.FLAGS 17 | 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | # Setting training parameters 22 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 23 | flags.DEFINE_string("train_path", "data/ml-1m/ml_seq_train.txt", "train path. If set to None, the program will split the dataset.") 24 | flags.DEFINE_string("val_path", "data/ml-1m/ml_seq_val.txt", "val path.") 25 | flags.DEFINE_string("test_path", "data/ml-1m/ml_seq_test.txt", "test path.") 26 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_seq_meta.txt", "meta path.") 27 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 28 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 29 | flags.DEFINE_integer("blocks", 2, "The Number of blocks.") 30 | flags.DEFINE_integer("num_heads", 2, "The Number of attention heads.") 31 | flags.DEFINE_integer("ffn_hidden_unit", 64, "Number of hidden unit in FFN.") 32 | flags.DEFINE_float("dnn_dropout", 0.2, "Float between 0 and 1. Dropout of user and item MLP layer.") 33 | flags.DEFINE_float("layer_norm_eps", 1e-6, "Small float added to variance to avoid dividing by zero.") 34 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 35 | flags.DEFINE_string("loss_name", "binary_cross_entropy_loss", "Loss Name.") 36 | flags.DEFINE_float("gamma", 0.5, "If hinge_loss is selected as the loss function, you can specify the margin.") 37 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 38 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 39 | flags.DEFINE_integer("seq_len", 100, "The length of user's behavior sequence.") 40 | flags.DEFINE_integer("epochs", 20, "train steps.") 41 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 42 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 43 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 44 | 45 | 46 | def main(argv): 47 | # TODO: 1. 
Split Data 48 | if FLAGS.train_path == "None": 49 | train_path, val_path, test_path, meta_path = ml.split_seq_data(file_path=FLAGS.file_path) 50 | else: 51 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 52 | with open(meta_path) as f: 53 | _, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 54 | # TODO: 2. Load Sequence Data 55 | train_data = ml.load_seq_data(train_path, "train", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 56 | val_data = ml.load_seq_data(val_path, "val", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 57 | test_data = ml.load_seq_data(test_path, "test", FLAGS.seq_len, FLAGS.test_neg_num, max_item_num) 58 | # TODO: 3. Set Model Hyper Parameters. 59 | model_params = { 60 | 'item_num': max_item_num + 1, 61 | 'embed_dim': FLAGS.embed_dim, 62 | 'seq_len': FLAGS.seq_len, 63 | 'blocks': FLAGS.blocks, 64 | 'num_heads': FLAGS.num_heads, 65 | 'ffn_hidden_unit': FLAGS.ffn_hidden_unit, 66 | 'dnn_dropout': FLAGS.dnn_dropout, 67 | 'use_l2norm': FLAGS.use_l2norm, 68 | 'loss_name': FLAGS.loss_name, 69 | 'gamma': FLAGS.gamma, 70 | 'embed_reg': FLAGS.embed_reg 71 | } 72 | # TODO: 4. Build Model 73 | model = FISSA(**model_params) 74 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 75 | # TODO: 5. Fit Model 76 | for epoch in range(1, FLAGS.epochs + 1): 77 | t1 = time() 78 | model.fit( 79 | x=train_data, 80 | epochs=1, 81 | validation_data=val_data, 82 | batch_size=FLAGS.batch_size 83 | ) 84 | t2 = time() 85 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 86 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f' 87 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 88 | 89 | 90 | if __name__ == '__main__': 91 | app.run(main) -------------------------------------------------------------------------------- /example/m_gru4rec_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 20, 2021 3 | Updated on Apr 23, 2022 4 | train GRU4Rec demo 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import os 8 | from absl import flags, app 9 | from time import time 10 | from tensorflow.keras.optimizers import Adam 11 | 12 | from reclearn.models.matching import GRU4Rec 13 | from reclearn.data.datasets import movielens as ml 14 | from reclearn.evaluator import eval_pos_neg 15 | 16 | FLAGS = flags.FLAGS 17 | 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 19 | # os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | # Setting training parameters 22 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 23 | flags.DEFINE_string("train_path", "data/ml-1m/ml_seq_train.txt", "train path. If set to None, the program will split the dataset.") 24 | flags.DEFINE_string("val_path", "data/ml-1m/ml_seq_val.txt", "val path.") 25 | flags.DEFINE_string("test_path", "data/ml-1m/ml_seq_test.txt", "test path.") 26 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_seq_meta.txt", "meta path.") 27 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 28 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 29 | flags.DEFINE_integer("gru_layers", 2, "The number of GRU Layers.") 30 | flags.DEFINE_integer("gru_unit", 128, "The unit of GRU Layer.") 31 | flags.DEFINE_string("gru_activation", "tanh", "Activation Name.") 32 | flags.DEFINE_float("dnn_dropout", 0., "Float between 0 and 1. 
Dropout of user and item MLP layer.") 33 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 34 | flags.DEFINE_string("loss_name", "bpr_loss", "Loss Name.") 35 | flags.DEFINE_float("gamma", 0.5, "If hinge_loss is selected as the loss function, you can specify the margin.") 36 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 37 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 38 | flags.DEFINE_integer("seq_len", 100, "The length of user's behavior sequence.") 39 | flags.DEFINE_integer("epochs", 20, "train steps.") 40 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 41 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 42 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 43 | 44 | 45 | def main(argv): 46 | # TODO: 1. Split Data 47 | if FLAGS.train_path == "None": 48 | train_path, val_path, test_path, meta_path = ml.split_seq_data(file_path=FLAGS.file_path) 49 | else: 50 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 51 | with open(meta_path) as f: 52 | _, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 53 | # TODO: 2. Load Sequence Data 54 | train_data = ml.load_seq_data(train_path, "train", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 55 | val_data = ml.load_seq_data(val_path, "val", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 56 | test_data = ml.load_seq_data(test_path, "test", FLAGS.seq_len, FLAGS.test_neg_num, max_item_num) 57 | # TODO: 3. Set Model Hyper Parameters. 58 | model_params = { 59 | 'item_num': max_item_num + 1, 60 | 'embed_dim': FLAGS.embed_dim, 61 | 'gru_layers': FLAGS.gru_layers, 62 | 'gru_unit': FLAGS.gru_unit, 63 | 'gru_activation': FLAGS.gru_activation, 64 | 'dnn_dropout': FLAGS.dnn_dropout, 65 | 'use_l2norm': FLAGS.use_l2norm, 66 | 'loss_name': FLAGS.loss_name, 67 | 'gamma': FLAGS.gamma, 68 | 'embed_reg': FLAGS.embed_reg 69 | } 70 | # TODO: 4. Build Model 71 | model = GRU4Rec(**model_params) 72 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 73 | # TODO: 5. 
Fit Model 74 | for epoch in range(1, FLAGS.epochs + 1): 75 | t1 = time() 76 | model.fit( 77 | x=train_data, 78 | epochs=1, 79 | validation_data=val_data, 80 | batch_size=FLAGS.batch_size 81 | ) 82 | t2 = time() 83 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 84 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f' 85 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 86 | 87 | 88 | if __name__ == '__main__': 89 | app.run(main) -------------------------------------------------------------------------------- /example/m_mind_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Apr 26, 2022 3 | train MIND demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import os 7 | from absl import flags, app 8 | from time import time 9 | from tensorflow.keras.optimizers import Adam 10 | 11 | from reclearn.models.matching import MIND 12 | from reclearn.data.datasets import movielens as ml 13 | from reclearn.evaluator import eval_pos_neg 14 | 15 | FLAGS = flags.FLAGS 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 19 | 20 | # Setting training parameters 21 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 22 | flags.DEFINE_string("train_path", "data/ml-1m/ml_seq_train.txt", "train path. If set to None, the program will split the dataset.") 23 | flags.DEFINE_string("val_path", "data/ml-1m/ml_seq_val.txt", "val path.") 24 | flags.DEFINE_string("test_path", "data/ml-1m/ml_seq_test.txt", "test path.") 25 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_seq_meta.txt", "meta path.") 26 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 27 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 28 | flags.DEFINE_integer("num_interest", 1, "The number of user interests.") 29 | flags.DEFINE_bool("stop_grad", True, "The weights in the capsule network are updated without gradient descent.") 30 | flags.DEFINE_bool("label_attention", True, "Whether using label-aware attention or not.") 31 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 32 | flags.DEFINE_integer("neg_num", 2, "The number of negative sample for each positive sample.") 33 | flags.DEFINE_integer("seq_len", 100, "The length of user's behavior sequence.") 34 | flags.DEFINE_integer("epochs", 20, "train steps.") 35 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 36 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 37 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 38 | 39 | 40 | def main(argv): 41 | # TODO: 1. Split Data 42 | if FLAGS.train_path == "None": 43 | train_path, val_path, test_path, meta_path = ml.split_seq_data(file_path=FLAGS.file_path) 44 | else: 45 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 46 | with open(meta_path) as f: 47 | _, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 48 | # TODO: 2. Load Sequence Data 49 | train_data = ml.load_seq_data(train_path, "train", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 50 | val_data = ml.load_seq_data(val_path, "val", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 51 | test_data = ml.load_seq_data(test_path, "test", FLAGS.seq_len, FLAGS.test_neg_num, max_item_num) 52 | # TODO: 3. Set Model Hyper Parameters. 
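    # Note: item ids in the MovieLens splits appear to be 1-based (negative samples are drawn with random.randint(1, max_item_num)), so 'item_num' is set to max_item_num + 1 below; this sizes the embedding table for ids 1..max_item_num and presumably leaves index 0 free as the padding id for the behavior sequences.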
53 | model_params = { 54 | 'item_num': max_item_num + 1, 55 | 'embed_dim': FLAGS.embed_dim, 56 | 'seq_len': FLAGS.seq_len, 57 | 'num_interest': FLAGS.num_interest, 58 | 'stop_grad': FLAGS.stop_grad, 59 | 'label_attention': FLAGS.label_attention, 60 | 'neg_num': FLAGS.neg_num, 61 | 'batch_size': FLAGS.batch_size, 62 | 'embed_reg': FLAGS.embed_reg 63 | } 64 | # TODO: 4. Build Model 65 | model = MIND(**model_params) 66 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 67 | # TODO: 5. Fit Model 68 | for epoch in range(1, FLAGS.epochs + 1): 69 | t1 = time() 70 | model.fit( 71 | x=train_data, 72 | epochs=1, 73 | validation_data=val_data, 74 | batch_size=FLAGS.batch_size 75 | ) 76 | t2 = time() 77 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 78 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f' 79 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 80 | 81 | 82 | if __name__ == '__main__': 83 | app.run(main) -------------------------------------------------------------------------------- /example/m_ncf_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 19, 2021 3 | Updated on Apr 23, 2022 4 | train NCF demo 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import os 8 | from absl import flags, app 9 | from time import time 10 | from tensorflow.keras.optimizers import Adam 11 | 12 | from reclearn.models.matching import NCF 13 | from reclearn.data.datasets import movielens as ml 14 | from reclearn.evaluator import eval_pos_neg 15 | 16 | FLAGS = flags.FLAGS 17 | 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 19 | # os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | # Setting training parameters 22 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 23 | flags.DEFINE_string("train_path", "data/ml-1m/ml_train.txt", "train path. If set to None, the program will split the dataset.") 24 | flags.DEFINE_string("val_path", "data/ml-1m/ml_val.txt", "val path.") 25 | flags.DEFINE_string("test_path", "data/ml-1m/ml_test.txt", "test path.") 26 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_meta.txt", "meta path.") 27 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 28 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 29 | flags.DEFINE_list("hidden_units", [256, 128, 64], "A list of MLP hidden units.") 30 | flags.DEFINE_string("activation", "relu", "Activation Name.") 31 | flags.DEFINE_float("dnn_dropout", 0., "Float between 0 and 1. 
Dropout of user and item MLP layer.") 32 | flags.DEFINE_boolean("use_batch_norm", False, "Whether using batch normalization or not.") 33 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 34 | flags.DEFINE_string("loss_name", "binary_cross_entropy_loss", "Loss Name.") 35 | flags.DEFINE_float("gamma", 0.5, "If hinge_loss is selected as the loss function, you can specify the margin.") 36 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 37 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 38 | flags.DEFINE_integer("epochs", 20, "train steps.") 39 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 40 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 41 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 42 | 43 | 44 | def main(argv): 45 | # TODO: 1. Split Data 46 | if FLAGS.train_path == "None": 47 | train_path, val_path, test_path, meta_path = ml.split_data(file_path=FLAGS.file_path) 48 | else: 49 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 50 | with open(meta_path) as f: 51 | max_user_num, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 52 | # TODO: 2. Load Data 53 | train_data = ml.load_data(train_path, FLAGS.neg_num, max_item_num) 54 | val_data = ml.load_data(val_path, FLAGS.neg_num, max_item_num) 55 | test_data = ml.load_data(test_path, FLAGS.test_neg_num, max_item_num) 56 | # TODO: 3. Set Model Hyper Parameters. 57 | model_params = { 58 | 'user_num': max_user_num + 1, 59 | 'item_num': max_item_num + 1, 60 | 'embed_dim': FLAGS.embed_dim, 61 | 'hidden_units': FLAGS.hidden_units, 62 | 'activation': FLAGS.activation, 63 | 'dnn_dropout': FLAGS.dnn_dropout, 64 | 'use_batch_norm': FLAGS.use_batch_norm, 65 | 'use_l2norm': FLAGS.use_l2norm, 66 | 'loss_name': FLAGS.loss_name, 67 | 'gamma': FLAGS.gamma, 68 | 'embed_reg': FLAGS.embed_reg 69 | } 70 | # TODO: 4. Build Model 71 | model = NCF(**model_params) 72 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 73 | # TODO: 5. Fit Model 74 | for epoch in range(1, FLAGS.epochs + 1): 75 | t1 = time() 76 | model.fit( 77 | x=train_data, 78 | epochs=1, 79 | validation_data=val_data, 80 | batch_size=FLAGS.batch_size 81 | ) 82 | t2 = time() 83 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 84 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f, ' 85 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 86 | 87 | 88 | if __name__ == '__main__': 89 | app.run(main) -------------------------------------------------------------------------------- /example/m_poprec_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 20, 2021 3 | train PopRec demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import os 7 | from time import time 8 | from tensorflow.keras.optimizers import Adam 9 | 10 | from reclearn.models.matching import PopRec 11 | from reclearn.data.datasets import movielens as ml 12 | from reclearn.evaluator import eval_pos_neg 13 | 14 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 15 | 16 | k = 10 17 | 18 | 19 | def main(): 20 | # TODO: 1. 
Split Data 21 | # file_path = 'data/ml-1m/ratings.dat' 22 | # train_path, val_path, test_path, _ = ml.split_movielens(file_path=file_path) 23 | train_path = 'data/ml-1m/ml_train.txt' 24 | val_path = 'data/ml-1m/ml_val.txt' 25 | test_path = 'data/ml-1m/ml_test.txt' 26 | meta_path = 'data/ml-1m/ml_meta.txt' 27 | with open(meta_path) as f: 28 | max_user_num, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 29 | # TODO: 2. Load Data 30 | test_data = ml.load_data(test_path, 100, max_item_num) 31 | # TODO: 3. Build Model 32 | model = PopRec(train_path=train_path, delimiter='\t') 33 | # TODO: 3. Update Model. 34 | model.update(data_path=val_path, delimiter='\t') 35 | # TODO: 4. Evaluate Model 36 | t1 = time() 37 | eval_dict = eval_pos_neg(model, test_data, metric_names=['hr', 'mrr', 'ndcg'], k=k) 38 | print('Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f, ' 39 | % (time() - t1, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 40 | 41 | 42 | main() -------------------------------------------------------------------------------- /example/m_sasrec_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 20, 2021 3 | Updated on Apr 23, 2022 4 | train SASRec demo 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import os 8 | from absl import flags, app 9 | from time import time 10 | from tensorflow.keras.optimizers import Adam 11 | 12 | from reclearn.models.matching import SASRec 13 | from reclearn.data.datasets import movielens as ml 14 | from reclearn.evaluator import eval_pos_neg 15 | 16 | FLAGS = flags.FLAGS 17 | 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | # Setting training parameters 22 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 23 | flags.DEFINE_string("train_path", "data/ml-1m/ml_seq_train.txt", "train path. If set to None, the program will split the dataset.") 24 | flags.DEFINE_string("val_path", "data/ml-1m/ml_seq_val.txt", "val path.") 25 | flags.DEFINE_string("test_path", "data/ml-1m/ml_seq_test.txt", "test path.") 26 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_seq_meta.txt", "meta path.") 27 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 28 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 29 | flags.DEFINE_integer("blocks", 2, "The Number of blocks.") 30 | flags.DEFINE_integer("num_heads", 2, "The Number of attention heads.") 31 | flags.DEFINE_integer("ffn_hidden_unit", 64, "Number of hidden unit in FFN.") 32 | flags.DEFINE_float("dnn_dropout", 0.2, "Float between 0 and 1. 
Dropout of user and item MLP layer.") 33 | flags.DEFINE_float("layer_norm_eps", 1e-6, "Small float added to variance to avoid dividing by zero.") 34 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 35 | flags.DEFINE_string("loss_name", "binary_cross_entropy_loss", "Loss Name.") 36 | flags.DEFINE_float("gamma", 0.5, "If hinge_loss is selected as the loss function, you can specify the margin.") 37 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 38 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 39 | flags.DEFINE_integer("seq_len", 100, "The length of user's behavior sequence.") 40 | flags.DEFINE_integer("epochs", 20, "train steps.") 41 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 42 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 43 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 44 | 45 | 46 | def main(argv): 47 | # TODO: 1. Split Data 48 | if FLAGS.train_path == "None": 49 | train_path, val_path, test_path, meta_path = ml.split_seq_data(file_path=FLAGS.file_path) 50 | else: 51 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 52 | with open(meta_path) as f: 53 | _, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 54 | # TODO: 2. Load Sequence Data 55 | train_data = ml.load_seq_data(train_path, "train", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 56 | val_data = ml.load_seq_data(val_path, "val", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 57 | test_data = ml.load_seq_data(test_path, "test", FLAGS.seq_len, FLAGS.test_neg_num, max_item_num) 58 | # TODO: 3. Set Model Hyper Parameters. 59 | model_params = { 60 | 'item_num': max_item_num + 1, 61 | 'embed_dim': FLAGS.embed_dim, 62 | 'seq_len': FLAGS.seq_len, 63 | 'blocks': FLAGS.blocks, 64 | 'num_heads': FLAGS.num_heads, 65 | 'ffn_hidden_unit': FLAGS.ffn_hidden_unit, 66 | 'dnn_dropout': FLAGS.dnn_dropout, 67 | 'use_l2norm': FLAGS.use_l2norm, 68 | 'loss_name': FLAGS.loss_name, 69 | 'gamma': FLAGS.gamma, 70 | 'embed_reg': FLAGS.embed_reg 71 | } 72 | # TODO: 4. Build Model 73 | model = SASRec(**model_params) 74 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 75 | # TODO: 5. 
Fit Model 76 | for epoch in range(1, FLAGS.epochs + 1): 77 | t1 = time() 78 | model.fit( 79 | x=train_data, 80 | epochs=1, 81 | validation_data=val_data, 82 | batch_size=FLAGS.batch_size 83 | ) 84 | t2 = time() 85 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 86 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f' 87 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 88 | 89 | 90 | if __name__ == '__main__': 91 | app.run(main) -------------------------------------------------------------------------------- /example/m_youtubednn_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Apr 5, 2022 3 | train YoutubeDNN demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import os 7 | from absl import flags, app 8 | from time import time 9 | from tensorflow.keras.optimizers import Adam 10 | 11 | from reclearn.models.matching import YoutubeDNN 12 | from reclearn.data.datasets import movielens as ml 13 | from reclearn.evaluator import eval_pos_neg 14 | 15 | FLAGS = flags.FLAGS 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # os.environ['CUDA_VISIBLE_DEVICES'] = '6' 19 | 20 | # Setting training parameters 21 | flags.DEFINE_string("file_path", "data/ml-1m/ratings.dat", "file path.") 22 | flags.DEFINE_string("train_path", "data/ml-1m/ml_seq_train.txt", "train path. If set to None, the program will split the dataset.") 23 | flags.DEFINE_string("val_path", "data/ml-1m/ml_seq_val.txt", "val path.") 24 | flags.DEFINE_string("test_path", "data/ml-1m/ml_seq_test.txt", "test path.") 25 | flags.DEFINE_string("meta_path", "data/ml-1m/ml_seq_meta.txt", "meta path.") 26 | flags.DEFINE_integer("embed_dim", 64, "The size of embedding dimension.") 27 | flags.DEFINE_float("embed_reg", 0.0, "The value of embedding regularization.") 28 | flags.DEFINE_list("user_mlp", [128, 256, 64], "A list of user MLP hidden units.") 29 | flags.DEFINE_string("activation", "relu", "Activation Name.") 30 | flags.DEFINE_float("dnn_dropout", 0., "Float between 0 and 1. Dropout of user and item MLP layer.") 31 | flags.DEFINE_boolean("use_l2norm", False, "Whether user embedding, item embedding should be normalized or not.") 32 | flags.DEFINE_float("learning_rate", 0.001, "Learning rate.") 33 | flags.DEFINE_integer("neg_num", 4, "The number of negative sample for each positive sample.") 34 | flags.DEFINE_integer("seq_len", 100, "The length of user's behavior sequence.") 35 | flags.DEFINE_integer("epochs", 20, "train steps.") 36 | flags.DEFINE_integer("batch_size", 512, "Batch Size.") 37 | flags.DEFINE_integer("test_neg_num", 100, "The number of test negative samples.") 38 | flags.DEFINE_integer("k", 10, "recall k items at test stage.") 39 | 40 | 41 | def main(argv): 42 | # TODO: 1. Split Data 43 | if FLAGS.train_path == "None": 44 | train_path, val_path, test_path, meta_path = ml.split_seq_data(file_path=FLAGS.file_path) 45 | else: 46 | train_path, val_path, test_path, meta_path = FLAGS.train_path, FLAGS.val_path, FLAGS.test_path, FLAGS.meta_path 47 | with open(meta_path) as f: 48 | _, max_item_num = [int(x) for x in f.readline().strip('\n').split('\t')] 49 | # TODO: 2. 
Load Sequence Data 50 | train_data = ml.load_seq_data(train_path, "train", FLAGS.seq_len, 0, max_item_num) 51 | val_data = ml.load_seq_data(val_path, "val", FLAGS.seq_len, FLAGS.neg_num, max_item_num) 52 | test_data = ml.load_seq_data(test_path, "test", FLAGS.seq_len, FLAGS.test_neg_num, max_item_num) 53 | # TODO: 3. Set Model Hyper Parameters. 54 | model_params = { 55 | 'item_num': max_item_num + 1, 56 | 'embed_dim': FLAGS.embed_dim, 57 | 'user_mlp': FLAGS.user_mlp, 58 | 'activation': FLAGS.activation, 59 | 'dnn_dropout': FLAGS.dnn_dropout, 60 | 'neg_num': FLAGS.neg_num, 61 | 'batch_size': FLAGS.batch_size, 62 | 'use_l2norm': FLAGS.use_l2norm, 63 | 'embed_reg': FLAGS.embed_reg 64 | } 65 | # TODO: 4. Build Model 66 | model = YoutubeDNN(**model_params) 67 | model.compile(optimizer=Adam(learning_rate=FLAGS.learning_rate)) 68 | # TODO: 5. Fit Model 69 | for epoch in range(1, FLAGS.epochs + 1): 70 | t1 = time() 71 | model.fit( 72 | x=train_data, 73 | epochs=1, 74 | validation_data=val_data, 75 | batch_size=FLAGS.batch_size 76 | ) 77 | t2 = time() 78 | eval_dict = eval_pos_neg(model, test_data, ['hr', 'mrr', 'ndcg'], FLAGS.k, FLAGS.batch_size) 79 | print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, MRR = %.4f, NDCG = %.4f' 80 | % (epoch, t2 - t1, time() - t2, eval_dict['hr'], eval_dict['mrr'], eval_dict['ndcg'])) 81 | 82 | 83 | if __name__ == '__main__': 84 | app.run(main) -------------------------------------------------------------------------------- /example/r_afm_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | train AFM demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.metrics import AUC 10 | 11 | from reclearn.models.ranking import AFM 12 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, create_criteo_dataset 13 | 14 | import pickle 15 | import os 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # If you have GPU, and the value is GPU serial number. 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | 22 | if __name__ == '__main__': 23 | # TODO: Hyper Parameters 24 | file = 'data/criteo/train.txt' 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | embed_dim = 8 28 | model_params = { 29 | 'mode': 'max', # 'att' has the problem. 30 | 'att_dim': 8, 31 | 'dnn_dropout': 0.5, 32 | 'embed_reg': 0., 33 | } 34 | # TODO: Split dataset 35 | # If you want to split the file 36 | sample_num = 4600000 37 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 38 | # Or if you split the file before 39 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 40 | print('split file name: %s' % str(split_file_list)) 41 | # TODO: Get Feature Map 42 | # If you want to make feature map. 43 | fea_map = get_fea_map(split_file_list=split_file_list) 44 | # Or if you want to load feature map. 
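    # (fea_map presumably maps each raw Criteo categorical value to an integer index used for embedding lookup; building it from split_file_list appears to also cache it under the split directory, e.g. data/criteo/split/fea_map.pkl, so later runs can simply pass fea_map_path instead of rebuilding it. Treat the exact cache path as an assumption rather than documented behavior.)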
45 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 46 | # TODO: Load test data 47 | print("load test file: %s" % split_file_list[-1]) 48 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map, embed_dim=embed_dim) 49 | # TODO: Build Model 50 | mirrored_strategy = tf.distribute.MirroredStrategy() 51 | with mirrored_strategy.scope(): 52 | model = AFM(feature_columns=feature_columns, **model_params) 53 | model.summary() 54 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 55 | metrics=[AUC()]) 56 | # TODO: Load train data 57 | for file in split_file_list[:-1]: 58 | print("load %s" % file) 59 | _, train_data = create_criteo_dataset(file, fea_map) 60 | # TODO: Fit 61 | model.fit( 62 | x=train_data[0], 63 | y=train_data[1], 64 | epochs=1, 65 | batch_size=batch_size, 66 | validation_split=0.1 67 | ) 68 | # TODO: Test 69 | train_data = [] 70 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /example/r_dcn_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | train DCN demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.metrics import AUC 10 | 11 | from reclearn.models.ranking import DCN 12 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, create_criteo_dataset 13 | 14 | import pickle 15 | import os 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # If you have GPU, and the value is GPU serial number. 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | 22 | if __name__ == '__main__': 23 | # TODO: Hyper Parameters 24 | file = 'data/criteo/train.txt' 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | embed_dim = 8 28 | model_params = { 29 | 'hidden_units': [256, 128, 64], 30 | 'dnn_dropout': 0.5, 31 | 'embed_reg': 0., 32 | 'cross_w_reg': 0., 33 | 'cross_b_reg': 0. 34 | } 35 | # TODO: Split dataset 36 | # If you want to split the file 37 | sample_num = 4600000 38 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 39 | # Or if you split the file before 40 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 41 | print('split file name: %s' % str(split_file_list)) 42 | # TODO: Get Feature Map 43 | # If you want to make feature map. 44 | fea_map = get_fea_map(split_file_list=split_file_list) 45 | # Or if you want to load feature map. 
46 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 47 | # TODO: Load test data 48 | print("load test file: %s" % split_file_list[-1]) 49 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map, embed_dim=embed_dim) 50 | # TODO: Build Model 51 | mirrored_strategy = tf.distribute.MirroredStrategy() 52 | with mirrored_strategy.scope(): 53 | model = DCN(feature_columns=feature_columns, **model_params) 54 | model.summary() 55 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 56 | metrics=[AUC()]) 57 | # TODO: Load train data 58 | for file in split_file_list[:-1]: 59 | print("load %s" % file) 60 | _, train_data = create_criteo_dataset(file, fea_map) 61 | # TODO: Fit 62 | model.fit( 63 | x=train_data[0], 64 | y=train_data[1], 65 | epochs=1, 66 | batch_size=batch_size, 67 | validation_split=0.1 68 | ) 69 | # TODO: Test 70 | train_data = [] 71 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /example/r_deep_crossing_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | train Deep Crossing demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.metrics import AUC 10 | 11 | from reclearn.models.ranking import Deep_Crossing 12 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, create_criteo_dataset 13 | 14 | import pickle 15 | import os 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # If you have GPU, and the value is GPU serial number. 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | 22 | if __name__ == '__main__': 23 | # TODO: Hyper Parameters 24 | file = 'data/criteo/train.txt' 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | embed_dim = 8 28 | model_params = { 29 | 'hidden_units': [256, 128, 64], 30 | 'dnn_dropout': 0.5, 31 | 'embed_reg': 0. 32 | } 33 | # TODO: Split dataset 34 | # If you want to split the file 35 | sample_num = 4600000 36 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 37 | # Or if you split the file before 38 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 39 | print('split file name: %s' % str(split_file_list)) 40 | # TODO: Get Feature Map 41 | # If you want to make feature map. 42 | fea_map = get_fea_map(split_file_list=split_file_list) 43 | # Or if you want to load feature map. 
44 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 45 | # TODO: Load test data 46 | print("load test file: %s" % split_file_list[-1]) 47 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map, embed_dim=embed_dim) 48 | # TODO: Build Model 49 | mirrored_strategy = tf.distribute.MirroredStrategy() 50 | with mirrored_strategy.scope(): 51 | model = Deep_Crossing(feature_columns=feature_columns, **model_params) 52 | model.summary() 53 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 54 | metrics=[AUC()]) 55 | # TODO: Load train data 56 | for file in split_file_list[:-1]: 57 | print("load %s" % file) 58 | _, train_data = create_criteo_dataset(file, fea_map) 59 | # TODO: Fit 60 | model.fit( 61 | x=train_data[0], 62 | y=train_data[1], 63 | epochs=1, 64 | batch_size=batch_size, 65 | validation_split=0.1 66 | ) 67 | # TODO: Test 68 | train_data = [] 69 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /example/r_deepfm_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | train DeepFM demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.metrics import AUC 10 | 11 | from reclearn.models.ranking import DeepFM 12 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, create_criteo_dataset 13 | 14 | import pickle 15 | import os 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # If you have GPU, and the value is GPU serial number. 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | 22 | if __name__ == '__main__': 23 | # TODO: Hyper Parameters 24 | file = 'data/criteo/train.txt' 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | embed_dim = 8 28 | model_params = { 29 | 'hidden_units': [256, 128, 64], 30 | 'dnn_dropout': 0.5, 31 | 'fm_w_reg': 0., 32 | 'embed_reg': 0. 33 | } 34 | # TODO: Split dataset 35 | # If you want to split the file 36 | sample_num = 4600000 37 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 38 | # Or if you split the file before 39 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 40 | print('split file name: %s' % str(split_file_list)) 41 | # TODO: Get Feature Map 42 | # If you want to make feature map. 43 | fea_map = get_fea_map(split_file_list=split_file_list) 44 | # Or if you want to load feature map. 
45 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 46 | # TODO: Load test data 47 | print("load test file: %s" % split_file_list[-1]) 48 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map, embed_dim=embed_dim) 49 | # TODO: Build Model 50 | mirrored_strategy = tf.distribute.MirroredStrategy() 51 | with mirrored_strategy.scope(): 52 | model = DeepFM(feature_columns=feature_columns, **model_params) 53 | model.summary() 54 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 55 | metrics=[AUC()]) 56 | # TODO: Load train data 57 | for file in split_file_list[:-1]: 58 | print("load %s" % file) 59 | _, train_data = create_criteo_dataset(file, fea_map) 60 | # TODO: Fit 61 | model.fit( 62 | x=train_data[0], 63 | y=train_data[1], 64 | epochs=1, 65 | batch_size=batch_size, 66 | validation_split=0.1 67 | ) 68 | # TODO: Test 69 | train_data = [] 70 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /example/r_fm_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Aug 25, 2020 3 | Updated on Mar 11, 2022 4 | train FM demo 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras.losses import binary_crossentropy 9 | from tensorflow.keras.optimizers import Adam 10 | from tensorflow.keras.metrics import AUC 11 | 12 | from reclearn.models.ranking import FM 13 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, \ 14 | create_criteo_dataset, create_small_criteo_dataset 15 | 16 | import pickle 17 | import os 18 | 19 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 20 | # If you have GPU, and the value is GPU serial number. 21 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 22 | 23 | 24 | # TODO: Hyper Parameters 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | model_params = { 28 | 'k': 8, 29 | 'w_reg': 0., 30 | 'v_reg': 0. 31 | } 32 | 33 | 34 | def easy_demo(file, sample_num=500000, read_part=True, test_size=0.1, epochs=10): 35 | feature_columns, train, test = create_small_criteo_dataset(file=file, 36 | read_part=read_part, 37 | sample_num=sample_num, 38 | test_size=test_size) 39 | train_X, train_y = train 40 | test_X, test_y = test 41 | # TODO: Build Model 42 | model = FM(feature_columns=feature_columns, **model_params) 43 | model.summary() 44 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 45 | metrics=[AUC()]) 46 | # TODO: Fit 47 | model.fit( 48 | train_X, 49 | train_y, 50 | epochs=epochs, 51 | batch_size=batch_size, 52 | validation_split=0.1 53 | ) 54 | # TODO: Test 55 | print('test AUC: %f' % model.evaluate(test_X, test_y, batch_size=batch_size)[1]) 56 | 57 | 58 | def main(file): 59 | # TODO: Split dataset 60 | # If you want to split the file 61 | sample_num = 4600000 62 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 63 | # Or if you have split the file before 64 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 65 | print('split file name: %s' % str(split_file_list)) 66 | # TODO: Get Feature Map 67 | # If you want to make feature map. 68 | fea_map = get_fea_map(split_file_list=split_file_list) 69 | # Or if you want to load feature map. 
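    # A note on the steps below: the last split file is held out as the test set, and each remaining
    # split is loaded and fitted for one epoch in turn, which amounts to a single pass over the full
    # Criteo data without ever holding it all in memory at once.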
70 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 71 | # TODO: Load test data 72 | print("load test file: %s" % split_file_list[-1]) 73 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map) 74 | # TODO: Build Model 75 | model = FM(feature_columns=feature_columns, **model_params) 76 | model.summary() 77 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 78 | metrics=[AUC()]) 79 | # TODO: Load train data 80 | for file in split_file_list[:-1]: 81 | print("load %s" % file) 82 | _, train_data = create_criteo_dataset(file, fea_map) 83 | # TODO: Fit 84 | model.fit( 85 | x=train_data[0], 86 | y=train_data[1], 87 | epochs=1, 88 | batch_size=batch_size, 89 | validation_split=0.1 90 | ) 91 | # TODO: Test 92 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) 93 | 94 | 95 | if __name__ == '__main__': 96 | file = 'data/criteo/train.txt' 97 | # easy_demo method only loads sample_num data of the dataset. 98 | easy_demo(file) 99 | # main method can train all data. 100 | # main(file) -------------------------------------------------------------------------------- /example/r_nfm_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | train NFM demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.metrics import AUC 10 | 11 | from reclearn.models.ranking import NFM 12 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, create_criteo_dataset 13 | 14 | import pickle 15 | import os 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # If you have GPU, and the value is GPU serial number. 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | 22 | if __name__ == '__main__': 23 | # TODO: Hyper Parameters 24 | file = 'data/criteo/train.txt' 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | embed_dim = 8 28 | model_params = { 29 | 'hidden_units': [256, 128, 64], 30 | 'dnn_dropout': 0.5, 31 | 'bn_use': True, 32 | 'embed_reg': 0., 33 | } 34 | # TODO: Split dataset 35 | # If you want to split the file 36 | sample_num = 4600000 37 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 38 | # Or if you split the file before 39 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 40 | print('split file name: %s' % str(split_file_list)) 41 | # TODO: Get Feature Map 42 | # If you want to make feature map. 43 | fea_map = get_fea_map(split_file_list=split_file_list) 44 | # Or if you want to load feature map. 
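    # Note: the model's feature_columns are built only from the test split loaded below; this works
    # because each feature's vocabulary size is taken from fea_map (len(fea_map[feat]) + 1) rather than
    # from any single file, so the per-file feature_columns returned inside the training loop are discarded.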
45 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 46 | # TODO: Load test data 47 | print("load test file: %s" % split_file_list[-1]) 48 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map, embed_dim=embed_dim) 49 | # TODO: Build Model 50 | mirrored_strategy = tf.distribute.MirroredStrategy() 51 | with mirrored_strategy.scope(): 52 | model = NFM(feature_columns=feature_columns, **model_params) 53 | model.summary() 54 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 55 | metrics=[AUC()]) 56 | # TODO: Load train data 57 | for file in split_file_list[:-1]: 58 | print("load %s" % file) 59 | _, train_data = create_criteo_dataset(file, fea_map) 60 | # TODO: Fit 61 | model.fit( 62 | x=train_data[0], 63 | y=train_data[1], 64 | epochs=1, 65 | batch_size=batch_size, 66 | validation_split=0.1 67 | ) 68 | # TODO: Test 69 | train_data = [] 70 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /example/r_pnn_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | train PNN demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.metrics import AUC 10 | 11 | from reclearn.models.ranking import PNN 12 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, create_criteo_dataset 13 | 14 | import pickle 15 | import os 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # If you have GPU, and the value is GPU serial number. 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | 22 | if __name__ == '__main__': 23 | # TODO: Hyper Parameters 24 | file = 'data/criteo/train.txt' 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | embed_dim = 8 28 | model_params = { 29 | 'hidden_units': [256, 128, 64], 30 | 'mode': 'IPNN', 31 | 'dnn_dropout': 0.5, 32 | 'embed_reg': 0., 33 | 'w_z_reg': 0., 34 | 'w_p_reg': 0., 35 | 'l_b_reg': 0. 36 | } 37 | # TODO: Split dataset 38 | # If you want to split the file 39 | sample_num = 4600000 40 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 41 | # Or if you split the file before 42 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 43 | print('split file name: %s' % str(split_file_list)) 44 | # TODO: Get Feature Map 45 | # If you want to make feature map. 46 | fea_map = get_fea_map(split_file_list=split_file_list) 47 | # Or if you want to load feature map. 
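    # Reminder: the model below is created and compiled inside tf.distribute.MirroredStrategy's scope,
    # so if several GPUs are visible (see CUDA_VISIBLE_DEVICES above) fit/evaluate run data-parallel
    # across them; with a single device the strategy is effectively a no-op.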
48 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 49 | # TODO: Load test data 50 | print("load test file: %s" % split_file_list[-1]) 51 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map, embed_dim=embed_dim) 52 | # TODO: Build Model 53 | mirrored_strategy = tf.distribute.MirroredStrategy() 54 | with mirrored_strategy.scope(): 55 | model = PNN(feature_columns=feature_columns, **model_params) 56 | model.summary() 57 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 58 | metrics=[AUC()]) 59 | # TODO: Load train data 60 | for file in split_file_list[:-1]: 61 | print("load %s" % file) 62 | _, train_data = create_criteo_dataset(file, fea_map) 63 | # TODO: Fit 64 | model.fit( 65 | x=train_data[0], 66 | y=train_data[1], 67 | epochs=1, 68 | batch_size=batch_size, 69 | validation_split=0.1 70 | ) 71 | # TODO: Test 72 | train_data = [] 73 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /example/r_wdl_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | train WideDeep demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.metrics import AUC 10 | 11 | from reclearn.models.ranking import WideDeep 12 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, create_criteo_dataset 13 | 14 | import pickle 15 | import os 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # If you have GPU, and the value is GPU serial number. 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | 22 | if __name__ == '__main__': 23 | # TODO: Hyper Parameters 24 | file = 'data/criteo/train.txt' 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | embed_dim = 8 28 | model_params = { 29 | 'hidden_units': [256, 128, 64], 30 | 'dnn_dropout': 0.5, 31 | 'embed_reg': 0., 32 | 'w_reg': 0. 33 | } 34 | # TODO: Split dataset 35 | # If you want to split the file 36 | sample_num = 4600000 37 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 38 | # Or if you split the file before 39 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 40 | print('split file name: %s' % str(split_file_list)) 41 | # TODO: Get Feature Map 42 | # If you want to make feature map. 43 | fea_map = get_fea_map(split_file_list=split_file_list) 44 | # Or if you want to load feature map. 
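    # One detail worth flagging in the code below: the 'train_data = []' statement before evaluation
    # simply drops the reference to the last training chunk so its memory can be reclaimed; it is not
    # required for correctness.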
45 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 46 | # TODO: Load test data 47 | print("load test file: %s" % split_file_list[-1]) 48 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map, embed_dim=embed_dim) 49 | # TODO: Build Model 50 | mirrored_strategy = tf.distribute.MirroredStrategy() 51 | with mirrored_strategy.scope(): 52 | model = WideDeep(feature_columns=feature_columns, **model_params) 53 | model.summary() 54 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 55 | metrics=[AUC()]) 56 | # TODO: Load train data 57 | for file in split_file_list[:-1]: 58 | print("load %s" % file) 59 | _, train_data = create_criteo_dataset(file, fea_map) 60 | # TODO: Fit 61 | model.fit( 62 | x=train_data[0], 63 | y=train_data[1], 64 | epochs=1, 65 | batch_size=batch_size, 66 | validation_split=0.1 67 | ) 68 | # TODO: Test 69 | train_data = [] 70 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /example/r_xdeepfm_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | train xDeepFM demo 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.optimizers import Adam 9 | from tensorflow.keras.metrics import AUC 10 | 11 | from reclearn.models.ranking import xDeepFM 12 | from reclearn.data.datasets.criteo import get_split_file_path, get_fea_map, create_criteo_dataset 13 | 14 | import pickle 15 | import os 16 | 17 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 18 | # If you have GPU, and the value is GPU serial number. 19 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 20 | 21 | 22 | if __name__ == '__main__': 23 | # TODO: Hyper Parameters 24 | file = 'data/criteo/train.txt' 25 | learning_rate = 0.001 26 | batch_size = 4096 27 | embed_dim = 8 28 | model_params = { 29 | 'hidden_units': [256, 128, 64], 30 | 'cin_size': [128, 128], 31 | 'dnn_dropout': 0.5, 32 | 'embed_reg': 0., 33 | 'cin_reg': 0., 34 | 'w_reg': 0. 35 | } 36 | # TODO: Split dataset 37 | # If you want to split the file 38 | sample_num = 4600000 39 | split_file_list = get_split_file_path(dataset_path=file, sample_num=sample_num) 40 | # Or if you split the file before 41 | # split_file_list = get_split_file_path(parent_path='data/criteo/split') 42 | print('split file name: %s' % str(split_file_list)) 43 | # TODO: Get Feature Map 44 | # If you want to make feature map. 45 | fea_map = get_fea_map(split_file_list=split_file_list) 46 | # Or if you want to load feature map. 
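    # As in the other demos, sample_num above sets how many rows go into each split file, so the full
    # Kaggle Criteo train.txt (roughly 45 million rows) ends up cut into about ten chunks of 4.6M rows each.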
47 | # fea_map = get_fea_map(fea_map_path='data/criteo/split/fea_map.pkl') 48 | # TODO: Load test data 49 | print("load test file: %s" % split_file_list[-1]) 50 | feature_columns, test_data = create_criteo_dataset(split_file_list[-1], fea_map, embed_dim=embed_dim) 51 | # TODO: Build Model 52 | mirrored_strategy = tf.distribute.MirroredStrategy() 53 | with mirrored_strategy.scope(): 54 | model = xDeepFM(feature_columns=feature_columns, **model_params) 55 | model.summary() 56 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 57 | metrics=[AUC()]) 58 | # TODO: Load train data 59 | for file in split_file_list[:-1]: 60 | print("load %s" % file) 61 | _, train_data = create_criteo_dataset(file, fea_map) 62 | # TODO: Fit 63 | model.fit( 64 | x=train_data[0], 65 | y=train_data[1], 66 | epochs=1, 67 | batch_size=batch_size, 68 | validation_split=0.1 69 | ) 70 | # TODO: Test 71 | train_data = [] 72 | print('test AUC: %f' % model.evaluate(x=test_data[0], y=test_data[1], batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /example/train_small_criteo_demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | using small criteo dataset to train the model. 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras.losses import binary_crossentropy 8 | from tensorflow.keras.callbacks import EarlyStopping 9 | from tensorflow.keras.optimizers import Adam 10 | from tensorflow.keras.metrics import AUC 11 | 12 | from reclearn.models.ranking import FM 13 | from reclearn.data.datasets.criteo import create_small_criteo_dataset 14 | 15 | 16 | import os 17 | 18 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 19 | # If you have GPU, and the value is GPU serial number. 20 | os.environ['CUDA_VISIBLE_DEVICES'] = '6' 21 | 22 | 23 | if __name__ == '__main__': 24 | # TODO: Hyper Parameters 25 | file = 'data/criteo/train.txt' 26 | read_part = True 27 | sample_num = 50000 28 | test_size = 0.2 29 | 30 | model_params = { 31 | 'k': 8, 32 | 'w_reg': 0., 33 | 'v_reg': 0. 
34 | } 35 | 36 | learning_rate = 0.001 37 | batch_size = 4096 38 | epochs = 10 39 | # TODO: Create Dataset 40 | feature_columns, train, test = create_small_criteo_dataset(file=file, 41 | read_part=read_part, 42 | sample_num=sample_num, 43 | test_size=test_size) 44 | train_X, train_y = train 45 | test_X, test_y = test 46 | # TODO: Build Model 47 | mirrored_strategy = tf.distribute.MirroredStrategy() 48 | with mirrored_strategy.scope(): 49 | model = FM(feature_columns=feature_columns, **model_params) 50 | model.summary() 51 | model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate), 52 | metrics=[AUC()]) 53 | # TODO: Fit 54 | model.fit( 55 | train_X, 56 | train_y, 57 | epochs=epochs, 58 | callbacks=[EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)], 59 | batch_size=batch_size, 60 | validation_split=0.1 61 | ) 62 | # TODO: Test 63 | print('test AUC: %f' % model.evaluate(test_X, test_y, batch_size=batch_size)[1]) -------------------------------------------------------------------------------- /reclearn/__init__.py: -------------------------------------------------------------------------------- 1 | name = "reclearn" -------------------------------------------------------------------------------- /reclearn/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiyaoGeng/RecLearn/4bbfb492b872c5a3290a2bce1ed5c160162558a3/reclearn/data/__init__.py -------------------------------------------------------------------------------- /reclearn/data/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiyaoGeng/RecLearn/4bbfb492b872c5a3290a2bce1ed5c160162558a3/reclearn/data/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /reclearn/data/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiyaoGeng/RecLearn/4bbfb492b872c5a3290a2bce1ed5c160162558a3/reclearn/data/datasets/__init__.py -------------------------------------------------------------------------------- /reclearn/data/datasets/beauty.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 23, 2021 3 | Amazon Beauty Dataset. 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import os 7 | import random 8 | import numpy as np 9 | import pandas as pd 10 | import tensorflow as tf 11 | from tqdm import tqdm 12 | from collections import defaultdict 13 | 14 | from tqdm import tqdm 15 | 16 | 17 | # general recommendation 18 | def split_data(file_path): 19 | """split amazon beauty for general recommendation 20 | Args: 21 | :param file_path: A string. The file path of 'ratings.dat'. 
22 | :return: train_path, val_path, test_path, meta_path 23 | """ 24 | dst_path = os.path.dirname(file_path) 25 | train_path = os.path.join(dst_path, "beauty_train.txt") 26 | val_path = os.path.join(dst_path, "beauty_val.txt") 27 | test_path = os.path.join(dst_path, "beauty_test.txt") 28 | meta_path = os.path.join(dst_path, "beauty_meta.txt") 29 | users, items = set(), dict() 30 | user_idx, item_idx = 1, 1 31 | history = {} 32 | with open(file_path, 'r') as f: 33 | lines = f.readlines() 34 | for line in tqdm(lines): 35 | user, item, score, timestamp = line.strip().split(",") 36 | users.add(user) 37 | if items.get(item) is None: 38 | items[item] = str(item_idx) 39 | item_idx += 1 40 | history.setdefault(user, []) 41 | history[user].append([items[item], timestamp]) 42 | with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3: 43 | for user in users: 44 | hist = history[user] 45 | if len(hist) < 4: 46 | continue 47 | hist.sort(key=lambda x: x[1]) 48 | for idx, value in enumerate(hist): 49 | if idx == len(hist) - 1: 50 | f3.write(str(user_idx) + '\t' + value[0] + '\n') 51 | elif idx == len(hist) - 2: 52 | f2.write(str(user_idx) + '\t' + value[0] + '\n') 53 | else: 54 | f1.write(str(user_idx) + '\t' + value[0] + '\n') 55 | user_idx += 1 56 | with open(meta_path, 'w') as f: 57 | f.write(str(user_idx - 1) + '\t' + str(item_idx - 1)) 58 | return train_path, val_path, test_path, meta_path 59 | 60 | 61 | # sequence recommendation 62 | def split_seq_data(file_path): 63 | """split amazon beauty for sequence recommendation 64 | Args: 65 | :param file_path: A string. The file path of 'ratings_Beauty.dat'. 66 | :return: train_path, val_path, test_path, meta_path 67 | """ 68 | dst_path = os.path.dirname(file_path) 69 | train_path = os.path.join(dst_path, "beauty_seq_train.txt") 70 | val_path = os.path.join(dst_path, "beauty_seq_val.txt") 71 | test_path = os.path.join(dst_path, "beauty_seq_test.txt") 72 | meta_path = os.path.join(dst_path, "beauty_seq_meta.txt") 73 | users, items = set(), dict() 74 | user_idx, item_idx = 1, 1 75 | history = {} 76 | with open(file_path, 'r') as f: 77 | lines = f.readlines() 78 | for line in tqdm(lines): 79 | user, item, score, timestamp = line.strip().split(",") 80 | users.add(user) 81 | if items.get(item) is None: 82 | items[item] = str(item_idx) 83 | item_idx += 1 84 | history.setdefault(user, []) 85 | history[user].append([items[item], timestamp]) 86 | with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3: 87 | for user in users: 88 | hist_u = history[user] 89 | if len(hist_u) < 4: 90 | continue 91 | hist_u.sort(key=lambda x: x[1]) 92 | hist = [x[0] for x in hist_u] 93 | time = [x[1] for x in hist_u] 94 | f1.write(str(user_idx) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + '\n') 95 | f2.write(str(user_idx) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + "\t" + hist[-2] + '\n') 96 | f3.write(str(user_idx) + "\t" + ' '.join(hist[:-1]) + "\t" + ' '.join(time[:-1]) + "\t" + hist[-1] + '\n') 97 | user_idx += 1 98 | with open(meta_path, 'w') as f: 99 | f.write(str(user_idx - 1) + '\t' + str(item_idx - 1)) 100 | return train_path, val_path, test_path, meta_path 101 | 102 | 103 | def load_data(file_path, neg_num, max_item_num): 104 | """load amazon beauty dataset. 105 | Args: 106 | :param file_path: A string. The file path. 107 | :param neg_num: A scalar(int). The negative num of one sample. 108 | :param max_item_num: A scalar(int). The max index of item. 109 | :return: A dict. data. 
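    Example (illustrative only; the test file and max_item_num come from split_data's outputs):
        data = load_data('data/beauty/beauty_test.txt', neg_num=100, max_item_num=item_num)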
110 | """ 111 | data = np.array(pd.read_csv(file_path, delimiter='\t')) 112 | np.random.shuffle(data) 113 | neg_items = [] 114 | for i in tqdm(range(len(data))): 115 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 116 | neg_items.append(neg_item) 117 | return {'user': data[:, 0].astype(int), 'pos_item': data[:, 1].astype(int), 'neg_item': np.array(neg_items)} 118 | 119 | 120 | def load_seq_data(file_path, mode, seq_len, neg_num, max_item_num, contain_user=False, contain_time=False): 121 | """load amazon beauty sequence dataset. 122 | Args: 123 | :param file_path: A string. The file path. 124 | :param mode: A string. "train", "val" or "test". 125 | :param seq_len: A scalar(int). The length of sequence. 126 | :param neg_num: A scalar(int). The negative num of one sample. 127 | :param max_item_num: A scalar(int). The max index of item. 128 | :param contain_user: A boolean. Whether including user'id input or not. 129 | :param contain_time: A boolean. Whether including time sequence input or not. 130 | :return: A dict. data. 131 | """ 132 | users, click_seqs, time_seqs, pos_items, neg_items = [], [], [], [], [] 133 | with open(file_path) as f: 134 | lines = f.readlines() 135 | for line in tqdm(lines): 136 | if mode == "train": 137 | user, click_seq, time_seq = line.split('\t') 138 | click_seq = click_seq.split(' ') 139 | click_seq = [int(x) for x in click_seq] 140 | time_seq = time_seq.split(' ') 141 | time_seq = [x for x in time_seq] 142 | for i in range(len(click_seq)-1): 143 | if i + 1 >= seq_len: 144 | tmp = click_seq[i + 1 - seq_len:i + 1] 145 | tmp2 = time_seq[i + 1 - seq_len:i + 1] 146 | else: 147 | tmp = [0] * (seq_len-i-1) + click_seq[:i + 1] 148 | tmp2 = [0] * (seq_len - i - 1) + time_seq[:i + 1] 149 | 150 | # gen_neg = _gen_negative_samples(neg_num, click_seq, max_item_num) 151 | # neg_item = [neg_item for neg_item in gen_neg] 152 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 153 | users.append(int(user)) 154 | click_seqs.append(tmp) 155 | time_seqs.append(tmp2) 156 | pos_items.append(click_seq[i + 1]) 157 | neg_items.append(neg_item) 158 | else: 159 | user, click_seq, time_seq, pos_item = line.split('\t') 160 | click_seq = click_seq.split(' ') 161 | click_seq = [int(x) for x in click_seq] 162 | time_seq = time_seq.split(' ') 163 | time_seq = [x for x in time_seq] 164 | if len(click_seq) >= seq_len: 165 | tmp = click_seq[len(click_seq) - seq_len:] 166 | tmp2 = time_seq[len(time_seq) - seq_len:] 167 | else: 168 | tmp = [0] * (seq_len - len(click_seq)) + click_seq[:] 169 | tmp2 = [0] * (seq_len - len(time_seq)) + time_seq[:] 170 | users.append(int(user)) 171 | # gen_neg = _gen_negative_samples(neg_num, click_seq, max_item_num) 172 | # neg_item = [neg_item for neg_item in gen_neg] 173 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 174 | click_seqs.append(tmp) 175 | time_seqs.append(tmp2) 176 | pos_items.append(int(pos_item)) 177 | neg_items.append(neg_item) 178 | data = list(zip(users, click_seqs, time_seqs, pos_items, neg_items)) 179 | random.shuffle(data) 180 | users, click_seqs, time_seqs, pos_items, neg_items = zip(*data) 181 | data = {'click_seq': np.array(click_seqs), 'pos_item': np.array(pos_items), 'neg_item': np.array(neg_items)} 182 | if contain_user: 183 | data['user'] = np.array(users) 184 | if contain_time: 185 | data['time_seq'] = np.array(click_seqs) 186 | return data 187 | -------------------------------------------------------------------------------- /reclearn/data/datasets/criteo.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Created on July 13, 2020 3 | Updated on Nov 14, 2021 4 | dataset:criteo 5 | features: 6 | - Label - Target variable that indicates if an ad was clicked (1) or not (0). 7 | - I1-I13 - A total of 13 columns of integer features (mostly count features). 8 | - C1-C26 - A total of 26 columns of categorical features. 9 | The values of these features have been hashed onto 32 bits for anonymization purposes. 10 | @author: Ziyao Geng(zggzy1996@163.com) 11 | """ 12 | import pickle 13 | import pandas as pd 14 | import os 15 | 16 | from tqdm import tqdm 17 | from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder 18 | from sklearn.model_selection import train_test_split 19 | 20 | from reclearn.data.utils import recKBinsDiscretizer, splitByLineCount, mkSubFile 21 | from reclearn.data.feature_column import sparseFeature 22 | 23 | NAMES = ['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 24 | 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 25 | 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 26 | 'C23', 'C24', 'C25', 'C26'] 27 | 28 | 29 | def get_split_file_path(parent_path=None, dataset_path=None, sample_num=5000000): 30 | """Get the list of split file path. 31 | Note: Either parent_path or dataset_path must be valid. 32 | If exists dataset_path + "/split", parent_path = dataset_path + "/split". 33 | Args: 34 | :param parent_path: A string. split file's parent path. 35 | :param dataset_path: A string. 36 | :param sample_num: A int. The sample number of every split file. 37 | :return: A list. [file1_path, file2_path, ...] 38 | """ 39 | sub_dir_name = 'split' 40 | if parent_path is None and dataset_path is None: 41 | raise ValueError('Please give parent path or file path.') 42 | if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)): 43 | parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) 44 | elif parent_path is None or not os.path.exists(parent_path): 45 | splitByLineCount(dataset_path, sample_num, sub_dir_name) 46 | parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) 47 | split_file_name = os.listdir(parent_path) 48 | split_file_name.sort() 49 | split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt'] 50 | return split_file_list 51 | 52 | 53 | def get_fea_map(fea_map_path=None, split_file_list=None): 54 | """Get feature map. 55 | Note: Either parent_path or dataset_path must be valid. 56 | If exists dir(split_file_list[0]) + "/fea_map.pkl", fea_map_path is valid. 57 | If fea_map_path is None and you want to build the feature map, 58 | the default file path is the parent directory of split file + "fea_map.pkl". 59 | Args: 60 | :param fea_map_path: A string. 61 | :param split_file_list: A list. [file1_path, file2_path, ...] 62 | :return: A dict. 
{'C1':{}, 'C2':{}, ...} 63 | """ 64 | if fea_map_path is None and split_file_list is None: 65 | raise ValueError('Please give feature map path or split file list.') 66 | if fea_map_path is None and os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl"): 67 | fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") 68 | if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl': 69 | with open(fea_map_path, 'rb') as f: 70 | fea_map = pickle.load(f) 71 | return fea_map 72 | fea_map = {} 73 | for file in tqdm(split_file_list): 74 | f = open(file) 75 | for line in f: 76 | row = line.strip('\n').split('\t') 77 | for i in range(14, 40): 78 | if row[i] == '': 79 | continue 80 | name = NAMES[i] 81 | fea_map.setdefault(name, {}) 82 | if fea_map[name].get(row[i]) is None: 83 | fea_map[name][row[i]] = len(fea_map[name]) 84 | for j in range(1, 14): 85 | if row[j] == '': 86 | continue 87 | name = NAMES[j] 88 | fea_map.setdefault(name, {}) 89 | fea_map[name].setdefault('min', float(row[j])) 90 | fea_map[name].setdefault('max', float(row[j])) 91 | fea_map[name]['min'] = min(fea_map[name]['min'], float(row[j])) 92 | fea_map[name]['max'] = max(fea_map[name]['max'], float(row[j])) 93 | f.close() 94 | for i in range(14, 40): 95 | fea_map[NAMES[i]]['-1'] = len(fea_map[NAMES[i]]) 96 | fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") 97 | with open(fea_map_path, 'wb') as f: 98 | pickle.dump(fea_map, f, pickle.HIGHEST_PROTOCOL) 99 | f.close() 100 | return fea_map 101 | 102 | 103 | def create_criteo_dataset(file, fea_map, embed_dim=8): 104 | """Load one split file data. 105 | Note: fea_map dict must be available. 106 | Args: 107 | :param file: A string. dataset's path. 108 | :param fea_map: A dict. {'C1':{}, 'C2':{}, ...}. 109 | :param embed_dim: the embedding dimension of sparse features. 110 | :return: feature columns such as [sparseFeature1, sparseFeature2, ...], and 111 | data such as ({'C1': [...], 'C2': [...]]}, [1, 0, 1, ...]). 112 | """ 113 | data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES) 114 | 115 | sparse_features = ['C' + str(i) for i in range(1, 27)] 116 | dense_features = ['I' + str(i) for i in range(1, 14)] 117 | features = sparse_features + dense_features 118 | 119 | data_df[sparse_features] = data_df[sparse_features].fillna('-1') 120 | data_df[dense_features] = data_df[dense_features].fillna(0) 121 | # map 122 | for col in sparse_features: 123 | data_df[col] = data_df[col].map(lambda x: fea_map[col][x]) 124 | # Bin continuous data into intervals. 125 | data_df[dense_features] = recKBinsDiscretizer(data_df[dense_features], 1000, fea_map) 126 | 127 | feature_columns = [sparseFeature(feat, len(fea_map[feat]) + 1, embed_dim=embed_dim) 128 | for feat in features] 129 | 130 | data_X = {feature: data_df[feature].values.astype('int32') for feature in features} 131 | data_y = data_df['label'].values.astype('int32') 132 | 133 | return feature_columns, (data_X, data_y) 134 | 135 | 136 | def create_small_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2): 137 | """Load small criteo data(sample num) without splitting "train.txt". 138 | Note: If you want to load all data in the memory, please set "read_part" to False. 139 | Args: 140 | :param file: A string. dataset's path. 141 | :param embed_dim: A scalar. the embedding dimension of sparse features. 142 | :param read_part: A boolean. whether to read part of it. 143 | :param sample_num: A scalar. the number of instances if read_part is True. 
144 | :param test_size: A scalar(float). ratio of test dataset. 145 | :return: feature columns such as [sparseFeature1, sparseFeature2, ...], 146 | train, such as ({'C1': [...], 'C2': [...]]}, [1, 0, 1, ...]) 147 | and test ({'C1': [...], 'C2': [...]]}, [1, 0, 1, ...]). 148 | """ 149 | if read_part: 150 | data_df = pd.read_csv(file, sep='\t', iterator=True, header=None, 151 | names=NAMES) 152 | data_df = data_df.get_chunk(sample_num) 153 | else: 154 | data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES) 155 | 156 | sparse_features = ['C' + str(i) for i in range(1, 27)] 157 | dense_features = ['I' + str(i) for i in range(1, 14)] 158 | features = sparse_features + dense_features 159 | 160 | data_df[sparse_features] = data_df[sparse_features].fillna('-1') 161 | data_df[dense_features] = data_df[dense_features].fillna(0) 162 | 163 | est = KBinsDiscretizer(n_bins=1000, encode='ordinal', strategy='uniform') 164 | data_df[dense_features] = est.fit_transform(data_df[dense_features]) 165 | 166 | for feat in sparse_features: 167 | le = LabelEncoder() 168 | data_df[feat] = le.fit_transform(data_df[feat]) 169 | 170 | feature_columns = [sparseFeature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim) 171 | for feat in features] 172 | train, test = train_test_split(data_df, test_size=test_size) 173 | 174 | train_X = {feature: train[feature].values.astype('int32') for feature in features} 175 | train_y = train['label'].values.astype('int32') 176 | test_X = {feature: test[feature].values.astype('int32') for feature in features} 177 | test_y = test['label'].values.astype('int32') 178 | 179 | return feature_columns, (train_X, train_y), (test_X, test_y) -------------------------------------------------------------------------------- /reclearn/data/datasets/games.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 23, 2021 3 | Amazon Games Dataset. 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import os 7 | import random 8 | import numpy as np 9 | import pandas as pd 10 | import tensorflow as tf 11 | from tqdm import tqdm 12 | from collections import defaultdict 13 | 14 | 15 | # general recommendation 16 | def split_data(file_path): 17 | """split amazon games for general recommendation 18 | Args: 19 | :param file_path: A string. The file path of 'ratings.dat'. 
20 | :return: train_path, val_path, test_path, meta_path 21 | """ 22 | dst_path = os.path.dirname(file_path) 23 | train_path = os.path.join(dst_path, "games_train.txt") 24 | val_path = os.path.join(dst_path, "games_val.txt") 25 | test_path = os.path.join(dst_path, "games_test.txt") 26 | meta_path = os.path.join(dst_path, "games_meta.txt") 27 | users, items = set(), dict() 28 | user_idx, item_idx = 1, 1 29 | history = {} 30 | with open(file_path, 'r') as f: 31 | lines = f.readlines() 32 | for line in tqdm(lines): 33 | user, item, score, timestamp = line.strip().split(",") 34 | users.add(user) 35 | if items.get(item) is None: 36 | items[item] = str(item_idx) 37 | item_idx += 1 38 | history.setdefault(user, []) 39 | history[user].append([items[item], timestamp]) 40 | with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3: 41 | for user in users: 42 | hist = history[user] 43 | if len(hist) < 4: 44 | continue 45 | hist.sort(key=lambda x: x[1]) 46 | for idx, value in enumerate(hist): 47 | if idx == len(hist) - 1: 48 | f3.write(str(user_idx) + '\t' + value[0] + '\n') 49 | elif idx == len(hist) - 2: 50 | f2.write(str(user_idx) + '\t' + value[0] + '\n') 51 | else: 52 | f1.write(str(user_idx) + '\t' + value[0] + '\n') 53 | user_idx += 1 54 | with open(meta_path, 'w') as f: 55 | f.write(str(user_idx - 1) + '\t' + str(item_idx - 1)) 56 | return train_path, val_path, test_path, meta_path 57 | 58 | 59 | # sequence recommendation 60 | def split_seq_data(file_path): 61 | """split amazon games for sequence recommendation 62 | Args: 63 | :param file_path: A string. The file path of 'ratings_Beauty.dat'. 64 | :return: train_path, val_path, test_path, meta_path 65 | """ 66 | dst_path = os.path.dirname(file_path) 67 | train_path = os.path.join(dst_path, "games_seq_train.txt") 68 | val_path = os.path.join(dst_path, "games_seq_val.txt") 69 | test_path = os.path.join(dst_path, "games_seq_test.txt") 70 | meta_path = os.path.join(dst_path, "games_seq_meta.txt") 71 | users, items = set(), dict() 72 | user_idx, item_idx = 1, 1 73 | history = {} 74 | with open(file_path, 'r') as f: 75 | lines = f.readlines() 76 | for line in tqdm(lines): 77 | user, item, score, timestamp = line.strip().split(",") 78 | users.add(user) 79 | if items.get(item) is None: 80 | items[item] = str(item_idx) 81 | item_idx += 1 82 | history.setdefault(user, []) 83 | history[user].append([items[item], timestamp]) 84 | with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3: 85 | for user in users: 86 | hist_u = history[user] 87 | if len(hist_u) < 4: 88 | continue 89 | hist_u.sort(key=lambda x: x[1]) 90 | hist = [x[0] for x in hist_u] 91 | time = [x[1] for x in hist_u] 92 | f1.write(str(user_idx) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + '\n') 93 | f2.write(str(user_idx) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + "\t" + hist[-2] + '\n') 94 | f3.write(str(user_idx) + "\t" + ' '.join(hist[:-1]) + "\t" + ' '.join(time[:-1]) + "\t" + hist[-1] + '\n') 95 | user_idx += 1 96 | with open(meta_path, 'w') as f: 97 | f.write(str(user_idx - 1) + '\t' + str(item_idx - 1)) 98 | return train_path, val_path, test_path, meta_path 99 | 100 | 101 | def load_data(file_path, neg_num, max_item_num): 102 | """load amazon beauty dataset. 103 | Args: 104 | :param file_path: A string. The file path. 105 | :param neg_num: A scalar(int). The negative num of one sample. 106 | :param max_item_num: A scalar(int). The max index of item. 107 | :return: A dict. data. 
108 | """ 109 | data = np.array(pd.read_csv(file_path, delimiter='\t')) 110 | np.random.shuffle(data) 111 | neg_items = [] 112 | for i in tqdm(range(len(data))): 113 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 114 | neg_items.append(neg_item) 115 | return {'user': data[:, 0].astype(int), 'pos_item': data[:, 1].astype(int), 'neg_item': np.array(neg_items)} 116 | 117 | 118 | def load_seq_data(file_path, mode, seq_len, neg_num, max_item_num, contain_user=False, contain_time=False): 119 | """load amazon games sequence dataset. 120 | Args: 121 | :param file_path: A string. The file path. 122 | :param mode: A string. "train", "val" or "test". 123 | :param seq_len: A scalar(int). The length of sequence. 124 | :param neg_num: A scalar(int). The negative num of one sample. 125 | :param max_item_num: A scalar(int). The max index of item. 126 | :param contain_user: A boolean. Whether including user'id input or not. 127 | :param contain_time: A boolean. Whether including time sequence input or not. 128 | :return: A dict. data. 129 | """ 130 | users, click_seqs, time_seqs, pos_items, neg_items = [], [], [], [], [] 131 | with open(file_path) as f: 132 | lines = f.readlines() 133 | for line in tqdm(lines): 134 | if mode == "train": 135 | user, click_seq, time_seq = line.split('\t') 136 | click_seq = click_seq.split(' ') 137 | click_seq = [int(x) for x in click_seq] 138 | time_seq = time_seq.split(' ') 139 | time_seq = [x for x in time_seq] 140 | for i in range(len(click_seq)-1): 141 | if i + 1 >= seq_len: 142 | tmp = click_seq[i + 1 - seq_len:i + 1] 143 | tmp2 = time_seq[i + 1 - seq_len:i + 1] 144 | else: 145 | tmp = [0] * (seq_len-i-1) + click_seq[:i + 1] 146 | tmp2 = [0] * (seq_len - i - 1) + time_seq[:i + 1] 147 | 148 | # gen_neg = _gen_negative_samples(neg_num, click_seq, max_item_num) 149 | # neg_item = [neg_item for neg_item in gen_neg] 150 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 151 | users.append(int(user)) 152 | click_seqs.append(tmp) 153 | time_seqs.append(tmp2) 154 | pos_items.append(click_seq[i + 1]) 155 | neg_items.append(neg_item) 156 | else: 157 | user, click_seq, time_seq, pos_item = line.split('\t') 158 | click_seq = click_seq.split(' ') 159 | click_seq = [int(x) for x in click_seq] 160 | time_seq = time_seq.split(' ') 161 | time_seq = [x for x in time_seq] 162 | if len(click_seq) >= seq_len: 163 | tmp = click_seq[len(click_seq) - seq_len:] 164 | tmp2 = time_seq[len(time_seq) - seq_len:] 165 | else: 166 | tmp = [0] * (seq_len - len(click_seq)) + click_seq[:] 167 | tmp2 = [0] * (seq_len - len(time_seq)) + time_seq[:] 168 | users.append(int(user)) 169 | # gen_neg = _gen_negative_samples(neg_num, click_seq, max_item_num) 170 | # neg_item = [neg_item for neg_item in gen_neg] 171 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 172 | click_seqs.append(tmp) 173 | time_seqs.append(tmp2) 174 | pos_items.append(int(pos_item)) 175 | neg_items.append(neg_item) 176 | data = list(zip(users, click_seqs, time_seqs, pos_items, neg_items)) 177 | random.shuffle(data) 178 | users, click_seqs, time_seqs, pos_items, neg_items = zip(*data) 179 | data = {'click_seq': np.array(click_seqs), 'pos_item': np.array(pos_items), 'neg_item': np.array(neg_items)} 180 | if contain_user: 181 | data['user'] = np.array(users) 182 | if contain_time: 183 | data['time_seq'] = np.array(click_seqs) 184 | return data 185 | -------------------------------------------------------------------------------- /reclearn/data/datasets/steam.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 24, 2021 3 | STEAM Dataset. 4 | statistics of processed data: 5 | user: 334732 6 | item: 15474 7 | interaction: 4216807 8 | @author: Ziyao Geng(zggzy1996@163.com) 9 | """ 10 | import os 11 | import random 12 | import re 13 | import pandas as pd 14 | import numpy as np 15 | from tqdm import tqdm 16 | 17 | 18 | # general recommendation 19 | def split_data(file_path): 20 | dst_path = os.path.dirname(file_path) 21 | train_path = os.path.join(dst_path, "steam_train.txt") 22 | val_path = os.path.join(dst_path, "steam_val.txt") 23 | test_path = os.path.join(dst_path, "steam_test.txt") 24 | meta_path = os.path.join(dst_path, "steam_meta.txt") 25 | with open(file_path, 'r', encoding='utf8') as f: 26 | lines = f.readlines() 27 | users, items = set(), dict() 28 | user_idx, item_idx = 1, 1 29 | history = {} 30 | for line in tqdm(lines): 31 | user = re.findall(r'u\'username\': u(\"[^\"]+\"|\'[^\']+\')', line)[0] 32 | item = re.findall(r'u\'product_id\': u\'([^\']+?)\'', line)[0] 33 | timestamp = re.findall(r'u\'date\': u\'([^\']+?)\'', line)[0] 34 | users.add(user) 35 | if items.get(item) is None: 36 | items[item] = str(item_idx) 37 | item_idx += 1 38 | history.setdefault(user, []) 39 | history[user].append([items[item], timestamp]) 40 | with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3: 41 | for user in users: 42 | hist = history[user] 43 | if len(hist) < 5: 44 | continue 45 | hist.sort(key=lambda x: x[1]) 46 | for idx, value in enumerate(hist): 47 | if idx == len(hist) - 1: 48 | f3.write(str(user_idx) + '\t' + value[0] + '\n') 49 | elif idx == len(hist) - 2: 50 | f2.write(str(user_idx) + '\t' + value[0] + '\n') 51 | else: 52 | f1.write(str(user_idx) + '\t' + value[0] + '\n') 53 | user_idx += 1 54 | with open(meta_path, 'w') as f: 55 | f.write(str(user_idx - 1) + '\t' + str(item_idx - 1)) 56 | return train_path, val_path, test_path, meta_path 57 | 58 | 59 | # sequence recommendation 60 | def split_seq_data(file_path): 61 | """split STEAM for sequence recommendation 62 | Args: 63 | :param file_path: A string. The file path of 'ratings_Beauty.dat'. 
64 | :return: train_path, val_path, test_path, meta_path 65 | """ 66 | dst_path = os.path.dirname(file_path) 67 | train_path = os.path.join(dst_path, "steam_seq_train.txt") 68 | val_path = os.path.join(dst_path, "steam_seq_val.txt") 69 | test_path = os.path.join(dst_path, "steam_seq_test.txt") 70 | meta_path = os.path.join(dst_path, "steam_seq_meta.txt") 71 | with open(file_path, 'r', encoding='utf8') as f: 72 | lines = f.readlines() 73 | users, items = set(), dict() 74 | user_idx, item_idx = 1, 1 75 | history = {} 76 | for line in tqdm(lines): 77 | user = re.findall(r'u\'username\': u(\"[^\"]+\"|\'[^\']+\')', line)[0] 78 | item = re.findall(r'u\'product_id\': u\'([^\']+?)\'', line)[0] 79 | timestamp = re.findall(r'u\'date\': u\'([^\']+?)\'', line)[0] 80 | users.add(user) 81 | if items.get(item) is None: 82 | items[item] = str(item_idx) 83 | item_idx += 1 84 | history.setdefault(user, []) 85 | history[user].append([items[item], timestamp]) 86 | with open(train_path, 'w') as f1, open(val_path, 'w') as f2, open(test_path, 'w') as f3: 87 | for user in users: 88 | hist_u = history[user] 89 | if len(hist_u) < 5: 90 | continue 91 | hist_u.sort(key=lambda x: x[1]) 92 | hist = [x[0] for x in hist_u] 93 | time = [x[1] for x in hist_u] 94 | f1.write(str(user_idx) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + '\n') 95 | f2.write(str(user_idx) + "\t" + ' '.join(hist[:-2]) + "\t" + ' '.join(time[:-2]) + "\t" + hist[-2] + '\n') 96 | f3.write(str(user_idx) + "\t" + ' '.join(hist[:-1]) + "\t" + ' '.join(time[:-1]) + "\t" + hist[-1] + '\n') 97 | user_idx += 1 98 | with open(meta_path, 'w') as f: 99 | f.write(str(user_idx - 1) + '\t' + str(item_idx - 1)) 100 | return train_path, val_path, test_path, meta_path 101 | 102 | 103 | def load_data(file_path, neg_num, max_item_num): 104 | """load steam dataset. 105 | Args: 106 | :param file_path: A string. The file path. 107 | :param neg_num: A scalar(int). The negative num of one sample. 108 | :param max_item_num: A scalar(int). The max index of item. 109 | :return: A dict. data. 110 | """ 111 | data = np.array(pd.read_csv(file_path, delimiter='\t')) 112 | np.random.shuffle(data) 113 | neg_items = [] 114 | for i in tqdm(range(len(data))): 115 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 116 | neg_items.append(neg_item) 117 | return {'user': data[:, 0].astype(int), 'pos_item': data[:, 1].astype(int), 'neg_item': np.array(neg_items)} 118 | 119 | 120 | def load_seq_data(file_path, mode, seq_len, neg_num, max_item_num, contain_user=False, contain_time=False): 121 | """load STEAM sequence dataset. 122 | Args: 123 | :param file_path: A string. The file path. 124 | :param mode: A string. "train", "val" or "test". 125 | :param seq_len: A scalar(int). The length of sequence. 126 | :param neg_num: A scalar(int). The negative num of one sample. 127 | :param max_item_num: A scalar(int). The max index of item. 128 | :param contain_user: A boolean. Whether including user'id input or not. 129 | :param contain_time: A boolean. Whether including time sequence input or not. 130 | :return: A dict. data. 
131 | """ 132 | users, click_seqs, time_seqs, pos_items, neg_items = [], [], [], [], [] 133 | with open(file_path) as f: 134 | lines = f.readlines() 135 | for line in tqdm(lines): 136 | if mode == "train": 137 | user, click_seq, time_seq = line.split('\t') 138 | click_seq = click_seq.split(' ') 139 | click_seq = [int(x) for x in click_seq] 140 | time_seq = time_seq.split(' ') 141 | time_seq = [x for x in time_seq] 142 | for i in range(len(click_seq)-1): 143 | if i + 1 >= seq_len: 144 | tmp = click_seq[i + 1 - seq_len:i + 1] 145 | tmp2 = time_seq[i + 1 - seq_len:i + 1] 146 | else: 147 | tmp = [0] * (seq_len-i-1) + click_seq[:i + 1] 148 | tmp2 = [0] * (seq_len - i - 1) + time_seq[:i + 1] 149 | 150 | # gen_neg = _gen_negative_samples(neg_num, click_seq, max_item_num) 151 | # neg_item = [neg_item for neg_item in gen_neg] 152 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 153 | users.append(int(user)) 154 | click_seqs.append(tmp) 155 | time_seqs.append(tmp2) 156 | pos_items.append(click_seq[i + 1]) 157 | neg_items.append(neg_item) 158 | else: 159 | user, click_seq, time_seq, pos_item = line.split('\t') 160 | click_seq = click_seq.split(' ') 161 | click_seq = [int(x) for x in click_seq] 162 | time_seq = time_seq.split(' ') 163 | time_seq = [x for x in time_seq] 164 | if len(click_seq) >= seq_len: 165 | tmp = click_seq[len(click_seq) - seq_len:] 166 | tmp2 = time_seq[len(time_seq) - seq_len:] 167 | else: 168 | tmp = [0] * (seq_len - len(click_seq)) + click_seq[:] 169 | tmp2 = [0] * (seq_len - len(time_seq)) + time_seq[:] 170 | users.append(int(user)) 171 | # gen_neg = _gen_negative_samples(neg_num, click_seq, max_item_num) 172 | # neg_item = [neg_item for neg_item in gen_neg] 173 | neg_item = [random.randint(1, max_item_num) for _ in range(neg_num)] 174 | click_seqs.append(tmp) 175 | time_seqs.append(tmp2) 176 | pos_items.append(int(pos_item)) 177 | neg_items.append(neg_item) 178 | data = list(zip(users, click_seqs, time_seqs, pos_items, neg_items)) 179 | random.shuffle(data) 180 | users, click_seqs, time_seqs, pos_items, neg_items = zip(*data) 181 | data = {'click_seq': np.array(click_seqs), 'pos_item': np.array(pos_items), 'neg_item': np.array(neg_items)} 182 | if contain_user: 183 | data['user'] = np.array(users) 184 | if contain_time: 185 | data['time_seq'] = np.array(click_seqs) 186 | return data 187 | -------------------------------------------------------------------------------- /reclearn/data/feature_column.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on May 18, 2021 3 | input feature columns: sparseFeature, denseFeature 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | 7 | 8 | def sparseFeature(feat_name, feat_num, embed_dim=4): 9 | """Create dictionary for sparse feature. 10 | Args: 11 | :param feat_name: A string. feature name. 12 | :param feat_num: A scalar(int). The total number of sparse features that do not repeat. 13 | :param embed_dim: A scalar(int). embedding dimension for this feature. 14 | :return: 15 | """ 16 | return {'feat_name': feat_name, 'feat_num': feat_num, 'embed_dim': embed_dim} 17 | 18 | 19 | def denseFeature(feat_name): 20 | """Create dictionary for dense feature. 21 | Args: 22 | :param feat_name: A string. dense feature name. 
23 | :return: 24 | """ 25 | return {'feat_name': feat_name} 26 | -------------------------------------------------------------------------------- /reclearn/data/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 13, 2020 3 | Updated on May 18, 2021 4 | Some functions. 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import os 8 | import time 9 | import numpy as np 10 | 11 | 12 | def mkSubFile(lines, head, srcName, sub_dir_name, sub): 13 | """Write sub-data. 14 | Args: 15 | :param lines: A list. Several pieces of data. 16 | :param head: A string. ['label', 'I1', 'I2', ...]. 17 | :param srcName: A string. The name of data. 18 | :param sub_dir_name: A string. 19 | :param sub: A scalar(Int). Record the current number of sub file. 20 | :return: sub + 1. 21 | """ 22 | root_path, file = os.path.split(srcName) 23 | file_name, suffix = file.split('.') 24 | split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix 25 | split_file = os.path.join(root_path, sub_dir_name, split_file_name) 26 | if not os.path.exists(os.path.join(root_path, sub_dir_name)): 27 | os.mkdir(os.path.join(root_path, sub_dir_name)) 28 | print('make file: %s' % split_file) 29 | f = open(split_file, 'w') 30 | try: 31 | f.writelines([head]) 32 | f.writelines(lines) 33 | return sub + 1 34 | finally: 35 | f.close() 36 | 37 | 38 | def splitByLineCount(filename, count, sub_dir_name): 39 | """Split File. 40 | Note: You can specify how many rows of data each sub file contains. 41 | Args: 42 | :param filename: A string. 43 | :param count: A scalar(int). 44 | :param sub_dir_name: A string. 45 | :return: 46 | """ 47 | f = open(filename, 'r') 48 | try: 49 | head = f.readline() 50 | buf = [] 51 | sub = 1 52 | for line in f: 53 | buf.append(line) 54 | if len(buf) == count: 55 | sub = mkSubFile(buf, head, filename, sub_dir_name, sub) 56 | buf = [] 57 | if len(buf) != 0: 58 | mkSubFile(buf, head, filename, sub_dir_name, sub) 59 | finally: 60 | f.close() 61 | 62 | 63 | def recKBinsDiscretizer(data_df, n_bins, min_max_dict): 64 | """Bin continuous data into intervals. 65 | Note: The strategy is "uniform". 66 | Args: 67 | :param data_df: A dataframe. 68 | :param n_bins: A scalar(int). 69 | :param min_max_dict: A dict such as {'min': , 'max': }. 70 | :return: The new dataframe. 71 | """ 72 | features = data_df.columns 73 | n_features = len(features) 74 | bin_edges = np.zeros(n_features, dtype=object) 75 | for idx, feature in enumerate(features): 76 | bin_edges[idx] = np.linspace(min_max_dict[feature]['min'], min_max_dict[feature]['max'], n_bins + 1) 77 | rtol = 1.e-5 78 | atol = 1.e-8 79 | eps = atol + rtol * np.abs(data_df[feature]) 80 | np.digitize(data_df[feature] + eps, bin_edges[idx][1:]) 81 | return data_df -------------------------------------------------------------------------------- /reclearn/evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | from reclearn.evaluator.evaluator import * -------------------------------------------------------------------------------- /reclearn/evaluator/evaluator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | Evaluate Functions. 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | from reclearn.evaluator.metrics import * 7 | 8 | 9 | def eval_pos_neg(model, test_data, metric_names, k=10, batch_size=None): 10 | """Evaluate the performance of Top-k recommendation algorithm. 
11 | Note: Test data must contain some negative samples(>= k) and one positive samples. 12 | Args: 13 | :param model: A model built-by tensorflow. 14 | :param test_data: A dict. 15 | :param metric_names: A list like ['hr']. 16 | :param k: A scalar(int). 17 | :param batch_size: A scalar(int). 18 | :return: A result dict such as {'hr':, 'ndcg':, ...} 19 | """ 20 | pred_y = - model.predict(test_data, batch_size) 21 | return eval_rank(pred_y, metric_names, k) 22 | 23 | 24 | def eval_rank(pred_y, metric_names, k=10): 25 | """Evaluate 26 | Args: 27 | :param pred_y: A ndarray. 28 | :param metric_names: A list like ['hr']. 29 | :param k: A scalar(int). 30 | :return: A result dict such as {'hr':, 'ndcg':, ...} 31 | """ 32 | rank = pred_y.argsort().argsort()[:, 0] 33 | res_dict = {} 34 | for name in metric_names: 35 | if name == 'hr': 36 | res = hr(rank, k) 37 | elif name == 'ndcg': 38 | res = ndcg(rank, k) 39 | elif name == 'mrr': 40 | res = mrr(rank, k) 41 | else: 42 | break 43 | res_dict[name] = res 44 | return res_dict -------------------------------------------------------------------------------- /reclearn/evaluator/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | @author: Ziyao Geng(zggzy1996@163.com) 4 | """ 5 | import numpy as np 6 | 7 | 8 | def hr(rank, k): 9 | """Hit Rate. 10 | Args: 11 | :param rank: A list. 12 | :param k: A scalar(int). 13 | :return: hit rate. 14 | """ 15 | res = 0.0 16 | for r in rank: 17 | if r < k: 18 | res += 1 19 | return res / len(rank) 20 | 21 | 22 | def mrr(rank, k): 23 | """Mean Reciprocal Rank. 24 | Args: 25 | :param rank: A list. 26 | :param k: A scalar(int). 27 | :return: mrr. 28 | """ 29 | mrr = 0.0 30 | for r in rank: 31 | if r < k: 32 | mrr += 1 / (r + 1) 33 | return mrr / len(rank) 34 | 35 | 36 | def ndcg(rank, k): 37 | """Normalized Discounted Cumulative Gain. 38 | Args: 39 | :param rank: A list. 40 | :param k: A scalar(int). 41 | :return: ndcg. 42 | """ 43 | res = 0.0 44 | for r in rank: 45 | if r < k: 46 | res += 1 / np.log2(r + 2) 47 | return res / len(rank) 48 | -------------------------------------------------------------------------------- /reclearn/layers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This library provides a set of high-level neural networks layers. 3 | """ 4 | 5 | from reclearn.layers.core import Linear 6 | from reclearn.layers.core import MLP 7 | from reclearn.layers.core import TransformerEncoder 8 | from reclearn.layers.core import SelfAttention 9 | from reclearn.layers.core import FM_Layer 10 | from reclearn.layers.core import FFM_Layer 11 | from reclearn.layers.core import Residual_Units 12 | from reclearn.layers.core import CrossNetwork 13 | from reclearn.layers.core import New_FM 14 | from reclearn.layers.core import CIN 15 | from reclearn.layers.core import LBA 16 | from reclearn.layers.core import Item_similarity_gating -------------------------------------------------------------------------------- /reclearn/layers/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def scaled_dot_product_attention(q, k, v, mask): 5 | """Attention Mechanism Function. 
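    Computes softmax(mask(Q·K^T / sqrt(d_k))) · V, where masked positions are filled with a large
    negative constant before the softmax so they receive (almost) zero attention weight.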
6 | Args: 7 | :param q: A 3d/4d tensor with shape of (None, ..., seq_len, dim) 8 | :param k: A 3d/4d tensor with shape of (None, ..., seq_len, dim) 9 | :param v: A 3d/4d tensor with shape of (None, ..., seq_len, dim) 10 | :param mask: A 3d/4d tensor with shape of (None, ..., seq_len, 1) 11 | :return: 12 | """ 13 | mat_qk = tf.matmul(q, k, transpose_b=True) # (None, seq_len, seq_len) 14 | # Scaled 15 | dk = tf.cast(k.shape[-1], dtype=tf.float32) 16 | scaled_att_logits = mat_qk / tf.sqrt(dk) 17 | 18 | paddings = tf.ones_like(scaled_att_logits) * (-2 ** 32 + 1) # (None, seq_len, seq_len) 19 | outputs = tf.where(tf.equal(mask, tf.zeros_like(mask)), paddings, scaled_att_logits) # (None, seq_len, seq_len) 20 | # softmax 21 | outputs = tf.nn.softmax(logits=outputs) # (None, seq_len, seq_len) 22 | outputs = tf.matmul(outputs, v) # (None, seq_len, dim) 23 | 24 | return outputs 25 | 26 | 27 | def split_heads(x, seq_len, num_heads, depth): 28 | """Split the last dimension into (num_heads, depth). 29 | Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth) 30 | Args: 31 | :param x: A Tensor with shape of [batch_size, seq_len, num_heads * depth] 32 | :param seq_len: A scalar(int). 33 | :param num_heads: A scalar(int). 34 | :param depth: A scalar(int). 35 | :return: A tensor with shape of [batch_size, num_heads, seq_len, depth] 36 | """ 37 | x = tf.reshape(x, (-1, seq_len, num_heads, depth)) 38 | return tf.transpose(x, perm=[0, 2, 1, 3]) 39 | 40 | 41 | def index_mapping(inputs_dict, map_dict): 42 | """Feature index mapping 43 | Args: 44 | :param inputs_dict: A dict such as {'I1': [], 'I2': [], ...} 45 | :param map_dict: A dict such as {'I1': 0, 'I2': 100, ...} 46 | :return: new inputs tensor. 47 | """ 48 | outputs_dict = {} 49 | for key, value in inputs_dict.items(): 50 | if map_dict.get(key) is None: 51 | raise ValueError("map dict error!") 52 | outputs_dict[key] = tf.reshape(value + tf.convert_to_tensor(map_dict[key]), [-1, 1]) 53 | return outputs_dict -------------------------------------------------------------------------------- /reclearn/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZiyaoGeng/RecLearn/4bbfb492b872c5a3290a2bce1ed5c160162558a3/reclearn/models/__init__.py -------------------------------------------------------------------------------- /reclearn/models/losses.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 14, 2021 3 | Loss function. 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | 8 | 9 | def get_loss(pos_scores, neg_scores, loss_name, gamma=None): 10 | """Get loss scores. 11 | Args: 12 | :param pos_scores: A tensor with shape of [batch_size, 1]. 13 | :param neg_scores: A tensor with shape of [batch_size, neg_num]. 14 | :param loss_name: A string such as 'bpr_loss', 'hing_loss' and etc. 15 | :param gamma: A scalar(int). If loss_name == 'hinge_loss', the gamma must be valid. 16 | :return: 17 | """ 18 | pos_scores = tf.tile(pos_scores, [1, neg_scores.shape[1]]) 19 | if loss_name == 'bpr_loss': 20 | loss = bpr_loss(pos_scores, neg_scores) 21 | elif loss_name == 'hinge_loss': 22 | loss = hinge_loss(pos_scores, neg_scores, gamma) 23 | else: 24 | loss = binary_cross_entropy_loss(pos_scores, neg_scores) 25 | return loss 26 | 27 | 28 | def bpr_loss(pos_scores, neg_scores): 29 | """bpr loss. 30 | Args: 31 | :param pos_scores: A tensor with shape of [batch_size, 1]. 
32 | :param neg_scores: A tensor with shape of [batch_size, neg_num]. 33 | :return: 34 | """ 35 | loss = tf.reduce_mean(-tf.math.log(tf.nn.sigmoid(pos_scores - neg_scores))) 36 | return loss 37 | 38 | 39 | def hinge_loss(pos_scores, neg_scores, gamma=0.5): 40 | """hinge loss. 41 | Args: 42 | :param pos_scores: A tensor with shape of [batch_size, neg_num]. 43 | :param neg_scores: A tensor with shape of [batch_size, neg_num]. 44 | :param gamma: A scalar(int). 45 | :return: 46 | """ 47 | loss = tf.reduce_mean(tf.nn.relu(neg_scores - pos_scores + gamma)) 48 | return loss 49 | 50 | 51 | def binary_cross_entropy_loss(pos_scores, neg_scores): 52 | """binary cross entropy loss. 53 | Args: 54 | :param pos_scores: A tensor with shape of [batch_size, neg_num]. 55 | :param neg_scores: A tensor with shape of [batch_size, neg_num]. 56 | :return: 57 | """ 58 | loss = tf.reduce_mean(- tf.math.log(tf.nn.sigmoid(pos_scores)) - tf.math.log(1 - tf.nn.sigmoid(neg_scores))) / 2 59 | return loss -------------------------------------------------------------------------------- /reclearn/models/matching/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from reclearn.models.matching.poprec import PopRec 3 | from reclearn.models.matching.bpr import BPR 4 | from reclearn.models.matching.ncf import NCF 5 | from reclearn.models.matching.dssm import DSSM 6 | from reclearn.models.matching.youtubednn import YoutubeDNN 7 | from reclearn.models.matching.gru4rec import GRU4Rec 8 | from reclearn.models.matching.sasrec import SASRec 9 | from reclearn.models.matching.attrec import AttRec 10 | from reclearn.models.matching.caser import Caser 11 | from reclearn.models.matching.fissa import FISSA 12 | from reclearn.models.matching.mind import MIND 13 | 14 | 15 | __all__ = ['PopRec', 'BPR', 'NCF', 'DSSM', 'YoutubeDNN', 'GRU4Rec', 'SASRec', 'AttRec', 'Caser', 'FISSA', 'MIND'] -------------------------------------------------------------------------------- /reclearn/models/matching/attrec.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 10, 2020 3 | Updated on Apr 23, 2022 4 | Reference: "Next Item Recommendation with Self-Attentive Metric Learning", AAAI, 2019 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras import Model 9 | from tensorflow.keras.layers import Embedding, Input 10 | from tensorflow.keras.regularizers import l2 11 | 12 | from reclearn.layers import SelfAttention 13 | from reclearn.models.losses import get_loss 14 | 15 | 16 | class AttRec(Model): 17 | def __init__(self, user_num, item_num, embed_dim, mode='inner', w=0.5, use_l2norm=False, 18 | loss_name="hinge_loss", gamma=0.5, embed_reg=0., seed=None): 19 | """AttRec, Sequential Recommendation Model. 20 | Args: 21 | :param user_num: An integer type. The largest user index + 1. 22 | :param item_num: An integer type. The largest item index + 1. 23 | :param embed_dim: An integer type. Embedding dimension of user vector and item vector. 24 | :param mode: A string. inner or dist. 25 | :param w: A float type. The weight of short interest. 26 | :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not. 27 | :param loss_name: A string. You can specify the current point-loss function 'binary_cross_entropy_loss' or 28 | pair-loss function as 'bpr_loss'、'hinge_loss'. 29 | :param gamma: A float type. If hinge_loss is selected as the loss function, you can specify the margin. 
30 | :param embed_reg: A float type. The regularizer of embedding. 31 | :param seed: A Python integer to use as random seed. 32 | """ 33 | super(AttRec, self).__init__() 34 | # user embedding 35 | self.user_embedding = Embedding(input_dim=user_num, 36 | input_length=1, 37 | output_dim=embed_dim, 38 | embeddings_initializer='random_normal', 39 | embeddings_regularizer=l2(embed_reg)) 40 | # item embedding 41 | self.item_embedding = Embedding(input_dim=item_num, 42 | input_length=1, 43 | output_dim=embed_dim, 44 | embeddings_initializer='random_normal', 45 | embeddings_regularizer=l2(embed_reg)) 46 | # item2 embedding, not share embedding 47 | self.item2_embedding = Embedding(input_dim=item_num, 48 | input_length=1, 49 | output_dim=embed_dim, 50 | embeddings_initializer='random_normal', 51 | embeddings_regularizer=l2(embed_reg)) 52 | # self-attention 53 | self.self_attention = SelfAttention() 54 | # w 55 | self.w = w 56 | # mode 57 | self.mode = mode 58 | # norm 59 | self.use_l2norm = use_l2norm 60 | # loss name 61 | self.loss_name = loss_name 62 | self.gamma = gamma 63 | # seed 64 | tf.random.set_seed(seed) 65 | 66 | def call(self, inputs): 67 | # user info 68 | user_embed = self.user_embedding(tf.reshape(inputs['user'], [-1, ])) # (None, embed_dim) 69 | # mask 70 | mask = tf.expand_dims(tf.cast(tf.not_equal(inputs['click_seq'], 0), dtype=tf.float32), axis=-1) # (None, seq_len, 1) 71 | # seq info 72 | seq_embed = self.item_embedding(inputs['click_seq']) # (None, seq_len, embed_dim) 73 | seq_embed *= mask 74 | # short-term interest 75 | short_interest = self.self_attention([seq_embed, seq_embed, seq_embed, mask]) # (None, dim) 76 | # item 77 | pos_embed = self.item_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, embed_dim) 78 | neg_embed = self.item_embedding(inputs['neg_item']) # (None, neg_num, embed_dim) 79 | # item2 embed 80 | pos_embed2 = self.item2_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, embed_dim) 81 | neg_embed2 = self.item2_embedding(inputs['neg_item']) # (None, neg_num, embed_dim) 82 | # mode 83 | if self.mode == 'inner': 84 | if self.use_l2norm: 85 | user_embed = tf.math.l2_normalize(user_embed, axis=-1) 86 | pos_embed = tf.math.l2_normalize(pos_embed, axis=-1) 87 | neg_embed = tf.math.l2_normalize(neg_embed, axis=-1) 88 | pos_embed2 = tf.math.l2_normalize(pos_embed2, axis=-1) 89 | neg_embed2 = tf.math.l2_normalize(neg_embed2, axis=-1) 90 | # long-term interest, pos and neg 91 | pos_long_interest = tf.multiply(user_embed, pos_embed2) # (None, embed_dim) 92 | neg_long_interest = tf.multiply(tf.expand_dims(user_embed, axis=1), neg_embed2) # (None, neg_num, embed_dim) 93 | pos_scores = self.w * tf.reduce_sum(pos_long_interest, axis=-1, keepdims=True) \ 94 | + (1 - self.w) * tf.reduce_sum(tf.multiply(short_interest, pos_embed), axis=-1, keepdims=True) 95 | neg_scores = self.w * tf.reduce_sum(neg_long_interest, axis=-1) \ 96 | + (1 - self.w) * tf.reduce_sum(tf.multiply(tf.expand_dims(short_interest, axis=1), neg_embed), axis=-1) 97 | else: 98 | # clip by norm 99 | user_embed = tf.clip_by_norm(user_embed, 1, -1) 100 | pos_embed = tf.clip_by_norm(pos_embed, 1, -1) 101 | neg_embed = tf.clip_by_norm(neg_embed, 1, -1) 102 | pos_embed2 = tf.clip_by_norm(pos_embed2, 1, -1) 103 | neg_embed2 = tf.clip_by_norm(neg_embed2, 1, -1) 104 | # distance, long-term interest, pos and neg 105 | pos_long_interest = tf.square(user_embed - pos_embed2) # (None, embed_dim) 106 | neg_long_interest = tf.square(tf.expand_dims(user_embed, axis=1) - neg_embed2) # (None, neg_num, 
embed_dim) 107 | # combine. Here is a difference from the original paper. 108 | pos_scores = self.w * tf.reduce_sum(pos_long_interest, axis=-1, keepdims=True) + \ 109 | (1 - self.w) * tf.reduce_sum(tf.square(short_interest - pos_embed), axis=-1, keepdims=True) 110 | neg_scores = self.w * tf.reduce_sum(neg_long_interest, axis=-1) + \ 111 | (1 - self.w) * tf.reduce_sum(tf.square(tf.expand_dims(short_interest, axis=1) - neg_embed), axis=-1) 112 | self.add_loss(get_loss(pos_scores, neg_scores, self.loss_name, self.gamma)) 113 | logits = tf.concat([pos_scores, neg_scores], axis=-1) 114 | return logits 115 | 116 | def summary(self): 117 | inputs = { 118 | 'user': Input(shape=(), dtype=tf.int32), 119 | 'click_seq': Input(shape=(100,), dtype=tf.int32), # suppose sequence length=1 120 | 'pos_item': Input(shape=(), dtype=tf.int32), 121 | 'neg_item': Input(shape=(1,), dtype=tf.int32) # suppose neg_num=1 122 | } 123 | Model(inputs=inputs, outputs=self.call(inputs)).summary() 124 | -------------------------------------------------------------------------------- /reclearn/models/matching/bpr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 13, 2020 3 | Updated on Apr 9, 2022 4 | Reference: "BPR: Bayesian Personalized Ranking from Implicit Feedback", UAI, 2009 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras import Model 9 | from tensorflow.keras.layers import Embedding, Input 10 | from tensorflow.keras.regularizers import l2 11 | 12 | from reclearn.models.losses import bpr_loss 13 | 14 | 15 | class BPR(Model): 16 | def __init__(self, user_num, item_num, embed_dim, use_l2norm=False, embed_reg=0., seed=None): 17 | """Bayesian Personalized Ranking - Matrix Factorization 18 | Args: 19 | :param user_num: An integer type. The largest user index + 1. 20 | :param item_num: An integer type. The largest item index + 1. 21 | :param embed_dim: An integer type. Embedding dimension of user vector and item vector. 22 | :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not. 23 | :param embed_reg: A float type. The regularizer of embedding. 24 | :param seed: A Python integer to use as random seed. 
25 | :return: 26 | """ 27 | super(BPR, self).__init__() 28 | # user embedding 29 | self.user_embedding = Embedding(input_dim=user_num, 30 | input_length=1, 31 | output_dim=embed_dim, 32 | embeddings_initializer='random_normal', 33 | embeddings_regularizer=l2(embed_reg)) 34 | # item embedding 35 | self.item_embedding = Embedding(input_dim=item_num, 36 | input_length=1, 37 | output_dim=embed_dim, 38 | embeddings_initializer='random_normal', 39 | embeddings_regularizer=l2(embed_reg)) 40 | # norm 41 | self.use_l2norm = use_l2norm 42 | # seed 43 | tf.random.set_seed(seed) 44 | 45 | def call(self, inputs): 46 | # user info 47 | user_embed = self.user_embedding(tf.reshape(inputs['user'], [-1, ])) # (None, embed_dim) 48 | # item info 49 | pos_info = self.item_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, embed_dim) 50 | neg_info = self.item_embedding(inputs['neg_item']) # (None, neg_num, embed_dim) 51 | # norm 52 | if self.use_l2norm: 53 | pos_info = tf.math.l2_normalize(pos_info, axis=-1) 54 | neg_info = tf.math.l2_normalize(neg_info, axis=-1) 55 | user_embed = tf.math.l2_normalize(user_embed, axis=-1) 56 | # calculate positive item scores and negative item scores 57 | pos_scores = tf.reduce_sum(tf.multiply(user_embed, pos_info), axis=-1, keepdims=True) # (None, 1) 58 | neg_scores = tf.reduce_sum(tf.multiply(tf.expand_dims(user_embed, axis=1), neg_info), axis=-1) # (None, neg_num) 59 | # add loss 60 | self.add_loss(bpr_loss(pos_scores, neg_scores)) 61 | logits = tf.concat([pos_scores, neg_scores], axis=-1) 62 | return logits 63 | 64 | def get_user_vector(self, inputs): 65 | if len(inputs) < 2 and inputs.get('user') is not None: 66 | return self.user_embedding(inputs['user']) 67 | 68 | def summary(self): 69 | inputs = { 70 | 'user': Input(shape=(), dtype=tf.int32), 71 | 'pos_item': Input(shape=(), dtype=tf.int32), 72 | 'neg_item': Input(shape=(1,), dtype=tf.int32) # suppose neg_num=1 73 | } 74 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/matching/caser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 18, 2020 3 | Updated on Apr 23, 2022 4 | Reference: "Personalized Top-N Sequential Recommendation via Convolutional Sequence Embedding", WSDM, 2018 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras import Model 9 | from tensorflow.keras.layers import Embedding, Input, Conv1D, GlobalMaxPooling1D, Dense, Dropout 10 | from tensorflow.keras.regularizers import l2 11 | 12 | from reclearn.models.losses import get_loss 13 | 14 | 15 | class Caser(Model): 16 | def __init__(self, user_num, item_num, embed_dim, seq_len=100, hor_n=8, hor_h=2, ver_n=4, 17 | activation='relu', dnn_dropout=0., use_l2norm=False, 18 | loss_name="binary_cross_entropy_loss", gamma=0.5, embed_reg=0, seed=None): 19 | """Caser, Sequential Recommendation Model. 20 | Args: 21 | :param user_num: An integer type. The largest user index + 1. 22 | :param item_num: An integer type. The largest item index + 1. 23 | :param embed_dim: An integer type. Embedding dimension of user vector and item vector. 24 | :param seq_len: An integer type. The length of the input sequence. 25 | :param hor_n: An integer type. The number of horizontal filters. 26 | :param hor_h: An integer type. Height of horizontal filters. 27 | :param ver_n: An integer type. The number of vertical filters. 28 | :param activation: A string. 
Activation function name of user and item MLP layer. 29 | :param dnn_dropout: Float between 0 and 1. Dropout of user and item MLP layer. 30 | :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not. 31 | :param loss_name: A string. You can specify the current point-loss function 'binary_cross_entropy_loss' or 32 | pair-loss function as 'bpr_loss'、'hinge_loss'. 33 | :param gamma: A float type. If hinge_loss is selected as the loss function, you can specify the margin. 34 | :param embed_reg: A float type. The regularizer of embedding. 35 | :param seed: A Python integer to use as random seed. 36 | :return: 37 | """ 38 | super(Caser, self).__init__() 39 | # user embedding 40 | self.user_embedding = Embedding(input_dim=user_num, 41 | input_length=1, 42 | output_dim=embed_dim // 2, 43 | embeddings_initializer='random_normal', 44 | embeddings_regularizer=l2(embed_reg)) 45 | # item embedding 46 | self.item_embedding = Embedding(input_dim=item_num, 47 | input_length=1, 48 | output_dim=embed_dim // 2, 49 | embeddings_initializer='random_normal', 50 | embeddings_regularizer=l2(embed_reg)) 51 | # item2 embedding 52 | self.item2_embedding = Embedding(input_dim=item_num, 53 | input_length=1, 54 | output_dim=embed_dim, 55 | embeddings_initializer='random_normal', 56 | embeddings_regularizer=l2(embed_reg)) 57 | # seq_len 58 | self.seq_len = seq_len 59 | # horizontal filters 60 | self.hor_n = hor_n 61 | self.hor_h = hor_h if hor_h <= self.seq_len else self.seq_len 62 | # vertical filters 63 | self.ver_n = ver_n 64 | self.ver_w = 1 65 | # horizontal conv 66 | self.hor_conv_list = [ 67 | Conv1D(filters=i+1, kernel_size=self.hor_h) for i in range(self.hor_n + 1) 68 | ] 69 | # vertical conv, should transpose 70 | self.ver_conv = Conv1D(filters=self.ver_n, kernel_size=self.ver_w) 71 | # dense 72 | self.dense = Dense(embed_dim // 2, activation=activation) 73 | self.dropout = Dropout(dnn_dropout) 74 | # norm 75 | self.use_l2norm = use_l2norm 76 | # loss name 77 | self.loss_name = loss_name 78 | self.gamma = gamma 79 | # seed 80 | tf.random.set_seed(seed) 81 | 82 | def call(self, inputs): 83 | # user info 84 | user_embed = self.user_embedding(tf.reshape(inputs['user'], [-1, ])) # (None, embed_dim // 2) 85 | # mask 86 | mask = tf.expand_dims(tf.cast(tf.not_equal(inputs['click_seq'], 0), dtype=tf.float32), axis=-1) # (None, seq_len, 1) 87 | # seq info 88 | seq_embed = self.item_embedding(inputs['click_seq']) # (None, seq_len, embed_dim // 2) 89 | seq_embed *= mask 90 | # horizontal conv (None, (seq_len - kernel_size + 2 * pad) / stride +1, hor_n) 91 | hor_info = list() 92 | for hor_conv in self.hor_conv_list: 93 | hor_info_i = hor_conv(seq_embed) 94 | hor_info_i = GlobalMaxPooling1D()(hor_info_i) # (None, hor_n) 95 | hor_info.append(hor_info_i) 96 | hor_info = tf.concat(hor_info, axis=1) 97 | # vertical conv (None, (dim - 1 + 2 * pad) / stride + 1, ver_n) 98 | ver_info = self.ver_conv(tf.transpose(seq_embed, perm=(0, 2, 1))) 99 | ver_info = tf.reshape(ver_info, shape=(-1, ver_info.shape[1] * ver_info.shape[2])) # (None, ?) 
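        # The horizontal convolutions above slide over consecutive items in the sequence
        # (union-level patterns in Caser's terminology), while the vertical convolution acts
        # across the embedding dimensions (point-level patterns); both pooled outputs are
        # concatenated and projected below, then joined with the user embedding.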
100 | # info 101 | seq_info = self.dense(tf.concat([hor_info, ver_info], axis=-1)) # (None, dim) 102 | seq_info = self.dropout(seq_info) 103 | # concat 104 | user_info = tf.concat([seq_info, user_embed], axis=-1) # (None, embed_dim) 105 | # pos info 106 | pos_info = self.item2_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, embed_dim) 107 | # neg info 108 | neg_info = self.item2_embedding(inputs['neg_item']) # (None, neg_num, embed_dim) 109 | # norm 110 | if self.use_l2norm: 111 | pos_info = tf.math.l2_normalize(pos_info, axis=-1) 112 | neg_info = tf.math.l2_normalize(neg_info, axis=-1) 113 | user_info = tf.math.l2_normalize(user_info, axis=-1) 114 | # scores 115 | pos_scores = tf.reduce_sum(tf.multiply(user_info, pos_info), axis=-1, keepdims=True) # (None, 1) 116 | neg_scores = tf.reduce_sum(tf.multiply(tf.expand_dims(user_info, axis=1), neg_info), axis=-1) # (None, neg_num) 117 | # loss 118 | self.add_loss(get_loss(pos_scores, neg_scores, self.loss_name, self.gamma)) 119 | logits = tf.concat([pos_scores, neg_scores], axis=-1) 120 | return logits 121 | 122 | def summary(self): 123 | inputs = { 124 | 'user': Input(shape=(), dtype=tf.int32), 125 | 'click_seq': Input(shape=(self.seq_len,), dtype=tf.int32), 126 | 'pos_item': Input(shape=(), dtype=tf.int32), 127 | 'neg_item': Input(shape=(1,), dtype=tf.int32) # suppose neg_num=1 128 | } 129 | Model(inputs=inputs, outputs=self.call(inputs)).summary() 130 | -------------------------------------------------------------------------------- /reclearn/models/matching/dssm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Mar 31, 2022 3 | Reference: "Learning Deep Structured Semantic Models for Web Search using Clickthrough Data", CIKM, 2013 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras import Model 8 | from tensorflow.keras.layers import Embedding, Input 9 | from tensorflow.keras.regularizers import l2 10 | from reclearn.layers import MLP 11 | from reclearn.models.losses import get_loss 12 | 13 | 14 | class DSSM(Model): 15 | def __init__(self, user_num, item_num, embed_dim, user_mlp, item_mlp, activation='relu', 16 | dnn_dropout=0., use_l2norm=False, loss_name="binary_cross_entropy_loss", 17 | gamma=0.5, embed_reg=0., seed=None): 18 | """DSSM: The two-tower matching model commonly used in industry. 19 | Args: 20 | :param user_num: An integer type. The largest user index + 1. 21 | :param item_num: An integer type. The largest item index + 1. 22 | :param embed_dim: An integer type. Embedding dimension of user vector and item vector. 23 | :param user_mlp: A list of user MLP hidden units such as [128, 64, 32]. 24 | :param item_mlp: A list of item MLP hidden units such as [128, 64, 32] and 25 | the last unit must be equal to the user's. 26 | :param activation: A string. Activation function name of user and item MLP layer. 27 | :param dnn_dropout: Float between 0 and 1. Dropout of user and item MLP layer. 28 | :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not. 29 | :param loss_name: A string. You can specify the current point-loss function 'binary_cross_entropy_loss' or 30 | pair-loss function as 'bpr_loss'、'hinge_loss'. 31 | :param gamma: A float type. If hinge_loss is selected as the loss function, you can specify the margin. 32 | :param embed_reg: A float type. The regularizer of embedding. 33 | :param seed: A Python integer to use as random seed. 
34 | :return: 35 | """ 36 | super(DSSM, self).__init__() 37 | if user_mlp[-1] != item_mlp[-1]: 38 | raise ValueError("The last value of user_mlp must be equal to item_mlp's.") 39 | # user embedding 40 | self.user_embedding_table = Embedding(input_dim=user_num, 41 | input_length=1, 42 | output_dim=embed_dim, 43 | embeddings_initializer='random_normal', 44 | embeddings_regularizer=l2(embed_reg)) 45 | # item embedding 46 | self.item_embedding_table = Embedding(input_dim=item_num, 47 | input_length=1, 48 | output_dim=embed_dim, 49 | embeddings_initializer='random_normal', 50 | embeddings_regularizer=l2(embed_reg)) 51 | # user_mlp_layer 52 | self.user_mlp_layer = MLP(user_mlp, activation, dnn_dropout) 53 | # item_mlp_layer 54 | self.item_mlp_layer = MLP(item_mlp, activation, dnn_dropout) 55 | self.use_l2norm = use_l2norm 56 | self.loss_name = loss_name 57 | self.gamma = gamma 58 | # seed 59 | tf.random.set_seed(seed) 60 | 61 | def call(self, inputs): 62 | # user info 63 | user_info = self.user_embedding_table(tf.reshape(inputs['user'], [-1, ])) # (None, embed_dim) 64 | # item info 65 | pos_info = self.item_embedding_table(tf.reshape(inputs['pos_item'], [-1, ])) # (None, embed_dim) 66 | neg_info = self.item_embedding_table(inputs['neg_item']) # (None, neg_num, embed_dim) 67 | # mlp 68 | user_info = self.user_mlp_layer(user_info) 69 | pos_info = self.item_mlp_layer(pos_info) 70 | neg_info = self.item_mlp_layer(neg_info) 71 | # norm 72 | if self.use_l2norm: 73 | user_info = tf.math.l2_normalize(user_info, axis=-1) 74 | pos_info = tf.math.l2_normalize(pos_info, axis=-1) 75 | neg_info = tf.math.l2_normalize(neg_info, axis=-1) 76 | # calculate similar scores. 77 | pos_scores = tf.reduce_sum(tf.multiply(user_info, pos_info), axis=-1, keepdims=True) # (None, 1) 78 | neg_scores = tf.reduce_sum(tf.multiply(tf.expand_dims(user_info, axis=1), neg_info), axis=-1) # (None, neg_num) 79 | # add loss 80 | self.add_loss(get_loss(pos_scores, neg_scores, self.loss_name, self.gamma)) 81 | logits = tf.concat([pos_scores, neg_scores], axis=-1) 82 | return logits 83 | 84 | def summary(self): 85 | inputs = { 86 | 'user': Input(shape=(), dtype=tf.int32), 87 | 'pos_item': Input(shape=(), dtype=tf.int32), 88 | 'neg_item': Input(shape=(1,), dtype=tf.int32) # suppose neg_num=1 89 | } 90 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/matching/fissa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 20, 2021 3 | Updated on Apr 23, 2022 4 | Reference: "FISSA: fusing item similarity models with self-attention networks for sequential recommendation", 5 | RecSys, 2020 6 | @author: Ziyao Geng(zggzy1996@163.com) 7 | """ 8 | import tensorflow as tf 9 | 10 | from tensorflow.keras import Model 11 | from tensorflow.keras.layers import Embedding, Input, LayerNormalization, Dropout 12 | from tensorflow.keras.regularizers import l2 13 | 14 | from reclearn.layers import TransformerEncoder, LBA, Item_similarity_gating 15 | from reclearn.models.losses import get_loss 16 | 17 | 18 | class FISSA(Model): 19 | def __init__(self, item_num, embed_dim, seq_len=100, blocks=2, num_heads=2, ffn_hidden_unit=128, 20 | dnn_dropout=0., layer_norm_eps=1e-6, use_l2norm=False, 21 | loss_name="binary_entropy_loss", gamma=0.5, embed_reg=0., seed=None): 22 | """FISSA, Sequential Recommendation Model. 23 | Args: 24 | :param item_num: An integer type. The largest item index + 1. 
25 | :param embed_dim: An integer type. Embedding dimension of item vector. 26 | :param seq_len: An integer type. The length of the input sequence. 27 | :param blocks: An integer type. The Number of blocks. 28 | :param num_heads: An integer type. The Number of attention heads. 29 | :param ffn_hidden_unit: An integer type. Number of hidden unit in FFN. 30 | :param dnn_dropout: Float between 0 and 1. Dropout of user and item MLP layer. 31 | :param layer_norm_eps: A float type. Small float added to variance to avoid dividing by zero. 32 | :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not. 33 | :param loss_name: A string. You can specify the current point-loss function 'binary_cross_entropy_loss' or 34 | pair-loss function as 'bpr_loss'、'hinge_loss'. 35 | :param gamma: A float type. If hinge_loss is selected as the loss function, you can specify the margin. 36 | :param embed_reg: A float type. The regularizer of embedding. 37 | :param seed: A Python integer to use as random seed. 38 | """ 39 | super(FISSA, self).__init__() 40 | # item embedding 41 | self.item_embedding = Embedding(input_dim=item_num, 42 | input_length=1, 43 | output_dim=embed_dim, 44 | embeddings_initializer='random_normal', 45 | embeddings_regularizer=l2(embed_reg)) 46 | self.pos_embedding = Embedding(input_dim=seq_len, 47 | input_length=1, 48 | output_dim=embed_dim, 49 | embeddings_initializer='random_normal', 50 | embeddings_regularizer=l2(embed_reg)) 51 | # item2 embedding, not share embedding 52 | self.item2_embedding = Embedding(input_dim=item_num, 53 | input_length=1, 54 | output_dim=embed_dim, 55 | embeddings_initializer='random_normal', 56 | embeddings_regularizer=l2(embed_reg)) 57 | # encoder 58 | self.encoder_layer = [TransformerEncoder(embed_dim, num_heads, ffn_hidden_unit, 59 | dnn_dropout, layer_norm_eps) for _ in range(blocks)] 60 | self.lba = LBA(dnn_dropout) 61 | self.gating = Item_similarity_gating(dnn_dropout) 62 | # layer normalization 63 | self.layer_norm = LayerNormalization() 64 | # dropout 65 | self.dropout = Dropout(dnn_dropout) 66 | # norm 67 | self.use_l2norm = use_l2norm 68 | # loss name 69 | self.loss_name = loss_name 70 | self.gamma = gamma 71 | # seq_len 72 | self.seq_len = seq_len 73 | # seed 74 | tf.random.set_seed(seed) 75 | 76 | def call(self, inputs): 77 | # seq info 78 | seq_embed = self.item_embedding(inputs['click_seq']) # (None, seq_len, dim) 79 | # mask 80 | mask = tf.expand_dims(tf.cast(tf.not_equal(inputs['click_seq'], 0), dtype=tf.float32), axis=-1) # (None, seq_len, 1) 81 | # pos encoding 82 | pos_encoding = tf.expand_dims(self.pos_embedding(tf.range(self.seq_len)), axis=0) # (1, seq_len, embed_dim) 83 | seq_embed += pos_encoding # (None, seq_len, embed_dim), broadcasting 84 | 85 | seq_embed = self.dropout(seq_embed) 86 | seq_embed = self.layer_norm(seq_embed) 87 | att_outputs = seq_embed # (None, seq_len, embed_dim) 88 | att_outputs *= mask 89 | # transformer encoder part 90 | for block in self.encoder_layer: 91 | att_outputs = block([att_outputs, mask]) # (None, seq_len, embed_dim) 92 | att_outputs *= mask 93 | 94 | local_info = tf.slice(att_outputs, begin=[0, self.seq_len - 1, 0], size=[-1, 1, -1]) # (None, 1, embed_dim) 95 | global_info = self.lba([seq_embed, seq_embed, mask]) # (None, embed_dim) 96 | pos_info = self.item_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, dim) 97 | neg_info = self.item_embedding(inputs['neg_item']) # (None, neg_num, dim) 98 | 99 | weights = self.gating([tf.tile(tf.slice(seq_embed, begin=[0, 
self.seq_len - 1, 0], size=[-1, 1, -1]), [1, neg_info.shape[1] + 1, 1]), 100 | tf.tile(local_info, [1, neg_info.shape[1] + 1, 1]), 101 | tf.concat([tf.expand_dims(pos_info, axis=1), neg_info], 1)] 102 | ) # (None, 1 + neg_num, 1) 103 | 104 | user_info = tf.multiply(local_info, weights) + \ 105 | tf.multiply(tf.expand_dims(global_info, axis=1), tf.ones_like(weights) - weights) # (None, 1 + neg_num, embed_dim) 106 | # norm 107 | if self.use_l2norm: 108 | pos_info = tf.math.l2_normalize(pos_info, axis=-1) 109 | neg_info = tf.math.l2_normalize(neg_info, axis=-1) 110 | user_info = tf.math.l2_normalize(user_info, axis=-1) 111 | 112 | pos_scores = tf.reduce_sum(tf.multiply(tf.slice(user_info, [0, 0, 0], [-1, 1, -1]), tf.expand_dims(pos_info, axis=1)), axis=-1) # (None, 1) 113 | neg_scores = tf.reduce_sum(tf.multiply(tf.slice(user_info, [0, 1, 0], [-1, neg_info.shape[1], -1]), neg_info), axis=-1) # (None, neg_num) 114 | # loss 115 | self.add_loss(get_loss(pos_scores, neg_scores, self.loss_name, self.gamma)) 116 | logits = tf.concat([pos_scores, neg_scores], axis=-1) 117 | return logits 118 | 119 | def summary(self): 120 | inputs = { 121 | 'click_seq': Input(shape=(self.seq_len,), dtype=tf.int32), 122 | 'pos_item': Input(shape=(), dtype=tf.int32), 123 | 'neg_item': Input(shape=(1,), dtype=tf.int32) # suppose neg_num=1 124 | } 125 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/matching/gru4rec.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 20, 2021 3 | Updated on Apr 23, 2022 4 | Reference: "Session-based Recommendation with Recurrent Neural Networks", ICLR, 2016 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras import Model 9 | from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, GRU 10 | from tensorflow.keras.regularizers import l2 11 | 12 | from reclearn.models.losses import get_loss 13 | 14 | 15 | class GRU4Rec(Model): 16 | def __init__(self, item_num, embed_dim, gru_layers=1, gru_unit=64, gru_activation='tanh', 17 | dnn_dropout=0., use_l2norm=False, loss_name='bpr_loss', gamma=0.5, embed_reg=0., seed=None): 18 | """GRU4Rec, Sequential Recommendation Model. 19 | Args: 20 | :param item_num: An integer type. The largest item index + 1. 21 | :param embed_dim: An integer type. Embedding dimension of item vector. 22 | :param gru_layers: An integer type. The number of GRU Layers. 23 | :param gru_unit:An integer type. The unit of GRU Layer. 24 | :param gru_activation: A string. The name of activation function. Default 'tanh'. 25 | :param dnn_dropout: Float between 0 and 1. Dropout of user and item MLP layer. 26 | :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not. 27 | :param loss_name: A string. You can specify the current point-loss function 'binary_cross_entropy_loss' or 28 | pair-loss function as 'bpr_loss'、'hinge_loss'. 29 | :param gamma: A float type. If hinge_loss is selected as the loss function, you can specify the margin. 30 | :param embed_reg: A float type. The regularizer of embedding. 31 | :param seed: A Python integer to use as random seed. 
32 | :return: 33 | """ 34 | super(GRU4Rec, self).__init__() 35 | self.item_embedding = Embedding(input_dim=item_num, 36 | input_length=1, 37 | output_dim=embed_dim, 38 | embeddings_initializer='random_normal', 39 | embeddings_regularizer=l2(embed_reg)) 40 | self.dropout = Dropout(dnn_dropout) 41 | self.gru_layers = [ 42 | GRU(units=gru_unit, activation=gru_activation, return_sequences=True) 43 | if i < gru_layers - 1 else 44 | GRU(units=gru_unit, activation=gru_activation, return_sequences=False) 45 | for i in range(gru_layers) 46 | ] 47 | self.dense = Dense(units=embed_dim) 48 | # norm 49 | self.use_l2norm = use_l2norm 50 | # loss name 51 | self.loss_name = loss_name 52 | self.gamma = gamma 53 | # seed 54 | tf.random.set_seed(seed) 55 | 56 | def call(self, inputs): 57 | # seq info 58 | seq_embed = self.item_embedding(inputs['click_seq']) # (None, seq_len, dim) 59 | # mask 60 | mask = tf.cast(tf.not_equal(inputs['click_seq'], 0), dtype=tf.float32) # (None, seq_len) 61 | seq_embed = tf.multiply(seq_embed, tf.expand_dims(mask, axis=-1)) 62 | # dropout 63 | seq_info = self.dropout(seq_embed) 64 | # gru 65 | for gru_layer in self.gru_layers: 66 | seq_info = gru_layer(seq_info) 67 | seq_info = self.dense(seq_info) 68 | # positive, negative embedding vector. 69 | pos_info = self.item_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, embed_dim) 70 | neg_info = self.item_embedding(inputs['neg_item']) # (None, neg_num, embed_dim) 71 | # norm 72 | if self.use_l2norm: 73 | pos_info = tf.math.l2_normalize(pos_info, axis=-1) 74 | neg_info = tf.math.l2_normalize(neg_info, axis=-1) 75 | seq_info = tf.math.l2_normalize(seq_info, axis=-1) 76 | # calculate positive item scores and negative item scores 77 | pos_scores = tf.reduce_sum(tf.multiply(seq_info, pos_info), axis=-1, keepdims=True) # (None, 1) 78 | neg_scores = tf.reduce_sum(tf.multiply(tf.expand_dims(seq_info, axis=1), neg_info), axis=-1) # (None, neg_num) 79 | # loss 80 | self.add_loss(get_loss(pos_scores, neg_scores, self.loss_name, self.gamma)) 81 | logits = tf.concat([pos_scores, neg_scores], axis=-1) 82 | return logits 83 | 84 | def summary(self): 85 | inputs = { 86 | 'click_seq': Input(shape=(100,), dtype=tf.int32), # suppose sequence length=1 87 | 'pos_item': Input(shape=(), dtype=tf.int32), 88 | 'neg_item': Input(shape=(1,), dtype=tf.int32) # suppose neg_num=1 89 | } 90 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/matching/mind.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Apr 25, 2022 3 | Reference: "Multi-Interest Network with Dynamic Routing for Recommendation at Tmall", CIKM, 2019 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import tensorflow as tf 7 | from tensorflow.keras import Model 8 | from tensorflow.keras.layers import Input 9 | from tensorflow.keras.regularizers import l2 10 | 11 | from reclearn.layers.core import CapsuleNetwork 12 | 13 | 14 | class MIND(Model): 15 | def __init__(self, item_num, embed_dim, seq_len=100, num_interest=4, stop_grad=True, label_attention=True, 16 | neg_num=4, batch_size=512, embed_reg=0., seed=None): 17 | """MIND 18 | Args: 19 | :param item_num: An integer type. The largest item index + 1. 20 | :param embed_dim: An integer type. Embedding dimension of item vector. 21 | :param seq_len: An integer type. The length of the input sequence. 22 | :param bilinear_type: An integer type. The number of user interests. 
23 | :param num_interest: An integer type. The number of user interests. 24 | :param stop_grad: A boolean type. The weights in the capsule network are updated without gradient descent. 25 | :param label_attention: A boolean type. Whether using label-aware attention or not. 26 | :param neg_num: A integer type. The number of negative samples for each positive sample. 27 | :param batch_size: A integer type. The number of samples per batch. 28 | :param embed_reg: A float type. The regularizer of embedding. 29 | :param seed: A Python integer to use as random seed. 30 | :return 31 | """ 32 | super(MIND, self).__init__() 33 | with tf.name_scope("Embedding_layer"): 34 | # item embedding 35 | self.item_embedding_table = self.add_weight(name='item_embedding_table', 36 | shape=(item_num, embed_dim), 37 | initializer='random_normal', 38 | regularizer=l2(embed_reg), 39 | trainable=True) 40 | # embedding bias 41 | self.embedding_bias = self.add_weight(name='embedding_bias', 42 | shape=(item_num,), 43 | initializer=tf.zeros_initializer(), 44 | trainable=False) 45 | self.capsule_network = CapsuleNetwork(embed_dim, seq_len, 0, num_interest, stop_grad) 46 | self.seq_len = seq_len 47 | self.num_interest = num_interest 48 | self.label_attention = label_attention 49 | self.item_num = item_num 50 | self.embed_dim = embed_dim 51 | self.neg_num = neg_num 52 | self.batch_size = batch_size 53 | # seed 54 | tf.random.set_seed(seed) 55 | 56 | def call(self, inputs, training=False): 57 | user_hist_emb = tf.nn.embedding_lookup(self.item_embedding_table, inputs['click_seq']) 58 | mask = tf.cast(tf.not_equal(inputs['click_seq'], 0), dtype=tf.float32) # (None, seq_len) 59 | user_hist_emb = tf.multiply(user_hist_emb, tf.expand_dims(mask, axis=-1)) # (None, seq_len, embed_dim) 60 | # capsule network 61 | interest_capsule = self.capsule_network(user_hist_emb, mask) # (None, num_inter, embed_dim) 62 | 63 | if training: 64 | if self.label_attention: 65 | item_embed = tf.nn.embedding_lookup(self.item_embedding_table, tf.reshape(inputs['pos_item'], [-1, ])) 66 | inter_att = tf.matmul(interest_capsule, tf.reshape(item_embed, [-1, self.embed_dim, 1])) # (None, num_inter, 1) 67 | inter_att = tf.nn.softmax(tf.pow(tf.reshape(inter_att, [-1, self.num_interest]), 1)) 68 | 69 | user_info = tf.matmul(tf.reshape(inter_att, [-1, 1, self.num_interest]), interest_capsule) # (None, 1, embed_dim) 70 | user_info = tf.reshape(user_info, [-1, self.embed_dim]) 71 | else: 72 | user_info = tf.reduce_max(interest_capsule, axis=1) # (None, embed_dim) 73 | # train, sample softmax loss 74 | loss = tf.reduce_mean(tf.nn.sampled_softmax_loss( 75 | weights=self.item_embedding_table, 76 | biases=self.embedding_bias, 77 | labels=tf.reshape(inputs['pos_item'], shape=[-1, 1]), 78 | inputs=user_info, 79 | num_sampled=self.neg_num * self.batch_size, 80 | num_classes=self.item_num 81 | )) 82 | # add loss 83 | self.add_loss(loss) 84 | return loss 85 | else: 86 | # predict/eval 87 | pos_info = tf.nn.embedding_lookup(self.item_embedding_table, inputs['pos_item']) # (None, embed_dim) 88 | neg_info = tf.nn.embedding_lookup(self.item_embedding_table, inputs['neg_item']) # (None, neg_num, embed_dim) 89 | 90 | if self.label_attention: 91 | user_info = tf.reduce_max(interest_capsule, axis=1) # (None, embed_dim) 92 | else: 93 | user_info = tf.reduce_max(interest_capsule, axis=1) # (None, embed_dim) 94 | 95 | # calculate similar scores. 
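            # Evaluation path: the interest capsules are max-pooled into a single user vector
            # (both branches above apply tf.reduce_max), and candidate items are scored by
            # inner product; the positive score (None, 1) is concatenated ahead of the
            # negative scores (None, neg_num), so the positive item sits at column 0.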
96 | pos_scores = tf.reduce_sum(tf.multiply(user_info, pos_info), axis=-1, keepdims=True) # (None, 1) 97 | neg_scores = tf.reduce_sum(tf.multiply(tf.expand_dims(user_info, axis=1), neg_info), 98 | axis=-1) # (None, neg_num) 99 | logits = tf.concat([pos_scores, neg_scores], axis=-1) 100 | return logits 101 | 102 | def summary(self): 103 | inputs = { 104 | 'click_seq': Input(shape=(self.seq_len,), dtype=tf.int32), 105 | 'pos_item': Input(shape=(), dtype=tf.int32), 106 | 'neg_item': Input(shape=(1,), dtype=tf.int32) # suppose neg_num=1 107 | } 108 | Model(inputs=inputs, outputs=self.call(inputs)).summary() 109 | 110 | -------------------------------------------------------------------------------- /reclearn/models/matching/ncf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Dec 20, 2020 3 | Updated on Apr 23, 2022 4 | Reference: "Neural Collaborative Filtering", WWW, 2017 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras import Model 9 | from tensorflow.keras.layers import Embedding, Dense, Input 10 | from tensorflow.keras.regularizers import l2 11 | 12 | from reclearn.layers import MLP 13 | from reclearn.models.losses import get_loss 14 | 15 | 16 | class NCF(Model): 17 | def __init__(self, user_num, item_num, embed_dim, hidden_units=None, activation='relu', dnn_dropout=0., 18 | use_batch_norm=False, use_l2norm=False, loss_name='bpr_loss', gamma=0.5, embed_reg=0., seed=None): 19 | """Neural Collaborative Filtering 20 | Args: 21 | :param user_num: An integer type. The largest user index + 1. 22 | :param item_num: An integer type. The largest item index + 1. 23 | :param embed_dim: An integer type. Embedding dimension of user vector and item vector. 24 | :param hidden_units: A list. The list of hidden layer units's numbers, such as [64, 32, 16, 8]. 25 | :param activation: A string. The name of activation function, like 'relu', 'sigmoid' and so on. 26 | :param dnn_dropout: Float between 0 and 1. Dropout of user and item MLP layer. 27 | :param use_batch_norm: A boolean. Whether using batch normalization or not. 28 | :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not. 29 | :param loss_name: A string. You can specify the current point-loss function 'binary_cross_entropy_loss' or 30 | pair-loss function as 'bpr_loss'、'hinge_loss'. 31 | :param gamma: A float type. If hinge_loss is selected as the loss function, you can specify the margin. 32 | :param embed_reg: A float type. The regularizer of embedding. 33 | :param seed: A Python integer to use as random seed. 
34 | :return: 35 | """ 36 | super(NCF, self).__init__() 37 | if hidden_units is None: 38 | hidden_units = [64, 32, 16, 8] 39 | # MF user embedding 40 | self.mf_user_embedding = Embedding(input_dim=user_num, 41 | input_length=1, 42 | output_dim=embed_dim, 43 | embeddings_initializer='random_normal', 44 | embeddings_regularizer=l2(embed_reg)) 45 | # MF item embedding 46 | self.mf_item_embedding = Embedding(input_dim=item_num, 47 | input_length=1, 48 | output_dim=embed_dim, 49 | embeddings_initializer='random_normal', 50 | embeddings_regularizer=l2(embed_reg)) 51 | # MLP user embedding 52 | self.mlp_user_embedding = Embedding(input_dim=user_num, 53 | input_length=1, 54 | output_dim=embed_dim, 55 | embeddings_initializer='random_normal', 56 | embeddings_regularizer=l2(embed_reg)) 57 | # MLP item embedding 58 | self.mlp_item_embedding = Embedding(input_dim=item_num, 59 | input_length=1, 60 | output_dim=embed_dim, 61 | embeddings_initializer='random_normal', 62 | embeddings_regularizer=l2(embed_reg)) 63 | # dnn 64 | self.mlp = MLP(hidden_units, activation=activation, dnn_dropout=dnn_dropout, use_batch_norm=use_batch_norm) 65 | self.dense = Dense(1, activation=None) 66 | # norm 67 | self.use_l2norm = use_l2norm 68 | # loss name 69 | self.loss_name = loss_name 70 | self.gamma = gamma 71 | # seed 72 | tf.random.set_seed(seed) 73 | 74 | def call(self, inputs): 75 | # user info 76 | mf_user_embed = self.mf_user_embedding(tf.reshape(inputs['user'], [-1, ])) # (None, embed_dim) 77 | mlp_user_embed = self.mlp_user_embedding(tf.reshape(inputs['user'], [-1, ])) # (None, embed_dim) 78 | # item 79 | mf_pos_embed = self.mf_item_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, embed_dim) 80 | mf_neg_embed = self.mf_item_embedding(inputs['neg_item']) # (None, neg_num, embed_dim) 81 | mlp_pos_embed = self.mlp_item_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, embed_dim) 82 | mlp_neg_embed = self.mlp_item_embedding(inputs['neg_item']) # (None, neg_num, embed_dim) 83 | # MF 84 | mf_pos_vector = tf.nn.sigmoid(tf.multiply(mf_user_embed, mf_pos_embed)) # (None, embed_dim) 85 | mf_neg_vector = tf.nn.sigmoid(tf.multiply(tf.expand_dims(mf_user_embed, axis=1), 86 | mf_neg_embed)) # (None, neg_num, embed_dim) 87 | # MLP 88 | mlp_pos_vector = tf.concat([mlp_user_embed, mlp_pos_embed], axis=-1) # (None, 2 * embed_dim) 89 | mlp_neg_vector = tf.concat([tf.tile(tf.expand_dims(mlp_user_embed, axis=1), [1, mlp_neg_embed.shape[1], 1]), 90 | mlp_neg_embed], axis=-1) # (None, neg_num, 2 * embed_dim) 91 | mlp_pos_vector = self.mlp(mlp_pos_vector) # (None, dim) 92 | mlp_neg_vector = self.mlp(mlp_neg_vector) # (None, neg_num, dim) 93 | # concat 94 | pos_vector = tf.concat([mf_pos_vector, mlp_pos_vector], axis=-1) # (None, embed_dim+dim) 95 | neg_vector = tf.concat([mf_neg_vector, mlp_neg_vector], axis=-1) # (None, neg_num, embed_dim+dim) 96 | # norm 97 | if self.use_l2norm: 98 | pos_vector = tf.math.l2_normalize(pos_vector, axis=-1) 99 | neg_vector = tf.math.l2_normalize(neg_vector, axis=-1) 100 | # result 101 | pos_scores = self.dense(pos_vector) # (None, 1) 102 | neg_scores = tf.squeeze(self.dense(neg_vector), axis=-1) # (None, neg_num) 103 | # loss 104 | self.add_loss(get_loss(pos_scores, neg_scores, self.loss_name, self.gamma)) 105 | logits = tf.concat([pos_scores, neg_scores], axis=-1) 106 | return logits 107 | 108 | def summary(self): 109 | inputs = { 110 | 'user': Input(shape=(), dtype=tf.int32), 111 | 'pos_item': Input(shape=(), dtype=tf.int32), 112 | 'neg_item': Input(shape=(1,), dtype=tf.int32) # 
suppose neg_num=1 113 | } 114 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/matching/poprec.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Nov 20, 2021 3 | Model: Pop Recommendation 4 | @author: Ziyao Geng(zggzy1996@163.com) 5 | """ 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from tqdm import tqdm 10 | 11 | from reclearn.evaluator.metrics import * 12 | 13 | 14 | class PopRec: 15 | def __init__(self, train_path, delimiter='\t'): 16 | """Pop recommendation 17 | Args: 18 | :param train_data: A String. The path of data, such as "*.txt" or "*.csv". 19 | :param delimiter: A character. Please give field delimiter. 20 | :return: 21 | """ 22 | self.item_count = dict() 23 | self.pop_item_list = list() 24 | self.max_count = 0 25 | self.__build_item_count_dict(train_path, delimiter) 26 | 27 | def __build_item_count_dict(self, train_path, delimiter='\t'): 28 | data = np.array(pd.read_csv(train_path, delimiter=delimiter)) 29 | for i in tqdm(range(len(data))): 30 | user, item = data[i] 31 | self.item_count.setdefault(int(item), 0) 32 | self.item_count[int(item)] += 1 33 | self.max_count = max(self.item_count[int(item)], self.max_count) 34 | # sorting 35 | self.pop_item_list = [x[0] for x in sorted(self.item_count.items(), key=lambda x: x[1], reverse=True)] 36 | 37 | def update(self, data_path, delimiter='\t'): 38 | """Update 39 | Args: 40 | :param data_path: A String. The path of data, such as "*.txt" or "*.csv". 41 | :param delimiter: A character. Please give field delimiter. 42 | :return: 43 | """ 44 | self.__build_item_count_dict(data_path, delimiter) 45 | 46 | def clear(self): 47 | self.item_count = dict() 48 | self.pop_item_list = list() 49 | self.__build_item_count_dict(train_path, delimiter) 50 | 51 | def predict(self, test_data, batch_size=None): 52 | """predict recommended items 53 | :param test_data: A dict. 54 | :param batch_size: None. 55 | :return: A recommendation list of length k. 56 | """ 57 | pos_item_list, neg_items_list = test_data['pos_item'], test_data['neg_item'] 58 | pos_item_list = np.reshape(pos_item_list, (-1, 1)) 59 | item_list = np.hstack((pos_item_list, neg_items_list)) 60 | pred_item_list = [list(map(lambda x: self.item_count.get(x, -1) / self.max_count, l)) for l in item_list] 61 | return np.array(pred_item_list) 62 | 63 | def evaluate(self, test_path, k, metric_names, delimiter='\t'): 64 | """evaluate PopRec 65 | Args: 66 | :param test_path: A String. The path of data, such as "*.txt" or "*.csv". 67 | :param k: A scalar(int). 68 | :param metric_names: A list like ['hr']. 69 | :param delimiter: A character. Please give field delimiter. 
70 | :return: A result dict such as {'hr':, 'ndcg':, ...} 71 | """ 72 | data = np.array(pd.read_csv(test_path, delimiter=delimiter)) 73 | pred_items = self.predict(k) 74 | rank = [] 75 | for i in range(len(data)): 76 | user, item = data[i] 77 | # if item in pred_items 78 | try: 79 | rank.append(pred_items.index(item)) 80 | except: 81 | rank.append(k+1) 82 | res_dict = {} 83 | for name in metric_names: 84 | if name == 'hr': 85 | res = hr(rank, k) 86 | elif name == 'ndcg': 87 | res = ndcg(rank, k) 88 | elif name == 'mrr': 89 | res = mrr(rank, k) 90 | else: 91 | break 92 | res_dict[name] = res 93 | return res_dict -------------------------------------------------------------------------------- /reclearn/models/matching/sasrec.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Dec 20, 2020 3 | Updated on Apr 22, 2022 4 | Reference: "Self-Attentive Sequential Recommendation", ICDM, 2018 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras import Model 9 | from tensorflow.keras.layers import Layer, Dense, LayerNormalization, Dropout, Embedding, Input 10 | from tensorflow.keras.regularizers import l2 11 | 12 | from reclearn.layers import TransformerEncoder 13 | from reclearn.models.losses import get_loss 14 | 15 | 16 | class SASRec(Model): 17 | def __init__(self, item_num, embed_dim, seq_len=100, blocks=1, num_heads=1, ffn_hidden_unit=128, 18 | dnn_dropout=0., layer_norm_eps=1e-6, use_l2norm=False, 19 | loss_name="binary_cross_entropy_loss", gamma=0.5, embed_reg=0., seed=None): 20 | """Self-Attentive Sequential Recommendation 21 | :param item_num: An integer type. The largest item index + 1. 22 | :param embed_dim: An integer type. Embedding dimension of item vector. 23 | :param seq_len: An integer type. The length of the input sequence. 24 | :param blocks: An integer type. The Number of blocks. 25 | :param num_heads: An integer type. The Number of attention heads. 26 | :param ffn_hidden_unit: An integer type. Number of hidden unit in FFN. 27 | :param dnn_dropout: Float between 0 and 1. Dropout of user and item MLP layer. 28 | :param layer_norm_eps: A float type. Small float added to variance to avoid dividing by zero. 29 | :param use_l2norm: A boolean. Whether user embedding, item embedding should be normalized or not. 30 | :param loss_name: A string. You can specify the current point-loss function 'binary_cross_entropy_loss' or 31 | pair-loss function as 'bpr_loss'、'hinge_loss'. 32 | :param gamma: A float type. If hinge_loss is selected as the loss function, you can specify the margin. 33 | :param embed_reg: A float type. The regularizer of embedding. 34 | :param seed: A Python integer to use as random seed. 
35 | """ 36 | super(SASRec, self).__init__() 37 | # item embedding 38 | self.item_embedding = Embedding(input_dim=item_num, 39 | input_length=1, 40 | output_dim=embed_dim, 41 | embeddings_initializer='random_normal', 42 | embeddings_regularizer=l2(embed_reg)) 43 | self.pos_embedding = Embedding(input_dim=seq_len, 44 | input_length=1, 45 | output_dim=embed_dim, 46 | embeddings_initializer='random_normal', 47 | embeddings_regularizer=l2(embed_reg)) 48 | self.dropout = Dropout(dnn_dropout) 49 | # multi encoder block 50 | self.encoder_layer = [TransformerEncoder(embed_dim, num_heads, ffn_hidden_unit, 51 | dnn_dropout, layer_norm_eps) for _ in range(blocks)] 52 | # norm 53 | self.use_l2norm = use_l2norm 54 | # loss name 55 | self.loss_name = loss_name 56 | self.gamma = gamma 57 | # seq_len 58 | self.seq_len = seq_len 59 | # seed 60 | tf.random.set_seed(seed) 61 | 62 | def call(self, inputs): 63 | # seq info 64 | seq_embed = self.item_embedding(inputs['click_seq']) # (None, seq_len, dim) 65 | # mask 66 | mask = tf.expand_dims(tf.cast(tf.not_equal(inputs['click_seq'], 0), dtype=tf.float32), axis=-1) # (None, seq_len, 1) 67 | # pos encoding 68 | pos_encoding = tf.expand_dims(self.pos_embedding(tf.range(self.seq_len)), axis=0) # (1, seq_len, embed_dim) 69 | seq_embed += pos_encoding # (None, seq_len, embed_dim), broadcasting 70 | 71 | seq_embed = self.dropout(seq_embed) 72 | att_outputs = seq_embed # (None, seq_len, embed_dim) 73 | att_outputs *= mask 74 | # transformer encoder part 75 | for block in self.encoder_layer: 76 | att_outputs = block([att_outputs, mask]) # (None, seq_len, embed_dim) 77 | att_outputs *= mask 78 | # user_info. There are two ways to get the user vector. 79 | # user_info = tf.reduce_mean(att_outputs, axis=1) # (None, dim) 80 | user_info = tf.slice(att_outputs, begin=[0, self.seq_len-1, 0], size=[-1, 1, -1]) # (None, 1, embed_dim) 81 | # item info contain pos_info and neg_info. 
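        # user_info is the encoder output at the last position of the sequence,
        # shape (None, 1, embed_dim); it is matched against the positive and negative
        # item embeddings by inner product below (optionally after L2 normalization).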
82 |         pos_info = self.item_embedding(tf.reshape(inputs['pos_item'], [-1, ])) # (None, dim)
83 |         neg_info = self.item_embedding(inputs['neg_item']) # (None, neg_num, dim)
84 |         # norm
85 |         if self.use_l2norm:
86 |             pos_info = tf.math.l2_normalize(pos_info, axis=-1)
87 |             neg_info = tf.math.l2_normalize(neg_info, axis=-1)
88 |             user_info = tf.math.l2_normalize(user_info, axis=-1)
89 |         pos_scores = tf.reduce_sum(tf.multiply(user_info, tf.expand_dims(pos_info, axis=1)), axis=-1) # (None, 1)
90 |         neg_scores = tf.reduce_sum(tf.multiply(user_info, neg_info), axis=-1) # (None, neg_num)
91 |         # loss
92 |         self.add_loss(get_loss(pos_scores, neg_scores, self.loss_name, self.gamma))
93 |         logits = tf.concat([pos_scores, neg_scores], axis=-1)
94 |         return logits
95 | 
96 |     def summary(self):
97 |         inputs = {
98 |             'click_seq': Input(shape=(self.seq_len,), dtype=tf.int32),
99 |             'pos_item': Input(shape=(), dtype=tf.int32),
100 |             'neg_item': Input(shape=(1,), dtype=tf.int32) # suppose neg_num=1
101 |         }
102 |         Model(inputs=inputs, outputs=self.call(inputs)).summary()
103 | 
--------------------------------------------------------------------------------
/reclearn/models/matching/youtubednn.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Mar 31, 2022
3 | Reference: "Deep Neural Networks for YouTube Recommendations", RecSys, 2016
4 | @author: Ziyao Geng(zggzy1996@163.com)
5 | """
6 | import tensorflow as tf
7 | from tensorflow.keras import Model
8 | from tensorflow.keras.layers import Input
9 | from tensorflow.keras.regularizers import l2
10 | from reclearn.layers import MLP
11 | 
12 | 
13 | class YoutubeDNN(Model):
14 |     def __init__(self, item_num, embed_dim, user_mlp, activation='relu',
15 |                  dnn_dropout=0., use_l2norm=False, neg_num=4, batch_size=512,
16 |                  embed_reg=0., seed=None):
17 |         """YoutubeDNN: The user tower of the YouTube candidate-generation (recall) model, trained with a sampled-softmax loss over the item corpus.
18 |         Args:
19 |             :param item_num: An integer type. The largest item index + 1.
20 |             :param embed_dim: An integer type. Embedding dimension of item vector.
21 |             :param user_mlp: A list of user MLP hidden units such as [128, 64, 32].
22 |                 The initial user vector is the mean of the user's historical behavior sequence embeddings.
23 |             :param activation: A string. Activation function name of the user MLP layer.
24 |             :param dnn_dropout: Float between 0 and 1. Dropout of the user MLP layer.
25 |             :param use_l2norm: A boolean. Whether the user embedding should be normalized or not.
26 |             :param neg_num: An integer type. The number of negative samples for each positive sample.
27 |             :param batch_size: An integer type. The number of samples per batch.
28 |             :param embed_reg: A float type. The regularizer of embedding.
29 |             :param seed: A Python integer to use as random seed.
30 | :return: 31 | """ 32 | super(YoutubeDNN, self).__init__() 33 | with tf.name_scope("Embedding_layer"): 34 | # item embedding 35 | self.item_embedding_table = self.add_weight(name='item_embedding_table', 36 | shape=(item_num, embed_dim), 37 | initializer='random_normal', 38 | regularizer=l2(embed_reg), 39 | trainable=True) 40 | # embedding bias 41 | self.embedding_bias = self.add_weight(name='embedding_bias', 42 | shape=(item_num,), 43 | initializer=tf.zeros_initializer(), 44 | trainable=False) 45 | # user_mlp_layer 46 | self.user_mlp_layer = MLP(user_mlp, activation, dnn_dropout) 47 | self.use_l2norm = use_l2norm 48 | self.embed_dim = embed_dim 49 | self.item_num = item_num 50 | self.neg_num = neg_num 51 | self.batch_size = batch_size 52 | # seed 53 | tf.random.set_seed(seed) 54 | 55 | def call(self, inputs, training=False): 56 | seq_embed = tf.nn.embedding_lookup(self.item_embedding_table, inputs['click_seq']) 57 | # mask 58 | mask = tf.cast(tf.not_equal(inputs['click_seq'], 0), dtype=tf.float32) # (None, seq_len) 59 | seq_embed = tf.multiply(seq_embed, tf.expand_dims(mask, axis=-1)) 60 | # user_info 61 | user_info = tf.reduce_mean(seq_embed, axis=1) # (None, embed_dim) 62 | # mlp 63 | user_info = self.user_mlp_layer(user_info) 64 | if user_info.shape[-1] != self.embed_dim: 65 | raise ValueError("The last hidden unit must be equal to the embedding dimension.") 66 | # norm 67 | if self.use_l2norm: 68 | user_info = tf.math.l2_normalize(user_info, axis=-1) 69 | if training: 70 | # train, sample softmax loss 71 | loss = tf.reduce_mean(tf.nn.sampled_softmax_loss( 72 | weights=self.item_embedding_table, 73 | biases=self.embedding_bias, 74 | labels=tf.reshape(inputs['pos_item'], shape=[-1, 1]), 75 | inputs=user_info, 76 | num_sampled=self.neg_num * self.batch_size, 77 | num_classes=self.item_num 78 | )) 79 | # add loss 80 | self.add_loss(loss) 81 | return loss 82 | else: 83 | # predict/eval 84 | pos_info = tf.nn.embedding_lookup(self.item_embedding_table, inputs['pos_item']) # (None, embed_dim) 85 | neg_info = tf.nn.embedding_lookup(self.item_embedding_table, inputs['neg_item']) # (None, neg_num, embed_dim) 86 | # calculate similar scores. 
87 |             pos_scores = tf.reduce_sum(tf.multiply(user_info, pos_info), axis=-1, keepdims=True)  # (None, 1)
88 |             neg_scores = tf.reduce_sum(tf.multiply(tf.expand_dims(user_info, axis=1), neg_info), axis=-1)  # (None, neg_num)
89 |             logits = tf.concat([pos_scores, neg_scores], axis=-1)
90 |             return logits
91 | 
92 |     def summary(self):
93 |         inputs = {
94 |             'click_seq': Input(shape=(100,), dtype=tf.int32),  # suppose sequence length=100
95 |             'pos_item': Input(shape=(), dtype=tf.int32),
96 |             'neg_item': Input(shape=(1,), dtype=tf.int32)  # suppose neg_num=1
97 |         }
98 |         Model(inputs=inputs, outputs=self.call(inputs)).summary()
--------------------------------------------------------------------------------
/reclearn/models/ranking/__init__.py:
--------------------------------------------------------------------------------
1 | from reclearn.models.ranking.fm import FM
2 | from reclearn.models.ranking.ffm import FFM
3 | from reclearn.models.ranking.deepfm import DeepFM
4 | from reclearn.models.ranking.deep_crossing import Deep_Crossing
5 | from reclearn.models.ranking.dcn import DCN
6 | from reclearn.models.ranking.nfm import NFM
7 | from reclearn.models.ranking.pnn import PNN
8 | from reclearn.models.ranking.wdl import WideDeep
9 | from reclearn.models.ranking.afm import AFM
10 | from reclearn.models.ranking.xdeepfm import xDeepFM
11 | 
12 | 
13 | __all__ = ['FM', 'FFM', 'DeepFM', 'Deep_Crossing', 'DCN', 'NFM', 'PNN',
14 |            'WideDeep', 'AFM', 'xDeepFM']
--------------------------------------------------------------------------------
/reclearn/models/ranking/afm.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on August 3, 2020
3 | Updated on Nov 13, 2021
4 | Reference: "Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks", IJCAI, 2017
5 | @author: Ziyao Geng(zggzy1996@163.com)
6 | """
7 | import itertools
8 | import tensorflow as tf
9 | from tensorflow.keras import Model
10 | from tensorflow.keras.regularizers import l2
11 | from tensorflow.keras.layers import Embedding, Dropout, Dense, Input
12 | 
13 | 
14 | class AFM(Model):
15 |     def __init__(self, feature_columns, mode, att_dim=8, activation='relu', dnn_dropout=0., embed_reg=0.):
16 |         """Attentional Factorization Machines.
17 |         Args:
18 |             :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...]
19 |             :param mode: A string. 'max' (max pooling), 'avg' (average pooling) or 'att' (attention).
20 |             :param att_dim: A scalar. The dimension of the attention vector.
21 |             :param activation: A string. Activation function of attention.
22 |             :param dnn_dropout: A scalar. Dropout of MLP.
23 |             :param embed_reg: A scalar. The regularization coefficient of embedding.
24 |         :return:
25 |         """
26 |         super(AFM, self).__init__()
27 |         self.feature_columns = feature_columns
28 |         self.mode = mode
29 |         self.embed_layers = {
30 |             feat['feat_name']: Embedding(input_dim=feat['feat_num'],
31 |                                          input_length=1,
32 |                                          output_dim=feat['embed_dim'],
33 |                                          embeddings_initializer='random_normal',
34 |                                          embeddings_regularizer=l2(embed_reg))
35 |             for feat in self.feature_columns
36 |         }
37 |         self.embed_dim = self.feature_columns[0]['embed_dim']
38 |         self.field_num = len(self.feature_columns)
39 |         if self.mode == 'att':
40 |             self.attention_W = Dense(units=att_dim, activation=activation)
41 |             self.attention_dense = Dense(units=1, activation=None)
42 |         self.dropout = Dropout(dnn_dropout)
43 |         self.dense = Dense(units=1, activation=None)
44 | 
45 |     def call(self, inputs):
46 |         # Embedding Layer
47 |         sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1)
48 |         sparse_embed = tf.reshape(sparse_embed, [-1, self.field_num, self.embed_dim])  # (None, field_num, embed_dim)
49 |         # Pair-wise Interaction Layer
50 |         row = []
51 |         col = []
52 |         for r, c in itertools.combinations(range(self.field_num), 2):
53 |             row.append(r)
54 |             col.append(c)
55 |         p = tf.gather(sparse_embed, row, axis=1)  # (None, (field_num * (field_num - 1)) / 2, k)
56 |         q = tf.gather(sparse_embed, col, axis=1)  # (None, (field_num * (field_num - 1)) / 2, k)
57 |         bi_interaction = p * q  # (None, (field_num * (field_num - 1)) / 2, k)
58 |         # mode
59 |         if self.mode == 'max':
60 |             # MaxPooling Layer
61 |             x = tf.reduce_max(bi_interaction, axis=1)  # (None, k)
62 |         elif self.mode == 'avg':
63 |             # AvgPooling Layer
64 |             x = tf.reduce_mean(bi_interaction, axis=1)  # (None, k)
65 |         else:
66 |             # Attention Layer
67 |             x = self._attention(bi_interaction)  # (None, k)
68 |         # Output Layer
69 |         outputs = tf.nn.sigmoid(self.dense(x))
70 | 
71 |         return outputs
72 | 
73 |     def _attention(self, bi_interaction):
74 |         a = self.attention_W(bi_interaction)  # (None, (field_num * (field_num - 1)) / 2, embed_dim)
75 |         a = self.attention_dense(a)  # (None, (field_num * (field_num - 1)) / 2, 1)
76 |         a_score = tf.nn.softmax(a, axis=1)  # (None, (field_num * (field_num - 1)) / 2, 1)
77 |         outputs = tf.reduce_sum(bi_interaction * a_score, axis=1)  # (None, embed_dim)
78 |         return outputs
79 | 
80 |     def summary(self):
81 |         inputs = {
82 |             feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name'])
83 |             for feat in self.feature_columns
84 |         }
85 |         Model(inputs=inputs, outputs=self.call(inputs)).summary()
86 | 
--------------------------------------------------------------------------------
/reclearn/models/ranking/dcn.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on July 13, 2020
3 | Updated on Nov 14, 2021
4 | Reference: "Deep & Cross Network for Ad Click Predictions", ADKDD, 2017
5 | @author: Ziyao Geng(zggzy1996@163.com)
6 | """
7 | 
8 | import tensorflow as tf
9 | from tensorflow.keras import Model
10 | from tensorflow.keras.layers import Embedding, Dense, Input
11 | from tensorflow.keras.regularizers import l2
12 | 
13 | from reclearn.layers import CrossNetwork, MLP
14 | 
15 | 
16 | class DCN(Model):
17 |     def __init__(self, feature_columns, hidden_units, activation='relu',
18 |                  dnn_dropout=0., embed_reg=0., cross_w_reg=0., cross_b_reg=0.):
19 |         """Deep&Cross Network.
20 |         Args:
21 |             :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...]
22 |             :param hidden_units: A list.
Neural network hidden units. 23 | :param activation: A string. Activation function of MLP. 24 | :param dnn_dropout: A scalar. Dropout of MLP. 25 | :param embed_reg: A scalar. The regularization coefficient of embedding. 26 | :param cross_w_reg: A scalar. The regularization coefficient of cross network. 27 | :param cross_b_reg: A scalar. The regularization coefficient of cross network. 28 | :return: 29 | """ 30 | super(DCN, self).__init__() 31 | self.feature_columns = feature_columns 32 | self.layer_num = len(hidden_units) 33 | self.embed_layers = { 34 | feat['feat_name']: Embedding(input_dim=feat['feat_num'], 35 | input_length=1, 36 | output_dim=feat['embed_dim'], 37 | embeddings_initializer='random_normal', 38 | embeddings_regularizer=l2(embed_reg)) 39 | for feat in self.feature_columns 40 | } 41 | self.cross_network = CrossNetwork(self.layer_num, cross_w_reg, cross_b_reg) 42 | self.dnn_network = MLP(hidden_units, activation, dnn_dropout) 43 | self.dense_final = Dense(1, activation=None) 44 | 45 | def call(self, inputs): 46 | # embedding, (batch_size, embed_dim * fields) 47 | sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1) 48 | x = sparse_embed 49 | # Cross Network 50 | cross_x = self.cross_network(x) 51 | # DNN 52 | dnn_x = self.dnn_network(x) 53 | # Concatenate 54 | total_x = tf.concat([cross_x, dnn_x], axis=-1) 55 | outputs = tf.nn.sigmoid(self.dense_final(total_x)) 56 | return outputs 57 | 58 | def summary(self): 59 | inputs = { 60 | feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name']) 61 | for feat in self.feature_columns 62 | } 63 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/ranking/deep_crossing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 27, 2020 3 | Updated on Nov 14, 2021 4 | Reference: "Deep Crossing: Web-Scale Modeling without Manually Crafted Combinatorial Features", KDD, 2016 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras import Model 9 | from tensorflow.keras.layers import Embedding, Dense, Dropout, Input 10 | from tensorflow.keras.regularizers import l2 11 | 12 | from reclearn.layers import Residual_Units 13 | 14 | 15 | class Deep_Crossing(Model): 16 | def __init__(self, feature_columns, hidden_units, dnn_dropout=0., embed_reg=0.): 17 | """Deep&Crossing. 18 | Args: 19 | :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...] 20 | :param hidden_units: A list. A list of MLP hidden units. 21 | :param dnn_dropout: A scalar. Dropout of resnet. 22 | :param embed_reg: A scalar. The regularization coefficient of embedding. 
23 | :return: 24 | """ 25 | super(Deep_Crossing, self).__init__() 26 | self.feature_columns = feature_columns 27 | self.embed_layers = { 28 | feat['feat_name']: Embedding(input_dim=feat['feat_num'], 29 | input_length=1, 30 | output_dim=feat['embed_dim'], 31 | embeddings_initializer='random_normal', 32 | embeddings_regularizer=l2(embed_reg)) 33 | for feat in self.feature_columns 34 | } 35 | # the total length of embedding layers 36 | embed_layers_len = sum([feat['embed_dim'] for feat in self.feature_columns]) 37 | self.res_network = [Residual_Units(unit, embed_layers_len) for unit in hidden_units] 38 | self.res_dropout = Dropout(dnn_dropout) 39 | self.dense = Dense(1, activation=None) 40 | 41 | def call(self, inputs): 42 | sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1) 43 | r = sparse_embed 44 | for res in self.res_network: 45 | r = res(r) 46 | r = self.res_dropout(r) 47 | outputs = tf.nn.sigmoid(self.dense(r)) 48 | return outputs 49 | 50 | def summary(self): 51 | inputs = { 52 | feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name']) 53 | for feat in self.feature_columns 54 | } 55 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/ranking/deepfm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 31, 2020 3 | Updated on Nov 14, 2021 4 | Reference: "DeepFM: A Factorization-Machine based Neural Network for CTR Prediction", 2017, IJCAI 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | import tensorflow as tf 8 | from tensorflow.keras import Model 9 | from tensorflow.keras.regularizers import l2 10 | from tensorflow.keras.layers import Embedding, Dropout, Dense, Input, Layer 11 | 12 | from reclearn.layers import New_FM, MLP 13 | from reclearn.layers.utils import index_mapping 14 | 15 | 16 | class DeepFM(Model): 17 | def __init__(self, feature_columns, hidden_units=(200, 200, 200), activation='relu', 18 | dnn_dropout=0., fm_w_reg=0., embed_reg=0.): 19 | """DeepFM 20 | Args: 21 | :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...] 22 | :param hidden_units: A list. A list of MLP hidden units. 23 | :param dnn_dropout: A scalar. Dropout of MLP. 24 | :param activation: A string. Activation function of MLP. 25 | :param fm_w_reg: A scalar. The regularization coefficient of w in fm. 26 | :param embed_reg: A scalar. The regularization coefficient of embedding. 
27 | :return 28 | """ 29 | super(DeepFM, self).__init__() 30 | self.feature_columns = feature_columns 31 | self.embed_layers = { 32 | feat['feat_name']: Embedding(input_dim=feat['feat_num'], 33 | input_length=1, 34 | output_dim=feat['embed_dim'], 35 | embeddings_initializer='random_normal', 36 | embeddings_regularizer=l2(embed_reg)) 37 | for feat in self.feature_columns 38 | } 39 | self.map_dict = {} 40 | self.feature_length = 0 41 | self.field_num = len(self.feature_columns) 42 | for feat in self.feature_columns: 43 | self.map_dict[feat['feat_name']] = self.feature_length 44 | self.feature_length += feat['feat_num'] 45 | self.embed_dim = self.feature_columns[0]['embed_dim'] # all sparse features have the same embed_dim 46 | self.fm = New_FM(self.feature_length, fm_w_reg) 47 | self.mlp = MLP(hidden_units, activation, dnn_dropout) 48 | self.dense = Dense(1, activation=None) 49 | 50 | def call(self, inputs): 51 | # embedding, (batch_size, embed_dim * fields) 52 | sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1) 53 | # wide 54 | sparse_inputs = index_mapping(inputs, self.map_dict) 55 | wide_inputs = {'sparse_inputs': sparse_inputs, 56 | 'embed_inputs': tf.reshape(sparse_embed, shape=(-1, self.field_num, self.embed_dim))} 57 | wide_outputs = tf.reshape(self.fm(wide_inputs), [-1, 1]) # (batch_size, 1) 58 | # deep 59 | deep_outputs = self.mlp(sparse_embed) 60 | deep_outputs = tf.reshape(self.dense(deep_outputs), [-1, 1]) # (batch_size, 1) 61 | # outputs 62 | outputs = tf.nn.sigmoid(tf.add(wide_outputs, deep_outputs)) 63 | return outputs 64 | 65 | def summary(self): 66 | inputs = { 67 | feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name']) 68 | for feat in self.feature_columns 69 | } 70 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/ranking/ffm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on August 26, 2020 3 | Updated on Nov 13, 2021 4 | Reference: "Field-aware Factorization Machines for CTR Prediction", RecSys, 2016 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | 8 | import tensorflow as tf 9 | from tensorflow.keras import Model 10 | from tensorflow.keras.layers import Input, Layer 11 | from tensorflow.keras.regularizers import l2 12 | 13 | from reclearn.layers import FFM_Layer 14 | 15 | 16 | class FFM(Model): 17 | def __init__(self, feature_columns, k=8, w_reg=0., v_reg=0.): 18 | """Field-aware Factorization Machines. 19 | Args: 20 | :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...] 21 | :param k: A scalar. The latent vector. 22 | :param w_reg: A scalar. The regularization coefficient of parameter w. 23 | :param v_reg: A scalar. The regularization coefficient of parameter v. 
24 |         :return:
25 |         """
26 |         super(FFM, self).__init__()
27 |         self.feature_columns = feature_columns
28 |         self.ffm = FFM_Layer(self.feature_columns, k, w_reg, v_reg)
29 | 
30 |     def call(self, inputs):
31 |         ffm_out = self.ffm(inputs)
32 |         outputs = tf.nn.sigmoid(ffm_out)
33 |         return outputs
34 | 
35 |     def summary(self):
36 |         inputs = {
37 |             feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name'])
38 |             for feat in self.feature_columns
39 |         }
40 |         Model(inputs=inputs, outputs=self.call(inputs)).summary()
41 | 
--------------------------------------------------------------------------------
/reclearn/models/ranking/fm.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on August 25, 2020
3 | Updated on Nov 11, 2021
4 | Reference: "Factorization Machines", ICDM, 2010
5 | @author: Ziyao Geng(zggzy1996@163.com)
6 | """
7 | import tensorflow as tf
8 | from tensorflow.keras import Model
9 | from tensorflow.keras.layers import Layer, Input
10 | from tensorflow.keras.regularizers import l2
11 | 
12 | from reclearn.layers import FM_Layer
13 | 
14 | 
15 | class FM(Model):
16 |     def __init__(self, feature_columns, k=8, w_reg=0., v_reg=0.):
17 |         """Factorization Machines.
18 |         Args:
19 |             :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...]
20 |             :param k: A scalar. The dimension of the latent vector.
21 |             :param w_reg: A scalar. The regularization coefficient of parameter w.
22 |             :param v_reg: A scalar. The regularization coefficient of parameter v.
23 |         :return:
24 |         """
25 |         super(FM, self).__init__()
26 |         self.feature_columns = feature_columns
27 |         self.fm = FM_Layer(feature_columns, k, w_reg, v_reg)
28 | 
29 |     def call(self, inputs):
30 |         fm_outputs = self.fm(inputs)
31 |         outputs = tf.nn.sigmoid(fm_outputs)
32 |         return outputs
33 | 
34 |     def summary(self):
35 |         inputs = {
36 |             feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name'])
37 |             for feat in self.feature_columns
38 |         }
39 |         Model(inputs=inputs, outputs=self.call(inputs)).summary()
--------------------------------------------------------------------------------
/reclearn/models/ranking/nfm.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on August 2, 2020
3 | Updated on Nov 14, 2021
4 | Reference: "Neural Factorization Machines for Sparse Predictive Analytics", SIGIR, 2017
5 | @author: Ziyao Geng(zggzy1996@163.com)
6 | """
7 | import tensorflow as tf
8 | from tensorflow.keras import Model
9 | from tensorflow.keras.layers import Embedding, Dropout, Layer, Dense, Input, BatchNormalization
10 | from tensorflow.keras.regularizers import l2
11 | 
12 | from reclearn.layers import MLP
13 | 
14 | 
15 | class NFM(Model):
16 |     def __init__(self, feature_columns, hidden_units, dnn_dropout=0., activation='relu', bn_use=True, embed_reg=0.):
17 |         """Neural Factorization Machines.
18 |         Args:
19 |             :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...]
20 |             :param hidden_units: A list. Neural network hidden units.
21 |             :param activation: A string. Activation function of dnn.
22 |             :param dnn_dropout: A scalar. Dropout of dnn.
23 |             :param bn_use: A Boolean. Use BatchNormalization or not.
24 |             :param embed_reg: A scalar. The regularization coefficient of embedding.
25 | :return: 26 | """ 27 | super(NFM, self).__init__() 28 | self.feature_columns = feature_columns 29 | self.embed_layers = { 30 | feat['feat_name']: Embedding(input_dim=feat['feat_num'], 31 | input_length=1, 32 | output_dim=feat['embed_dim'], 33 | embeddings_initializer='random_normal', 34 | embeddings_regularizer=l2(embed_reg)) 35 | for feat in self.feature_columns 36 | } 37 | self.embed_dim = self.feature_columns[0]['embed_dim'] 38 | self.field_num = len(self.feature_columns) 39 | self.bn = BatchNormalization() 40 | self.bn_use = bn_use 41 | self.dnn_network = MLP(hidden_units, activation, dnn_dropout) 42 | self.dense = Dense(1, activation=None) 43 | 44 | def call(self, inputs): 45 | # Embedding layer, (batch_size, fields * embed_dim) 46 | sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1) 47 | sparse_embed = tf.reshape(sparse_embed, [-1, self.field_num, self.embed_dim]) # (None, filed_num, embed_dim) 48 | # Bi-Interaction Layer 49 | sparse_embed = 0.5 * (tf.pow(tf.reduce_sum(sparse_embed, axis=1), 2) - 50 | tf.reduce_sum(tf.pow(sparse_embed, 2), axis=1)) # (None, embed_dim) 51 | # Concat 52 | x = sparse_embed 53 | # BatchNormalization 54 | if self.bn_use: 55 | x = self.bn(x) 56 | # Hidden Layers 57 | x = self.dnn_network(x) 58 | outputs = tf.nn.sigmoid(self.dense(x)) 59 | return outputs 60 | 61 | def summary(self): 62 | inputs = { 63 | feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name']) 64 | for feat in self.feature_columns 65 | } 66 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/ranking/pnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 20, 2020 3 | Updated on Nov 13, 2021 4 | Reference: "Product-based Neural Networks for User Response Prediction", ICDM, 2016 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | 8 | import tensorflow as tf 9 | from tensorflow.keras import Model 10 | from tensorflow.keras.layers import Embedding, Dense, Layer, Dropout, Input 11 | from tensorflow.keras.regularizers import l2 12 | 13 | from reclearn.layers import MLP 14 | 15 | 16 | class PNN(Model): 17 | def __init__(self, feature_columns, hidden_units, mode='in', dnn_dropout=0., 18 | activation='relu', embed_reg=0., w_z_reg=0., w_p_reg=0., l_b_reg=0.): 19 | """Product-based Neural Networks. 20 | Args: 21 | :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...] 22 | :param hidden_units: A list. Neural network hidden units. 23 | :param mode: A string. 'in' IPNN or 'out' OPNN. 24 | :param activation: A string. Activation function of MLP. 25 | :param dnn_dropout: A scalar. Dropout of MLP. 26 | :param embed_reg: A scalar. The regularization coefficient of embedding. 27 | :param w_z_reg: A scalar. The regularization coefficient of w_z_ in product layer. 28 | :param w_p_reg: A scalar. The regularization coefficient of w_p in product layer. 29 | :param l_b_reg: A scalar. The regularization coefficient of l_b in product layer. 
30 |         :return:
31 |         """
32 |         super(PNN, self).__init__()
33 |         # inner product or outer product
34 |         self.mode = mode
35 |         self.feature_columns = feature_columns
36 |         # the number of feature fields
37 |         self.field_num = len(self.feature_columns)
38 |         self.embed_dim = self.feature_columns[0]['embed_dim']
39 |         # The embedding dimension of each feature field must be the same
40 |         self.embed_layers = {
41 |             feat['feat_name']: Embedding(input_dim=feat['feat_num'],
42 |                                          input_length=1,
43 |                                          output_dim=feat['embed_dim'],
44 |                                          embeddings_initializer='random_normal',
45 |                                          embeddings_regularizer=l2(embed_reg))
46 |             for feat in self.feature_columns
47 |         }
48 |         # parameters
49 |         self.w_z = self.add_weight(name='w_z',
50 |                                    shape=(self.field_num, self.embed_dim, hidden_units[0]),
51 |                                    initializer='random_normal',
52 |                                    regularizer=l2(w_z_reg),
53 |                                    trainable=True
54 |                                    )
55 |         if mode == 'in':
56 |             self.w_p = self.add_weight(name='w_p',
57 |                                        shape=(self.field_num * (self.field_num - 1) // 2, self.embed_dim,
58 |                                               hidden_units[0]),
59 |                                        initializer='random_normal',
60 |                                        regularizer=l2(w_p_reg),
61 |                                        trainable=True)
62 |         # out
63 |         else:
64 |             self.w_p = self.add_weight(name='w_p',
65 |                                        shape=(self.field_num * (self.field_num - 1) // 2, self.embed_dim,
66 |                                               self.embed_dim, hidden_units[0]),
67 |                                        initializer='random_normal',
68 |                                        regularizer=l2(w_p_reg),
69 |                                        trainable=True)
70 |         self.l_b = self.add_weight(name='l_b', shape=(hidden_units[0], ),
71 |                                    initializer='random_normal',
72 |                                    regularizer=l2(l_b_reg),
73 |                                    trainable=True)
74 |         # dnn
75 |         self.dnn_network = MLP(hidden_units[1:], activation, dnn_dropout)
76 |         self.dense_final = Dense(1)
77 | 
78 |     def call(self, inputs):
79 |         # embedding
80 |         sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1)
81 |         sparse_embed = tf.reshape(sparse_embed, [-1, self.field_num, self.embed_dim])  # (None, field_num, embed_dim)
82 |         # product layer
83 |         row = []
84 |         col = []
85 |         for i in range(self.field_num - 1):
86 |             for j in range(i + 1, self.field_num):
87 |                 row.append(i)
88 |                 col.append(j)
89 |         p = tf.gather(sparse_embed, row, axis=1)
90 |         q = tf.gather(sparse_embed, col, axis=1)
91 |         if self.mode == 'in':
92 |             l_p = tf.tensordot(p*q, self.w_p, axes=2)  # (None, hidden[0])
93 |         else:  # out
94 |             u = tf.expand_dims(q, 2)  # (None, field_num(field_num-1)/2, 1, emb_dim)
95 |             v = tf.expand_dims(p, 2)  # (None, field_num(field_num-1)/2, 1, emb_dim)
96 |             l_p = tf.tensordot(tf.matmul(tf.transpose(u, [0, 1, 3, 2]), v), self.w_p, axes=3)  # (None, hidden[0])
97 | 
98 |         l_z = tf.tensordot(sparse_embed, self.w_z, axes=2)  # (None, hidden[0])
99 |         l_1 = tf.nn.relu(tf.concat([l_z + l_p + self.l_b], axis=-1))
100 |         # dnn layer
101 |         dnn_x = self.dnn_network(l_1)
102 |         outputs = tf.nn.sigmoid(self.dense_final(dnn_x))
103 |         return outputs
104 | 
105 |     def summary(self):
106 |         inputs = {
107 |             feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name'])
108 |             for feat in self.feature_columns
109 |         }
110 |         Model(inputs=inputs, outputs=self.call(inputs)).summary()
--------------------------------------------------------------------------------
/reclearn/models/ranking/wdl.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on July 9, 2020
3 | Updated on Nov 13, 2021
4 | Reference: "Wide & Deep Learning for Recommender Systems", DLRS, 2016
5 | @author: Ziyao Geng(zggzy1996@163.com)
6 | """
7 | 
8 | import tensorflow as tf
9 | from tensorflow.keras import Model
10 | from 
tensorflow.keras.layers import Dense, Embedding, Dropout, Input 11 | from tensorflow.keras.regularizers import l2 12 | 13 | from reclearn.layers import Linear, MLP 14 | from reclearn.layers.utils import index_mapping 15 | 16 | 17 | class WideDeep(Model): 18 | def __init__(self, feature_columns, hidden_units, activation='relu', 19 | dnn_dropout=0., embed_reg=0., w_reg=0.): 20 | """Wide&Deep. 21 | Args: 22 | :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...] 23 | :param hidden_units: A list. Neural network hidden units. 24 | :param activation: A string. Activation function of MLP. 25 | :param dnn_dropout: A scalar. Dropout of MLP. 26 | :param embed_reg: A scalar. The regularization coefficient of embedding. 27 | :param w_reg: A scalar. The regularization coefficient of Linear. 28 | :return 29 | """ 30 | super(WideDeep, self).__init__() 31 | self.feature_columns = feature_columns 32 | self.embed_layers = { 33 | feat['feat_name']: Embedding(input_dim=feat['feat_num'], 34 | input_length=1, 35 | output_dim=feat['embed_dim'], 36 | embeddings_initializer='random_normal', 37 | embeddings_regularizer=l2(embed_reg)) 38 | for feat in self.feature_columns 39 | } 40 | self.map_dict = {} 41 | self.feature_length = 0 42 | for feat in self.feature_columns: 43 | self.map_dict[feat['feat_name']] = self.feature_length 44 | self.feature_length += feat['feat_num'] 45 | self.dnn_network = MLP(hidden_units, activation, dnn_dropout) 46 | self.linear = Linear(self.feature_length, w_reg=w_reg) 47 | self.final_dense = Dense(1, activation=None) 48 | 49 | def call(self, inputs): 50 | sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1) 51 | x = sparse_embed # (batch_size, field * embed_dim) 52 | # Wide 53 | wide_inputs = index_mapping(inputs, self.map_dict) 54 | wide_inputs = tf.concat([value for _, value in wide_inputs.items()], axis=-1) 55 | wide_out = self.linear(wide_inputs) 56 | # Deep 57 | deep_out = self.dnn_network(x) 58 | deep_out = self.final_dense(deep_out) 59 | # out 60 | outputs = tf.nn.sigmoid(0.5 * wide_out + 0.5 * deep_out) 61 | return outputs 62 | 63 | def summary(self): 64 | inputs = { 65 | feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name']) 66 | for feat in self.feature_columns 67 | } 68 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /reclearn/models/ranking/xdeepfm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on August 20, 2020 3 | Updated on Nov 14, 2021 4 | Reference: "xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems", KDD, 2018 5 | @author: Ziyao Geng(zggzy1996@163.com) 6 | """ 7 | 8 | import tensorflow as tf 9 | from tensorflow.keras import Model 10 | from tensorflow.keras.layers import Embedding, Dropout, Flatten, Dense, Input 11 | from tensorflow.keras.regularizers import l2 12 | 13 | from reclearn.layers import Linear, MLP, CIN 14 | from reclearn.layers.utils import index_mapping 15 | 16 | 17 | class xDeepFM(Model): 18 | def __init__(self, feature_columns, hidden_units, cin_size, activation='relu', dnn_dropout=0, 19 | embed_reg=0., cin_reg=0., w_reg=0.): 20 | """xDeepFM. 21 | Args: 22 | :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...] 23 | :param hidden_units: A list. Neural network hidden units. 24 | :param cin_size: A list. a list of the number of CIN layers. 
25 | :param activation: A string. activation function of MLP. 26 | :param dnn_dropout: A scalar. dropout of MLP. 27 | :param embed_reg: A scalar. The regularization coefficient of embedding. 28 | :param cin_reg: A scalar. The regularization coefficient of CIN. 29 | :param w_reg: A scalar. The regularization coefficient of Linear. 30 | :return: 31 | """ 32 | super(xDeepFM, self).__init__() 33 | self.feature_columns = feature_columns 34 | self.embed_dim = self.feature_columns[0]['embed_dim'] 35 | self.embed_layers = { 36 | feat['feat_name']: Embedding(input_dim=feat['feat_num'], 37 | input_length=1, 38 | output_dim=feat['embed_dim'], 39 | embeddings_initializer='random_normal', 40 | embeddings_regularizer=l2(embed_reg)) 41 | for feat in self.feature_columns 42 | } 43 | self.map_dict = {} 44 | self.feature_length = 0 45 | for feat in self.feature_columns: 46 | self.map_dict[feat['feat_name']] = self.feature_length 47 | self.feature_length += feat['feat_num'] 48 | self.field_num = len(self.feature_columns) 49 | self.linear = Linear(self.feature_length, w_reg) 50 | self.cin = CIN(cin_size=cin_size, l2_reg=cin_reg) 51 | self.mlp = MLP(hidden_units=hidden_units, activation=activation, dnn_dropout=dnn_dropout) 52 | self.cin_dense = Dense(1) 53 | self.dnn_dense = Dense(1) 54 | self.bias = self.add_weight(name='bias', shape=(1, ), initializer=tf.zeros_initializer()) 55 | 56 | def call(self, inputs): 57 | # Linear 58 | linear_inputs = index_mapping(inputs, self.map_dict) 59 | linear_inputs = tf.concat([value for _, value in linear_inputs.items()], axis=-1) 60 | linear_out = self.linear(linear_inputs) # (batch_size, 1) 61 | # cin 62 | sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1) 63 | embed_matrix = tf.reshape(sparse_embed, [-1, self.field_num, self.embed_dim]) # (None, filed_num, embed_dim) 64 | cin_out = self.cin(embed_matrix) # (batch_size, dim) 65 | cin_out = self.cin_dense(cin_out) # (batch_size, 1) 66 | # dnn 67 | embed_vector = tf.reshape(embed_matrix, shape=(-1, embed_matrix.shape[1] * embed_matrix.shape[2])) 68 | dnn_out = self.mlp(embed_vector) 69 | dnn_out = self.dnn_dense(dnn_out) # (batch_size, 1)) 70 | # output 71 | output = tf.nn.sigmoid(linear_out + cin_out + dnn_out + self.bias) 72 | return output 73 | 74 | def summary(self): 75 | inputs = { 76 | feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name']) 77 | for feat in self.feature_columns 78 | } 79 | Model(inputs=inputs, outputs=self.call(inputs)).summary() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="reclearn", 8 | version="1.1.0", 9 | author="Ziyao Geng", 10 | author_email="zggzy1996@163.com", 11 | description="A simple package about learning recommendation", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/ZiyaoGeng/RecLearn", 15 | packages=setuptools.find_packages(), 16 | python_requires=">=3.8", 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | ], 22 | license="MIT", 23 | ) --------------------------------------------------------------------------------
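The ranking models above all accept the same `feature_columns` format described in their docstrings, `[{'feat_name':, 'feat_num':, 'embed_dim':}, ...]`, and return a sigmoid probability from `call`. The following is a minimal usage sketch, not part of the repository: the field names, vocabulary sizes, hyper-parameters, and synthetic data are assumptions made purely for illustration, and it assumes the standard Keras `compile`/`fit` workflow applies to these subclassed models.

```python
# Minimal usage sketch (illustrative only): train DeepFM on synthetic click data.
# All field names, cardinalities, and hyper-parameters below are assumptions,
# not values taken from the repository.
import numpy as np
from reclearn.models.ranking import DeepFM

# Two hypothetical sparse fields; 'feat_num' is the vocabulary size of each field.
feature_columns = [
    {'feat_name': 'user_id', 'feat_num': 1000, 'embed_dim': 8},
    {'feat_name': 'item_id', 'feat_num': 5000, 'embed_dim': 8},
]

model = DeepFM(feature_columns, hidden_units=(64, 32))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

# Synthetic training data: one integer id per field, binary click labels.
n = 4096
x = {'user_id': np.random.randint(0, 1000, size=n),
     'item_id': np.random.randint(0, 5000, size=n)}
y = np.random.randint(0, 2, size=n).astype('float32')

model.fit(x, y, batch_size=512, epochs=1)
model.summary()
```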