├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_CN.md ├── assets ├── recstudio_framework.png └── recstudio_logo.png ├── environment.yml ├── example ├── example.ipynb └── sasrec_demo.py ├── nni-experiments ├── config │ ├── bpr.yaml │ └── sasrec.yaml └── search_space │ ├── bpr.yaml │ └── sasrec.yaml ├── recstudio ├── __init__.py ├── ann │ ├── __init__.py │ └── sampler.py ├── data │ ├── __init__.py │ ├── advance_dataset.py │ ├── config │ │ ├── all.yaml │ │ ├── amazon-beauty.yaml │ │ ├── amazon-books.yaml │ │ ├── amazon-electronics.yaml │ │ ├── criteo.yaml │ │ ├── gowalla.yaml │ │ ├── kuairand-pure.yaml │ │ ├── ml-100k.yaml │ │ ├── ml-10m.yaml │ │ ├── ml-1m.yaml │ │ ├── ml-20m.yaml │ │ ├── tmall.yaml │ │ └── yelp.yaml │ └── dataset.py ├── dataset_demo │ ├── __init__.py │ └── ml-100k │ │ ├── ml-100k.inter │ │ ├── ml-100k.item │ │ ├── ml-100k.kg │ │ ├── ml-100k.link │ │ ├── ml-100k.user │ │ └── social.txt ├── eval │ └── __init__.py ├── model │ ├── __init__.py │ ├── ae │ │ ├── config │ │ │ ├── all.yaml │ │ │ ├── multidae.yaml │ │ │ └── multivae.yaml │ │ ├── multidae.py │ │ └── multivae.py │ ├── basemodel │ │ ├── __init__.py │ │ ├── basemodel.yaml │ │ ├── baseranker.py │ │ ├── baseretriever.py │ │ └── recommender.py │ ├── debias │ │ └── __init__.py │ ├── fm │ │ ├── __init__.py │ │ ├── afm.py │ │ ├── afn.py │ │ ├── aoanet.py │ │ ├── autoint.py │ │ ├── ccpm.py │ │ ├── config │ │ │ ├── afm.yaml │ │ │ ├── afn.yaml │ │ │ ├── all.yaml │ │ │ ├── aoanet.yaml │ │ │ ├── autoint.yaml │ │ │ ├── ccpm.yaml │ │ │ ├── dcn.yaml │ │ │ ├── dcnv2.yaml │ │ │ ├── deepcrossing.yaml │ │ │ ├── deepfm.yaml │ │ │ ├── deepim.yaml │ │ │ ├── destine.yaml │ │ │ ├── difm.yaml │ │ │ ├── dlrm.yaml │ │ │ ├── edcn.yaml │ │ │ ├── ffm.yaml │ │ │ ├── fgcnn.yaml │ │ │ ├── fibinet.yaml │ │ │ ├── fignn.yaml │ │ │ ├── finalmlp.yaml │ │ │ ├── flen.yaml │ │ │ ├── fm.yaml │ │ │ ├── fmfm.yaml │ │ │ ├── fwfm.yaml │ │ │ ├── hfm.yaml │ │ │ ├── ifm.yaml │ │ │ ├── interhat.yaml │ │ │ ├── lorentzfm.yaml │ │ │ ├── lr.yaml │ │ │ ├── masknet.yaml │ │ │ ├── nfm.yaml │ │ │ ├── onn.yaml │ │ │ ├── pnn.yaml │ │ │ ├── ppnet.yaml │ │ │ ├── sam.yaml │ │ │ ├── widedeep.yaml │ │ │ └── xdeepfm.yaml │ │ ├── dcn.py │ │ ├── dcnv2.py │ │ ├── deepcrossing.py │ │ ├── deepfm.py │ │ ├── deepim.py │ │ ├── destine.py │ │ ├── difm.py │ │ ├── dlrm.py │ │ ├── edcn.py │ │ ├── ffm.py │ │ ├── fgcnn.py │ │ ├── fibinet.py │ │ ├── fignn.py │ │ ├── finalmlp.py │ │ ├── flen.py │ │ ├── fm.py │ │ ├── fmfm.py │ │ ├── fwfm.py │ │ ├── hfm.py │ │ ├── ifm.py │ │ ├── interhat.py │ │ ├── lorentzfm.py │ │ ├── lr.py │ │ ├── masknet.py │ │ ├── nfm.py │ │ ├── onn.py │ │ ├── pnn.py │ │ ├── ppnet.py │ │ ├── sam.py │ │ ├── widedeep.py │ │ └── xdeepfm.py │ ├── graph │ │ ├── __init__.py │ │ ├── config │ │ │ ├── all.yaml │ │ │ ├── lightgcn.yaml │ │ │ ├── ncl.yaml │ │ │ ├── ngcf.yaml │ │ │ ├── sgl.yaml │ │ │ └── simgcl.yaml │ │ ├── lightgcn.py │ │ ├── ncl.py │ │ ├── ngcf.py │ │ ├── sgl.py │ │ └── simgcl.py │ ├── init.py │ ├── kg │ │ ├── KGLearning.py │ │ ├── __init__.py │ │ ├── cfkg.py │ │ ├── cke.py │ │ ├── config │ │ │ ├── all.yaml │ │ │ ├── cfkg.yaml │ │ │ ├── cke.yaml │ │ │ ├── kgat.yaml │ │ │ ├── kgcn.yaml │ │ │ ├── kgin.yaml │ │ │ ├── kgnnls.yaml │ │ │ ├── ktup.yaml │ │ │ ├── mkr.yaml │ │ │ └── ripplenet.yaml │ │ ├── kgat.py │ │ ├── kgcn.py │ │ ├── kgin.py │ │ ├── kgnnls.py │ │ ├── ktup.py │ │ ├── mkr.py │ │ └── ripplenet.py │ ├── loss_func.py │ ├── mf │ │ ├── __init__.py │ │ ├── bpr.py │ │ ├── cml.py │ │ ├── config │ │ │ ├── all.yaml │ │ │ ├── bpr.yaml │ │ │ ├── cml.yaml │ │ │ 
├── dssm.yaml │ │ │ ├── ease.yaml │ │ │ ├── irgan.yaml │ │ │ ├── itemknn.yaml │ │ │ ├── logisticmf.yaml │ │ │ ├── ncf.yaml │ │ │ ├── pmf.yaml │ │ │ ├── slim.yaml │ │ │ └── wrmf.yaml │ │ ├── dssm.py │ │ ├── ease.py │ │ ├── irgan.py │ │ ├── itemknn.py │ │ ├── logisticmf.py │ │ ├── ncf.py │ │ ├── pmf.py │ │ ├── slim.py │ │ └── wrmf.py │ ├── module │ │ ├── __init__.py │ │ ├── ctr.py │ │ ├── data_augmentation.py │ │ ├── functional.py │ │ ├── graphmodule.py │ │ ├── gru.py │ │ └── layers.py │ ├── multitask │ │ ├── __init__.py │ │ ├── aitm.py │ │ ├── config │ │ │ ├── aitm.yaml │ │ │ ├── all.yaml │ │ │ ├── hardshare.yaml │ │ │ ├── mmoe.yaml │ │ │ └── ple.yaml │ │ ├── hardshare.py │ │ ├── mmoe.py │ │ └── ple.py │ ├── ranker.py │ ├── retriever.py │ ├── scorer.py │ └── seq │ │ ├── __init__.py │ │ ├── bert4rec.py │ │ ├── caser.py │ │ ├── cl4srec.py │ │ ├── config │ │ ├── all.yaml │ │ ├── bert4rec.yaml │ │ ├── caser.yaml │ │ ├── cl4srec.yaml │ │ ├── coserec.yaml │ │ ├── din.yaml │ │ ├── fpmc.yaml │ │ ├── gru4rec.yaml │ │ ├── hgn.yaml │ │ ├── iclrec.yaml │ │ ├── narm.yaml │ │ ├── npe.yaml │ │ ├── sasrec.yaml │ │ ├── stamp.yaml │ │ └── transrec.yaml │ │ ├── coserec.py │ │ ├── din.py │ │ ├── fpmc.py │ │ ├── gru4rec.py │ │ ├── hgn.py │ │ ├── iclrec.py │ │ ├── narm.py │ │ ├── npe.py │ │ ├── sasrec.py │ │ ├── stamp.py │ │ └── transrec.py ├── quickstart │ ├── __init__.py │ ├── config_dataset.py │ └── run.py └── utils │ ├── __init__.py │ ├── arguments.py │ ├── callbacks.py │ ├── compress_file.py │ ├── data_parallel.py │ ├── trainer.py │ └── utils.py ├── run.py ├── setup.py └── test ├── test_config_dataset.py ├── test_dataset.py ├── test_ddp.py ├── test_quickrun.py ├── test_retriever.py └── test_training_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | share/python-wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | *.py,cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | cover/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | .pybuilder/ 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | # For a library or package, you might want to ignore these files since the code is 86 | # intended to run in multiple environments; otherwise, check them in: 87 | # .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # pytype static type analyzer 134 | .pytype/ 135 | 136 | # Cython debug symbols 137 | cython_debug/ 138 | 139 | tensorboard/ 140 | .vscode/ 141 | **/log 142 | **/saved 143 | **/.recstudio 144 | datasets/* 145 | !datasets/ml-100k/ 146 | 147 | .recstudio/ 148 | 149 | nni-experiments/* 150 | !nni-experiments/config/ 151 | !nni-experiments/search_space/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2022] [ustcml] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include recstudio/ *.yaml 2 | recursive-include recstudio/dataset_demo *.inter *.item *.kg *.link *.user *.txt 3 | -------------------------------------------------------------------------------- /assets/recstudio_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/assets/recstudio_framework.png -------------------------------------------------------------------------------- /assets/recstudio_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/assets/recstudio_logo.png -------------------------------------------------------------------------------- /nni-experiments/config/bpr.yaml: -------------------------------------------------------------------------------- 1 | experimentName: BPR-ml-100k # Mnemonic name of the experiment, which will be shown in WebUI and nnictl 2 | searchSpaceFile: ../search_space/bpr.yaml # Path to the YAML file containing the search space 3 | 4 | trialCommand: python3 run.py -m BPR -d ml-100k --mode tune # Command to launch a trial 5 | trialCodeDirectory: ../.. # Path to the directory containing trial source files 6 | trialConcurrency: 4 # Specify how many trials should be run concurrently 7 | trialGpuNumber: 1 8 | 9 | maxExperimentDuration: 100d # Stop generating trials once this duration is reached 10 | maxTrialNumber: 1000 # Limit the number of trials to create 11 | maxTrialDuration: ~ # Limit the duration of each trial job 12 | 13 | experimentWorkingDirectory: ../ # The directory to place logs, checkpoints, metadata, etc. 14 | 15 | 16 | tuner: # Tuning algorithm 17 | name: TPE 18 | classArgs: 19 | optimize_mode: maximize # Optimization direction, consistent with the first metric in 20 | # train/val_metrics and train/test_metrics 21 | 22 | 23 | # assessor: # used to terminate trials early 24 | # name: Curvefitting 25 | # classArgs: 26 | # epoch_num: 200 27 | # start_step: 20 28 | # threshold: 0.9 29 | # gap: 1 30 | 31 | # assessor: 32 | # name: Medianstop 33 | # classArgs: 34 | # optimize_mode: maximize 35 | # start_step: 200 36 | 37 | 38 | trainingService: 39 | platform: local 40 | useActiveGpu: true 41 | maxTrialNumberPerGpu: 2 # how many trials can share one GPU 42 | gpuIndices: 0, 1 # GPUs visible to trial processes -------------------------------------------------------------------------------- /nni-experiments/config/sasrec.yaml: -------------------------------------------------------------------------------- 1 | experimentName: SASRec-ml-100k # Mnemonic name of the experiment, which will be shown in WebUI and nnictl 2 | searchSpaceFile: ../search_space/sasrec.yaml # Path to the YAML file containing the search space 3 | 4 | trialCommand: python3 run.py -m SASRec -d ml-100k --mode tune # Command to launch a trial 5 | trialCodeDirectory: ../..
# Path to the directory containing trial source files 6 | trialConcurrency: 4 # Specify how many trials should be run concurrently 7 | trialGpuNumber: 1 8 | 9 | maxExperimentDuration: 100d # Stop generating trials once this duration is reached 10 | maxTrialNumber: 1000 # Limit the number of trials to create 11 | maxTrialDuration: ~ # Limit the duration of each trial job 12 | 13 | experimentWorkingDirectory: ../ # The directory to place log, checkpoint, metadata, and other run-time stuff 14 | 15 | 16 | tuner: # Tuning algorithm 17 | name: TPE 18 | classArgs: 19 | optimize_mode: maximize # Optimization direction, consistent with the first metric in 20 | # train/val_metrics and train/test_metrics 21 | 22 | 23 | # assessor: # used to terminate trials early 24 | # name: Curvefitting 25 | # classArgs: 26 | # epoch_num: 200 27 | # start_step: 20 28 | # threshold: 0.9 29 | # gap: 1 30 | 31 | # assessor: 32 | # name: Medianstop 33 | # classArgs: 34 | # optimize_mode: maximize 35 | # start_step: 200 36 | 37 | 38 | trainingService: 39 | platform: local 40 | useActiveGpu: true 41 | maxTrialNumberPerGpu: 2 # how many trials can share one GPU 42 | gpuIndices: 0, 1 # GPUs visible to trial processes -------------------------------------------------------------------------------- /nni-experiments/search_space/bpr.yaml: -------------------------------------------------------------------------------- 1 | train/learning_rate: 2 | _type: choice 3 | _value: [0.001] 4 | 5 | train/weight_decay: 6 | _type: choice 7 | _value: [0.0001, 0.0005, 0.001, 0.005] -------------------------------------------------------------------------------- /nni-experiments/search_space/sasrec.yaml: -------------------------------------------------------------------------------- 1 | train/learning_rate: 2 | _type: choice 3 | _value: [0.0005, 0.001, 0.005] 4 | 5 | model/dropout_rate: 6 | _type: choice 7 | _value: [0.2, 0.5] -------------------------------------------------------------------------------- /recstudio/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | __version__ = '0.0.2' 6 | LOG_DIR = r"./log/" 7 | DEFAULT_CACHE_DIR = r"./.recstudio/" 8 | -------------------------------------------------------------------------------- /recstudio/ann/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/ann/__init__.py -------------------------------------------------------------------------------- /recstudio/data/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset, SeqDataset, UserDataset, FullSeqDataset 2 | from recstudio.data.advance_dataset import ALSDataset 3 | 4 | import os 5 | 6 | supported_dataset = [] 7 | for f in os.listdir(os.path.join(os.path.dirname(__file__), 'config')): 8 | if f != "all.yaml": 9 | supported_dataset.append(f.split(".")[0]) 10 | -------------------------------------------------------------------------------- /recstudio/data/config/all.yaml: -------------------------------------------------------------------------------- 1 | url: ~ 2 | user_id_field: &u user_id:token # `&u` defines a YAML anchor; see the note just below 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 
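# Note on the `&u` / `*u` syntax used in this file and the other dataset configs:
# these are standard YAML anchors and aliases, not RecStudio-specific. `&u user_id:token`
# stores the scalar "user_id:token" under the anchor `u`, and every later alias `*u`
# (e.g. in `inter_feat_field` below) expands to that same scalar when the file is parsed.
# Renaming an id column therefore only requires editing the anchored line, for example:
#   user_id_field: &u uid:token   # every *u below now expands to "uid:token"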
8 | encoding_method: utf-8 9 | 10 | inter_feat_name: ~ 11 | inter_feat_field: [*u, *i, *r, *t] 12 | inter_feat_header: ~ 13 | 14 | user_feat_name: ~ 15 | user_feat_field: [[*u, age:token, gender:token, occupation:token]] 16 | user_feat_header: ~ 17 | 18 | 19 | item_feat_name: ~ 20 | item_feat_field: [[*i, movie_title:token_seq:" ", release_year:token, class:token_seq:" "]] 21 | item_feat_header: ~ 22 | 23 | 24 | field_separator: "\t" 25 | min_user_inter: 0 26 | min_item_inter: 0 27 | field_max_len: ~ # a YAML-format dict, for example 28 | # field_max_len: 29 | # age: 1 30 | # gender: 1 31 | # occupation: 1 32 | low_rating_thres: ~ # low-rating threshold, used to drop low-rating interactions 33 | # drop_low_rating: True # if true, the interactions with rating lower than `rating_thres` would be dropped. 34 | 35 | # negative rating threshold: interactions with rating below the threshold would be regarded as negative interactions. 36 | # Note that when `drop_low_rating` is True, only interactions with rating above `low_rating_thres` and below `negative_rating_thres` 37 | # would be regarded as negative interactions. 38 | # The threshold value should be larger than `low_rating_thres`. If not, the threshold would be invalid, which means all kept interactions 39 | # would be regarded as positives. 40 | # negative_rating_thres: 0.0 41 | 42 | # `binarized_rating_thres` controls whether to binarize the rating to 0/1 with the given threshold. 43 | # If set, ratings above the threshold would be mapped to 1 and ratings below it would be mapped to 0; 44 | # If not set, the ratings would not be changed. 45 | binarized_rating_thres: ~ 46 | 47 | drop_dup: True 48 | max_seq_len: 20 49 | 50 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 51 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 52 | mapped_feat_field: [*u, *i] 53 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 54 | network_feat_header: [0, 0] 55 | 56 | # sklearn.preprocessing (arguments are supported; args are separated with blank spaces; the same holds for tuples) 57 | # MinMaxScaler(), StandardScaler(), RobustScaler(), MaxAbsScaler() 58 | # Binarizer(), KBinsDiscretizer(encode="ordinal") 59 | # Normalizer() 60 | # KernelCenterer() 61 | # QuantileTransformer(), SplineTransformer() 62 | # Customized: LogTransformer(), or use FunctionTransformer(...) 63 | float_field_preprocess: ~ # [float_field:MinMaxScaler(), ...] 64 | 65 | save_cache: False # whether to save processed dataset to cache.
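The `float_field_preprocess` entries above name scikit-learn transformers by their constructor expression. As a rough illustration (this is not RecStudio's internal code), applying an entry such as `timestamp:StandardScaler()` amounts to fitting the named transformer on the float column and replacing the column with the transformed values:

import numpy as np
from sklearn.preprocessing import StandardScaler

# Toy stand-in for a timestamp column, shaped (n_rows, 1) as sklearn expects.
timestamps = np.array([[874965758.0], [875071561.0], [878542960.0]])
scaled = StandardScaler().fit_transform(timestamps)  # zero mean, unit variance
print(scaled.ravel())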
66 | -------------------------------------------------------------------------------- /recstudio/data/config/amazon-beauty.yaml: -------------------------------------------------------------------------------- 1 | url: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Beauty.csv 2 | user_id_field: &u user_id:token # TODO: comments for &u and *u 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: ratings_Beauty.csv 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | item_feat_name: ~ 18 | item_feat_field: ~ 19 | item_feat_header: ~ 20 | 21 | field_separator: "," 22 | min_user_inter: 5 23 | min_item_inter: 5 24 | field_max_len: ~ 25 | low_rating_thres: 3.0 26 | max_seq_len: 50 27 | 28 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 29 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 30 | mapped_feat_field: [*u, *i] 31 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 32 | 33 | save_cache: True # whether to save processed dataset to cache. 34 | -------------------------------------------------------------------------------- /recstudio/data/config/amazon-books.yaml: -------------------------------------------------------------------------------- 1 | url: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Books.csv 2 | user_id_field: &u user_id:token 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: ratings_Books.csv 10 | inter_feat_field: [*u, *i, *r, *t,] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | item_feat_name: ~ 18 | item_feat_field: ~ 19 | item_feat_header: ~ 20 | 21 | use_fields: ~ # TODO: 22 | field_separator: "," 23 | min_user_inter: 5 24 | min_item_inter: 5 25 | field_max_len: ~ 26 | low_rating_thres: ~ 27 | max_seq_len: 20 28 | 29 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 30 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 31 | mapped_feat_field: [*u, *i] 32 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 33 | 34 | save_cache: True # whether to save processed dataset to cache. 
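amazon-beauty above sets `low_rating_thres: 3.0` while amazon-books leaves it unset. Per the comments in data/config/all.yaml, the threshold drops low-rating interactions, and `binarized_rating_thres` turns the surviving ratings into 0/1 labels. A minimal pandas sketch of that logic (an illustration under those assumptions, not the library's dataset code, which may use strict inequalities):

import pandas as pd

inter = pd.DataFrame({"user_id": [1, 1, 2], "item_id": [10, 11, 10], "rating": [5.0, 2.0, 4.0]})
low_rating_thres = binarized_rating_thres = 3.0
inter = inter[inter["rating"] >= low_rating_thres]  # drop low-rating interactions
inter["rating"] = (inter["rating"] >= binarized_rating_thres).astype(float)  # binarize survivors
print(inter)  # the rating-2.0 row is gone; remaining ratings are 1.0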
35 | -------------------------------------------------------------------------------- /recstudio/data/config/amazon-electronics.yaml: -------------------------------------------------------------------------------- 1 | url: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Electronics.csv 2 | user_id_field: &u user_id:token 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: ratings_Electronics.csv 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | item_feat_name: ~ 18 | item_feat_field: ~ 19 | item_feat_header: ~ 20 | 21 | use_fields: ~ # TODO: 22 | field_separator: "," 23 | min_user_inter: 5 24 | min_item_inter: 5 25 | field_max_len: ~ 26 | low_rating_thres: 3 27 | max_seq_len: 20 28 | 29 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 30 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 31 | mapped_feat_field: [*u, *i] 32 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 33 | 34 | save_cache: True # whether to save processed dataset to cache. 35 | -------------------------------------------------------------------------------- /recstudio/data/config/criteo.yaml: -------------------------------------------------------------------------------- 1 | url: https://rec.ustc.edu.cn/share/f519a9c0-e593-11ed-a011-2d240ca5a5b5 2 | user_id_field: ~ 3 | item_id_field: ~ 4 | rating_field: &r rating:float 5 | time_field: ~ 6 | time_format: ~ 7 | 8 | 9 | inter_feat_name: train.txt 10 | inter_feat_field: [*r, I1:float, I2:float, I3:float, I4:float, I5:float, I6:float, I7:float, I8:float, I9:float, I10:float, I11:float, I12:float, I13:float, C1:token, C2:token, C3:token, C4:token, C5:token, C6:token, C7:token, C8:token, C9:token, C10:token, C11:token, C12:token, C13:token, C14:token, C15:token, C16:token, C17:token, C18:token, C19:token, C20:token, C21:token, C22:token, C23:token, C24:token, C25:token, C26:token] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | 18 | item_feat_name: ~ 19 | item_feat_field: ~ 20 | item_feat_header: ~ 21 | 22 | 23 | field_separator: "\t" 24 | min_user_inter: 0 25 | min_item_inter: 0 26 | field_max_len: ~ 27 | low_rating_thres: ~ 28 | drop_dup: False 29 | max_seq_len: 20 30 | 31 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 32 | network_feat_name: ~ 33 | mapped_feat_field: ~ 34 | network_feat_field: ~ 35 | network_feat_header: ~ 36 | save_cache: True # whether to save processed dataset to cache. 
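Each YAML file in data/config registers one dataset name; recstudio/data/__init__.py earlier builds `supported_dataset` by listing this directory. To actually train on one of these datasets, the nni-experiments configs invoke `python3 run.py -m BPR -d ml-100k`; the equivalent from Python, assuming `recstudio.quickstart.run` keeps the model/dataset keywords suggested by quickstart/run.py and test/test_quickrun.py, is:

from recstudio.quickstart import run

# Train and evaluate BPR on the bundled ml-100k demo dataset with default configs.
run(model='BPR', dataset='ml-100k')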
-------------------------------------------------------------------------------- /recstudio/data/config/gowalla.yaml: -------------------------------------------------------------------------------- 1 | url: https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz 2 | user_id_field: &u user:token 3 | item_id_field: &i location_id:token 4 | rating_field: ~ 5 | time_field: &t check_in_time:str 6 | time_format: "%Y-%m-%dT%H:%M:%Sz" 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: loc-gowalla_totalCheckins.txt 10 | inter_feat_field: [*u, *t, latitude:float, longitude:float, *i] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | item_feat_name: ~ 18 | item_feat_field: ~ 19 | item_feat_header: ~ 20 | 21 | use_fields: ~ 22 | field_separator: "\t" 23 | seq_separator: " " 24 | min_user_inter: 5 25 | min_item_inter: 5 26 | field_max_len: ~ 27 | low_rating_thres: ~ 28 | max_seq_len: 20 29 | 30 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 31 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 32 | mapped_feat_field: [*u, *i] 33 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 34 | 35 | save_cache: True # whether to save processed dataset to cache. 36 | -------------------------------------------------------------------------------- /recstudio/data/config/kuairand-pure.yaml: -------------------------------------------------------------------------------- 1 | url: ~ # Please visit https://kuairand.com/ to download the dataset 2 | user_id_field: &u user_id:token 3 | item_id_field: &i video_id:token 4 | rating_field: [is_click:float, is_like:float, is_follow:float, is_comment:float, is_forward:float, is_hate:float] 5 | time_field: &t date:float 6 | time_format: ~ 7 | 8 | 9 | inter_feat_name: log_standard_4_22_to_5_08_pure.csv 10 | inter_feat_field: [*u, *i, *t, hourmin:float, time_ms:float, is_click:float, is_like:float, is_follow:float, is_comment:float, is_forward:float, is_hate:float, long_view:float, play_time_ms:float, duration_ms:float, profile_stay_time:float, comment_stay_time:float, is_profile_enter:float, is_rand:float, tab:float] 11 | inter_feat_header: 0 12 | 13 | user_feat_name: [user_features_pure.csv] 14 | user_feat_field: [[*u, user_active_degree:token, is_lowactive_period:float, is_live_streamer:float, is_video_author:float, follow_user_num:float, follow_user_num_range:token, fans_user_num:float, fans_user_num_range:token, friend_user_num:float, friend_user_num_range:token, register_days:float, register_days_range:token, onehot_feat0:float, onehot_feat1:float, onehot_feat2:float, onehot_feat3:float, onehot_feat4:float, onehot_feat5:float, onehot_feat6:float, onehot_feat7:float, onehot_feat8:float, onehot_feat9:float, onehot_feat10:float, onehot_feat11:float, onehot_feat12:float, onehot_feat13:float, onehot_feat14:float, onehot_feat15:float, onehot_feat16:float, onehot_feat17:float]] 15 | user_feat_header: 0 16 | 17 | 18 | item_feat_name: ~ #[video_features_basic_pure.csv, video_features_statistic_pure.csv] 19 | item_feat_field: ~ # [[...]] 20 | item_feat_header: 0 21 | 22 | 23 | field_separator: "," 24 | min_user_inter: 0 25 | min_item_inter: 0 26 | field_max_len: ~ 27 | low_rating_thres: ~ 28 | max_seq_len: ~ 29 | 30 | save_cache: True 31 | --------------------------------------------------------------------------------
/recstudio/data/config/ml-100k.yaml: -------------------------------------------------------------------------------- 1 | url: "recstudio:dataset_demo/ml-100k" 2 | user_id_field: &u user_id:token # see the note on YAML anchors (&u / *u) in all.yaml 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | 9 | inter_feat_name: ml-100k.inter 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: 0 12 | 13 | user_feat_name: [ml-100k.user] 14 | user_feat_field: [[*u, age:token, gender:token, occupation:token, zip_code:token]] 15 | user_feat_header: 0 16 | 17 | 18 | item_feat_name: ~ # [ml-100k.item] 19 | item_feat_field: [[*i, movie_title:token_seq:" ", release_year:token, class:token_seq:" "]] 20 | item_feat_header: 0 21 | 22 | 23 | field_separator: "\t" 24 | min_user_inter: 0 25 | min_item_inter: 0 26 | field_max_len: ~ 27 | low_rating_thres: 3.0 28 | max_seq_len: 20 29 | 30 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 31 | network_feat_name: ~ # [[social.txt], [ml-100k.kg, ml-100k.link]] 32 | # mapped_feat_field: [*u, *i] 33 | mapped_feat_field: [[*u, *u], [*i, ~, *i]] 34 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, relation_id:token, tail_id:token], [*i, entity_id:token]]] 35 | network_feat_header: [[0], [0, 0]] 36 | 37 | save_cache: False # whether to save processed dataset to cache. 38 | float_field_preprocess: [timestamp:StandardScaler()] -------------------------------------------------------------------------------- /recstudio/data/config/ml-10m.yaml: -------------------------------------------------------------------------------- 1 | url: https://files.grouplens.org/datasets/movielens/ml-10m.zip 2 | user_id_field: &u UserID:token # see the note on YAML anchors (&u / *u) in all.yaml 3 | item_id_field: &i MovieID:token 4 | rating_field: &r Rating:float 5 | time_field: &t Timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: ISO-8859-1 9 | inter_feat_name: ratings.dat 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: ~ 12 | 13 | 14 | user_feat_name: ~ 15 | user_feat_field: ~ 16 | user_feat_header: ~ 17 | 18 | 19 | item_feat_name: [movies.dat] 20 | item_feat_field: [[*i, Title:token_seq:" ", Genres:token_seq:"|"]] 21 | item_feat_header: ~ 22 | 23 | 24 | use_fields: ~ 25 | field_separator: "::" 26 | min_user_inter: 5 27 | min_item_inter: 5 28 | field_max_len: ~ 29 | low_rating_thres: 3.0 30 | max_seq_len: 20 31 | 32 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 33 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 34 | mapped_feat_field: [*u, *i] 35 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 36 | network_feat_header: [~, ~] 37 | 38 | 39 | save_cache: False # whether to save processed dataset to cache.
40 | -------------------------------------------------------------------------------- /recstudio/data/config/ml-1m.yaml: -------------------------------------------------------------------------------- 1 | url: https://files.grouplens.org/datasets/movielens/ml-1m.zip 2 | user_id_field: &u UserID:token # see the note on YAML anchors (&u / *u) in all.yaml 3 | item_id_field: &i MovieID:token 4 | rating_field: &r Rating:float 5 | time_field: &t Timestamp:float 6 | time_format: ~ 7 | 8 | 9 | encoding_method: ISO-8859-1 10 | inter_feat_name: ratings.dat 11 | inter_feat_field: [*u, *i, *r, *t] 12 | inter_feat_header: ~ 13 | 14 | 15 | user_feat_name: ~ #[users.dat] 16 | user_feat_field: [[*u, Gender:token, Age:token, Occupation:token, Zip-code:token]] 17 | user_feat_header: ~ 18 | 19 | 20 | item_feat_name: ~ #[movies.dat] 21 | item_feat_field: [[*i, Title:token_seq:" ", Genres:token_seq:"|"]] 22 | item_feat_header: ~ 23 | 24 | 25 | use_fields: ~ 26 | field_separator: "::" 27 | min_user_inter: 5 28 | min_item_inter: 5 29 | field_max_len: ~ 30 | low_rating_thres: 3.0 31 | max_seq_len: 20 32 | 33 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 34 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 35 | mapped_feat_field: [*u, *i] 36 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 37 | network_feat_header: [~, ~] 38 | 39 | 40 | save_cache: False # whether to save processed dataset to cache. 41 | -------------------------------------------------------------------------------- /recstudio/data/config/ml-20m.yaml: -------------------------------------------------------------------------------- 1 | url: https://files.grouplens.org/datasets/movielens/ml-20m.zip 2 | user_id_field: &u userId:token 3 | item_id_field: &i movieId:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: ISO-8859-1 9 | inter_feat_name: ratings.csv 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: 0 12 | 13 | 14 | user_feat_name: ~ 15 | user_feat_field: ~ 16 | user_feat_header: ~ 17 | 18 | 19 | item_feat_name: [movies.csv] 20 | item_feat_field: [[*i, title:token_seq:" ", genres:token_seq:"|"]] 21 | item_feat_header: 0 22 | 23 | 24 | use_fields: ~ 25 | field_separator: "," 26 | min_user_inter: 5 27 | min_item_inter: 5 28 | field_max_len: ~ 29 | low_rating_thres: ~ 30 | max_seq_len: 20 31 | 32 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 33 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 34 | mapped_feat_field: [*u, *i] 35 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 36 | network_feat_header: [~, ~] 37 | 38 | 39 | save_cache: True # whether to save processed dataset to cache.
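The three-part field spec used throughout these configs is `name:type[:separator]`; for instance `Genres:token_seq:"|"` declares a token-sequence column whose cells are split on `|`. Conceptually the parsing looks like this (a sketch, not the actual code in recstudio/data/dataset.py):

# A token_seq cell is split on the declared separator; each token then gets its own integer id.
def parse_token_seq(value, sep):
    return [tok for tok in value.split(sep) if tok]

print(parse_token_seq("Action|Adventure|Comedy", "|"))  # ['Action', 'Adventure', 'Comedy']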
40 | -------------------------------------------------------------------------------- /recstudio/data/config/tmall.yaml: -------------------------------------------------------------------------------- 1 | url: https://rec.ustc.edu.cn/share/62299ea0-e083-11ec-8586-b7917c2cff26 2 | user_id_field: &u use_ID:token 3 | item_id_field: &i ite_ID:token 4 | rating_field: &r act_ID:float 5 | time_field: &t time:float 6 | time_format: ~ 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: ijcai2016_taobao.csv 10 | inter_feat_field: [*u, sel_ID:token, *i, cat_id:token, *r, *t] 11 | inter_feat_header: 0 12 | 13 | 14 | user_feat_name: ~ 15 | user_feat_field: ~ 16 | user_feat_header: ~ 17 | 18 | 19 | item_feat_name: ~ 20 | item_feat_field: ~ 21 | item_feat_header: ~ 22 | 23 | 24 | use_fields: ~ 25 | field_separator: "," 26 | min_user_inter: 5 27 | min_item_inter: 5 28 | field_max_len: ~ 29 | low_rating_thres: ~ 30 | max_seq_len: 50 31 | 32 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 33 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 34 | mapped_feat_field: [*u, *i] 35 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 36 | network_feat_header: [~, ~] 37 | 38 | 39 | save_cache: True # whether to save processed dataset to cache. 40 | -------------------------------------------------------------------------------- /recstudio/data/config/yelp.yaml: -------------------------------------------------------------------------------- 1 | url: https://rec.ustc.edu.cn/share/cdc6de70-2f87-11ed-b5db-4d1a26914a90 2 | user_id_field: &u user_id:token # TODO: comments for &u and *u 3 | item_id_field: &i business_id:token 4 | rating_field: &r stars:float 5 | time_field: &t date:float 6 | time_format: ~ 7 | 8 | 9 | encoding_method: utf-8 10 | inter_feat_name: yelp_inter.csv 11 | inter_feat_field: [review_id:token, *u, *i, *r, *t] 12 | inter_feat_header: 0 13 | 14 | 15 | user_feat_name: ~ #[yelp_user.csv] 16 | user_feat_field: [[*u, user_name:token, yelping_since:float, fans:float, average_stars:float]] 17 | user_feat_header: 0 18 | 19 | 20 | item_feat_name: ~ #[yelp_item.csv] 21 | item_feat_field: [[business_id:token, business_name:token, city:token, state:token, postal_code:token, latitude:float, longitude:float, business_stars:float, 'categories:token_seq:", "']] 22 | item_feat_header: 0 23 | 24 | 25 | use_fields: ~ # TODO: 26 | field_separator: "," 27 | min_user_inter: 5 28 | min_item_inter: 5 29 | field_max_len: ~ 30 | low_rating_thres: 3 31 | max_seq_len: 20 32 | 33 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 34 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 35 | mapped_feat_field: [*u, *i] 36 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 37 | network_feat_header: [~, ~] 38 | 39 | 40 | save_cache: True # whether to save processed dataset to cache. 
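Most of the configs above set `min_user_inter` / `min_item_inter` to 5, i.e. the usual 5-core filter. Because dropping sparse items can push users back under the threshold (and vice versa), such filtering is typically applied repeatedly until stable. A pandas sketch of that idea (the actual implementation lives in recstudio/data/dataset.py and may differ):

import pandas as pd

def k_core(df: pd.DataFrame, user_col: str, item_col: str, k: int = 5) -> pd.DataFrame:
    # Alternately drop users and items with fewer than k interactions until nothing changes.
    while True:
        before = len(df)
        df = df[df.groupby(user_col)[user_col].transform("size") >= k]
        df = df[df.groupby(item_col)[item_col].transform("size") >= k]
        if len(df) == before:
            return df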
41 | -------------------------------------------------------------------------------- /recstudio/dataset_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/dataset_demo/__init__.py -------------------------------------------------------------------------------- /recstudio/dataset_demo/ml-100k/social.txt: -------------------------------------------------------------------------------- 1 | source_id target_id 2 | 391 184 3 | 343 738 4 | 872 542 5 | 247 55 6 | 606 461 7 | 574 195 8 | 464 810 9 | 187 929 10 | 600 545 11 | 583 662 12 | 263 533 13 | 432 65 14 | 561 137 15 | 363 496 16 | 836 899 17 | 192 907 18 | 664 627 19 | 213 312 20 | 523 110 21 | 722 883 22 | 243 456 23 | 844 374 24 | 353 424 25 | 225 616 26 | 327 445 27 | 787 685 28 | 399 132 29 | 522 415 30 | 87 284 31 | 196 417 32 | 911 870 33 | 800 85 34 | 598 868 35 | 290 142 36 | 632 379 37 | 292 725 38 | 230 358 39 | 331 874 40 | 304 129 41 | 75 162 42 | 647 360 43 | 771 396 44 | 720 306 45 | 833 839 46 | 820 779 47 | 201 125 48 | 95 301 49 | 369 726 50 | 499 18 51 | 249 491 52 | 88 684 53 | 910 384 54 | 239 248 55 | 936 894 56 | 897 933 57 | 672 37 58 | 532 724 59 | 915 229 60 | 478 841 61 | 510 761 62 | 557 190 63 | 717 255 64 | 311 211 65 | 516 482 66 | 330 398 67 | 711 875 68 | 643 530 69 | 473 512 70 | 834 355 71 | 115 209 72 | 146 70 73 | 394 509 74 | 751 666 75 | 615 75 76 | 570 467 77 | 566 492 78 | 831 281 79 | 116 753 80 | 593 677 81 | 935 522 82 | 215 587 83 | 518 213 84 | 479 105 85 | 316 635 86 | 423 243 87 | 726 6 88 | 120 152 89 | 375 569 90 | 602 852 91 | 100 83 92 | 812 584 93 | 160 778 94 | 366 60 95 | 608 437 96 | 716 165 97 | 714 815 98 | 367 661 99 | 492 830 100 | 18 723 101 | 401 1 102 | 1000 13 103 | -------------------------------------------------------------------------------- /recstudio/model/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import ae, fm, kg, mf, seq, basemodel -------------------------------------------------------------------------------- /recstudio/model/ae/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | shuffle: True 3 | -------------------------------------------------------------------------------- /recstudio/model/ae/config/multidae.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 200 3 | 4 | model: 5 | embed_dim: 200 6 | dropout: 0.5 7 | encoder_dims: [64,32] 8 | decoder_dims: [32,64] 9 | activation: relu 10 | 11 | train: 12 | batch_size: 256 13 | epochs: 200 14 | learner: adam 15 | learning_rate: 0.01 16 | weight_decay: 0.00001 17 | -------------------------------------------------------------------------------- /recstudio/model/ae/config/multivae.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 200 3 | 4 | model: 5 | embed_dim: 600 6 | dropout_rate: 0.5 7 | encoder_dims: [200] 8 | decoder_dims: [200] 9 | activation: tanh 10 | 11 | train: 12 | anneal_max: 0.2 13 | anneal_total_step: 2000000 14 | batch_size: 500 15 | epochs: 500 16 | learner: adam 17 | learning_rate: 0.001 18 | weight_decay: 1e-5 19 | early_stop_patience: 100 20 | -------------------------------------------------------------------------------- /recstudio/model/ae/multidae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data.dataset import UserDataset 3 | from recstudio.model.basemodel import BaseRetriever, Recommender 4 | from recstudio.model.loss_func import SoftmaxLoss 5 | from recstudio.model.module import MLPModule 6 | from recstudio.model.scorer import InnerProductScorer 7 | 8 | 9 | class MultiDAEQueryEncoder(torch.nn.Module): 10 | def __init__(self, fiid, num_items, embed_dim, dropout_rate, 11 | encoder_dims, decoder_dims, activation='relu'): 12 | super().__init__() 13 | assert encoder_dims[-1] == decoder_dims[0], 'expecting the output size of '\ 14 | 'encoder is equal to the input size of decoder.' 15 | assert encoder_dims[0] == decoder_dims[-1], 'expecting the output size of '\ 16 | 'decoder is equal to the input size of encoder.'
17 | 18 | self.fiid = fiid 19 | self.item_embedding = torch.nn.Embedding(num_items, embed_dim, 0) 20 | self.dropout = torch.nn.Dropout(p=dropout_rate) 21 | 22 | self.encoder_decoder = torch.nn.Sequential( 23 | MLPModule([embed_dim]+encoder_dims+decoder_dims[1:], activation), 24 | torch.nn.Linear(decoder_dims[-1], embed_dim) 25 | ) 26 | 27 | def forward(self, batch): 28 | # encode: sum the embeddings of the user's interacted items, normalized by sqrt of the history length 29 | seq_emb = self.item_embedding(batch["in_"+self.fiid]) 30 | non_zero_num = batch["in_"+self.fiid].count_nonzero(dim=1).unsqueeze(-1) 31 | seq_emb = seq_emb.sum(1) / non_zero_num.pow(0.5) 32 | h = self.dropout(seq_emb) 33 | 34 | return self.encoder_decoder(h) 35 | 36 | 37 | class MultiDAE(BaseRetriever): 38 | 39 | # def add_model_specific_args(parent_parser): 40 | # parent_parser = Recommender.add_model_specific_args(parent_parser) 41 | # parent_parser.add_argument_group('MultiDAE') 42 | # parent_parser.add_argument("--dropout", type=int, default=0.5, help='dropout rate for MLP layers') 43 | # parent_parser.add_argument("--encoder_dims", type=int, nargs='+', default=64, help='MLP layer size for encoder') 44 | # parent_parser.add_argument("--decoder_dims", type=int, nargs='+', default=64, help='MLP layer size for decoder') 45 | # parent_parser.add_argument("--activation", type=str, default='relu', help='activation function for MLP layers') 46 | # return parent_parser 47 | 48 | def _get_dataset_class(): 49 | return UserDataset 50 | 51 | def _get_item_encoder(self, train_data): 52 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, 0) 53 | 54 | def _get_query_encoder(self, train_data): 55 | model_config = self.config['model'] 56 | return MultiDAEQueryEncoder(train_data.fiid, train_data.num_items, 57 | self.embed_dim, model_config['dropout'], model_config['encoder_dims'], 58 | model_config['decoder_dims'], model_config['activation']) 59 | 60 | def _get_score_func(self): 61 | return InnerProductScorer() 62 | 63 | def _get_sampler(self, train_data): 64 | return None 65 | 66 | def _get_loss_func(self): 67 | return SoftmaxLoss() 68 | -------------------------------------------------------------------------------- /recstudio/model/basemodel/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.basemodel.recommender import Recommender 2 | from recstudio.model.basemodel.baseranker import BaseRanker 3 | from recstudio.model.basemodel.baseretriever import BaseRetriever 4 | # from recstudio.model.basemodel.sequential_retriever import SequentialRetriever -------------------------------------------------------------------------------- /recstudio/model/basemodel/basemodel.yaml: -------------------------------------------------------------------------------- 1 | # This is a configuration file for all models, which can be regarded as an example 2 | # configuration file. 3 | 4 | # All the configuration parameters are divided into four groups: data, model, train and eval. 5 | # - data: data group contains some parameters related to dataset construction. For example, 6 | # `fm_eval` controls whether a sample is one interaction or all interactions for one user 7 | # in evaluation. 8 | # - model: model group contains some parameters related to the model size (or model architecture). 9 | # - train: train group contains parameters for the training procedure, such as epochs, learning rate. 10 | # - eval: eval group contains parameters for the evaluation procedure (validation and test), such as 11 | # batch size.
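# A model's own config file only needs to override the keys it cares about. For
# example, recstudio/model/fm/config/deepfm.yaml (further down in this tree) only
# defines a `model:` group with embed_dim, mlp_layer, activation and dropout;
# every key it leaves out falls back to the defaults declared below.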
12 | 13 | 14 | data: # params related to dataset 15 | binarized_rating_thres: ~ # rating threshold for binarizing ratings to 0/1 (~ means no binarization) 16 | fm_eval: False # whether to set fm_eval to organize the batch data as one interaction per sample. 17 | 18 | # the sampler for dataset, only uniform sampler is supported now. 19 | neg_count: 0 20 | sampler: ~ # [uniform] 21 | shuffle: True 22 | split_mode: user_entry # [user, entry, user_entry] 23 | split_ratio: [0.8,0.1,0.1] # list or int type, list type for split by ratio, int type for leave one out 24 | 25 | 26 | model: 27 | embed_dim: 64 # embedding dimension for embedding layers, usually for item and user embeddings 28 | item_bias: False # whether to add item bias 29 | 30 | 31 | train: 32 | accelerator: gpu # [cpu, gpu, dp] 33 | 34 | # ann: {index: 'IVFx,Flat', parameter: ~} ## 1 HNSWx,Flat; 2 Flat; 3 IVFx,Flat ## {nprobe: 1} {efSearch: 1} 35 | ann: ~ 36 | batch_size: 512 37 | 38 | early_stop_mode: max 39 | early_stop_patience: 10 40 | 41 | epochs: 1000 42 | gpu: 1 43 | grad_clip_norm: ~ 44 | init_method: xavier_normal # [xavier_normal, normal] 45 | item_batch_size: 1024 # batch size for items to get all item features or get full item scores. 46 | learner: adam 47 | learning_rate: 0.001 48 | num_threads: 10 49 | 50 | # negative sampler configuration in training procedure 51 | # `method` describes the retrieving method used to retrieve negative items with a retriever. 52 | sampling_method: none # [none, sir, dns, toprand, top&rand, brute] 53 | 54 | # `sampler` describes the negative sampler used to train models. 55 | sampler: uniform # [uniform, pop, midx-uni, midx-pop, cluster-uni, cluster-pop] 56 | 57 | negative_count: 0 # number of negative items to be sampled 58 | excluding_hist: False # whether to exclude user history in negative sampling 59 | 60 | # learning rate scheduler, refer to https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate 61 | scheduler: ~ # [onplateau, exponential] 62 | 63 | seed: 2022 # random seed, usually 42 is a magic number 64 | weight_decay: 0.0 # weight decay for the optimizer 65 | tensorboard_path: ~ 66 | 67 | 68 | eval: 69 | batch_size: 128 70 | cutoff: [5, 10, 20] 71 | val_metrics: [ndcg, recall] 72 | val_n_epoch: 1 73 | test_metrics: [ndcg, recall, precision, map, mrr, hit] 74 | topk: 100 75 | save_path: './saved/' 76 | -------------------------------------------------------------------------------- /recstudio/model/debias/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/model/debias/__init__.py -------------------------------------------------------------------------------- /recstudio/model/fm/__init__.py: -------------------------------------------------------------------------------- 1 | from .lr import LR 2 | from .fm import FM 3 | from .dcn import DCN 4 | from .nfm import NFM 5 | from .deepfm import DeepFM 6 | -------------------------------------------------------------------------------- /recstudio/model/fm/afm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from ..
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | r""" 9 | AFM 10 | ###################### 11 | 12 | Paper Reference: 13 | Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks (IJCAI'17) 14 | https://dl.acm.org/doi/10.5555/3172077.3172324 15 | """ 16 | 17 | class AFM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | num_fields = len(self.fields) - 1 25 | self.linear = ctr.LinearLayer(self.fields, train_data) 26 | self.afm = nn.Sequential( 27 | OrderedDict([ 28 | ("embeddings", 29 | ctr.Embeddings( 30 | self.fields, 31 | self.embed_dim, 32 | train_data)), 33 | ("afm_layer", 34 | ctr.AFMLayer( 35 | self.embed_dim, 36 | self.config['model']['attention_dim'], 37 | num_fields, 38 | self.config['model']['dropout'])) 39 | ])) 40 | 41 | def score(self, batch): 42 | lr_score = self.linear(batch) 43 | afm_score = self.afm(batch) 44 | return {'score' : lr_score + afm_score} 45 | 46 | def _get_loss_func(self): 47 | return loss_func.BCEWithLogitLoss() 48 | -------------------------------------------------------------------------------- /recstudio/model/fm/afn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | from recstudio.data.dataset import TripletDataset 5 | from .. import loss_func 6 | from ..basemodel import BaseRanker 7 | from ..module import ctr, MLPModule 8 | 9 | r""" 10 | AFN 11 | ###################### 12 | 13 | Paper Reference: 14 | Adaptive Factorization Network: Learning Adaptive-Order Feature Interactions (AAAI'20) 15 | https://arxiv.org/abs/1909.03276 16 | """ 17 | 18 | class AFN(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | model_config = self.config['model'] 26 | num_fields = len(self.fields) - 1 27 | self.afn = nn.Sequential( 28 | OrderedDict([ 29 | ("embeddings", 30 | ctr.Embeddings( 31 | self.fields, 32 | self.embed_dim, 33 | train_data)), 34 | ("logtransform_layer", 35 | ctr.LogTransformLayer( 36 | num_fields, 37 | model_config['log_hidden_size'])), 38 | ("mlp", 39 | MLPModule( 40 | [model_config['log_hidden_size'] * self.embed_dim] + model_config['mlp_layer'] + [1], 41 | model_config['activation'], 42 | model_config['dropout'], 43 | last_activation=False, 44 | last_bn=False)) 45 | ])) 46 | if model_config['ensemble']: 47 | self.ensemble_embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 48 | self.ensemble_mlp = MLPModule( 49 | [num_fields * self.embed_dim] + model_config['ensemble_mlp_layer'] + [1], 50 | model_config['ensemble_activation'], 51 | model_config['ensemble_dropout'], 52 | last_activation=False, 53 | last_bn=False 54 | ) 55 | self.ensemble_fc = nn.Linear(2, 1) 56 | 57 | 58 | def score(self, batch): 59 | afn_score = self.afn(batch) 60 | if self.config['model']['ensemble']: 61 | ensemble_emb = self.ensemble_embedding(batch) 62 | ensemble_mlp_score = self.ensemble_mlp(ensemble_emb.flatten(1)) 63 | score = self.ensemble_fc( 64 | torch.cat([afn_score, ensemble_mlp_score], dim=-1) 65 | ) 66 | else: 67 | score = afn_score 68 | score = score.squeeze(-1) 69 | return {'score' : score} 70 | 71 | def _get_loss_func(self): 72 | return 
loss_func.BCEWithLogitLoss() 73 | -------------------------------------------------------------------------------- /recstudio/model/fm/aoanet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | from recstudio.data.dataset import TripletDataset 5 | from .. import loss_func 6 | from ..basemodel import BaseRanker 7 | from ..module import ctr, MLPModule 8 | 9 | r""" 10 | AOANet 11 | ###################### 12 | 13 | Paper Reference: 14 | Architecture and Operation Adaptive Network for Online Recommendations (KDD'21) 15 | https://dl.acm.org/doi/10.1145/3447548.3467133 16 | """ 17 | 18 | class AOANet(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | model_config = self.config['model'] 26 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 27 | self.mlp = MLPModule( 28 | [self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'], 29 | model_config['activation'], 30 | model_config['dropout'], 31 | last_activation=False, 32 | last_bn=False) 33 | self.gin = ctr.GeneralizedInteractionNet( 34 | self.embedding.num_features, 35 | self.embed_dim, 36 | model_config['num_interaction_layers'], 37 | model_config['num_subspaces']) 38 | self.fc = nn.Linear(model_config['mlp_layer'][-1] + model_config['num_subspaces'] * self.embed_dim, 1) 39 | 40 | 41 | def score(self, batch): 42 | emb = self.embedding(batch) 43 | mlp_out = self.mlp(emb.flatten(1)) 44 | gin_out = self.gin(emb).flatten(1) 45 | score = self.fc(torch.cat([mlp_out, gin_out], dim=-1)).squeeze(-1) 46 | return {'score' : score} 47 | 48 | def _get_loss_func(self): 49 | return loss_func.BCEWithLogitLoss() 50 | -------------------------------------------------------------------------------- /recstudio/model/fm/autoint.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from recstudio.data.dataset import TripletDataset 3 | from .. 
import loss_func 4 | from ..basemodel import BaseRanker 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | AutoInt 9 | ###################### 10 | 11 | Paper Reference: 12 | AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks (CIKM'19) 13 | https://dl.acm.org/doi/abs/10.1145/3357384.3357925 14 | """ 15 | 16 | class AutoInt(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 24 | model_config = self.config['model'] 25 | if model_config['wide']: 26 | self.linear = ctr.LinearLayer(self.fields, train_data) 27 | if model_config['deep']: 28 | self.mlp = MLPModule([self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'] + [1], 29 | model_config['activation'], 30 | model_config['dropout'], 31 | last_activation=False, 32 | last_bn=False 33 | ) 34 | self.int = nn.Sequential(*[ 35 | ctr.SelfAttentionInteractingLayer( 36 | self.embed_dim if i == 0 else model_config['attention_dim'], 37 | n_head=model_config['n_head'], 38 | dropout=model_config['dropout'], 39 | residual=model_config['residual'], 40 | residual_project=model_config['residual_project'], 41 | layer_norm=model_config['layer_norm'] 42 | ) 43 | for i in range(model_config['num_attention_layers'])]) 44 | self.fc = nn.Linear(self.embedding.num_features * self.embed_dim, 1) 45 | 46 | def score(self, batch): 47 | emb = self.embedding(batch) 48 | attn_out = self.int(emb) 49 | int_score = self.fc(attn_out.flatten(1)).squeeze(-1) 50 | score = int_score 51 | if self.config['model']['wide']: 52 | lr_score = self.linear(batch) 53 | score += lr_score 54 | if self.config['model']['deep']: 55 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 56 | score += mlp_score 57 | return {'score' : score} 58 | 59 | def _get_loss_func(self): 60 | return loss_func.BCEWithLogitLoss() 61 | -------------------------------------------------------------------------------- /recstudio/model/fm/ccpm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | CCPM 10 | ###################### 11 | 12 | Paper Reference: 13 | A Convolutional Click Prediction Model (CIKM'15) 14 | https://dl.acm.org/doi/10.1145/2806416.2806603 15 | """ 16 | 17 | class CCPM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.linear = ctr.LinearLayer(self.fields, train_data) 25 | model_config = self.config['model'] 26 | num_fields = len(self.fields) - 1 27 | self.conv = nn.Sequential( 28 | OrderedDict([ 29 | ("embeddings", 30 | ctr.Embeddings( 31 | self.fields, 32 | self.embed_dim, 33 | train_data)), 34 | ("conv_layer", 35 | ctr.ConvLayer( 36 | num_fields, 37 | channels=model_config['channels'], 38 | heights=model_config['heights'])) 39 | ])) 40 | self.mlp = MLPModule( 41 | [3 * self.embed_dim * model_config['channels'][-1]] + model_config['mlp_layer'] + [1], 42 | model_config['activation'], 43 | model_config['dropout'], 44 | last_activation=False, 45 | last_bn=False) 46 | 47 | def score(self, batch): 48 | conv_out = self.conv(batch) 49 | score = self.mlp(conv_out.flatten(1)).squeeze(-1) 50 | return {'score' : score} 51 | 52 | def _get_loss_func(self): 53 | return loss_func.BCEWithLogitLoss() 54 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/afm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | attention_dim: 4 3 | dropout: 0.5 -------------------------------------------------------------------------------- /recstudio/model/fm/config/afn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | log_hidden_size: 128 3 | mlp_layer: [128, 128] 4 | activation: relu 5 | dropout: 0.5 6 | 7 | ensemble: True 8 | ensemble_mlp_layer: [256, 64] 9 | ensemble_activation: relu 10 | ensemble_dropout: 0.5 11 | 12 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | fmeval: True 3 | low_rating_thres: 0.0 4 | binarized_rating_thres: 3.0 5 | 6 | eval: 7 | val_metrics: [auc, logloss] 8 | test_metrics: [auc, logloss] 9 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/aoanet.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | mlp_layer: [64, 64] 3 | activation: relu 4 | dropout: 0.2 5 | num_subspaces: 3 6 | num_interaction_layers: 3 -------------------------------------------------------------------------------- /recstudio/model/fm/config/autoint.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | wide: True 3 | 4 | deep: True 5 | mlp_layer: [128, 64] 6 | activation: relu 7 | 8 | dropout: 0.5 9 | attention_dim: 64 10 | num_attention_layers: 3 11 | n_head: 2 12 | 13 | residual: True 14 | residual_project: True 15 | layer_norm: False 16 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/ccpm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | channels: [3, 3] 3 | heights: [6, 5] 4 | 5 | mlp_layer: [256] 6 | activation: relu 7 | dropout: 0.5 
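All of the models under model/fm build on factorization-machine-style pairwise interactions over the field embeddings produced by ctr.Embeddings. The classic FM trick, computing all pairwise inner products in O(n*d) via the square-of-sum identity, looks like this in isolation (a standalone sketch, not the repo's ctr module):

import torch

def fm_interaction(emb: torch.Tensor) -> torch.Tensor:
    # emb: (batch, num_fields, embed_dim) field embeddings.
    # sum_{i<j} <v_i, v_j> = 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), reduced over embed_dim.
    square_of_sum = emb.sum(dim=1).pow(2)
    sum_of_square = emb.pow(2).sum(dim=1)
    return 0.5 * (square_of_sum - sum_of_square).sum(dim=-1)

print(fm_interaction(torch.randn(2, 5, 8)).shape)  # torch.Size([2])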
-------------------------------------------------------------------------------- /recstudio/model/fm/config/dcn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer: [256, 256, 256] 4 | activation: relu 5 | num_layers: 6 6 | dropout: 0.5 7 | batch_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/dcnv2.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | combination: parallel 3 | low_rank: ~ 4 | num_experts: 4 5 | num_layers: 3 6 | embed_dim: 10 7 | mlp_layer: [256,256,256] 8 | activation: relu 9 | cross_activation: tanh 10 | dropout: 0.5 11 | batch_norm: True 12 | 13 | scheduler: onplateau -------------------------------------------------------------------------------- /recstudio/model/fm/config/deepcrossing.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | hidden_dims: [64, 64, 64] 3 | activation: relu 4 | dropout: 0.5 5 | batch_norm: False 6 | layer_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/deepfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer: [256, 256, 256] 4 | activation: tanh 5 | dropout: 0.3 6 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/deepim.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | order: 5 3 | deep: True 4 | mlp_layer: [128, 128] 5 | activation: relu 6 | dropout: 0.5 -------------------------------------------------------------------------------- /recstudio/model/fm/config/destine.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | wide: True 3 | 4 | deep: True 5 | mlp_layer: [128, 64] 6 | activation: relu 7 | 8 | dropout: 0.5 9 | attention_dim: 64 10 | num_attention_layers: 3 11 | n_head: 2 12 | 13 | res_mode: each_layer 14 | scale: False 15 | relu_before_att: False 16 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/difm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | mlp_layer: [256, 256, 256] 3 | activation: relu 4 | dropout: 0.5 5 | batch_norm: False 6 | layer_norm: False 7 | n_head: 2 8 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/dlrm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | op: sum 3 | top_mlp_layer: [128, 128] 4 | top_activation: relu 5 | top_dropout: 0.5 6 | bottom_mlp_layer: [128, 128] 7 | bottom_activation: relu 8 | bottom_dropout: 0.5 9 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/edcn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | bridge_type: hadamard_product 3 | temperature: 1.0 4 | embed_dim: 10 5 | activation: relu 6 | num_layers: 5 7 | dropout: 0.5 8 | batch_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/ffm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 
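dcn.yaml above sets `num_layers: 6` for the cross network that dcn.py (later in this directory) builds via `ctr.CrossNetwork`. For reference, one cross layer in the DCN paper computes x_{l+1} = x_0 * (x_l^T w_l) + b_l + x_l; the sketch below illustrates that published recurrence only, not RecStudio's `ctr.CrossNetwork` implementation:

import torch
import torch.nn as nn

class CrossLayerSketch(nn.Module):
    # One DCN cross layer: x_{l+1} = x0 * (x_l . w) + b + x_l.
    def __init__(self, dim):
        super().__init__()
        self.w = nn.Parameter(torch.randn(dim) / dim ** 0.5)
        self.b = nn.Parameter(torch.zeros(dim))

    def forward(self, x0, xl):
        # (batch, dim) @ (dim,) -> (batch,), then broadcast over the feature dim.
        return x0 * (xl @ self.w).unsqueeze(-1) + self.b + xl

x0 = torch.randn(4, 16)
xl = x0
for layer in [CrossLayerSketch(16) for _ in range(6)]:   # num_layers: 6, as in dcn.yaml
    xl = layer(x0, xl)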
-------------------------------------------------------------------------------- /recstudio/model/fm/config/fgcnn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | channels: [3, 4, 5] 3 | heights: [16, 16, 16] 4 | recombine_channels: [5, 6, 7] 5 | pooling_sizes: [3, 1, 1] 6 | 7 | mlp_layer: [256] 8 | activation: relu 9 | dropout: 0.5 10 | batch_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/fibinet.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | reduction_ratio: 3 3 | excitation_activation: relu 4 | bilinear_type: interaction 5 | mlp_layer: [128, 32] 6 | activation: relu 7 | dropout: 0.5 8 | shared_bilinear: True 9 | learning_rate: 1e-4 -------------------------------------------------------------------------------- /recstudio/model/fm/config/fignn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | deep: True 3 | mlp_layer: [256, 256, 256] 4 | activation: relu 5 | dropout: 0.5 6 | 7 | layer_norm: True 8 | num_gnn_layers: 3 9 | n_head: 2 -------------------------------------------------------------------------------- /recstudio/model/fm/config/finalmlp.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer1: [256, 256, 256] 4 | mlp_layer2: [256, 256, 256] 5 | activation1: relu 6 | activation2: relu 7 | dropout1: 0.3 8 | dropout2: 0.3 9 | batch_norm1: False 10 | batch_norm2: False 11 | feature_selection: True 12 | fs_mlp_layer: [32] 13 | n_head: 2 14 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/flen.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer: [256, 256, 256] 4 | activation: relu 5 | dropout: 0.3 6 | fields: ~ # [[...], [...], ...] 
7 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/fm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/fmfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 -------------------------------------------------------------------------------- /recstudio/model/fm/config/fwfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | linear_type: filv -------------------------------------------------------------------------------- /recstudio/model/fm/config/hfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | op: circular_correlation 3 | deep: True 4 | embed_dim: 10 5 | mlp_layer: [256, 256, 256] 6 | activation: relu 7 | dropout: 0.3 -------------------------------------------------------------------------------- /recstudio/model/fm/config/ifm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | mlp_layer: [256, 256, 256] 3 | activation: relu 4 | dropout: 0.5 5 | batch_norm: False -------------------------------------------------------------------------------- /recstudio/model/fm/config/interhat.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | order: 3 3 | feedforward_dim: 32 4 | aggregation_dim: 32 5 | mlp_layer: [256, 256, 256] 6 | activation: relu 7 | dropout: 0.5 8 | n_head: 2 -------------------------------------------------------------------------------- /recstudio/model/fm/config/lorentzfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 -------------------------------------------------------------------------------- /recstudio/model/fm/config/lr.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 1 -------------------------------------------------------------------------------- /recstudio/model/fm/config/masknet.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | parallel: False 3 | num_blocks: 3 4 | block_dim: 50 5 | reduction_ratio: 1 6 | hidden_layer_norm: False 7 | mlp_layer: [512, 128] 8 | activation: relu 9 | dropout: 0.5 10 | learning_rate: 1e-4 -------------------------------------------------------------------------------- /recstudio/model/fm/config/nfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer: [128, 128, 128] 4 | dropout: 0.3 5 | batch_norm: True 6 | activation: sigmoid -------------------------------------------------------------------------------- /recstudio/model/fm/config/onn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | mlp_layer: [128, 64] 3 | activation: relu 4 | dropout: 0.2 5 | batch_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/pnn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | product_type: inner 3 | mlp_layer: [128, 64] 4 | activation: relu 5 | dropout: 0.5 6 | batch_norm: False 7 | stack_dim: 2 8 | 
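fm.yaml above needs only `embed_dim` because the FM scoring function has no other hyperparameters; fm.py (later in this directory) pairs `ctr.LinearLayer` with `ctr.FMLayer(reduction='sum')`, whose second-order term is the classic identity sum_{i<j} <v_i, v_j> = 1/2 * ((sum_i v_i)^2 - sum_i v_i^2). A self-contained sketch of that reduction (illustrating the formula, not the `ctr.FMLayer` code, which lives in recstudio/model/module/ctr.py and is not shown in this dump):

import torch

def fm_pairwise(emb):
    # Second-order FM term for emb of shape (batch, num_fields, embed_dim),
    # reduced to one scalar per example, as with reduction='sum'.
    square_of_sum = emb.sum(dim=1).pow(2)    # (batch, embed_dim)
    sum_of_square = emb.pow(2).sum(dim=1)    # (batch, embed_dim)
    return 0.5 * (square_of_sum - sum_of_square).sum(-1)

emb = torch.randn(8, 5, 10)    # embed_dim: 10, as in fm.yaml
print(fm_pairwise(emb).shape)  # torch.Size([8])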
-------------------------------------------------------------------------------- /recstudio/model/fm/config/ppnet.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | id_fields: ~ 3 | mlp_layer: [256, 256, 256] 4 | activation: relu 5 | dropout: 0.5 6 | batch_norm: False 7 | id_embed_dim: 8 8 | pp_hidden_dims: [128, 128, 128] 9 | gate_hidden_dims: [64, 64, 64] -------------------------------------------------------------------------------- /recstudio/model/fm/config/sam.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | dropout: 0 3 | interaction_type: sam2e -------------------------------------------------------------------------------- /recstudio/model/fm/config/widedeep.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | batch_norm: True 3 | embed_dim: 10 4 | mlp_layer: [256, 256, 256] 5 | activation: relu 6 | dropout: 0.3 7 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/xdeepfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | cin_layer_size: [100, 100, 100] 4 | mlp_layer: [128, 128, 128] 5 | activation: relu 6 | dropout: 0.2 7 | direct: False 8 | -------------------------------------------------------------------------------- /recstudio/model/fm/dcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data.dataset import TripletDataset 3 | from ..basemodel import BaseRanker 4 | from ..loss_func import BCEWithLogitLoss 5 | from ..module import ctr, MLPModule 6 | 7 | 8 | class DCN(BaseRanker): 9 | 10 | def _get_dataset_class(): 11 | return TripletDataset 12 | 13 | def _init_model(self, train_data): 14 | super()._init_model(train_data) 15 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 16 | num_features = self.embedding.num_features 17 | model_config = self.config['model'] 18 | mlp_layer = model_config['mlp_layer'] 19 | self.cross_net = ctr.CrossNetwork(num_features * self.embed_dim, model_config['num_layers']) 20 | self.mlp = MLPModule( 21 | [num_features * self.embed_dim] + mlp_layer, 22 | model_config['activation'], 23 | model_config['dropout'], 24 | batch_norm=model_config['batch_norm']) 25 | self.fc = torch.nn.Linear(num_features*self.embed_dim + mlp_layer[-1], 1) 26 | 27 | def score(self, batch): 28 | emb = self.embedding(batch).flatten(1) 29 | cross_out = self.cross_net(emb) 30 | deep_out = self.mlp(emb) 31 | score = self.fc(torch.cat([deep_out, cross_out], -1)).squeeze(-1) 32 | return {'score' : score} 33 | 34 | def _get_loss_func(self): 35 | return BCEWithLogitLoss() 36 | -------------------------------------------------------------------------------- /recstudio/model/fm/dcnv2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from ..basemodel import BaseRanker 5 | from ..loss_func import BCEWithLogitLoss 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | DCNv2 10 | ###################### 11 | 12 | Paper Reference: 13 | DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems (WWW'21) 14 | https://dl.acm.org/doi/10.1145/3442381.3450078 15 | """ 16 | 17 | class DCNv2(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return 
TripletDataset 21 | 22 | def _init_model(self, train_data): 23 | super()._init_model(train_data) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | num_fields = self.embedding.num_features 26 | model_config = self.config['model'] 27 | if model_config['low_rank'] is None: 28 | self.cross_net = ctr.CrossNetworkV2(num_fields * self.embed_dim, model_config['num_layers']) 29 | else: 30 | self.cross_net = ctr.CrossNetworkMix(num_fields * self.embed_dim, model_config['num_layers'], 31 | model_config['low_rank'], model_config['num_experts'], 32 | model_config['cross_activation']) 33 | 34 | if model_config['combination'].lower() == 'parallel': 35 | self.mlp = MLPModule( 36 | [num_fields * self.embed_dim] + model_config['mlp_layer'], 37 | model_config['activation'], 38 | model_config['dropout'], 39 | batch_norm=model_config['batch_norm']) 40 | self.fc = nn.Linear(num_fields*self.embed_dim + model_config['mlp_layer'][-1], 1) 41 | elif model_config['combination'].lower() == 'stacked': 42 | self.mlp = MLPModule( 43 | [num_fields * self.embed_dim] + model_config['mlp_layer'] + [1], 44 | model_config['activation'], 45 | model_config['dropout'], 46 | batch_norm=model_config['batch_norm'], 47 | last_activation=False, 48 | last_bn=False) 49 | else: 50 | raise ValueError(f'Expect combination to be `parallel`|`stacked`, but got {model_config["combination"]}.') 51 | 52 | def score(self, batch): 53 | emb = self.embedding(batch).flatten(1) 54 | cross_out = self.cross_net(emb) 55 | if self.config['model']['combination'].lower() == 'parallel': 56 | deep_out = self.mlp(emb) 57 | score = self.fc(torch.cat([deep_out, cross_out], -1)).squeeze(-1) 58 | else: 59 | deep_out = self.mlp(cross_out) 60 | score = deep_out.squeeze(-1) 61 | return {'score' : score} 62 | 63 | def _get_loss_func(self): 64 | return BCEWithLogitLoss() 65 | -------------------------------------------------------------------------------- /recstudio/model/fm/deepcrossing.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from ..basemodel import BaseRanker 5 | from ..loss_func import BCEWithLogitLoss 6 | from ..module import ctr, MLPModule, ResidualLayer 7 | 8 | r""" 9 | DeepCrossing 10 | ###################### 11 | 12 | Paper Reference: 13 | Deep Crossing: Web-Scale Modeling without Manually Crafted Combinatorial Features (KDD'16) 14 | https://dl.acm.org/doi/10.1145/2939672.2939704 15 | """ 16 | 17 | class DeepCrossing(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | model_config = self.config['model'] 25 | num_fields = len(self.fields) - 1 26 | self.dc = nn.Sequential(OrderedDict([ 27 | ("embedding", 28 | ctr.Embeddings(self.fields, self.embed_dim, train_data)), 29 | ("residuals", 30 | nn.Sequential(*[ 31 | ResidualLayer( 32 | MLPModule( 33 | [num_fields*self.embed_dim, hidden_dim, num_fields*self.embed_dim], 34 | model_config['activation'], 35 | last_activation=False, last_bn=False 36 | ), 37 | num_fields, 38 | self.embed_dim, 39 | model_config['activation'], 40 | model_config['dropout'], 41 | model_config['batch_norm'], 42 | model_config['layer_norm'] 43 | ) 44 | for hidden_dim in model_config['hidden_dims']])) 45 | ])) 46 | self.fc = nn.Linear(num_fields * self.embed_dim, 1) 47 | def score(self, batch): 48 
| dc_out = self.dc(batch) 49 | score = self.fc(dc_out.flatten(1)).squeeze(-1) 50 | return {'score' : score} 51 | 52 | def _get_loss_func(self): 53 | return BCEWithLogitLoss() 54 | -------------------------------------------------------------------------------- /recstudio/model/fm/deepfm.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | from ..basemodel import BaseRanker 3 | from ..loss_func import BCEWithLogitLoss 4 | from ..module import ctr, MLPModule 5 | 6 | 7 | class DeepFM(BaseRanker): 8 | 9 | def _get_dataset_class(): 10 | return TripletDataset 11 | 12 | def _init_model(self, train_data, drop_unused_field=True): 13 | super()._init_model(train_data, drop_unused_field) 14 | self.linear = ctr.LinearLayer(self.fields, train_data) 15 | self.fm = ctr.FMLayer(reduction='sum') 16 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 17 | model_config = self.config['model'] 18 | self.mlp = MLPModule([self.embedding.num_features*self.embed_dim]+model_config['mlp_layer']+[1], 19 | model_config['activation'], model_config['dropout'], 20 | last_activation=False, last_bn=False) 21 | 22 | def score(self, batch): 23 | lr_score = self.linear(batch) 24 | emb = self.embedding(batch) 25 | fm_score = self.fm(emb) 26 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 27 | return {'score' : lr_score + fm_score + mlp_score} 28 | 29 | def _get_loss_func(self): 30 | return BCEWithLogitLoss() 31 | -------------------------------------------------------------------------------- /recstudio/model/fm/deepim.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | from ..basemodel import BaseRanker 3 | from ..loss_func import BCEWithLogitLoss 4 | from ..module import ctr, MLPModule 5 | 6 | r""" 7 | DeepIM 8 | ###################### 9 | 10 | Paper Reference: 11 | Deep Interaction Machine: A Simple but Effective Model for High-order Feature Interactions (CIKM'20) 12 | https://doi.org/10.1145/3340531.3412077 13 | """ 14 | 15 | class DeepIM(BaseRanker): 16 | 17 | def _get_dataset_class(): 18 | return TripletDataset 19 | 20 | def _init_model(self, train_data, drop_unused_field=True): 21 | super()._init_model(train_data, drop_unused_field) 22 | model_config = self.config['model'] 23 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 24 | self.im = ctr.InteractionMachine(self.embed_dim, model_config['order']) 25 | if model_config['deep']: 26 | self.mlp = MLPModule( 27 | [self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'] + [1], 28 | model_config['activation'], 29 | model_config['dropout'], 30 | last_activation=False, 31 | last_bn=False) 32 | 33 | def score(self, batch): 34 | emb = self.embedding(batch) 35 | im_score = self.im(emb).squeeze(-1) 36 | if self.config['model']['deep']: 37 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 38 | return {'score' : im_score + mlp_score} 39 | else: 40 | return{'score': im_score} 41 | 42 | def _get_loss_func(self): 43 | return BCEWithLogitLoss() 44 | -------------------------------------------------------------------------------- /recstudio/model/fm/destine.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from recstudio.data.dataset import TripletDataset 3 | from .. 
import loss_func
4 | from ..basemodel import BaseRanker
5 | from ..module import ctr, MLPModule
6 | 
7 | r"""
8 | DESTINE
9 | ######################
10 | 
11 | Paper Reference:
12 |     Disentangled Self-Attentive Neural Networks for Click-Through Rate Prediction (CIKM'21)
13 |     https://dl.acm.org/doi/10.1145/3459637.3482088
14 | """
15 | 
16 | class DESTINE(BaseRanker):
17 | 
18 |     def _get_dataset_class():
19 |         return TripletDataset
20 | 
21 |     def _init_model(self, train_data, drop_unused_field=True):
22 |         super()._init_model(train_data, drop_unused_field)
23 |         self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data)
24 |         model_config = self.config['model']
25 |         if model_config['wide']:
26 |             self.linear = ctr.LinearLayer(self.fields, train_data)
27 |         if model_config['deep']:
28 |             self.mlp = MLPModule([self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'] + [1],
29 |                                  model_config['activation'],
30 |                                  model_config['dropout'],
31 |                                  last_activation=False,
32 |                                  last_bn=False
33 |                                  )
34 | 
35 |         if model_config['res_mode'] is not None and model_config['res_mode'].lower() == 'last_layer':  # res_mode may be None, so guard before .lower()
36 |             self.res = nn.Linear(self.embed_dim, model_config['attention_dim'])
37 |         elif model_config['res_mode'] is not None and model_config['res_mode'] != 'each_layer':
38 |             raise ValueError(f'Expect res_mode to be `last_layer`|`each_layer`|None, but got {model_config["res_mode"]}.')
39 | 
40 |         self.dsa = nn.Sequential(*[
41 |             ctr.DisentangledSelfAttentionInteractingLayer(
42 |                 self.embed_dim,
43 |                 attention_dim=self.embed_dim if i == 0 else model_config['attention_dim'],
44 |                 n_head=model_config['n_head'],
45 |                 dropout=model_config['dropout'],
46 |                 residual=(model_config['res_mode'] == 'each_layer'),
47 |                 scale=model_config['scale'],
48 |                 relu_before_att=model_config['relu_before_att'] if i == 0 else False,
49 |             )
50 |             for i in range(model_config['num_attention_layers'])])
51 |         self.fc = nn.Linear(self.embedding.num_features * self.embed_dim, 1)
52 | 
53 |     def score(self, batch):
54 |         emb = self.embedding(batch)
55 |         attn_out = self.dsa(emb)
56 |         if self.config['model']['res_mode'] is not None and self.config['model']['res_mode'].lower() == 'last_layer':
57 |             attn_out += self.res(emb)
58 |             attn_out = attn_out.relu()
59 |         attn_score = self.fc(attn_out.flatten(1)).squeeze(-1)
60 |         score = attn_score
61 |         if self.config['model']['wide']:
62 |             lr_score = self.linear(batch)
63 |             score += lr_score
64 |         if self.config['model']['deep']:
65 |             mlp_score = self.mlp(emb.flatten(1)).squeeze(-1)
66 |             score += mlp_score
67 |         return {'score' : score}
68 | 
69 |     def _get_loss_func(self):
70 |         return loss_func.BCEWithLogitLoss()
71 | 
--------------------------------------------------------------------------------
/recstudio/model/fm/difm.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from recstudio.data.dataset import TripletDataset
3 | from ..
import loss_func 4 | from ..basemodel import BaseRanker 5 | from ..module import ctr, LambdaLayer 6 | 7 | r""" 8 | DIFM 9 | ###################### 10 | 11 | Paper Reference: 12 | A Dual Input-aware Factorization Machine for CTR Prediction (IJCAI'20) 13 | https://dl.acm.org/doi/10.5555/3491440.3491874 14 | """ 15 | 16 | class DIFM(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.linear = ctr.LinearLayer(self.fields, train_data) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | model_config = self.config['model'] 26 | num_fields = self.embedding.num_features 27 | self.vec_wise_fen = nn.Sequential( 28 | ctr.SelfAttentionInteractingLayer( 29 | self.embed_dim, 30 | model_config['n_head'], 31 | model_config['dropout'], 32 | layer_norm=model_config['layer_norm']), 33 | LambdaLayer(lambda x: x.reshape(x.size(0), -1)), 34 | nn.Linear( 35 | num_fields * self.embed_dim, 36 | num_fields, 37 | bias=False)) 38 | self.bit_wise_fen = nn.Sequential( 39 | ctr.MLPModule( 40 | [num_fields * self.embed_dim] + model_config['mlp_layer'], 41 | model_config['activation'], 42 | model_config['dropout'], 43 | batch_norm=model_config['batch_norm']), 44 | nn.Linear( 45 | model_config['mlp_layer'][-1], 46 | num_fields, 47 | bias=False)) 48 | self.fm = ctr.FMLayer(reduction='sum') 49 | 50 | def score(self, batch): 51 | emb = self.embedding(batch) 52 | m_vec = self.vec_wise_fen(emb) 53 | m_bit = self.bit_wise_fen(emb.flatten(1)) 54 | weight = m_vec + m_bit 55 | lr_score = (super(ctr.LinearLayer, self.linear).forward(batch).squeeze(-1) * weight).sum(-1) + self.linear.bias 56 | fm_score = self.fm(emb * weight.unsqueeze(-1)) 57 | return {'score' : lr_score + fm_score} 58 | 59 | def _get_loss_func(self): 60 | return loss_func.BCEWithLogitLoss() 61 | -------------------------------------------------------------------------------- /recstudio/model/fm/dlrm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from ..basemodel import BaseRanker 5 | from ..loss_func import BCEWithLogitLoss 6 | from ..module import ctr, MLPModule, LambdaLayer 7 | 8 | r""" 9 | DLRM 10 | ###################### 11 | 12 | Paper Reference: 13 | Deep Learning Recommendation Model for Personalization and Recommendation Systems 14 | https://arxiv.org/abs/1906.00091 15 | """ 16 | 17 | class DLRM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | model_config = self.config['model'] 25 | sparse_fields = {f for f in self.fields if train_data.field2type[f] != 'float'} 26 | dense_fields = {f for f in self.fields if train_data.field2type[f] == 'float' and f != self.frating} 27 | num_fields = len(sparse_fields) + int(len(dense_fields) > 0) 28 | self.embedding = ctr.Embeddings(sparse_fields, self.embed_dim, train_data) 29 | if len(dense_fields) > 0: 30 | self.bottom_mlp = MLPModule( 31 | [len(dense_fields)] + model_config['bottom_mlp_layer'] + [self.embed_dim], 32 | model_config['bottom_activation'], 33 | model_config['bottom_dropout'], 34 | last_activation=False, 35 | last_bn=False) 36 | if model_config['op'].lower() == 'dot': 37 | self.interaction = ctr.InnerProductLayer(num_fields) 38 | 
top_mlp_in = num_fields * (num_fields - 1) // 2 + self.embed_dim * int(len(dense_fields) > 0)
39 |         elif model_config['op'].lower() == 'cat':
40 |             self.interaction = nn.Flatten(start_dim=1)
41 |             top_mlp_in = num_fields * self.embed_dim
42 |         elif model_config['op'].lower() == 'sum':
43 |             self.interaction = LambdaLayer(lambda emb: emb.sum(1))
44 |             top_mlp_in = self.embed_dim
45 |         else:
46 |             raise ValueError(f'Expect op to be `dot`|`cat`|`sum`, but got {model_config["op"]}.')
47 |         self.top_mlp = MLPModule(
48 |             [top_mlp_in] + model_config['top_mlp_layer'] + [1],
49 |             model_config['top_activation'],
50 |             model_config['top_dropout'],
51 |             last_activation=False,
52 |             last_bn=False)
53 | 
54 |     def score(self, batch):
55 |         emb = self.embedding(batch)
56 |         dense_fields = {f for f in self.fields if f not in self.embedding.field2types and f != self.frating}
57 |         if len(dense_fields) > 0:
58 |             dense_values = torch.vstack([batch[f] for f in dense_fields]).t()
59 |             if dense_values.dim() == 1:
60 |                 dense_values = dense_values.unsqueeze(-1)
61 |             dense_emb = self.bottom_mlp(dense_values)
62 |             emb = torch.cat([emb, dense_emb.unsqueeze(1)], dim=1)
63 |         inter_out = self.interaction(emb)
64 |         if self.config['model']['op'].lower() == 'dot' and len(dense_fields) > 0:
65 |             inter_out = torch.cat([inter_out, dense_emb], dim=-1)
66 |         score = self.top_mlp(inter_out).squeeze(-1)
67 |         return {'score': score}
68 | 
69 |     def _get_loss_func(self):
70 |         return BCEWithLogitLoss()
71 | 
--------------------------------------------------------------------------------
/recstudio/model/fm/edcn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from recstudio.data.dataset import TripletDataset
4 | from ..basemodel import BaseRanker
5 | from ..loss_func import BCEWithLogitLoss
6 | from ..module import ctr, MLPModule, LambdaLayer
7 | 
8 | r"""
9 | EDCN
10 | ######################
11 | 
12 | Paper Reference:
13 |     Enhancing Explicit and Implicit Feature Interactions via Information Sharing for Parallel Deep CTR Models (CIKM'21)
14 |     https://dl.acm.org/doi/10.1145/3459637.3481915
15 | """
16 | 
17 | class EDCN(BaseRanker):
18 | 
19 |     def _get_dataset_class():
20 |         return TripletDataset
21 | 
22 |     def _init_model(self, train_data):
23 |         super()._init_model(train_data)
24 |         self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data)
25 |         num_fields = self.embedding.num_features
26 |         model_config = self.config['model']
27 |         self.cross = nn.ModuleList([
28 |             ctr.CrossInteraction(num_fields * self.embed_dim)
29 |             for _ in range(model_config['num_layers'])
30 |         ])
31 |         self.mlp = nn.ModuleList([
32 |             MLPModule(
33 |                 2 * [num_fields * self.embed_dim],
34 |                 model_config['activation'],
35 |                 model_config['dropout'],
36 |                 batch_norm=model_config['batch_norm'])
37 |             for _ in range(model_config['num_layers'])
38 |         ])
39 |         self.bridge = nn.ModuleList([
40 |             ctr.BridgeLayer(
41 |                 num_fields * self.embed_dim,
42 |                 model_config['bridge_type'])
43 |             for _ in range(model_config['num_layers'])
44 |         ])
45 |         self.regulation = nn.ModuleList([
46 |             ctr.RegulationLayer(
47 |                 num_fields,
48 |                 self.embed_dim,
49 |                 model_config['temperature'],
50 |                 model_config['batch_norm'])
51 |             for _ in range(model_config['num_layers'])
52 |         ])
53 |         self.fc = torch.nn.Linear(3 * num_fields * self.embed_dim, 1)
54 | 
55 |     def score(self, batch):
56 |         emb = self.embedding(batch)
57 |         ci, di = self.regulation[0](emb.flatten(1))
58 |         c0 = ci
59 |         for i, (cross, deep, bridge) in enumerate(zip(self.cross,
self.mlp, self.bridge)): 60 | ci = cross(c0, ci) 61 | di = deep(di) 62 | bi = bridge(ci, di) 63 | if i + 1 < self.config['model']['num_layers']: 64 | ci, di = self.regulation[i + 1](bi) 65 | score = self.fc(torch.cat([ci, di, bi], -1)).squeeze(-1) 66 | return {'score' : score} 67 | 68 | def _get_loss_func(self): 69 | return BCEWithLogitLoss() 70 | -------------------------------------------------------------------------------- /recstudio/model/fm/ffm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | r""" 9 | FFM 10 | ###################### 11 | 12 | Paper Reference: 13 | Field-aware Factorization Machines for CTR Prediction (RecSys'16) 14 | https://dl.acm.org/doi/10.1145/2959100.2959134 15 | """ 16 | 17 | class FFM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.linear = ctr.LinearLayer(self.fields, train_data) 25 | num_fields = len(self.fields) - 1 26 | self.ffm = nn.Sequential( 27 | OrderedDict([ 28 | ("embedding", 29 | ctr.Embeddings( 30 | self.fields, 31 | self.embed_dim * (num_fields - 1), 32 | train_data)), 33 | ("ffm_layer", 34 | ctr.FieldAwareFMLayer( 35 | num_fields 36 | )) 37 | ])) 38 | 39 | def score(self, batch): 40 | lr_score = self.linear(batch) 41 | ffm_score = self.ffm(batch) 42 | return {'score' : lr_score + ffm_score} 43 | 44 | def _get_loss_func(self): 45 | return loss_func.BCEWithLogitLoss() 46 | -------------------------------------------------------------------------------- /recstudio/model/fm/fgcnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data.dataset import TripletDataset 3 | from .. 
import loss_func 4 | from ..basemodel import BaseRanker 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | FGCNN 9 | ###################### 10 | 11 | Paper Reference: 12 | Feature Generation by Convolutional Neural Network for Click-Through Rate Prediction (WWW'19) 13 | https://dl.acm.org/doi/abs/10.1145/3308558.3313497 14 | """ 15 | 16 | class FGCNN(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 24 | num_raw_fields = self.embedding.num_features 25 | model_config = self.config['model'] 26 | self.fgcnn = ctr.FGCNNLayer( 27 | num_raw_fields, 28 | self.embed_dim, 29 | model_config['channels'], 30 | model_config['heights'], 31 | model_config['pooling_sizes'], 32 | model_config['recombine_channels'], 33 | model_config['batch_norm']) 34 | num_new_fields = sum([rc * oh for rc, oh in zip(model_config['recombine_channels'], self.fgcnn.out_height[1:])]) 35 | num_total_fields = num_raw_fields + num_new_fields 36 | self.inner_product = ctr.InnerProductLayer(num_total_fields) 37 | mlp_in = num_total_fields * (num_total_fields - 1) // 2 + num_total_fields * self.embed_dim 38 | self.mlp = MLPModule( 39 | [mlp_in] + model_config['mlp_layer'] + [1], 40 | model_config['activation'], 41 | model_config['dropout'], 42 | last_activation=False, 43 | last_bn=False) 44 | 45 | def score(self, batch): 46 | raw_emb = self.embedding(batch) 47 | new_emb = self.fgcnn(raw_emb) 48 | comb_emb = torch.cat([raw_emb, new_emb], dim=1) 49 | inner_prod = self.inner_product(comb_emb) 50 | mlp_in = torch.cat([comb_emb.flatten(1), inner_prod], dim=1) 51 | score = self.mlp(mlp_in).squeeze(-1) 52 | return {'score' : score} 53 | 54 | def _get_loss_func(self): 55 | return loss_func.BCEWithLogitLoss() 56 | -------------------------------------------------------------------------------- /recstudio/model/fm/fibinet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | FiBiNET 10 | ###################### 11 | 12 | Paper Reference: 13 | FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction (RecSys'19) 14 | https://dl.acm.org/doi/abs/10.1145/3298689.3347043 15 | """ 16 | 17 | class FiBiNET(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.linear = ctr.LinearLayer(self.fields, train_data) 25 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 26 | model_config = self.config['model'] 27 | num_fields = self.embedding.num_features 28 | self.senet = ctr.SqueezeExcitation( 29 | num_fields, 30 | model_config['reduction_ratio'], 31 | model_config['excitation_activation']) 32 | self.bilinear = ctr.BilinearInteraction( 33 | num_fields, 34 | self.embed_dim, 35 | model_config['bilinear_type']) 36 | if not model_config['shared_bilinear']: 37 | self.bilinear4se = ctr.BilinearInteraction( 38 | num_fields, 39 | self.embed_dim, 40 | model_config['bilinear_type']) 41 | self.mlp = MLPModule( 42 | [num_fields * (num_fields - 1) * self.embed_dim] + model_config['mlp_layer'] + [1], 43 | model_config['activation'], 44 | model_config['dropout'], 45 | last_activation=False, 46 | last_bn=False) 47 | 48 | def score(self, batch): 49 | lr_score = self.linear(batch) 50 | emb = self.embedding(batch) 51 | senet_emb = self.senet(emb) 52 | bilinear_ori = self.bilinear(emb) 53 | if self.config['model']['shared_bilinear']: 54 | bilinear_senet = self.bilinear(senet_emb) 55 | else: 56 | bilinear_senet = self.bilinear4se(senet_emb) 57 | comb = torch.cat([bilinear_ori, bilinear_senet], dim=1) 58 | mlp_score = self.mlp(comb.flatten(1)).squeeze(-1) 59 | return {'score' : lr_score + mlp_score} 60 | 61 | def _get_loss_func(self): 62 | return loss_func.BCEWithLogitLoss() 63 | -------------------------------------------------------------------------------- /recstudio/model/fm/fignn.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | from recstudio.data.dataset import TripletDataset 5 | from ..basemodel import BaseRanker 6 | from ..loss_func import BCEWithLogitLoss 7 | from ..module import ctr, MLPModule 8 | 9 | r""" 10 | FiGNN 11 | ###################### 12 | 13 | Paper Reference: 14 | Fi-GNN: Modeling Feature Interactions via Graph Neural Networks for CTR Prediction (CIKM'19) 15 | https://doi.org/10.1145/3357384.3357951 16 | """ 17 | 18 | class FiGNN(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | model_config = self.config['model'] 26 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 27 | num_fields = self.embedding.num_features 28 | self.gnn = nn.Sequential(OrderedDict([ 29 | ('self_attn', 30 | ctr.SelfAttentionInteractingLayer( 31 | self.embed_dim, 32 | model_config['n_head'], 33 | model_config['dropout'], 34 | residual=True, 35 | residual_project=False, 36 | layer_norm=model_config['layer_norm'])), 37 | ('fignn', 38 | ctr.FiGNNLayer( 39 | num_fields, 40 | self.embed_dim, 41 | model_config['num_gnn_layers'])) 42 | ])) 43 | self.attn_pred = nn.ModuleDict({ 44 | 'mlp1': 
nn.Linear(self.embed_dim, 1),
45 |             'mlp2': nn.Linear(num_fields * self.embed_dim, num_fields)
46 |         })
47 |         if model_config['deep']:
48 |             self.mlp = MLPModule(
49 |                 [self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'] + [1],
50 |                 model_config['activation'],
51 |                 model_config['dropout'],
52 |                 last_activation=False,
53 |                 last_bn=False)
54 | 
55 |     def score(self, batch):
56 |         emb = self.embedding(batch)
57 |         gnn_out = self.gnn(emb)
58 |         gnn_score = (self.attn_pred['mlp2'](gnn_out.flatten(1)) * \
59 |                      self.attn_pred['mlp1'](gnn_out).squeeze(-1)).sum(-1)
60 |         if self.config['model']['deep']:
61 |             mlp_score = self.mlp(emb.flatten(1)).squeeze(-1)
62 |             return {'score' : gnn_score + mlp_score}
63 |         else:
64 |             return {'score': gnn_score}
65 | 
66 |     def _get_loss_func(self):
67 |         return BCEWithLogitLoss()
68 | 
--------------------------------------------------------------------------------
/recstudio/model/fm/flen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from recstudio.data.dataset import TripletDataset
4 | from ..basemodel import BaseRanker
5 | from ..loss_func import BCEWithLogitLoss
6 | from ..module import ctr, MLPModule, get_act
7 | 
8 | r"""
9 | FLEN
10 | ######################
11 | 
12 | Paper Reference:
13 |     FLEN: Leveraging Field for Scalable CTR Prediction (DLP KDD'20)
14 |     https://dlp-kdd.github.io/dlp-kdd2020/assets/pdf/a3-chen.pdf
15 | """
16 | 
17 | class FLEN(BaseRanker):
18 | 
19 |     def _get_dataset_class():
20 |         return TripletDataset
21 | 
22 |     def _init_model(self, train_data, drop_unused_field=True):
23 |         super()._init_model(train_data, drop_unused_field)
24 |         model_config = self.config['model']
25 |         self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data)
26 |         if model_config.get('fields', None) is None:
27 |             fields = [f.fields for f in train_data._get_feat_list()]
28 |         else:
29 |             fields = model_config['fields']
30 |         all_fields = set()
31 |         for f in fields:
32 |             if all_fields.intersection(set(f)) not in [{self.fuid}, {self.fiid},
33 |                                                        {self.fuid, self.fiid}, set()]:
34 |                 raise ValueError('Expect no intersection between field groups '
35 |                                  f'except {self.fuid} and {self.fiid}, '
36 |                                  f'but got {all_fields.intersection(set(f))}.')
37 |             all_fields = all_fields.union(set(f))
38 |         if len(all_fields) != self.embedding.num_features:
39 |             raise ValueError(f'Expect fields to cover all {self.embedding.num_features} features, '
40 |                              f'but got {all_fields - self.fields}.')
41 | 
42 |         self.fwbi = ctr.FieldWiseBiInteraction(
43 |             self.embed_dim,
44 |             train_data,
45 |             model_config['activation'],
46 |             model_config['dropout'],
47 |             fields)
48 |         self.mlp = MLPModule(
49 |             [self.embedding.num_features*self.embed_dim] + model_config['mlp_layer'],
50 |             model_config['activation'],
51 |             model_config['dropout'],
52 |             batch_norm=True,
53 |             last_activation=True,
54 |             last_bn=True)
55 |         self.fc = nn.Linear(model_config['mlp_layer'][-1] + self.embed_dim + 1, 1, bias=False)
56 | 
57 |     def score(self, batch):
58 |         emb = self.embedding(batch)
59 |         field_embs = []
60 |         for field in self.fwbi.fields:
61 |             field_idx = [list(self.embedding.embeddings).index(f) for f in field if f != self.frating]
62 |             field_embs.append(emb[:, field_idx, :])
63 |         fwbi_out = self.fwbi(batch, field_embs)
64 |         mlp_out = self.mlp(emb.flatten(1))
65 |         score = self.fc(torch.cat([mlp_out, fwbi_out], dim=-1)).squeeze(-1)
66 |         return {'score': score}
67 | 
68 |     def _get_loss_func(self):
69 |         return BCEWithLogitLoss()
70 | 
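flen.yaml leaves `fields: ~`, so FLEN falls back to the per-feat grouping returned by `train_data._get_feat_list()`; the validation loop in `_init_model` above only lets groups overlap on the user and item id fields. A hypothetical explicit grouping that would pass that check (the field names are illustrative only, not taken from any dataset config in this repository):

# Hypothetical override for config['model']['fields'] in flen.yaml.
flen_fields = [
    ['user_id', 'age', 'gender'],   # user-side group
    ['item_id', 'category'],        # item-side group
    ['user_id', 'item_id'],         # context group; only the id fields may repeat
]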
-------------------------------------------------------------------------------- /recstudio/model/fm/fm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | 9 | class FM(BaseRanker): 10 | 11 | def _get_dataset_class(): 12 | return TripletDataset 13 | 14 | def _init_model(self, train_data, drop_unused_field=True): 15 | super()._init_model(train_data, drop_unused_field) 16 | self.fm = torch.nn.Sequential(OrderedDict([ 17 | ("embeddings", ctr.Embeddings( 18 | fields=self.fields, 19 | embed_dim=self.embed_dim, 20 | data=train_data)), 21 | ("fm_layer", ctr.FMLayer(reduction='sum')), 22 | ])) 23 | self.linear = ctr.LinearLayer(self.fields, train_data) 24 | 25 | def score(self, batch): 26 | fm_score = self.fm(batch) 27 | lr_score = self.linear(batch) 28 | return {'score' : fm_score + lr_score} 29 | 30 | def _get_loss_func(self): 31 | return loss_func.BCEWithLogitLoss() 32 | -------------------------------------------------------------------------------- /recstudio/model/fm/fmfm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | from recstudio.data.dataset import TripletDataset 5 | from .. import loss_func 6 | from ..basemodel import BaseRanker 7 | from ..module import ctr 8 | 9 | r""" 10 | FmFM 11 | ###################### 12 | 13 | Paper Reference: 14 | FM^2: Field-matrixed Factorization Machines for Recommender Systems (WWW'21) 15 | https://dl.acm.org/doi/10.1145/3442381.3449930 16 | """ 17 | 18 | class FmFM(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | self.linear = ctr.LinearLayer(self.fields, train_data) 26 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 27 | num_fields = self.embedding.num_features 28 | self.field_weight = nn.Parameter(torch.randn(num_fields*(num_fields - 1)//2, self.embed_dim, self.embed_dim)) 29 | self.triu_index = nn.Parameter( 30 | torch.triu_indices(num_fields, num_fields, offset=1), 31 | requires_grad=False) 32 | 33 | def score(self, batch): 34 | lr_score = self.linear(batch) 35 | emb = self.embedding(batch) 36 | emb0 = torch.index_select(emb, 1, self.triu_index[0]) 37 | emb1 = torch.index_select(emb, 1, self.triu_index[1]) 38 | fmfm_score = ((emb0.unsqueeze(-2) @ self.field_weight).squeeze(-2) * emb1).sum((-1, -2)) 39 | return {'score' : lr_score + fmfm_score} 40 | 41 | def _get_loss_func(self): 42 | return loss_func.BCEWithLogitLoss() 43 | -------------------------------------------------------------------------------- /recstudio/model/fm/fwfm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | r""" 9 | FwFM 10 | ###################### 11 | 12 | Paper Reference: 13 | Field-weighted Factorization Machines for Click-Through Rate Prediction in Display Advertising (WWW'18) 14 | https://dl.acm.org/doi/abs/10.1145/3178876.3186040 15 | """ 16 | 17 | class FwFM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | num_fields = self.embedding.num_features 26 | self.fwfm = nn.Sequential( 27 | OrderedDict([ 28 | ("fm_layer", 29 | ctr.InnerProductLayer( 30 | num_fields)), 31 | ("field_weighted", 32 | nn.Linear(num_fields * (num_fields - 1) // 2, 1)) 33 | ])) 34 | if self.config['model']['linear_type'].lower() == 'lw': 35 | self.linear = ctr.LinearLayer(self.fields, train_data) 36 | elif self.config['model']['linear_type'].lower() == 'felv': 37 | self.linear_embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 38 | elif self.config['model']['linear_type'].lower() == 'filv': 39 | self.linear = nn.Linear(num_fields * self.embed_dim, 1, bias=False) 40 | else: 41 | raise ValueError('Expect linear_type to be `lw`|`felv`|`filv`, ' 42 | f'but got {self.config["model"]["linear_type"]}.') 43 | 44 | def score(self, batch): 45 | emb = self.embedding(batch) 46 | if self.config['model']['linear_type'].lower() == 'lw': 47 | lr_score = self.linear(batch) 48 | elif self.config['model']['linear_type'].lower() == 'felv': 49 | lr_emb = self.linear_embedding(batch) 50 | lr_score = (lr_emb * emb).sum((1, 2)) 51 | else: 52 | lr_score = self.linear(emb.flatten(1)).squeeze(-1) 53 | fwfm_score = self.fwfm(emb).squeeze(-1) 54 | return {'score' : lr_score + fwfm_score} 55 | 56 | def _get_loss_func(self): 57 | return loss_func.BCEWithLogitLoss() 58 | -------------------------------------------------------------------------------- /recstudio/model/fm/hfm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from recstudio.data.dataset import TripletDataset 3 | from ..basemodel import BaseRanker 4 | from ..loss_func import BCEWithLogitLoss 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | HFM 9 | ###################### 10 | 11 | Paper Reference: 12 | Holographic Factorization Machines for Recommendation (AAAI'19) 13 | https://dl.acm.org/doi/10.1609/aaai.v33i01.33015143 14 | """ 15 | 16 | class HFM(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.linear = ctr.LinearLayer(self.fields, train_data) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | num_fields = self.embedding.num_features 26 | model_config = self.config['model'] 27 | self.hfm = ctr.HolographicFMLayer(num_fields, model_config['op']) 28 | if model_config['deep']: 29 | self.mlp = MLPModule( 30 | [num_fields * (num_fields - 1) // 2 * self.embed_dim] + model_config['mlp_layer'] + [1], 31 | model_config['activation'], 32 | model_config['dropout'], 33 | last_activation=False, 34 | last_bn=False) 35 | else: 36 | self.fc = nn.Linear(self.embed_dim, 1, bias=False) 37 | 38 | def score(self, batch): 39 | lr_score = self.linear(batch) 40 | emb = self.embedding(batch) 41 | hfm_out = 
self.hfm(emb) 42 | if self.config['model']['deep']: 43 | hfm_score = self.mlp(hfm_out.flatten(1)).squeeze(-1) 44 | else: 45 | hfm_score = self.fc(hfm_out.sum(1)).squeeze(-1) 46 | return{'score': lr_score + hfm_score} 47 | 48 | def _get_loss_func(self): 49 | return BCEWithLogitLoss() 50 | -------------------------------------------------------------------------------- /recstudio/model/fm/ifm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from recstudio.data.dataset import TripletDataset 3 | from .. import loss_func 4 | from ..basemodel import BaseRanker 5 | from ..module import ctr 6 | 7 | r""" 8 | IFM 9 | ###################### 10 | 11 | Paper Reference: 12 | An Input-aware Factorization Machine for Sparse Prediction (IJCAI'19) 13 | https://dl.acm.org/doi/10.5555/3367032.3367240 14 | """ 15 | 16 | class IFM(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.linear = ctr.LinearLayer(self.fields, train_data) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | model_config = self.config['model'] 26 | num_fields = self.embedding.num_features 27 | self.fen = ctr.MLPModule( 28 | [num_fields * self.embed_dim] + model_config['mlp_layer'], 29 | model_config['activation'], 30 | model_config['dropout'], 31 | batch_norm=model_config['batch_norm']) 32 | self.fen.add_modules( 33 | nn.Linear(model_config['mlp_layer'][-1], num_fields, bias=False), 34 | nn.Softmax(dim=-1)) 35 | self.fm = ctr.FMLayer(reduction='sum') 36 | 37 | def score(self, batch): 38 | emb = self.embedding(batch) 39 | weight = self.fen(emb.flatten(1)) 40 | lr_score = (super(ctr.LinearLayer, self.linear).forward(batch).squeeze(-1) * weight).sum(-1) + self.linear.bias 41 | fm_score = self.fm(emb * weight.unsqueeze(-1)) 42 | return {'score' : lr_score + fm_score} 43 | 44 | def _get_loss_func(self): 45 | return loss_func.BCEWithLogitLoss() 46 | -------------------------------------------------------------------------------- /recstudio/model/fm/interhat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | InterHAT 10 | ###################### 11 | 12 | Paper Reference: 13 | Interpretable Click-Through Rate Prediction through Hierarchical Attention (WSDM'20) 14 | https://dl.acm.org/doi/10.1145/3336191.3371785 15 | """ 16 | 17 | class InterHAT(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | model_config = self.config['model'] 26 | self.trm = nn.TransformerEncoderLayer( 27 | self.embed_dim, model_config['n_head'], 28 | model_config['feedforward_dim'], 29 | model_config['dropout'], 30 | model_config['activation'], 31 | batch_first=True) 32 | self.aggs = nn.ModuleList([ 33 | ctr.AttentionalAggregation( 34 | self.embed_dim, 35 | model_config['aggregation_dim']) 36 | for _ in range(model_config['order'] + 1) 37 | ]) 38 | self.mlp = MLPModule([self.embed_dim] + model_config['mlp_layer'] + [1], 39 | model_config['activation'], 40 | model_config['dropout'], 41 | last_activation=False, 42 | last_bn=False 43 | ) 44 | 45 | def score(self, batch): 46 | emb = self.embedding(batch) 47 | xi = x1 = self.trm(emb) 48 | U = [] 49 | for i, agg in enumerate(self.aggs[:-1]): 50 | ui = agg(xi, xi) 51 | U.append(ui) 52 | if i < self.config['model']['order']: 53 | xi = ui.unsqueeze(1) * x1 + xi 54 | U = torch.stack(U, dim=1) 55 | uf = self.aggs[-1](U, U) 56 | score = self.mlp(uf).squeeze(-1) 57 | return {'score' : score} 58 | 59 | def _get_loss_func(self): 60 | return loss_func.BCEWithLogitLoss() 61 | -------------------------------------------------------------------------------- /recstudio/model/fm/lorentzfm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | r""" 9 | LorentzFM 10 | ###################### 11 | 12 | Paper Reference: 13 | Learning Feature Interactions with Lorentzian Factorization Machine (AAAI'20) 14 | https://arxiv.org/pdf/1911.09821 15 | """ 16 | 17 | class LorentzFM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.lfm = nn.Sequential(OrderedDict([ 25 | ("embeddings", 26 | ctr.Embeddings(fields=self.fields, embed_dim=self.embed_dim, data=train_data)), 27 | ("triangle_pooling_layer", 28 | ctr.TrianglePoolingLayer((len(self.fields) - 1))) 29 | ])) 30 | 31 | def score(self, batch): 32 | lfm_score = self.lfm(batch) 33 | return {'score' : lfm_score} 34 | 35 | def _get_loss_func(self): 36 | return loss_func.BCEWithLogitLoss() 37 | -------------------------------------------------------------------------------- /recstudio/model/fm/lr.py: -------------------------------------------------------------------------------- 1 | from ..basemodel import BaseRanker 2 | from ..module import ctr 3 | from ..loss_func import BCEWithLogitLoss 4 | from recstudio.data.dataset import TripletDataset 5 | 6 | 7 | class LR(BaseRanker): 8 | 9 | def _get_dataset_class(): 10 | return TripletDataset 11 | 12 | def _init_model(self, train_data, drop_unused_field=True): 13 | super()._init_model(train_data, drop_unused_field) 14 | self.linear = ctr.LinearLayer(self.fields, train_data) 15 | 16 | def _get_loss_func(self): 17 | return BCEWithLogitLoss() 18 | 19 | def score(self, batch): 20 | return {'score' : self.linear(batch)} 21 | -------------------------------------------------------------------------------- /recstudio/model/fm/masknet.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | from .. 
import loss_func 3 | from ..basemodel import BaseRanker 4 | from ..module import ctr 5 | 6 | r""" 7 | MaskNet 8 | ###################### 9 | 10 | Paper Reference: 11 | MaskNet: Introducing Feature-Wise Multiplication to CTR Ranking Models by Instance-Guided Mask (DLP KDD'21) 12 | https://arxiv.org/abs/2102.07619 13 | """ 14 | 15 | class MaskNet(BaseRanker): 16 | 17 | def _get_dataset_class(): 18 | return TripletDataset 19 | 20 | def _init_model(self, train_data, drop_unused_field=True): 21 | super()._init_model(train_data, drop_unused_field) 22 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 23 | model_config = self.config['model'] 24 | if model_config['parallel']: 25 | self.masknet = ctr.ParallelMaskNet( 26 | self.embedding.num_features, 27 | self.embed_dim, 28 | model_config['num_blocks'], 29 | model_config['block_dim'], 30 | model_config['reduction_ratio'], 31 | model_config['mlp_layer'], 32 | model_config['activation'], 33 | model_config['dropout'], 34 | model_config['hidden_layer_norm']) 35 | else: 36 | self.masknet = ctr.SerialMaskNet( 37 | self.embedding.num_features, 38 | self.embed_dim, 39 | model_config['block_dim'], 40 | model_config['reduction_ratio'], 41 | model_config['activation'], 42 | model_config['dropout'], 43 | model_config['hidden_layer_norm']) 44 | 45 | def score(self, batch): 46 | emb = self.embedding(batch) 47 | score = self.masknet(emb).squeeze(-1) 48 | return {'score' : score} 49 | 50 | def _get_loss_func(self): 51 | return loss_func.BCEWithLogitLoss() 52 | -------------------------------------------------------------------------------- /recstudio/model/fm/nfm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.model.basemodel import BaseRanker 4 | from recstudio.model.module import ctr, MLPModule 5 | from recstudio.data.dataset import TripletDataset 6 | from ..loss_func import BCEWithLogitLoss 7 | 8 | 9 | class NFM(BaseRanker): 10 | 11 | def _get_dataset_class(): 12 | return TripletDataset 13 | 14 | def _init_model(self, train_data, drop_unused_field=True): 15 | super()._init_model(train_data, drop_unused_field) 16 | self.linear = ctr.LinearLayer(self.fields, train_data) 17 | model_config = self.config['model'] 18 | self.nfm = nn.Sequential( 19 | OrderedDict([ 20 | ("embedding", 21 | ctr.Embeddings( 22 | self.fields, 23 | self.embed_dim, 24 | train_data)), 25 | ("fm_layer", 26 | ctr.FMLayer()), 27 | ("batch_norm", 28 | nn.BatchNorm1d(self.embed_dim)), 29 | ("mlp", 30 | MLPModule( 31 | [self.embed_dim]+model_config['mlp_layer']+[1], 32 | model_config['activation'], 33 | model_config['dropout'], 34 | batch_norm=model_config['batch_norm'], 35 | last_activation=False, last_bn=False)) 36 | ])) 37 | 38 | def score(self, batch): 39 | linear_score = self.linear(batch) 40 | mlp_score = self.nfm(batch).squeeze(-1) 41 | return {'score' : linear_score + mlp_score} 42 | 43 | def _get_loss_func(self): 44 | return BCEWithLogitLoss() 45 | -------------------------------------------------------------------------------- /recstudio/model/fm/onn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | ONN 10 | ###################### 11 | 12 | Paper Reference: 13 | Operation-aware Neural Networks for user response prediction (Neural Networks'20) 14 | https://dl.acm.org/doi/10.1016/j.neunet.2019.09.020 15 | """ 16 | 17 | class ONN(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | num_fields = len(self.fields) - 1 25 | model_config = self.config['model'] 26 | self.onn = nn.Sequential( 27 | OrderedDict([ 28 | ("embedding", 29 | ctr.Embeddings( 30 | self.fields, 31 | self.embed_dim * num_fields, 32 | train_data)), 33 | ("ofm_layer", 34 | ctr.OperationAwareFMLayer( 35 | num_fields 36 | )), 37 | ("mlp", 38 | MLPModule( 39 | [num_fields * self.embed_dim + num_fields * (num_fields - 1) // 2] + model_config['mlp_layer'] + [1], 40 | model_config['activation'], 41 | model_config['dropout'], 42 | batch_norm=model_config['batch_norm'], 43 | last_activation=False, last_bn=False)) 44 | ])) 45 | 46 | 47 | def score(self, batch): 48 | onn_score = self.onn(batch).squeeze(-1) 49 | return {'score' : onn_score} 50 | 51 | def _get_loss_func(self): 52 | return loss_func.BCEWithLogitLoss() 53 | -------------------------------------------------------------------------------- /recstudio/model/fm/ppnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from .. import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | PPNet 10 | ###################### 11 | 12 | Parameter Personalized Network (PPNet), used in Kuaishou's ranking system since 2019. 13 | """ 14 | 15 | class BCEWithLogitLossWithAux(loss_func.BCEWithLogitLoss): 16 | def forward(self, aux_score, label, pos_score): 17 | return super().forward(label, aux_score) + super().forward(label, pos_score) 18 | 19 | class PPNet(BaseRanker): 20 | 21 | def _get_dataset_class(): 22 | return TripletDataset 23 | 24 | def _init_model(self, train_data, drop_unused_field=True): 25 | super()._init_model(train_data, drop_unused_field) 26 | model_config = self.config['model'] 27 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 28 | self.mlp = MLPModule([self.embedding.num_features*self.embed_dim] + model_config['mlp_layer'] + [1], 29 | model_config['activation'], 30 | model_config['dropout'], 31 | last_activation=False, 32 | last_bn=False 33 | ) 34 | if model_config['id_fields'] is None: 35 | id_fields = [] 36 | if self.fuid is not None: 37 | id_fields.append(self.fuid) 38 | if self.fiid is not None: 39 | id_fields.append(self.fiid) 40 | if len(id_fields) == 0: 41 | raise ValueError('Expect id_fields, but got None.') 42 | else: 43 | id_fields = model_config['id_fields'] 44 | self.id_embedding = ctr.Embeddings(id_fields, model_config['id_embed_dim'], train_data) 45 | pp_hidden_dims = [self.embedding.num_features*self.embed_dim] + model_config['pp_hidden_dims'] 46 | self.ppnet = nn.ModuleList([ 47 | ctr.PPLayer( 48 | pp_hidden_dims[i : i + 2], 49 | self.embedding.num_features*self.embed_dim + len(id_fields)*model_config['id_embed_dim'], 50 | model_config['gate_hidden_dims'][i], 51 | model_config['activation'], 52 | model_config['dropout'], 53 | model_config['batch_norm']) 54 | for i in range(len(pp_hidden_dims) - 1) 55 | ]) 56 | self.fc = nn.Linear(pp_hidden_dims[-1], 1)
57 | 58 | def score(self, batch): 59 | emb = self.embedding(batch) 60 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 61 | 62 | id_emb = self.id_embedding(batch) 63 | gate_in = torch.cat([emb.flatten(1).detach(), id_emb.flatten(1)], dim=-1) 64 | mlp_in = emb.flatten(1).detach() 65 | for pplayer in self.ppnet: 66 | mlp_in = pplayer(gate_in, mlp_in) 67 | ppnet_score = self.fc(mlp_in).squeeze(-1) 68 | return {'aux_score' : mlp_score, 'score': ppnet_score} 69 | 70 | def _get_loss_func(self): 71 | return BCEWithLogitLossWithAux() 72 | 73 | def training_step(self, batch): 74 | y_h, output = self.forward(batch) 75 | loss = self.loss_fn(output['aux_score'], **y_h) 76 | return loss 77 | -------------------------------------------------------------------------------- /recstudio/model/fm/sam.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from ..basemodel import BaseRanker 5 | from ..loss_func import BCEWithLogitLoss 6 | from ..module import ctr, LambdaLayer 7 | 8 | r""" 9 | SAM 10 | ###################### 11 | 12 | Paper Reference: 13 | Looking at CTR Prediction Again: Is Attention All You Need? (SIGIR'21) 14 | https://dl.acm.org/doi/10.1145/3404835.3462936 15 | """ 16 | 17 | class SAM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | num_fields = len(self.fields) - 1 25 | model_config = self.config['model'] 26 | fi = model_config['interaction_type'].lower() 27 | self.sam = nn.Sequential(OrderedDict([ 28 | ('embedding', 29 | ctr.Embeddings(self.fields, self.embed_dim, train_data)), 30 | ('interaction', 31 | ctr.SAMFeatureInteraction( 32 | fi, 33 | self.embed_dim, 34 | num_fields, 35 | model_config['dropout'])) 36 | ])) 37 | if fi == 'sam1': 38 | self.sam.add_module('agg', nn.Flatten(start_dim=1)) 39 | self.sam.add_module('fc', nn.Linear(num_fields * self.embed_dim, 1)) 40 | elif fi in ['sam2a', 'sam2e']: 41 | self.sam.add_module('agg', nn.Flatten(start_dim=1)) 42 | self.sam.add_module('fc', nn.Linear(num_fields * num_fields * self.embed_dim, 1)) 43 | else: 44 | self.sam.add_module('agg', nn.Sequential( 45 | LambdaLayer(lambda x: x.transpose(1, 2)), 46 | nn.Linear(num_fields, 1, bias=False), 47 | LambdaLayer(lambda x: x.sum(-1)))) 48 | self.sam.add_module('fc', nn.Linear(self.embed_dim, 1)) 49 | 50 | 51 | def score(self, batch): 52 | score = self.sam(batch).squeeze(-1) 53 | return {'score': score} 54 | 55 | def _get_loss_func(self): 56 | return BCEWithLogitLoss() 57 | -------------------------------------------------------------------------------- /recstudio/model/fm/widedeep.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | from ..basemodel import BaseRanker 3 | from ..loss_func import BCEWithLogitLoss 4 | from ..module import ctr, MLPModule 5 | 6 | 7 | class WideDeep(BaseRanker): 8 | 9 | def _get_dataset_class(): 10 | return TripletDataset 11 | 12 | def _init_model(self, train_data, drop_unused_field=True): 13 | super()._init_model(train_data, drop_unused_field) 14 | self.linear = ctr.LinearLayer(self.fields, train_data) 15 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 16 | model_config = self.config['model'] 17 | self.mlp = MLPModule( 18 |
[self.embedding.num_features*self.embed_dim]+model_config['mlp_layer']+[1], 19 | activation_func = model_config['activation'], 20 | dropout = model_config['dropout'], 21 | batch_norm = model_config['batch_norm'], 22 | last_activation = False, last_bn=False) 23 | 24 | def score(self, batch): 25 | wide_score = self.linear(batch) 26 | emb = self.embedding(batch) 27 | deep_score = self.mlp(emb.flatten(1)).squeeze(-1) 28 | return {'score' : wide_score + deep_score} 29 | 30 | def _get_loss_func(self): 31 | return BCEWithLogitLoss() 32 | -------------------------------------------------------------------------------- /recstudio/model/fm/xdeepfm.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | 3 | from ..basemodel import BaseRanker 4 | from ..loss_func import BCEWithLogitLoss 5 | from ..module import ctr, MLPModule 6 | 7 | 8 | class xDeepFM(BaseRanker): 9 | 10 | def _get_dataset_class(): 11 | return TripletDataset 12 | 13 | def _init_model(self, train_data, drop_unused_field=True): 14 | super()._init_model(train_data, drop_unused_field) 15 | self.linear = ctr.LinearLayer(self.fields, train_data) 16 | self.fm = ctr.FMLayer(reduction='sum') 17 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 18 | model_config = self.config['model'] 19 | self.cin = ctr.CIN(self.embed_dim, self.embedding.num_features, 20 | model_config['cin_layer_size'], model_config['activation'], 21 | direct=model_config['direct']) 22 | self.mlp = MLPModule([self.embedding.num_features*self.embed_dim]+model_config['mlp_layer']+[1], 23 | model_config['activation'], model_config['dropout'], 24 | last_activation=False, last_bn=False) 25 | 26 | def score(self, batch): 27 | lr_score = self.linear(batch) 28 | emb = self.embedding(batch) 29 | cin_score = self.cin(emb).squeeze(-1) 30 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 31 | return {'score' : lr_score + cin_score + mlp_score} 32 | 33 | def _get_loss_func(self): 34 | return BCEWithLogitLoss() 35 | -------------------------------------------------------------------------------- /recstudio/model/graph/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.graph.ngcf import NGCF 2 | from recstudio.model.graph.lightgcn import LightGCN -------------------------------------------------------------------------------- /recstudio/model/graph/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_ratio: [0.8, 0.1, 0.1] 3 | 4 | train: 5 | early_stop_patience: 100 6 | epochs: 1000 7 | learning_rate: 0.001 8 | negative_count: 1 9 | 10 | eval: 11 | batch_size: 128 12 | cutoff: [20, 10, 5] 13 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/lightgcn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | n_layers: 3 3 | l2_reg_weight: 1e-4 4 | 5 | train: 6 | weight_decay: 0 7 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/ncl.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | n_layers: 3 3 | hyper_layers: 1 4 | num_clusters: 10 5 | l2_reg_weight: 1e-4 6 | temperature: 0.05 7 | 8 | # In the original paper, the contrastive loss is not averaged within each batch, 9 | # so lambda would have to be re-tuned whenever the batch size changes.
10 | # Here lambda has already been multiplied by the batch size and the contrastive loss is averaged over the batch, 11 | # so lambda does not change with the batch size. 12 | ssl_reg: 0.005 # 1e-6 * 4096. 13 | alpha: 0.5 14 | proto_reg: 0.0002 # 5e-8 * 4096 15 | 16 | # ml-1m 17 | # num_clusters: 1000 18 | # l2_reg_weight: 1e-4 19 | # temperature: 0.1 20 | # ssl_reg: 5e4 # 1e-7 21 | # alpha: 1 22 | # proto_reg: 3e4 # 8e-8 23 | 24 | eval: 25 | val_metrics: [recall, ndcg] 26 | 27 | train: 28 | num_m_epoch: 1 29 | warm_up_epoch: 20 30 | batch_size: 2048 31 | learning_rate: 2e-3 32 | 33 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/ngcf.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_ratio: [0.8, 0.1, 0.1] 3 | 4 | eval: 5 | batch_size: 128 6 | cutoff: [20, 10, 5] 7 | 8 | model: 9 | embed_dim: 64 10 | layer_size: [64, 64, 64, 64] 11 | mess_dropout: [0.1, 0.1, 0.1] 12 | node_dropout: 0.1 13 | l2_reg_weight: 1e-5 14 | 15 | train: 16 | batch_size: 2048 17 | early_stop_patience: 100 18 | learning_rate: 0.0001 19 | negative_count: 1 20 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/sgl.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_ratio: [0.8, 0.1, 0.1] 3 | 4 | eval: 5 | batch_size: 128 6 | cutoff: [20, 10, 5] 7 | 8 | model: 9 | aug_type: 'ED' 10 | embed_dim: 64 11 | n_layers: 3 12 | l2_reg_weight: 1e-4 13 | negative_count: 1 14 | ssl_ratio: 0.1 15 | ssl_reg: 0.1 16 | temperature: 0.2 17 | 18 | train: 19 | batch_size: 2048 20 | early_stop_patience: 100 21 | epochs: 1000 22 | learning_rate: 0.001 23 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/simgcl.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_ratio: [0.8, 0.1, 0.1] 3 | 4 | eval: 5 | cutoff: [20, 10, 5] 6 | 7 | model: 8 | embed_dim: 64 9 | eps: 0.1 10 | n_layers: 3 11 | 12 | l2_reg_weight: 1e-4 13 | cl_neg_type: all 14 | cl_weight: 0.5 15 | temperature: 0.2 16 | 17 | 18 | train: 19 | batch_size: 2048 20 | early_stop_patience: 100 21 | epochs: 1000 22 | eval_batch_size: 128 23 | learning_rate: 0.001 24 | -------------------------------------------------------------------------------- /recstudio/model/init.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.init import xavier_normal_, xavier_uniform_, constant_ 3 | 4 | 5 | def xavier_normal_initialization(module): 6 | if isinstance(module, nn.Embedding): 7 | xavier_normal_(module.weight.data) 8 | if module.padding_idx is not None: 9 | constant_(module.weight.data[module.padding_idx], 0.) 10 | elif isinstance(module, nn.Linear): 11 | xavier_normal_(module.weight.data) 12 | if module.bias is not None: 13 | constant_(module.bias.data, 0) 14 | elif isinstance(module, nn.LayerNorm): 15 | module.bias.data.zero_() 16 | module.weight.data.fill_(1.0) 17 | 18 | class normal_initialization(object): 19 | def __init__(self, initial_range=0.02) -> None: 20 | super().__init__() 21 | self.initial_range = initial_range 22 | 23 | def __call__(self, module): 24 | if isinstance(module, nn.Embedding): 25 | module.weight.data.normal_(mean=0.0, std=self.initial_range) 26 | if module.padding_idx is not None: 27 | constant_(module.weight.data[module.padding_idx], 0.)
28 | elif isinstance(module, nn.Linear): 29 | module.weight.data.normal_(mean=0.0, std=self.initial_range) 30 | if module.bias is not None: 31 | module.bias.data.zero_() 32 | elif isinstance(module, nn.LayerNorm): 33 | module.bias.data.zero_() 34 | module.weight.data.fill_(1.0) 35 | 36 | def xavier_uniform_initialization(module): 37 | if isinstance(module, nn.Embedding): 38 | xavier_uniform_(module.weight.data) 39 | if module.padding_idx is not None: 40 | constant_(module.weight.data[module.padding_idx], 0.) 41 | elif isinstance(module, nn.Linear): 42 | xavier_uniform_(module.weight.data) 43 | if module.bias is not None: 44 | constant_(module.bias.data, 0) 45 | elif isinstance(module, nn.LayerNorm): 46 | module.bias.data.zero_() 47 | module.weight.data.fill_(1.0) 48 | 49 | -------------------------------------------------------------------------------- /recstudio/model/kg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/model/kg/__init__.py -------------------------------------------------------------------------------- /recstudio/model/kg/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | kg_network_index: 1 # the index of the knowledge graph network in the dataset configuration file. -------------------------------------------------------------------------------- /recstudio/model/kg/config/cfkg.yaml: -------------------------------------------------------------------------------- 1 | 2 | train: 3 | negative_count: 1 4 | 5 | model: 6 | embed_dim: 64 7 | margin: 1.0 -------------------------------------------------------------------------------- /recstudio/model/kg/config/cke.yaml: -------------------------------------------------------------------------------- 1 | 2 | train: 3 | negative_count: 1 4 | weight_decay: 0.000001 5 | 6 | model: 7 | embed_dim: 150 8 | pro_embed_dim: 150 9 | normalize: True -------------------------------------------------------------------------------- /recstudio/model/kg/config/kgat.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | batch_size: 2048 3 | negative_count: 1 4 | weight_decay: 1e-6 5 | 6 | 7 | model: 8 | embed_dim: 64 9 | alg_type: bi 10 | pro_embed_dim: 64 11 | layer_size: [64, 16] 12 | mess_dropout: [0.1, 0.1] 13 | n_fold: 100 14 | -------------------------------------------------------------------------------- /recstudio/model/kg/config/kgcn.yaml: -------------------------------------------------------------------------------- 1 | 2 | data: 3 | fmeval: True 4 | low_rating_thres: 0.0 5 | binarized_rating_thres: 3.0 6 | 7 | eval: 8 | val_metrics: [auc, logloss] 9 | test_metrics: [auc, logloss] 10 | 11 | train: 12 | embed_dim: 64 13 | weight_decay: 1e-7 14 | 15 | model: 16 | neighbor_sample_size: 4 17 | n_iter: 2 18 | aggregator_type: sum -------------------------------------------------------------------------------- /recstudio/model/kg/config/kgin.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 3 | learning_rate: 0.0001 4 | 5 | model: 6 | embed_dim: 64 7 | num_factors: 4 8 | l2_reg: 1e-5 9 | sim_regularity: 1e-4 10 | intents_indep: distance 11 | num_layers: 2 12 | 13 | # dropout 14 | node_dropout: 0.2 15 | mess_dropout: 0.1 -------------------------------------------------------------------------------- 
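Note: the per-model YAML files in this directory only list overrides; shared defaults such as `kg_network_index` in `kg/config/all.yaml` still apply. A minimal sketch of that overlay with PyYAML, assuming a simple recursive dict merge (`deep_update` is a hypothetical helper; RecStudio's actual config loader may differ):

```python
# Illustrative only: overlay a model-specific config onto shared defaults.
# `deep_update` is a hypothetical helper, not part of RecStudio's API.
import yaml

def deep_update(base: dict, override: dict) -> dict:
    """Recursively overlay `override` onto `base`, mutating and returning `base`."""
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

with open("recstudio/model/kg/config/all.yaml") as f:
    config = yaml.safe_load(f) or {}
with open("recstudio/model/kg/config/kgin.yaml") as f:
    config = deep_update(config, yaml.safe_load(f) or {})

print(config["data"]["kg_network_index"])  # shared default: 1
print(config["model"]["num_factors"])      # KGIN override: 4
```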
/recstudio/model/kg/config/kgnnls.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | fmeval: True 3 | kg_network_index: 1 4 | low_rating_thres: 0.0 5 | binarized_rating_thres: 3.0 6 | 7 | 8 | eval: 9 | val_metrics: [auc, logloss] 10 | test_metrics: [auc, logloss] 11 | 12 | model: 13 | embed_dim: 64 14 | weight_decay: 1e-7 15 | 16 | n_iter: 2 17 | neighbor_sample_size: 4 18 | aggregator_type: sum 19 | ls_weight: 1.0 -------------------------------------------------------------------------------- /recstudio/model/kg/config/ktup.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 3 | weight_decay: 1e-6 4 | 5 | model: 6 | embed_dim: 100 7 | train_rec_step: 5 8 | train_kg_step: 5 9 | use_st_gumbel: True 10 | L1_flag: False 11 | margin: 1.0 12 | kg_weight: 0.5 -------------------------------------------------------------------------------- /recstudio/model/kg/config/mkr.yaml: -------------------------------------------------------------------------------- 1 | 2 | train: 3 | negative_count: 1 4 | weight_decay: 1e-6 5 | 6 | model: 7 | # learning_rate_kg: 0.001 8 | embed_dim: 64 9 | kge_interval: 3 10 | use_inner_product: True 11 | dropout: 0.0 12 | L: 1 13 | H: 1 14 | -------------------------------------------------------------------------------- /recstudio/model/kg/config/ripplenet.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | kg_network_index: 1 3 | fmeval: True 4 | low_rating_thres: 0.0 5 | binarized_rating_thres: 4.0 6 | 7 | eval: 8 | val_metrics: [auc, logloss] 9 | test_metrics: [auc, logloss] 10 | 11 | train: 12 | weight_decay: 1e-6 13 | batch_size: 2048 14 | negative_count: 1 15 | 16 | model: 17 | embed_dim: 64 18 | using_all_hops: True 19 | item_update_mode: plus_transform # how to update item at the end of each hop 20 | kge_weight: 0.01 21 | n_memory: 16 # size of ripple set for each hop 22 | n_hop: 2 # maximum hops 23 | -------------------------------------------------------------------------------- /recstudio/model/mf/__init__.py: -------------------------------------------------------------------------------- 1 | from .bpr import BPR 2 | from .cml import CML 3 | from .ease import EASE 4 | from .irgan import IRGAN 5 | from .itemknn import ItemKNN 6 | from .logisticmf import LogisticMF 7 | from .ncf import NCF 8 | from .slim import SLIM 9 | from .wrmf import WRMF 10 | from .dssm import DSSM -------------------------------------------------------------------------------- /recstudio/model/mf/bpr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | 6 | 7 | class BPR(basemodel.BaseRetriever): 8 | 9 | def _get_dataset_class(): 10 | return dataset.TripletDataset 11 | 12 | def _get_item_encoder(self, train_data): 13 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 14 | 15 | def _get_query_encoder(self, train_data): 16 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 17 | 18 | def _get_score_func(self): 19 | return scorer.InnerProductScorer() 20 | 21 | def _get_loss_func(self): 22 | return loss_func.BPRLoss() 23 | 24 | def _get_sampler(self, train_data): 25 | return sampler.UniformSampler(train_data.num_items) 26 | 
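The pieces above reduce BPR to maximizing log sigmoid(s(u, i+) - s(u, i-)) with inner-product scores and uniformly sampled negatives. A self-contained sketch of that objective in plain PyTorch, with illustrative toy sizes (not RecStudio's internal training loop):

```python
# Minimal BPR objective: inner-product scores, uniform negatives, pairwise loss.
import torch
import torch.nn.functional as F

num_users, num_items, dim = 100, 500, 64
user_emb = torch.nn.Embedding(num_users, dim)
item_emb = torch.nn.Embedding(num_items, dim)

u = torch.randint(0, num_users, (32,))    # batch of users
pos = torch.randint(0, num_items, (32,))  # observed (positive) items
neg = torch.randint(0, num_items, (32,))  # uniformly sampled negatives

pos_score = (user_emb(u) * item_emb(pos)).sum(-1)   # cf. InnerProductScorer
neg_score = (user_emb(u) * item_emb(neg)).sum(-1)
loss = -F.logsigmoid(pos_score - neg_score).mean()  # cf. BPRLoss
loss.backward()
```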
-------------------------------------------------------------------------------- /recstudio/model/mf/cml.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import advance_dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | 6 | 7 | class CML(basemodel.BaseRetriever): 8 | 9 | def _get_dataset_class(): 10 | return advance_dataset.ALSDataset 11 | 12 | def _get_item_encoder(self, train_data): 13 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 14 | 15 | def _get_query_encoder(self, train_data): 16 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 17 | 18 | def _get_score_func(self): 19 | return scorer.EuclideanScorer() 20 | 21 | def _get_loss_func(self, train_data): 22 | class CMLoss(loss_func.PairwiseLoss): 23 | def __init__(self, margin=2, use_rank_weight=False, n_items: int=None): 24 | super().__init__() 25 | self.margin = margin 26 | self.use_rank_weight = use_rank_weight 27 | self.n_items = n_items - 1 # remove padding 28 | 29 | def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): 30 | pos_score[pos_score == -float("inf")] = float("inf") # padded positives (-inf) are flipped to +inf so they incur zero hinge loss 31 | loss = torch.max(torch.max(neg_score, dim=-1).values.unsqueeze(-1) \ 32 | - pos_score + self.margin, pos_score.new_zeros(pos_score.size(1))) 33 | if self.use_rank_weight: 34 | impostors = neg_score.unsqueeze(1) - pos_score.unsqueeze(-1) + self.margin > 0 35 | rank = torch.mean(impostors.to(torch.float32), -1) * self.n_items 36 | return torch.mean(loss * torch.log(rank + 1)) 37 | else: 38 | return torch.mean(loss) 39 | return CMLoss(self.config['model']['margin'], self.config['model']['use_rank_weight'], train_data.num_items) 40 | 41 | 42 | def _get_sampler(self, train_data): 43 | return sampler.UniformSampler(train_data.num_items) 44 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_mode: user_entry # user # entry 3 | fmeval: False 4 | binarized_rating_thres: 0.0 5 | 6 | eval: 7 | batch_size: 20 8 | 9 | model: 10 | embed_dim: 64 11 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/bpr.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 3 | excluding_hist: False 4 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/cml.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | learning_rate: 0.01 3 | negative_count: 5 4 | excluding_hist: False 5 | 6 | model: 7 | margin: 1 8 | use_rank_weight: ~ 9 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/dssm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 64 3 | mlp_layer: [128, 128, 128] 4 | activation: tanh 5 | dropout: 0.3 6 | batch_norm: False 7 | 8 | train: 9 | negative_count: 1 10 | 11 | eval: 12 | cutoff: [10, 20, 50] -------------------------------------------------------------------------------- /recstudio/model/mf/config/ease.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | epochs: 1 3 | gpu: ~ 4 | lambda:
250 -------------------------------------------------------------------------------- /recstudio/model/mf/config/irgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 64 3 | sample_lambda: 0.2 4 | T_dis: 0.2 5 | T_gen: 1 6 | 7 | train: 8 | batch_size: 16 9 | early_stop_patience: 100 10 | epochs: 1000 11 | every_n_epoch_gen: 2 12 | every_n_epoch_dis: 5 13 | learning_rate_dis: 0.001 14 | learning_rate_gen: 0.001 15 | negative_count: 1 16 | weight_decay_dis: 0.0001 17 | weight_decay_gen: 0.0001 18 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/itemknn.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | epochs: 1 3 | gpu: ~ 4 | knn: 100 5 | similarity: cosine #| jaccard 6 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/logisticmf.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | alpha: 0.5 3 | excluding_hist: False 4 | lambda: 0.01 5 | learner: adagrad 6 | learning_rate: 0.01 7 | negative_count: 10 -------------------------------------------------------------------------------- /recstudio/model/mf/config/ncf.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | activation: relu 3 | dropout: 0.1 4 | mlp_hidden_size: [128, 64] 5 | score_mode: fusion 6 | 7 | train: 8 | excluding_hist: False 9 | negative_count: 1 -------------------------------------------------------------------------------- /recstudio/model/mf/config/pmf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/model/mf/config/pmf.yaml -------------------------------------------------------------------------------- /recstudio/model/mf/config/slim.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | alpha: 1 3 | epochs: 1 4 | gpu: ~ 5 | knn: 100 6 | l1_ratio: 0.1 7 | positive_only: True -------------------------------------------------------------------------------- /recstudio/model/mf/config/wrmf.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | alpha: 1 3 | batch_size: 100 4 | lambda: 0.5 -------------------------------------------------------------------------------- /recstudio/model/mf/dssm.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from recstudio.data import TripletDataset 5 | 6 | from .. 
import basemodel, loss_func, scorer 7 | from ..module import LambdaLayer, MLPModule, ctr 8 | 9 | 10 | class DSSM(basemodel.BaseRetriever): 11 | 12 | def _set_data_field(self, data): 13 | data.use_field = data.field 14 | 15 | def _get_dataset_class(): 16 | return TripletDataset 17 | 18 | def _get_query_encoder(self, train_data): 19 | if len(self.query_fields) == 1 and list(self.query_fields)[0] == self.fuid: 20 | embedding = torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 21 | mlp_input_dim = self.embed_dim 22 | else: 23 | embedding = ctr.Embeddings( 24 | fields=self.query_fields, 25 | data=train_data, 26 | embed_dim=self.embed_dim) 27 | mlp_input_dim = embedding.num_features * self.embed_dim 28 | model_config = self.config['model'] 29 | mlp = MLPModule( 30 | [mlp_input_dim] + model_config['mlp_layer'], 31 | dropout=model_config['dropout'], activation_func=model_config['activation'], 32 | batch_norm=model_config['batch_norm']) 33 | return torch.nn.Sequential( 34 | OrderedDict( 35 | {'embedding': embedding, 36 | 'flatten': LambdaLayer(lambda x: x.view(x.size(0), -1)), 37 | 'MLP': mlp})) 38 | 39 | def _get_item_encoder(self, train_data): 40 | if len(self.item_fields) == 1 and list(self.item_fields)[0] == self.fiid: 41 | embedding = torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 42 | mlp_input_dim = self.embed_dim 43 | flatten_layer = LambdaLayer(lambda x: x) 44 | else: 45 | embedding = ctr.Embeddings( 46 | fields=self.item_fields, 47 | data=train_data, 48 | embed_dim=self.embed_dim, 49 | ) 50 | mlp_input_dim = embedding.num_features * self.embed_dim 51 | flatten_layer = LambdaLayer(lambda x: x.view(*x.shape[: -2], -1)) 52 | 53 | model_config = self.config['model'] 54 | mlp = MLPModule( 55 | [mlp_input_dim] + model_config['mlp_layer'], 56 | activation_func = model_config['activation'], 57 | dropout = model_config['dropout'], 58 | batch_norm = model_config['batch_norm']) 59 | return torch.nn.Sequential( 60 | OrderedDict( 61 | {'embedding': embedding, 62 | 'flatten': flatten_layer, 63 | 'MLP': mlp})) 64 | 65 | def _get_score_func(self): 66 | return scorer.InnerProductScorer() 67 | 68 | def _get_loss_func(self): 69 | return loss_func.BinaryCrossEntropyLoss() 70 | -------------------------------------------------------------------------------- /recstudio/model/mf/ease.py: -------------------------------------------------------------------------------- 1 | from typing import OrderedDict 2 | 3 | import numpy as np 4 | import torch 5 | from recstudio.data.dataset import TripletDataset 6 | from recstudio.model import basemodel 7 | 8 | 9 | class QueryEncoder(object): 10 | def __init__(self, user) -> None: 11 | self.user = user 12 | 13 | def __call__(self, batch): 14 | return self.user[batch, :] 15 | 16 | 17 | class EASE(basemodel.BaseRetriever): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _get_train_loaders(self, train_data): 23 | return {'user_item_matrix': train_data.get_graph(0, 'csr')[0]} 24 | 25 | def training_epoch(self, nepoch): 26 | if self.config['train']['gpu'] is not None: 27 | self.logger.warning("EASE is expected to run on CPU but a GPU is configured; automatically setting gpu to None.") 28 | self.config['train']['gpu'] = None 29 | data, iscombine = self.current_epoch_trainloaders(nepoch) 30 | R = data['user_item_matrix'] 31 | G = R.T @ R # item-item Gram matrix 32 | diagIndices = np.diag_indices_from(G) 33 | G[diagIndices] += self.config['train']['lambda'] # L2 regularization on the diagonal 34 | P = np.linalg.inv(G.todense()) 35 | B = P / (-np.diag(P)) # closed-form item weights: B = P / -diag(P) 36 | B[diagIndices] = 0 # zero the diagonal to forbid self-similarity
37 | self.item_vector = B[:, 1:] 38 | self.query_encoder.user = R 39 | return torch.tensor(np.linalg.norm(R-R*B, 'fro')) 40 | 41 | def _get_query_encoder(self, train_data): 42 | return QueryEncoder(None) 43 | 44 | def _get_score_func(self): 45 | def scorer(query, items): 46 | return torch.from_numpy((query @ items).A) 47 | return scorer 48 | 49 | def _get_loss_func(self): 50 | return None 51 | 52 | def _get_item_encoder(self, train_data): 53 | return None 54 | 55 | def _get_sampler(self, train_data): 56 | return None 57 | 58 | def _get_optimizers(self): 59 | return None 60 | 61 | def _get_item_vector(self): 62 | return self.item_vector 63 | 64 | def state_dict(self): 65 | return OrderedDict({ 66 | 'item_vector': getattr(self, 'item_vector', None), 67 | 'query_encoder': getattr(self, 'query_encoder', None) 68 | }) 69 | 70 | def load_state_dict(self, state_dict: OrderedDict): 71 | for k, v in state_dict.items(): 72 | setattr(self, k, v) 73 | -------------------------------------------------------------------------------- /recstudio/model/mf/itemknn.py: -------------------------------------------------------------------------------- 1 | from recstudio.model import basemodel 2 | from recstudio.model.mf.ease import EASE 3 | import scipy.sparse as sp 4 | import numpy as np 5 | import torch 6 | class ItemKNN(EASE): 7 | 8 | def training_epoch(self, nepoch): 9 | config = self.config['train'] 10 | data, iscombine = self.current_epoch_trainloaders(nepoch) 11 | R = data['user_item_matrix'] 12 | item_norm = np.sqrt(R.multiply(R).sum(0).A.ravel()) 13 | item_nz = (R > 0).sum(0).A.ravel() 14 | G = R.T @ R 15 | diagIndices = np.diag_indices_from(G) 16 | G[diagIndices] = 0 17 | G.eliminate_zeros() 18 | all_col = [] 19 | all_row = [] 20 | all_val = [] 21 | for col in range(G.shape[0]): 22 | if G.indptr[col] < G.indptr[col+1]: 23 | score = G.data[G.indptr[col]:G.indptr[col+1]] 24 | rows = G.indices[G.indptr[col]:G.indptr[col+1]] 25 | if config['similarity'] == 'cosine': 26 | score = score / (item_norm[rows] * item_norm[col] + 1e-6) 27 | elif config['similarity'] == 'jaccard': 28 | score = score / (item_nz[rows] + item_nz[col] - score + 1e-6) 29 | else: 30 | raise ValueError('unsupported similarity metric') 31 | topk = config['knn'] 32 | if G.indptr[col] < G.indptr[col+1] - topk: 33 | idx = np.argpartition(score, -topk)[-topk:] 34 | rows_ = rows[idx] 35 | scores_ = score[idx] 36 | else: 37 | rows_ = rows 38 | scores_ = score 39 | all_col.extend([col] * len(scores_)) 40 | all_row.extend(rows_) 41 | all_val.extend(scores_) 42 | 43 | B = sp.csc_matrix((all_val, (all_row, all_col)), G.shape) 44 | self.item_vector = B[:, 1:] 45 | self.query_encoder.user = R 46 | return torch.tensor(0.) 
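For intuition, the EASE closed form computed in `training_epoch` above can be reproduced on a dense toy matrix in a few lines (illustrative sketch with random data; the real code keeps `R` sparse and reads lambda from `ease.yaml`):

```python
# EASE closed form: P = (R^T R + lambda * I)^-1, B = P / -diag(P), diag(B) = 0.
import numpy as np

rng = np.random.default_rng(0)
R = (rng.random((50, 20)) > 0.8).astype(np.float64)  # toy user-item matrix
lam = 250.0                                          # matches ease.yaml

G = R.T @ R
G[np.diag_indices_from(G)] += lam
P = np.linalg.inv(G)
B = P / (-np.diag(P))
B[np.diag_indices_from(B)] = 0.0                     # no self-similarity

scores = R @ B  # row u: predicted affinities of user u over all items
```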
-------------------------------------------------------------------------------- /recstudio/model/mf/logisticmf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, scorer, loss_func 5 | 6 | class LogisticMF(basemodel.BaseRetriever): 7 | 8 | def _get_dataset_class(): 9 | return dataset.TripletDataset 10 | 11 | def _get_item_encoder(self, train_data): 12 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 13 | 14 | def _get_query_encoder(self, train_data): 15 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 16 | 17 | def _get_score_func(self): 18 | return scorer.InnerProductScorer() 19 | 20 | def _get_loss_func(self): 21 | class LogitLoss(loss_func.PairwiseLoss): 22 | def __init__(self, alpha) -> None: 23 | super().__init__() 24 | self.alpha = alpha 25 | 26 | def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): 27 | l1 = self.alpha * pos_score - (1+self.alpha) * torch.nn.functional.softplus(pos_score) 28 | l2 = torch.nn.functional.softplus(neg_score).mean(dim=-1) 29 | loss = (l1 - l2).mean() 30 | return -loss 31 | 32 | return LogitLoss(self.config['train']['alpha']) 33 | 34 | 35 | def _get_sampler(self, train_data): 36 | return sampler.UniformSampler(train_data.num_items) 37 | -------------------------------------------------------------------------------- /recstudio/model/mf/ncf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer, module 5 | 6 | class NCF(basemodel.BaseRetriever): 7 | 8 | def _get_dataset_class(): 9 | return dataset.TripletDataset 10 | 11 | def _get_item_encoder(self, train_data): 12 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 13 | 14 | def _get_query_encoder(self, train_data): 15 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 16 | 17 | def _get_score_func(self): 18 | model_config = self.config['model'] 19 | score_mode = model_config['score_mode'] 20 | assert score_mode in set(['mlp', 'mf', 'fusion']), \ 21 | "Only 3 score modes are supported for NCF: ['mlp', 'mf', 'fusion']" 22 | if score_mode == 'mlp': 23 | return scorer.MLPScorer(module.MLPModule( 24 | mlp_layers = [self.embed_dim*2]+model_config['mlp_hidden_size']+[1], 25 | activation_func = model_config['activation'], 26 | dropout = model_config['dropout'])) 27 | elif score_mode == 'mf': 28 | return scorer.GMFScorer(self.embed_dim, activation=model_config['activation']) 29 | else: 30 | mlp = module.MLPModule( 31 | mlp_layers = [self.embed_dim*2]+model_config['mlp_hidden_size'], 32 | activation_func = model_config['activation'], 33 | dropout = model_config['dropout']) 34 | return scorer.FusionMFMLPScorer( 35 | emb_dim = self.embed_dim, 36 | hidden_size = model_config['mlp_hidden_size'][-1], 37 | mlp = mlp, 38 | activation = model_config['activation']) 39 | 40 | 41 | def _get_loss_func(self): 42 | return loss_func.BinaryCrossEntropyLoss() 43 | 44 | def _get_sampler(self, train_data): 45 | return sampler.UniformSampler(train_data.num_items) 46 | -------------------------------------------------------------------------------- /recstudio/model/mf/pmf.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data import dataset 3 | 4 | from .. import basemodel, scorer, loss_func 5 | 6 | class PMF(basemodel.BaseRetriever): 7 | 8 | def _get_dataset_class(): 9 | return dataset.TripletDataset 10 | 11 | def _get_item_encoder(self, train_data): 12 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 13 | 14 | def _get_query_encoder(self, train_data): 15 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 16 | 17 | def _get_score_func(self): 18 | return scorer.InnerProductScorer() 19 | 20 | def _get_loss_func(self): 21 | return loss_func.SquareLoss() 22 | 23 | def _get_sampler(self, train_data): 24 | return None 25 | -------------------------------------------------------------------------------- /recstudio/model/mf/slim.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.mf.ease import EASE 2 | from recstudio.model.basemodel import Recommender 3 | from sklearn.linear_model import ElasticNet 4 | from sklearn.exceptions import ConvergenceWarning 5 | import scipy.sparse as sp 6 | import torch 7 | import warnings 8 | 9 | 10 | class SLIM(EASE): 11 | 12 | def add_model_specific_args(parent_parser): 13 | parent_parser = Recommender.add_model_specific_args(parent_parser) 14 | parent_parser.add_argument_group('SLIM') 15 | parent_parser.add_argument("--knn", type=int, default=100, help='k for K-nearest neighbor') 16 | parent_parser.add_argument("--alpha", type=float, default=1.0, help='alpha coef') 17 | parent_parser.add_argument("--l1_ratio", type=float, default=0.1, help='coef for L1 regularization') 18 | parent_parser.add_argument("--positive_only", action='store_true', default=True, help='positive only flag') 19 | return parent_parser 20 | 21 | def training_epoch(self, nepoch): 22 | train_config = self.config['train'] 23 | data, iscombine = self.current_epoch_trainloaders(nepoch) 24 | X = data['user_item_matrix'].tolil() 25 | model = ElasticNet( 26 | alpha=train_config.get('alpha', 1), 27 | l1_ratio=train_config.get('l1_ratio', 0.1), 28 | positive=train_config.get('positive_only', True), 29 | fit_intercept=False, 30 | copy_X=False, 31 | precompute=True, 32 | selection='random', 33 | max_iter=100, 34 | tol=1e-4 35 | ) 36 | item_coeffs = [] 37 | with warnings.catch_warnings(): 38 | warnings.simplefilter("ignore", category=ConvergenceWarning) 39 | for j in range(X.shape[1]): 40 | r = X[:, j] 41 | X[:, j] = 0 42 | model.fit(X, r.A) 43 | item_coeffs.append(model.sparse_coef_) 44 | X[:, j] = r 45 | B = sp.vstack(item_coeffs).T 46 | self.item_vector = B[:, 1:] 47 | self.query_encoder.user = X 48 | return torch.tensor(0.) 
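The loop above hides column `j` while regressing on it, so item `j` can never be used to predict itself. A standalone illustration of a single column fit on hypothetical toy data:

```python
# One SLIM column fit: regress item j's interactions on all other items.
import scipy.sparse as sp
from sklearn.linear_model import ElasticNet

X = sp.random(100, 30, density=0.1, format="lil", random_state=0)
model = ElasticNet(alpha=1.0, l1_ratio=0.1, positive=True,
                   fit_intercept=False, max_iter=100)

j = 5
r = X[:, j]
X[:, j] = 0                          # hide the target column from itself
model.fit(X, r.toarray().ravel())
w_j = model.sparse_coef_             # sparse similarity weights for item j
X[:, j] = r                          # restore the column for the next fit
```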
49 | -------------------------------------------------------------------------------- /recstudio/model/module/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.module.layers import * 2 | from recstudio.model.module.graphmodule import * 3 | -------------------------------------------------------------------------------- /recstudio/model/module/functional.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def seq_pooling_function(batch_seq_embeddings: torch.Tensor, seq_len: torch.Tensor, weight=None, mask_token=None, pooling_type='mean', keepdim=False): 5 | # batch_seq_embeddings: [B, L, D] or [B, Neg, L, D] 6 | # seq_len: [B] or [B,Neg], weight: [B,L] or [B,Neg,L] 7 | B = batch_seq_embeddings.size(0) 8 | _need_reshape = False 9 | if batch_seq_embeddings.dim() == 4: 10 | _need_reshape = True 11 | batch_seq_embeddings = batch_seq_embeddings.view( 12 | -1, *batch_seq_embeddings.shape[2:]) 13 | seq_len = seq_len.view(-1) 14 | if weight is not None: 15 | weight = weight.view(-1, weight.size(-1)) 16 | 17 | N, L, D = batch_seq_embeddings.shape 18 | 19 | if weight is not None: 20 | batch_seq_embeddings = weight.unsqueeze(-1) * batch_seq_embeddings 21 | 22 | if pooling_type == 'mask': 23 | # Data type of mask_token should be bool and 24 | # the shape of mask_token should be [B, L] 25 | assert mask_token is not None, "mask_token cannot be None when pooling_type is 'mask'." 26 | result = batch_seq_embeddings[mask_token] 27 | 28 | elif pooling_type in ['origin', 'concat', 'mean', 'sum', 'max']: 29 | mask = torch.arange(L).unsqueeze(0).unsqueeze(2).to(batch_seq_embeddings.device) 30 | mask = mask.expand(N, -1, D) 31 | seq_len = seq_len.unsqueeze(1).unsqueeze(2) 32 | seq_len_ = seq_len.expand(-1, mask.size(1), -1) 33 | mask = mask >= seq_len_ 34 | 35 | batch_seq_embeddings = batch_seq_embeddings.masked_fill(mask, 0.0) 36 | 37 | if pooling_type == 'origin': 38 | return batch_seq_embeddings 39 | elif pooling_type in ['concat', 'max']: 40 | if not keepdim: 41 | if pooling_type == 'concat': 42 | result = batch_seq_embeddings.reshape(N, -1) 43 | else: 44 | result = batch_seq_embeddings.max(dim=1).values 45 | else: 46 | if pooling_type == 'concat': 47 | result = batch_seq_embeddings.reshape(N, -1).unsqueeze(1) 48 | else: 49 | result = batch_seq_embeddings.max(dim=1).values.unsqueeze(1) 50 | elif pooling_type in ['mean', 'sum']: 51 | batch_seq_embeddings_sum = batch_seq_embeddings.sum(dim=1, keepdim=keepdim) 52 | if pooling_type == 'sum': 53 | result = batch_seq_embeddings_sum 54 | else: 55 | result = batch_seq_embeddings_sum / ((seq_len if keepdim else seq_len.squeeze(2)) + torch.finfo(torch.float32).eps) 56 | 57 | elif pooling_type == 'last': 58 | gather_index = (seq_len-1).view(-1, 1, 1).expand(-1, -1, D) # B x 1 x D 59 | output = batch_seq_embeddings.gather( 60 | dim=1, index=gather_index).squeeze(1) # B x D 61 | result = output if not keepdim else output.unsqueeze(1) 62 | 63 | if _need_reshape: 64 | return result.reshape(B, N//B, *result.shape[1:]) 65 | else: 66 | return result -------------------------------------------------------------------------------- /recstudio/model/multitask/__init__.py: -------------------------------------------------------------------------------- 1 | from .hardshare import HardShare 2 | from .mmoe import MMoE 3 | from .ple import PLE 4 | from .aitm import AITM --------------------------------------------------------------------------------
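As a quick sanity check of the masking logic in `seq_pooling_function` above: the `'mean'` branch is equivalent to the following plain-torch computation (illustrative):

```python
# Masked mean pooling: positions at or beyond seq_len contribute nothing.
import torch

B, L, D = 2, 5, 4
emb = torch.randn(B, L, D)
seq_len = torch.tensor([3, 5])

mask = torch.arange(L).unsqueeze(0) >= seq_len.unsqueeze(1)  # True = padding
emb = emb.masked_fill(mask.unsqueeze(-1), 0.0)
mean_pooled = emb.sum(dim=1) / seq_len.unsqueeze(-1)         # shape [B, D]
```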
/recstudio/model/multitask/config/aitm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | tower_mlp_layer: [128, 64] 3 | tower_activation: relu 4 | tower_dropout: 0.5 5 | tower_batch_norm: False 6 | 7 | alpha: 0.6 8 | -------------------------------------------------------------------------------- /recstudio/model/multitask/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | fmeval: True 3 | low_rating_thres: ~ 4 | binarized_rating_thres: ~ 5 | 6 | eval: 7 | val_metrics: [auc, logloss] 8 | test_metrics: [auc, logloss] 9 | 10 | train: 11 | weights: ~ -------------------------------------------------------------------------------- /recstudio/model/multitask/config/hardshare.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | top_mlp_layer: [128, 128] 3 | top_activation: relu 4 | top_dropout: 0.5 5 | top_batch_norm: False 6 | bottom_mlp_layer: [128, 128] 7 | bottom_activation: relu 8 | bottom_dropout: 0.5 9 | bottom_batch_norm: False 10 | -------------------------------------------------------------------------------- /recstudio/model/multitask/config/mmoe.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | num_experts: 2 3 | expert_mlp_layer: [128, 128] 4 | expert_activation: relu 5 | expert_dropout: 0.5 6 | expert_batch_norm: False 7 | 8 | gate_mlp_layer: [128, ] 9 | gate_activation: relu 10 | gate_dropout: 0.5 11 | gate_batch_norm: False 12 | 13 | tower_mlp_layer: [128, ] 14 | tower_activation: relu 15 | tower_dropout: 0.5 16 | tower_batch_norm: False 17 | -------------------------------------------------------------------------------- /recstudio/model/multitask/config/ple.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | num_levels: 1 3 | specific_experts_per_task: 2 4 | num_shared_experts: 2 5 | expert_mlp_layer: [128, 128] 6 | expert_activation: relu 7 | expert_dropout: 0.5 8 | 9 | gate_mlp_layer: [128, ] 10 | gate_activation: relu 11 | gate_dropout: 0.5 12 | 13 | tower_mlp_layer: [128, ] 14 | tower_activation: relu 15 | tower_dropout: 0.5 16 | tower_batch_norm: False 17 | -------------------------------------------------------------------------------- /recstudio/model/multitask/hardshare.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import defaultdict 4 | from recstudio.data.dataset import TripletDataset 5 | from ..basemodel import BaseRanker 6 | from ..loss_func import BCEWithLogitLoss 7 | from ..module import ctr, MLPModule 8 | 9 | r""" 10 | HardShare 11 | ###################### 12 | 13 | Paper Reference: 14 | An overview of multi-task learning in deep neural networks ('17) 15 | https://arxiv.org/abs/1706.05098 16 | """ 17 | 18 | class HardShare(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | model_config = self.config['model'] 26 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 27 | self.bottom_mlp = MLPModule( 28 | [self.embedding.num_features * self.embed_dim] + model_config['bottom_mlp_layer'], 29 | model_config['bottom_activation'], 30 | model_config['bottom_dropout'], 31 | batch_norm=model_config['bottom_batch_norm']) 32 | assert 
isinstance(self.frating, list), f'Expect rating_field to be a list, but got {self.frating}.' 33 | self.top_mlp = nn.ModuleDict({ 34 | r: MLPModule( 35 | [model_config['bottom_mlp_layer'][-1]] + model_config['top_mlp_layer'] + [1], 36 | model_config['top_activation'], 37 | model_config['top_dropout'], 38 | last_activation=False, 39 | batch_norm=model_config['top_batch_norm']) 40 | for r in self.frating 41 | }) 42 | 43 | def score(self, batch): 44 | emb = self.embedding(batch) 45 | shared_emb = self.bottom_mlp(emb.flatten(1)) 46 | score = defaultdict(dict) 47 | for r, top_mlp in self.top_mlp.items(): 48 | score[r]['score'] = top_mlp(shared_emb).squeeze(-1) 49 | return score 50 | 51 | def _get_loss_func(self): 52 | return BCEWithLogitLoss() 53 | 54 | def training_step(self, batch): 55 | y_h, _ = self.forward(batch) 56 | loss = {} 57 | for r in self.frating: 58 | loss[r] = self.loss_fn(**y_h[r]) 59 | 60 | weights = self.config['train'].get('weights', [1.0]*len(self.frating)) 61 | if weights is None: 62 | weights = [1.0]*len(self.frating) 63 | assert len(weights) == len(self.frating), \ 64 | f'Expect {len(self.frating)} float(s) for weights, but got {self.config["train"]["weights"]}.' 65 | weights = torch.tensor(weights, device=self.device).softmax(0) 66 | 67 | loss['loss'] = sum(w*v for w, (_, v) in zip(weights, loss.items())) 68 | return loss 69 | -------------------------------------------------------------------------------- /recstudio/model/multitask/mmoe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import defaultdict 4 | from recstudio.model.multitask.hardshare import HardShare 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | MMoE 9 | ###################### 10 | 11 | Paper Reference: 12 | Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts (KDD'18) 13 | https://dl.acm.org/doi/10.1145/3219819.3220007 14 | """ 15 | 16 | class MMoE(HardShare): 17 | 18 | def _init_model(self, train_data, drop_unused_field=True): 19 | super()._init_model(train_data, drop_unused_field) 20 | model_config = self.config['model'] 21 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 22 | assert isinstance(self.frating, list), f'Expect rating_field to be a list, but got {self.frating}.' 
23 | self.experts = nn.ModuleList([ 24 | MLPModule( 25 | [self.embedding.num_features * self.embed_dim] + model_config['expert_mlp_layer'], 26 | model_config['expert_activation'], 27 | model_config['expert_dropout'], 28 | batch_norm=model_config['expert_batch_norm']) 29 | for _ in range(model_config['num_experts']) 30 | ]) 31 | self.gates = nn.ModuleDict({ 32 | r: MLPModule( 33 | [self.embedding.num_features * self.embed_dim] + model_config['gate_mlp_layer'] + [model_config['num_experts']], 34 | model_config['gate_activation'], 35 | model_config['gate_dropout'], 36 | batch_norm=model_config['gate_batch_norm']) 37 | for r in self.frating 38 | }) 39 | for _, g in self.gates.items(): 40 | g.add_modules(nn.Softmax(-1)) 41 | self.towers = nn.ModuleDict({ 42 | r: MLPModule( 43 | [model_config['expert_mlp_layer'][-1]] + model_config['tower_mlp_layer'] + [1], 44 | model_config['tower_activation'], 45 | model_config['tower_dropout'], 46 | batch_norm=model_config['tower_batch_norm'], 47 | last_activation=False, 48 | last_bn=False) 49 | for r in self.frating 50 | }) 51 | 52 | def score(self, batch): 53 | emb = self.embedding(batch).flatten(1) 54 | experts_out = torch.stack([e(emb) for e in self.experts], dim=1) # B x E x De 55 | score = defaultdict(dict) 56 | for r, gate in self.gates.items(): 57 | gate_out = gate(emb) # B x E 58 | mmoe_out = (gate_out.unsqueeze(-1) * experts_out).sum(1) # B x De 59 | score[r]['score'] = self.towers[r](mmoe_out).squeeze(-1) 60 | return score -------------------------------------------------------------------------------- /recstudio/model/multitask/ple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import defaultdict 4 | from recstudio.model.multitask.hardshare import HardShare 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | PLE 9 | ###################### 10 | 11 | Paper Reference: 12 | Progressive Layered Extraction (PLE): A Novel Multi-Task Learning (MTL) Model for Personalized Recommendations (RecSys'20) 13 | https://dl.acm.org/doi/10.1145/3383313.3412236 14 | """ 15 | 16 | class PLE(HardShare): 17 | 18 | def _init_model(self, train_data, drop_unused_field=True): 19 | super()._init_model(train_data, drop_unused_field) 20 | model_config = self.config['model'] 21 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 22 | assert isinstance(self.frating, list), f'Expect rating_field to be a list, but got {self.frating}.' 
23 | self.extraction_layers = nn.Sequential(*[ 24 | ctr.ExtractionLayer( 25 | self.embedding.num_features * self.embed_dim if i == 0 else model_config['expert_mlp_layer'][-1], 26 | model_config['specific_experts_per_task'], 27 | len(self.frating), 28 | model_config['num_shared_experts'], 29 | True if i != model_config['num_levels'] - 1 else False, 30 | model_config['expert_mlp_layer'], 31 | model_config['expert_activation'], 32 | model_config['expert_dropout'], 33 | model_config['gate_mlp_layer'], 34 | model_config['gate_activation'], 35 | model_config['gate_dropout']) 36 | for i in range(model_config['num_levels']) 37 | ]) 38 | self.towers = nn.ModuleDict({ 39 | r: MLPModule( 40 | [model_config['expert_mlp_layer'][-1]] + model_config['tower_mlp_layer'] + [1], 41 | model_config['tower_activation'], 42 | model_config['tower_dropout'], 43 | batch_norm=model_config['tower_batch_norm'], 44 | last_activation=False, 45 | last_bn=False) 46 | for r in self.frating 47 | }) 48 | 49 | def score(self, batch): 50 | emb = self.embedding(batch).flatten(1) 51 | extraction_out = self.extraction_layers([emb] * (len(self.frating) + 1)) 52 | score = defaultdict(dict) 53 | for i, (r, tower) in enumerate(self.towers.items()): 54 | score[r]['score'] = tower(extraction_out[i]).squeeze(-1) 55 | return score -------------------------------------------------------------------------------- /recstudio/model/ranker.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.fm import * -------------------------------------------------------------------------------- /recstudio/model/retriever.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.mf import * 2 | from recstudio.model.seq import * 3 | from recstudio.model.ae import * 4 | -------------------------------------------------------------------------------- /recstudio/model/seq/__init__.py: -------------------------------------------------------------------------------- 1 | from .caser import Caser 2 | from .din import DIN 3 | from .fpmc import FPMC 4 | from .gru4rec import GRU4Rec 5 | from .hgn import HGN 6 | from .narm import NARM 7 | from .npe import NPE 8 | from .sasrec import SASRec 9 | from .stamp import STAMP 10 | from .transrec import TransRec 11 | -------------------------------------------------------------------------------- /recstudio/model/seq/bert4rec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | from recstudio.model.module import functional as recfn 6 | from .sasrec import SASRecQueryEncoder 7 | 8 | 9 | class BERT4Rec(basemodel.BaseRetriever): 10 | 11 | def _init_model(self, train_data): 12 | super()._init_model(train_data) 13 | self.mask_token = train_data.num_items 14 | self.query_fields = self.query_fields | set(["mask_token"]) 15 | 16 | def _get_dataset_class(): 17 | return dataset.SeqDataset 18 | 19 | def _get_query_encoder(self, train_data): 20 | model_config = self.config['model'] 21 | return SASRecQueryEncoder( 22 | fiid=self.fiid, embed_dim=self.embed_dim, 23 | max_seq_len=train_data.config['max_seq_len'], n_head=model_config['head_num'], 24 | hidden_size=model_config['hidden_size'], dropout=model_config['dropout'], 25 | activation=model_config['activation'], layer_norm_eps=model_config['layer_norm_eps'], 26 | 
n_layer=model_config['layer_num'], 27 | training_pooling_type='mask', 28 | item_encoder=self.item_encoder, 29 | bidirectional=True, 30 | ) 31 | 32 | def _get_item_encoder(self, train_data): 33 | # id num_items is reserved for the mask token 34 | return torch.nn.Embedding(train_data.num_items+1, self.embed_dim, padding_idx=0) 35 | 36 | def _get_score_func(self): 37 | return scorer.InnerProductScorer() 38 | 39 | def _get_loss_func(self): 40 | r"""SoftmaxLoss is used as the loss function.""" 41 | return loss_func.SoftmaxLoss() 42 | 43 | def _get_sampler(self, train_data): 44 | return None 45 | 46 | def _reconstruct_train_data(self, batch): 47 | item_seq = batch['in_'+self.fiid] 48 | 49 | padding_mask = item_seq == 0 50 | rand_prob = torch.rand_like(item_seq, dtype=torch.float) 51 | rand_prob.masked_fill_(padding_mask, 1.0) 52 | masked_mask = rand_prob < self.config['train']['mask_ratio'] 53 | masked_token = item_seq[masked_mask] 54 | 55 | item_seq[masked_mask] = self.mask_token 56 | batch['in_'+self.fiid] = item_seq 57 | 58 | batch[self.fiid] = masked_token # N 59 | batch['mask_token'] = masked_mask 60 | return batch 61 | 62 | def training_step(self, batch): 63 | batch = self._reconstruct_train_data(batch) 64 | return super().training_step(batch) 65 | -------------------------------------------------------------------------------- /recstudio/model/seq/cl4srec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data import dataset 3 | from recstudio.model.module import data_augmentation 4 | from .sasrec import SASRec, SASRecQueryEncoder 5 | 6 | r""" 7 | CL4SRec 8 | ############# 9 | Contrastive Learning for Sequential Recommendation (SIGIR'21) 10 | Reference: 11 | https://arxiv.org/abs/2010.14395 12 | """ 13 | class CL4SRec(SASRec): 14 | r""" 15 | Model hyperparameters: 16 | - ``embed_dim(int)``: The dimension of embedding layers. Default: ``64``. 17 | - ``hidden_size(int)``: The output size of Transformer layer. Default: ``128``. 18 | - ``layer_num(int)``: The number of layers for the Transformer. Default: ``2``. 19 | - ``dropout_rate(float)``: The dropout probability for dropout layers after item embedding 20 | | and in Transformer layer. Default: ``0.5``. 21 | - ``head_num(int)``: The number of heads for MultiHeadAttention in Transformer. Default: ``2``. 22 | - ``activation(str)``: The activation function in transformer. Default: ``"gelu"``. 23 | - ``layer_norm_eps``: The layer norm epsilon in transformer. Default: ``1e-12``.
24 | """ 25 | 26 | def _init_model(self, train_data): 27 | super()._init_model(train_data) 28 | self.augmentation_model = data_augmentation.CL4SRecAugmentation(self.config['model'], train_data) 29 | 30 | def _get_dataset_class(): 31 | return dataset.SeqToSeqDataset 32 | 33 | def _get_item_encoder(self, train_data): 34 | return torch.nn.Embedding(train_data.num_items + 1, self.embed_dim, padding_idx=0) # the last item is mask 35 | 36 | def _get_query_encoder(self, train_data): 37 | model_config = self.config['model'] 38 | return SASRecQueryEncoder( 39 | fiid=self.fiid, embed_dim=self.embed_dim, 40 | max_seq_len=train_data.config['max_seq_len'], n_head=model_config['head_num'], 41 | hidden_size=model_config['hidden_size'], dropout=model_config['dropout_rate'], 42 | activation=model_config['activation'], layer_norm_eps=model_config['layer_norm_eps'], 43 | n_layer=model_config['layer_num'], 44 | training_pooling_type='origin', 45 | item_encoder=self.item_encoder 46 | ) 47 | 48 | def training_step(self, batch): 49 | output = self.forward(batch, False) 50 | cl_output = self.augmentation_model(batch, self.query_encoder) 51 | loss_value = self.loss_fn(batch[self.frating], **output['score']) + \ 52 | self.config['model']['cl_weight'] * cl_output['cl_loss'] 53 | return loss_value 54 | 55 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | test_rep: True 3 | train_rep: True 4 | split_ratio: 2 5 | 6 | eval: 7 | batch_size: 128 8 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/bert4rec.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | activation: 'gelu' 3 | dropout: 0.2 4 | embed_dim: 64 5 | head_num: 2 6 | hidden_size: 128 7 | layer_num: 2 8 | layer_norm_eps: 1e-12 9 | 10 | train: 11 | mask_ratio: 0.2 12 | negative_count: 1 13 | weight_decay: 1e-5 -------------------------------------------------------------------------------- /recstudio/model/seq/config/caser.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 64 3 | n_v: 8 4 | n_h: 16 5 | dropout: 0.4 6 | 7 | train: 8 | negative_count: 1 9 | weight_decay: 1e-5 -------------------------------------------------------------------------------- /recstudio/model/seq/config/cl4srec.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 128 3 | cutoff: [20, 50, 10, 5] 4 | 5 | model: 6 | # transformer 7 | activation: 'gelu' 8 | dropout_rate: 0.5 9 | hidden_size: 64 10 | head_num: 2 11 | layer_norm_eps: 1e-12 12 | layer_num: 1 13 | # contrastive 14 | temperature: 1.0 15 | augment_type: item_crop # item_crop, item_mask, item_reorder 16 | tau: 0.2 17 | cl_weight: 0.1 18 | 19 | train: 20 | batch_size: 256 21 | early_stop_patience: 40 22 | epochs: 1000 23 | init_method: normal 24 | negative_count: 1 25 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/coserec.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 128 3 | cutoff: [20, 50, 10, 5] 4 | 5 | 6 | model: 7 | # transformer 8 | hidden_size: 64 9 | layer_num: 1 10 | head_num: 2 11 | dropout_rate: 0.5 12 | activation: 'gelu' 13 | layer_norm_eps: 1e-12 14 | 15 | # contrastive 16 | temperature: 1.0 17 | 
insert_rate: 0.5 18 | substitute_rate: 0.05 19 | cl_weight: 0.1 20 | augment_threshold: 12 21 | augment_type_for_short: 'SIMRC' 22 | augmentation_warm_up_epochs: 5 23 | 24 | 25 | train: 26 | batch_size: 256 27 | epochs: 1000 28 | early_stop_patience: 40 29 | init_method: normal 30 | negative_count: 1 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/din.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 32 3 | test_metrics: [auc, logloss] 4 | val_metrics: [auc, logloss] 5 | 6 | model: 7 | activation: dice 8 | attention_mlp: [128, 64] 9 | batch_norm: True 10 | dropout: 0.3 11 | embed_dim: 128 12 | fc_mlp: [128, 64, 64] 13 | 14 | train: 15 | batch_size: 256 16 | negative_count: 20 17 | 18 | data: 19 | low_rating_thres: 0.0 20 | binarized_rating_thres: 3.0 21 | 22 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/fpmc.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 -------------------------------------------------------------------------------- /recstudio/model/seq/config/gru4rec.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | hidden_size: 128 3 | dropout_rate: 0.3 4 | layer_num: 1 5 | 6 | train: 7 | negative_count: 1 -------------------------------------------------------------------------------- /recstudio/model/seq/config/hgn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | pooling_type: mean 3 | 4 | train: 5 | negative_count: 1 6 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/iclrec.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 128 3 | cutoff: [20, 50, 10, 5] 4 | 5 | 6 | model: 7 | # transformer 8 | hidden_size: 64 9 | layer_num: 1 10 | head_num: 2 11 | dropout_rate: 0.5 12 | activation: 'gelu' 13 | layer_norm_eps: 1e-5 14 | # contrastive learning 15 | temperature: 1.0 16 | augment_type: item_random # item_crop, item_mask, item_reorder, item_random 17 | cl_weight: 0.1 18 | intent_cl_weight: 0.1 19 | num_intent_clusters: 256 20 | intent_seq_representation_type: 'mean' 21 | instance_seq_representation_type: 'mean' 22 | 23 | train: 24 | batch_size: 256 25 | epochs: 1000 26 | early_stop_patience: 40 27 | init_method: normal 28 | negative_count: 1 29 | warm_up_epoches: 0 # number of epochs to start IntentCL. 
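Each per-model YAML above is deep-merged over the framework defaults (and over user overrides) before training; ``quickstart/run.py`` further down does this with ``deep_update`` from ``recstudio.utils``. A sketch of what such a recursive merge might look like — an illustration of the semantics, not the library's own code:

```python
from typing import Dict

def deep_update(base: Dict, override: Dict) -> Dict:
    # Merge `override` into `base`: nested dicts are merged key by key,
    # every other value is replaced wholesale.
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            base[key] = deep_update(base[key], value)
        else:
            base[key] = value
    return base

# e.g. raising only the contrastive weight while keeping the rest of the YAML:
# conf = deep_update(model_conf, {'model': {'cl_weight': 0.2}})
```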
30 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/narm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | hidden_size: 128 3 | dropout_rate: [0.25, 0.5] 4 | layer_num: 1 -------------------------------------------------------------------------------- /recstudio/model/seq/config/npe.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | dropout_rate: 0.3 3 | 4 | train: 5 | negative_count: 1 6 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/sasrec.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | hidden_size: 128 3 | layer_num: 2 4 | head_num: 2 5 | dropout_rate: 0.5 6 | activation: 'gelu' 7 | layer_norm_eps: 1e-12 8 | 9 | train: 10 | negative_count: 1 11 | init_method: normal 12 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/stamp.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 64 3 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/transrec.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 -------------------------------------------------------------------------------- /recstudio/model/seq/coserec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | from recstudio.model.module import data_augmentation 6 | from .sasrec import SASRecQueryEncoder 7 | 8 | 9 | r""" 10 | CoSeRec 11 | ############# 12 | Contrastive Self-supervised Sequential Recommendation with Robust Augmentation 13 | Reference: 14 | https://doi.org/10.48550/arXiv.2108.06479 15 | """ 16 | class CoSeRec(basemodel.BaseRetriever): 17 | r""" 18 | Model hyper parameters: 19 | - ``embed_dim(int)``: The dimension of embedding layers. Default: ``64``. 20 | - ``hidden_size(int)``: The output size of Transformer layer. Default: ``128``. 21 | - ``layer_num(int)``: The number of layers for the Transformer. Default: ``2``. 22 | - ``dropout_rate(float)``: The dropout probability for dropout layers after item embedding 23 | | and in Transformer layer. Default: ``0.5``. 24 | - ``head_num(int)``: The number of heads for MultiHeadAttention in Transformer. Default: ``2``. 25 | - ``activation(str)``: The activation function in transformer. Default: ``"gelu"``. 26 | - ``layer_norm_eps``: The layer norm epsilon in transformer. Default: ``1e-12``.
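CoSeRec's robust augmentation extends the three basic CL4SRec operators (``item_crop``, ``item_mask``, ``item_reorder`` — the ``augment_type`` options in the YAML configs above) with correlation-informed insertion and substitution (``insert_rate``, ``substitute_rate``). A minimal sketch of the three basic operators on a 1-D item-id tensor, for intuition only — the framework's versions live in ``recstudio.model.module.data_augmentation``:

```python
import torch

def item_crop(seq: torch.Tensor, eta: float = 0.6) -> torch.Tensor:
    # keep a random contiguous subsequence of roughly eta * len(seq) items
    n = max(1, int(seq.size(0) * eta))
    start = torch.randint(0, seq.size(0) - n + 1, (1,)).item()
    return seq[start:start + n]

def item_mask(seq: torch.Tensor, gamma: float = 0.3, mask_token: int = 0) -> torch.Tensor:
    # replace a random gamma-fraction of the items with a mask token
    out = seq.clone()
    out[torch.rand(seq.size(0)) < gamma] = mask_token
    return out

def item_reorder(seq: torch.Tensor, beta: float = 0.6) -> torch.Tensor:
    # shuffle a random contiguous segment of roughly beta * len(seq) items
    out = seq.clone()
    n = max(1, int(seq.size(0) * beta))
    start = torch.randint(0, seq.size(0) - n + 1, (1,)).item()
    out[start:start + n] = seq[torch.randperm(n) + start]
    return out
```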
27 | """ 28 | 29 | def _init_model(self, train_data:dataset.SeqToSeqDataset): 30 | super()._init_model(train_data) 31 | self.num_items = train_data.num_items 32 | self.augmentation_model = data_augmentation.CoSeRecAugmentation(self.config['model'], train_data) 33 | 34 | def _get_dataset_class(): 35 | return dataset.SeqToSeqDataset 36 | 37 | def _get_item_encoder(self, train_data): 38 | return torch.nn.Embedding(train_data.num_items + 1, self.embed_dim, padding_idx=0) # the last is masking 39 | 40 | def _get_query_encoder(self, train_data): 41 | model_config = self.config['model'] 42 | return SASRecQueryEncoder( 43 | fiid=self.fiid, embed_dim=self.embed_dim, 44 | max_seq_len=train_data.config['max_seq_len'], n_head=model_config['head_num'], 45 | hidden_size=model_config['hidden_size'], dropout=model_config['dropout_rate'], 46 | activation=model_config['activation'], layer_norm_eps=model_config['layer_norm_eps'], 47 | n_layer=model_config['layer_num'], 48 | training_pooling_type='origin', 49 | item_encoder=self.item_encoder 50 | ) 51 | 52 | def _get_score_func(self): 53 | return scorer.InnerProductScorer() 54 | 55 | def _get_loss_func(self): 56 | return loss_func.BinaryCrossEntropyLoss() 57 | 58 | def _get_sampler(self, train_data): 59 | return sampler.UniformSampler(train_data.num_items) 60 | 61 | def training_step(self, batch): 62 | output = self.forward(batch, isinstance(self.loss_fn, loss_func.FullScoreLoss)) 63 | cl_output = self.augmentation_model(batch, self.query_encoder) 64 | loss_value = self.loss_fn(batch[self.frating], **output['score']) + \ 65 | self.config['model']['cl_weight'] * cl_output['cl_loss'] 66 | return loss_value 67 | 68 | def training_epoch(self, nepoch): 69 | if nepoch + 1 >= self.config['model']['augmentation_warm_up_epochs'] + 1: 70 | self.augmentation_model.update_online_model(nepoch, self.item_encoder) 71 | return super().training_epoch(nepoch) -------------------------------------------------------------------------------- /recstudio/model/seq/fpmc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, module, scorer 5 | 6 | r""" 7 | FPMC 8 | ######### 9 | 10 | Paper Reference: 11 | Steffen Rendle, et al. "Factorizing personalized Markov chains for next-basket recommendation" in WWW2010. 12 | https://dl.acm.org/doi/10.1145/1772690.1772773 13 | """ 14 | 15 | 16 | class FPMC(basemodel.BaseRetriever): 17 | r""" 18 | | FPMC is based on personalized transition graphs over underlying Markov chains. It 19 | factorizes the transition cube with a pairwise interaction model which is a special case of 20 | the Tucker Decomposition. 
21 | """ 22 | 23 | def _get_dataset_class(): 24 | r"""The dataset FPMC used is SeqDataset.""" 25 | return dataset.SeqDataset 26 | 27 | def _get_item_encoder(self, train_data): 28 | return torch.nn.Embedding(train_data.num_items, 2*self.embed_dim, padding_idx=0) 29 | 30 | def _get_query_encoder(self, train_data): 31 | return module.VStackLayer( 32 | module.HStackLayer( 33 | module.VStackLayer( 34 | module.LambdaLayer(lambda x: x[self.fuid]), 35 | torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0), 36 | ), 37 | module.VStackLayer( 38 | module.HStackLayer( 39 | module.VStackLayer( 40 | module.LambdaLayer(lambda x: x['in_'+self.fiid]), 41 | torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0), 42 | ), 43 | module.LambdaLayer(lambda x: x['seqlen']) 44 | ), 45 | module.SeqPoolingLayer(pooling_type='last'), 46 | ) 47 | ), 48 | module.LambdaLayer(lambda x: torch.cat(x, dim=-1)) 49 | ) 50 | 51 | def _get_score_func(self): 52 | r"""Inner Product is used as the score function.""" 53 | return scorer.InnerProductScorer() 54 | 55 | def _get_loss_func(self): 56 | r"""The loss function is BPR loss.""" 57 | return loss_func.BPRLoss() 58 | 59 | def _get_sampler(self, train_data): 60 | return sampler.UniformSampler(train_data.num_items) 61 | -------------------------------------------------------------------------------- /recstudio/model/seq/gru4rec.py: -------------------------------------------------------------------------------- 1 | from operator import mod 2 | import torch 3 | from recstudio.ann import sampler 4 | from recstudio.data import dataset 5 | from recstudio.model import basemodel, loss_func, module, scorer 6 | 7 | r""" 8 | GRU4Rec 9 | ############ 10 | 11 | Paper Reference: 12 | Balazs Hidasi, et al. "Session-Based Recommendations with Recurrent Neural Networks" in ICLR2016. 13 | https://arxiv.org/abs/1511.06939 14 | """ 15 | 16 | 17 | class GRU4Rec(basemodel.BaseRetriever): 18 | r""" 19 | GRU4Rec apply RNN in Recommendation System, where sequential behavior of user is regarded as input 20 | of the RNN. 
21 | """ 22 | 23 | # def add_model_specific_args(parent_parser): 24 | # parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) 25 | # parent_parser.add_argument_group('GRU4Rec') 26 | # parent_parser.add_argument("--hidden_size", type=int, default=128, help='hidden size of feedforward') 27 | # parent_parser.add_argument("--layer_num", type=int, default=1, help='layer num of transformers') 28 | # parent_parser.add_argument("--dropout_rate", type=float, default=0.2, help='dropout rate') 29 | # parent_parser.add_argument("--negative_count", type=int, default=1, help='negative sampling numbers') 30 | # return parent_parser 31 | 32 | def _get_dataset_class(): 33 | r"""The dataset is SeqDataset.""" 34 | return dataset.SeqDataset 35 | 36 | def _get_query_encoder(self, train_data): 37 | model_config = self.config['model'] 38 | return ( 39 | module.VStackLayer( 40 | module.HStackLayer( 41 | torch.nn.Sequential( 42 | module.LambdaLayer(lambda x: x['in_'+self.fiid]), 43 | self.item_encoder, 44 | torch.nn.Dropout(model_config['dropout_rate']), 45 | module.GRULayer(self.embed_dim, model_config['hidden_size'], model_config['layer_num']), 46 | ), 47 | module.LambdaLayer(lambda_func=lambda x: x['seqlen']), 48 | ), 49 | module.SeqPoolingLayer(pooling_type='last'), 50 | torch.nn.Linear(model_config['hidden_size'], self.embed_dim) 51 | ) 52 | ) 53 | 54 | def _get_score_func(self): 55 | return scorer.InnerProductScorer() 56 | 57 | def _get_loss_func(self): 58 | r"""SoftmaxLoss is used as the loss function.""" 59 | return loss_func.BPRLoss() 60 | 61 | def _get_sampler(self, train_data): 62 | return sampler.UniformSampler(train_data.num_items) 63 | -------------------------------------------------------------------------------- /recstudio/model/seq/hgn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | 6 | r""" 7 | HGN 8 | ######## 9 | 10 | Paper Reference: 11 | Chen ma, et al. "HGN: Hierarchical Gating Networks for Sequential Recommendation" in KDD2019. 
12 | https://dl.acm.org/doi/abs/10.1145/3292500.3330984 13 | """ 14 | 15 | 16 | class HGNQueryEncoder(torch.nn.Module): 17 | 18 | def __init__(self, fuid, fiid, num_users, embed_dim, max_seq_len, item_encoder, pooling_type='mean') -> None: 19 | super().__init__() 20 | self.fuid = fuid 21 | self.fiid = fiid 22 | self.item_encoder = item_encoder 23 | self.pooling_type = pooling_type 24 | self.user_embedding = torch.nn.Embedding(num_users, embed_dim, 0) 25 | self.W_g_1 = torch.nn.Linear(embed_dim, embed_dim, bias=False) 26 | self.W_g_2 = torch.nn.Linear(embed_dim, embed_dim, bias=False) 27 | self.b_g = torch.nn.Parameter(torch.empty(embed_dim), requires_grad=True) 28 | self.w_g_3 = torch.nn.Linear(embed_dim, 1, bias=False) 29 | self.W_g_4 = torch.nn.Linear(embed_dim, max_seq_len) 30 | 31 | def forward(self, batch): 32 | U = self.user_embedding(batch[self.fuid]) 33 | S = self.item_encoder(batch['in_'+self.fiid]) 34 | S_F = S * torch.sigmoid(self.W_g_1(S) + self.W_g_2(U).view(U.size(0), 1, -1) + self.b_g) 35 | weight = torch.sigmoid(self.w_g_3(S_F) + (U@self.W_g_4.weight[:S.size(1)].T).view(U.size(0), -1, 1)) # BxLx1 36 | S_I = S_F * weight 37 | if self.pooling_type == 'mean': 38 | s = S_I.sum(1) / weight.sum(1) 39 | elif self.pooling_type == 'max': 40 | s = torch.max(S_I, dim=1).values 41 | else: 42 | raise ValueError("`pooling_type` only supports `mean` and `max`") 43 | query = U + s + S.sum(1) 44 | return query 45 | 46 | 47 | class HGN(basemodel.BaseRetriever): 48 | r"""HGN proposes a hierarchical gating network, integrated with the Bayesian Personalized Ranking 49 | (BPR) to capture both the long-term and short-term user interests. HGN consists of a feature 50 | gating module, an instance gating module, and an item-item product module.""" 51 | 52 | def _get_dataset_class(): 53 | r"""The dataset is SeqDataset.""" 54 | return dataset.SeqDataset 55 | 56 | def _get_query_encoder(self, train_data): 57 | return HGNQueryEncoder(self.fuid, self.fiid, train_data.num_users, self.embed_dim, train_data.config['max_seq_len'], 58 | self.item_encoder, self.config['model']['pooling_type']) 59 | 60 | def _get_score_func(self): 61 | return scorer.InnerProductScorer() 62 | 63 | def _get_loss_func(self): 64 | r"""BPR loss is used.""" 65 | return loss_func.BPRLoss() 66 | 67 | def _get_sampler(self, train_data): 68 | return sampler.UniformSampler(train_data.num_items) 69 | -------------------------------------------------------------------------------- /recstudio/model/seq/iclrec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data import dataset 3 | from recstudio.model.module import data_augmentation 4 | from recstudio.model.seq.sasrec import SASRec, SASRecQueryEncoder 5 | 6 | r""" 7 | ICLRec 8 | ############# 9 | Intent Contrastive Learning for Sequential Recommendation (WWW'22) 10 | Reference: 11 | https://doi.org/10.1145/3485447.3512090 12 | """ 13 | class ICLRec(SASRec): 14 | 15 | def _init_model(self, train_data): 16 | super()._init_model(train_data) 17 | self.augmentation_model = data_augmentation.ICLRecAugmentation(self.config['model'], train_data) 18 | 19 | def _get_dataset_class(): 20 | return dataset.SeqToSeqDataset 21 | 22 | def _get_train_loaders(self, train_data:dataset.SeqToSeqDataset, ddp=False): 23 | rec_train_loader = train_data.train_loader(batch_size = self.config['train']['batch_size'], 24 | shuffle = True, ddp=ddp) 25 | kmeans_train_loader = train_data.train_loader(batch_size = self.config['train']['batch_size'], 26 |
shuffle = False, ddp=ddp) 27 | return [rec_train_loader, kmeans_train_loader] 28 | 29 | def current_epoch_trainloaders(self, nepoch): 30 | return self.trainloaders[0], False 31 | 32 | def _get_item_encoder(self, train_data): 33 | return torch.nn.Embedding(train_data.num_items + 1, self.embed_dim, padding_idx=0) # the last id is reserved for the mask token 34 | 35 | def _get_query_encoder(self, train_data): 36 | model_config = self.config['model'] 37 | return SASRecQueryEncoder( 38 | fiid=self.fiid, embed_dim=self.embed_dim, 39 | max_seq_len=train_data.config['max_seq_len'], n_head=model_config['head_num'], 40 | hidden_size=model_config['hidden_size'], dropout=model_config['dropout_rate'], 41 | activation=model_config['activation'], layer_norm_eps=model_config['layer_norm_eps'], 42 | n_layer=model_config['layer_num'], 43 | training_pooling_type='origin', 44 | item_encoder=self.item_encoder 45 | ) 46 | 47 | def training_step(self, batch): 48 | output = self.forward(batch, False, return_query=True) 49 | cl_output = self.augmentation_model(batch, output['query'], self.query_encoder) 50 | loss_value = self.loss_fn(batch[self.frating], **output['score']) \ 51 | + self.config['model']['cl_weight'] * cl_output['instance_cl_loss'] \ 52 | + self.config['model']['intent_cl_weight'] * cl_output['intent_cl_loss'] 53 | return loss_value 54 | 55 | def training_epoch(self, nepoch): 56 | self.augmentation_model.train_kmeans(self.query_encoder, self.trainloaders[1], \ 57 | self._parameter_device) 58 | return super().training_epoch(nepoch) 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /recstudio/model/seq/npe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, module, scorer 5 | 6 | r""" 7 | NPE 8 | ####################### 9 | 10 | Paper Reference: 11 | ThaiBinh Nguyen, et al. "NPE: Neural Personalized Embedding for Collaborative Filtering" in IJCAI2018. 12 | https://www.ijcai.org/proceedings/2018/0219.pdf 13 | """ 14 | 15 | 16 | class NPE(basemodel.BaseRetriever): 17 | r""" 18 | NPE models a user’s click on an item with two terms: the personal preference of the user for the item, 19 | and the relationships between this item and other items clicked by the user.
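The two terms map one-to-one onto the query encoder below: a personal term ``dropout(relu(user_emb))`` and a neighborhood term ``dropout(relu(sum of clicked-item embeddings))``, summed into the query. A compact functional sketch of that composition (illustrative only):

```python
import torch
import torch.nn.functional as F

def npe_query(user_emb, hist_item_embs, p=0.3, training=True):
    # user_emb: B x D; hist_item_embs: B x L x D (padding positions embed to zero)
    personal = F.dropout(F.relu(user_emb), p=p, training=training)
    neighborhood = F.dropout(F.relu(hist_item_embs.sum(dim=1)), p=p, training=training)
    return personal + neighborhood   # B x D, scored against ReLU'd item vectors
```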
20 | """ 21 | 22 | def _get_dataset_class(): 23 | r"""SeqDataset is used for NPE.""" 24 | return dataset.SeqDataset 25 | 26 | def _get_query_encoder(self, train_data): 27 | dropout_rate = self.config['model']['dropout_rate'] 28 | return torch.nn.Sequential( 29 | module.HStackLayer( 30 | torch.nn.Sequential( 31 | module.LambdaLayer(lambda x: x['in_'+self.fiid]), 32 | self.item_encoder[0], 33 | module.LambdaLayer(lambda x: torch.sum(x, dim=1)), 34 | torch.nn.ReLU(), 35 | torch.nn.Dropout(p=dropout_rate) 36 | ), 37 | torch.nn.Sequential( 38 | module.LambdaLayer(lambda x: x[self.fuid]), 39 | torch.nn.Embedding(train_data.num_users, self.embed_dim, 0), 40 | torch.nn.ReLU(), 41 | torch.nn.Dropout(p=dropout_rate) 42 | ) 43 | ), 44 | module.LambdaLayer(lambda x: x[0]+x[1]) 45 | ) 46 | 47 | def _get_item_encoder(self, train_data): 48 | r"""NPE combine an Embedding layer with a ReLU layer as item encoder.""" 49 | return torch.nn.Sequential( 50 | super()._get_item_encoder(train_data), 51 | torch.nn.ReLU()) 52 | 53 | def _get_item_vector(self): 54 | """Get all item vectors, simply apply ReLU operation on the weight of Embedding layer.""" 55 | return self.item_encoder[1](self.item_encoder[0].weight[1:]) 56 | 57 | def _get_score_func(self): 58 | r"""Innerproduct operation is applied to calculate scores between query and item.""" 59 | return scorer.InnerProductScorer() 60 | 61 | def _get_loss_func(self): 62 | r"""According to the original paper, BCE loss is applied. 63 | Also, other loss functions like softmax loss and BPR loss can be used too. 64 | """ 65 | return loss_func.BinaryCrossEntropyLoss() 66 | 67 | def _get_sampler(self, train_data): 68 | return sampler.UniformSampler(train_data.num_items) 69 | -------------------------------------------------------------------------------- /recstudio/model/seq/stamp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data import dataset 3 | from recstudio.model import basemodel, loss_func, module, scorer 4 | 5 | 6 | class STAMPQueryEncoder(torch.nn.Module): 7 | 8 | def __init__(self, fiid, embed_dim, item_encoder) -> None: 9 | super().__init__() 10 | self.fiid = fiid 11 | self.item_encoder = item_encoder 12 | self.gather_layer = module.SeqPoolingLayer(pooling_type='last') 13 | self.attention_layer = module.AttentionLayer( 14 | q_dim=2 * embed_dim, 15 | k_dim=embed_dim, 16 | mlp_layers=[embed_dim], 17 | ) 18 | self.mlpA = module.MLPModule([embed_dim, embed_dim], torch.nn.Tanh()) 19 | self.mlpB = module.MLPModule([embed_dim, embed_dim], torch.nn.Tanh()) 20 | 21 | def forward(self, batch): 22 | user_hist = batch['in_'+self.fiid] 23 | seq_emb = self.item_encoder(user_hist) 24 | m_t = self.gather_layer(seq_emb, batch['seqlen']) 25 | m_s = seq_emb.sum(dim=1) / batch['seqlen'].unsqueeze(1).float() # B x D 26 | 27 | query = torch.cat((m_t, m_s), dim=1) # Bx2D 28 | m_a = self.attention_layer(query.unsqueeze(1), seq_emb, seq_emb, 29 | key_padding_mask=(user_hist == 0)).squeeze(1) 30 | h_s = self.mlpA(m_a) 31 | h_t = self.mlpB(m_t) 32 | return h_s * h_t 33 | 34 | 35 | class STAMP(basemodel.BaseRetriever): 36 | r""" 37 | STAMP is capable of capturing users’ general interests from the long-term memory of a session 38 | context, while taking into account users’ current interests from the short-term memory of the 39 | last-clicks. 40 | 41 | Model hyper parameters: 42 | - ``embed_dim(int)``: The dimension of embedding layers. Default: ``64``. 
43 | """ 44 | 45 | def _get_dataset_class(): 46 | r"""SeqDataset is used for STAMP.""" 47 | return dataset.SeqDataset 48 | 49 | def _get_query_encoder(self, train_data): 50 | return STAMPQueryEncoder(self.fiid, self.embed_dim, self.item_encoder) 51 | 52 | def _get_score_func(self): 53 | r"""InnerProduct is used as the score function.""" 54 | return scorer.InnerProductScorer() 55 | 56 | def _get_loss_func(self): 57 | r"""SoftmaxLoss is used as the loss function.""" 58 | return loss_func.SoftmaxLoss() 59 | 60 | def _get_sampler(self, train_data): 61 | return None 62 | -------------------------------------------------------------------------------- /recstudio/model/seq/transrec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | 6 | 7 | class TransRecQueryEncoder(torch.nn.Module): 8 | def __init__(self, fuid, fiid, num_users, embed_dim, item_encoder): 9 | super().__init__() 10 | self.fuid = fuid 11 | self.fiid = fiid 12 | self.item_encoder = item_encoder 13 | self.user_embedding = torch.nn.Embedding(num_users, embed_dim, 0) 14 | self.global_user_emb = torch.nn.Parameter(torch.zeros(embed_dim)) 15 | 16 | def forward(self, batch): 17 | user_hist = batch['in_'+self.fiid] 18 | seq_len = batch['seqlen'] - 1 19 | local_user_emb = self.user_embedding(batch[self.fuid]) 20 | user_emb = local_user_emb + self.global_user_emb.expand_as(local_user_emb) # B x D 21 | last_item_id = torch.gather(user_hist, dim=-1, index=seq_len.unsqueeze(1)) 22 | last_item_emb = self.item_encoder(last_item_id).squeeze(1) # B x D 23 | query = user_emb + last_item_emb 24 | return query 25 | 26 | 27 | class TransRec(basemodel.BaseRetriever): 28 | r""" 29 | TransRec embeds items into a ‘transition space’ where users are modeled as translation vectors operating on item sequences. 30 | 31 | Model hyper parameters: 32 | - ``embed_dim(int)``: The dimension of embedding layers. Default: ``64``. 
33 | """ 34 | 35 | # TODO(@AngusHuang17): bias here is not easy to construct query, abandoned now 36 | 37 | def _get_dataset_class(): 38 | r"""SeqDataset is used for TransRec.""" 39 | return dataset.SeqDataset 40 | 41 | def _get_item_encoder(self, train_data): 42 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, 0) 43 | 44 | def _get_query_encoder(self, train_data): 45 | return TransRecQueryEncoder( 46 | self.fuid, self.fiid, train_data.num_users, self.embed_dim, self.item_encoder 47 | ) 48 | 49 | def _get_sampler(self, train_data): 50 | return sampler.UniformSampler(train_data.num_items, self.score_func) 51 | 52 | def _get_scorer(self): 53 | r"""InnerProduct is used as the score function.""" 54 | return scorer.EuclideanScorer() 55 | 56 | def _get_loss_func(self): 57 | r"""BPRLoss is used as the loss function.""" 58 | return loss_func.BPRLoss() 59 | -------------------------------------------------------------------------------- /recstudio/quickstart/__init__.py: -------------------------------------------------------------------------------- 1 | from .run import run 2 | from .config_dataset import generate_dataset_config -------------------------------------------------------------------------------- /recstudio/quickstart/config_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | 5 | def generate_dataset_config(name: str, dir: str, interaction_file: str, user_id: str, 6 | item_id: str, rating:str, timestamp: str, sep='\t', user_file: str=None, item_file: str=None): 7 | config_file_name = f"{name}.yaml" 8 | config_path = os.path.join(dir, name) 9 | config_dict = { 10 | 'url': dir, 11 | 'user_id_field': f"&u {user_id}:token", 12 | 'item_id_field': f"&i {item_id}:token", 13 | 'rating_field': f"&r {rating}:float", 14 | 'time_field': f"&t {timestamp}:float", 15 | 'inter_feat_name': f"{interaction_file}", 16 | 'user_feat_name': f"{user_file}" if user_file else "~", 17 | 'item_feat_name': f"{item_file}" if item_file else "~", 18 | 19 | } 20 | raise NotImplementedError("Sorry, not supported now, we will implement the function soon.") -------------------------------------------------------------------------------- /recstudio/quickstart/run.py: -------------------------------------------------------------------------------- 1 | import os, datetime, torch 2 | from typing import * 3 | from recstudio.utils import * 4 | from recstudio import LOG_DIR 5 | 6 | def run(model: str, dataset: str, model_config: Dict=None, data_config: Dict=None, model_config_path: str=None, data_config_path: str=None, verbose=True, run_mode='light', **kwargs): 7 | model_class, model_conf = get_model(model) 8 | 9 | if model_config_path is not None: 10 | if isinstance(model_config_path, str): 11 | model_conf = deep_update(model_conf, parser_yaml(model_config_path)) 12 | else: 13 | raise TypeError(f"expecting `model_config_path` to be str, while get {type(model_config_path)} instead.") 14 | 15 | if model_config is not None: 16 | if isinstance(model_config, Dict): 17 | model_conf = deep_update(model_conf, model_config) 18 | else: 19 | raise TypeError(f"expecting `model_config` to be Dict, while get {type(model_config)} instead.") 20 | 21 | if kwargs is not None: 22 | model_conf = deep_update(model_conf, kwargs) 23 | 24 | log_path = f"{model}/{dataset}/{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')}.log" 25 | logger = get_logger(log_path) 26 | torch.set_num_threads(model_conf['train']['num_threads']) 27 | 28 | if not 
verbose: 29 | import logging 30 | logger.setLevel(logging.ERROR) 31 | 32 | logger.info("Log saved in {}.".format(os.path.abspath(os.path.join(LOG_DIR, log_path)))) 33 | if run_mode == 'tune': 34 | model_conf = update_config_with_nni(model_conf) 35 | model = model_class(model_conf) 36 | dataset_class = model_class._get_dataset_class() 37 | 38 | data_conf = {} 39 | if data_config_path is not None: 40 | if isinstance(data_config_path, str): 41 | # load dataset config from file 42 | conf = parser_yaml(data_config_path) 43 | data_conf.update(conf) 44 | else: 45 | raise TypeError(f"expected `data_config_path` to be str, but got {type(data_config_path)} instead.") 46 | 47 | if data_config is not None: 48 | if isinstance(data_config, dict): 49 | # update config with given dict 50 | data_conf.update(data_config) 51 | else: 52 | raise TypeError(f"expected `data_config` to be Dict, but got {type(data_config)} instead.") 53 | 54 | data_conf.update(model_conf['data']) # update model-specified config 55 | 56 | datasets = dataset_class(name=dataset, config=data_conf).build(**model_conf['data']) 57 | logger.info(f"{datasets[0]}") 58 | logger.info(f"\n{set_color('Model Config', 'green')}: \n\n" + color_dict_normal(model_conf, False)) 59 | val_result = model.fit(*datasets[:2], run_mode=run_mode) 60 | test_result = model.evaluate(datasets[-1]) 61 | return (model, datasets), (val_result, test_result) 62 | -------------------------------------------------------------------------------- /recstudio/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.utils.utils import * 2 | from recstudio.utils.data_parallel import * 3 | from recstudio.utils.arguments import get_default_parser, add_model_arguments, parser2nested_dict -------------------------------------------------------------------------------- /recstudio/utils/compress_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import zipfile 4 | import gzip 5 | 6 | 7 | class CompressedFile(object): 8 | magic = None 9 | file_type = None 10 | mime_type = None 11 | 12 | def __init__(self, fname, save_dir): 13 | self.extract_all(fname, save_dir) 14 | 15 | @classmethod 16 | def is_magic(cls, data): 17 | return data.startswith(cls.magic) 18 | 19 | def extract_all(self, fname, save_dir): 20 | pass 21 | 22 | 23 | class ZIPFile(CompressedFile): 24 | magic = b'\x50\x4b\x03\x04' 25 | file_type = 'zip' 26 | mime_type = 'compressed/zip' 27 | 28 | def extract_all(self, fname, save_dir): 29 | with zipfile.ZipFile(fname) as f: 30 | for member in f.namelist(): 31 | filename = os.path.basename(member) 32 | # skip directories 33 | if not filename: 34 | continue 35 | 36 | source = f.open(member) 37 | target = open(os.path.join(save_dir, filename), "wb") 38 | with source, target: 39 | shutil.copyfileobj(source, target) 40 | 41 | 42 | class GZFile(CompressedFile): 43 | magic = b'\x1f\x8b\x08' 44 | file_type = 'gz' 45 | mime_type = 'compressed/gz' 46 | 47 | def extract_all(self, fname, save_dir): 48 | decompressed_fname = os.path.basename(fname)[:-3] 49 | with gzip.open(fname, 'rb') as f_in: 50 | with open(os.path.join(save_dir, decompressed_fname), 'wb') as f_out: 51 | shutil.copyfileobj(f_in, f_out) 52 | 53 | 54 | def extract_compressed_file(filename, save_dir): 55 | with open(filename, 'rb') as f: 56 | start_of_file = f.read(1024) 57 | 58 | f.seek(0) 59 | if filename.endswith('csv'): 60 | pass 61 | else: 62 | for cls in (ZIPFile,
GZFile): 63 | if cls.is_magic(start_of_file): 64 | cls(filename, save_dir) 65 | break 66 | os.remove(filename) 67 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from recstudio.utils import * 2 | from recstudio import quickstart 3 | 4 | 5 | if __name__ == '__main__': 6 | parser = get_default_parser() 7 | args, command_line_args = parser.parse_known_args() 8 | parser = add_model_arguments(parser, args.model) 9 | command_line_conf = parser2nested_dict(parser, command_line_args) 10 | 11 | model_class, model_conf = get_model(args.model) 12 | model_conf = deep_update(model_conf, command_line_conf) 13 | 14 | quickstart.run(args.model, args.dataset, model_config=model_conf, data_config_path=args.data_config_path, run_mode=args.mode) 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | import os 6 | 7 | from setuptools import setup, find_packages 8 | 9 | install_requires = ['numpy>=1.20.1', 'torch>=1.9.0', 'scipy>=1.6.0', 'pandas>=1.3.0', 'tqdm>=4.48.2', 10 | 'colorlog==4.7.2', 'colorama==0.4.4', 'pyyaml>=5.1.0', 'tensorboard>=2.5.0', 11 | 'faiss-gpu==1.7.2', 'torchmetrics==0.7.3'] 12 | 13 | setup_requires = [] 14 | 15 | extras_require = {} 16 | 17 | classifiers = ['License :: OSI Approved :: MIT License', 18 | 19 | 'Programming Language :: Python', 20 | 'Programming Language :: Python :: 3', 21 | 'Programming Language :: Python :: 3.8'] 22 | 23 | long_description = 'RecStudio is a modular, efficient, unified, and comprehensive recommendation library based on PyTorch. '\ 24 | 'We divide all the models into 3 basic classes according to the number of towers: TowerFree, ItemTower, TwoTower, '\ 25 | 'and cover models in 4 tasks: General Recommendation, Sequential Recommendation, Knowledge-based Recommendation, Social-Network-based Recommendation. '\ 26 | 'View github page: https://github.com/ustcml/RecStudio' 27 | 28 | # Readthedocs requires Sphinx extensions to be specified as part of 29 | # install_requires in order to build properly.
30 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 31 | if on_rtd: 32 | install_requires.extend(setup_requires) 33 | 34 | setup( 35 | name='recstudio', 36 | version= 37 | '0.0.2a1', # please remember to edit recstudio/__init__.py accordingly, once updating the version 38 | description='A modular, efficient, unified, and comprehensive recommendation library based on PyTorch.', 39 | long_description=long_description, 40 | long_description_content_type="text/markdown", 41 | url='https://github.com/ustcml/RecStudio', 42 | author='USTCML', 43 | author_email='liandefu@ustc.edu.cn', 44 | packages=[ 45 | package for package in find_packages() 46 | if package.startswith('recstudio') 47 | ], 48 | include_package_data=True, 49 | install_requires=install_requires, 50 | setup_requires=setup_requires, 51 | extras_require=extras_require, 52 | zip_safe=False, 53 | classifiers=classifiers, 54 | ) -------------------------------------------------------------------------------- /test/test_config_dataset.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.path.append(".") 3 | 4 | from recstudio.quickstart import generate_dataset_config 5 | 6 | generate_dataset_config(name='mydataset', dir='dataset_dir/', 7 | interaction_file='inter.csv', user_id='user_id', item_id='item_id', 8 | rating='rating', timestamp='timestamp', sep='\t') -------------------------------------------------------------------------------- /test/test_dataset.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.path.append(".") 3 | # sys.path.append(os.path.join(__file__, '../')) 4 | # sys.path.insert(0, os.path.join(__file__, '../')) 5 | 6 | from recstudio.data.dataset import TripletDataset 7 | 8 | data = TripletDataset(name='ml-100k') 9 | trn, val, tst = data.build(split_ratio=[0.7, 0.2, 0.1]) 10 | 11 | trn_loader = trn.train_loader(batch_size=128, shuffle=True) 12 | 13 | batch = next(iter(trn_loader)) 14 | print(batch) 15 | 16 | # print("End.") -------------------------------------------------------------------------------- /test/test_ddp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.distributed as dist 4 | import torch.multiprocessing as mp 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.nn.parallel import DistributedDataParallel as DDP 8 | 9 | 10 | def example(rank, world_size): 11 | # create default process group 12 | dist.init_process_group("gloo", rank=rank, world_size=world_size) 13 | # create local model 14 | model = nn.Linear(10, 10).to(rank) 15 | # construct DDP model 16 | ddp_model = DDP(model, device_ids=[rank]) 17 | # define loss function and optimizer 18 | loss_fn = nn.MSELoss() 19 | optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) 20 | 21 | # forward pass 22 | outputs = ddp_model(torch.randn(20, 10).to(rank)) 23 | labels = torch.randn(20, 10).to(rank) 24 | # backward pass 25 | loss_fn(outputs, labels).backward() 26 | # update parameters 27 | optimizer.step() 28 | 29 | def main(): 30 | world_size = 4 31 | mp.spawn(example, 32 | args=(world_size,), 33 | nprocs=world_size, 34 | join=True) 35 | 36 | if __name__=="__main__": 37 | # Environment variables which need to be 38 | # set when using c10d's default "env" 39 | # initialization mode.
40 | os.environ["MASTER_ADDR"] = "localhost" 41 | os.environ["MASTER_PORT"] = "29500" 42 | main() -------------------------------------------------------------------------------- /test/test_quickrun.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | from recstudio import quickstart 4 | 5 | quickstart.run(model='MultiVAE', dataset='ml-100k', gpu=[2]) 6 | 7 | 8 | import recstudio.data as recdata 9 | 10 | print(recdata.supported_dataset) -------------------------------------------------------------------------------- /test/test_retriever.py: -------------------------------------------------------------------------------- 1 | from recstudio.model import scorer, loss_func # import the scorer and loss-function modules 2 | from recstudio.ann import sampler # import the sampler module 3 | from recstudio.model.basemodel import BaseRetriever # import the base class of retriever models 4 | from recstudio.data import dataset # import the dataset module 5 | import torch 6 | import sys 7 | sys.path.append(".") 8 | 9 | 10 | ml_100k_data = dataset.TripletDataset(name='ml-100k') 11 | trn, val, tst = ml_100k_data.build(split_ratio=[0.7, 0.2, 0.1]) 12 | 13 | bpr = BaseRetriever( 14 | item_encoder=torch.nn.Embedding(trn.num_items, 64, 0), 15 | query_encoder=torch.nn.Embedding(trn.num_users, 64, 0), 16 | scorer=scorer.InnerProductScorer(), 17 | loss=loss_func.BPRLoss(), 18 | sampler=sampler.UniformSampler(trn.num_items) 19 | ) 20 | 21 | bpr.fit(trn, val, negative_count=1) 22 | bpr.evaluate(tst) 23 | -------------------------------------------------------------------------------- /test/test_training_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from tqdm import tqdm 4 | from termcolor import colored 5 | 6 | 7 | dir = os.path.join( os.path.dirname(__file__), '../' ) 8 | sys.path.append( os.path.abspath(dir) ) 9 | 10 | from recstudio.quickstart import run 11 | 12 | # Note: please add models to be tested here 13 | datasets = ['ml-100k'] 14 | 15 | fm_model = ['DCN', 'DeepFM', 'FM', 'LR', 'NFM', 'WideDeep', 'xDeepFM'] 16 | mf_model = ['BPR', 'CML', 'DSSM', 'EASE', 'IRGAN', 'ItemKNN', 'LogisticMF', 'NCF', 'SLIM', 'WRMF'] 17 | seq_model = ['BERT4Rec', 'Caser', 'FPMC', 'GRU4Rec', 'HGN', 'NARM', 'NPE', 'SASRec', 'STAMP', 'TransRec'] 18 | ae_model = ['MultiDAE', 'MultiVAE'] 19 | graph_model = ['LightGCN', 'NCL', 'NGCF', 'SGL', 'SimGCL'] 20 | cl_seq_model = ['CL4SRec', 'CoSeRec', 'ICLRec'] 21 | 22 | 23 | training_configs = [ 24 | {'train': {'epochs': 3} }, 25 | ] 26 | 27 | all_models = { 28 | 'FM': fm_model, 29 | 'MF': mf_model, 30 | 'SEQ': seq_model, 31 | 'AE': ae_model, 32 | 'GRAPH': graph_model, 33 | 'CL-SEQ': cl_seq_model, 34 | } 35 | 36 | # test loop 37 | num_exps = sum([len(m) for m in all_models.values()]) * len(datasets) * len(training_configs) 38 | pbar = tqdm(total=num_exps) 39 | for cate, models in all_models.items(): 40 | tqdm.write(f"Test {cate} models - {len(models)} models:") 41 | failed_exp = [] 42 | for m in models: 43 | for d in datasets: 44 | for i, config in enumerate(training_configs): 45 | pbar.update(1) 46 | tqdm.write(colored(f"### Test: model-{m}, data-{d}, {i}-th configuration.", on_color='on_blue')) 47 | try: 48 | run(m, d, config, verbose=False) 49 | tqdm.write(colored("$$$ Test passed!", 'green')) 50 | except: 51 | tqdm.write(colored("!!! Test failed!", 'red')) 52 | failed_exp.append({ 53 | 'model': m, 54 | 'dataset': d, 55 | 'config': config 56 | }) 57 | tqdm.write("{} models test End. 
{}/{} failed.".format( cate, len(failed_exp), (len(models) * len(datasets) * len(training_configs)) )) 58 | 59 | pbar.close() --------------------------------------------------------------------------------
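Taken together, the root ``run.py`` above wires the default parser, per-model arguments, and ``quickstart.run`` into a small CLI (e.g. ``python run.py --model=SASRec --dataset=ml-100k``, assuming the default parser spells the flags this way). The same entry point is available from Python, mirroring how ``test_training_pipeline.py`` drives it:

```python
from recstudio import quickstart

# any key from the model's YAML can be overridden through model_config;
# the 3-epoch override below mirrors the smoke-test config above
(model, datasets), (val_result, test_result) = quickstart.run(
    'SASRec', 'ml-100k', model_config={'train': {'epochs': 3}})
print(val_result, test_result)
```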