├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_CN.md ├── assets ├── recstudio_framework.png └── recstudio_logo.png ├── environment.yml ├── example ├── example.ipynb └── sasrec_demo.py ├── nni-experiments ├── config │ ├── bpr.yaml │ └── sasrec.yaml └── search_space │ ├── bpr.yaml │ └── sasrec.yaml ├── recstudio ├── __init__.py ├── ann │ ├── __init__.py │ └── sampler.py ├── data │ ├── __init__.py │ ├── advance_dataset.py │ ├── config │ │ ├── all.yaml │ │ ├── amazon-beauty.yaml │ │ ├── amazon-books.yaml │ │ ├── amazon-electronics.yaml │ │ ├── criteo.yaml │ │ ├── gowalla.yaml │ │ ├── kuairand-pure.yaml │ │ ├── ml-100k.yaml │ │ ├── ml-10m.yaml │ │ ├── ml-1m.yaml │ │ ├── ml-20m.yaml │ │ ├── tmall.yaml │ │ └── yelp.yaml │ └── dataset.py ├── dataset_demo │ ├── __init__.py │ └── ml-100k │ │ ├── ml-100k.inter │ │ ├── ml-100k.item │ │ ├── ml-100k.kg │ │ ├── ml-100k.link │ │ ├── ml-100k.user │ │ └── social.txt ├── eval │ └── __init__.py ├── model │ ├── __init__.py │ ├── ae │ │ ├── config │ │ │ ├── all.yaml │ │ │ ├── multidae.yaml │ │ │ └── multivae.yaml │ │ ├── multidae.py │ │ └── multivae.py │ ├── basemodel │ │ ├── __init__.py │ │ ├── basemodel.yaml │ │ ├── baseranker.py │ │ ├── baseretriever.py │ │ └── recommender.py │ ├── debias │ │ └── __init__.py │ ├── fm │ │ ├── __init__.py │ │ ├── afm.py │ │ ├── afn.py │ │ ├── aoanet.py │ │ ├── autoint.py │ │ ├── ccpm.py │ │ ├── config │ │ │ ├── afm.yaml │ │ │ ├── afn.yaml │ │ │ ├── all.yaml │ │ │ ├── aoanet.yaml │ │ │ ├── autoint.yaml │ │ │ ├── ccpm.yaml │ │ │ ├── dcn.yaml │ │ │ ├── dcnv2.yaml │ │ │ ├── deepcrossing.yaml │ │ │ ├── deepfm.yaml │ │ │ ├── deepim.yaml │ │ │ ├── destine.yaml │ │ │ ├── difm.yaml │ │ │ ├── dlrm.yaml │ │ │ ├── edcn.yaml │ │ │ ├── ffm.yaml │ │ │ ├── fgcnn.yaml │ │ │ ├── fibinet.yaml │ │ │ ├── fignn.yaml │ │ │ ├── finalmlp.yaml │ │ │ ├── flen.yaml │ │ │ ├── fm.yaml │ │ │ ├── fmfm.yaml │ │ │ ├── fwfm.yaml │ │ │ ├── hfm.yaml │ │ │ ├── ifm.yaml │ │ │ ├── interhat.yaml │ │ │ ├── lorentzfm.yaml │ │ │ ├── lr.yaml │ │ │ ├── masknet.yaml │ │ │ ├── nfm.yaml │ │ │ ├── onn.yaml │ │ │ ├── pnn.yaml │ │ │ ├── ppnet.yaml │ │ │ ├── sam.yaml │ │ │ ├── widedeep.yaml │ │ │ └── xdeepfm.yaml │ │ ├── dcn.py │ │ ├── dcnv2.py │ │ ├── deepcrossing.py │ │ ├── deepfm.py │ │ ├── deepim.py │ │ ├── destine.py │ │ ├── difm.py │ │ ├── dlrm.py │ │ ├── edcn.py │ │ ├── ffm.py │ │ ├── fgcnn.py │ │ ├── fibinet.py │ │ ├── fignn.py │ │ ├── finalmlp.py │ │ ├── flen.py │ │ ├── fm.py │ │ ├── fmfm.py │ │ ├── fwfm.py │ │ ├── hfm.py │ │ ├── ifm.py │ │ ├── interhat.py │ │ ├── lorentzfm.py │ │ ├── lr.py │ │ ├── masknet.py │ │ ├── nfm.py │ │ ├── onn.py │ │ ├── pnn.py │ │ ├── ppnet.py │ │ ├── sam.py │ │ ├── widedeep.py │ │ └── xdeepfm.py │ ├── graph │ │ ├── __init__.py │ │ ├── config │ │ │ ├── all.yaml │ │ │ ├── lightgcn.yaml │ │ │ ├── ncl.yaml │ │ │ ├── ngcf.yaml │ │ │ ├── sgl.yaml │ │ │ └── simgcl.yaml │ │ ├── lightgcn.py │ │ ├── ncl.py │ │ ├── ngcf.py │ │ ├── sgl.py │ │ └── simgcl.py │ ├── init.py │ ├── kg │ │ ├── KGLearning.py │ │ ├── __init__.py │ │ ├── cfkg.py │ │ ├── cke.py │ │ ├── config │ │ │ ├── all.yaml │ │ │ ├── cfkg.yaml │ │ │ ├── cke.yaml │ │ │ ├── kgat.yaml │ │ │ ├── kgcn.yaml │ │ │ ├── kgin.yaml │ │ │ ├── kgnnls.yaml │ │ │ ├── ktup.yaml │ │ │ ├── mkr.yaml │ │ │ └── ripplenet.yaml │ │ ├── kgat.py │ │ ├── kgcn.py │ │ ├── kgin.py │ │ ├── kgnnls.py │ │ ├── ktup.py │ │ ├── mkr.py │ │ └── ripplenet.py │ ├── loss_func.py │ ├── mf │ │ ├── __init__.py │ │ ├── bpr.py │ │ ├── cml.py │ │ ├── config │ │ │ ├── all.yaml │ │ │ ├── bpr.yaml │ │ │ ├── cml.yaml │ │ │ 
├── dssm.yaml │ │ │ ├── ease.yaml │ │ │ ├── irgan.yaml │ │ │ ├── itemknn.yaml │ │ │ ├── logisticmf.yaml │ │ │ ├── ncf.yaml │ │ │ ├── pmf.yaml │ │ │ ├── slim.yaml │ │ │ └── wrmf.yaml │ │ ├── dssm.py │ │ ├── ease.py │ │ ├── irgan.py │ │ ├── itemknn.py │ │ ├── logisticmf.py │ │ ├── ncf.py │ │ ├── pmf.py │ │ ├── slim.py │ │ └── wrmf.py │ ├── module │ │ ├── __init__.py │ │ ├── ctr.py │ │ ├── data_augmentation.py │ │ ├── functional.py │ │ ├── graphmodule.py │ │ ├── gru.py │ │ └── layers.py │ ├── multitask │ │ ├── __init__.py │ │ ├── aitm.py │ │ ├── config │ │ │ ├── aitm.yaml │ │ │ ├── all.yaml │ │ │ ├── hardshare.yaml │ │ │ ├── mmoe.yaml │ │ │ └── ple.yaml │ │ ├── hardshare.py │ │ ├── mmoe.py │ │ └── ple.py │ ├── ranker.py │ ├── retriever.py │ ├── scorer.py │ └── seq │ │ ├── __init__.py │ │ ├── bert4rec.py │ │ ├── caser.py │ │ ├── cl4srec.py │ │ ├── config │ │ ├── all.yaml │ │ ├── bert4rec.yaml │ │ ├── caser.yaml │ │ ├── cl4srec.yaml │ │ ├── coserec.yaml │ │ ├── din.yaml │ │ ├── fpmc.yaml │ │ ├── gru4rec.yaml │ │ ├── hgn.yaml │ │ ├── iclrec.yaml │ │ ├── narm.yaml │ │ ├── npe.yaml │ │ ├── sasrec.yaml │ │ ├── stamp.yaml │ │ └── transrec.yaml │ │ ├── coserec.py │ │ ├── din.py │ │ ├── fpmc.py │ │ ├── gru4rec.py │ │ ├── hgn.py │ │ ├── iclrec.py │ │ ├── narm.py │ │ ├── npe.py │ │ ├── sasrec.py │ │ ├── stamp.py │ │ └── transrec.py ├── quickstart │ ├── __init__.py │ ├── config_dataset.py │ └── run.py └── utils │ ├── __init__.py │ ├── arguments.py │ ├── callbacks.py │ ├── compress_file.py │ ├── data_parallel.py │ ├── trainer.py │ └── utils.py ├── run.py ├── setup.py └── test ├── test_config_dataset.py ├── test_dataset.py ├── test_ddp.py ├── test_quickrun.py ├── test_retriever.py └── test_training_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *$py.class 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | share/python-wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .nox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | *.py,cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | cover/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | .pybuilder/ 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | # For a library or package, you might want to ignore these files since the code is 86 | # intended to run in multiple environments; otherwise, check them in: 87 | # .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # pytype static type analyzer 134 | .pytype/ 135 | 136 | # Cython debug symbols 137 | cython_debug/ 138 | 139 | tensorboard/ 140 | .vscode/ 141 | **/log 142 | **/saved 143 | **/.recstudio 144 | datasets/* 145 | !datasets/ml-100k/ 146 | 147 | .recstudio/ 148 | 149 | nni-experiments/* 150 | !nni-experiments/config/ 151 | !nni-experiments/search_space/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2022] [ustcml] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include recstudio/ *.yaml 2 | recursive-include recstudio/dataset_demo *.inter *.item *.kg *.link *.user *.txt 3 | -------------------------------------------------------------------------------- /assets/recstudio_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/assets/recstudio_framework.png -------------------------------------------------------------------------------- /assets/recstudio_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/assets/recstudio_logo.png -------------------------------------------------------------------------------- /nni-experiments/config/bpr.yaml: -------------------------------------------------------------------------------- 1 | experimentName: BPR-ml-100k # Mnemonic name of the experiment, which will be shown in WebUI and nnictl 2 | searchSpaceFile: ../search_space/bpr.yaml # Path to the YAML file containing the search space 3 | 4 | trialCommand: python3 run.py -m BPR -d ml-100k --mode tune # Command to launch a trial 5 | trialCodeDirectory: ../.. # Path to the directory containing trial source files 6 | trialConcurrency: 4 # Specify how many trials should be run concurrently 7 | trialGpuNumber: 1 8 | 9 | maxExperimentDuration: 100d # Stop generating trials once this duration is reached 10 | maxTrialNumber: 1000 # Limit the number of trials to create 11 | maxTrialDuration: ~ # Limit the duration of each trial job 12 | 13 | experimentWorkingDirectory: ../ # The directory to place logs, checkpoints, metadata, etc. 14 | 15 | 16 | tuner: # Tuning algorithm 17 | name: TPE 18 | classArgs: 19 | optimize_mode: maximize # Optimization direction, consistent with the first metric in 20 | # train/val_metrics and train/test_metrics 21 | 22 | 23 | # assessor: # used to terminate trials early 24 | # name: Curvefitting 25 | # classArgs: 26 | # epoch_num: 200 27 | # start_step: 20 28 | # threshold: 0.9 29 | # gap: 1 30 | 31 | # assessor: 32 | # name: Medianstop 33 | # classArgs: 34 | # optimize_mode: maximize 35 | # start_step: 200 36 | 37 | 38 | trainingService: 39 | platform: local 40 | useActiveGpu: true 41 | maxTrialNumberPerGpu: 2 # how many trials can share one GPU 42 | gpuIndices: 0, 1 # GPUs visible to trial processes -------------------------------------------------------------------------------- /nni-experiments/config/sasrec.yaml: -------------------------------------------------------------------------------- 1 | experimentName: SASRec-ml-100k # Mnemonic name of the experiment, which will be shown in WebUI and nnictl 2 | searchSpaceFile: ../search_space/sasrec.yaml # Path to the YAML file containing the search space 3 | 4 | trialCommand: python3 run.py -m SASRec -d ml-100k --mode tune # Command to launch a trial 5 | trialCodeDirectory: ../..
# Path to the directory containing trial source files 6 | trialConcurrency: 4 # Specify how many trials should be run concurrently 7 | trialGpuNumber: 1 8 | 9 | maxExperimentDuration: 100d # Stop generating trials once this duration is reached 10 | maxTrialNumber: 1000 # Limit the number of trials to create 11 | maxTrialDuration: ~ # Limit the duration of each trial job 12 | 13 | experimentWorkingDirectory: ../ # The directory to place log, checkpoint, metadata, and other run-time stuff 14 | 15 | 16 | tuner: # Tuning algorithm 17 | name: TPE 18 | classArgs: 19 | optimize_mode: maximize # Optimization direction, consistent with the first metric in 20 | # train/val_metrics and train/test_metrics 21 | 22 | 23 | # assessor: # used to terminate trials early 24 | # name: Curvefitting 25 | # classArgs: 26 | # epoch_num: 200 27 | # start_step: 20 28 | # threshold: 0.9 29 | # gap: 1 30 | 31 | # assessor: 32 | # name: Medianstop 33 | # classArgs: 34 | # optimize_mode: maximize 35 | # start_step: 200 36 | 37 | 38 | trainingService: 39 | platform: local 40 | useActiveGpu: true 41 | maxTrialNumberPerGpu: 2 # how many trials can share one GPU 42 | gpuIndices: 0, 1 # GPUs visible to trial processes -------------------------------------------------------------------------------- /nni-experiments/search_space/bpr.yaml: -------------------------------------------------------------------------------- 1 | train/learning_rate: 2 | _type: choice 3 | _value: [0.001] 4 | 5 | train/weight_decay: 6 | _type: choice 7 | _value: [0.0001, 0.0005, 0.001, 0.005] -------------------------------------------------------------------------------- /nni-experiments/search_space/sasrec.yaml: -------------------------------------------------------------------------------- 1 | train/learning_rate: 2 | _type: choice 3 | _value: [0.0005, 0.001, 0.005] 4 | 5 | model/dropout_rate: 6 | _type: choice 7 | _value: [0.2, 0.5] -------------------------------------------------------------------------------- /recstudio/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | __version__ = '0.0.2' 6 | LOG_DIR = r"./log/" 7 | DEFAULT_CACHE_DIR = r"./.recstudio/" 8 | -------------------------------------------------------------------------------- /recstudio/ann/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/ann/__init__.py -------------------------------------------------------------------------------- /recstudio/data/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset, SeqDataset, UserDataset, FullSeqDataset 2 | from recstudio.data.advance_dataset import ALSDataset 3 | 4 | import os 5 | 6 | supported_dataset = [] 7 | for f in os.listdir(os.path.join(os.path.dirname(__file__), 'config')): 8 | if f != "all.yaml": 9 | supported_dataset.append(f.split(".")[0]) 10 | -------------------------------------------------------------------------------- /recstudio/data/config/all.yaml: -------------------------------------------------------------------------------- 1 | url: ~ 2 | user_id_field: &u user_id:token # `&u` defines a YAML anchor; see the note just below 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 
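# Note on the `&u` / `*u` syntax used in this file and the other dataset configs:
# these are standard YAML anchors and aliases, not RecStudio-specific. `&u user_id:token`
# stores the scalar "user_id:token" under the anchor `u`, and every later alias `*u`
# (e.g. in `inter_feat_field` below) expands to that same scalar when the file is parsed.
# Renaming an id column therefore only requires editing the anchored line, for example:
#   user_id_field: &u uid:token   # every *u below now expands to "uid:token"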
8 | encoding_method: utf-8 9 | 10 | inter_feat_name: ~ 11 | inter_feat_field: [*u, *i, *r, *t] 12 | inter_feat_header: ~ 13 | 14 | user_feat_name: ~ 15 | user_feat_field: [[*u, age:token, gender:token, occupation:token]] 16 | user_feat_header: ~ 17 | 18 | 19 | item_feat_name: ~ 20 | item_feat_field: [[*i, movie_title:token_seq:" ", release_year:token, class:token_seq:" "]] 21 | item_feat_header: ~ 22 | 23 | 24 | field_separator: "\t" 25 | min_user_inter: 0 26 | min_item_inter: 0 27 | field_max_len: ~ # a YAML-format dict, for example 28 | # field_max_len: 29 | # age: 1 30 | # gender: 1 31 | # occupation: 1 32 | low_rating_thres: ~ # low-rating threshold, used to drop low-rating interactions 33 | # drop_low_rating: True # if true, the interactions with rating lower than `rating_thres` would be dropped. 34 | 35 | # negative rating threshold: interactions with rating below the threshold would be regarded as negative interactions. 36 | # Note that when `drop_low_rating` is True, only interactions with rating above `low_rating_thres` and below `negative_rating_thres` 37 | # would be regarded as negative interactions. 38 | # The threshold value should be larger than `low_rating_thres`. If not, the threshold would be invalid, which means all kept interactions 39 | # would be regarded as positives. 40 | # negative_rating_thres: 0.0 41 | 42 | # `binarized_rating_thres` controls whether to binarize the rating to 0/1 with the given threshold. 43 | # If set, ratings above the threshold would be mapped to 1 and ratings below it would be mapped to 0; 44 | # If not set, the ratings would not be changed. 45 | binarized_rating_thres: ~ 46 | 47 | drop_dup: True 48 | max_seq_len: 20 49 | 50 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 51 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 52 | mapped_feat_field: [*u, *i] 53 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 54 | network_feat_header: [0, 0] 55 | 56 | # sklearn.preprocessing (arguments are supported; args are separated with blank spaces; the same holds for tuples) 57 | # MinMaxScaler(), StandardScaler(), RobustScaler(), MaxAbsScaler() 58 | # Binarizer(), KBinsDiscretizer(encode="ordinal") 59 | # Normalizer() 60 | # KernelCenterer() 61 | # QuantileTransformer(), SplineTransformer() 62 | # Customized: LogTransformer(), or use FunctionTransformer(...) 63 | float_field_preprocess: ~ # [float_field:MinMaxScaler(), ...] 64 | 65 | save_cache: False # whether to save processed dataset to cache.
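The `float_field_preprocess` entries above name scikit-learn transformers by their constructor expression. As a rough illustration (this is not RecStudio's internal code), applying an entry such as `timestamp:StandardScaler()` amounts to fitting the named transformer on the float column and replacing the column with the transformed values:

import numpy as np
from sklearn.preprocessing import StandardScaler

# Toy stand-in for a timestamp column, shaped (n_rows, 1) as sklearn expects.
timestamps = np.array([[874965758.0], [875071561.0], [878542960.0]])
scaled = StandardScaler().fit_transform(timestamps)  # zero mean, unit variance
print(scaled.ravel())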
66 | -------------------------------------------------------------------------------- /recstudio/data/config/amazon-beauty.yaml: -------------------------------------------------------------------------------- 1 | url: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Beauty.csv 2 | user_id_field: &u user_id:token # TODO: comments for &u and *u 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: ratings_Beauty.csv 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | item_feat_name: ~ 18 | item_feat_field: ~ 19 | item_feat_header: ~ 20 | 21 | field_separator: "," 22 | min_user_inter: 5 23 | min_item_inter: 5 24 | field_max_len: ~ 25 | low_rating_thres: 3.0 26 | max_seq_len: 50 27 | 28 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 29 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 30 | mapped_feat_field: [*u, *i] 31 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 32 | 33 | save_cache: True # whether to save processed dataset to cache. 34 | -------------------------------------------------------------------------------- /recstudio/data/config/amazon-books.yaml: -------------------------------------------------------------------------------- 1 | url: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Books.csv 2 | user_id_field: &u user_id:token 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: ratings_Books.csv 10 | inter_feat_field: [*u, *i, *r, *t,] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | item_feat_name: ~ 18 | item_feat_field: ~ 19 | item_feat_header: ~ 20 | 21 | use_fields: ~ # TODO: 22 | field_separator: "," 23 | min_user_inter: 5 24 | min_item_inter: 5 25 | field_max_len: ~ 26 | low_rating_thres: ~ 27 | max_seq_len: 20 28 | 29 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 30 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 31 | mapped_feat_field: [*u, *i] 32 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 33 | 34 | save_cache: True # whether to save processed dataset to cache. 
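amazon-beauty above sets `low_rating_thres: 3.0` while amazon-books leaves it unset. Per the comments in data/config/all.yaml, the threshold drops low-rating interactions, and `binarized_rating_thres` turns the surviving ratings into 0/1 labels. A minimal pandas sketch of that logic (an illustration under those assumptions, not the library's dataset code, which may use strict inequalities):

import pandas as pd

inter = pd.DataFrame({"user_id": [1, 1, 2], "item_id": [10, 11, 10], "rating": [5.0, 2.0, 4.0]})
low_rating_thres = binarized_rating_thres = 3.0
inter = inter[inter["rating"] >= low_rating_thres]  # drop low-rating interactions
inter["rating"] = (inter["rating"] >= binarized_rating_thres).astype(float)  # binarize survivors
print(inter)  # the rating-2.0 row is gone; remaining ratings are 1.0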
35 | -------------------------------------------------------------------------------- /recstudio/data/config/amazon-electronics.yaml: -------------------------------------------------------------------------------- 1 | url: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Electronics.csv 2 | user_id_field: &u user_id:token 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: ratings_Electronics.csv 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | item_feat_name: ~ 18 | item_feat_field: ~ 19 | item_feat_header: ~ 20 | 21 | use_fields: ~ # TODO: 22 | field_separator: "," 23 | min_user_inter: 5 24 | min_item_inter: 5 25 | field_max_len: ~ 26 | low_rating_thres: 3 27 | max_seq_len: 20 28 | 29 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 30 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 31 | mapped_feat_field: [*u, *i] 32 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 33 | 34 | save_cache: True # whether to save processed dataset to cache. 35 | -------------------------------------------------------------------------------- /recstudio/data/config/criteo.yaml: -------------------------------------------------------------------------------- 1 | url: https://rec.ustc.edu.cn/share/f519a9c0-e593-11ed-a011-2d240ca5a5b5 2 | user_id_field: ~ 3 | item_id_field: ~ 4 | rating_field: &r rating:float 5 | time_field: ~ 6 | time_format: ~ 7 | 8 | 9 | inter_feat_name: train.txt 10 | inter_feat_field: [*r, I1:float, I2:float, I3:float, I4:float, I5:float, I6:float, I7:float, I8:float, I9:float, I10:float, I11:float, I12:float, I13:float, C1:token, C2:token, C3:token, C4:token, C5:token, C6:token, C7:token, C8:token, C9:token, C10:token, C11:token, C12:token, C13:token, C14:token, C15:token, C16:token, C17:token, C18:token, C19:token, C20:token, C21:token, C22:token, C23:token, C24:token, C25:token, C26:token] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | 18 | item_feat_name: ~ 19 | item_feat_field: ~ 20 | item_feat_header: ~ 21 | 22 | 23 | field_separator: "\t" 24 | min_user_inter: 0 25 | min_item_inter: 0 26 | field_max_len: ~ 27 | low_rating_thres: ~ 28 | drop_dup: False 29 | max_seq_len: 20 30 | 31 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 32 | network_feat_name: ~ 33 | mapped_feat_field: ~ 34 | network_feat_field: ~ 35 | network_feat_header: ~ 36 | save_cache: True # whether to save processed dataset to cache. 
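Each YAML file in data/config registers one dataset name; recstudio/data/__init__.py earlier builds `supported_dataset` by listing this directory. To actually train on one of these datasets, the nni-experiments configs invoke `python3 run.py -m BPR -d ml-100k`; the equivalent from Python, assuming `recstudio.quickstart.run` keeps the model/dataset keywords suggested by quickstart/run.py and test/test_quickrun.py, is:

from recstudio.quickstart import run

# Train and evaluate BPR on the bundled ml-100k demo dataset with default configs.
run(model='BPR', dataset='ml-100k')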
-------------------------------------------------------------------------------- /recstudio/data/config/gowalla.yaml: -------------------------------------------------------------------------------- 1 | url: https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz 2 | user_id_field: &u user:token 3 | item_id_field: &i location_id:token 4 | rating_field: ~ 5 | time_field: &t check_in_time:str 6 | time_format: "%Y-%m-%dT%H:%M:%Sz" 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: loc-gowalla_totalCheckins.txt 10 | inter_feat_field: [*u, *t, latitude:float, longitude:float, *i] 11 | inter_feat_header: ~ 12 | 13 | user_feat_name: ~ 14 | user_feat_field: ~ 15 | user_feat_header: ~ 16 | 17 | item_feat_name: ~ 18 | item_feat_field: ~ 19 | item_feat_header: ~ 20 | 21 | use_fields: ~ 22 | field_separator: "\t" 23 | seq_separator: " " 24 | min_user_inter: 5 25 | min_item_inter: 5 26 | field_max_len: ~ 27 | low_rating_thres: ~ 28 | max_seq_len: 20 29 | 30 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 31 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 32 | mapped_feat_field: [*u, *i] 33 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 34 | 35 | save_cache: True # whether to save processed dataset to cache. 36 | -------------------------------------------------------------------------------- /recstudio/data/config/kuairand-pure.yaml: -------------------------------------------------------------------------------- 1 | url: ~ # Please visit https://kuairand.com/ to download the dataset 2 | user_id_field: &u user_id:token 3 | item_id_field: &i video_id:token 4 | rating_field: [is_click:float, is_like:float, is_follow:float, is_comment:float, is_forward:float, is_hate:float] 5 | time_field: &t date:float 6 | time_format: ~ 7 | 8 | 9 | inter_feat_name: log_standard_4_22_to_5_08_pure.csv 10 | inter_feat_field: [*u, *i, *t, hourmin:float, time_ms:float, is_click:float, is_like:float, is_follow:float, is_comment:float, is_forward:float, is_hate:float, long_view:float, play_time_ms:float, duration_ms:float, profile_stay_time:float, comment_stay_time:float, is_profile_enter:float, is_rand:float, tab:float] 11 | inter_feat_header: 0 12 | 13 | user_feat_name: [user_features_pure.csv] 14 | user_feat_field: [[*u, user_active_degree:token, is_lowactive_period:float, is_live_streamer:float, is_video_author:float, follow_user_num:float, follow_user_num_range:token, fans_user_num:float, fans_user_num_range:token, friend_user_num:float, friend_user_num_range:token, register_days:float, register_days_range:token, onehot_feat0:float, onehot_feat1:float, onehot_feat2:float, onehot_feat3:float, onehot_feat4:float, onehot_feat5:float, onehot_feat6:float, onehot_feat7:float, onehot_feat8:float, onehot_feat9:float, onehot_feat10:float, onehot_feat11:float, onehot_feat12:float, onehot_feat13:float, onehot_feat14:float, onehot_feat15:float, onehot_feat16:float, onehot_feat17:float]] 15 | user_feat_header: 0 16 | 17 | 18 | item_feat_name: ~ #[video_features_basic_pure.csv, video_features_statistic_pure.csv] 19 | item_feat_field: ~ # [[...]] 20 | item_feat_header: 0 21 | 22 | 23 | field_separator: "," 24 | min_user_inter: 0 25 | min_item_inter: 0 26 | field_max_len: ~ 27 | low_rating_thres: ~ 28 | max_seq_len: ~ 29 | 30 | save_cache: True 31 | --------------------------------------------------------------------------------
/recstudio/data/config/ml-100k.yaml: -------------------------------------------------------------------------------- 1 | url: "recstudio:dataset_demo/ml-100k" 2 | user_id_field: &u user_id:token # see the note on YAML anchors (&u / *u) in all.yaml 3 | item_id_field: &i item_id:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | 9 | inter_feat_name: ml-100k.inter 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: 0 12 | 13 | user_feat_name: [ml-100k.user] 14 | user_feat_field: [[*u, age:token, gender:token, occupation:token, zip_code:token]] 15 | user_feat_header: 0 16 | 17 | 18 | item_feat_name: ~ # [ml-100k.item] 19 | item_feat_field: [[*i, movie_title:token_seq:" ", release_year:token, class:token_seq:" "]] 20 | item_feat_header: 0 21 | 22 | 23 | field_separator: "\t" 24 | min_user_inter: 0 25 | min_item_inter: 0 26 | field_max_len: ~ 27 | low_rating_thres: 3.0 28 | max_seq_len: 20 29 | 30 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 31 | network_feat_name: ~ # [[social.txt], [ml-100k.kg, ml-100k.link]] 32 | # mapped_feat_field: [*u, *i] 33 | mapped_feat_field: [[*u, *u], [*i, ~, *i]] 34 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, relation_id:token, tail_id:token], [*i, entity_id:token]]] 35 | network_feat_header: [[0], [0, 0]] 36 | 37 | save_cache: False # whether to save processed dataset to cache. 38 | float_field_preprocess: [timestamp:StandardScaler()] -------------------------------------------------------------------------------- /recstudio/data/config/ml-10m.yaml: -------------------------------------------------------------------------------- 1 | url: https://files.grouplens.org/datasets/movielens/ml-10m.zip 2 | user_id_field: &u UserID:token # see the note on YAML anchors (&u / *u) in all.yaml 3 | item_id_field: &i MovieID:token 4 | rating_field: &r Rating:float 5 | time_field: &t Timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: ISO-8859-1 9 | inter_feat_name: ratings.dat 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: ~ 12 | 13 | 14 | user_feat_name: ~ 15 | user_feat_field: ~ 16 | user_feat_header: ~ 17 | 18 | 19 | item_feat_name: [movies.dat] 20 | item_feat_field: [[*i, Title:token_seq:" ", Genres:token_seq:"|"]] 21 | item_feat_header: ~ 22 | 23 | 24 | use_fields: ~ 25 | field_separator: "::" 26 | min_user_inter: 5 27 | min_item_inter: 5 28 | field_max_len: ~ 29 | low_rating_thres: 3.0 30 | max_seq_len: 20 31 | 32 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 33 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 34 | mapped_feat_field: [*u, *i] 35 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 36 | network_feat_header: [~, ~] 37 | 38 | 39 | save_cache: False # whether to save processed dataset to cache.
40 | -------------------------------------------------------------------------------- /recstudio/data/config/ml-1m.yaml: -------------------------------------------------------------------------------- 1 | url: https://files.grouplens.org/datasets/movielens/ml-1m.zip 2 | user_id_field: &u UserID:token # see the note on YAML anchors (&u / *u) in all.yaml 3 | item_id_field: &i MovieID:token 4 | rating_field: &r Rating:float 5 | time_field: &t Timestamp:float 6 | time_format: ~ 7 | 8 | 9 | encoding_method: ISO-8859-1 10 | inter_feat_name: ratings.dat 11 | inter_feat_field: [*u, *i, *r, *t] 12 | inter_feat_header: ~ 13 | 14 | 15 | user_feat_name: ~ #[users.dat] 16 | user_feat_field: [[*u, Gender:token, Age:token, Occupation:token, Zip-code:token]] 17 | user_feat_header: ~ 18 | 19 | 20 | item_feat_name: ~ #[movies.dat] 21 | item_feat_field: [[*i, Title:token_seq:" ", Genres:token_seq:"|"]] 22 | item_feat_header: ~ 23 | 24 | 25 | use_fields: ~ 26 | field_separator: "::" 27 | min_user_inter: 5 28 | min_item_inter: 5 29 | field_max_len: ~ 30 | low_rating_thres: 3.0 31 | max_seq_len: 20 32 | 33 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 34 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 35 | mapped_feat_field: [*u, *i] 36 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 37 | network_feat_header: [~, ~] 38 | 39 | 40 | save_cache: False # whether to save processed dataset to cache. 41 | -------------------------------------------------------------------------------- /recstudio/data/config/ml-20m.yaml: -------------------------------------------------------------------------------- 1 | url: https://files.grouplens.org/datasets/movielens/ml-20m.zip 2 | user_id_field: &u userId:token 3 | item_id_field: &i movieId:token 4 | rating_field: &r rating:float 5 | time_field: &t timestamp:float 6 | time_format: ~ 7 | 8 | encoding_method: ISO-8859-1 9 | inter_feat_name: ratings.csv 10 | inter_feat_field: [*u, *i, *r, *t] 11 | inter_feat_header: 0 12 | 13 | 14 | user_feat_name: ~ 15 | user_feat_field: ~ 16 | user_feat_header: ~ 17 | 18 | 19 | item_feat_name: [movies.csv] 20 | item_feat_field: [[*i, title:token_seq:" ", genres:token_seq:"|"]] 21 | item_feat_header: 0 22 | 23 | 24 | use_fields: ~ 25 | field_separator: "," 26 | min_user_inter: 5 27 | min_item_inter: 5 28 | field_max_len: ~ 29 | low_rating_thres: ~ 30 | max_seq_len: 20 31 | 32 | # network features, including social network and knowledge graph; the first two fields are remapped to the corresponding features 33 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 34 | mapped_feat_field: [*u, *i] 35 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 36 | network_feat_header: [~, ~] 37 | 38 | 39 | save_cache: True # whether to save processed dataset to cache.
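The three-part field spec used throughout these configs is `name:type[:separator]`; for instance `Genres:token_seq:"|"` declares a token-sequence column whose cells are split on `|`. Conceptually the parsing looks like this (a sketch, not the actual code in recstudio/data/dataset.py):

# A token_seq cell is split on the declared separator; each token then gets its own integer id.
def parse_token_seq(value, sep):
    return [tok for tok in value.split(sep) if tok]

print(parse_token_seq("Action|Adventure|Comedy", "|"))  # ['Action', 'Adventure', 'Comedy']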
40 | -------------------------------------------------------------------------------- /recstudio/data/config/tmall.yaml: -------------------------------------------------------------------------------- 1 | url: https://rec.ustc.edu.cn/share/62299ea0-e083-11ec-8586-b7917c2cff26 2 | user_id_field: &u use_ID:token 3 | item_id_field: &i ite_ID:token 4 | rating_field: &r act_ID:float 5 | time_field: &t time:float 6 | time_format: ~ 7 | 8 | encoding_method: utf-8 9 | inter_feat_name: ijcai2016_taobao.csv 10 | inter_feat_field: [*u, sel_ID:token, *i, cat_id:token, *r, *t] 11 | inter_feat_header: 0 12 | 13 | 14 | user_feat_name: ~ 15 | user_feat_field: ~ 16 | user_feat_header: ~ 17 | 18 | 19 | item_feat_name: ~ 20 | item_feat_field: ~ 21 | item_feat_header: ~ 22 | 23 | 24 | use_fields: ~ 25 | field_separator: "," 26 | min_user_inter: 5 27 | min_item_inter: 5 28 | field_max_len: ~ 29 | low_rating_thres: ~ 30 | max_seq_len: 50 31 | 32 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 33 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 34 | mapped_feat_field: [*u, *i] 35 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 36 | network_feat_header: [~, ~] 37 | 38 | 39 | save_cache: True # whether to save processed dataset to cache. 40 | -------------------------------------------------------------------------------- /recstudio/data/config/yelp.yaml: -------------------------------------------------------------------------------- 1 | url: https://rec.ustc.edu.cn/share/cdc6de70-2f87-11ed-b5db-4d1a26914a90 2 | user_id_field: &u user_id:token # TODO: comments for &u and *u 3 | item_id_field: &i business_id:token 4 | rating_field: &r stars:float 5 | time_field: &t date:float 6 | time_format: ~ 7 | 8 | 9 | encoding_method: utf-8 10 | inter_feat_name: yelp_inter.csv 11 | inter_feat_field: [review_id:token, *u, *i, *r, *t] 12 | inter_feat_header: 0 13 | 14 | 15 | user_feat_name: ~ #[yelp_user.csv] 16 | user_feat_field: [[*u, user_name:token, yelping_since:float, fans:float, average_stars:float]] 17 | user_feat_header: 0 18 | 19 | 20 | item_feat_name: ~ #[yelp_item.csv] 21 | item_feat_field: [[business_id:token, business_name:token, city:token, state:token, postal_code:token, latitude:float, longitude:float, business_stars:float, 'categories:token_seq:", "']] 22 | item_feat_header: 0 23 | 24 | 25 | use_fields: ~ # TODO: 26 | field_separator: "," 27 | min_user_inter: 5 28 | min_item_inter: 5 29 | field_max_len: ~ 30 | low_rating_thres: 3 31 | max_seq_len: 20 32 | 33 | # network feature, including social network and knowledge graph, the first two fields are remapped the corresponding features 34 | network_feat_name: ~ #[[social.txt], [ml-100k.kg, ml-100k.link]] 35 | mapped_feat_field: [*u, *i] 36 | network_feat_field: [[[source_id:token, target_id:token]], [[head_id:token, tail_id:token, relation_id:token], [*i, entity_id:token]]] 37 | network_feat_header: [~, ~] 38 | 39 | 40 | save_cache: True # whether to save processed dataset to cache. 
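Most of the configs above set `min_user_inter` / `min_item_inter` to 5, i.e. the usual 5-core filter. Because dropping sparse items can push users back under the threshold (and vice versa), such filtering is typically applied repeatedly until stable. A pandas sketch of that idea (the actual implementation lives in recstudio/data/dataset.py and may differ):

import pandas as pd

def k_core(df: pd.DataFrame, user_col: str, item_col: str, k: int = 5) -> pd.DataFrame:
    # Alternately drop users and items with fewer than k interactions until nothing changes.
    while True:
        before = len(df)
        df = df[df.groupby(user_col)[user_col].transform("size") >= k]
        df = df[df.groupby(item_col)[item_col].transform("size") >= k]
        if len(df) == before:
            return df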
41 | -------------------------------------------------------------------------------- /recstudio/dataset_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/dataset_demo/__init__.py -------------------------------------------------------------------------------- /recstudio/dataset_demo/ml-100k/social.txt: -------------------------------------------------------------------------------- 1 | source_id target_id 2 | 391 184 3 | 343 738 4 | 872 542 5 | 247 55 6 | 606 461 7 | 574 195 8 | 464 810 9 | 187 929 10 | 600 545 11 | 583 662 12 | 263 533 13 | 432 65 14 | 561 137 15 | 363 496 16 | 836 899 17 | 192 907 18 | 664 627 19 | 213 312 20 | 523 110 21 | 722 883 22 | 243 456 23 | 844 374 24 | 353 424 25 | 225 616 26 | 327 445 27 | 787 685 28 | 399 132 29 | 522 415 30 | 87 284 31 | 196 417 32 | 911 870 33 | 800 85 34 | 598 868 35 | 290 142 36 | 632 379 37 | 292 725 38 | 230 358 39 | 331 874 40 | 304 129 41 | 75 162 42 | 647 360 43 | 771 396 44 | 720 306 45 | 833 839 46 | 820 779 47 | 201 125 48 | 95 301 49 | 369 726 50 | 499 18 51 | 249 491 52 | 88 684 53 | 910 384 54 | 239 248 55 | 936 894 56 | 897 933 57 | 672 37 58 | 532 724 59 | 915 229 60 | 478 841 61 | 510 761 62 | 557 190 63 | 717 255 64 | 311 211 65 | 516 482 66 | 330 398 67 | 711 875 68 | 643 530 69 | 473 512 70 | 834 355 71 | 115 209 72 | 146 70 73 | 394 509 74 | 751 666 75 | 615 75 76 | 570 467 77 | 566 492 78 | 831 281 79 | 116 753 80 | 593 677 81 | 935 522 82 | 215 587 83 | 518 213 84 | 479 105 85 | 316 635 86 | 423 243 87 | 726 6 88 | 120 152 89 | 375 569 90 | 602 852 91 | 100 83 92 | 812 584 93 | 160 778 94 | 366 60 95 | 608 437 96 | 716 165 97 | 714 815 98 | 367 661 99 | 492 830 100 | 18 723 101 | 401 1 102 | 1000 13 103 | -------------------------------------------------------------------------------- /recstudio/model/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import ae, fm, kg, mf, seq, basemodel -------------------------------------------------------------------------------- /recstudio/model/ae/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | shuffle: True 3 | -------------------------------------------------------------------------------- /recstudio/model/ae/config/multidae.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 200 3 | 4 | model: 5 | embed_dim: 200 6 | dropout: 0.5 7 | encoder_dims: [64,32] 8 | decoder_dims: [32,64] 9 | activation: relu 10 | 11 | train: 12 | batch_size: 256 13 | epochs: 200 14 | learner: adam 15 | learning_rate: 0.01 16 | weight_decay: 0.00001 17 | -------------------------------------------------------------------------------- /recstudio/model/ae/config/multivae.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 200 3 | 4 | model: 5 | embed_dim: 600 6 | dropout_rate: 0.5 7 | encoder_dims: [200] 8 | decoder_dims: [200] 9 | activation: tanh 10 | 11 | train: 12 | anneal_max: 0.2 13 | anneal_total_step: 2000000 14 | batch_size: 500 15 | epochs: 500 16 | learner: adam 17 | learning_rate: 0.001 18 | weight_decay: 1e-5 19 | early_stop_patience: 100 20 | -------------------------------------------------------------------------------- /recstudio/model/ae/multidae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data.dataset import UserDataset 3 | from recstudio.model.basemodel import BaseRetriever, Recommender 4 | from recstudio.model.loss_func import SoftmaxLoss 5 | from recstudio.model.module import MLPModule 6 | from recstudio.model.scorer import InnerProductScorer 7 | 8 | 9 | class MultiDAEQueryEncoder(torch.nn.Module): 10 | def __init__(self, fiid, num_items, embed_dim, dropout_rate, 11 | encoder_dims, decoder_dims, activation='relu'): 12 | super().__init__() 13 | assert encoder_dims[-1] == decoder_dims[0], 'expecting the output size of '\ 14 | 'encoder is equal to the input size of decoder.' 15 | assert encoder_dims[0] == decoder_dims[-1], 'expecting the output size of '\ 16 | 'decoder is equal to the input size of encoder.'
17 | 18 | self.fiid = fiid 19 | self.item_embedding = torch.nn.Embedding(num_items, embed_dim, 0) 20 | self.dropout = torch.nn.Dropout(p=dropout_rate) 21 | 22 | self.encoder_decoder = torch.nn.Sequential( 23 | MLPModule([embed_dim]+encoder_dims+decoder_dims[1:], activation), 24 | torch.nn.Linear(decoder_dims[-1], embed_dim) 25 | ) 26 | 27 | def forward(self, batch): 28 | # encode: sum the embeddings of the user's interacted items, normalized by sqrt of the history length 29 | seq_emb = self.item_embedding(batch["in_"+self.fiid]) 30 | non_zero_num = batch["in_"+self.fiid].count_nonzero(dim=1).unsqueeze(-1) 31 | seq_emb = seq_emb.sum(1) / non_zero_num.pow(0.5) 32 | h = self.dropout(seq_emb) 33 | 34 | return self.encoder_decoder(h) 35 | 36 | 37 | class MultiDAE(BaseRetriever): 38 | 39 | # def add_model_specific_args(parent_parser): 40 | # parent_parser = Recommender.add_model_specific_args(parent_parser) 41 | # parent_parser.add_argument_group('MultiDAE') 42 | # parent_parser.add_argument("--dropout", type=int, default=0.5, help='dropout rate for MLP layers') 43 | # parent_parser.add_argument("--encoder_dims", type=int, nargs='+', default=64, help='MLP layer size for encoder') 44 | # parent_parser.add_argument("--decoder_dims", type=int, nargs='+', default=64, help='MLP layer size for decoder') 45 | # parent_parser.add_argument("--activation", type=str, default='relu', help='activation function for MLP layers') 46 | # return parent_parser 47 | 48 | def _get_dataset_class(): 49 | return UserDataset 50 | 51 | def _get_item_encoder(self, train_data): 52 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, 0) 53 | 54 | def _get_query_encoder(self, train_data): 55 | model_config = self.config['model'] 56 | return MultiDAEQueryEncoder(train_data.fiid, train_data.num_items, 57 | self.embed_dim, model_config['dropout'], model_config['encoder_dims'], 58 | model_config['decoder_dims'], model_config['activation']) 59 | 60 | def _get_score_func(self): 61 | return InnerProductScorer() 62 | 63 | def _get_sampler(self, train_data): 64 | return None 65 | 66 | def _get_loss_func(self): 67 | return SoftmaxLoss() 68 | -------------------------------------------------------------------------------- /recstudio/model/basemodel/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.basemodel.recommender import Recommender 2 | from recstudio.model.basemodel.baseranker import BaseRanker 3 | from recstudio.model.basemodel.baseretriever import BaseRetriever 4 | # from recstudio.model.basemodel.sequential_retriever import SequentialRetriever -------------------------------------------------------------------------------- /recstudio/model/basemodel/basemodel.yaml: -------------------------------------------------------------------------------- 1 | # This is a configuration file for all models, which can be regarded as an example 2 | # configuration file. 3 | 4 | # All the configuration parameters are divided into four groups: data, model, train and eval. 5 | # - data: data group contains some parameters related to dataset construction. For example, 6 | # `fm_eval` controls whether a sample is one interaction or all interactions for one user 7 | # in evaluation. 8 | # - model: model group contains some parameters related to the model size (or model architecture). 9 | # - train: train group contains parameters for the training procedure, such as epochs, learning rate. 10 | # - eval: eval group contains parameters for the evaluation procedure (validation and test), such as 11 | # batch size.
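# A model's own config file only needs to override the keys it cares about. For
# example, recstudio/model/fm/config/deepfm.yaml (further down in this tree) only
# defines a `model:` group with embed_dim, mlp_layer, activation and dropout;
# every key it leaves out falls back to the defaults declared below.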
12 | 13 | 14 | data: # params related to dataset 15 | binarized_rating_thres: ~ # rating threshold for binarizing ratings to 0/1 (~ means no binarization) 16 | fm_eval: False # whether to set fm_eval to organize the batch data as one interaction per sample. 17 | 18 | # the sampler for dataset, only uniform sampler is supported now. 19 | neg_count: 0 20 | sampler: ~ # [uniform] 21 | shuffle: True 22 | split_mode: user_entry # [user, entry, user_entry] 23 | split_ratio: [0.8,0.1,0.1] # list or int type, list type for split by ratio, int type for leave one out 24 | 25 | 26 | model: 27 | embed_dim: 64 # embedding dimension for embedding layers, usually for item and user embeddings 28 | item_bias: False # whether to add item bias 29 | 30 | 31 | train: 32 | accelerator: gpu # [cpu, gpu, dp] 33 | 34 | # ann: {index: 'IVFx,Flat', parameter: ~} ## 1 HNSWx,Flat; 2 Flat; 3 IVFx,Flat ## {nprobe: 1} {efSearch: 1} 35 | ann: ~ 36 | batch_size: 512 37 | 38 | early_stop_mode: max 39 | early_stop_patience: 10 40 | 41 | epochs: 1000 42 | gpu: 1 43 | grad_clip_norm: ~ 44 | init_method: xavier_normal # [xavier_normal, normal] 45 | item_batch_size: 1024 # batch size for items to get all item features or get full item scores. 46 | learner: adam 47 | learning_rate: 0.001 48 | num_threads: 10 49 | 50 | # negative sampler configuration in training procedure 51 | # `method` describes the retrieving method used to retrieve negative items with a retriever. 52 | sampling_method: none # [none, sir, dns, toprand, top&rand, brute] 53 | 54 | # `sampler` describes the negative sampler used to train models. 55 | sampler: uniform # [uniform, pop, midx-uni, midx-pop, cluster-uni, cluster-pop] 56 | 57 | negative_count: 0 # number of negative items to be sampled 58 | excluding_hist: False # whether to exclude user history in negative sampling 59 | 60 | # learning rate scheduler, refer to https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate 61 | scheduler: ~ # [onplateau, exponential] 62 | 63 | seed: 2022 # random seed, usually 42 is a magic number 64 | weight_decay: 0.0 # weight decay for the optimizer 65 | tensorboard_path: ~ 66 | 67 | 68 | eval: 69 | batch_size: 128 70 | cutoff: [5, 10, 20] 71 | val_metrics: [ndcg, recall] 72 | val_n_epoch: 1 73 | test_metrics: [ndcg, recall, precision, map, mrr, hit] 74 | topk: 100 75 | save_path: './saved/' 76 | -------------------------------------------------------------------------------- /recstudio/model/debias/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/model/debias/__init__.py -------------------------------------------------------------------------------- /recstudio/model/fm/__init__.py: -------------------------------------------------------------------------------- 1 | from .lr import LR 2 | from .fm import FM 3 | from .dcn import DCN 4 | from .nfm import NFM 5 | from .deepfm import DeepFM 6 | -------------------------------------------------------------------------------- /recstudio/model/fm/afm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from ..
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | r""" 9 | AFM 10 | ###################### 11 | 12 | Paper Reference: 13 | Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks (IJCAI'17) 14 | https://dl.acm.org/doi/10.5555/3172077.3172324 15 | """ 16 | 17 | class AFM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | num_fields = len(self.fields) - 1 25 | self.linear = ctr.LinearLayer(self.fields, train_data) 26 | self.afm = nn.Sequential( 27 | OrderedDict([ 28 | ("embeddings", 29 | ctr.Embeddings( 30 | self.fields, 31 | self.embed_dim, 32 | train_data)), 33 | ("afm_layer", 34 | ctr.AFMLayer( 35 | self.embed_dim, 36 | self.config['model']['attention_dim'], 37 | num_fields, 38 | self.config['model']['dropout'])) 39 | ])) 40 | 41 | def score(self, batch): 42 | lr_score = self.linear(batch) 43 | afm_score = self.afm(batch) 44 | return {'score' : lr_score + afm_score} 45 | 46 | def _get_loss_func(self): 47 | return loss_func.BCEWithLogitLoss() 48 | -------------------------------------------------------------------------------- /recstudio/model/fm/afn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | from recstudio.data.dataset import TripletDataset 5 | from .. import loss_func 6 | from ..basemodel import BaseRanker 7 | from ..module import ctr, MLPModule 8 | 9 | r""" 10 | AFN 11 | ###################### 12 | 13 | Paper Reference: 14 | Adaptive Factorization Network: Learning Adaptive-Order Feature Interactions (AAAI'20) 15 | https://arxiv.org/abs/1909.03276 16 | """ 17 | 18 | class AFN(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | model_config = self.config['model'] 26 | num_fields = len(self.fields) - 1 27 | self.afn = nn.Sequential( 28 | OrderedDict([ 29 | ("embeddings", 30 | ctr.Embeddings( 31 | self.fields, 32 | self.embed_dim, 33 | train_data)), 34 | ("logtransform_layer", 35 | ctr.LogTransformLayer( 36 | num_fields, 37 | model_config['log_hidden_size'])), 38 | ("mlp", 39 | MLPModule( 40 | [model_config['log_hidden_size'] * self.embed_dim] + model_config['mlp_layer'] + [1], 41 | model_config['activation'], 42 | model_config['dropout'], 43 | last_activation=False, 44 | last_bn=False)) 45 | ])) 46 | if model_config['ensemble']: 47 | self.ensemble_embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 48 | self.ensemble_mlp = MLPModule( 49 | [num_fields * self.embed_dim] + model_config['ensemble_mlp_layer'] + [1], 50 | model_config['ensemble_activation'], 51 | model_config['ensemble_dropout'], 52 | last_activation=False, 53 | last_bn=False 54 | ) 55 | self.ensemble_fc = nn.Linear(2, 1) 56 | 57 | 58 | def score(self, batch): 59 | afn_score = self.afn(batch) 60 | if self.config['model']['ensemble']: 61 | ensemble_emb = self.ensemble_embedding(batch) 62 | ensemble_mlp_score = self.ensemble_mlp(ensemble_emb.flatten(1)) 63 | score = self.ensemble_fc( 64 | torch.cat([afn_score, ensemble_mlp_score], dim=-1) 65 | ) 66 | else: 67 | score = afn_score 68 | score = score.squeeze(-1) 69 | return {'score' : score} 70 | 71 | def _get_loss_func(self): 72 | return 
loss_func.BCEWithLogitLoss() 73 | -------------------------------------------------------------------------------- /recstudio/model/fm/aoanet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | from recstudio.data.dataset import TripletDataset 5 | from .. import loss_func 6 | from ..basemodel import BaseRanker 7 | from ..module import ctr, MLPModule 8 | 9 | r""" 10 | AOANet 11 | ###################### 12 | 13 | Paper Reference: 14 | Architecture and Operation Adaptive Network for Online Recommendations (KDD'21) 15 | https://dl.acm.org/doi/10.1145/3447548.3467133 16 | """ 17 | 18 | class AOANet(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | model_config = self.config['model'] 26 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 27 | self.mlp = MLPModule( 28 | [self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'], 29 | model_config['activation'], 30 | model_config['dropout'], 31 | last_activation=False, 32 | last_bn=False) 33 | self.gin = ctr.GeneralizedInteractionNet( 34 | self.embedding.num_features, 35 | self.embed_dim, 36 | model_config['num_interaction_layers'], 37 | model_config['num_subspaces']) 38 | self.fc = nn.Linear(model_config['mlp_layer'][-1] + model_config['num_subspaces'] * self.embed_dim, 1) 39 | 40 | 41 | def score(self, batch): 42 | emb = self.embedding(batch) 43 | mlp_out = self.mlp(emb.flatten(1)) 44 | gin_out = self.gin(emb).flatten(1) 45 | score = self.fc(torch.cat([mlp_out, gin_out], dim=-1)).squeeze(-1) 46 | return {'score' : score} 47 | 48 | def _get_loss_func(self): 49 | return loss_func.BCEWithLogitLoss() 50 | -------------------------------------------------------------------------------- /recstudio/model/fm/autoint.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from recstudio.data.dataset import TripletDataset 3 | from .. 
import loss_func 4 | from ..basemodel import BaseRanker 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | AutoInt 9 | ###################### 10 | 11 | Paper Reference: 12 | AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks (CIKM'19) 13 | https://dl.acm.org/doi/abs/10.1145/3357384.3357925 14 | """ 15 | 16 | class AutoInt(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 24 | model_config = self.config['model'] 25 | if model_config['wide']: 26 | self.linear = ctr.LinearLayer(self.fields, train_data) 27 | if model_config['deep']: 28 | self.mlp = MLPModule([self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'] + [1], 29 | model_config['activation'], 30 | model_config['dropout'], 31 | last_activation=False, 32 | last_bn=False 33 | ) 34 | self.int = nn.Sequential(*[ 35 | ctr.SelfAttentionInteractingLayer( 36 | self.embed_dim if i == 0 else model_config['attention_dim'], 37 | n_head=model_config['n_head'], 38 | dropout=model_config['dropout'], 39 | residual=model_config['residual'], 40 | residual_project=model_config['residual_project'], 41 | layer_norm=model_config['layer_norm'] 42 | ) 43 | for i in range(model_config['num_attention_layers'])]) 44 | self.fc = nn.Linear(self.embedding.num_features * self.embed_dim, 1) 45 | 46 | def score(self, batch): 47 | emb = self.embedding(batch) 48 | attn_out = self.int(emb) 49 | int_score = self.fc(attn_out.flatten(1)).squeeze(-1) 50 | score = int_score 51 | if self.config['model']['wide']: 52 | lr_score = self.linear(batch) 53 | score += lr_score 54 | if self.config['model']['deep']: 55 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 56 | score += mlp_score 57 | return {'score' : score} 58 | 59 | def _get_loss_func(self): 60 | return loss_func.BCEWithLogitLoss() 61 | -------------------------------------------------------------------------------- /recstudio/model/fm/ccpm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | CCPM 10 | ###################### 11 | 12 | Paper Reference: 13 | A Convolutional Click Prediction Model (CIKM'15) 14 | https://dl.acm.org/doi/10.1145/2806416.2806603 15 | """ 16 | 17 | class CCPM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.linear = ctr.LinearLayer(self.fields, train_data) 25 | model_config = self.config['model'] 26 | num_fields = len(self.fields) - 1 27 | self.conv = nn.Sequential( 28 | OrderedDict([ 29 | ("embeddings", 30 | ctr.Embeddings( 31 | self.fields, 32 | self.embed_dim, 33 | train_data)), 34 | ("conv_layer", 35 | ctr.ConvLayer( 36 | num_fields, 37 | channels=model_config['channels'], 38 | heights=model_config['heights'])) 39 | ])) 40 | self.mlp = MLPModule( 41 | [3 * self.embed_dim * model_config['channels'][-1]] + model_config['mlp_layer'] + [1], 42 | model_config['activation'], 43 | model_config['dropout'], 44 | last_activation=False, 45 | last_bn=False) 46 | 47 | def score(self, batch): 48 | conv_out = self.conv(batch) 49 | score = self.mlp(conv_out.flatten(1)).squeeze(-1) 50 | return {'score' : score} 51 | 52 | def _get_loss_func(self): 53 | return loss_func.BCEWithLogitLoss() 54 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/afm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | attention_dim: 4 3 | dropout: 0.5 -------------------------------------------------------------------------------- /recstudio/model/fm/config/afn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | log_hidden_size: 128 3 | mlp_layer: [128, 128] 4 | activation: relu 5 | dropout: 0.5 6 | 7 | ensemble: True 8 | ensemble_mlp_layer: [256, 64] 9 | ensemble_activation: relu 10 | ensemble_dropout: 0.5 11 | 12 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | fmeval: True 3 | low_rating_thres: 0.0 4 | binarized_rating_thres: 3.0 5 | 6 | eval: 7 | val_metrics: [auc, logloss] 8 | test_metrics: [auc, logloss] 9 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/aoanet.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | mlp_layer: [64, 64] 3 | activation: relu 4 | dropout: 0.2 5 | num_subspaces: 3 6 | num_interaction_layers: 3 -------------------------------------------------------------------------------- /recstudio/model/fm/config/autoint.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | wide: True 3 | 4 | deep: True 5 | mlp_layer: [128, 64] 6 | activation: relu 7 | 8 | dropout: 0.5 9 | attention_dim: 64 10 | num_attention_layers: 3 11 | n_head: 2 12 | 13 | residual: True 14 | residual_project: True 15 | layer_norm: False 16 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/ccpm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | channels: [3, 3] 3 | heights: [6, 5] 4 | 5 | mlp_layer: [256] 6 | activation: relu 7 | dropout: 0.5 
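All of the models under model/fm build on factorization-machine-style pairwise interactions over the field embeddings produced by ctr.Embeddings. The classic FM trick, computing all pairwise inner products in O(n*d) via the square-of-sum identity, looks like this in isolation (a standalone sketch, not the repo's ctr module):

import torch

def fm_interaction(emb: torch.Tensor) -> torch.Tensor:
    # emb: (batch, num_fields, embed_dim) field embeddings.
    # sum_{i<j} <v_i, v_j> = 0.5 * ((sum_i v_i)^2 - sum_i v_i^2), reduced over embed_dim.
    square_of_sum = emb.sum(dim=1).pow(2)
    sum_of_square = emb.pow(2).sum(dim=1)
    return 0.5 * (square_of_sum - sum_of_square).sum(dim=-1)

print(fm_interaction(torch.randn(2, 5, 8)).shape)  # torch.Size([2])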
-------------------------------------------------------------------------------- /recstudio/model/fm/config/dcn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer: [256, 256, 256] 4 | activation: relu 5 | num_layers: 6 6 | dropout: 0.5 7 | batch_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/dcnv2.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | combination: parallel 3 | low_rank: ~ 4 | num_experts: 4 5 | num_layers: 3 6 | embed_dim: 10 7 | mlp_layer: [256,256,256] 8 | activation: relu 9 | cross_activation: tanh 10 | dropout: 0.5 11 | batch_norm: True 12 | 13 | scheduler: onplateau -------------------------------------------------------------------------------- /recstudio/model/fm/config/deepcrossing.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | hidden_dims: [64, 64, 64] 3 | activation: relu 4 | dropout: 0.5 5 | batch_norm: False 6 | layer_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/deepfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer: [256, 256, 256] 4 | activation: tanh 5 | dropout: 0.3 6 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/deepim.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | order: 5 3 | deep: True 4 | mlp_layer: [128, 128] 5 | activation: relu 6 | dropout: 0.5 -------------------------------------------------------------------------------- /recstudio/model/fm/config/destine.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | wide: True 3 | 4 | deep: True 5 | mlp_layer: [128, 64] 6 | activation: relu 7 | 8 | dropout: 0.5 9 | attention_dim: 64 10 | num_attention_layers: 3 11 | n_head: 2 12 | 13 | res_mode: each_layer 14 | scale: False 15 | relu_before_att: False 16 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/difm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | mlp_layer: [256, 256, 256] 3 | activation: relu 4 | dropout: 0.5 5 | batch_norm: False 6 | layer_norm: False 7 | n_head: 2 8 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/dlrm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | op: sum 3 | top_mlp_layer: [128, 128] 4 | top_activation: relu 5 | top_dropout: 0.5 6 | bottom_mlp_layer: [128, 128] 7 | bottom_activation: relu 8 | bottom_dropout: 0.5 9 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/edcn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | bridge_type: hadamard_product 3 | temperature: 1.0 4 | embed_dim: 10 5 | activation: relu 6 | num_layers: 5 7 | dropout: 0.5 8 | batch_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/ffm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 
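dcn.yaml above sets `num_layers: 6` for the cross network that dcn.py (later in this directory) builds via `ctr.CrossNetwork`. For reference, one cross layer in the DCN paper computes x_{l+1} = x_0 * (x_l^T w_l) + b_l + x_l; the sketch below illustrates that published recurrence only, not RecStudio's `ctr.CrossNetwork` implementation:

import torch
import torch.nn as nn

class CrossLayerSketch(nn.Module):
    # One DCN cross layer: x_{l+1} = x0 * (x_l . w) + b + x_l.
    def __init__(self, dim):
        super().__init__()
        self.w = nn.Parameter(torch.randn(dim) / dim ** 0.5)
        self.b = nn.Parameter(torch.zeros(dim))

    def forward(self, x0, xl):
        # (batch, dim) @ (dim,) -> (batch,), then broadcast over the feature dim.
        return x0 * (xl @ self.w).unsqueeze(-1) + self.b + xl

x0 = torch.randn(4, 16)
xl = x0
for layer in [CrossLayerSketch(16) for _ in range(6)]:   # num_layers: 6, as in dcn.yaml
    xl = layer(x0, xl)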
-------------------------------------------------------------------------------- /recstudio/model/fm/config/fgcnn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | channels: [3, 4, 5] 3 | heights: [16, 16, 16] 4 | recombine_channels: [5, 6, 7] 5 | pooling_sizes: [3, 1, 1] 6 | 7 | mlp_layer: [256] 8 | activation: relu 9 | dropout: 0.5 10 | batch_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/fibinet.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | reduction_ratio: 3 3 | excitation_activation: relu 4 | bilinear_type: interaction 5 | mlp_layer: [128, 32] 6 | activation: relu 7 | dropout: 0.5 8 | shared_bilinear: True 9 | learning_rate: 1e-4 -------------------------------------------------------------------------------- /recstudio/model/fm/config/fignn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | deep: True 3 | mlp_layer: [256, 256, 256] 4 | activation: relu 5 | dropout: 0.5 6 | 7 | layer_norm: True 8 | num_gnn_layers: 3 9 | n_head: 2 -------------------------------------------------------------------------------- /recstudio/model/fm/config/finalmlp.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer1: [256, 256, 256] 4 | mlp_layer2: [256, 256, 256] 5 | activation1: relu 6 | activation2: relu 7 | dropout1: 0.3 8 | dropout2: 0.3 9 | batch_norm1: False 10 | batch_norm2: False 11 | feature_selection: True 12 | fs_mlp_layer: [32] 13 | n_head: 2 14 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/flen.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer: [256, 256, 256] 4 | activation: relu 5 | dropout: 0.3 6 | fields: ~ # [[...], [...], ...] 
7 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/fm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/fmfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 -------------------------------------------------------------------------------- /recstudio/model/fm/config/fwfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | linear_type: filv -------------------------------------------------------------------------------- /recstudio/model/fm/config/hfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | op: circular_correlation 3 | deep: True 4 | embed_dim: 10 5 | mlp_layer: [256, 256, 256] 6 | activation: relu 7 | dropout: 0.3 -------------------------------------------------------------------------------- /recstudio/model/fm/config/ifm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | mlp_layer: [256, 256, 256] 3 | activation: relu 4 | dropout: 0.5 5 | batch_norm: False -------------------------------------------------------------------------------- /recstudio/model/fm/config/interhat.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | order: 3 3 | feedforward_dim: 32 4 | aggregation_dim: 32 5 | mlp_layer: [256, 256, 256] 6 | activation: relu 7 | dropout: 0.5 8 | n_head: 2 -------------------------------------------------------------------------------- /recstudio/model/fm/config/lorentzfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 -------------------------------------------------------------------------------- /recstudio/model/fm/config/lr.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 1 -------------------------------------------------------------------------------- /recstudio/model/fm/config/masknet.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | parallel: False 3 | num_blocks: 3 4 | block_dim: 50 5 | reduction_ratio: 1 6 | hidden_layer_norm: False 7 | mlp_layer: [512, 128] 8 | activation: relu 9 | dropout: 0.5 10 | learning_rate: 1e-4 -------------------------------------------------------------------------------- /recstudio/model/fm/config/nfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | mlp_layer: [128, 128, 128] 4 | dropout: 0.3 5 | batch_norm: True 6 | activation: sigmoid -------------------------------------------------------------------------------- /recstudio/model/fm/config/onn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | mlp_layer: [128, 64] 3 | activation: relu 4 | dropout: 0.2 5 | batch_norm: True -------------------------------------------------------------------------------- /recstudio/model/fm/config/pnn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | product_type: inner 3 | mlp_layer: [128, 64] 4 | activation: relu 5 | dropout: 0.5 6 | batch_norm: False 7 | stack_dim: 2 8 | 
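fm.yaml above needs only `embed_dim` because the FM scoring function has no other hyperparameters; fm.py (later in this directory) pairs `ctr.LinearLayer` with `ctr.FMLayer(reduction='sum')`, whose second-order term is the classic identity sum_{i<j} <v_i, v_j> = 1/2 * ((sum_i v_i)^2 - sum_i v_i^2). A self-contained sketch of that reduction (illustrating the formula, not the `ctr.FMLayer` code, which lives in recstudio/model/module/ctr.py and is not shown in this dump):

import torch

def fm_pairwise(emb):
    # Second-order FM term for emb of shape (batch, num_fields, embed_dim),
    # reduced to one scalar per example, as with reduction='sum'.
    square_of_sum = emb.sum(dim=1).pow(2)    # (batch, embed_dim)
    sum_of_square = emb.pow(2).sum(dim=1)    # (batch, embed_dim)
    return 0.5 * (square_of_sum - sum_of_square).sum(-1)

emb = torch.randn(8, 5, 10)    # embed_dim: 10, as in fm.yaml
print(fm_pairwise(emb).shape)  # torch.Size([8])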
-------------------------------------------------------------------------------- /recstudio/model/fm/config/ppnet.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | id_fields: ~ 3 | mlp_layer: [256, 256, 256] 4 | activation: relu 5 | dropout: 0.5 6 | batch_norm: False 7 | id_embed_dim: 8 8 | pp_hidden_dims: [128, 128, 128] 9 | gate_hidden_dims: [64, 64, 64] -------------------------------------------------------------------------------- /recstudio/model/fm/config/sam.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | dropout: 0 3 | interaction_type: sam2e -------------------------------------------------------------------------------- /recstudio/model/fm/config/widedeep.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | batch_norm: True 3 | embed_dim: 10 4 | mlp_layer: [256, 256, 256] 5 | activation: relu 6 | dropout: 0.3 7 | -------------------------------------------------------------------------------- /recstudio/model/fm/config/xdeepfm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 10 3 | cin_layer_size: [100, 100, 100] 4 | mlp_layer: [128, 128, 128] 5 | activation: relu 6 | dropout: 0.2 7 | direct: False 8 | -------------------------------------------------------------------------------- /recstudio/model/fm/dcn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data.dataset import TripletDataset 3 | from ..basemodel import BaseRanker 4 | from ..loss_func import BCEWithLogitLoss 5 | from ..module import ctr, MLPModule 6 | 7 | 8 | class DCN(BaseRanker): 9 | 10 | def _get_dataset_class(): 11 | return TripletDataset 12 | 13 | def _init_model(self, train_data): 14 | super()._init_model(train_data) 15 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 16 | num_features = self.embedding.num_features 17 | model_config = self.config['model'] 18 | mlp_layer = model_config['mlp_layer'] 19 | self.cross_net = ctr.CrossNetwork(num_features * self.embed_dim, model_config['num_layers']) 20 | self.mlp = MLPModule( 21 | [num_features * self.embed_dim] + mlp_layer, 22 | model_config['activation'], 23 | model_config['dropout'], 24 | batch_norm=model_config['batch_norm']) 25 | self.fc = torch.nn.Linear(num_features*self.embed_dim + mlp_layer[-1], 1) 26 | 27 | def score(self, batch): 28 | emb = self.embedding(batch).flatten(1) 29 | cross_out = self.cross_net(emb) 30 | deep_out = self.mlp(emb) 31 | score = self.fc(torch.cat([deep_out, cross_out], -1)).squeeze(-1) 32 | return {'score' : score} 33 | 34 | def _get_loss_func(self): 35 | return BCEWithLogitLoss() 36 | -------------------------------------------------------------------------------- /recstudio/model/fm/dcnv2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from ..basemodel import BaseRanker 5 | from ..loss_func import BCEWithLogitLoss 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | DCNv2 10 | ###################### 11 | 12 | Paper Reference: 13 | DCN V2: Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems (WWW'21) 14 | https://dl.acm.org/doi/10.1145/3442381.3450078 15 | """ 16 | 17 | class DCNv2(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return 
TripletDataset 21 | 22 | def _init_model(self, train_data): 23 | super()._init_model(train_data) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | num_fields = self.embedding.num_features 26 | model_config = self.config['model'] 27 | if model_config['low_rank'] is None: 28 | self.cross_net = ctr.CrossNetworkV2(num_fields * self.embed_dim, model_config['num_layers']) 29 | else: 30 | self.cross_net = ctr.CrossNetworkMix(num_fields * self.embed_dim, model_config['num_layers'], 31 | model_config['low_rank'], model_config['num_experts'], 32 | model_config['cross_activation']) 33 | 34 | if model_config['combination'].lower() == 'parallel': 35 | self.mlp = MLPModule( 36 | [num_fields * self.embed_dim] + model_config['mlp_layer'], 37 | model_config['activation'], 38 | model_config['dropout'], 39 | batch_norm=model_config['batch_norm']) 40 | self.fc = nn.Linear(num_fields*self.embed_dim + model_config['mlp_layer'][-1], 1) 41 | elif model_config['combination'].lower() == 'stacked': 42 | self.mlp = MLPModule( 43 | [num_fields * self.embed_dim] + model_config['mlp_layer'] + [1], 44 | model_config['activation'], 45 | model_config['dropout'], 46 | batch_norm=model_config['batch_norm'], 47 | last_activation=False, 48 | last_bn=False) 49 | else: 50 | raise ValueError(f'Expect combination to be `parallel`|`stacked`, but got {model_config["combination"]}.') 51 | 52 | def score(self, batch): 53 | emb = self.embedding(batch).flatten(1) 54 | cross_out = self.cross_net(emb) 55 | if self.config['model']['combination'].lower() == 'parallel': 56 | deep_out = self.mlp(emb) 57 | score = self.fc(torch.cat([deep_out, cross_out], -1)).squeeze(-1) 58 | else: 59 | deep_out = self.mlp(cross_out) 60 | score = deep_out.squeeze(-1) 61 | return {'score' : score} 62 | 63 | def _get_loss_func(self): 64 | return BCEWithLogitLoss() 65 | -------------------------------------------------------------------------------- /recstudio/model/fm/deepcrossing.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from ..basemodel import BaseRanker 5 | from ..loss_func import BCEWithLogitLoss 6 | from ..module import ctr, MLPModule, ResidualLayer 7 | 8 | r""" 9 | DeepCrossing 10 | ###################### 11 | 12 | Paper Reference: 13 | Deep Crossing: Web-Scale Modeling without Manually Crafted Combinatorial Features (KDD'16) 14 | https://dl.acm.org/doi/10.1145/2939672.2939704 15 | """ 16 | 17 | class DeepCrossing(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | model_config = self.config['model'] 25 | num_fields = len(self.fields) - 1 26 | self.dc = nn.Sequential(OrderedDict([ 27 | ("embedding", 28 | ctr.Embeddings(self.fields, self.embed_dim, train_data)), 29 | ("residuals", 30 | nn.Sequential(*[ 31 | ResidualLayer( 32 | MLPModule( 33 | [num_fields*self.embed_dim, hidden_dim, num_fields*self.embed_dim], 34 | model_config['activation'], 35 | last_activation=False, last_bn=False 36 | ), 37 | num_fields, 38 | self.embed_dim, 39 | model_config['activation'], 40 | model_config['dropout'], 41 | model_config['batch_norm'], 42 | model_config['layer_norm'] 43 | ) 44 | for hidden_dim in model_config['hidden_dims']])) 45 | ])) 46 | self.fc = nn.Linear(num_fields * self.embed_dim, 1) 47 | def score(self, batch): 48 
| dc_out = self.dc(batch) 49 | score = self.fc(dc_out.flatten(1)).squeeze(-1) 50 | return {'score' : score} 51 | 52 | def _get_loss_func(self): 53 | return BCEWithLogitLoss() 54 | -------------------------------------------------------------------------------- /recstudio/model/fm/deepfm.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | from ..basemodel import BaseRanker 3 | from ..loss_func import BCEWithLogitLoss 4 | from ..module import ctr, MLPModule 5 | 6 | 7 | class DeepFM(BaseRanker): 8 | 9 | def _get_dataset_class(): 10 | return TripletDataset 11 | 12 | def _init_model(self, train_data, drop_unused_field=True): 13 | super()._init_model(train_data, drop_unused_field) 14 | self.linear = ctr.LinearLayer(self.fields, train_data) 15 | self.fm = ctr.FMLayer(reduction='sum') 16 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 17 | model_config = self.config['model'] 18 | self.mlp = MLPModule([self.embedding.num_features*self.embed_dim]+model_config['mlp_layer']+[1], 19 | model_config['activation'], model_config['dropout'], 20 | last_activation=False, last_bn=False) 21 | 22 | def score(self, batch): 23 | lr_score = self.linear(batch) 24 | emb = self.embedding(batch) 25 | fm_score = self.fm(emb) 26 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 27 | return {'score' : lr_score + fm_score + mlp_score} 28 | 29 | def _get_loss_func(self): 30 | return BCEWithLogitLoss() 31 | -------------------------------------------------------------------------------- /recstudio/model/fm/deepim.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | from ..basemodel import BaseRanker 3 | from ..loss_func import BCEWithLogitLoss 4 | from ..module import ctr, MLPModule 5 | 6 | r""" 7 | DeepIM 8 | ###################### 9 | 10 | Paper Reference: 11 | Deep Interaction Machine: A Simple but Effective Model for High-order Feature Interactions (CIKM'20) 12 | https://doi.org/10.1145/3340531.3412077 13 | """ 14 | 15 | class DeepIM(BaseRanker): 16 | 17 | def _get_dataset_class(): 18 | return TripletDataset 19 | 20 | def _init_model(self, train_data, drop_unused_field=True): 21 | super()._init_model(train_data, drop_unused_field) 22 | model_config = self.config['model'] 23 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 24 | self.im = ctr.InteractionMachine(self.embed_dim, model_config['order']) 25 | if model_config['deep']: 26 | self.mlp = MLPModule( 27 | [self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'] + [1], 28 | model_config['activation'], 29 | model_config['dropout'], 30 | last_activation=False, 31 | last_bn=False) 32 | 33 | def score(self, batch): 34 | emb = self.embedding(batch) 35 | im_score = self.im(emb).squeeze(-1) 36 | if self.config['model']['deep']: 37 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 38 | return {'score' : im_score + mlp_score} 39 | else: 40 | return{'score': im_score} 41 | 42 | def _get_loss_func(self): 43 | return BCEWithLogitLoss() 44 | -------------------------------------------------------------------------------- /recstudio/model/fm/destine.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from recstudio.data.dataset import TripletDataset 3 | from .. 
import loss_func
4 | from ..basemodel import BaseRanker
5 | from ..module import ctr, MLPModule
6 | 
7 | r"""
8 | DESTINE
9 | ######################
10 | 
11 | Paper Reference:
12 |     Disentangled Self-Attentive Neural Networks for Click-Through Rate Prediction (CIKM'21)
13 |     https://dl.acm.org/doi/10.1145/3459637.3482088
14 | """
15 | 
16 | class DESTINE(BaseRanker):
17 | 
18 |     def _get_dataset_class():
19 |         return TripletDataset
20 | 
21 |     def _init_model(self, train_data, drop_unused_field=True):
22 |         super()._init_model(train_data, drop_unused_field)
23 |         self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data)
24 |         model_config = self.config['model']
25 |         if model_config['wide']:
26 |             self.linear = ctr.LinearLayer(self.fields, train_data)
27 |         if model_config['deep']:
28 |             self.mlp = MLPModule([self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'] + [1],
29 |                                  model_config['activation'],
30 |                                  model_config['dropout'],
31 |                                  last_activation=False,
32 |                                  last_bn=False
33 |                                  )
34 | 
35 |         if model_config['res_mode'] is not None and model_config['res_mode'].lower() == 'last_layer':  # res_mode may be None, so guard before .lower()
36 |             self.res = nn.Linear(self.embed_dim, model_config['attention_dim'])
37 |         elif model_config['res_mode'] is not None and model_config['res_mode'] != 'each_layer':
38 |             raise ValueError(f'Expect res_mode to be `last_layer`|`each_layer`|None, but got {model_config["res_mode"]}.')
39 | 
40 |         self.dsa = nn.Sequential(*[
41 |             ctr.DisentangledSelfAttentionInteractingLayer(
42 |                 self.embed_dim,
43 |                 attention_dim=self.embed_dim if i == 0 else model_config['attention_dim'],
44 |                 n_head=model_config['n_head'],
45 |                 dropout=model_config['dropout'],
46 |                 residual=(model_config['res_mode'] == 'each_layer'),
47 |                 scale=model_config['scale'],
48 |                 relu_before_att=model_config['relu_before_att'] if i == 0 else False,
49 |             )
50 |             for i in range(model_config['num_attention_layers'])])
51 |         self.fc = nn.Linear(self.embedding.num_features * self.embed_dim, 1)
52 | 
53 |     def score(self, batch):
54 |         emb = self.embedding(batch)
55 |         attn_out = self.dsa(emb)
56 |         if self.config['model']['res_mode'] is not None and self.config['model']['res_mode'].lower() == 'last_layer':
57 |             attn_out += self.res(emb)
58 |             attn_out = attn_out.relu()
59 |         attn_score = self.fc(attn_out.flatten(1)).squeeze(-1)
60 |         score = attn_score
61 |         if self.config['model']['wide']:
62 |             lr_score = self.linear(batch)
63 |             score += lr_score
64 |         if self.config['model']['deep']:
65 |             mlp_score = self.mlp(emb.flatten(1)).squeeze(-1)
66 |             score += mlp_score
67 |         return {'score' : score}
68 | 
69 |     def _get_loss_func(self):
70 |         return loss_func.BCEWithLogitLoss()
71 | 
--------------------------------------------------------------------------------
/recstudio/model/fm/difm.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from recstudio.data.dataset import TripletDataset
3 | from ..
import loss_func 4 | from ..basemodel import BaseRanker 5 | from ..module import ctr, LambdaLayer 6 | 7 | r""" 8 | DIFM 9 | ###################### 10 | 11 | Paper Reference: 12 | A Dual Input-aware Factorization Machine for CTR Prediction (IJCAI'20) 13 | https://dl.acm.org/doi/10.5555/3491440.3491874 14 | """ 15 | 16 | class DIFM(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.linear = ctr.LinearLayer(self.fields, train_data) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | model_config = self.config['model'] 26 | num_fields = self.embedding.num_features 27 | self.vec_wise_fen = nn.Sequential( 28 | ctr.SelfAttentionInteractingLayer( 29 | self.embed_dim, 30 | model_config['n_head'], 31 | model_config['dropout'], 32 | layer_norm=model_config['layer_norm']), 33 | LambdaLayer(lambda x: x.reshape(x.size(0), -1)), 34 | nn.Linear( 35 | num_fields * self.embed_dim, 36 | num_fields, 37 | bias=False)) 38 | self.bit_wise_fen = nn.Sequential( 39 | ctr.MLPModule( 40 | [num_fields * self.embed_dim] + model_config['mlp_layer'], 41 | model_config['activation'], 42 | model_config['dropout'], 43 | batch_norm=model_config['batch_norm']), 44 | nn.Linear( 45 | model_config['mlp_layer'][-1], 46 | num_fields, 47 | bias=False)) 48 | self.fm = ctr.FMLayer(reduction='sum') 49 | 50 | def score(self, batch): 51 | emb = self.embedding(batch) 52 | m_vec = self.vec_wise_fen(emb) 53 | m_bit = self.bit_wise_fen(emb.flatten(1)) 54 | weight = m_vec + m_bit 55 | lr_score = (super(ctr.LinearLayer, self.linear).forward(batch).squeeze(-1) * weight).sum(-1) + self.linear.bias 56 | fm_score = self.fm(emb * weight.unsqueeze(-1)) 57 | return {'score' : lr_score + fm_score} 58 | 59 | def _get_loss_func(self): 60 | return loss_func.BCEWithLogitLoss() 61 | -------------------------------------------------------------------------------- /recstudio/model/fm/dlrm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from ..basemodel import BaseRanker 5 | from ..loss_func import BCEWithLogitLoss 6 | from ..module import ctr, MLPModule, LambdaLayer 7 | 8 | r""" 9 | DLRM 10 | ###################### 11 | 12 | Paper Reference: 13 | Deep Learning Recommendation Model for Personalization and Recommendation Systems 14 | https://arxiv.org/abs/1906.00091 15 | """ 16 | 17 | class DLRM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | model_config = self.config['model'] 25 | sparse_fields = {f for f in self.fields if train_data.field2type[f] != 'float'} 26 | dense_fields = {f for f in self.fields if train_data.field2type[f] == 'float' and f != self.frating} 27 | num_fields = len(sparse_fields) + int(len(dense_fields) > 0) 28 | self.embedding = ctr.Embeddings(sparse_fields, self.embed_dim, train_data) 29 | if len(dense_fields) > 0: 30 | self.bottom_mlp = MLPModule( 31 | [len(dense_fields)] + model_config['bottom_mlp_layer'] + [self.embed_dim], 32 | model_config['bottom_activation'], 33 | model_config['bottom_dropout'], 34 | last_activation=False, 35 | last_bn=False) 36 | if model_config['op'].lower() == 'dot': 37 | self.interaction = ctr.InnerProductLayer(num_fields) 38 | 
top_mlp_in = num_fields * (num_fields - 1) // 2 + self.embed_dim * int(len(dense_fields) > 0)
39 |         elif model_config['op'].lower() == 'cat':
40 |             self.interaction = nn.Flatten(start_dim=1)
41 |             top_mlp_in = num_fields * self.embed_dim
42 |         elif model_config['op'].lower() == 'sum':
43 |             self.interaction = LambdaLayer(lambda emb: emb.sum(1))
44 |             top_mlp_in = self.embed_dim
45 |         else:
46 |             raise ValueError(f'Expect op to be `dot`|`cat`|`sum`, but got {model_config["op"]}.')
47 |         self.top_mlp = MLPModule(
48 |             [top_mlp_in] + model_config['top_mlp_layer'] + [1],
49 |             model_config['top_activation'],
50 |             model_config['top_dropout'],
51 |             last_activation=False,
52 |             last_bn=False)
53 | 
54 |     def score(self, batch):
55 |         emb = self.embedding(batch)
56 |         dense_fields = {f for f in self.fields if f not in self.embedding.field2types and f != self.frating}
57 |         if len(dense_fields) > 0:
58 |             dense_values = torch.vstack([batch[f] for f in dense_fields]).t()
59 |             if dense_values.dim() == 1:
60 |                 dense_values = dense_values.unsqueeze(-1)
61 |             dense_emb = self.bottom_mlp(dense_values)
62 |             emb = torch.cat([emb, dense_emb.unsqueeze(1)], dim=1)
63 |         inter_out = self.interaction(emb)
64 |         if self.config['model']['op'].lower() == 'dot' and len(dense_fields) > 0:
65 |             inter_out = torch.cat([inter_out, dense_emb], dim=-1)
66 |         score = self.top_mlp(inter_out).squeeze(-1)
67 |         return {'score': score}
68 | 
69 |     def _get_loss_func(self):
70 |         return BCEWithLogitLoss()
71 | 
--------------------------------------------------------------------------------
/recstudio/model/fm/edcn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from recstudio.data.dataset import TripletDataset
4 | from ..basemodel import BaseRanker
5 | from ..loss_func import BCEWithLogitLoss
6 | from ..module import ctr, MLPModule, LambdaLayer
7 | 
8 | r"""
9 | EDCN
10 | ######################
11 | 
12 | Paper Reference:
13 |     Enhancing Explicit and Implicit Feature Interactions via Information Sharing for Parallel Deep CTR Models (CIKM'21)
14 |     https://dl.acm.org/doi/10.1145/3459637.3481915
15 | """
16 | 
17 | class EDCN(BaseRanker):
18 | 
19 |     def _get_dataset_class():
20 |         return TripletDataset
21 | 
22 |     def _init_model(self, train_data):
23 |         super()._init_model(train_data)
24 |         self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data)
25 |         num_fields = self.embedding.num_features
26 |         model_config = self.config['model']
27 |         self.cross = nn.ModuleList([
28 |             ctr.CrossInteraction(num_fields * self.embed_dim)
29 |             for _ in range(model_config['num_layers'])
30 |         ])
31 |         self.mlp = nn.ModuleList([
32 |             MLPModule(
33 |                 2 * [num_fields * self.embed_dim],
34 |                 model_config['activation'],
35 |                 model_config['dropout'],
36 |                 batch_norm=model_config['batch_norm'])
37 |             for _ in range(model_config['num_layers'])
38 |         ])
39 |         self.bridge = nn.ModuleList([
40 |             ctr.BridgeLayer(
41 |                 num_fields * self.embed_dim,
42 |                 model_config['bridge_type'])
43 |             for _ in range(model_config['num_layers'])
44 |         ])
45 |         self.regulation = nn.ModuleList([
46 |             ctr.RegulationLayer(
47 |                 num_fields,
48 |                 self.embed_dim,
49 |                 model_config['temperature'],
50 |                 model_config['batch_norm'])
51 |             for _ in range(model_config['num_layers'])
52 |         ])
53 |         self.fc = torch.nn.Linear(3 * num_fields * self.embed_dim, 1)
54 | 
55 |     def score(self, batch):
56 |         emb = self.embedding(batch)
57 |         ci, di = self.regulation[0](emb.flatten(1))
58 |         c0 = ci
59 |         for i, (cross, deep, bridge) in enumerate(zip(self.cross,
self.mlp, self.bridge)): 60 | ci = cross(c0, ci) 61 | di = deep(di) 62 | bi = bridge(ci, di) 63 | if i + 1 < self.config['model']['num_layers']: 64 | ci, di = self.regulation[i + 1](bi) 65 | score = self.fc(torch.cat([ci, di, bi], -1)).squeeze(-1) 66 | return {'score' : score} 67 | 68 | def _get_loss_func(self): 69 | return BCEWithLogitLoss() 70 | -------------------------------------------------------------------------------- /recstudio/model/fm/ffm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | r""" 9 | FFM 10 | ###################### 11 | 12 | Paper Reference: 13 | Field-aware Factorization Machines for CTR Prediction (RecSys'16) 14 | https://dl.acm.org/doi/10.1145/2959100.2959134 15 | """ 16 | 17 | class FFM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.linear = ctr.LinearLayer(self.fields, train_data) 25 | num_fields = len(self.fields) - 1 26 | self.ffm = nn.Sequential( 27 | OrderedDict([ 28 | ("embedding", 29 | ctr.Embeddings( 30 | self.fields, 31 | self.embed_dim * (num_fields - 1), 32 | train_data)), 33 | ("ffm_layer", 34 | ctr.FieldAwareFMLayer( 35 | num_fields 36 | )) 37 | ])) 38 | 39 | def score(self, batch): 40 | lr_score = self.linear(batch) 41 | ffm_score = self.ffm(batch) 42 | return {'score' : lr_score + ffm_score} 43 | 44 | def _get_loss_func(self): 45 | return loss_func.BCEWithLogitLoss() 46 | -------------------------------------------------------------------------------- /recstudio/model/fm/fgcnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data.dataset import TripletDataset 3 | from .. 
import loss_func 4 | from ..basemodel import BaseRanker 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | FGCNN 9 | ###################### 10 | 11 | Paper Reference: 12 | Feature Generation by Convolutional Neural Network for Click-Through Rate Prediction (WWW'19) 13 | https://dl.acm.org/doi/abs/10.1145/3308558.3313497 14 | """ 15 | 16 | class FGCNN(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 24 | num_raw_fields = self.embedding.num_features 25 | model_config = self.config['model'] 26 | self.fgcnn = ctr.FGCNNLayer( 27 | num_raw_fields, 28 | self.embed_dim, 29 | model_config['channels'], 30 | model_config['heights'], 31 | model_config['pooling_sizes'], 32 | model_config['recombine_channels'], 33 | model_config['batch_norm']) 34 | num_new_fields = sum([rc * oh for rc, oh in zip(model_config['recombine_channels'], self.fgcnn.out_height[1:])]) 35 | num_total_fields = num_raw_fields + num_new_fields 36 | self.inner_product = ctr.InnerProductLayer(num_total_fields) 37 | mlp_in = num_total_fields * (num_total_fields - 1) // 2 + num_total_fields * self.embed_dim 38 | self.mlp = MLPModule( 39 | [mlp_in] + model_config['mlp_layer'] + [1], 40 | model_config['activation'], 41 | model_config['dropout'], 42 | last_activation=False, 43 | last_bn=False) 44 | 45 | def score(self, batch): 46 | raw_emb = self.embedding(batch) 47 | new_emb = self.fgcnn(raw_emb) 48 | comb_emb = torch.cat([raw_emb, new_emb], dim=1) 49 | inner_prod = self.inner_product(comb_emb) 50 | mlp_in = torch.cat([comb_emb.flatten(1), inner_prod], dim=1) 51 | score = self.mlp(mlp_in).squeeze(-1) 52 | return {'score' : score} 53 | 54 | def _get_loss_func(self): 55 | return loss_func.BCEWithLogitLoss() 56 | -------------------------------------------------------------------------------- /recstudio/model/fm/fibinet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | FiBiNET 10 | ###################### 11 | 12 | Paper Reference: 13 | FiBiNET: Combining Feature Importance and Bilinear feature Interaction for Click-Through Rate Prediction (RecSys'19) 14 | https://dl.acm.org/doi/abs/10.1145/3298689.3347043 15 | """ 16 | 17 | class FiBiNET(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.linear = ctr.LinearLayer(self.fields, train_data) 25 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 26 | model_config = self.config['model'] 27 | num_fields = self.embedding.num_features 28 | self.senet = ctr.SqueezeExcitation( 29 | num_fields, 30 | model_config['reduction_ratio'], 31 | model_config['excitation_activation']) 32 | self.bilinear = ctr.BilinearInteraction( 33 | num_fields, 34 | self.embed_dim, 35 | model_config['bilinear_type']) 36 | if not model_config['shared_bilinear']: 37 | self.bilinear4se = ctr.BilinearInteraction( 38 | num_fields, 39 | self.embed_dim, 40 | model_config['bilinear_type']) 41 | self.mlp = MLPModule( 42 | [num_fields * (num_fields - 1) * self.embed_dim] + model_config['mlp_layer'] + [1], 43 | model_config['activation'], 44 | model_config['dropout'], 45 | last_activation=False, 46 | last_bn=False) 47 | 48 | def score(self, batch): 49 | lr_score = self.linear(batch) 50 | emb = self.embedding(batch) 51 | senet_emb = self.senet(emb) 52 | bilinear_ori = self.bilinear(emb) 53 | if self.config['model']['shared_bilinear']: 54 | bilinear_senet = self.bilinear(senet_emb) 55 | else: 56 | bilinear_senet = self.bilinear4se(senet_emb) 57 | comb = torch.cat([bilinear_ori, bilinear_senet], dim=1) 58 | mlp_score = self.mlp(comb.flatten(1)).squeeze(-1) 59 | return {'score' : lr_score + mlp_score} 60 | 61 | def _get_loss_func(self): 62 | return loss_func.BCEWithLogitLoss() 63 | -------------------------------------------------------------------------------- /recstudio/model/fm/fignn.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | from recstudio.data.dataset import TripletDataset 5 | from ..basemodel import BaseRanker 6 | from ..loss_func import BCEWithLogitLoss 7 | from ..module import ctr, MLPModule 8 | 9 | r""" 10 | FiGNN 11 | ###################### 12 | 13 | Paper Reference: 14 | Fi-GNN: Modeling Feature Interactions via Graph Neural Networks for CTR Prediction (CIKM'19) 15 | https://doi.org/10.1145/3357384.3357951 16 | """ 17 | 18 | class FiGNN(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | model_config = self.config['model'] 26 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 27 | num_fields = self.embedding.num_features 28 | self.gnn = nn.Sequential(OrderedDict([ 29 | ('self_attn', 30 | ctr.SelfAttentionInteractingLayer( 31 | self.embed_dim, 32 | model_config['n_head'], 33 | model_config['dropout'], 34 | residual=True, 35 | residual_project=False, 36 | layer_norm=model_config['layer_norm'])), 37 | ('fignn', 38 | ctr.FiGNNLayer( 39 | num_fields, 40 | self.embed_dim, 41 | model_config['num_gnn_layers'])) 42 | ])) 43 | self.attn_pred = nn.ModuleDict({ 44 | 'mlp1': 
nn.Linear(self.embed_dim, 1),
45 |             'mlp2': nn.Linear(num_fields * self.embed_dim, num_fields)
46 |         })
47 |         if model_config['deep']:
48 |             self.mlp = MLPModule(
49 |                 [self.embedding.num_features * self.embed_dim] + model_config['mlp_layer'] + [1],
50 |                 model_config['activation'],
51 |                 model_config['dropout'],
52 |                 last_activation=False,
53 |                 last_bn=False)
54 | 
55 |     def score(self, batch):
56 |         emb = self.embedding(batch)
57 |         gnn_out = self.gnn(emb)
58 |         gnn_score = (self.attn_pred['mlp2'](gnn_out.flatten(1)) * \
59 |                      self.attn_pred['mlp1'](gnn_out).squeeze(-1)).sum(-1)
60 |         if self.config['model']['deep']:
61 |             mlp_score = self.mlp(emb.flatten(1)).squeeze(-1)
62 |             return {'score' : gnn_score + mlp_score}
63 |         else:
64 |             return {'score': gnn_score}
65 | 
66 |     def _get_loss_func(self):
67 |         return BCEWithLogitLoss()
68 | 
--------------------------------------------------------------------------------
/recstudio/model/fm/flen.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from recstudio.data.dataset import TripletDataset
4 | from ..basemodel import BaseRanker
5 | from ..loss_func import BCEWithLogitLoss
6 | from ..module import ctr, MLPModule, get_act
7 | 
8 | r"""
9 | FLEN
10 | ######################
11 | 
12 | Paper Reference:
13 |     FLEN: Leveraging Field for Scalable CTR Prediction (DLP KDD'20)
14 |     https://dlp-kdd.github.io/dlp-kdd2020/assets/pdf/a3-chen.pdf
15 | """
16 | 
17 | class FLEN(BaseRanker):
18 | 
19 |     def _get_dataset_class():
20 |         return TripletDataset
21 | 
22 |     def _init_model(self, train_data, drop_unused_field=True):
23 |         super()._init_model(train_data, drop_unused_field)
24 |         model_config = self.config['model']
25 |         self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data)
26 |         if model_config.get('fields', None) is None:
27 |             fields = [f.fields for f in train_data._get_feat_list()]
28 |         else:
29 |             fields = model_config['fields']
30 |         all_fields = set()
31 |         for f in fields:
32 |             if all_fields.intersection(set(f)) not in [{self.fuid}, {self.fiid},
33 |                                                        {self.fuid, self.fiid}, set()]:
34 |                 raise ValueError('Expect no intersection between field groups '
35 |                                  f'except {self.fuid} and {self.fiid}, '
36 |                                  f'but got {all_fields.intersection(set(f))}.')
37 |             all_fields = all_fields.union(set(f))
38 |         if len(all_fields) != self.embedding.num_features:
39 |             raise ValueError(f'Expect fields to cover all {self.embedding.num_features} features, '
40 |                              f'but got {all_fields - self.fields}.')
41 | 
42 |         self.fwbi = ctr.FieldWiseBiInteraction(
43 |             self.embed_dim,
44 |             train_data,
45 |             model_config['activation'],
46 |             model_config['dropout'],
47 |             fields)
48 |         self.mlp = MLPModule(
49 |             [self.embedding.num_features*self.embed_dim] + model_config['mlp_layer'],
50 |             model_config['activation'],
51 |             model_config['dropout'],
52 |             batch_norm=True,
53 |             last_activation=True,
54 |             last_bn=True)
55 |         self.fc = nn.Linear(model_config['mlp_layer'][-1] + self.embed_dim + 1, 1, bias=False)
56 | 
57 |     def score(self, batch):
58 |         emb = self.embedding(batch)
59 |         field_embs = []
60 |         for field in self.fwbi.fields:
61 |             field_idx = [list(self.embedding.embeddings).index(f) for f in field if f != self.frating]
62 |             field_embs.append(emb[:, field_idx, :])
63 |         fwbi_out = self.fwbi(batch, field_embs)
64 |         mlp_out = self.mlp(emb.flatten(1))
65 |         score = self.fc(torch.cat([mlp_out, fwbi_out], dim=-1)).squeeze(-1)
66 |         return {'score': score}
67 | 
68 |     def _get_loss_func(self):
69 |         return BCEWithLogitLoss()
70 | 
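flen.yaml leaves `fields: ~`, so FLEN falls back to the per-feat grouping returned by `train_data._get_feat_list()`; the validation loop in `_init_model` above only lets groups overlap on the user and item id fields. A hypothetical explicit grouping that would pass that check (the field names are illustrative only, not taken from any dataset config in this repository):

# Hypothetical override for config['model']['fields'] in flen.yaml.
flen_fields = [
    ['user_id', 'age', 'gender'],   # user-side group
    ['item_id', 'category'],        # item-side group
    ['user_id', 'item_id'],         # context group; only the id fields may repeat
]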
-------------------------------------------------------------------------------- /recstudio/model/fm/fm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | 9 | class FM(BaseRanker): 10 | 11 | def _get_dataset_class(): 12 | return TripletDataset 13 | 14 | def _init_model(self, train_data, drop_unused_field=True): 15 | super()._init_model(train_data, drop_unused_field) 16 | self.fm = torch.nn.Sequential(OrderedDict([ 17 | ("embeddings", ctr.Embeddings( 18 | fields=self.fields, 19 | embed_dim=self.embed_dim, 20 | data=train_data)), 21 | ("fm_layer", ctr.FMLayer(reduction='sum')), 22 | ])) 23 | self.linear = ctr.LinearLayer(self.fields, train_data) 24 | 25 | def score(self, batch): 26 | fm_score = self.fm(batch) 27 | lr_score = self.linear(batch) 28 | return {'score' : fm_score + lr_score} 29 | 30 | def _get_loss_func(self): 31 | return loss_func.BCEWithLogitLoss() 32 | -------------------------------------------------------------------------------- /recstudio/model/fm/fmfm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import OrderedDict 4 | from recstudio.data.dataset import TripletDataset 5 | from .. import loss_func 6 | from ..basemodel import BaseRanker 7 | from ..module import ctr 8 | 9 | r""" 10 | FmFM 11 | ###################### 12 | 13 | Paper Reference: 14 | FM^2: Field-matrixed Factorization Machines for Recommender Systems (WWW'21) 15 | https://dl.acm.org/doi/10.1145/3442381.3449930 16 | """ 17 | 18 | class FmFM(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | self.linear = ctr.LinearLayer(self.fields, train_data) 26 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 27 | num_fields = self.embedding.num_features 28 | self.field_weight = nn.Parameter(torch.randn(num_fields*(num_fields - 1)//2, self.embed_dim, self.embed_dim)) 29 | self.triu_index = nn.Parameter( 30 | torch.triu_indices(num_fields, num_fields, offset=1), 31 | requires_grad=False) 32 | 33 | def score(self, batch): 34 | lr_score = self.linear(batch) 35 | emb = self.embedding(batch) 36 | emb0 = torch.index_select(emb, 1, self.triu_index[0]) 37 | emb1 = torch.index_select(emb, 1, self.triu_index[1]) 38 | fmfm_score = ((emb0.unsqueeze(-2) @ self.field_weight).squeeze(-2) * emb1).sum((-1, -2)) 39 | return {'score' : lr_score + fmfm_score} 40 | 41 | def _get_loss_func(self): 42 | return loss_func.BCEWithLogitLoss() 43 | -------------------------------------------------------------------------------- /recstudio/model/fm/fwfm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | r""" 9 | FwFM 10 | ###################### 11 | 12 | Paper Reference: 13 | Field-weighted Factorization Machines for Click-Through Rate Prediction in Display Advertising (WWW'18) 14 | https://dl.acm.org/doi/abs/10.1145/3178876.3186040 15 | """ 16 | 17 | class FwFM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | num_fields = self.embedding.num_features 26 | self.fwfm = nn.Sequential( 27 | OrderedDict([ 28 | ("fm_layer", 29 | ctr.InnerProductLayer( 30 | num_fields)), 31 | ("field_weighted", 32 | nn.Linear(num_fields * (num_fields - 1) // 2, 1)) 33 | ])) 34 | if self.config['model']['linear_type'].lower() == 'lw': 35 | self.linear = ctr.LinearLayer(self.fields, train_data) 36 | elif self.config['model']['linear_type'].lower() == 'felv': 37 | self.linear_embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 38 | elif self.config['model']['linear_type'].lower() == 'filv': 39 | self.linear = nn.Linear(num_fields * self.embed_dim, 1, bias=False) 40 | else: 41 | raise ValueError('Expect linear_type to be `lw`|`felv`|`filv`, ' 42 | f'but got {self.config["model"]["linear_type"]}.') 43 | 44 | def score(self, batch): 45 | emb = self.embedding(batch) 46 | if self.config['model']['linear_type'].lower() == 'lw': 47 | lr_score = self.linear(batch) 48 | elif self.config['model']['linear_type'].lower() == 'felv': 49 | lr_emb = self.linear_embedding(batch) 50 | lr_score = (lr_emb * emb).sum((1, 2)) 51 | else: 52 | lr_score = self.linear(emb.flatten(1)).squeeze(-1) 53 | fwfm_score = self.fwfm(emb).squeeze(-1) 54 | return {'score' : lr_score + fwfm_score} 55 | 56 | def _get_loss_func(self): 57 | return loss_func.BCEWithLogitLoss() 58 | -------------------------------------------------------------------------------- /recstudio/model/fm/hfm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from recstudio.data.dataset import TripletDataset 3 | from ..basemodel import BaseRanker 4 | from ..loss_func import BCEWithLogitLoss 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | HFM 9 | ###################### 10 | 11 | Paper Reference: 12 | Holographic Factorization Machines for Recommendation (AAAI'19) 13 | https://dl.acm.org/doi/10.1609/aaai.v33i01.33015143 14 | """ 15 | 16 | class HFM(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.linear = ctr.LinearLayer(self.fields, train_data) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | num_fields = self.embedding.num_features 26 | model_config = self.config['model'] 27 | self.hfm = ctr.HolographicFMLayer(num_fields, model_config['op']) 28 | if model_config['deep']: 29 | self.mlp = MLPModule( 30 | [num_fields * (num_fields - 1) // 2 * self.embed_dim] + model_config['mlp_layer'] + [1], 31 | model_config['activation'], 32 | model_config['dropout'], 33 | last_activation=False, 34 | last_bn=False) 35 | else: 36 | self.fc = nn.Linear(self.embed_dim, 1, bias=False) 37 | 38 | def score(self, batch): 39 | lr_score = self.linear(batch) 40 | emb = self.embedding(batch) 41 | hfm_out = 
self.hfm(emb) 42 | if self.config['model']['deep']: 43 | hfm_score = self.mlp(hfm_out.flatten(1)).squeeze(-1) 44 | else: 45 | hfm_score = self.fc(hfm_out.sum(1)).squeeze(-1) 46 | return{'score': lr_score + hfm_score} 47 | 48 | def _get_loss_func(self): 49 | return BCEWithLogitLoss() 50 | -------------------------------------------------------------------------------- /recstudio/model/fm/ifm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from recstudio.data.dataset import TripletDataset 3 | from .. import loss_func 4 | from ..basemodel import BaseRanker 5 | from ..module import ctr 6 | 7 | r""" 8 | IFM 9 | ###################### 10 | 11 | Paper Reference: 12 | An Input-aware Factorization Machine for Sparse Prediction (IJCAI'19) 13 | https://dl.acm.org/doi/10.5555/3367032.3367240 14 | """ 15 | 16 | class IFM(BaseRanker): 17 | 18 | def _get_dataset_class(): 19 | return TripletDataset 20 | 21 | def _init_model(self, train_data, drop_unused_field=True): 22 | super()._init_model(train_data, drop_unused_field) 23 | self.linear = ctr.LinearLayer(self.fields, train_data) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | model_config = self.config['model'] 26 | num_fields = self.embedding.num_features 27 | self.fen = ctr.MLPModule( 28 | [num_fields * self.embed_dim] + model_config['mlp_layer'], 29 | model_config['activation'], 30 | model_config['dropout'], 31 | batch_norm=model_config['batch_norm']) 32 | self.fen.add_modules( 33 | nn.Linear(model_config['mlp_layer'][-1], num_fields, bias=False), 34 | nn.Softmax(dim=-1)) 35 | self.fm = ctr.FMLayer(reduction='sum') 36 | 37 | def score(self, batch): 38 | emb = self.embedding(batch) 39 | weight = self.fen(emb.flatten(1)) 40 | lr_score = (super(ctr.LinearLayer, self.linear).forward(batch).squeeze(-1) * weight).sum(-1) + self.linear.bias 41 | fm_score = self.fm(emb * weight.unsqueeze(-1)) 42 | return {'score' : lr_score + fm_score} 43 | 44 | def _get_loss_func(self): 45 | return loss_func.BCEWithLogitLoss() 46 | -------------------------------------------------------------------------------- /recstudio/model/fm/interhat.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | InterHAT 10 | ###################### 11 | 12 | Paper Reference: 13 | Interpretable Click-Through Rate Prediction through Hierarchical Attention (WSDM'20) 14 | https://dl.acm.org/doi/10.1145/3336191.3371785 15 | """ 16 | 17 | class InterHAT(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 25 | model_config = self.config['model'] 26 | self.trm = nn.TransformerEncoderLayer( 27 | self.embed_dim, model_config['n_head'], 28 | model_config['feedforward_dim'], 29 | model_config['dropout'], 30 | model_config['activation'], 31 | batch_first=True) 32 | self.aggs = nn.ModuleList([ 33 | ctr.AttentionalAggregation( 34 | self.embed_dim, 35 | model_config['aggregation_dim']) 36 | for _ in range(model_config['order'] + 1) 37 | ]) 38 | self.mlp = MLPModule([self.embed_dim] + model_config['mlp_layer'] + [1], 39 | model_config['activation'], 40 | model_config['dropout'], 41 | last_activation=False, 42 | last_bn=False 43 | ) 44 | 45 | def score(self, batch): 46 | emb = self.embedding(batch) 47 | xi = x1 = self.trm(emb) 48 | U = [] 49 | for i, agg in enumerate(self.aggs[:-1]): 50 | ui = agg(xi, xi) 51 | U.append(ui) 52 | if i < self.config['model']['order']: 53 | xi = ui.unsqueeze(1) * x1 + xi 54 | U = torch.stack(U, dim=1) 55 | uf = self.aggs[-1](U, U) 56 | score = self.mlp(uf).squeeze(-1) 57 | return {'score' : score} 58 | 59 | def _get_loss_func(self): 60 | return loss_func.BCEWithLogitLoss() 61 | -------------------------------------------------------------------------------- /recstudio/model/fm/lorentzfm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr 7 | 8 | r""" 9 | LorentzFM 10 | ###################### 11 | 12 | Paper Reference: 13 | Learning Feature Interactions with Lorentzian Factorization Machine (AAAI'20) 14 | https://arxiv.org/pdf/1911.09821 15 | """ 16 | 17 | class LorentzFM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | self.lfm = nn.Sequential(OrderedDict([ 25 | ("embeddings", 26 | ctr.Embeddings(fields=self.fields, embed_dim=self.embed_dim, data=train_data)), 27 | ("triangle_pooling_layer", 28 | ctr.TrianglePoolingLayer((len(self.fields) - 1))) 29 | ])) 30 | 31 | def score(self, batch): 32 | lfm_score = self.lfm(batch) 33 | return {'score' : lfm_score} 34 | 35 | def _get_loss_func(self): 36 | return loss_func.BCEWithLogitLoss() 37 | -------------------------------------------------------------------------------- /recstudio/model/fm/lr.py: -------------------------------------------------------------------------------- 1 | from ..basemodel import BaseRanker 2 | from ..module import ctr 3 | from ..loss_func import BCEWithLogitLoss 4 | from recstudio.data.dataset import TripletDataset 5 | 6 | 7 | class LR(BaseRanker): 8 | 9 | def _get_dataset_class(): 10 | return TripletDataset 11 | 12 | def _init_model(self, train_data, drop_unused_field=True): 13 | super()._init_model(train_data, drop_unused_field) 14 | self.linear = ctr.LinearLayer(self.fields, train_data) 15 | 16 | def _get_loss_func(self): 17 | return BCEWithLogitLoss() 18 | 19 | def score(self, batch): 20 | return {'score' : self.linear(batch)} 21 | -------------------------------------------------------------------------------- /recstudio/model/fm/masknet.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | from .. 
import loss_func 3 | from ..basemodel import BaseRanker 4 | from ..module import ctr 5 | 6 | r""" 7 | MaskNet 8 | ###################### 9 | 10 | Paper Reference: 11 | MaskNet: Introducing Feature-Wise Multiplication to CTR Ranking Models by Instance-Guided Mask (DLP KDD'21) 12 | https://arxiv.org/abs/2102.07619 13 | """ 14 | 15 | class MaskNet(BaseRanker): 16 | 17 | def _get_dataset_class(): 18 | return TripletDataset 19 | 20 | def _init_model(self, train_data, drop_unused_field=True): 21 | super()._init_model(train_data, drop_unused_field) 22 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 23 | model_config = self.config['model'] 24 | if model_config['parallel']: 25 | self.masknet = ctr.ParallelMaskNet( 26 | self.embedding.num_features, 27 | self.embed_dim, 28 | model_config['num_blocks'], 29 | model_config['block_dim'], 30 | model_config['reduction_ratio'], 31 | model_config['mlp_layer'], 32 | model_config['activation'], 33 | model_config['dropout'], 34 | model_config['hidden_layer_norm']) 35 | else: 36 | self.masknet = ctr.SerialMaskNet( 37 | self.embedding.num_features, 38 | self.embed_dim, 39 | model_config['block_dim'], 40 | model_config['reduction_ratio'], 41 | model_config['activation'], 42 | model_config['dropout'], 43 | model_config['hidden_layer_norm']) 44 | 45 | def score(self, batch): 46 | emb = self.embedding(batch) 47 | score = self.masknet(emb).squeeze(-1) 48 | return {'score' : score} 49 | 50 | def _get_loss_func(self): 51 | return loss_func.BCEWithLogitLoss() 52 | -------------------------------------------------------------------------------- /recstudio/model/fm/nfm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.model.basemodel import BaseRanker 4 | from recstudio.model.module import ctr, MLPModule 5 | from recstudio.data.dataset import TripletDataset 6 | from ..loss_func import BCEWithLogitLoss 7 | 8 | 9 | class NFM(BaseRanker): 10 | 11 | def _get_dataset_class(): 12 | return TripletDataset 13 | 14 | def _init_model(self, train_data, drop_unused_field=True): 15 | super()._init_model(train_data, drop_unused_field) 16 | self.linear = ctr.LinearLayer(self.fields, train_data) 17 | model_config = self.config['model'] 18 | self.nfm = nn.Sequential( 19 | OrderedDict([ 20 | ("embedding", 21 | ctr.Embeddings( 22 | self.fields, 23 | self.embed_dim, 24 | train_data)), 25 | ("fm_layer", 26 | ctr.FMLayer()), 27 | ("batch_norm", 28 | nn.BatchNorm1d(self.embed_dim)), 29 | ("mlp", 30 | MLPModule( 31 | [self.embed_dim]+model_config['mlp_layer']+[1], 32 | model_config['activation'], 33 | model_config['dropout'], 34 | batch_norm=model_config['batch_norm'], 35 | last_activation=False, last_bn=False)) 36 | ])) 37 | 38 | def score(self, batch): 39 | linear_score = self.linear(batch) 40 | mlp_score = self.nfm(batch).squeeze(-1) 41 | return {'score' : linear_score + mlp_score} 42 | 43 | def _get_loss_func(self): 44 | return BCEWithLogitLoss() 45 | -------------------------------------------------------------------------------- /recstudio/model/fm/onn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from .. 
import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | ONN 10 | ###################### 11 | 12 | Paper Reference: 13 | Operation-aware Neural Networks for user response prediction (Neural Networks'20) 14 | https://dl.acm.org/doi/10.1016/j.neunet.2019.09.020 15 | """ 16 | 17 | class ONN(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | num_fields = len(self.fields) - 1 25 | model_config = self.config['model'] 26 | self.onn = nn.Sequential( 27 | OrderedDict([ 28 | ("embedding", 29 | ctr.Embeddings( 30 | self.fields, 31 | self.embed_dim * num_fields, 32 | train_data)), 33 | ("ofm_layer", 34 | ctr.OperationAwareFMLayer( 35 | num_fields 36 | )), 37 | ("mlp", 38 | MLPModule( 39 | [num_fields * self.embed_dim + num_fields * (num_fields - 1) // 2] + model_config['mlp_layer'] + [1], 40 | model_config['activation'], 41 | model_config['dropout'], 42 | batch_norm=model_config['batch_norm'], 43 | last_activation=False, last_bn=False)) 44 | ])) 45 | 46 | 47 | def score(self, batch): 48 | onn_score = self.onn(batch).squeeze(-1) 49 | return {'score' : onn_score} 50 | 51 | def _get_loss_func(self): 52 | return loss_func.BCEWithLogitLoss() 53 | -------------------------------------------------------------------------------- /recstudio/model/fm/ppnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from recstudio.data.dataset import TripletDataset 4 | from .. import loss_func 5 | from ..basemodel import BaseRanker 6 | from ..module import ctr, MLPModule 7 | 8 | r""" 9 | PPNet 10 | ###################### 11 | 12 | Parameter Personalized Network (PPNet), used in Kuaishou's ranking system since 2019. 13 | """ 14 | 15 | class BCEWithLogitLossWithAux(loss_func.BCEWithLogitLoss): 16 | def forward(self, aux_score, label, pos_score): 17 | return super().forward(label, aux_score) + super().forward(label, pos_score) 18 | 19 | class PPNet(BaseRanker): 20 | 21 | def _get_dataset_class(): 22 | return TripletDataset 23 | 24 | def _init_model(self, train_data, drop_unused_field=True): 25 | super()._init_model(train_data, drop_unused_field) 26 | model_config = self.config['model'] 27 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 28 | self.mlp = MLPModule([self.embedding.num_features*self.embed_dim] + model_config['mlp_layer'] + [1], 29 | model_config['activation'], 30 | model_config['dropout'], 31 | last_activation=False, 32 | last_bn=False 33 | ) 34 | if model_config['id_fields'] is None: 35 | id_fields = [] 36 | if self.fuid is not None: 37 | id_fields.append(self.fuid) 38 | if self.fiid is not None: 39 | id_fields.append(self.fiid) 40 | if len(id_fields) == 0: 41 | raise ValueError('Expect id_fields, but got None.') 42 | else: 43 | id_fields = model_config['id_fields'] 44 | self.id_embedding = ctr.Embeddings(id_fields, model_config['id_embed_dim'], train_data) 45 | pp_hidden_dims = [self.embedding.num_features*self.embed_dim] + model_config['pp_hidden_dims'] 46 | self.ppnet = nn.ModuleList([ 47 | ctr.PPLayer( 48 | pp_hidden_dims[i : i + 2], 49 | self.embedding.num_features*self.embed_dim + len(id_fields)*model_config['id_embed_dim'], 50 | model_config['gate_hidden_dims'][i], 51 | model_config['activation'], 52 | model_config['dropout'], 53 | model_config['batch_norm']) 54 | for i in range(len(pp_hidden_dims) - 1) 55 | ]) 56 | self.fc = nn.Linear(pp_hidden_dims[-1], 1)
57 | 58 | def score(self, batch): 59 | emb = self.embedding(batch) 60 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 61 | 62 | id_emb = self.id_embedding(batch) 63 | gate_in = torch.cat([emb.flatten(1).detach(), id_emb.flatten(1)], dim=-1) 64 | mlp_in = emb.flatten(1).detach() 65 | for pplayer in self.ppnet: 66 | mlp_in = pplayer(gate_in, mlp_in) 67 | ppnet_score = self.fc(mlp_in).squeeze(-1) 68 | return {'aux_score' : mlp_score, 'score': ppnet_score} 69 | 70 | def _get_loss_func(self): 71 | return BCEWithLogitLossWithAux() 72 | 73 | def training_step(self, batch): 74 | y_h, output = self.forward(batch) 75 | loss = self.loss_fn(output['aux_score'], **y_h) 76 | return loss 77 | -------------------------------------------------------------------------------- /recstudio/model/fm/sam.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from collections import OrderedDict 3 | from recstudio.data.dataset import TripletDataset 4 | from ..basemodel import BaseRanker 5 | from ..loss_func import BCEWithLogitLoss 6 | from ..module import ctr, LambdaLayer 7 | 8 | r""" 9 | SAM 10 | ###################### 11 | 12 | Paper Reference: 13 | Looking at CTR Prediction Again: Is Attention All You Need? (SIGIR'21) 14 | https://dl.acm.org/doi/10.1145/3404835.3462936 15 | """ 16 | 17 | class SAM(BaseRanker): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _init_model(self, train_data, drop_unused_field=True): 23 | super()._init_model(train_data, drop_unused_field) 24 | num_fields = len(self.fields) - 1 25 | model_config = self.config['model'] 26 | fi = model_config['interaction_type'].lower() 27 | self.sam = nn.Sequential(OrderedDict([ 28 | ('embedding', 29 | ctr.Embeddings(self.fields, self.embed_dim, train_data)), 30 | ('interaction', 31 | ctr.SAMFeatureInteraction( 32 | fi, 33 | self.embed_dim, 34 | num_fields, 35 | model_config['dropout'])) 36 | ])) 37 | if fi == 'sam1': 38 | self.sam.add_module('agg', nn.Flatten(start_dim=1)) 39 | self.sam.add_module('fc', nn.Linear(num_fields * self.embed_dim, 1)) 40 | elif fi in ['sam2a', 'sam2e']: 41 | self.sam.add_module('agg', nn.Flatten(start_dim=1)) 42 | self.sam.add_module('fc', nn.Linear(num_fields * num_fields * self.embed_dim, 1)) 43 | else: 44 | self.sam.add_module('agg', nn.Sequential( 45 | LambdaLayer(lambda x: x.transpose(1, 2)), 46 | nn.Linear(num_fields, 1, bias=False), 47 | LambdaLayer(lambda x: x.sum(-1)))) 48 | self.sam.add_module('fc', nn.Linear(self.embed_dim, 1)) 49 | 50 | 51 | def score(self, batch): 52 | score = self.sam(batch).squeeze(-1) 53 | return {'score': score} 54 | 55 | def _get_loss_func(self): 56 | return BCEWithLogitLoss() 57 | -------------------------------------------------------------------------------- /recstudio/model/fm/widedeep.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | from ..basemodel import BaseRanker 3 | from ..loss_func import BCEWithLogitLoss 4 | from ..module import ctr, MLPModule 5 | 6 | 7 | class WideDeep(BaseRanker): 8 | 9 | def _get_dataset_class(): 10 | return TripletDataset 11 | 12 | def _init_model(self, train_data, drop_unused_field=True): 13 | super()._init_model(train_data, drop_unused_field) 14 | self.linear = ctr.LinearLayer(self.fields, train_data) 15 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 16 | model_config = self.config['model'] 17 | self.mlp = MLPModule( 18 |
[self.embedding.num_features*self.embed_dim]+model_config['mlp_layer']+[1], 19 | activation_func = model_config['activation'], 20 | dropout = model_config['dropout'], 21 | batch_norm = model_config['batch_norm'], 22 | last_activation = False, last_bn=False) 23 | 24 | def score(self, batch): 25 | wide_score = self.linear(batch) 26 | emb = self.embedding(batch) 27 | deep_score = self.mlp(emb.flatten(1)).squeeze(-1) 28 | return {'score' : wide_score + deep_score} 29 | 30 | def _get_loss_func(self): 31 | return BCEWithLogitLoss() 32 | -------------------------------------------------------------------------------- /recstudio/model/fm/xdeepfm.py: -------------------------------------------------------------------------------- 1 | from recstudio.data.dataset import TripletDataset 2 | 3 | from ..basemodel import BaseRanker 4 | from ..loss_func import BCEWithLogitLoss 5 | from ..module import ctr, MLPModule 6 | 7 | 8 | class xDeepFM(BaseRanker): 9 | 10 | def _get_dataset_class(): 11 | return TripletDataset 12 | 13 | def _init_model(self, train_data, drop_unused_field=True): 14 | super()._init_model(train_data, drop_unused_field) 15 | self.linear = ctr.LinearLayer(self.fields, train_data) 16 | self.fm = ctr.FMLayer(reduction='sum') 17 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 18 | model_config = self.config['model'] 19 | self.cin = ctr.CIN(self.embed_dim, self.embedding.num_features, 20 | model_config['cin_layer_size'], model_config['activation'], 21 | direct=model_config['direct']) 22 | self.mlp = MLPModule([self.embedding.num_features*self.embed_dim]+model_config['mlp_layer']+[1], 23 | model_config['activation'], model_config['dropout'], 24 | last_activation=False, last_bn=False) 25 | 26 | def score(self, batch): 27 | lr_score = self.linear(batch) 28 | emb = self.embedding(batch) 29 | cin_score = self.cin(emb).squeeze(-1) 30 | mlp_score = self.mlp(emb.flatten(1)).squeeze(-1) 31 | return {'score' : lr_score + cin_score + mlp_score} 32 | 33 | def _get_loss_func(self): 34 | return BCEWithLogitLoss() 35 | -------------------------------------------------------------------------------- /recstudio/model/graph/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.graph.ngcf import NGCF 2 | from recstudio.model.graph.lightgcn import LightGCN -------------------------------------------------------------------------------- /recstudio/model/graph/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_ratio: [0.8, 0.1, 0.1] 3 | 4 | train: 5 | early_stop_patience: 100 6 | epochs: 1000 7 | learning_rate: 0.001 8 | negative_count: 1 9 | 10 | eval: 11 | batch_size: 128 12 | cutoff: [20, 10, 5] 13 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/lightgcn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | n_layers: 3 3 | l2_reg_weight: 1e-4 4 | 5 | train: 6 | weight_decay: 0 7 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/ncl.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | n_layers: 3 3 | hyper_layers: 1 4 | num_clusters: 10 5 | l2_reg_weight: 1e-4 6 | temperature: 0.05 7 | 8 | # In the original paper, the contrastive loss is not averaged within each batch, 9 | # so lambda would have to be re-tuned whenever the batch size changes.
10 | # Here lambda has already been multiplied by the batch size and the contrastive loss is averaged over the batch, 11 | # so lambda does not change with the batch size. 12 | ssl_reg: 0.005 # 1e-6 * 4096. 13 | alpha: 0.5 14 | proto_reg: 0.0002 # 5e-8 * 4096 15 | 16 | # ml-1m 17 | # num_clusters: 1000 18 | # l2_reg_weight: 1e-4 19 | # temperature: 0.1 20 | # ssl_reg: 5e4 # 1e-7 21 | # alpha: 1 22 | # proto_reg: 3e4 # 8e-8 23 | 24 | eval: 25 | val_metrics: [recall, ndcg] 26 | 27 | train: 28 | num_m_epoch: 1 29 | warm_up_epoch: 20 30 | batch_size: 2048 31 | learning_rate: 2e-3 32 | 33 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/ngcf.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_ratio: [0.8, 0.1, 0.1] 3 | 4 | eval: 5 | batch_size: 128 6 | cutoff: [20, 10, 5] 7 | 8 | model: 9 | embed_dim: 64 10 | layer_size: [64, 64, 64, 64] 11 | mess_dropout: [0.1, 0.1, 0.1] 12 | node_dropout: 0.1 13 | l2_reg_weight: 1e-5 14 | 15 | train: 16 | batch_size: 2048 17 | early_stop_patience: 100 18 | learning_rate: 0.0001 19 | negative_count: 1 20 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/sgl.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_ratio: [0.8, 0.1, 0.1] 3 | 4 | eval: 5 | batch_size: 128 6 | cutoff: [20, 10, 5] 7 | 8 | model: 9 | aug_type: 'ED' 10 | embed_dim: 64 11 | n_layers: 3 12 | l2_reg_weight: 1e-4 13 | negative_count: 1 14 | ssl_ratio: 0.1 15 | ssl_reg: 0.1 16 | temperature: 0.2 17 | 18 | train: 19 | batch_size: 2048 20 | early_stop_patience: 100 21 | epochs: 1000 22 | learning_rate: 0.001 23 | -------------------------------------------------------------------------------- /recstudio/model/graph/config/simgcl.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_ratio: [0.8, 0.1, 0.1] 3 | 4 | eval: 5 | cutoff: [20, 10, 5] 6 | 7 | model: 8 | embed_dim: 64 9 | eps: 0.1 10 | n_layers: 3 11 | 12 | l2_reg_weight: 1e-4 13 | cl_neg_type: all 14 | cl_weight: 0.5 15 | temperature: 0.2 16 | 17 | 18 | train: 19 | batch_size: 2048 20 | early_stop_patience: 100 21 | epochs: 1000 22 | eval_batch_size: 128 23 | learning_rate: 0.001 24 | -------------------------------------------------------------------------------- /recstudio/model/init.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.init import xavier_normal_, xavier_uniform_, constant_ 3 | 4 | 5 | def xavier_normal_initialization(module): 6 | if isinstance(module, nn.Embedding): 7 | xavier_normal_(module.weight.data) 8 | if module.padding_idx is not None: 9 | constant_(module.weight.data[module.padding_idx], 0.) 10 | elif isinstance(module, nn.Linear): 11 | xavier_normal_(module.weight.data) 12 | if module.bias is not None: 13 | constant_(module.bias.data, 0) 14 | elif isinstance(module, nn.LayerNorm): 15 | module.bias.data.zero_() 16 | module.weight.data.fill_(1.0) 17 | 18 | class normal_initialization(object): 19 | def __init__(self, initial_range=0.02) -> None: 20 | super().__init__() 21 | self.initial_range = initial_range 22 | 23 | def __call__(self, module): 24 | if isinstance(module, nn.Embedding): 25 | module.weight.data.normal_(mean=0.0, std=self.initial_range) 26 | if module.padding_idx is not None: 27 | constant_(module.weight.data[module.padding_idx], 0.)
28 | elif isinstance(module, nn.Linear): 29 | module.weight.data.normal_(mean=0.0, std=self.initial_range) 30 | if module.bias is not None: 31 | module.bias.data.zero_() 32 | elif isinstance(module, nn.LayerNorm): 33 | module.bias.data.zero_() 34 | module.weight.data.fill_(1.0) 35 | 36 | def xavier_uniform_initialization(module): 37 | if isinstance(module, nn.Embedding): 38 | xavier_uniform_(module.weight.data) 39 | if module.padding_idx is not None: 40 | constant_(module.weight.data[module.padding_idx], 0.) 41 | elif isinstance(module, nn.Linear): 42 | xavier_uniform_(module.weight.data) 43 | if module.bias is not None: 44 | constant_(module.bias.data, 0) 45 | elif isinstance(module, nn.LayerNorm): 46 | module.bias.data.zero_() 47 | module.weight.data.fill_(1.0) 48 | 49 | -------------------------------------------------------------------------------- /recstudio/model/kg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/model/kg/__init__.py -------------------------------------------------------------------------------- /recstudio/model/kg/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | kg_network_index: 1 # the index of the knowledge graph network in the dataset configuration file. -------------------------------------------------------------------------------- /recstudio/model/kg/config/cfkg.yaml: -------------------------------------------------------------------------------- 1 | 2 | train: 3 | negative_count: 1 4 | 5 | model: 6 | embed_dim: 64 7 | margin: 1.0 -------------------------------------------------------------------------------- /recstudio/model/kg/config/cke.yaml: -------------------------------------------------------------------------------- 1 | 2 | train: 3 | negative_count: 1 4 | weight_decay: 0.000001 5 | 6 | model: 7 | embed_dim: 150 8 | pro_embed_dim: 150 9 | normalize: True -------------------------------------------------------------------------------- /recstudio/model/kg/config/kgat.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | batch_size: 2048 3 | negative_count: 1 4 | weight_decay: 1e-6 5 | 6 | 7 | model: 8 | embed_dim: 64 9 | alg_type: bi 10 | pro_embed_dim: 64 11 | layer_size: [64, 16] 12 | mess_dropout: [0.1, 0.1] 13 | n_fold: 100 14 | -------------------------------------------------------------------------------- /recstudio/model/kg/config/kgcn.yaml: -------------------------------------------------------------------------------- 1 | 2 | data: 3 | fmeval: True 4 | low_rating_thres: 0.0 5 | binarized_rating_thres: 3.0 6 | 7 | eval: 8 | val_metrics: [auc, logloss] 9 | test_metrics: [auc, logloss] 10 | 11 | train: 12 | embed_dim: 64 13 | weight_decay: 1e-7 14 | 15 | model: 16 | neighbor_sample_size: 4 17 | n_iter: 2 18 | aggregator_type: sum -------------------------------------------------------------------------------- /recstudio/model/kg/config/kgin.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 3 | learning_rate: 0.0001 4 | 5 | model: 6 | embed_dim: 64 7 | num_factors: 4 8 | l2_reg: 1e-5 9 | sim_regularity: 1e-4 10 | intents_indep: distance 11 | num_layers: 2 12 | 13 | # dropout 14 | node_dropout: 0.2 15 | mess_dropout: 0.1 -------------------------------------------------------------------------------- 
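Note: the per-model YAML files in this directory only list overrides; shared defaults such as `kg_network_index` in `kg/config/all.yaml` still apply. A minimal sketch of that overlay with PyYAML, assuming a simple recursive dict merge (`deep_update` is a hypothetical helper; RecStudio's actual config loader may differ):

```python
# Illustrative only: overlay a model-specific config onto shared defaults.
# `deep_update` is a hypothetical helper, not part of RecStudio's API.
import yaml

def deep_update(base: dict, override: dict) -> dict:
    """Recursively overlay `override` onto `base`, mutating and returning `base`."""
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

with open("recstudio/model/kg/config/all.yaml") as f:
    config = yaml.safe_load(f) or {}
with open("recstudio/model/kg/config/kgin.yaml") as f:
    config = deep_update(config, yaml.safe_load(f) or {})

print(config["data"]["kg_network_index"])  # shared default: 1
print(config["model"]["num_factors"])      # KGIN override: 4
```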
/recstudio/model/kg/config/kgnnls.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | fmeval: True 3 | kg_network_index: 1 4 | low_rating_thres: 0.0 5 | binarized_rating_thres: 3.0 6 | 7 | 8 | eval: 9 | val_metrics: [auc, logloss] 10 | test_metrics: [auc, logloss] 11 | 12 | model: 13 | embed_dim: 64 14 | weight_decay: 1e-7 15 | 16 | n_iter: 2 17 | neighbor_sample_size: 4 18 | aggregator_type: sum 19 | ls_weight: 1.0 -------------------------------------------------------------------------------- /recstudio/model/kg/config/ktup.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 3 | weight_decay: 1e-6 4 | 5 | model: 6 | embed_dim: 100 7 | train_rec_step: 5 8 | train_kg_step: 5 9 | use_st_gumbel: True 10 | L1_flag: False 11 | margin: 1.0 12 | kg_weight: 0.5 -------------------------------------------------------------------------------- /recstudio/model/kg/config/mkr.yaml: -------------------------------------------------------------------------------- 1 | 2 | train: 3 | negative_count: 1 4 | weight_decay: 1e-6 5 | 6 | model: 7 | # learning_rate_kg: 0.001 8 | embed_dim: 64 9 | kge_interval: 3 10 | use_inner_product: True 11 | dropout: 0.0 12 | L: 1 13 | H: 1 14 | -------------------------------------------------------------------------------- /recstudio/model/kg/config/ripplenet.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | kg_network_index: 1 3 | fmeval: True 4 | low_rating_thres: 0.0 5 | binarized_rating_thres: 4.0 6 | 7 | eval: 8 | val_metrics: [auc, logloss] 9 | test_metrics: [auc, logloss] 10 | 11 | train: 12 | weight_decay: 1e-6 13 | batch_size: 2048 14 | negative_count: 1 15 | 16 | model: 17 | embed_dim: 64 18 | using_all_hops: True 19 | item_update_mode: plus_transform # how to update item at the end of each hop 20 | kge_weight: 0.01 21 | n_memory: 16 # size of ripple set for each hop 22 | n_hop: 2 # maximum hops 23 | -------------------------------------------------------------------------------- /recstudio/model/mf/__init__.py: -------------------------------------------------------------------------------- 1 | from .bpr import BPR 2 | from .cml import CML 3 | from .ease import EASE 4 | from .irgan import IRGAN 5 | from .itemknn import ItemKNN 6 | from .logisticmf import LogisticMF 7 | from .ncf import NCF 8 | from .slim import SLIM 9 | from .wrmf import WRMF 10 | from .dssm import DSSM -------------------------------------------------------------------------------- /recstudio/model/mf/bpr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | 6 | 7 | class BPR(basemodel.BaseRetriever): 8 | 9 | def _get_dataset_class(): 10 | return dataset.TripletDataset 11 | 12 | def _get_item_encoder(self, train_data): 13 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 14 | 15 | def _get_query_encoder(self, train_data): 16 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 17 | 18 | def _get_score_func(self): 19 | return scorer.InnerProductScorer() 20 | 21 | def _get_loss_func(self): 22 | return loss_func.BPRLoss() 23 | 24 | def _get_sampler(self, train_data): 25 | return sampler.UniformSampler(train_data.num_items) 26 | 
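The pieces above reduce BPR to maximizing log sigmoid(s(u, i+) - s(u, i-)) with inner-product scores and uniformly sampled negatives. A self-contained sketch of that objective in plain PyTorch, with illustrative toy sizes (not RecStudio's internal training loop):

```python
# Minimal BPR objective: inner-product scores, uniform negatives, pairwise loss.
import torch
import torch.nn.functional as F

num_users, num_items, dim = 100, 500, 64
user_emb = torch.nn.Embedding(num_users, dim)
item_emb = torch.nn.Embedding(num_items, dim)

u = torch.randint(0, num_users, (32,))    # batch of users
pos = torch.randint(0, num_items, (32,))  # observed (positive) items
neg = torch.randint(0, num_items, (32,))  # uniformly sampled negatives

pos_score = (user_emb(u) * item_emb(pos)).sum(-1)   # cf. InnerProductScorer
neg_score = (user_emb(u) * item_emb(neg)).sum(-1)
loss = -F.logsigmoid(pos_score - neg_score).mean()  # cf. BPRLoss
loss.backward()
```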
-------------------------------------------------------------------------------- /recstudio/model/mf/cml.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import advance_dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | 6 | 7 | class CML(basemodel.BaseRetriever): 8 | 9 | def _get_dataset_class(): 10 | return advance_dataset.ALSDataset 11 | 12 | def _get_item_encoder(self, train_data): 13 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 14 | 15 | def _get_query_encoder(self, train_data): 16 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 17 | 18 | def _get_score_func(self): 19 | return scorer.EuclideanScorer() 20 | 21 | def _get_loss_func(self, train_data): 22 | class CMLoss(loss_func.PairwiseLoss): 23 | def __init__(self, margin=2, use_rank_weight=False, n_items: int=None): 24 | super().__init__() 25 | self.margin = margin 26 | self.use_rank_weight = use_rank_weight 27 | self.n_items = n_items - 1 # remove padding 28 | 29 | def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): 30 | pos_score[pos_score == -float("inf")] = float("inf") # padded positives (-inf) are flipped to +inf so they incur zero hinge loss 31 | loss = torch.max(torch.max(neg_score, dim=-1).values.unsqueeze(-1) \ 32 | - pos_score + self.margin, pos_score.new_zeros(pos_score.size(1))) 33 | if self.use_rank_weight: 34 | impostors = neg_score.unsqueeze(1) - pos_score.unsqueeze(-1) + self.margin > 0 35 | rank = torch.mean(impostors.to(torch.float32), -1) * self.n_items 36 | return torch.mean(loss * torch.log(rank + 1)) 37 | else: 38 | return torch.mean(loss) 39 | return CMLoss(self.config['model']['margin'], self.config['model']['use_rank_weight'], train_data.num_items) 40 | 41 | 42 | def _get_sampler(self, train_data): 43 | return sampler.UniformSampler(train_data.num_items) 44 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | split_mode: user_entry # user # entry 3 | fmeval: False 4 | binarized_rating_thres: 0.0 5 | 6 | eval: 7 | batch_size: 20 8 | 9 | model: 10 | embed_dim: 64 11 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/bpr.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 3 | excluding_hist: False 4 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/cml.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | learning_rate: 0.01 3 | negative_count: 5 4 | excluding_hist: False 5 | 6 | model: 7 | margin: 1 8 | use_rank_weight: ~ 9 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/dssm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 64 3 | mlp_layer: [128, 128, 128] 4 | activation: tanh 5 | dropout: 0.3 6 | batch_norm: False 7 | 8 | train: 9 | negative_count: 1 10 | 11 | eval: 12 | cutoff: [10, 20, 50] -------------------------------------------------------------------------------- /recstudio/model/mf/config/ease.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | epochs: 1 3 | gpu: ~ 4 | lambda:
250 -------------------------------------------------------------------------------- /recstudio/model/mf/config/irgan.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 64 3 | sample_lambda: 0.2 4 | T_dis: 0.2 5 | T_gen: 1 6 | 7 | train: 8 | batch_size: 16 9 | early_stop_patience: 100 10 | epochs: 1000 11 | every_n_epoch_gen: 2 12 | every_n_epoch_dis: 5 13 | learning_rate_dis: 0.001 14 | learning_rate_gen: 0.001 15 | negative_count: 1 16 | weight_decay_dis: 0.0001 17 | weight_decay_gen: 0.0001 18 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/itemknn.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | epochs: 1 3 | gpu: ~ 4 | knn: 100 5 | similarity: cosine #| jaccard 6 | -------------------------------------------------------------------------------- /recstudio/model/mf/config/logisticmf.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | alpha: 0.5 3 | excluding_hist: False 4 | lambda: 0.01 5 | learner: adagrad 6 | learning_rate: 0.01 7 | negative_count: 10 -------------------------------------------------------------------------------- /recstudio/model/mf/config/ncf.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | activation: relu 3 | dropout: 0.1 4 | mlp_hidden_size: [128, 64] 5 | score_mode: fusion 6 | 7 | train: 8 | excluding_hist: False 9 | negative_count: 1 -------------------------------------------------------------------------------- /recstudio/model/mf/config/pmf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/USTCLLM/RecStudio/9114975b8e9ec85bce16c1ed8abbf0e194e4afb3/recstudio/model/mf/config/pmf.yaml -------------------------------------------------------------------------------- /recstudio/model/mf/config/slim.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | alpha: 1 3 | epochs: 1 4 | gpu: ~ 5 | knn: 100 6 | l1_ratio: 0.1 7 | positive_only: True -------------------------------------------------------------------------------- /recstudio/model/mf/config/wrmf.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | alpha: 1 3 | batch_size: 100 4 | lambda: 0.5 -------------------------------------------------------------------------------- /recstudio/model/mf/dssm.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | from recstudio.data import TripletDataset 5 | 6 | from .. 
import basemodel, loss_func, scorer 7 | from ..module import LambdaLayer, MLPModule, ctr 8 | 9 | 10 | class DSSM(basemodel.BaseRetriever): 11 | 12 | def _set_data_field(self, data): 13 | data.use_field = data.field 14 | 15 | def _get_dataset_class(): 16 | return TripletDataset 17 | 18 | def _get_query_encoder(self, train_data): 19 | if len(self.query_fields) == 1 and list(self.query_fields)[0] == self.fuid: 20 | embedding = torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 21 | mlp_input_dim = self.embed_dim 22 | else: 23 | embedding = ctr.Embeddings( 24 | fields=self.query_fields, 25 | data=train_data, 26 | embed_dim=self.embed_dim) 27 | mlp_input_dim = embedding.num_features * self.embed_dim 28 | model_config = self.config['model'] 29 | mlp = MLPModule( 30 | [mlp_input_dim] + model_config['mlp_layer'], 31 | dropout=model_config['dropout'], activation_func=model_config['activation'], 32 | batch_norm=model_config['batch_norm']) 33 | return torch.nn.Sequential( 34 | OrderedDict( 35 | {'embedding': embedding, 36 | 'flatten': LambdaLayer(lambda x: x.view(x.size(0), -1)), 37 | 'MLP': mlp})) 38 | 39 | def _get_item_encoder(self, train_data): 40 | if len(self.item_fields) == 1 and list(self.item_fields)[0] == self.fiid: 41 | embedding = torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 42 | mlp_input_dim = self.embed_dim 43 | flatten_layer = LambdaLayer(lambda x: x) 44 | else: 45 | embedding = ctr.Embeddings( 46 | fields=self.item_fields, 47 | data=train_data, 48 | embed_dim=self.embed_dim, 49 | ) 50 | mlp_input_dim = embedding.num_features * self.embed_dim 51 | flatten_layer = LambdaLayer(lambda x: x.view(*x.shape[: -2], -1)) 52 | 53 | model_config = self.config['model'] 54 | mlp = MLPModule( 55 | [mlp_input_dim] + model_config['mlp_layer'], 56 | activation_func = model_config['activation'], 57 | dropout = model_config['dropout'], 58 | batch_norm = model_config['batch_norm']) 59 | return torch.nn.Sequential( 60 | OrderedDict( 61 | {'embedding': embedding, 62 | 'flatten': flatten_layer, 63 | 'MLP': mlp})) 64 | 65 | def _get_score_func(self): 66 | return scorer.InnerProductScorer() 67 | 68 | def _get_loss_func(self): 69 | return loss_func.BinaryCrossEntropyLoss() 70 | -------------------------------------------------------------------------------- /recstudio/model/mf/ease.py: -------------------------------------------------------------------------------- 1 | from typing import OrderedDict 2 | 3 | import numpy as np 4 | import torch 5 | from recstudio.data.dataset import TripletDataset 6 | from recstudio.model import basemodel 7 | 8 | 9 | class QueryEncoder(object): 10 | def __init__(self, user) -> None: 11 | self.user = user 12 | 13 | def __call__(self, batch): 14 | return self.user[batch, :] 15 | 16 | 17 | class EASE(basemodel.BaseRetriever): 18 | 19 | def _get_dataset_class(): 20 | return TripletDataset 21 | 22 | def _get_train_loaders(self, train_data): 23 | return {'user_item_matrix': train_data.get_graph(0, 'csr')[0]} 24 | 25 | def training_epoch(self, nepoch): 26 | if self.config['train']['gpu'] is not None: 27 | self.logger.warning("EASE is expected to run on CPU but a GPU is configured; automatically setting gpu to None.") 28 | self.config['train']['gpu'] = None 29 | data, iscombine = self.current_epoch_trainloaders(nepoch) 30 | R = data['user_item_matrix'] 31 | G = R.T @ R # item-item Gram matrix 32 | diagIndices = np.diag_indices_from(G) 33 | G[diagIndices] += self.config['train']['lambda'] # L2 regularization on the diagonal 34 | P = np.linalg.inv(G.todense()) 35 | B = P / (-np.diag(P)) # closed-form item weights: B = P / -diag(P) 36 | B[diagIndices] = 0 # zero the diagonal to forbid self-similarity
37 | self.item_vector = B[:, 1:] 38 | self.query_encoder.user = R 39 | return torch.tensor(np.linalg.norm(R-R*B, 'fro')) 40 | 41 | def _get_query_encoder(self, train_data): 42 | return QueryEncoder(None) 43 | 44 | def _get_score_func(self): 45 | def scorer(query, items): 46 | return torch.from_numpy((query @ items).A) 47 | return scorer 48 | 49 | def _get_loss_func(self): 50 | return None 51 | 52 | def _get_item_encoder(self, train_data): 53 | return None 54 | 55 | def _get_sampler(self, train_data): 56 | return None 57 | 58 | def _get_optimizers(self): 59 | return None 60 | 61 | def _get_item_vector(self): 62 | return self.item_vector 63 | 64 | def state_dict(self): 65 | return OrderedDict({ 66 | 'item_vector': getattr(self, 'item_vector', None), 67 | 'query_encoder': getattr(self, 'query_encoder', None) 68 | }) 69 | 70 | def load_state_dict(self, state_dict: OrderedDict): 71 | for k, v in state_dict.items(): 72 | setattr(self, k, v) 73 | -------------------------------------------------------------------------------- /recstudio/model/mf/itemknn.py: -------------------------------------------------------------------------------- 1 | from recstudio.model import basemodel 2 | from recstudio.model.mf.ease import EASE 3 | import scipy.sparse as sp 4 | import numpy as np 5 | import torch 6 | class ItemKNN(EASE): 7 | 8 | def training_epoch(self, nepoch): 9 | config = self.config['train'] 10 | data, iscombine = self.current_epoch_trainloaders(nepoch) 11 | R = data['user_item_matrix'] 12 | item_norm = np.sqrt(R.multiply(R).sum(0).A.ravel()) 13 | item_nz = (R > 0).sum(0).A.ravel() 14 | G = R.T @ R 15 | diagIndices = np.diag_indices_from(G) 16 | G[diagIndices] = 0 17 | G.eliminate_zeros() 18 | all_col = [] 19 | all_row = [] 20 | all_val = [] 21 | for col in range(G.shape[0]): 22 | if G.indptr[col] < G.indptr[col+1]: 23 | score = G.data[G.indptr[col]:G.indptr[col+1]] 24 | rows = G.indices[G.indptr[col]:G.indptr[col+1]] 25 | if config['similarity'] == 'cosine': 26 | score = score / (item_norm[rows] * item_norm[col] + 1e-6) 27 | elif config['similarity'] == 'jaccard': 28 | score = score / (item_nz[rows] + item_nz[col] - score + 1e-6) 29 | else: 30 | raise ValueError('unsupported similarity metric') 31 | topk = config['knn'] 32 | if G.indptr[col] < G.indptr[col+1] - topk: 33 | idx = np.argpartition(score, -topk)[-topk:] 34 | rows_ = rows[idx] 35 | scores_ = score[idx] 36 | else: 37 | rows_ = rows 38 | scores_ = score 39 | all_col.extend([col] * len(scores_)) 40 | all_row.extend(rows_) 41 | all_val.extend(scores_) 42 | 43 | B = sp.csc_matrix((all_val, (all_row, all_col)), G.shape) 44 | self.item_vector = B[:, 1:] 45 | self.query_encoder.user = R 46 | return torch.tensor(0.) 
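For intuition, the EASE closed form computed in `training_epoch` above can be reproduced on a dense toy matrix in a few lines (illustrative sketch with random data; the real code keeps `R` sparse and reads lambda from `ease.yaml`):

```python
# EASE closed form: P = (R^T R + lambda * I)^-1, B = P / -diag(P), diag(B) = 0.
import numpy as np

rng = np.random.default_rng(0)
R = (rng.random((50, 20)) > 0.8).astype(np.float64)  # toy user-item matrix
lam = 250.0                                          # matches ease.yaml

G = R.T @ R
G[np.diag_indices_from(G)] += lam
P = np.linalg.inv(G)
B = P / (-np.diag(P))
B[np.diag_indices_from(B)] = 0.0                     # no self-similarity

scores = R @ B  # row u: predicted affinities of user u over all items
```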
-------------------------------------------------------------------------------- /recstudio/model/mf/logisticmf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, scorer, loss_func 5 | 6 | class LogisticMF(basemodel.BaseRetriever): 7 | 8 | def _get_dataset_class(): 9 | return dataset.TripletDataset 10 | 11 | def _get_item_encoder(self, train_data): 12 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 13 | 14 | def _get_query_encoder(self, train_data): 15 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 16 | 17 | def _get_score_func(self): 18 | return scorer.InnerProductScorer() 19 | 20 | def _get_loss_func(self): 21 | class LogitLoss(loss_func.PairwiseLoss): 22 | def __init__(self, alpha) -> None: 23 | super().__init__() 24 | self.alpha = alpha 25 | 26 | def forward(self, label, pos_score, log_pos_prob, neg_score, log_neg_prob): 27 | l1 = self.alpha * pos_score - (1+self.alpha) * torch.nn.functional.softplus(pos_score) 28 | l2 = torch.nn.functional.softplus(neg_score).mean(dim=-1) 29 | loss = (l1 - l2).mean() 30 | return -loss 31 | 32 | return LogitLoss(self.config['train']['alpha']) 33 | 34 | 35 | def _get_sampler(self, train_data): 36 | return sampler.UniformSampler(train_data.num_items) 37 | -------------------------------------------------------------------------------- /recstudio/model/mf/ncf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer, module 5 | 6 | class NCF(basemodel.BaseRetriever): 7 | 8 | def _get_dataset_class(): 9 | return dataset.TripletDataset 10 | 11 | def _get_item_encoder(self, train_data): 12 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 13 | 14 | def _get_query_encoder(self, train_data): 15 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 16 | 17 | def _get_score_func(self): 18 | model_config = self.config['model'] 19 | score_mode = model_config['score_mode'] 20 | assert score_mode in set(['mlp', 'mf', 'fusion']), \ 21 | "Only 3 score modes are supported for NCF: ['mlp', 'mf', 'fusion']" 22 | if score_mode == 'mlp': 23 | return scorer.MLPScorer(module.MLPModule( 24 | mlp_layers = [self.embed_dim*2]+model_config['mlp_hidden_size']+[1], 25 | activation_func = model_config['activation'], 26 | dropout = model_config['dropout'])) 27 | elif score_mode == 'mf': 28 | return scorer.GMFScorer(self.embed_dim, activation=model_config['activation']) 29 | else: 30 | mlp = module.MLPModule( 31 | mlp_layers = [self.embed_dim*2]+model_config['mlp_hidden_size'], 32 | activation_func = model_config['activation'], 33 | dropout = model_config['dropout']) 34 | return scorer.FusionMFMLPScorer( 35 | emb_dim = self.embed_dim, 36 | hidden_size = model_config['mlp_hidden_size'][-1], 37 | mlp = mlp, 38 | activation = model_config['activation']) 39 | 40 | 41 | def _get_loss_func(self): 42 | return loss_func.BinaryCrossEntropyLoss() 43 | 44 | def _get_sampler(self, train_data): 45 | return sampler.UniformSampler(train_data.num_items) 46 | -------------------------------------------------------------------------------- /recstudio/model/mf/pmf.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data import dataset 3 | 4 | from .. import basemodel, scorer, loss_func 5 | 6 | class PMF(basemodel.BaseRetriever): 7 | 8 | def _get_dataset_class(): 9 | return dataset.TripletDataset 10 | 11 | def _get_item_encoder(self, train_data): 12 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0) 13 | 14 | def _get_query_encoder(self, train_data): 15 | return torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0) 16 | 17 | def _get_score_func(self): 18 | return scorer.InnerProductScorer() 19 | 20 | def _get_loss_func(self): 21 | return loss_func.SquareLoss() 22 | 23 | def _get_sampler(self, train_data): 24 | return None 25 | -------------------------------------------------------------------------------- /recstudio/model/mf/slim.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.mf.ease import EASE 2 | from recstudio.model.basemodel import Recommender 3 | from sklearn.linear_model import ElasticNet 4 | from sklearn.exceptions import ConvergenceWarning 5 | import scipy.sparse as sp 6 | import torch 7 | import warnings 8 | 9 | 10 | class SLIM(EASE): 11 | 12 | def add_model_specific_args(parent_parser): 13 | parent_parser = Recommender.add_model_specific_args(parent_parser) 14 | parent_parser.add_argument_group('SLIM') 15 | parent_parser.add_argument("--knn", type=int, default=100, help='k for K-nearest neighbor') 16 | parent_parser.add_argument("--alpha", type=float, default=1.0, help='alpha coef') 17 | parent_parser.add_argument("--l1_ratio", type=float, default=0.1, help='coef for L1 regularization') 18 | parent_parser.add_argument("--positive_only", action='store_true', default=True, help='positive only flag') 19 | return parent_parser 20 | 21 | def training_epoch(self, nepoch): 22 | train_config = self.config['train'] 23 | data, iscombine = self.current_epoch_trainloaders(nepoch) 24 | X = data['user_item_matrix'].tolil() 25 | model = ElasticNet( 26 | alpha=train_config.get('alpha', 1), 27 | l1_ratio=train_config.get('l1_ratio', 0.1), 28 | positive=train_config.get('positive_only', True), 29 | fit_intercept=False, 30 | copy_X=False, 31 | precompute=True, 32 | selection='random', 33 | max_iter=100, 34 | tol=1e-4 35 | ) 36 | item_coeffs = [] 37 | with warnings.catch_warnings(): 38 | warnings.simplefilter("ignore", category=ConvergenceWarning) 39 | for j in range(X.shape[1]): 40 | r = X[:, j] 41 | X[:, j] = 0 42 | model.fit(X, r.A) 43 | item_coeffs.append(model.sparse_coef_) 44 | X[:, j] = r 45 | B = sp.vstack(item_coeffs).T 46 | self.item_vector = B[:, 1:] 47 | self.query_encoder.user = X 48 | return torch.tensor(0.) 
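The loop above hides column `j` while regressing on it, so item `j` can never be used to predict itself. A standalone illustration of a single column fit on hypothetical toy data:

```python
# One SLIM column fit: regress item j's interactions on all other items.
import scipy.sparse as sp
from sklearn.linear_model import ElasticNet

X = sp.random(100, 30, density=0.1, format="lil", random_state=0)
model = ElasticNet(alpha=1.0, l1_ratio=0.1, positive=True,
                   fit_intercept=False, max_iter=100)

j = 5
r = X[:, j]
X[:, j] = 0                          # hide the target column from itself
model.fit(X, r.toarray().ravel())
w_j = model.sparse_coef_             # sparse similarity weights for item j
X[:, j] = r                          # restore the column for the next fit
```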
49 | -------------------------------------------------------------------------------- /recstudio/model/module/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.module.layers import * 2 | from recstudio.model.module.graphmodule import * 3 | -------------------------------------------------------------------------------- /recstudio/model/module/functional.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def seq_pooling_function(batch_seq_embeddings: torch.Tensor, seq_len: torch.Tensor, weight=None, mask_token=None, pooling_type='mean', keepdim=False): 5 | # batch_seq_embeddings: [B, L, D] or [B, Neg, L, D] 6 | # seq_len: [B] or [B,Neg], weight: [B,L] or [B,Neg,L] 7 | B = batch_seq_embeddings.size(0) 8 | _need_reshape = False 9 | if batch_seq_embeddings.dim() == 4: 10 | _need_reshape = True 11 | batch_seq_embeddings = batch_seq_embeddings.view( 12 | -1, *batch_seq_embeddings.shape[2:]) 13 | seq_len = seq_len.view(-1) 14 | if weight is not None: 15 | weight = weight.view(-1, weight.size(-1)) 16 | 17 | N, L, D = batch_seq_embeddings.shape 18 | 19 | if weight is not None: 20 | batch_seq_embeddings = weight.unsqueeze(-1) * batch_seq_embeddings 21 | 22 | if pooling_type == 'mask': 23 | # Data type of mask_token should be bool and 24 | # the shape of mask_token should be [B, L] 25 | assert mask_token is not None, "mask_token cannot be None when pooling_type is 'mask'." 26 | result = batch_seq_embeddings[mask_token] 27 | 28 | elif pooling_type in ['origin', 'concat', 'mean', 'sum', 'max']: 29 | mask = torch.arange(L).unsqueeze(0).unsqueeze(2).to(batch_seq_embeddings.device) 30 | mask = mask.expand(N, -1, D) 31 | seq_len = seq_len.unsqueeze(1).unsqueeze(2) 32 | seq_len_ = seq_len.expand(-1, mask.size(1), -1) 33 | mask = mask >= seq_len_ 34 | 35 | batch_seq_embeddings = batch_seq_embeddings.masked_fill(mask, 0.0) 36 | 37 | if pooling_type == 'origin': 38 | return batch_seq_embeddings 39 | elif pooling_type in ['concat', 'max']: 40 | if not keepdim: 41 | if pooling_type == 'concat': 42 | result = batch_seq_embeddings.reshape(N, -1) 43 | else: 44 | result = batch_seq_embeddings.max(dim=1).values 45 | else: 46 | if pooling_type == 'concat': 47 | result = batch_seq_embeddings.reshape(N, -1).unsqueeze(1) 48 | else: 49 | result = batch_seq_embeddings.max(dim=1).values.unsqueeze(1) 50 | elif pooling_type in ['mean', 'sum']: 51 | batch_seq_embeddings_sum = batch_seq_embeddings.sum(dim=1, keepdim=keepdim) 52 | if pooling_type == 'sum': 53 | result = batch_seq_embeddings_sum 54 | else: 55 | result = batch_seq_embeddings_sum / ((seq_len if keepdim else seq_len.squeeze(2)) + torch.finfo(torch.float32).eps) 56 | 57 | elif pooling_type == 'last': 58 | gather_index = (seq_len-1).view(-1, 1, 1).expand(-1, -1, D) # B x 1 x D 59 | output = batch_seq_embeddings.gather( 60 | dim=1, index=gather_index).squeeze(1) # B x D 61 | result = output if not keepdim else output.unsqueeze(1) 62 | 63 | if _need_reshape: 64 | return result.reshape(B, N//B, *result.shape[1:]) 65 | else: 66 | return result -------------------------------------------------------------------------------- /recstudio/model/multitask/__init__.py: -------------------------------------------------------------------------------- 1 | from .hardshare import HardShare 2 | from .mmoe import MMoE 3 | from .ple import PLE 4 | from .aitm import AITM --------------------------------------------------------------------------------
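As a quick sanity check of the masking logic in `seq_pooling_function` above: the `'mean'` branch is equivalent to the following plain-torch computation (illustrative):

```python
# Masked mean pooling: positions at or beyond seq_len contribute nothing.
import torch

B, L, D = 2, 5, 4
emb = torch.randn(B, L, D)
seq_len = torch.tensor([3, 5])

mask = torch.arange(L).unsqueeze(0) >= seq_len.unsqueeze(1)  # True = padding
emb = emb.masked_fill(mask.unsqueeze(-1), 0.0)
mean_pooled = emb.sum(dim=1) / seq_len.unsqueeze(-1)         # shape [B, D]
```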
/recstudio/model/multitask/config/aitm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | tower_mlp_layer: [128, 64] 3 | tower_activation: relu 4 | tower_dropout: 0.5 5 | tower_batch_norm: False 6 | 7 | alpha: 0.6 8 | -------------------------------------------------------------------------------- /recstudio/model/multitask/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | fmeval: True 3 | low_rating_thres: ~ 4 | binarized_rating_thres: ~ 5 | 6 | eval: 7 | val_metrics: [auc, logloss] 8 | test_metrics: [auc, logloss] 9 | 10 | train: 11 | weights: ~ -------------------------------------------------------------------------------- /recstudio/model/multitask/config/hardshare.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | top_mlp_layer: [128, 128] 3 | top_activation: relu 4 | top_dropout: 0.5 5 | top_batch_norm: False 6 | bottom_mlp_layer: [128, 128] 7 | bottom_activation: relu 8 | bottom_dropout: 0.5 9 | bottom_batch_norm: False 10 | -------------------------------------------------------------------------------- /recstudio/model/multitask/config/mmoe.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | num_experts: 2 3 | expert_mlp_layer: [128, 128] 4 | expert_activation: relu 5 | expert_dropout: 0.5 6 | expert_batch_norm: False 7 | 8 | gate_mlp_layer: [128, ] 9 | gate_activation: relu 10 | gate_dropout: 0.5 11 | gate_batch_norm: False 12 | 13 | tower_mlp_layer: [128, ] 14 | tower_activation: relu 15 | tower_dropout: 0.5 16 | tower_batch_norm: False 17 | -------------------------------------------------------------------------------- /recstudio/model/multitask/config/ple.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | num_levels: 1 3 | specific_experts_per_task: 2 4 | num_shared_experts: 2 5 | expert_mlp_layer: [128, 128] 6 | expert_activation: relu 7 | expert_dropout: 0.5 8 | 9 | gate_mlp_layer: [128, ] 10 | gate_activation: relu 11 | gate_dropout: 0.5 12 | 13 | tower_mlp_layer: [128, ] 14 | tower_activation: relu 15 | tower_dropout: 0.5 16 | tower_batch_norm: False 17 | -------------------------------------------------------------------------------- /recstudio/model/multitask/hardshare.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import defaultdict 4 | from recstudio.data.dataset import TripletDataset 5 | from ..basemodel import BaseRanker 6 | from ..loss_func import BCEWithLogitLoss 7 | from ..module import ctr, MLPModule 8 | 9 | r""" 10 | HardShare 11 | ###################### 12 | 13 | Paper Reference: 14 | An overview of multi-task learning in deep neural networks ('17) 15 | https://arxiv.org/abs/1706.05098 16 | """ 17 | 18 | class HardShare(BaseRanker): 19 | 20 | def _get_dataset_class(): 21 | return TripletDataset 22 | 23 | def _init_model(self, train_data, drop_unused_field=True): 24 | super()._init_model(train_data, drop_unused_field) 25 | model_config = self.config['model'] 26 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 27 | self.bottom_mlp = MLPModule( 28 | [self.embedding.num_features * self.embed_dim] + model_config['bottom_mlp_layer'], 29 | model_config['bottom_activation'], 30 | model_config['bottom_dropout'], 31 | batch_norm=model_config['bottom_batch_norm']) 32 | assert 
isinstance(self.frating, list), f'Expect rating_field to be a list, but got {self.frating}.' 33 | self.top_mlp = nn.ModuleDict({ 34 | r: MLPModule( 35 | [model_config['bottom_mlp_layer'][-1]] + model_config['top_mlp_layer'] + [1], 36 | model_config['top_activation'], 37 | model_config['top_dropout'], 38 | last_activation=False, 39 | batch_norm=model_config['top_batch_norm']) 40 | for r in self.frating 41 | }) 42 | 43 | def score(self, batch): 44 | emb = self.embedding(batch) 45 | shared_emb = self.bottom_mlp(emb.flatten(1)) 46 | score = defaultdict(dict) 47 | for r, top_mlp in self.top_mlp.items(): 48 | score[r]['score'] = top_mlp(shared_emb).squeeze(-1) 49 | return score 50 | 51 | def _get_loss_func(self): 52 | return BCEWithLogitLoss() 53 | 54 | def training_step(self, batch): 55 | y_h, _ = self.forward(batch) 56 | loss = {} 57 | for r in self.frating: 58 | loss[r] = self.loss_fn(**y_h[r]) 59 | 60 | weights = self.config['train'].get('weights', [1.0]*len(self.frating)) 61 | if weights is None: 62 | weights = [1.0]*len(self.frating) 63 | assert len(weights) == len(self.frating), \ 64 | f'Expect {len(self.frating)} float(s) for weights, but got {self.config["train"]["weights"]}.' 65 | weights = torch.tensor(weights, device=self.device).softmax(0) 66 | 67 | loss['loss'] = sum(w*v for w, (_, v) in zip(weights, loss.items())) 68 | return loss 69 | -------------------------------------------------------------------------------- /recstudio/model/multitask/mmoe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import defaultdict 4 | from recstudio.model.multitask.hardshare import HardShare 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | MMoE 9 | ###################### 10 | 11 | Paper Reference: 12 | Modeling Task Relationships in Multi-task Learning with Multi-gate Mixture-of-Experts (KDD'18) 13 | https://dl.acm.org/doi/10.1145/3219819.3220007 14 | """ 15 | 16 | class MMoE(HardShare): 17 | 18 | def _init_model(self, train_data, drop_unused_field=True): 19 | super()._init_model(train_data, drop_unused_field) 20 | model_config = self.config['model'] 21 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 22 | assert isinstance(self.frating, list), f'Expect rating_field to be a list, but got {self.frating}.' 
23 | self.experts = nn.ModuleList([ 24 | MLPModule( 25 | [self.embedding.num_features * self.embed_dim] + model_config['expert_mlp_layer'], 26 | model_config['expert_activation'], 27 | model_config['expert_dropout'], 28 | batch_norm=model_config['expert_batch_norm']) 29 | for _ in range(model_config['num_experts']) 30 | ]) 31 | self.gates = nn.ModuleDict({ 32 | r: MLPModule( 33 | [self.embedding.num_features * self.embed_dim] + model_config['gate_mlp_layer'] + [model_config['num_experts']], 34 | model_config['gate_activation'], 35 | model_config['gate_dropout'], 36 | batch_norm=model_config['gate_batch_norm']) 37 | for r in self.frating 38 | }) 39 | for _, g in self.gates.items(): 40 | g.add_modules(nn.Softmax(-1)) 41 | self.towers = nn.ModuleDict({ 42 | r: MLPModule( 43 | [model_config['expert_mlp_layer'][-1]] + model_config['tower_mlp_layer'] + [1], 44 | model_config['tower_activation'], 45 | model_config['tower_dropout'], 46 | batch_norm=model_config['tower_batch_norm'], 47 | last_activation=False, 48 | last_bn=False) 49 | for r in self.frating 50 | }) 51 | 52 | def score(self, batch): 53 | emb = self.embedding(batch).flatten(1) 54 | experts_out = torch.stack([e(emb) for e in self.experts], dim=1) # B x E x De 55 | score = defaultdict(dict) 56 | for r, gate in self.gates.items(): 57 | gate_out = gate(emb) # B x E 58 | mmoe_out = (gate_out.unsqueeze(-1) * experts_out).sum(1) # B x De 59 | score[r]['score'] = self.towers[r](mmoe_out).squeeze(-1) 60 | return score -------------------------------------------------------------------------------- /recstudio/model/multitask/ple.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from collections import defaultdict 4 | from recstudio.model.multitask.hardshare import HardShare 5 | from ..module import ctr, MLPModule 6 | 7 | r""" 8 | PLE 9 | ###################### 10 | 11 | Paper Reference: 12 | Progressive Layered Extraction (PLE): A Novel Multi-Task Learning (MTL) Model for Personalized Recommendations (RecSys'20) 13 | https://dl.acm.org/doi/10.1145/3383313.3412236 14 | """ 15 | 16 | class PLE(HardShare): 17 | 18 | def _init_model(self, train_data, drop_unused_field=True): 19 | super()._init_model(train_data, drop_unused_field) 20 | model_config = self.config['model'] 21 | self.embedding = ctr.Embeddings(self.fields, self.embed_dim, train_data) 22 | assert isinstance(self.frating, list), f'Expect rating_field to be a list, but got {self.frating}.' 
23 | self.extraction_layers = nn.Sequential(*[ 24 | ctr.ExtractionLayer( 25 | self.embedding.num_features * self.embed_dim if i == 0 else model_config['expert_mlp_layer'][-1], 26 | model_config['specific_experts_per_task'], 27 | len(self.frating), 28 | model_config['num_shared_experts'], 29 | True if i != model_config['num_levels'] - 1 else False, 30 | model_config['expert_mlp_layer'], 31 | model_config['expert_activation'], 32 | model_config['expert_dropout'], 33 | model_config['gate_mlp_layer'], 34 | model_config['gate_activation'], 35 | model_config['gate_dropout']) 36 | for i in range(model_config['num_levels']) 37 | ]) 38 | self.towers = nn.ModuleDict({ 39 | r: MLPModule( 40 | [model_config['expert_mlp_layer'][-1]] + model_config['tower_mlp_layer'] + [1], 41 | model_config['tower_activation'], 42 | model_config['tower_dropout'], 43 | batch_norm=model_config['tower_batch_norm'], 44 | last_activation=False, 45 | last_bn=False) 46 | for r in self.frating 47 | }) 48 | 49 | def score(self, batch): 50 | emb = self.embedding(batch).flatten(1) 51 | extraction_out = self.extraction_layers([emb] * (len(self.frating) + 1)) 52 | score = defaultdict(dict) 53 | for i, (r, tower) in enumerate(self.towers.items()): 54 | score[r]['score'] = tower(extraction_out[i]).squeeze(-1) 55 | return score -------------------------------------------------------------------------------- /recstudio/model/ranker.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.fm import * -------------------------------------------------------------------------------- /recstudio/model/retriever.py: -------------------------------------------------------------------------------- 1 | from recstudio.model.mf import * 2 | from recstudio.model.seq import * 3 | from recstudio.model.ae import * 4 | -------------------------------------------------------------------------------- /recstudio/model/seq/__init__.py: -------------------------------------------------------------------------------- 1 | from .caser import Caser 2 | from .din import DIN 3 | from .fpmc import FPMC 4 | from .gru4rec import GRU4Rec 5 | from .hgn import HGN 6 | from .narm import NARM 7 | from .npe import NPE 8 | from .sasrec import SASRec 9 | from .stamp import STAMP 10 | from .transrec import TransRec 11 | -------------------------------------------------------------------------------- /recstudio/model/seq/bert4rec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | from recstudio.model.module import functional as recfn 6 | from .sasrec import SASRecQueryEncoder 7 | 8 | 9 | class BERT4Rec(basemodel.BaseRetriever): 10 | 11 | def _init_model(self, train_data): 12 | super()._init_model(train_data) 13 | self.mask_token = train_data.num_items 14 | self.query_fields = self.query_fields | set(["mask_token"]) 15 | 16 | def _get_dataset_class(): 17 | return dataset.SeqDataset 18 | 19 | def _get_query_encoder(self, train_data): 20 | model_config = self.config['model'] 21 | return SASRecQueryEncoder( 22 | fiid=self.fiid, embed_dim=self.embed_dim, 23 | max_seq_len=train_data.config['max_seq_len'], n_head=model_config['head_num'], 24 | hidden_size=model_config['hidden_size'], dropout=model_config['dropout'], 25 | activation=model_config['activation'], layer_norm_eps=model_config['layer_norm_eps'], 26 | 
n_layer=model_config['layer_num'], 27 | training_pooling_type='mask', 28 | item_encoder=self.item_encoder, 29 | bidirectional=True, 30 | ) 31 | 32 | def _get_item_encoder(self, train_data): 33 | # id num_items is reserved for the mask token 34 | return torch.nn.Embedding(train_data.num_items+1, self.embed_dim, padding_idx=0) 35 | 36 | def _get_score_func(self): 37 | return scorer.InnerProductScorer() 38 | 39 | def _get_loss_func(self): 40 | r"""SoftmaxLoss is used as the loss function.""" 41 | return loss_func.SoftmaxLoss() 42 | 43 | def _get_sampler(self, train_data): 44 | return None 45 | 46 | def _reconstruct_train_data(self, batch): 47 | item_seq = batch['in_'+self.fiid] 48 | 49 | padding_mask = item_seq == 0 50 | rand_prob = torch.rand_like(item_seq, dtype=torch.float) 51 | rand_prob.masked_fill_(padding_mask, 1.0) 52 | masked_mask = rand_prob < self.config['train']['mask_ratio'] 53 | masked_token = item_seq[masked_mask] 54 | 55 | item_seq[masked_mask] = self.mask_token 56 | batch['in_'+self.fiid] = item_seq 57 | 58 | batch[self.fiid] = masked_token # N 59 | batch['mask_token'] = masked_mask 60 | return batch 61 | 62 | def training_step(self, batch): 63 | batch = self._reconstruct_train_data(batch) 64 | return super().training_step(batch) 65 | -------------------------------------------------------------------------------- /recstudio/model/seq/cl4srec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data import dataset 3 | from recstudio.model.module import data_augmentation 4 | from .sasrec import SASRec, SASRecQueryEncoder 5 | 6 | r""" 7 | CL4SRec 8 | ############# 9 | Contrastive Learning for Sequential Recommendation (SIGIR'21) 10 | Reference: 11 | https://arxiv.org/abs/2010.14395 12 | """ 13 | class CL4SRec(SASRec): 14 | r""" 15 | Model hyperparameters: 16 | - ``embed_dim(int)``: The dimension of embedding layers. Default: ``64``. 17 | - ``hidden_size(int)``: The output size of Transformer layer. Default: ``128``. 18 | - ``layer_num(int)``: The number of layers for the Transformer. Default: ``2``. 19 | - ``dropout_rate(float)``: The dropout probability for dropout layers after item embedding 20 | | and in Transformer layer. Default: ``0.5``. 21 | - ``head_num(int)``: The number of heads for MultiHeadAttention in Transformer. Default: ``2``. 22 | - ``activation(str)``: The activation function in transformer. Default: ``"gelu"``. 23 | - ``layer_norm_eps``: The layer norm epsilon in transformer. Default: ``1e-12``.
24 | """ 25 | 26 | def _init_model(self, train_data): 27 | super()._init_model(train_data) 28 | self.augmentation_model = data_augmentation.CL4SRecAugmentation(self.config['model'], train_data) 29 | 30 | def _get_dataset_class(): 31 | return dataset.SeqToSeqDataset 32 | 33 | def _get_item_encoder(self, train_data): 34 | return torch.nn.Embedding(train_data.num_items + 1, self.embed_dim, padding_idx=0) # the last item is mask 35 | 36 | def _get_query_encoder(self, train_data): 37 | model_config = self.config['model'] 38 | return SASRecQueryEncoder( 39 | fiid=self.fiid, embed_dim=self.embed_dim, 40 | max_seq_len=train_data.config['max_seq_len'], n_head=model_config['head_num'], 41 | hidden_size=model_config['hidden_size'], dropout=model_config['dropout_rate'], 42 | activation=model_config['activation'], layer_norm_eps=model_config['layer_norm_eps'], 43 | n_layer=model_config['layer_num'], 44 | training_pooling_type='origin', 45 | item_encoder=self.item_encoder 46 | ) 47 | 48 | def training_step(self, batch): 49 | output = self.forward(batch, False) 50 | cl_output = self.augmentation_model(batch, self.query_encoder) 51 | loss_value = self.loss_fn(batch[self.frating], **output['score']) + \ 52 | self.config['model']['cl_weight'] * cl_output['cl_loss'] 53 | return loss_value 54 | 55 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/all.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | test_rep: True 3 | train_rep: True 4 | split_ratio: 2 5 | 6 | eval: 7 | batch_size: 128 8 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/bert4rec.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | activation: 'gelu' 3 | dropout: 0.2 4 | embed_dim: 64 5 | head_num: 2 6 | hidden_size: 128 7 | layer_num: 2 8 | layer_norm_eps: 1e-12 9 | 10 | train: 11 | mask_ratio: 0.2 12 | negative_count: 1 13 | weight_decay: 1e-5 -------------------------------------------------------------------------------- /recstudio/model/seq/config/caser.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 64 3 | n_v: 8 4 | n_h: 16 5 | dropout: 0.4 6 | 7 | train: 8 | negative_count: 1 9 | weight_decay: 1e-5 -------------------------------------------------------------------------------- /recstudio/model/seq/config/cl4srec.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 128 3 | cutoff: [20, 50, 10, 5] 4 | 5 | model: 6 | # transformer 7 | activation: 'gelu' 8 | dropout_rate: 0.5 9 | hidden_size: 64 10 | head_num: 2 11 | layer_norm_eps: 1e-12 12 | layer_num: 1 13 | # contrastive 14 | temperature: 1.0 15 | augment_type: item_crop # item_crop, item_mask, item_reorder 16 | tau: 0.2 17 | cl_weight: 0.1 18 | 19 | train: 20 | batch_size: 256 21 | early_stop_patience: 40 22 | epochs: 1000 23 | init_method: normal 24 | negative_count: 1 25 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/coserec.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 128 3 | cutoff: [20, 50, 10, 5] 4 | 5 | 6 | model: 7 | # transformer 8 | hidden_size: 64 9 | layer_num: 1 10 | head_num: 2 11 | dropout_rate: 0.5 12 | activation: 'gelu' 13 | layer_norm_eps: 1e-12 14 | 15 | # contrastive 16 | temperature: 1.0 17 | 
insert_rate: 0.5 18 | substitute_rate: 0.05 19 | cl_weight: 0.1 20 | augment_threshold: 12 21 | augment_type_for_short: 'SIMRC' 22 | augmentation_warm_up_epochs: 5 23 | 24 | 25 | train: 26 | batch_size: 256 27 | epochs: 1000 28 | early_stop_patience: 40 29 | init_method: normal 30 | negative_count: 1 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/din.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 32 3 | test_metrics: [auc, logloss] 4 | val_metrics: [auc, logloss] 5 | 6 | model: 7 | activation: dice 8 | attention_mlp: [128, 64] 9 | batch_norm: True 10 | dropout: 0.3 11 | embed_dim: 128 12 | fc_mlp: [128, 64, 64] 13 | 14 | train: 15 | batch_size: 256 16 | negative_count: 20 17 | 18 | data: 19 | low_rating_thres: 0.0 20 | binarized_rating_thres: 3.0 21 | 22 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/fpmc.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 -------------------------------------------------------------------------------- /recstudio/model/seq/config/gru4rec.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | hidden_size: 128 3 | dropout_rate: 0.3 4 | layer_num: 1 5 | 6 | train: 7 | negative_count: 1 -------------------------------------------------------------------------------- /recstudio/model/seq/config/hgn.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | pooling_type: mean 3 | 4 | train: 5 | negative_count: 1 6 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/iclrec.yaml: -------------------------------------------------------------------------------- 1 | eval: 2 | batch_size: 128 3 | cutoff: [20, 50, 10, 5] 4 | 5 | 6 | model: 7 | # transformer 8 | hidden_size: 64 9 | layer_num: 1 10 | head_num: 2 11 | dropout_rate: 0.5 12 | activation: 'gelu' 13 | layer_norm_eps: 1e-5 14 | # contrastive learning 15 | temperature: 1.0 16 | augment_type: item_random # item_crop, item_mask, item_reorder, item_random 17 | cl_weight: 0.1 18 | intent_cl_weight: 0.1 19 | num_intent_clusters: 256 20 | intent_seq_representation_type: 'mean' 21 | instance_seq_representation_type: 'mean' 22 | 23 | train: 24 | batch_size: 256 25 | epochs: 1000 26 | early_stop_patience: 40 27 | init_method: normal 28 | negative_count: 1 29 | warm_up_epoches: 0 # number of epochs to start IntentCL. 
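Each per-model YAML above is deep-merged over the framework defaults (and over user overrides) before training; ``quickstart/run.py`` further down does this with ``deep_update`` from ``recstudio.utils``. A sketch of what such a recursive merge might look like — an illustration of the semantics, not the library's own code:

```python
from typing import Dict

def deep_update(base: Dict, override: Dict) -> Dict:
    # Merge `override` into `base`: nested dicts are merged key by key,
    # every other value is replaced wholesale.
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            base[key] = deep_update(base[key], value)
        else:
            base[key] = value
    return base

# e.g. raising only the contrastive weight while keeping the rest of the YAML:
# conf = deep_update(model_conf, {'model': {'cl_weight': 0.2}})
```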
30 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/narm.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | hidden_size: 128 3 | dropout_rate: [0.25, 0.5] 4 | layer_num: 1 -------------------------------------------------------------------------------- /recstudio/model/seq/config/npe.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | dropout_rate: 0.3 3 | 4 | train: 5 | negative_count: 1 6 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/sasrec.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | hidden_size: 128 3 | layer_num: 2 4 | head_num: 2 5 | dropout_rate: 0.5 6 | activation: 'gelu' 7 | layer_norm_eps: 1e-12 8 | 9 | train: 10 | negative_count: 1 11 | init_method: normal 12 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/stamp.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | embed_dim: 64 3 | -------------------------------------------------------------------------------- /recstudio/model/seq/config/transrec.yaml: -------------------------------------------------------------------------------- 1 | train: 2 | negative_count: 1 -------------------------------------------------------------------------------- /recstudio/model/seq/coserec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | from recstudio.model.module import data_augmentation 6 | from .sasrec import SASRecQueryEncoder 7 | 8 | 9 | r""" 10 | CoSeRec 11 | ############# 12 | Contrastive Self-supervised Sequential Recommendation with Robust Augmentation 13 | Reference: 14 | https://doi.org/10.48550/arXiv.2108.06479 15 | """ 16 | class CoSeRec(basemodel.BaseRetriever): 17 | r""" 18 | Model hyper parameters: 19 | - ``embed_dim(int)``: The dimension of embedding layers. Default: ``64``. 20 | - ``hidden_size(int)``: The output size of Transformer layer. Default: ``128``. 21 | - ``layer_num(int)``: The number of layers for the Transformer. Default: ``2``. 22 | - ``dropout_rate(float)``: The dropout probability for dropout layers after item embedding 23 | | and in Transformer layer. Default: ``0.5``. 24 | - ``head_num(int)``: The number of heads for MultiHeadAttention in Transformer. Default: ``2``. 25 | - ``activation(str)``: The activation function in transformer. Default: ``"gelu"``. 26 | - ``layer_norm_eps``: The layer norm epsilon in transformer. Default: ``1e-12``.
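CoSeRec's robust augmentation extends the three basic CL4SRec operators (``item_crop``, ``item_mask``, ``item_reorder`` — the ``augment_type`` options in the YAML configs above) with correlation-informed insertion and substitution (``insert_rate``, ``substitute_rate``). A minimal sketch of the three basic operators on a 1-D item-id tensor, for intuition only — the framework's versions live in ``recstudio.model.module.data_augmentation``:

```python
import torch

def item_crop(seq: torch.Tensor, eta: float = 0.6) -> torch.Tensor:
    # keep a random contiguous subsequence of roughly eta * len(seq) items
    n = max(1, int(seq.size(0) * eta))
    start = torch.randint(0, seq.size(0) - n + 1, (1,)).item()
    return seq[start:start + n]

def item_mask(seq: torch.Tensor, gamma: float = 0.3, mask_token: int = 0) -> torch.Tensor:
    # replace a random gamma-fraction of the items with a mask token
    out = seq.clone()
    out[torch.rand(seq.size(0)) < gamma] = mask_token
    return out

def item_reorder(seq: torch.Tensor, beta: float = 0.6) -> torch.Tensor:
    # shuffle a random contiguous segment of roughly beta * len(seq) items
    out = seq.clone()
    n = max(1, int(seq.size(0) * beta))
    start = torch.randint(0, seq.size(0) - n + 1, (1,)).item()
    out[start:start + n] = seq[torch.randperm(n) + start]
    return out
```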
27 | """ 28 | 29 | def _init_model(self, train_data:dataset.SeqToSeqDataset): 30 | super()._init_model(train_data) 31 | self.num_items = train_data.num_items 32 | self.augmentation_model = data_augmentation.CoSeRecAugmentation(self.config['model'], train_data) 33 | 34 | def _get_dataset_class(): 35 | return dataset.SeqToSeqDataset 36 | 37 | def _get_item_encoder(self, train_data): 38 | return torch.nn.Embedding(train_data.num_items + 1, self.embed_dim, padding_idx=0) # the last is masking 39 | 40 | def _get_query_encoder(self, train_data): 41 | model_config = self.config['model'] 42 | return SASRecQueryEncoder( 43 | fiid=self.fiid, embed_dim=self.embed_dim, 44 | max_seq_len=train_data.config['max_seq_len'], n_head=model_config['head_num'], 45 | hidden_size=model_config['hidden_size'], dropout=model_config['dropout_rate'], 46 | activation=model_config['activation'], layer_norm_eps=model_config['layer_norm_eps'], 47 | n_layer=model_config['layer_num'], 48 | training_pooling_type='origin', 49 | item_encoder=self.item_encoder 50 | ) 51 | 52 | def _get_score_func(self): 53 | return scorer.InnerProductScorer() 54 | 55 | def _get_loss_func(self): 56 | return loss_func.BinaryCrossEntropyLoss() 57 | 58 | def _get_sampler(self, train_data): 59 | return sampler.UniformSampler(train_data.num_items) 60 | 61 | def training_step(self, batch): 62 | output = self.forward(batch, isinstance(self.loss_fn, loss_func.FullScoreLoss)) 63 | cl_output = self.augmentation_model(batch, self.query_encoder) 64 | loss_value = self.loss_fn(batch[self.frating], **output['score']) + \ 65 | self.config['model']['cl_weight'] * cl_output['cl_loss'] 66 | return loss_value 67 | 68 | def training_epoch(self, nepoch): 69 | if nepoch + 1 >= self.config['model']['augmentation_warm_up_epochs'] + 1: 70 | self.augmentation_model.update_online_model(nepoch, self.item_encoder) 71 | return super().training_epoch(nepoch) -------------------------------------------------------------------------------- /recstudio/model/seq/fpmc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, module, scorer 5 | 6 | r""" 7 | FPMC 8 | ######### 9 | 10 | Paper Reference: 11 | Steffen Rendle, et al. "Factorizing personalized Markov chains for next-basket recommendation" in WWW2010. 12 | https://dl.acm.org/doi/10.1145/1772690.1772773 13 | """ 14 | 15 | 16 | class FPMC(basemodel.BaseRetriever): 17 | r""" 18 | | FPMC is based on personalized transition graphs over underlying Markov chains. It 19 | factorizes the transition cube with a pairwise interaction model which is a special case of 20 | the Tucker Decomposition. 
21 | """ 22 | 23 | def _get_dataset_class(): 24 | r"""The dataset FPMC used is SeqDataset.""" 25 | return dataset.SeqDataset 26 | 27 | def _get_item_encoder(self, train_data): 28 | return torch.nn.Embedding(train_data.num_items, 2*self.embed_dim, padding_idx=0) 29 | 30 | def _get_query_encoder(self, train_data): 31 | return module.VStackLayer( 32 | module.HStackLayer( 33 | module.VStackLayer( 34 | module.LambdaLayer(lambda x: x[self.fuid]), 35 | torch.nn.Embedding(train_data.num_users, self.embed_dim, padding_idx=0), 36 | ), 37 | module.VStackLayer( 38 | module.HStackLayer( 39 | module.VStackLayer( 40 | module.LambdaLayer(lambda x: x['in_'+self.fiid]), 41 | torch.nn.Embedding(train_data.num_items, self.embed_dim, padding_idx=0), 42 | ), 43 | module.LambdaLayer(lambda x: x['seqlen']) 44 | ), 45 | module.SeqPoolingLayer(pooling_type='last'), 46 | ) 47 | ), 48 | module.LambdaLayer(lambda x: torch.cat(x, dim=-1)) 49 | ) 50 | 51 | def _get_score_func(self): 52 | r"""Inner Product is used as the score function.""" 53 | return scorer.InnerProductScorer() 54 | 55 | def _get_loss_func(self): 56 | r"""The loss function is BPR loss.""" 57 | return loss_func.BPRLoss() 58 | 59 | def _get_sampler(self, train_data): 60 | return sampler.UniformSampler(train_data.num_items) 61 | -------------------------------------------------------------------------------- /recstudio/model/seq/gru4rec.py: -------------------------------------------------------------------------------- 1 | from operator import mod 2 | import torch 3 | from recstudio.ann import sampler 4 | from recstudio.data import dataset 5 | from recstudio.model import basemodel, loss_func, module, scorer 6 | 7 | r""" 8 | GRU4Rec 9 | ############ 10 | 11 | Paper Reference: 12 | Balazs Hidasi, et al. "Session-Based Recommendations with Recurrent Neural Networks" in ICLR2016. 13 | https://arxiv.org/abs/1511.06939 14 | """ 15 | 16 | 17 | class GRU4Rec(basemodel.BaseRetriever): 18 | r""" 19 | GRU4Rec apply RNN in Recommendation System, where sequential behavior of user is regarded as input 20 | of the RNN. 
21 | """ 22 | 23 | # def add_model_specific_args(parent_parser): 24 | # parent_parser = basemodel.Recommender.add_model_specific_args(parent_parser) 25 | # parent_parser.add_argument_group('GRU4Rec') 26 | # parent_parser.add_argument("--hidden_size", type=int, default=128, help='hidden size of feedforward') 27 | # parent_parser.add_argument("--layer_num", type=int, default=1, help='layer num of transformers') 28 | # parent_parser.add_argument("--dropout_rate", type=float, default=0.2, help='dropout rate') 29 | # parent_parser.add_argument("--negative_count", type=int, default=1, help='negative sampling numbers') 30 | # return parent_parser 31 | 32 | def _get_dataset_class(): 33 | r"""The dataset is SeqDataset.""" 34 | return dataset.SeqDataset 35 | 36 | def _get_query_encoder(self, train_data): 37 | model_config = self.config['model'] 38 | return ( 39 | module.VStackLayer( 40 | module.HStackLayer( 41 | torch.nn.Sequential( 42 | module.LambdaLayer(lambda x: x['in_'+self.fiid]), 43 | self.item_encoder, 44 | torch.nn.Dropout(model_config['dropout_rate']), 45 | module.GRULayer(self.embed_dim, model_config['hidden_size'], model_config['layer_num']), 46 | ), 47 | module.LambdaLayer(lambda_func=lambda x: x['seqlen']), 48 | ), 49 | module.SeqPoolingLayer(pooling_type='last'), 50 | torch.nn.Linear(model_config['hidden_size'], self.embed_dim) 51 | ) 52 | ) 53 | 54 | def _get_score_func(self): 55 | return scorer.InnerProductScorer() 56 | 57 | def _get_loss_func(self): 58 | r"""SoftmaxLoss is used as the loss function.""" 59 | return loss_func.BPRLoss() 60 | 61 | def _get_sampler(self, train_data): 62 | return sampler.UniformSampler(train_data.num_items) 63 | -------------------------------------------------------------------------------- /recstudio/model/seq/hgn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | 6 | r""" 7 | HGN 8 | ######## 9 | 10 | Paper Reference: 11 | Chen ma, et al. "HGN: Hierarchical Gating Networks for Sequential Recommendation" in KDD2019. 
12 | https://dl.acm.org/doi/abs/10.1145/3292500.3330984 13 | """ 14 | 15 | 16 | class HGNQueryEncoder(torch.nn.Module): 17 | 18 | def __init__(self, fuid, fiid, num_users, embed_dim, max_seq_len, item_encoder, pooling_type='mean') -> None: 19 | super().__init__() 20 | self.fuid = fuid 21 | self.fiid = fiid 22 | self.item_encoder = item_encoder 23 | self.pooling_type = pooling_type 24 | self.user_embedding = torch.nn.Embedding(num_users, embed_dim, 0) 25 | self.W_g_1 = torch.nn.Linear(embed_dim, embed_dim, bias=False) 26 | self.W_g_2 = torch.nn.Linear(embed_dim, embed_dim, bias=False) 27 | self.b_g = torch.nn.Parameter(torch.empty(embed_dim), requires_grad=True) 28 | self.w_g_3 = torch.nn.Linear(embed_dim, 1, bias=False) 29 | self.W_g_4 = torch.nn.Linear(embed_dim, max_seq_len) 30 | 31 | def forward(self, batch): 32 | U = self.user_embedding(batch[self.fuid]) 33 | S = self.item_encoder(batch['in_'+self.fiid]) 34 | S_F = S * torch.sigmoid(self.W_g_1(S) + self.W_g_2(U).view(U.size(0), 1, -1) + self.b_g) 35 | weight = torch.sigmoid(self.w_g_3(S_F) + (U@self.W_g_4.weight[:S.size(1)].T).view(U.size(0), -1, 1)) # BxLx1 36 | S_I = S_F * weight 37 | if self.pooling_type == 'mean': 38 | s = S_I.sum(1) / weight.sum(1) 39 | elif self.pooling_type == 'max': 40 | s = torch.max(S_I, dim=1).values 41 | else: 42 | raise ValueError("`pooling_type` only supports `mean` and `max`") 43 | query = U + s + S.sum(1) 44 | return query 45 | 46 | 47 | class HGN(basemodel.BaseRetriever): 48 | r"""HGN proposes a hierarchical gating network, integrated with the Bayesian Personalized Ranking 49 | (BPR) to capture both the long-term and short-term user interests. HGN consists of a feature 50 | gating module, an instance gating module, and an item-item product module.""" 51 | 52 | def _get_dataset_class(): 53 | r"""The dataset is SeqDataset.""" 54 | return dataset.SeqDataset 55 | 56 | def _get_query_encoder(self, train_data): 57 | return HGNQueryEncoder(self.fuid, self.fiid, train_data.num_users, self.embed_dim, train_data.config['max_seq_len'], 58 | self.item_encoder, self.config['model']['pooling_type']) 59 | 60 | def _get_score_func(self): 61 | return scorer.InnerProductScorer() 62 | 63 | def _get_loss_func(self): 64 | r"""BPR loss is used.""" 65 | return loss_func.BPRLoss() 66 | 67 | def _get_sampler(self, train_data): 68 | return sampler.UniformSampler(train_data.num_items) 69 | -------------------------------------------------------------------------------- /recstudio/model/seq/iclrec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data import dataset 3 | from recstudio.model.module import data_augmentation 4 | from recstudio.model.seq.sasrec import SASRec, SASRecQueryEncoder 5 | 6 | r""" 7 | ICLRec 8 | ############# 9 | Intent Contrastive Learning for Sequential Recommendation (WWW'22) 10 | Reference: 11 | https://doi.org/10.1145/3485447.3512090 12 | """ 13 | class ICLRec(SASRec): 14 | 15 | def _init_model(self, train_data): 16 | super()._init_model(train_data) 17 | self.augmentation_model = data_augmentation.ICLRecAugmentation(self.config['model'], train_data) 18 | 19 | def _get_dataset_class(): 20 | return dataset.SeqToSeqDataset 21 | 22 | def _get_train_loaders(self, train_data:dataset.SeqToSeqDataset, ddp=False): 23 | rec_train_loader = train_data.train_loader(batch_size = self.config['train']['batch_size'], 24 | shuffle = True, ddp=ddp) 25 | kmeans_train_loader = train_data.train_loader(batch_size = self.config['train']['batch_size'], 26 |
shuffle = False, ddp=ddp) 27 | return [rec_train_loader, kmeans_train_loader] 28 | 29 | def current_epoch_trainloaders(self, nepoch): 30 | return self.trainloaders[0], False 31 | 32 | def _get_item_encoder(self, train_data): 33 | return torch.nn.Embedding(train_data.num_items + 1, self.embed_dim, padding_idx=0) # the last id is reserved for the mask token 34 | 35 | def _get_query_encoder(self, train_data): 36 | model_config = self.config['model'] 37 | return SASRecQueryEncoder( 38 | fiid=self.fiid, embed_dim=self.embed_dim, 39 | max_seq_len=train_data.config['max_seq_len'], n_head=model_config['head_num'], 40 | hidden_size=model_config['hidden_size'], dropout=model_config['dropout_rate'], 41 | activation=model_config['activation'], layer_norm_eps=model_config['layer_norm_eps'], 42 | n_layer=model_config['layer_num'], 43 | training_pooling_type='origin', 44 | item_encoder=self.item_encoder 45 | ) 46 | 47 | def training_step(self, batch): 48 | output = self.forward(batch, False, return_query=True) 49 | cl_output = self.augmentation_model(batch, output['query'], self.query_encoder) 50 | loss_value = self.loss_fn(batch[self.frating], **output['score']) \ 51 | + self.config['model']['cl_weight'] * cl_output['instance_cl_loss'] \ 52 | + self.config['model']['intent_cl_weight'] * cl_output['intent_cl_loss'] 53 | return loss_value 54 | 55 | def training_epoch(self, nepoch): 56 | self.augmentation_model.train_kmeans(self.query_encoder, self.trainloaders[1], \ 57 | self._parameter_device) 58 | return super().training_epoch(nepoch) 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /recstudio/model/seq/npe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, module, scorer 5 | 6 | r""" 7 | NPE 8 | ####################### 9 | 10 | Paper Reference: 11 | ThaiBinh Nguyen, et al. "NPE: Neural Personalized Embedding for Collaborative Filtering" in IJCAI2018. 12 | https://www.ijcai.org/proceedings/2018/0219.pdf 13 | """ 14 | 15 | 16 | class NPE(basemodel.BaseRetriever): 17 | r""" 18 | NPE models a user’s click on an item with two terms: the personal preference of the user for the item, 19 | and the relationships between this item and other items clicked by the user.
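The two terms map one-to-one onto the query encoder below: a personal term ``dropout(relu(user_emb))`` and a neighborhood term ``dropout(relu(sum of clicked-item embeddings))``, summed into the query. A compact functional sketch of that composition (illustrative only):

```python
import torch
import torch.nn.functional as F

def npe_query(user_emb, hist_item_embs, p=0.3, training=True):
    # user_emb: B x D; hist_item_embs: B x L x D (padding positions embed to zero)
    personal = F.dropout(F.relu(user_emb), p=p, training=training)
    neighborhood = F.dropout(F.relu(hist_item_embs.sum(dim=1)), p=p, training=training)
    return personal + neighborhood   # B x D, scored against ReLU'd item vectors
```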
20 | """ 21 | 22 | def _get_dataset_class(): 23 | r"""SeqDataset is used for NPE.""" 24 | return dataset.SeqDataset 25 | 26 | def _get_query_encoder(self, train_data): 27 | dropout_rate = self.config['model']['dropout_rate'] 28 | return torch.nn.Sequential( 29 | module.HStackLayer( 30 | torch.nn.Sequential( 31 | module.LambdaLayer(lambda x: x['in_'+self.fiid]), 32 | self.item_encoder[0], 33 | module.LambdaLayer(lambda x: torch.sum(x, dim=1)), 34 | torch.nn.ReLU(), 35 | torch.nn.Dropout(p=dropout_rate) 36 | ), 37 | torch.nn.Sequential( 38 | module.LambdaLayer(lambda x: x[self.fuid]), 39 | torch.nn.Embedding(train_data.num_users, self.embed_dim, 0), 40 | torch.nn.ReLU(), 41 | torch.nn.Dropout(p=dropout_rate) 42 | ) 43 | ), 44 | module.LambdaLayer(lambda x: x[0]+x[1]) 45 | ) 46 | 47 | def _get_item_encoder(self, train_data): 48 | r"""NPE combine an Embedding layer with a ReLU layer as item encoder.""" 49 | return torch.nn.Sequential( 50 | super()._get_item_encoder(train_data), 51 | torch.nn.ReLU()) 52 | 53 | def _get_item_vector(self): 54 | """Get all item vectors, simply apply ReLU operation on the weight of Embedding layer.""" 55 | return self.item_encoder[1](self.item_encoder[0].weight[1:]) 56 | 57 | def _get_score_func(self): 58 | r"""Innerproduct operation is applied to calculate scores between query and item.""" 59 | return scorer.InnerProductScorer() 60 | 61 | def _get_loss_func(self): 62 | r"""According to the original paper, BCE loss is applied. 63 | Also, other loss functions like softmax loss and BPR loss can be used too. 64 | """ 65 | return loss_func.BinaryCrossEntropyLoss() 66 | 67 | def _get_sampler(self, train_data): 68 | return sampler.UniformSampler(train_data.num_items) 69 | -------------------------------------------------------------------------------- /recstudio/model/seq/stamp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.data import dataset 3 | from recstudio.model import basemodel, loss_func, module, scorer 4 | 5 | 6 | class STAMPQueryEncoder(torch.nn.Module): 7 | 8 | def __init__(self, fiid, embed_dim, item_encoder) -> None: 9 | super().__init__() 10 | self.fiid = fiid 11 | self.item_encoder = item_encoder 12 | self.gather_layer = module.SeqPoolingLayer(pooling_type='last') 13 | self.attention_layer = module.AttentionLayer( 14 | q_dim=2 * embed_dim, 15 | k_dim=embed_dim, 16 | mlp_layers=[embed_dim], 17 | ) 18 | self.mlpA = module.MLPModule([embed_dim, embed_dim], torch.nn.Tanh()) 19 | self.mlpB = module.MLPModule([embed_dim, embed_dim], torch.nn.Tanh()) 20 | 21 | def forward(self, batch): 22 | user_hist = batch['in_'+self.fiid] 23 | seq_emb = self.item_encoder(user_hist) 24 | m_t = self.gather_layer(seq_emb, batch['seqlen']) 25 | m_s = seq_emb.sum(dim=1) / batch['seqlen'].unsqueeze(1).float() # B x D 26 | 27 | query = torch.cat((m_t, m_s), dim=1) # Bx2D 28 | m_a = self.attention_layer(query.unsqueeze(1), seq_emb, seq_emb, 29 | key_padding_mask=(user_hist == 0)).squeeze(1) 30 | h_s = self.mlpA(m_a) 31 | h_t = self.mlpB(m_t) 32 | return h_s * h_t 33 | 34 | 35 | class STAMP(basemodel.BaseRetriever): 36 | r""" 37 | STAMP is capable of capturing users’ general interests from the long-term memory of a session 38 | context, while taking into account users’ current interests from the short-term memory of the 39 | last-clicks. 40 | 41 | Model hyper parameters: 42 | - ``embed_dim(int)``: The dimension of embedding layers. Default: ``64``. 
43 | """ 44 | 45 | def _get_dataset_class(): 46 | r"""SeqDataset is used for STAMP.""" 47 | return dataset.SeqDataset 48 | 49 | def _get_query_encoder(self, train_data): 50 | return STAMPQueryEncoder(self.fiid, self.embed_dim, self.item_encoder) 51 | 52 | def _get_score_func(self): 53 | r"""InnerProduct is used as the score function.""" 54 | return scorer.InnerProductScorer() 55 | 56 | def _get_loss_func(self): 57 | r"""SoftmaxLoss is used as the loss function.""" 58 | return loss_func.SoftmaxLoss() 59 | 60 | def _get_sampler(self, train_data): 61 | return None 62 | -------------------------------------------------------------------------------- /recstudio/model/seq/transrec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from recstudio.ann import sampler 3 | from recstudio.data import dataset 4 | from recstudio.model import basemodel, loss_func, scorer 5 | 6 | 7 | class TransRecQueryEncoder(torch.nn.Module): 8 | def __init__(self, fuid, fiid, num_users, embed_dim, item_encoder): 9 | super().__init__() 10 | self.fuid = fuid 11 | self.fiid = fiid 12 | self.item_encoder = item_encoder 13 | self.user_embedding = torch.nn.Embedding(num_users, embed_dim, 0) 14 | self.global_user_emb = torch.nn.Parameter(torch.zeros(embed_dim)) 15 | 16 | def forward(self, batch): 17 | user_hist = batch['in_'+self.fiid] 18 | seq_len = batch['seqlen'] - 1 19 | local_user_emb = self.user_embedding(batch[self.fuid]) 20 | user_emb = local_user_emb + self.global_user_emb.expand_as(local_user_emb) # B x D 21 | last_item_id = torch.gather(user_hist, dim=-1, index=seq_len.unsqueeze(1)) 22 | last_item_emb = self.item_encoder(last_item_id).squeeze(1) # B x D 23 | query = user_emb + last_item_emb 24 | return query 25 | 26 | 27 | class TransRec(basemodel.BaseRetriever): 28 | r""" 29 | TransRec embeds items into a ‘transition space’ where users are modeled as translation vectors operating on item sequences. 30 | 31 | Model hyper parameters: 32 | - ``embed_dim(int)``: The dimension of embedding layers. Default: ``64``. 
33 | """ 34 | 35 | # TODO(@AngusHuang17): bias here is not easy to construct query, abandoned now 36 | 37 | def _get_dataset_class(): 38 | r"""SeqDataset is used for TransRec.""" 39 | return dataset.SeqDataset 40 | 41 | def _get_item_encoder(self, train_data): 42 | return torch.nn.Embedding(train_data.num_items, self.embed_dim, 0) 43 | 44 | def _get_query_encoder(self, train_data): 45 | return TransRecQueryEncoder( 46 | self.fuid, self.fiid, train_data.num_users, self.embed_dim, self.item_encoder 47 | ) 48 | 49 | def _get_sampler(self, train_data): 50 | return sampler.UniformSampler(train_data.num_items, self.score_func) 51 | 52 | def _get_scorer(self): 53 | r"""InnerProduct is used as the score function.""" 54 | return scorer.EuclideanScorer() 55 | 56 | def _get_loss_func(self): 57 | r"""BPRLoss is used as the loss function.""" 58 | return loss_func.BPRLoss() 59 | -------------------------------------------------------------------------------- /recstudio/quickstart/__init__.py: -------------------------------------------------------------------------------- 1 | from .run import run 2 | from .config_dataset import generate_dataset_config -------------------------------------------------------------------------------- /recstudio/quickstart/config_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | 5 | def generate_dataset_config(name: str, dir: str, interaction_file: str, user_id: str, 6 | item_id: str, rating:str, timestamp: str, sep='\t', user_file: str=None, item_file: str=None): 7 | config_file_name = f"{name}.yaml" 8 | config_path = os.path.join(dir, name) 9 | config_dict = { 10 | 'url': dir, 11 | 'user_id_field': f"&u {user_id}:token", 12 | 'item_id_field': f"&i {item_id}:token", 13 | 'rating_field': f"&r {rating}:float", 14 | 'time_field': f"&t {timestamp}:float", 15 | 'inter_feat_name': f"{interaction_file}", 16 | 'user_feat_name': f"{user_file}" if user_file else "~", 17 | 'item_feat_name': f"{item_file}" if item_file else "~", 18 | 19 | } 20 | raise NotImplementedError("Sorry, not supported now, we will implement the function soon.") -------------------------------------------------------------------------------- /recstudio/quickstart/run.py: -------------------------------------------------------------------------------- 1 | import os, datetime, torch 2 | from typing import * 3 | from recstudio.utils import * 4 | from recstudio import LOG_DIR 5 | 6 | def run(model: str, dataset: str, model_config: Dict=None, data_config: Dict=None, model_config_path: str=None, data_config_path: str=None, verbose=True, run_mode='light', **kwargs): 7 | model_class, model_conf = get_model(model) 8 | 9 | if model_config_path is not None: 10 | if isinstance(model_config_path, str): 11 | model_conf = deep_update(model_conf, parser_yaml(model_config_path)) 12 | else: 13 | raise TypeError(f"expecting `model_config_path` to be str, while get {type(model_config_path)} instead.") 14 | 15 | if model_config is not None: 16 | if isinstance(model_config, Dict): 17 | model_conf = deep_update(model_conf, model_config) 18 | else: 19 | raise TypeError(f"expecting `model_config` to be Dict, while get {type(model_config)} instead.") 20 | 21 | if kwargs is not None: 22 | model_conf = deep_update(model_conf, kwargs) 23 | 24 | log_path = f"{model}/{dataset}/{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')}.log" 25 | logger = get_logger(log_path) 26 | torch.set_num_threads(model_conf['train']['num_threads']) 27 | 28 | if not 
verbose: 29 | import logging 30 | logger.setLevel(logging.ERROR) 31 | 32 | logger.info("Log saved in {}.".format(os.path.abspath(os.path.join(LOG_DIR, log_path)))) 33 | if run_mode == 'tune': 34 | model_conf = update_config_with_nni(model_conf) 35 | model = model_class(model_conf) 36 | dataset_class = model_class._get_dataset_class() 37 | 38 | data_conf = {} 39 | if data_config_path is not None: 40 | if isinstance(data_config_path, str): 41 | # load dataset config from file 42 | conf = parser_yaml(data_config_path) 43 | data_conf.update(conf) 44 | else: 45 | raise TypeError(f"expected `data_config_path` to be str, but got {type(data_config_path)} instead.") 46 | 47 | if data_config is not None: 48 | if isinstance(data_config, dict): 49 | # update config with given dict 50 | data_conf.update(data_config) 51 | else: 52 | raise TypeError(f"expected `data_config` to be Dict, but got {type(data_config)} instead.") 53 | 54 | data_conf.update(model_conf['data']) # update model-specified config 55 | 56 | datasets = dataset_class(name=dataset, config=data_conf).build(**model_conf['data']) 57 | logger.info(f"{datasets[0]}") 58 | logger.info(f"\n{set_color('Model Config', 'green')}: \n\n" + color_dict_normal(model_conf, False)) 59 | val_result = model.fit(*datasets[:2], run_mode=run_mode) 60 | test_result = model.evaluate(datasets[-1]) 61 | return (model, datasets), (val_result, test_result) 62 | -------------------------------------------------------------------------------- /recstudio/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from recstudio.utils.utils import * 2 | from recstudio.utils.data_parallel import * 3 | from recstudio.utils.arguments import get_default_parser, add_model_arguments, parser2nested_dict -------------------------------------------------------------------------------- /recstudio/utils/compress_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import zipfile 4 | import gzip 5 | 6 | 7 | class CompressedFile(object): 8 | magic = None 9 | file_type = None 10 | mime_type = None 11 | 12 | def __init__(self, fname, save_dir): 13 | self.extract_all(fname, save_dir) 14 | 15 | @classmethod 16 | def is_magic(cls, data): 17 | return data.startswith(cls.magic) 18 | 19 | def extract_all(self, fname, save_dir): 20 | pass 21 | 22 | 23 | class ZIPFile(CompressedFile): 24 | magic = b'\x50\x4b\x03\x04' 25 | file_type = 'zip' 26 | mime_type = 'compressed/zip' 27 | 28 | def extract_all(self, fname, save_dir): 29 | with zipfile.ZipFile(fname) as f: 30 | for member in f.namelist(): 31 | filename = os.path.basename(member) 32 | # skip directories 33 | if not filename: 34 | continue 35 | 36 | source = f.open(member) 37 | target = open(os.path.join(save_dir, filename), "wb") 38 | with source, target: 39 | shutil.copyfileobj(source, target) 40 | 41 | 42 | class GZFile(CompressedFile): 43 | magic = b'\x1f\x8b\x08' 44 | file_type = 'gz' 45 | mime_type = 'compressed/gz' 46 | 47 | def extract_all(self, fname, save_dir): 48 | decompressed_fname = os.path.basename(fname)[:-3] 49 | with gzip.open(fname, 'rb') as f_in: 50 | with open(os.path.join(save_dir, decompressed_fname), 'wb') as f_out: 51 | shutil.copyfileobj(f_in, f_out) 52 | 53 | 54 | def extract_compressed_file(filename, save_dir): 55 | with open(filename, 'rb') as f: 56 | start_of_file = f.read(1024) 57 | 58 | f.seek(0) 59 | if filename.endswith('csv'): 60 | pass 61 | else: 62 | for cls in (ZIPFile,
GZFile): 63 | if cls.is_magic(start_of_file): 64 | cls(filename, save_dir) 65 | break 66 | os.remove(filename) 67 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from recstudio.utils import * 2 | from recstudio import quickstart 3 | 4 | 5 | if __name__ == '__main__': 6 | parser = get_default_parser() 7 | args, command_line_args = parser.parse_known_args() 8 | parser = add_model_arguments(parser, args.model) 9 | command_line_conf = parser2nested_dict(parser, command_line_args) 10 | 11 | model_class, model_conf = get_model(args.model) 12 | model_conf = deep_update(model_conf, command_line_conf) 13 | 14 | quickstart.run(args.model, args.dataset, model_config=model_conf, data_config_path=args.data_config_path, run_mode=args.mode) 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | import os 6 | 7 | from setuptools import setup, find_packages 8 | 9 | install_requires = ['numpy>=1.20.1', 'torch>=1.9.0', 'scipy>=1.6.0', 'pandas>=1.3.0', 'tqdm>=4.48.2', 10 | 'colorlog==4.7.2', 'colorama==0.4.4', 'pyyaml>=5.1.0', 'tensorboard>=2.5.0', 11 | 'faiss-gpu==1.7.2', 'torchmetrics==0.7.3'] 12 | 13 | setup_requires = [] 14 | 15 | extras_require = {} 16 | 17 | classifiers = ['License :: OSI Approved :: MIT License', 18 | 19 | 'Programming Language :: Python', 20 | 'Programming Language :: Python :: 3', 21 | 'Programming Language :: Python :: 3.8'] 22 | 23 | long_description = 'RecStudio is a modular, efficient, unified, and comprehensive recommendation library based on PyTorch. '\ 24 | 'We divide all the models into 3 basic classes according to the number of towers: TowerFree, ItemTower, TwoTower, '\ 25 | 'and cover models in 4 tasks: General Recommendation, Sequential Recommendation, Knowledge-based Recommendation, Social-Network-based Recommendation. '\ 26 | 'View github page: https://github.com/ustcml/RecStudio' 27 | 28 | # Readthedocs requires Sphinx extensions to be specified as part of 29 | # install_requires in order to build properly.
30 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 31 | if on_rtd: 32 | install_requires.extend(setup_requires) 33 | 34 | setup( 35 | name='recstudio', 36 | version= 37 | '0.0.2a1', # please remember to edit recstudio/__init__.py accordingly, once updating the version 38 | description='A modular, efficient, unified, and comprehensive recommendation library based on PyTorch.', 39 | long_description=long_description, 40 | long_description_content_type="text/markdown", 41 | url='https://github.com/ustcml/RecStudio', 42 | author='USTCML', 43 | author_email='liandefu@ustc.edu.cn', 44 | packages=[ 45 | package for package in find_packages() 46 | if package.startswith('recstudio') 47 | ], 48 | include_package_data=True, 49 | install_requires=install_requires, 50 | setup_requires=setup_requires, 51 | extras_require=extras_require, 52 | zip_safe=False, 53 | classifiers=classifiers, 54 | ) -------------------------------------------------------------------------------- /test/test_config_dataset.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.path.append(".") 3 | 4 | from recstudio.quickstart import generate_dataset_config 5 | 6 | generate_dataset_config(name='mydataset', dir='dataset_dir/', 7 | interaction_file='inter.csv', user_id='user_id', item_id='item_id', 8 | rating='rating', timestamp='timestamp', sep='\t') -------------------------------------------------------------------------------- /test/test_dataset.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.path.append(".") 3 | # sys.path.append(os.path.join(__file__, '../')) 4 | # sys.path.insert(0, os.path.join(__file__, '../')) 5 | 6 | from recstudio.data.dataset import TripletDataset 7 | 8 | data = TripletDataset(name='ml-100k') 9 | trn, val, tst = data.build(split_ratio=[0.7, 0.2, 0.1]) 10 | 11 | trn_loader = trn.train_loader(batch_size=128, shuffle=True) 12 | 13 | batch = next(iter(trn_loader)) 14 | print(batch) 15 | 16 | # print("End.") -------------------------------------------------------------------------------- /test/test_ddp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.distributed as dist 4 | import torch.multiprocessing as mp 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.nn.parallel import DistributedDataParallel as DDP 8 | 9 | 10 | def example(rank, world_size): 11 | # create default process group 12 | dist.init_process_group("gloo", rank=rank, world_size=world_size) 13 | # create local model 14 | model = nn.Linear(10, 10).to(rank) 15 | # construct DDP model 16 | ddp_model = DDP(model, device_ids=[rank]) 17 | # define loss function and optimizer 18 | loss_fn = nn.MSELoss() 19 | optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) 20 | 21 | # forward pass 22 | outputs = ddp_model(torch.randn(20, 10).to(rank)) 23 | labels = torch.randn(20, 10).to(rank) 24 | # backward pass 25 | loss_fn(outputs, labels).backward() 26 | # update parameters 27 | optimizer.step() 28 | 29 | def main(): 30 | world_size = 4 31 | mp.spawn(example, 32 | args=(world_size,), 33 | nprocs=world_size, 34 | join=True) 35 | 36 | if __name__=="__main__": 37 | # Environment variables which need to be 38 | # set when using c10d's default "env" 39 | # initialization mode.
40 | os.environ["MASTER_ADDR"] = "localhost" 41 | os.environ["MASTER_PORT"] = "29500" 42 | main() -------------------------------------------------------------------------------- /test/test_quickrun.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".") 3 | from recstudio import quickstart 4 | 5 | quickstart.run(model='MultiVAE', dataset='ml-100k', gpu=[2]) 6 | 7 | 8 | import recstudio.data as recdata 9 | 10 | print(recdata.supported_dataset) -------------------------------------------------------------------------------- /test/test_retriever.py: -------------------------------------------------------------------------------- 1 | from recstudio.model import scorer, loss_func # import the scorer and loss-function modules 2 | from recstudio.ann import sampler # import the sampler module 3 | from recstudio.model.basemodel import BaseRetriever # import the base class of retriever models 4 | from recstudio.data import dataset # import the dataset module 5 | import torch 6 | import sys 7 | sys.path.append(".") 8 | 9 | 10 | ml_100k_data = dataset.TripletDataset(name='ml-100k') 11 | trn, val, tst = ml_100k_data.build(split_ratio=[0.7, 0.2, 0.1]) 12 | 13 | bpr = BaseRetriever( 14 | item_encoder=torch.nn.Embedding(trn.num_items, 64, 0), 15 | query_encoder=torch.nn.Embedding(trn.num_users, 64, 0), 16 | scorer=scorer.InnerProductScorer(), 17 | loss=loss_func.BPRLoss(), 18 | sampler=sampler.UniformSampler(trn.num_items) 19 | ) 20 | 21 | bpr.fit(trn, val, negative_count=1) 22 | bpr.evaluate(tst) 23 | -------------------------------------------------------------------------------- /test/test_training_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from tqdm import tqdm 4 | from termcolor import colored 5 | 6 | 7 | dir = os.path.join( os.path.dirname(__file__), '../' ) 8 | sys.path.append( os.path.abspath(dir) ) 9 | 10 | from recstudio.quickstart import run 11 | 12 | # Note: please add models to be tested here 13 | datasets = ['ml-100k'] 14 | 15 | fm_model = ['DCN', 'DeepFM', 'FM', 'LR', 'NFM', 'WideDeep', 'xDeepFM'] 16 | mf_model = ['BPR', 'CML', 'DSSM', 'EASE', 'IRGAN', 'ItemKNN', 'LogisticMF', 'NCF', 'SLIM', 'WRMF'] 17 | seq_model = ['BERT4Rec', 'Caser', 'FPMC', 'GRU4Rec', 'HGN', 'NARM', 'NPE', 'SASRec', 'STAMP', 'TransRec'] 18 | ae_model = ['MultiDAE', 'MultiVAE'] 19 | graph_model = ['LightGCN', 'NCL', 'NGCF', 'SGL', 'SimGCL'] 20 | cl_seq_model = ['CL4SRec', 'CoSeRec', 'ICLRec'] 21 | 22 | 23 | training_configs = [ 24 | {'train': {'epochs': 3} }, 25 | ] 26 | 27 | all_models = { 28 | 'FM': fm_model, 29 | 'MF': mf_model, 30 | 'SEQ': seq_model, 31 | 'AE': ae_model, 32 | 'GRAPH': graph_model, 33 | 'CL-SEQ': cl_seq_model, 34 | } 35 | 36 | # test loop 37 | num_exps = sum([len(m) for m in all_models.values()]) * len(datasets) * len(training_configs) 38 | pbar = tqdm(total=num_exps) 39 | for cate, models in all_models.items(): 40 | tqdm.write(f"Test {cate} models - {len(models)} models:") 41 | failed_exp = [] 42 | for m in models: 43 | for d in datasets: 44 | for i, config in enumerate(training_configs): 45 | pbar.update(1) 46 | tqdm.write(colored(f"### Test: model-{m}, data-{d}, {i}-th configuration.", on_color='on_blue')) 47 | try: 48 | run(m, d, config, verbose=False) 49 | tqdm.write(colored("$$$ Test passed!", 'green')) 50 | except: 51 | tqdm.write(colored("!!! Test failed!", 'red')) 52 | failed_exp.append({ 53 | 'model': m, 54 | 'dataset': d, 55 | 'config': config 56 | }) 57 | tqdm.write("{} models test End. 
{}/{} failed.".format( cate, len(failed_exp), (len(models) * len(datasets) * len(training_configs)) )) 58 | 59 | pbar.close() --------------------------------------------------------------------------------
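Taken together, the root ``run.py`` above wires the default parser, per-model arguments, and ``quickstart.run`` into a small CLI (e.g. ``python run.py --model=SASRec --dataset=ml-100k``, assuming the default parser spells the flags this way). The same entry point is available from Python, mirroring how ``test_training_pipeline.py`` drives it:

```python
from recstudio import quickstart

# any key from the model's YAML can be overridden through model_config;
# the 3-epoch override below mirrors the smoke-test config above
(model, datasets), (val_result, test_result) = quickstart.run(
    'SASRec', 'ml-100k', model_config={'train': {'epochs': 3}})
print(val_result, test_result)
```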