├── .gitignore ├── LICENSE ├── README.md ├── assets ├── image.png └── overview.png ├── config ├── dataset_config.yaml ├── model_config.yaml └── tuner │ └── TiktokLarge_STEM_tuner.yaml ├── data └── TiktokLarge │ └── preprocess.py ├── fuxictr ├── __init__.py ├── autotuner.py ├── datasets │ ├── __init__.py │ ├── avazu.py │ ├── criteo.py │ └── kkbox.py ├── features.py ├── metrics.py ├── preprocess │ ├── __init__.py │ ├── build_dataset.py │ ├── feature_processor.py │ ├── normalizer.py │ └── tokenizer.py ├── pytorch │ ├── __init__.py │ ├── dataloaders │ │ ├── __init__.py │ │ ├── h5_block_dataloader.py │ │ └── h5_dataloader.py │ ├── layers │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── attentions │ │ │ ├── __init__.py │ │ │ ├── dot_product_attention.py │ │ │ ├── squeeze_excitation.py │ │ │ └── target_attention.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── factorization_machine.py │ │ │ ├── logistic_regression.py │ │ │ └── mlp_block.py │ │ ├── embeddings │ │ │ ├── __init__.py │ │ │ ├── feature_embedding.py │ │ │ └── pretrained_embedding.py │ │ ├── interactions │ │ │ ├── __init__.py │ │ │ ├── bilinear_interaction.py │ │ │ ├── compressed_interaction_net.py │ │ │ ├── cross_net.py │ │ │ ├── holographic_interaction.py │ │ │ ├── inner_product.py │ │ │ └── interaction_machine.py │ │ └── pooling.py │ ├── models │ │ ├── __init__.py │ │ ├── multitask_model.py │ │ └── rank_model.py │ └── torch_utils.py ├── tensorflow │ ├── __init__.py │ ├── dataloaders │ │ ├── __init__.py │ │ └── tf_dataloader.py │ ├── layers │ │ ├── __init__.py │ │ ├── blocks │ │ │ ├── __init__.py │ │ │ ├── factorization_machine.py │ │ │ ├── linear.py │ │ │ ├── logistic_regression.py │ │ │ └── mlp_block.py │ │ ├── embeddings │ │ │ ├── __init__.py │ │ │ └── feature_embedding.py │ │ ├── interactions │ │ │ ├── __init__.py │ │ │ ├── cross_net.py │ │ │ └── inner_product.py │ │ └── pooling.py │ ├── models │ │ ├── __init__.py │ │ └── rank_model.py │ └── tf_utils.py ├── utils.py └── version.py ├── models ├── AITM.py 
├── ESMM.py ├── MMoE.py ├── OMoE.py ├── PLE.py ├── STEM.py ├── SharedBottom.py └── __init__.py ├── run_expid.py ├── run_expid_list.py └── run_param_tuner.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # data 163 | *.txt 164 | *.tgz 165 | *.csv 166 | data/TiktokLarge/TiktokLarge/ 167 | 168 | #checkpoints 169 | checkpoints -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # STEM: Unleashing the Power of Embeddings for Multi-Task Recommendation 2 | ![logo](assets/image.png) 3 | 4 | The open source code for "[STEM: Unleashing the Power of Embeddings for Multi-Task Recommendation](https://arxiv.org/abs/2308.13537)" presented at AAAI 2024. 5 | 6 | 7 | 🚧 **This code repository is under construction, please stay tuned!** 8 | ## Model Overview 9 | ![overview](assets/overview.png) 10 | 11 | ## Getting Started 12 | ### Data Preparation 13 | #### Tiktok 14 | Step 1. Please download the dataset ```final_track1_train.txt.tgz``` of IEEE ICME 2019 Grand Challenge Track 1 to the directory ```data/TiktokLarge/raw_data```. You can download the original dataset from [BaiduNetDisk](https://pan.baidu.com/s/1ktHIHVx6mJqKnZCNcvQ41w)(password:tk1c). Kindly be informed that the usage of this dataset is restricted to academic research purposes exclusively and it must not be utilized for any commercial or illegal activities. 15 | 16 | Step 2. Then, run the data preprocessing script as follows. 17 | ```sh 18 | cd data/TiktokLarge 19 | # Please put final_track1_train.txt under ./raw_data 20 | # or modify the data set path in preprocess.py 21 | python preprocess.py 22 | ``` 23 | 24 | ### Reproduce Steps 25 | 26 | In the next steps, we assume that the directory is ```src```. 27 | 28 | #### Model and Dataset Configuration 29 | We provided available model configuration file ```model_config.yaml``` and data configuration file ```dataset_config.yaml``` in the ```config``` directory. 
Please modify these two files according to your needs if necessary. 30 | 31 | #### Reproduce the model performance 32 | You can train a STEM model by running the following script. 33 | ```sh 34 | python run_expid.py --expid STEM_TiktokLarge --config ./config --gpu 0 35 | ``` 36 | For better performance, we provide the tuner to automatically search for optimal hyperparameters. For example, 37 | ```sh 38 | # Please modify TiktokLarge_STEM_tuner.yaml to adjust the search space 39 | python run_param_tuner.py --config ./config/tuner/TiktokLarge_STEM_tuner.yaml --gpu 0 1 40 | ``` 41 | We also provide baseline implementations for comparing model performance. For example, 42 | ```sh 43 | # Shared-Bottom 44 | python run_expid.py --expid SharedBottom_TiktokLarge --config ./config --gpu 0 45 | # MMoE 46 | python run_expid.py --expid MMoE_TiktokLarge --config ./config --gpu 0 47 | # PLE 48 | python run_expid.py --expid PLE_TiktokLarge --config ./config --gpu 0 49 | ``` 50 | 51 | ## Contact 52 | If you have any problems with this implementation, please create an issue or send us an email at: 53 | - sulc21@mails.tsinghua.edu.cn (Liangcai Su) 54 | 55 | ## Citation 56 | 57 | If you find our code or processed data helpful in your research, please kindly cite the following papers. 58 | ```bibtex 59 | @article{AAAI24_STEM, 60 | author = {Liangcai Su and 61 | Junwei Pan and 62 | Ximei Wang and 63 | Xi Xiao and 64 | Shijie Quan and 65 | Xihua Chen and 66 | Jie Jiang}, 67 | title = {{STEM:} Unleashing the Power of Embeddings for Multi-task Recommendation}, 68 | journal = {Proceedings of the 38th AAAI Conference on Artificial Intelligence (AAAI 2024)}, 69 | year = {2024}, 70 | } 71 | ``` 72 | 73 | 74 | Our code is based on FuxiCTR and BARS. 75 | 76 | > Jieming Zhu, Jinyang Liu, Shuai Yang, Qi Zhang, Xiuqiang He. [Open Benchmarking for Click-Through Rate Prediction](https://arxiv.org/abs/2009.05794). 
*The 30th ACM International Conference on Information and Knowledge Management (CIKM)*, 2021. [[Bibtex](https://dblp.org/rec/conf/cikm/ZhuLYZH21.html?view=bibtex)] 77 | 78 | > Jieming Zhu, Quanyu Dai, Liangcai Su, Rong Ma, Jinyang Liu, Guohao Cai, Xi Xiao, Rui Zhang. [BARS: Towards Open Benchmarking for Recommender Systems](https://arxiv.org/abs/2205.09626). *The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)*, 2022. [[Bibtex](https://dblp.org/rec/conf/sigir/ZhuDSMLCXZ22.html?view=bibtex)] 79 | -------------------------------------------------------------------------------- /assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiangcaiSu/STEM/769e2af0d0d1a0be9f58b475b95e05f502a20df5/assets/image.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiangcaiSu/STEM/769e2af0d0d1a0be9f58b475b95e05f502a20df5/assets/overview.png -------------------------------------------------------------------------------- /config/dataset_config.yaml: -------------------------------------------------------------------------------- 1 | TiktokLarge: 2 | data_root: ./data/TiktokLarge 3 | data_format: csv 4 | train_data: ./data/TiktokLarge/train.csv 5 | valid_data: ./data/TiktokLarge/valid.csv 6 | test_data: ./data/TiktokLarge/test.csv 7 | min_categr_count: 10 8 | feature_cols: 9 | [ { name: ['uid', 'item_id', 'author_id', 'item_city', 10 | 'channel', 'music_id', 'device_id', 'video_duration'], 11 | active: True, dtype: str, type: categorical } ] 12 | label_col: [ { name: finish, dtype: float }, 13 | { name: like, dtype: float } ] 14 | group_id: 'uid' 15 | 16 | -------------------------------------------------------------------------------- /config/model_config.yaml: 
-------------------------------------------------------------------------------- 1 | Base: 2 | model_root: './checkpoints' 3 | num_workers: 3 4 | verbose: 1 5 | early_stop_patience: 1 6 | pickle_feature_encoder: True 7 | save_best_only: True 8 | eval_steps: null 9 | debug_mode: False 10 | group_id: null 11 | use_features: null 12 | feature_specs: null 13 | feature_config: null 14 | 15 | SharedBottom_TiktokLarge_finish: 16 | model: SharedBottom 17 | dataset_id: TiktokLarge 18 | task_labels: ['finish'] 19 | loss: ['binary_crossentropy'] 20 | metrics: ['logloss', 'AUC', 'gAUC'] 21 | task: ['binary_classification'] 22 | num_tasks: 1 23 | optimizer: adam 24 | learning_rate: 1.e-4 25 | bottom_hidden_units: [512,512,512] 26 | tower_hidden_units: [128, 128] 27 | hidden_activations: relu 28 | net_regularizer: 0 29 | embedding_regularizer: 1.e-6 30 | batch_norm: False 31 | net_dropout: 0 32 | batch_size: 4096 33 | embedding_dim: 16 34 | epochs: 20 35 | shuffle: True 36 | seed: 2023 37 | monitor: 'AUC' 38 | monitor_mode: 'max' 39 | group_id: 'uid' 40 | 41 | SharedBottom_TiktokLarge_like: 42 | model: SharedBottom 43 | dataset_id: TiktokLarge 44 | loss: ['binary_crossentropy'] 45 | metrics: ['logloss', 'AUC', 'gAUC'] 46 | task: ['binary_classification'] 47 | task_labels: ['like'] 48 | num_tasks: 1 49 | optimizer: adam 50 | learning_rate: 1.e-4 51 | bottom_hidden_units: [512,512,512] 52 | tower_hidden_units: [128, 128] 53 | hidden_activations: relu 54 | net_regularizer: 0 55 | embedding_regularizer: 1.e-6 56 | batch_norm: False 57 | net_dropout: 0 58 | batch_size: 4096 59 | embedding_dim: 16 60 | epochs: 20 61 | shuffle: True 62 | seed: 2023 63 | monitor: 'AUC' 64 | monitor_mode: 'max' 65 | group_id: 'uid' 66 | 67 | SharedBottom_TiktokLarge: 68 | model: SharedBottom 69 | dataset_id: TiktokLarge 70 | loss: ['binary_crossentropy','binary_crossentropy'] 71 | metrics: ['logloss', 'AUC', 'gAUC'] 72 | task: ['binary_classification','binary_classification'] 73 | num_tasks: 2 74 | 
optimizer: adam 75 | learning_rate: 1.e-4 76 | bottom_hidden_units: [512,512,512] 77 | tower_hidden_units: [128, 128] 78 | hidden_activations: relu 79 | net_regularizer: 0 80 | embedding_regularizer: 1.e-6 81 | batch_norm: False 82 | net_dropout: 0 83 | batch_size: 4096 84 | embedding_dim: 16 85 | epochs: 20 86 | shuffle: True 87 | seed: 2023 88 | monitor: 'AUC' 89 | monitor_mode: 'max' 90 | group_id: 'uid' 91 | 92 | 93 | OMoE_TiktokLarge: 94 | model: OMoE 95 | dataset_id: TiktokLarge 96 | loss: ['binary_crossentropy','binary_crossentropy'] 97 | metrics: ['logloss', 'AUC', 'gAUC'] 98 | task: ['binary_classification','binary_classification'] 99 | num_tasks: 2 100 | optimizer: adam 101 | learning_rate: 1.e-4 102 | num_experts: 8 103 | expert_hidden_units: [512,512,512] 104 | gate_hidden_units: [128, 64] 105 | tower_hidden_units: [128, 64] 106 | hidden_activations: relu 107 | net_regularizer: 0 108 | embedding_regularizer: 1.e-6 109 | batch_norm: False 110 | net_dropout: 0 111 | batch_size: 4096 112 | embedding_dim: 16 113 | epochs: 20 114 | shuffle: True 115 | seed: 2023 116 | monitor: 'AUC' 117 | monitor_mode: 'max' 118 | group_id: 'uid' 119 | 120 | 121 | MMoE_TiktokLarge: 122 | model: MMoE 123 | dataset_id: TiktokLarge 124 | loss: ['binary_crossentropy','binary_crossentropy'] 125 | metrics: ['logloss', 'AUC', 'gAUC'] 126 | task: ['binary_classification','binary_classification'] 127 | num_tasks: 2 128 | optimizer: adam 129 | learning_rate: 1.e-4 130 | num_experts: 8 131 | expert_hidden_units: [512,512,512] 132 | gate_hidden_units: [128, 64] 133 | tower_hidden_units: [128, 64] 134 | hidden_activations: relu 135 | net_regularizer: 0 136 | embedding_regularizer: 1.e-6 137 | batch_norm: False 138 | net_dropout: 0 139 | batch_size: 4096 140 | embedding_dim: 16 141 | epochs: 20 142 | shuffle: True 143 | seed: 2023 144 | monitor: 'AUC' 145 | monitor_mode: 'max' 146 | group_id: 'uid' 147 | 148 | 149 | PLE_TiktokLarge: 150 | model: PLE 151 | dataset_id: TiktokLarge 152 | 
loss: ['binary_crossentropy','binary_crossentropy'] 153 | metrics: ['logloss', 'AUC', 'gAUC'] 154 | task: ['binary_classification','binary_classification'] 155 | num_tasks: 2 156 | optimizer: adam 157 | learning_rate: 1.e-4 158 | num_layers: 1 159 | num_workers: 4 160 | num_shared_experts: 8 161 | num_specific_experts: 1 162 | expert_hidden_units: [512,512,512] 163 | gate_hidden_units: [128, 64] 164 | tower_hidden_units: [128, 64] 165 | hidden_activations: relu 166 | net_regularizer: 0 167 | embedding_regularizer: 1.e-6 168 | batch_norm: False 169 | net_dropout: 0 170 | batch_size: 4096 171 | embedding_dim: 16 172 | epochs: 20 173 | shuffle: True 174 | seed: 2023 175 | monitor: 'AUC' 176 | monitor_mode: 'max' 177 | group_id: 'uid' 178 | 179 | AITM_TiktokLarge: 180 | model: AITM 181 | dataset_id: TiktokLarge 182 | loss: ['binary_crossentropy','binary_crossentropy'] 183 | metrics: ['logloss', 'AUC', 'gAUC'] 184 | task: ['binary_classification','binary_classification'] 185 | num_tasks: 2 186 | optimizer: adam 187 | learning_rate: 1.e-4 188 | bottom_hidden_units: [512,512,512] 189 | tower_hidden_units: [128, 128] 190 | hidden_activations: relu 191 | net_regularizer: 0 192 | embedding_regularizer: 1.e-6 193 | batch_norm: False 194 | net_dropout: 0 195 | batch_size: 4096 196 | embedding_dim: 16 197 | epochs: 20 198 | shuffle: True 199 | seed: 2023 200 | monitor: 'AUC' 201 | monitor_mode: 'max' 202 | group_id: 'uid' 203 | 204 | 205 | ESMM_TiktokLarge: 206 | model: ESMM 207 | dataset_id: TiktokLarge 208 | loss: ['binary_crossentropy','binary_crossentropy'] 209 | metrics: ['logloss', 'AUC'] 210 | task: ['binary_classification','binary_classification'] 211 | num_tasks: 2 212 | optimizer: adam 213 | learning_rate: 1.e-4 214 | tower_hidden_units: [512, 512, 512] 215 | hidden_activations: relu 216 | net_regularizer: 0 217 | embedding_regularizer: 1.e-6 218 | batch_norm: False 219 | net_dropout: 0 220 | batch_size: 4096 221 | embedding_dim: 16 222 | epochs: 50 223 | shuffle: 
True 224 | seed: 2023 225 | monitor: 'AUC' 226 | monitor_mode: 'max' 227 | 228 | 229 | 230 | MMoE_ME_TiktokLarge: 231 | model: MMoE_ME 232 | dataset_id: TiktokLarge 233 | loss: ['binary_crossentropy','binary_crossentropy'] 234 | metrics: ['logloss', 'AUC', 'gAUC'] 235 | task: ['binary_classification','binary_classification'] 236 | num_tasks: 2 237 | optimizer: adam 238 | learning_rate: 1.e-4 239 | num_experts: 8 240 | expert_hidden_units: [512,512,512] 241 | gate_hidden_units: [128, 64] 242 | tower_hidden_units: [128, 64] 243 | hidden_activations: relu 244 | net_regularizer: 0 245 | embedding_regularizer: 1.e-6 246 | batch_norm: False 247 | net_dropout: 0 248 | batch_size: 4096 249 | embedding_dim: 16 250 | epochs: 20 251 | shuffle: True 252 | seed: 2023 253 | monitor: 'AUC' 254 | monitor_mode: 'max' 255 | group_id: 'uid' 256 | 257 | 258 | PLE_ME_TiktokLarge: 259 | model: PLE_ME 260 | dataset_id: TiktokLarge 261 | loss: ['binary_crossentropy','binary_crossentropy'] 262 | metrics: ['logloss', 'AUC', 'gAUC'] 263 | task: ['binary_classification','binary_classification'] 264 | num_tasks: 2 265 | optimizer: adam 266 | learning_rate: 1.e-4 267 | num_layers: 1 268 | num_workers: 4 269 | num_shared_experts: 8 270 | num_specific_experts: 1 271 | expert_hidden_units: [512,512,512] 272 | gate_hidden_units: [128, 64] 273 | tower_hidden_units: [128, 64] 274 | hidden_activations: relu 275 | net_regularizer: 0 276 | embedding_regularizer: 1.e-6 277 | batch_norm: False 278 | net_dropout: 0 279 | batch_size: 4096 280 | embedding_dim: 16 281 | epochs: 20 282 | shuffle: True 283 | seed: 2023 284 | monitor: 'AUC' 285 | monitor_mode: 'max' 286 | group_id: 'uid' 287 | 288 | STEM_TiktokLarge: 289 | model: STEM 290 | dataset_id: TiktokLarge 291 | loss: ['binary_crossentropy','binary_crossentropy'] 292 | metrics: ['logloss', 'AUC', 'gAUC'] 293 | task: ['binary_classification','binary_classification'] 294 | num_tasks: 2 295 | optimizer: adam 296 | learning_rate: 1.e-3 297 | num_layers: 1 
298 | num_shared_experts: 8 299 | num_specific_experts: 1 300 | expert_hidden_units: [512,512,512] 301 | gate_hidden_units: [128, 64] 302 | tower_hidden_units: [128, 64] 303 | hidden_activations: relu 304 | net_regularizer: 0 305 | embedding_regularizer: 1.e-6 306 | batch_norm: False 307 | net_dropout: 0 308 | batch_size: 4096 309 | embedding_dim: 16 310 | epochs: 20 311 | shuffle: True 312 | seed: 2023 313 | monitor: 'AUC' 314 | monitor_mode: 'max' 315 | group_id: 'uid' 316 | -------------------------------------------------------------------------------- /config/tuner/TiktokLarge_STEM_tuner.yaml: -------------------------------------------------------------------------------- 1 | base_config: ./config 2 | base_expid: STEM_TiktokLarge 3 | dataset_id: TiktokLarge 4 | 5 | tuner_space: 6 | model_root: './checkpoints/TiktokLarge/TiktokLarge_STEM_tuner' 7 | num_shared_experts: [1,2,4,8] 8 | num_specific_experts: [1,2,4] 9 | expert_hidden_units: [[512,512,512]] 10 | gate_hidden_units: [[128, 64]] 11 | tower_hidden_units: [[128, 64]] 12 | hidden_activations: relu 13 | net_regularizer: 0 14 | embedding_regularizer: [1.e-6, 1.e-5, 1.e-4, 1.e-3, 5.e-6, 5.e-5, 5.e-4, 5.e-3] 15 | net_dropout: [0, 0.1, 0.2, 0.3] 16 | batch_norm: False 17 | learning_rate: [1.e-4] 18 | batch_size: 4096 19 | seed: [2023, 76, 525, 728, 42] 20 | group_id: uid 21 | metrics: [[gAUC, AUC, logloss]] 22 | monitor: {"gAUC": 0, "AUC": 1} 23 | -------------------------------------------------------------------------------- /data/TiktokLarge/preprocess.py: -------------------------------------------------------------------------------- 1 | # %% 2 | import numpy as np 3 | import pandas as pd 4 | # from datetime import datetime 5 | from sklearn.model_selection import train_test_split 6 | 7 | # %% 8 | data = pd.read_csv('./raw_data/final_track1_train.txt',sep='\t', 9 | names=['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'finish', 'like', 'music_id', 10 | 'device_id', 'create_time', 
'video_duration'], 11 | header=None) 12 | 13 | 14 | # %% 15 | train_valid_set, test_set = train_test_split(data, test_size=0.1, random_state=42) 16 | train_set, valid_set = train_test_split(train_valid_set, test_size=0.1, random_state=42) 17 | 18 | # %% 19 | train_set.to_csv('./train.csv',index=None) 20 | valid_set.to_csv('./valid.csv',index=None) 21 | test_set.to_csv('./test.csv',index=None) 22 | -------------------------------------------------------------------------------- /fuxictr/__init__.py: -------------------------------------------------------------------------------- 1 | from .version import __version__ -------------------------------------------------------------------------------- /fuxictr/autotuner.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ========================================================================= 16 | 17 | 18 | import itertools 19 | import subprocess 20 | import yaml 21 | import os 22 | import numpy as np 23 | import time 24 | import glob 25 | import hashlib 26 | from .utils import print_to_json, load_model_config, load_dataset_config 27 | 28 | # add this line to avoid weird characters in yaml files 29 | yaml.Dumper.ignore_aliases = lambda *args : True 30 | 31 | def enumerate_params(config_file, exclude_expid=[]): 32 | with open(config_file, "r") as cfg: 33 | config_dict = yaml.load(cfg, Loader=yaml.FullLoader) 34 | # tuning space 35 | tune_dict = config_dict["tuner_space"] 36 | for k, v in tune_dict.items(): 37 | if not isinstance(v, list): 38 | tune_dict[k] = [v] 39 | experiment_id = config_dict["base_expid"] 40 | if "model_config" in config_dict: 41 | model_dict = config_dict["model_config"][experiment_id] 42 | else: 43 | base_config_dir = config_dict.get("base_config", os.path.dirname(config_file)) 44 | model_dict = load_model_config(base_config_dir, experiment_id) 45 | 46 | dataset_id = config_dict.get("dataset_id", model_dict["dataset_id"]) 47 | if "dataset_config" in config_dict: 48 | dataset_dict = config_dict["dataset_config"][dataset_id] 49 | else: 50 | dataset_dict = load_dataset_config(base_config_dir, dataset_id) 51 | 52 | if model_dict["dataset_id"] == "TBD": # rename base expid 53 | model_dict["dataset_id"] = dataset_id 54 | experiment_id = model_dict["model"] + "_" + dataset_id 55 | 56 | # key checking 57 | tuner_keys = set(tune_dict.keys()) 58 | base_keys = set(model_dict.keys()).union(set(dataset_dict.keys())) 59 | if len(tuner_keys - base_keys) > 0: 60 | raise RuntimeError("Invalid params in tuner config: {}".format(tuner_keys - base_keys)) 61 | 62 | config_dir = config_file.replace(".yaml", "") 63 | if not os.path.exists(config_dir): 64 | os.makedirs(config_dir) 65 | 66 | # enumerate dataset para combinations 67 | dataset_dict = {k: tune_dict[k] if k in 
tune_dict else [v] for k, v in dataset_dict.items()} 68 | dataset_para_keys = list(dataset_dict.keys()) 69 | dataset_para_combs = dict() 70 | for idx, values in enumerate(itertools.product(*map(dataset_dict.get, dataset_para_keys))): 71 | dataset_params = dict(zip(dataset_para_keys, values)) 72 | if dataset_params["data_format"] == "h5": 73 | dataset_para_combs[dataset_id] = dataset_params 74 | else: 75 | hash_id = hashlib.md5("".join(sorted(print_to_json(dataset_params))).encode("utf-8")).hexdigest()[0:8] 76 | dataset_para_combs[dataset_id + "_{}".format(hash_id)] = dataset_params 77 | 78 | # dump dataset para combinations to config file 79 | dataset_config = os.path.join(config_dir, "dataset_config.yaml") 80 | with open(dataset_config, "w") as fw: 81 | yaml.dump(dataset_para_combs, fw, default_flow_style=None, indent=4) 82 | 83 | # enumerate model para combinations 84 | model_dict = {k: tune_dict[k] if k in tune_dict else [v] for k, v in model_dict.items()} 85 | model_para_keys = list(model_dict.keys()) 86 | model_param_combs = dict() 87 | for idx, values in enumerate(itertools.product(*map(model_dict.get, model_para_keys))): 88 | model_param_combs[idx + 1] = dict(zip(model_para_keys, values)) 89 | 90 | # update dataset_id into model params 91 | merged_param_combs = dict() 92 | for idx, item in enumerate(itertools.product(model_param_combs.values(), 93 | dataset_para_combs.keys())): 94 | para_dict = item[0] 95 | para_dict["dataset_id"] = item[1] 96 | del para_dict["model_id"] 97 | random_str = "" 98 | if para_dict["debug_mode"]: 99 | random_str = "{:06d}".format(np.random.randint(1e6)) # add a random number to avoid duplicate during debug 100 | hash_id = hashlib.md5(("".join(sorted(print_to_json(para_dict))) + random_str).encode("utf-8")).hexdigest()[0:8] 101 | hash_expid = experiment_id + "_{:03d}_{}".format(idx + 1, hash_id) 102 | if hash_expid not in exclude_expid: 103 | merged_param_combs[hash_expid] = para_dict.copy() 104 | 105 | # dump model para 
combinations to config file 106 | model_config = os.path.join(config_dir, "model_config.yaml") 107 | with open(model_config, "w") as fw: 108 | yaml.dump(merged_param_combs, fw, default_flow_style=None, indent=4) 109 | print("Enumerate all tuner configurations done.") 110 | return config_dir 111 | 112 | def load_experiment_ids(config_dir): 113 | model_configs = glob.glob(os.path.join(config_dir, "model_config.yaml")) 114 | if not model_configs: 115 | model_configs = glob.glob(os.path.join(config_dir, "model_config/*.yaml")) 116 | experiment_id_list = [] 117 | for config in model_configs: 118 | with open(config, "r") as cfg: 119 | config_dict = yaml.load(cfg, Loader=yaml.FullLoader) 120 | experiment_id_list += config_dict.keys() 121 | return sorted(experiment_id_list) 122 | 123 | def grid_search(config_dir, gpu_list, expid_tag=None, script='run_expid.py'): 124 | experiment_id_list = load_experiment_ids(config_dir) 125 | if expid_tag is not None: 126 | experiment_id_list = [expid for expid in experiment_id_list if str(expid_tag) in expid] 127 | assert len(experiment_id_list) > 0, "tag={} does not match any expid." 128 | gpu_list = list(gpu_list) 129 | idle_queue = list(range(len(gpu_list))) 130 | processes = dict() 131 | while len(experiment_id_list) > 0: 132 | if len(idle_queue) > 0: 133 | idle_idx = idle_queue.pop(0) 134 | gpu_id = gpu_list[idle_idx] 135 | expid = experiment_id_list.pop(0) 136 | cmd = "python -u {} --config {} --expid {} --gpu {}"\ 137 | .format(script, config_dir, expid, gpu_id) 138 | p = subprocess.Popen(cmd.split()) 139 | processes[idle_idx] = p 140 | else: 141 | time.sleep(3) 142 | for idle_idx, p in processes.items(): 143 | if p.poll() is not None: # terminated 144 | idle_queue.append(idle_idx) 145 | [p.wait() for p in processes.values()] 146 | -------------------------------------------------------------------------------- /fuxictr/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import criteo 2 | from . import avazu 3 | from . import kkbox 4 | -------------------------------------------------------------------------------- /fuxictr/datasets/avazu.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class FeatureProcessor(BaseFeatureProcessor):
    """Avazu-specific preprocessor deriving time features from the 'hour'
    column, whose values are timestamps formatted as YYMMDDHH."""

    def convert_weekday(self, df, col_name):
        """Return the day of week (0=Sunday .. 6=Saturday) for each row."""
        def _weekday(ts):
            day = date(int('20' + ts[0:2]), int(ts[2:4]), int(ts[4:6]))
            return int(day.strftime('%w'))
        return df['hour'].apply(_weekday)

    def convert_weekend(self, df, col_name):
        """Return 1 for Saturday/Sunday timestamps, else 0."""
        def _is_weekend(ts):
            day = date(int('20' + ts[0:2]), int(ts[2:4]), int(ts[4:6]))
            return 1 if day.strftime('%w') in ['6', '0'] else 0
        return df['hour'].apply(_is_weekend)

    def convert_hour(self, df, col_name):
        """Extract the hour-of-day (last two digits) as an int."""
        return df['hour'].apply(lambda ts: int(ts[6:8]))
class FeatureProcessor(BaseFeatureProcessor):
    """Criteo-specific preprocessor for the numeric feature columns."""

    def convert_to_bucket(self, df, col_name):
        """Discretize a numeric column: values <= 2 map to themselves,
        larger values to floor(log(v)^2) (the classic Criteo bucketing)."""
        def _bucket(value):
            return int(np.floor(np.log(value) ** 2)) if value > 2 else int(value)
        return df[col_name].map(_bucket).astype(int)
class FeatureProcessor(BaseFeatureProcessor):
    """KKBox-specific preprocessor for ISRC and user-age columns."""

    def extract_country_code(self, df, col_name):
        """Take the leading two letters of an ISRC code; missing -> ''."""
        return df[col_name].apply(lambda isrc: "" if pd.isnull(isrc) else isrc[0:2])

    def bucketize_age(self, df, col_name):
        """Map ages into decade buckets "1".."7"; missing or out-of-range
        ages ([1, 95] is considered valid) map to ''."""
        def _bucket(age):
            if pd.isnull(age):
                return ""
            age = float(age)
            if age < 1 or age > 95:
                return ""
            # Decade upper bounds paired with their bucket labels.
            for upper, label in ((10, "1"), (20, "2"), (30, "3"),
                                 (40, "4"), (50, "5"), (60, "6")):
                if age <= upper:
                    return label
            return "7"
        return df[col_name].apply(_bucket)
class FeatureMap(object):
    """Schema of a processed dataset: the ordered feature specs, label names,
    and the mapping from each feature to its column position(s) in the packed
    data arrays."""

    def __init__(self, dataset_id, data_dir):
        self.data_dir = data_dir  # kept so the embedding layer can locate pretrained embeddings
        self.dataset_id = dataset_id
        self.num_fields = 0
        self.total_features = 0
        self.input_length = 0
        self.features = OrderedDict()
        self.labels = []
        self.column_index = dict()
        self.group_id = None
        self.default_emb_dim = None

    def load(self, json_file, params):
        """Populate this FeatureMap from a feature_map.json, applying any
        `use_features` subset and `feature_specs` overrides from params."""
        logging.info("Load feature_map from json: " + json_file)
        with io.open(json_file, "r", encoding="utf-8") as fd:
            feature_map = json.load(fd)
        if feature_map["dataset_id"] != self.dataset_id:
            raise RuntimeError("dataset_id={} does not match feature_map!".format(self.dataset_id))
        self.num_fields = feature_map["num_fields"]
        self.labels = feature_map.get("labels", [])
        self.total_features = feature_map.get("total_features", 0)
        self.input_length = feature_map.get("input_length", 0)
        self.group_id = feature_map.get("group_id", None)
        self.default_emb_dim = params.get("embedding_dim", None)
        # The json stores features as a list of single-entry dicts; flatten
        # them into one ordered mapping.
        merged = OrderedDict()
        for entry in feature_map["features"]:
            merged.update(entry)
        self.features = merged
        selected = params.get("use_features", None)
        if selected:
            self.features = OrderedDict((name, self.features[name]) for name in selected)
        if params.get("feature_specs", None):
            self.update_feature_specs(params["feature_specs"])
        self.set_column_index()

    def update_feature_specs(self, feature_specs):
        """Overlay user-provided per-feature settings onto the loaded specs."""
        for spec in feature_specs:
            names = spec["name"]
            if type(names) != list:
                names = [names]
            for name in names:
                for key, value in spec.items():
                    if key != "name":
                        self.features[name][key] = value

    def save(self, json_file):
        """Serialize this FeatureMap back to feature_map.json."""
        logging.info("Save feature_map to json: " + json_file)
        os.makedirs(os.path.dirname(json_file), exist_ok=True)
        payload = OrderedDict()
        payload["dataset_id"] = self.dataset_id
        payload["num_fields"] = self.num_fields
        payload["total_features"] = self.total_features
        payload["input_length"] = self.input_length
        payload["labels"] = self.labels
        if self.group_id is not None:
            payload["group_id"] = self.group_id
        # Stored as a list of single-entry dicts, mirroring load().
        payload["features"] = [{name: spec} for name, spec in self.features.items()]
        with open(json_file, "w") as fd:
            json.dump(payload, fd, indent=4)

    def get_num_fields(self, feature_source=[]):
        """Count non-meta features, optionally restricted to given source(s)."""
        if type(feature_source) != list:
            feature_source = [feature_source]
        return sum(
            1
            for spec in self.features.values()
            if spec["type"] != "meta"
            and (len(feature_source) == 0 or spec.get("source") in feature_source)
        )

    def sum_emb_out_dim(self, feature_source=[]):
        """Total embedding output dim over non-meta features (optionally filtered
        by source), falling back per feature to embedding_dim then the default."""
        if type(feature_source) != list:
            feature_source = [feature_source]
        total_dim = 0
        for spec in self.features.values():
            if spec["type"] == "meta":
                continue
            if len(feature_source) == 0 or spec.get("source") in feature_source:
                total_dim += spec.get("emb_output_dim",
                                      spec.get("embedding_dim", self.default_emb_dim))
        return total_dim

    def set_column_index(self):
        """Assign packed-array column positions: sequence features (with
        max_len) span a list of columns, scalars one column; labels follow
        all features. Also refreshes input_length (features only)."""
        logging.info("Set column index...")
        pos = 0
        for name, spec in self.features.items():
            if "max_len" in spec:
                self.column_index[name] = list(range(pos, pos + spec["max_len"]))
                pos += spec["max_len"]
            else:
                self.column_index[name] = pos
                pos += 1
        self.input_length = pos
        for label in self.labels:
            self.column_index[label] = pos
            pos += 1

    def get_column_index(self, feature):
        """Column position(s) of `feature`, (re)building the index if needed."""
        if feature not in self.column_index:
            self.set_column_index()
        return self.column_index[feature]
def evaluate_metrics(y_true, y_pred, metrics, group_id=None):
    """Compute the requested evaluation metrics.

    Args:
        y_true: array-like of binary ground-truth labels.
        y_pred: array-like of predicted scores/probabilities, aligned with y_true.
        metrics: list of metric names. Supports 'logloss'/'binary_crossentropy',
            'AUC', and group-wise metrics: 'gAUC', 'avgAUC', 'MRR', 'NDCG(k=...)'.
        group_id: array-like assigning each sample to a group; required when any
            group-wise metric is requested.

    Returns:
        OrderedDict mapping metric name -> value.

    Raises:
        ValueError: for an unsupported metric name.
        NotImplementedError: if a group metric name cannot be resolved.
    """
    return_dict = OrderedDict()
    group_metrics = []
    for metric in metrics:
        if metric in ['logloss', 'binary_crossentropy']:
            # FIX: scikit-learn >= 1.5 removed log_loss's `eps` argument.
            # Clip predictions explicitly to reproduce the old eps=1e-7
            # behavior on every sklearn version.
            return_dict[metric] = log_loss(y_true, np.clip(y_pred, 1e-7, 1.0 - 1e-7))
        elif metric == 'AUC':
            return_dict[metric] = roc_auc_score(y_true, y_pred)
        elif metric in ["gAUC", "avgAUC", "MRR"] or metric.startswith("NDCG"):
            return_dict[metric] = 0  # placeholder, filled from group results below
            group_metrics.append(metric)
        else:
            raise ValueError("metric={} not supported.".format(metric))
    if len(group_metrics) > 0:
        assert group_id is not None, "group_index is required."
        metric_funcs = []
        for metric in group_metrics:
            try:
                # NOTE: eval maps a trusted config string (e.g. "gAUC" or
                # "NDCG(k=5)") to a callable defined in this module. Do not
                # feed untrusted input here.
                metric_funcs.append(eval(metric))
            except Exception:  # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
                raise NotImplementedError('metrics={} not implemented.'.format(metric))
        score_df = pd.DataFrame({"group_index": group_id,
                                 "y_true": y_true,
                                 "y_pred": y_pred})
        results = []
        # FIX: guard against cpu_count() < 2, where `processes=0` would raise.
        pool = mp.Pool(processes=max(1, mp.cpu_count() // 2))
        for idx, df in score_df.groupby("group_index"):
            results.append(pool.apply_async(evaluate_block, args=(df, metric_funcs)))
        pool.close()
        pool.join()
        results = [res.get() for res in results]
        # Each result is a list of (weighted_value, weight) pairs, one pair per
        # group metric; the final score is the weight-normalized sum over groups.
        sum_results = np.array(results).sum(0)
        average_result = list(sum_results[:, 0] / sum_results[:, 1])
        return_dict.update(dict(zip(group_metrics, average_result)))
    return return_dict

def evaluate_block(df, metric_funcs):
    """Apply each group-wise metric to one group's rows.

    Returns a list of (weighted_value, weight) pairs, one per metric; scalar
    metrics get weight 1 so every group counts equally."""
    res_list = []
    for fn in metric_funcs:
        v = fn(df.y_true.values, df.y_pred.values)
        if type(v) == tuple:
            res_list.append(v)
        else:  # scalar metric: add unit group weight
            res_list.append((v, 1))
    return res_list

def avgAUC(y_true, y_pred):
    """avgAUC used in MIND news recommendation: per-group AUC, groups weighted equally."""
    if np.sum(y_true) > 0 and np.sum(y_true) < len(y_true):
        auc = roc_auc_score(y_true, y_pred)
        return (auc, 1)
    else:  # all negatives or all positives: AUC undefined, exclude the group
        return (0, 0)

def gAUC(y_true, y_pred):
    """gAUC defined in the DIN paper: per-group AUC weighted by group size."""
    if np.sum(y_true) > 0 and np.sum(y_true) < len(y_true):
        auc = roc_auc_score(y_true, y_pred)
        n_samples = len(y_true)
        return (auc * n_samples, n_samples)
    else:  # all negatives or all positives: AUC undefined, exclude the group
        return (0, 0)

def MRR(y_true, y_pred):
    """Mean reciprocal rank of the positives under the score-descending ranking."""
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order)
    rr_score = y_true / (np.arange(len(y_true)) + 1)
    mrr = np.sum(rr_score) / (np.sum(y_true) + 1e-12)
    return mrr


class NDCG(object):
    """Normalized discounted cumulative gain metric at cutoff k."""
    def __init__(self, k=1):
        self.topk = k

    def dcg_score(self, y_true, y_pred):
        """DCG of the top-k items ranked by y_pred (gains 2^rel - 1)."""
        order = np.argsort(y_pred)[::-1]
        y_true = np.take(y_true, order[:self.topk])
        gains = 2 ** y_true - 1
        discounts = np.log2(np.arange(len(y_true)) + 2)
        return np.sum(gains / discounts)

    def __call__(self, y_true, y_pred):
        idcg = self.dcg_score(y_true, y_true)  # ideal ranking normalizer
        dcg = self.dcg_score(y_true, y_pred)
        return dcg / (idcg + 1e-12)
def save_h5(darray_dict, data_path):
    """Write a dict of numpy arrays to `data_path`, one h5 dataset per key."""
    logging.info("Saving data to h5: " + data_path)
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    with h5py.File(data_path, 'w') as hf:
        for key, arr in darray_dict.items():
            hf.create_dataset(key, data=arr)


def split_train_test(train_ddf=None, valid_ddf=None, test_ddf=None, valid_size=0,
                     test_size=0, split_type="sequential"):
    """Split `train_ddf` into train/valid/test DataFrames.

    `valid_size`/`test_size` are absolute row counts when >= 1, or fractions of
    the ORIGINAL training size when < 1. With split_type="random" the row order
    is shuffled first; with "sequential" the tail rows become the test split,
    then the next tail rows the validation split. Pre-supplied valid_ddf /
    test_ddf are returned untouched when the corresponding size is 0.
    """
    num_samples = len(train_ddf)
    train_size = num_samples
    instance_IDs = np.arange(num_samples)
    if split_type == "random":
        np.random.shuffle(instance_IDs)
    if test_size > 0:
        if test_size < 1:
            test_size = int(num_samples * test_size)
        train_size = train_size - test_size
        test_ddf = train_ddf.loc[instance_IDs[train_size:], :].reset_index()
        instance_IDs = instance_IDs[0:train_size]
    if valid_size > 0:
        if valid_size < 1:
            valid_size = int(num_samples * valid_size)
        train_size = train_size - valid_size
        valid_ddf = train_ddf.loc[instance_IDs[train_size:], :].reset_index()
        instance_IDs = instance_IDs[0:train_size]
    if valid_size > 0 or test_size > 0:
        train_ddf = train_ddf.loc[instance_IDs, :].reset_index()
    return train_ddf, valid_ddf, test_ddf


def transform_block(feature_encoder, df_block, filename, preprocess=False):
    """Encode one DataFrame block and save it as an h5 file under the encoder's data_dir."""
    if preprocess:
        df_block = feature_encoder.preprocess(df_block)
    darray_dict = feature_encoder.transform(df_block)
    save_h5(darray_dict, os.path.join(feature_encoder.data_dir, filename))


def transform_h5(feature_encoder, ddf, filename, preprocess=False, block_size=0):
    """Encode `ddf` to h5: as one file, or (block_size > 0) as part_*.h5 blocks
    written by a worker pool.

    NOTE(review): apply_async results are never retrieved, so worker exceptions
    are silently dropped — verify block outputs exist downstream.
    """
    if block_size > 0:
        pool = mp.Pool(mp.cpu_count() // 2)
        block_id = 0
        for idx in range(0, len(ddf), block_size):
            df_block = ddf[idx: (idx + block_size)]
            pool.apply_async(transform_block, args=(feature_encoder,
                                                    df_block,
                                                    '{}/part_{}.h5'.format(filename, block_id),
                                                    preprocess))
            block_id += 1
        pool.close()
        pool.join()
    else:
        transform_block(feature_encoder, ddf, filename + ".h5", preprocess)


def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=None, valid_size=0,
                  test_size=0, split_type="sequential", data_block_size=0, **kwargs):
    """Build feature_map and transform csv data to h5 splits.

    Skips the whole build when feature_map.json already exists. Returns the
    (train, valid, test) output paths; test is None when no test data/size is given.
    """
    feature_map_json = os.path.join(feature_encoder.data_dir, "feature_map.json")
    if os.path.exists(feature_map_json):
        # FIX: logging.warn is deprecated; use logging.warning.
        logging.warning("Skip rebuilding {}. Please delete it manually if rebuilding is required."
                        .format(feature_map_json))
    else:
        # Load csv data
        train_ddf = feature_encoder.read_csv(train_data, **kwargs)
        valid_ddf = None
        test_ddf = None

        # Split data for train/validation/test
        if valid_size > 0 or test_size > 0:
            valid_ddf = feature_encoder.read_csv(valid_data, **kwargs)
            test_ddf = feature_encoder.read_csv(test_data, **kwargs)
            train_ddf, valid_ddf, test_ddf = split_train_test(train_ddf, valid_ddf, test_ddf,
                                                              valid_size, test_size, split_type)

        # Fit on (already preprocessed) train data, then transform it
        train_ddf = feature_encoder.preprocess(train_ddf)
        feature_encoder.fit(train_ddf, **kwargs)
        transform_h5(feature_encoder, train_ddf, 'train', preprocess=False, block_size=data_block_size)
        del train_ddf
        gc.collect()

        # Transform valid_ddf
        if valid_ddf is None and (valid_data is not None):
            valid_ddf = feature_encoder.read_csv(valid_data, **kwargs)
        if valid_ddf is not None:
            transform_h5(feature_encoder, valid_ddf, 'valid', preprocess=True, block_size=data_block_size)
            del valid_ddf
            gc.collect()

        # Transform test_ddf
        if test_ddf is None and (test_data is not None):
            test_ddf = feature_encoder.read_csv(test_data, **kwargs)
        if test_ddf is not None:
            transform_h5(feature_encoder, test_ddf, 'test', preprocess=True, block_size=data_block_size)
            del test_ddf
            gc.collect()
        logging.info("Transform csv data to h5 done.")

    # Return processed data splits
    return os.path.join(feature_encoder.data_dir, "train"), \
           os.path.join(feature_encoder.data_dir, "valid"), \
           os.path.join(feature_encoder.data_dir, "test") if (
               test_data or test_size > 0) else None
class Normalizer(object):
    """Normalizes a numeric feature column.

    Accepts either a user-supplied callable (applied as-is) or the name of a
    supported sklearn scaler ('StandardScaler' or 'MinMaxScaler'), which is
    fitted on the non-NaN values and applied element-wise.
    """

    def __init__(self, normalizer):
        self.callable = callable(normalizer)
        if self.callable:
            # A plain function/lambda: no fitting needed.
            self.normalizer = normalizer
        elif normalizer in ['StandardScaler', 'MinMaxScaler']:
            self.normalizer = getattr(sklearn_preprocess, normalizer)()
        else:
            raise NotImplementedError('normalizer={}'.format(normalizer))

    def fit(self, X):
        """Fit the underlying sklearn scaler on non-NaN values; no-op for callables."""
        if self.callable:
            return
        null_index = np.isnan(X)
        self.normalizer.fit(X[~null_index].reshape(-1, 1))

    def normalize(self, X):
        """Apply the normalizer to X, preserving its 1-D shape."""
        if self.callable:
            return self.normalizer(X)
        return self.normalizer.transform(X.reshape(-1, 1)).flatten()
class Tokenizer(object):
    """Builds a token-to-index vocabulary for one feature column and encodes
    raw values (categorical, meta, or splitter-delimited sequences) to ids.

    Index conventions: 0 is reserved for "__PAD__" and the largest index for
    "__OOV__"; real tokens occupy the range in between.
    """

    def __init__(self, max_features=None, na_value="", min_freq=1, splitter=None, remap=True,
                 lower=False, max_len=0, padding="pre", num_workers=8):
        # max_features: keep only the top-k most frequent tokens (None = no cap)
        # na_value: token treated as missing; excluded from the vocabulary
        # min_freq: drop tokens occurring fewer than this many times
        # splitter: if set, values are token sequences joined by this delimiter
        # remap: True -> assign fresh contiguous ids; False -> tokens are
        #        assumed to already be integer ids and are kept verbatim
        # max_len: pad/truncate length for sequences (0 = longest observed)
        # padding: "pre" or "post"; used for both padding and truncation
        # num_workers: process count for parallel token counting
        self._max_features = max_features
        self._na_value = na_value
        self._min_freq = min_freq
        self._lower = lower
        self._splitter = splitter
        self.vocab = dict()
        self.max_len = max_len
        self.padding = padding
        self.num_workers = num_workers
        self.remap = remap

    def fit_on_texts(self, texts):
        """Count token frequencies over `texts` and build the vocabulary.

        Sequence columns (splitter set) are counted in parallel chunks; the
        observed maximum sequence length fills in max_len when it was 0.
        """
        word_counts = Counter()
        if self._splitter is not None:  # for sequence
            max_len = 0
            with ProcessPoolExecutor(max_workers=self.num_workers) as executor:
                chunks = np.array_split(texts, self.num_workers)
                tasks = [executor.submit(count_tokens, chunk, self._splitter) for chunk in chunks]
                for future in tqdm(as_completed(tasks), total=len(tasks)):
                    block_word_counts, block_max_len = future.result()
                    word_counts.update(block_word_counts)
                    max_len = max(max_len, block_max_len)
            if self.max_len == 0:  # if argument max_len not given
                self.max_len = max_len
        else:
            word_counts = Counter(list(texts))
        self.build_vocab(word_counts)

    def build_vocab(self, word_counts):
        """Turn a token->count mapping into the vocab dict, applying the
        max_features cap and min_freq cutoff, then adding __PAD__/__OOV__."""
        word_counts = word_counts.items()
        # sort to guarantee the determinism of index order
        word_counts = sorted(word_counts, key=lambda x: (-x[1], x[0]))
        if self._max_features:  # keep the most frequent features
            word_counts = word_counts[0:self._max_features]
        words = []
        for token, count in word_counts:
            if count >= self._min_freq:
                if token != self._na_value:
                    words.append(token.lower() if self._lower else token)
            else:
                break  # already sorted in decending order
        if self.remap:
            # fresh ids starting at 1 (0 is reserved for padding)
            self.vocab = dict((token, idx) for idx, token in enumerate(words, 1))
        else:
            # tokens are already integer ids; keep them as-is
            self.vocab = dict((token, int(token)) for token in words)
        self.vocab["__PAD__"] = 0  # use 0 for reserved __PAD__
        self.vocab["__OOV__"] = self.vocab_size()  # use the last index for __OOV__

    def merge_vocab(self, shared_tokenizer):
        """Merge this tokenizer's vocab into a shared tokenizer's vocab (for
        features that share embeddings) and adopt the merged vocab.

        NOTE(review): when the merged vocab is not contiguous, __OOV__ is set
        to vocab_size (not vocab_size - 1) — confirm this off-by-one is intended.
        """
        if self.remap:
            new_words = 0
            for word in self.vocab.keys():
                if word not in shared_tokenizer.vocab:
                    shared_tokenizer.vocab[word] = shared_tokenizer.vocab["__OOV__"] + new_words
                    new_words += 1
        else:
            shared_tokenizer.vocab.update(self.vocab)
        vocab_size = shared_tokenizer.vocab_size()
        if shared_tokenizer.vocab["__OOV__"] != vocab_size - 1:
            shared_tokenizer.vocab["__OOV__"] = vocab_size
        self.vocab = shared_tokenizer.vocab
        return shared_tokenizer

    def vocab_size(self):
        """Largest assigned index + 1 (indices need not be contiguous)."""
        return max(self.vocab.values()) + 1

    def update_vocab(self, word_list):
        """Append unseen words after the current __OOV__ index, then move
        __OOV__ to the new end of the vocabulary."""
        new_words = 0
        for word in word_list:
            if word not in self.vocab:
                self.vocab[word] = self.vocab["__OOV__"] + new_words
                new_words += 1
        if new_words > 0:
            self.vocab["__OOV__"] = self.vocab_size()

    def encode_meta(self, values):
        """Encode meta values to ids, growing the vocab for unseen values
        (e.g. meta data appearing only in the test split)."""
        word_counts = Counter(list(values))
        if len(self.vocab) == 0:
            self.build_vocab(word_counts)
        else:  # for considering meta data in test data
            self.update_vocab(word_counts.keys())
        meta_values = [self.vocab.get(x, self.vocab["__OOV__"]) for x in values]
        return np.array(meta_values)

    def encode_category(self, categories):
        """Encode categorical values to ids; unseen values map to __OOV__."""
        category_indices = [self.vocab.get(x, self.vocab["__OOV__"]) for x in categories]
        return np.array(category_indices)

    def encode_sequence(self, texts):
        """Encode delimiter-joined sequences to padded/truncated id rows.

        Missing/empty entries become all-__PAD__ rows; na_value tokens inside a
        sequence map to __PAD__, other unseen tokens to __OOV__.
        """
        sequence_list = []
        for text in texts:
            if pd.isnull(text) or text == '':
                sequence_list.append([])
            else:
                sequence_list.append([self.vocab.get(x, self.vocab["__OOV__"]) if x != self._na_value \
                                      else self.vocab["__PAD__"] for x in text.split(self._splitter)])
        sequence_list = pad_sequences(sequence_list, maxlen=self.max_len, value=self.vocab["__PAD__"],
                                      padding=self.padding, truncating=self.padding)
        return np.array(sequence_list)

    def load_pretrained_vocab(self, feature_dtype, pretrain_path, expand_vocab=True):
        """Extend the vocab with keys found in a pretrained-embedding h5 file."""
        with h5py.File(pretrain_path, 'r') as hf:
            keys = hf["key"][:]
            keys = keys.astype(feature_dtype)  # in case mismatch of dtype between int and str
        # Update vocab with pretrained keys in case new tokens appear in validation or test set
        # Do not update OOV index here since it is used in PretrainedEmbedding
        if expand_vocab:
            for word in keys:
                if word not in self.vocab:
                    self.vocab[word] = self.vocab_size()


def count_tokens(texts, splitter):
    """Count token frequencies and the longest sequence length over `texts`
    (module-level so it can be pickled for the process pool)."""
    word_counts = Counter()
    max_len = 0
    for text in texts:
        text_split = text.split(splitter)
        max_len = max(max_len, len(text_split))
        for token in text_split:
            word_counts[token] += 1
    return word_counts, max_len
-------------------------------------------------------------------------------- /fuxictr/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiangcaiSu/STEM/769e2af0d0d1a0be9f58b475b95e05f502a20df5/fuxictr/pytorch/__init__.py -------------------------------------------------------------------------------- /fuxictr/pytorch/dataloaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .h5_block_dataloader import H5BlockDataLoader 2 | from .h5_dataloader import H5DataLoader -------------------------------------------------------------------------------- /fuxictr/pytorch/dataloaders/h5_block_dataloader.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2023. FuxiCTR Authors. All rights reserved. 3 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
class BlockIterDataPipe(data.IterDataPipe):
    """Iterable datapipe streaming samples row-by-row from a list of h5 blocks."""

    def __init__(self, block_datapipe, feature_map, verbose=0):
        self.feature_map = feature_map
        self.block_datapipe = block_datapipe  # list of h5 block file paths
        self.verbose = verbose

    def load_data(self, data_path):
        """Load one h5 block and pack all feature columns plus labels into a
        single 2-D tensor (columns follow feature_map order)."""
        data_dict = load_h5(data_path, verbose=self.verbose)
        data_arrays = []
        all_cols = list(self.feature_map.features.keys()) + self.feature_map.labels
        for col in all_cols:
            array = data_dict[col]
            if array.ndim == 1:
                # promote scalar columns to (N, 1) so hstack aligns widths
                data_arrays.append(array.reshape(-1, 1))
            else:
                data_arrays.append(array)
        data_tensor = torch.from_numpy(np.hstack(data_arrays))
        return data_tensor

    def read_block(self, data_block):
        """Yield one sample (row) at a time from a block."""
        darray = self.load_data(data_block)
        for idx in range(darray.shape[0]):
            yield darray[idx, :]

    def __iter__(self):
        worker_info = data.get_worker_info()
        if worker_info is None:  # single-process data loading
            block_list = self.block_datapipe
        else:  # in a worker process: shard blocks round-robin across workers
            block_list = [
                block
                for idx, block in enumerate(self.block_datapipe)
                if idx % worker_info.num_workers == worker_info.id
            ]
        return chain.from_iterable(map(self.read_block, block_list))


class DataLoader(data.DataLoader):
    """torch DataLoader over block-partitioned h5 data (one part_*.h5 per block)."""

    def __init__(self, feature_map, data_path, batch_size=32, shuffle=False,
                 num_workers=1, verbose=0, buffer_size=100000, **kwargs):
        data_blocks = glob.glob(data_path + "/*.h5")
        assert len(data_blocks) > 0, f"invalid data_path: {data_path}"
        if len(data_blocks) > 1:
            data_blocks.sort(key=lambda x: int(x.split("_")[-1].split(".")[0]))  # e.g. "part_1.h5"
        self.data_blocks = data_blocks
        self.num_blocks = len(self.data_blocks)
        self.feature_map = feature_map
        self.batch_size = batch_size
        self.num_batches, self.num_samples = self.count_batches_and_samples()
        datapipe = BlockIterDataPipe(data_blocks, feature_map, verbose)
        if shuffle:
            # buffered shuffle over the streamed samples
            datapipe = datapipe.shuffle(buffer_size=buffer_size)
        super(DataLoader, self).__init__(dataset=datapipe, batch_size=batch_size, num_workers=num_workers)

    def __len__(self):
        return self.num_batches

    def count_batches_and_samples(self):
        """Scan each block's first label column to count samples and batches.

        Note: batches do not span blocks, so the total is the sum of per-block
        ceil(block_len / batch_size).
        """
        num_samples = 0
        num_batches = 0
        for block_path in self.data_blocks:
            with h5py.File(block_path, 'r') as hf:
                y = hf[self.feature_map.labels[0]][:]
                num_samples += len(y)
                num_batches += int(np.ceil(len(y) * 1.0 / self.batch_size))
        return num_batches, num_samples


class H5BlockDataLoader(object):
    """Builds the train/valid/test block DataLoaders for the requested stage
    ("train", "test", or "both")."""

    def __init__(self, feature_map, stage="both", train_data=None, valid_data=None, test_data=None,
                 batch_size=32, shuffle=True, verbose=0, **kwargs):
        logging.info("Loading data...")
        train_gen = None
        valid_gen = None
        test_gen = None
        self.stage = stage
        if stage in ["both", "train"]:
            train_gen = DataLoader(feature_map, train_data, batch_size=batch_size, shuffle=shuffle, verbose=verbose, **kwargs)
            logging.info("Train samples: total/{:d}, blocks/{:d}".format(train_gen.num_samples, train_gen.num_blocks))
            if valid_data:
                valid_gen = DataLoader(feature_map, valid_data, batch_size=batch_size, shuffle=False, verbose=verbose, **kwargs)
                logging.info("Validation samples: total/{:d}, blocks/{:d}".format(valid_gen.num_samples, valid_gen.num_blocks))

        if stage in ["both", "test"]:
            if test_data:
                test_gen = DataLoader(feature_map, test_data, batch_size=batch_size, shuffle=False, verbose=verbose, **kwargs)
                logging.info("Test samples: total/{:d}, blocks/{:d}".format(test_gen.num_samples, test_gen.num_blocks))
        self.train_gen, self.valid_gen, self.test_gen = train_gen, valid_gen, test_gen

    def make_iterator(self):
        """Return the loaders matching the stage: (train, valid), test, or
        (train, valid, test) for "both"."""
        if self.stage == "train":
            logging.info("Loading train and validation data done.")
            return self.train_gen, self.valid_gen
        elif self.stage == "test":
            logging.info("Loading test data done.")
            return self.test_gen
        else:
            logging.info("Loading data done.")
            return self.train_gen, self.valid_gen, self.test_gen
# =========================================================================
# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========================================================================


import numpy as np
from torch.utils import data
from fuxictr.utils import load_h5
import torch
import logging


class Dataset(data.Dataset):
    """Map-style dataset that loads one whole .h5 file into a single 2-D tensor.

    Each row holds all feature columns followed by the label columns, in the
    order given by feature_map.features / feature_map.labels.
    """

    def __init__(self, feature_map, data_path):
        self.feature_map = feature_map
        self.darray = self.load_data_array(data_path)

    def __getitem__(self, index):
        return self.darray[index, :]

    def __len__(self):
        return self.darray.shape[0]

    def load_data_array(self, data_path):
        """Hstack all feature/label columns from the .h5 file into an (N, D) tensor."""
        data_dict = load_h5(data_path)  # dict of arrays from h5
        data_arrays = []
        all_cols = list(self.feature_map.features.keys()) + self.feature_map.labels
        for col in all_cols:
            array = data_dict[col]
            if array.ndim == 1:
                # promote 1-D columns to (N, 1) so np.hstack aligns on axis 1
                data_arrays.append(array.reshape(-1, 1))
            else:
                data_arrays.append(array)
        # Fix: removed leftover debug prints ("lalalala1"/"lalalala2") that
        # polluted stdout on every dataset load.
        data_tensor = torch.from_numpy(np.hstack(data_arrays))
        return data_tensor


class DataLoader(data.DataLoader):
    """DataLoader over a single .h5 file (appends the .h5 suffix if missing)."""

    def __init__(self, feature_map, data_path, batch_size=32, shuffle=False, num_workers=1, **kwargs):
        if not data_path.endswith(".h5"):
            data_path += ".h5"
        self.dataset = Dataset(feature_map, data_path)
        super(DataLoader, self).__init__(dataset=self.dataset, batch_size=batch_size,
                                         shuffle=shuffle, num_workers=num_workers)
        self.num_samples = len(self.dataset)
        self.num_batches = int(np.ceil(self.num_samples * 1.0 / self.batch_size))

    def __len__(self):
        return self.num_batches


class H5DataLoader(object):
    """Factory that builds train/valid/test DataLoaders depending on the stage."""

    def __init__(self, feature_map, stage="both", train_data=None, valid_data=None, test_data=None,
                 batch_size=32, shuffle=True, **kwargs):
        logging.info("Loading data...")
        train_gen = None
        valid_gen = None
        test_gen = None
        self.stage = stage
        if stage in ["both", "train"]:
            train_gen = DataLoader(feature_map, train_data, batch_size=batch_size, shuffle=shuffle, **kwargs)
            logging.info("Train samples: total/{:d}, blocks/{:d}".format(train_gen.num_samples, 1))
            if valid_data:
                # Validation is never shuffled.
                valid_gen = DataLoader(feature_map, valid_data, batch_size=batch_size, shuffle=False, **kwargs)
                logging.info("Validation samples: total/{:d}, blocks/{:d}".format(valid_gen.num_samples, 1))
        if stage in ["both", "test"]:
            if test_data:
                test_gen = DataLoader(feature_map, test_data, batch_size=batch_size, shuffle=False, **kwargs)
                logging.info("Test samples: total/{:d}, blocks/{:d}".format(test_gen.num_samples, 1))
        self.train_gen, self.valid_gen, self.test_gen = train_gen, valid_gen, test_gen

    def make_iterator(self):
        """Return the loaders for the configured stage.

        "train" -> (train_gen, valid_gen); "test" -> test_gen;
        otherwise -> (train_gen, valid_gen, test_gen).
        """
        if self.stage == "train":
            logging.info("Loading train and validation data done.")
            return self.train_gen, self.valid_gen
        elif self.stage == "test":
            logging.info("Loading test data done.")
            return self.test_gen
        else:
            logging.info("Loading data done.")
            return self.train_gen, self.valid_gen, self.test_gen
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | 17 | 18 | import torch 19 | from torch import nn 20 | import numpy as np 21 | 22 | 23 | class Dice(nn.Module): 24 | def __init__(self, input_dim, eps=1e-9): 25 | super(Dice, self).__init__() 26 | self.bn = nn.BatchNorm1d(input_dim, affine=False, eps=eps, momentum=0.01) 27 | self.alpha = nn.Parameter(torch.zeros(input_dim)) 28 | 29 | def forward(self, X): 30 | p = torch.sigmoid(self.bn(X)) 31 | output = p * X + self.alpha * (1 - p) * X 32 | return output 33 | 34 | 35 | class GELU(nn.Module): 36 | def __init__(self): 37 | super(GELU, self).__init__() 38 | 39 | def forward(self, x): 40 | return 0.5 * x * (1 + torch.tanh(np.sqrt(2 / np.pi) * ( x + 0.044715 * torch.pow(x, 3)))) 41 | -------------------------------------------------------------------------------- /fuxictr/pytorch/layers/attentions/__init__.py: -------------------------------------------------------------------------------- 1 | from .dot_product_attention import * 2 | from .squeeze_excitation import * 3 | from .target_attention import * 4 | 5 | -------------------------------------------------------------------------------- /fuxictr/pytorch/layers/attentions/dot_product_attention.py: -------------------------------------------------------------------------------- 1 | # 
========================================================================= 2 | # Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. 3 | # Copyright (C) 2018. pengshuang@Github for ScaledDotProductAttention. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ========================================================================= 17 | 18 | 19 | import torch 20 | from torch import nn 21 | 22 | 23 | class ScaledDotProductAttention(nn.Module): 24 | """ Scaled Dot-Product Attention 25 | Ref: https://zhuanlan.zhihu.com/p/47812375 26 | """ 27 | def __init__(self, dropout_rate=0.): 28 | super(ScaledDotProductAttention, self).__init__() 29 | self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else None 30 | 31 | def forward(self, Q, K, V, scale=None, mask=None): 32 | # mask: 0 for masked positions 33 | scores = torch.matmul(Q, K.transpose(-1, -2)) 34 | if scale: 35 | scores = scores / scale 36 | if mask is not None: 37 | mask = mask.view_as(scores) 38 | scores = scores.masked_fill_(mask.float() == 0, -1.e9) # fill -inf if mask=0 39 | attention = scores.softmax(dim=-1) 40 | if self.dropout is not None: 41 | attention = self.dropout(attention) 42 | output = torch.matmul(attention, V) 43 | return output, attention 44 | 45 | -------------------------------------------------------------------------------- /fuxictr/pytorch/layers/attentions/squeeze_excitation.py: 
-------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | 17 | 18 | import torch 19 | from torch import nn 20 | 21 | 22 | class SqueezeExcitation(nn.Module): 23 | def __init__(self, num_fields, reduction_ratio=3, excitation_activation="ReLU"): 24 | super(SqueezeExcitation, self).__init__() 25 | reduced_size = max(1, int(num_fields / reduction_ratio)) 26 | excitation = [nn.Linear(num_fields, reduced_size, bias=False), 27 | nn.ReLU(), 28 | nn.Linear(reduced_size, num_fields, bias=False)] 29 | if excitation_activation.lower() == "relu": 30 | excitation.append(nn.ReLU()) 31 | elif excitation_activation.lower() == "sigmoid": 32 | excitation.append(nn.Sigmoid()) 33 | else: 34 | raise NotImplementedError 35 | self.excitation = nn.Sequential(*excitation) 36 | 37 | def forward(self, feature_emb): 38 | Z = torch.mean(feature_emb, dim=-1, out=None) 39 | A = self.excitation(Z) 40 | V = feature_emb * A.unsqueeze(-1) 41 | return V 42 | -------------------------------------------------------------------------------- /fuxictr/pytorch/layers/attentions/target_attention.py: 
# =========================================================================
# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========================================================================


import torch
from torch import nn
from .dot_product_attention import ScaledDotProductAttention
from ..activations import Dice
from ..blocks.mlp_block import MLP_Block


class DIN_Attention(nn.Module):
    """DIN-style target attention: an MLP scores each history item against the target."""

    def __init__(self,
                 embedding_dim=64,
                 attention_units=[32],
                 hidden_activations="ReLU",
                 output_activation=None,
                 dropout_rate=0,
                 batch_norm=False,
                 use_softmax=False):
        super(DIN_Attention, self).__init__()
        self.embedding_dim = embedding_dim
        self.use_softmax = use_softmax
        if isinstance(hidden_activations, str) and hidden_activations.lower() == "dice":
            # Dice needs the layer width, so build one instance per hidden layer
            hidden_activations = [Dice(units) for units in attention_units]
        self.attention_layer = MLP_Block(input_dim=4 * embedding_dim,
                                         output_dim=1,
                                         hidden_units=attention_units,
                                         hidden_activations=hidden_activations,
                                         output_activation=output_activation,
                                         dropout_rates=dropout_rate,
                                         batch_norm=batch_norm)

    def forward(self, target_item, history_sequence, mask=None):
        """
        target_item: b x emd
        history_sequence: b x len x emb
        mask: mask of history_sequence, 0 for masked positions
        """
        seq_len = history_sequence.size(1)
        target_item = target_item.unsqueeze(1).expand(-1, seq_len, -1)
        # DIN feeds [q, k, q - k, q * k] to the attention MLP
        attention_input = torch.cat([target_item, history_sequence, target_item - history_sequence,
                                     target_item * history_sequence], dim=-1)  # b x len x 4*emb
        attention_weight = self.attention_layer(attention_input.view(-1, 4 * self.embedding_dim))
        attention_weight = attention_weight.view(-1, seq_len)  # b x len
        if mask is not None:
            # zero weights at masked positions (applied before optional softmax)
            attention_weight = attention_weight * mask.float()
        if self.use_softmax:
            if mask is not None:
                # large negative bias so masked positions receive ~0 probability
                attention_weight += -1.e9 * (1 - mask.float())
            attention_weight = attention_weight.softmax(dim=-1)
        # weighted sum over the history sequence
        output = (attention_weight.unsqueeze(-1) * history_sequence).sum(dim=1)
        return output


class MultiHeadTargetAttention(nn.Module):
    """Multi-head scaled dot-product attention with the target item as the single query."""

    def __init__(self,
                 input_dim=64,
                 attention_dim=64,
                 num_heads=1,
                 dropout_rate=0,
                 use_scale=True,
                 use_qkvo=True):
        super(MultiHeadTargetAttention, self).__init__()
        if not use_qkvo:
            # no projections: heads split the raw input dimension directly
            attention_dim = input_dim
        assert attention_dim % num_heads == 0, \
            "attention_dim={} is not divisible by num_heads={}".format(attention_dim, num_heads)
        self.num_heads = num_heads
        self.head_dim = attention_dim // num_heads
        self.scale = self.head_dim ** 0.5 if use_scale else None
        self.use_qkvo = use_qkvo
        if use_qkvo:
            self.W_q = nn.Linear(input_dim, attention_dim, bias=False)
            self.W_k = nn.Linear(input_dim, attention_dim, bias=False)
            self.W_v = nn.Linear(input_dim, attention_dim, bias=False)
            self.W_o = nn.Linear(attention_dim, input_dim, bias=False)
        self.dot_attention = ScaledDotProductAttention(dropout_rate)

    def forward(self, target_item, history_sequence, mask=None):
        """
        target_item: b x emd
        history_sequence: b x len x emb
        mask: mask of history_sequence, 0 for masked positions
        """
        # linear projection
        if self.use_qkvo:
            query = self.W_q(target_item)
            key = self.W_k(history_sequence)
            value = self.W_v(history_sequence)
        else:
            query, key, value = target_item, history_sequence, history_sequence

        # split by heads: query has seq length 1 (the single target item)
        batch_size = query.size(0)
        query = query.view(batch_size, 1, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        if mask is not None:
            # broadcast the key-side mask over all heads
            mask = mask.view(batch_size, 1, 1, -1).expand(-1, self.num_heads, -1, -1)

        # scaled dot product attention
        output, _ = self.dot_attention(query, key, value, scale=self.scale, mask=mask)
        # concat heads (query length is 1, so flatten to b x (heads*head_dim))
        output = output.transpose(1, 2).contiguous().view(-1, self.num_heads * self.head_dim)
        if self.use_qkvo:
            output = self.W_o(output)
        return output
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | 17 | 18 | import torch 19 | from torch import nn 20 | from .logistic_regression import LogisticRegression 21 | from ..interactions import InnerProductInteraction 22 | 23 | 24 | class FactorizationMachine(nn.Module): 25 | def __init__(self, feature_map): 26 | super(FactorizationMachine, self).__init__() 27 | self.fm_layer = InnerProductInteraction(feature_map.num_fields, output="product_sum") 28 | self.lr_layer = LogisticRegression(feature_map, use_bias=True) 29 | 30 | def forward(self, X, feature_emb): 31 | lr_out = self.lr_layer(X) 32 | fm_out = self.fm_layer(feature_emb) 33 | output = fm_out + lr_out 34 | return output 35 | 36 | -------------------------------------------------------------------------------- /fuxictr/pytorch/layers/blocks/logistic_regression.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ========================================================================= 16 | 17 | 18 | import torch 19 | from torch import nn 20 | from fuxictr.pytorch.layers import FeatureEmbedding 21 | 22 | 23 | class LogisticRegression(nn.Module): 24 | def __init__(self, feature_map, use_bias=True): 25 | super(LogisticRegression, self).__init__() 26 | self.bias = nn.Parameter(torch.zeros(1), requires_grad=True) if use_bias else None 27 | # A trick for quick one-hot encoding in LR 28 | self.embedding_layer = FeatureEmbedding(feature_map, 1, use_pretrain=False, use_sharing=False) 29 | 30 | def forward(self, X): 31 | embed_weights = self.embedding_layer(X) 32 | output = embed_weights.sum(dim=1) 33 | if self.bias is not None: 34 | output += self.bias 35 | return output 36 | 37 | -------------------------------------------------------------------------------- /fuxictr/pytorch/layers/blocks/mlp_block.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
# =========================================================================
# Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved.
# Licensed under the Apache License, Version 2.0.
# =========================================================================


import numpy as np
from torch import nn
from fuxictr.pytorch.torch_utils import get_activation


class MLP_Block(nn.Module):
    """Configurable stack of Linear -> [BatchNorm] -> activation -> [Dropout] layers."""

    def __init__(self,
                 input_dim,
                 hidden_units=[],
                 hidden_activations="ReLU",
                 output_dim=None,
                 output_activation=None,
                 dropout_rates=0.0,
                 batch_norm=False,
                 bn_only_once=False,  # Set True for inference speed up
                 use_bias=True):
        super(MLP_Block, self).__init__()
        num_hidden = len(hidden_units)
        # Broadcast scalar settings to one entry per hidden layer.
        if not isinstance(dropout_rates, list):
            dropout_rates = [dropout_rates] * num_hidden
        if not isinstance(hidden_activations, list):
            hidden_activations = [hidden_activations] * num_hidden
        hidden_activations = get_activation(hidden_activations, hidden_units)
        dims = [input_dim] + hidden_units
        layers = []
        if batch_norm and bn_only_once:
            # single BatchNorm on the raw input instead of one per layer
            layers.append(nn.BatchNorm1d(input_dim))
        for idx, (fan_in, fan_out) in enumerate(zip(dims[:-1], dims[1:])):
            layers.append(nn.Linear(fan_in, fan_out, bias=use_bias))
            if batch_norm and not bn_only_once:
                layers.append(nn.BatchNorm1d(fan_out))
            if hidden_activations[idx]:
                layers.append(hidden_activations[idx])
            if dropout_rates[idx] > 0:
                layers.append(nn.Dropout(p=dropout_rates[idx]))
        if output_dim is not None:
            layers.append(nn.Linear(dims[-1], output_dim, bias=use_bias))
        if output_activation is not None:
            layers.append(get_activation(output_activation))
        self.mlp = nn.Sequential(*layers)  # * used to unpack the layer list

    def forward(self, inputs):
        return self.mlp(inputs)
# =========================================================================
# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========================================================================


import torch
from torch import nn
import h5py
import os
import io
import json
import numpy as np
import logging


class PretrainedEmbedding(nn.Module):
    def __init__(self,
                 feature_name,
                 feature_spec,
                 pretrain_path,
                 vocab_path,
                 embedding_dim,
                 pretrain_dim,
                 pretrain_usage="init"):
        """
        Fusion pretrained embedding with ID embedding
        :param: fusion_type: init/sum/concat
        """
        super().__init__()
        assert pretrain_usage in ["init", "sum", "concat"]
        self.pretrain_usage = pretrain_usage
        padding_idx = feature_spec.get("padding_idx", None)
        self.oov_idx = feature_spec["oov_idx"]
        self.freeze_emb = feature_spec["freeze_emb"]
        self.pretrain_embedding = self.load_pretrained_embedding(feature_spec["vocab_size"],
                                                                 pretrain_dim,
                                                                 pretrain_path,
                                                                 vocab_path,
                                                                 feature_name,
                                                                 freeze=self.freeze_emb,
                                                                 padding_idx=padding_idx)
        if pretrain_usage != "init":
            # "sum"/"concat" fuse the pretrained vectors with a trainable ID embedding
            self.id_embedding = nn.Embedding(feature_spec["vocab_size"],
                                             embedding_dim,
                                             padding_idx=padding_idx)
        self.proj = None
        # Projection reconciles pretrain_dim with embedding_dim when they differ
        # ("init"/"sum") or reduces the concatenation back to embedding_dim.
        if pretrain_usage in ["init", "sum"] and embedding_dim != pretrain_dim:
            self.proj = nn.Linear(pretrain_dim, embedding_dim)
        if pretrain_usage == "concat":
            self.proj = nn.Linear(pretrain_dim + embedding_dim, embedding_dim)

    def reset_parameters(self, embedding_initializer):
        # Only the trainable ID embedding is re-initialized; the pretrained
        # table is left as loaded.
        if self.pretrain_usage in ["sum", "concat"]:
            nn.init.zeros_(self.id_embedding.weight)  # set oov token embeddings to zeros
            embedding_initializer(self.id_embedding.weight[1:self.oov_idx, :])

    def get_pretrained_embedding(self, pretrain_path):
        """Read (keys, vectors) from an .h5 file with datasets "key" and "value"."""
        with h5py.File(pretrain_path, 'r') as hf:
            keys = hf["key"][:]
            embeddings = hf["value"][:]
        logging.info("Loading pretrained_emb: {}".format(pretrain_path))
        return keys, embeddings

    def load_feature_vocab(self, vocab_path, feature_name):
        # Vocab file maps feature_name -> {token: index}.
        with io.open(vocab_path, "r", encoding="utf-8") as fd:
            vocab = json.load(fd)
        return vocab[feature_name]

    def load_pretrained_embedding(self, vocab_size, pretrain_dim, pretrain_path, vocab_path,
                                  feature_name, freeze=False, padding_idx=None):
        """Build an nn.Embedding whose rows are filled from the pretrained vectors.

        Tokens absent from the pretrained file keep zeros (frozen) or a small
        random init (trainable).
        """
        embedding_layer = nn.Embedding(vocab_size,
                                       pretrain_dim,
                                       padding_idx=padding_idx)
        if freeze:
            embedding_matrix = np.zeros((vocab_size, pretrain_dim))
        else:
            embedding_matrix = np.random.normal(loc=0, scale=1.e-4, size=(vocab_size, pretrain_dim))
            # NOTE(review): "if padding_idx:" skips the zeroing when
            # padding_idx == 0 — likely intended "is not None"; confirm.
            if padding_idx:
                embedding_matrix[padding_idx, :] = np.zeros(pretrain_dim)  # set as zero for PAD
        keys, embeddings = self.get_pretrained_embedding(pretrain_path)
        assert embeddings.shape[-1] == pretrain_dim, f"pretrain_dim={pretrain_dim} not correct."
        vocab = self.load_feature_vocab(vocab_path, feature_name)
        for idx, word in enumerate(keys):
            if word in vocab:
                embedding_matrix[vocab[word]] = embeddings[idx]
        embedding_layer.weight = torch.nn.Parameter(torch.from_numpy(embedding_matrix).float())
        if freeze:
            embedding_layer.weight.requires_grad = False
        return embedding_layer

    def forward(self, inputs):
        # mask = 1 for ids up to (and including) oov_idx, 0 beyond it
        mask = (inputs <= self.oov_idx).float()
        pretrain_emb = self.pretrain_embedding(inputs)
        if not self.freeze_emb:
            # zero out trainable pretrained vectors for ids beyond oov
            pretrain_emb = pretrain_emb * mask.unsqueeze(-1)
        if self.pretrain_usage == "init":
            if self.proj is not None:
                feature_emb = self.proj(pretrain_emb)
            else:
                feature_emb = pretrain_emb
        else:
            id_emb = self.id_embedding(inputs)
            id_emb = id_emb * mask.unsqueeze(-1)
            if self.pretrain_usage == "sum":
                if self.proj is not None:
                    feature_emb = self.proj(pretrain_emb) + id_emb
                else:
                    feature_emb = pretrain_emb + id_emb
            elif self.pretrain_usage == "concat":
                feature_emb = torch.cat([pretrain_emb, id_emb], dim=-1)
                feature_emb = self.proj(feature_emb)
        return feature_emb
15 | # ========================================================================= 16 | 17 | import torch 18 | from torch import nn 19 | from itertools import combinations 20 | 21 | 22 | class BilinearInteraction(nn.Module): 23 | def __init__(self, num_fields, embedding_dim, bilinear_type="field_interaction"): 24 | super(BilinearInteraction, self).__init__() 25 | self.bilinear_type = bilinear_type 26 | self.interact_dim = int(num_fields * (num_fields - 1) / 2) 27 | if self.bilinear_type == "field_all": 28 | self.bilinear_W = nn.Parameter(torch.Tensor(embedding_dim, embedding_dim)) 29 | elif self.bilinear_type == "field_each": 30 | self.bilinear_W = nn.Parameter(torch.Tensor(num_fields, embedding_dim, embedding_dim)) 31 | elif self.bilinear_type == "field_interaction": 32 | self.bilinear_W = nn.Parameter(torch.Tensor(self.interact_dim, embedding_dim, embedding_dim)) 33 | else: 34 | raise NotImplementedError 35 | self.reset_parameters() 36 | 37 | def reset_parameters(self): 38 | nn.init.xavier_normal_(self.bilinear_W) 39 | 40 | def forward(self, feature_emb): 41 | feature_emb_list = torch.split(feature_emb, 1, dim=1) 42 | if self.bilinear_type == "field_all": 43 | bilinear_list = [torch.matmul(v_i, self.bilinear_W) * v_j 44 | for v_i, v_j in combinations(feature_emb_list, 2)] 45 | elif self.bilinear_type == "field_each": 46 | bilinear_W_list = torch.split(self.bilinear_W, 1, dim=0) 47 | bilinear_list = [torch.matmul(feature_emb_list[i], bilinear_W_list[i]) * feature_emb_list[j] 48 | for i, j in combinations(range(len(feature_emb_list)), 2)] 49 | elif self.bilinear_type == "field_interaction": 50 | bilinear_W_list = torch.split(self.bilinear_W, 1, dim=0) 51 | bilinear_list = [torch.matmul(v[0], bilinear_W_list[i]) * v[1] 52 | for i, v in enumerate(combinations(feature_emb_list, 2))] 53 | return torch.cat(bilinear_list, dim=1) 54 | 55 | 56 | class BilinearInteractionV2(nn.Module): 57 | def __init__(self, num_fields, embedding_dim, bilinear_type="field_interaction"): 58 | 
class BilinearInteractionV2(nn.Module):
    """Vectorized bilinear interaction (FiBiNET), equivalent in output to
    BilinearInteraction but computed without per-pair Python loops.

    Input:  feature_emb of shape (batch, num_fields, embedding_dim)
    Output: (batch, num_fields*(num_fields-1)/2, embedding_dim)
    """

    def __init__(self, num_fields, embedding_dim, bilinear_type="field_interaction"):
        super(BilinearInteractionV2, self).__init__()
        self.bilinear_type = bilinear_type
        self.num_fields = num_fields
        self.embedding_dim = embedding_dim
        self.interact_dim = num_fields * (num_fields - 1) // 2
        if bilinear_type == "field_all":
            weight_shape = (embedding_dim, embedding_dim)
        elif bilinear_type == "field_each":
            weight_shape = (num_fields, embedding_dim, embedding_dim)
        elif bilinear_type == "field_interaction":
            weight_shape = (self.interact_dim, embedding_dim, embedding_dim)
        else:
            raise NotImplementedError
        self.bilinear_W = nn.Parameter(torch.Tensor(*weight_shape))
        # upper-triangular (i, j) field-pair indices; stored as a frozen
        # Parameter so it follows the module across devices
        self.triu_index = nn.Parameter(torch.triu_indices(num_fields, num_fields, offset=1),
                                       requires_grad=False)
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_normal_(self.bilinear_W)

    def forward(self, feature_emb):
        row_idx, col_idx = self.triu_index[0], self.triu_index[1]
        if self.bilinear_type == "field_interaction":
            left = torch.index_select(feature_emb, 1, row_idx)
            right = torch.index_select(feature_emb, 1, col_idx)
            # per-pair matrices broadcast over the batch dimension
            return torch.matmul(left.unsqueeze(2), self.bilinear_W).squeeze(2) * right
        if self.bilinear_type == "field_all":
            projected = torch.matmul(feature_emb, self.bilinear_W)
        else:  # "field_each"
            projected = torch.matmul(feature_emb.unsqueeze(2), self.bilinear_W).squeeze(2)
        left = torch.index_select(projected, 1, row_idx)
        right = torch.index_select(feature_emb, 1, col_idx)
        return left * right
class CompressedInteractionNet(nn.Module):
    """Compressed Interaction Network (CIN) from xDeepFM.

    Each layer forms the outer Hadamard interaction between the raw field
    embeddings X_0 and the previous layer's feature maps, then compresses the
    channel dimension with a 1x1 Conv1d. Sum pooling over the embedding axis
    of every layer is concatenated and projected to `output_dim` logits.
    """

    def __init__(self, num_fields, cin_hidden_units, output_dim=1):
        super(CompressedInteractionNet, self).__init__()
        self.cin_hidden_units = cin_hidden_units
        self.fc = nn.Linear(sum(cin_hidden_units), output_dim)
        self.cin_layer = nn.ModuleDict()
        prev_maps = num_fields
        for idx, num_filters in enumerate(cin_hidden_units):
            # the interaction tensor at this depth has num_fields * prev_maps
            # channels; the 1x1 convolution compresses it to num_filters maps
            self.cin_layer["layer_" + str(idx + 1)] = nn.Conv1d(num_fields * prev_maps,
                                                                num_filters,
                                                                kernel_size=1)
            prev_maps = num_filters

    def forward(self, feature_emb):
        batch_size = feature_emb.shape[0]
        embedding_dim = feature_emb.shape[-1]
        X_0 = feature_emb
        X_i = X_0
        pooled = []
        for idx in range(len(self.cin_hidden_units)):
            # all (field h, map m) elementwise products: (b, h, m, d)
            interaction = torch.einsum("bhd,bmd->bhmd", X_0, X_i)
            interaction = interaction.view(batch_size, -1, embedding_dim)
            X_i = self.cin_layer["layer_" + str(idx + 1)](interaction)
            X_i = X_i.view(batch_size, -1, embedding_dim)
            pooled.append(X_i.sum(dim=-1))
        return self.fc(torch.cat(pooled, dim=-1))
class CrossInteraction(nn.Module):
    """One DCN cross step: (w^T x_i) * x_0 + b."""

    def __init__(self, input_dim):
        super(CrossInteraction, self).__init__()
        self.weight = nn.Linear(input_dim, 1, bias=False)
        self.bias = nn.Parameter(torch.zeros(input_dim))

    def forward(self, X_0, X_i):
        # scalar projection of x_i scales the original input x_0
        return self.bias + X_0 * self.weight(X_i)


class CrossNet(nn.Module):
    """Cross network (DCN): a stack of CrossInteraction steps, each added
    back to the running state as a residual."""

    def __init__(self, input_dim, num_layers):
        super(CrossNet, self).__init__()
        self.num_layers = num_layers
        self.cross_net = nn.ModuleList(CrossInteraction(input_dim)
                                       for _ in range(num_layers))

    def forward(self, X_0):
        X_i = X_0  # b x dim
        for layer in self.cross_net:
            X_i = X_i + layer(X_0, X_i)
        return X_i


class CrossNetV2(nn.Module):
    """Cross network v2 (DCN-V2): full-matrix cross layers with residual
    connections, X_{i+1} = X_i + X_0 * W X_i."""

    def __init__(self, input_dim, num_layers):
        super(CrossNetV2, self).__init__()
        self.num_layers = num_layers
        self.cross_layers = nn.ModuleList(nn.Linear(input_dim, input_dim)
                                          for _ in range(num_layers))

    def forward(self, X_0):
        X_i = X_0  # b x dim
        for layer in self.cross_layers:
            X_i = X_i + X_0 * layer(X_i)
        return X_i
class CrossNetMix(nn.Module):
    """CrossNetMix (DCN-V2 with a mixture of low-rank experts).

    Improves CrossNetV2 by:
      1. a mixture of experts (MoE) that learns feature interactions in
         different low-rank subspaces;
      2. nonlinear (tanh) transformations inside the low-dimensional space.

    Input:  (batch_size, in_features)
    Output: (batch_size, in_features)
    """

    def __init__(self, in_features, layer_num=2, low_rank=32, num_experts=4):
        super(CrossNetMix, self).__init__()
        self.layer_num = layer_num
        self.num_experts = num_experts

        # U: per layer (num_experts, in_features, low_rank) — projects the
        # low-rank representation back to R^d
        self.U_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
            torch.empty(num_experts, in_features, low_rank))) for i in range(self.layer_num)])
        # V: per layer (num_experts, in_features, low_rank) — projects the
        # input down to R^r
        self.V_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
            torch.empty(num_experts, in_features, low_rank))) for i in range(self.layer_num)])
        # C: per layer (num_experts, low_rank, low_rank) — mixes within the
        # low-rank space
        self.C_list = torch.nn.ParameterList([nn.Parameter(nn.init.xavier_normal_(
            torch.empty(num_experts, low_rank, low_rank))) for i in range(self.layer_num)])
        # one scalar gate per expert, shared across layers
        self.gating = nn.ModuleList([nn.Linear(in_features, 1, bias=False) for i in range(self.num_experts)])

        self.bias = torch.nn.ParameterList([nn.Parameter(nn.init.zeros_(
            torch.empty(in_features, 1))) for i in range(self.layer_num)])

    def forward(self, inputs):
        x_0 = inputs.unsqueeze(2)  # (bs, in_features, 1)
        x_l = x_0
        for i in range(self.layer_num):
            output_of_experts = []
            gating_score_of_experts = []
            for expert_id in range(self.num_experts):
                # (1) G(x_l): gating score from the current state
                gating_score_of_experts.append(self.gating[expert_id](x_l.squeeze(2)))

                # (2) E(x_l): project to the low-rank space
                v_x = torch.matmul(self.V_list[i][expert_id].t(), x_l)  # (bs, low_rank, 1)

                # nonlinear activation in low rank space
                v_x = torch.tanh(v_x)
                v_x = torch.matmul(self.C_list[i][expert_id], v_x)
                v_x = torch.tanh(v_x)

                # project back to R^d
                uv_x = torch.matmul(self.U_list[i][expert_id], v_x)  # (bs, in_features, 1)

                dot_ = uv_x + self.bias[i]
                dot_ = x_0 * dot_  # Hadamard-product with the original input

                output_of_experts.append(dot_.squeeze(2))

            # (3) mixture of low-rank experts
            output_of_experts = torch.stack(output_of_experts, 2)  # (bs, in_features, num_experts)
            gating_score_of_experts = torch.stack(gating_score_of_experts, 1)  # (bs, num_experts, 1)
            moe_out = torch.matmul(output_of_experts, gating_score_of_experts.softmax(1))
            x_l = moe_out + x_l  # (bs, in_features, 1)

        # BUGFIX: squeeze only the trailing singleton dimension; the previous
        # bare .squeeze() also dropped the batch dimension when batch_size == 1,
        # returning shape (in_features,) instead of (1, in_features).
        x_l = x_l.squeeze(-1)  # (bs, in_features)
        return x_l
class HolographicInteraction(nn.Module):
    """Holographic pairwise feature interaction.

    Supported interaction types: "hadamard_product", "circular_convolution"
    (computed via FFT), and "circular_correlation" (FFT with the left
    operand's spectrum conjugated).

    Input:  (batch, num_fields, embedding_dim)
    Output: (batch, num_fields*(num_fields-1)/2, embedding_dim)
    """

    def __init__(self, num_fields, interaction_type="circular_convolution"):
        super(HolographicInteraction, self).__init__()
        self.interaction_type = interaction_type
        if self.interaction_type == "circular_correlation":
            # multiplying the imaginary component by -1 conjugates the spectrum
            self.conj_sign = nn.Parameter(torch.tensor([1., -1.]), requires_grad=False)
        # upper-triangular (i, j) field-pair indices, frozen so they move with
        # the module across devices
        self.triu_index = nn.Parameter(torch.triu_indices(num_fields, num_fields, offset=1),
                                       requires_grad=False)

    @staticmethod
    def _spectral_product(fft1, fft2):
        # (a + bi)(c + di) = (ac - bd) + (ad + bc)i on view_as_real tensors,
        # followed by the inverse FFT; the real part is the result
        real = fft1[..., 0] * fft2[..., 0] - fft1[..., 1] * fft2[..., 1]
        imag = fft1[..., 0] * fft2[..., 1] + fft1[..., 1] * fft2[..., 0]
        product = torch.stack([real, imag], dim=-1)
        return torch.view_as_real(torch.fft.ifft(torch.view_as_complex(product)))[..., 0]

    def forward(self, feature_emb):
        emb1 = torch.index_select(feature_emb, 1, self.triu_index[0])
        emb2 = torch.index_select(feature_emb, 1, self.triu_index[1])
        if self.interaction_type == "hadamard_product":
            return emb1 * emb2
        if self.interaction_type == "circular_convolution":
            return self._spectral_product(torch.view_as_real(torch.fft.fft(emb1)),
                                          torch.view_as_real(torch.fft.fft(emb2)))
        if self.interaction_type == "circular_correlation":
            fft1_emb = torch.view_as_real(torch.fft.fft(emb1))
            fft1 = fft1_emb * self.conj_sign.expand_as(fft1_emb)
            return self._spectral_product(fft1, torch.view_as_real(torch.fft.fft(emb2)))
        raise ValueError("interaction_type={} not supported.".format(self.interaction_type))
class InnerProductInteraction(nn.Module):
    """Inner-product style interactions over field embeddings.

    `output` selects the result:
        product_sum:         (bs, 1) — sum of all pairwise inner products
        bi_interaction:      (bs, dim) — FM bi-interaction vector
        inner_product:       (bs, f*(f-1)/2) — pairwise inner products
        elementwise_product: (bs, f*(f-1)/2, dim) — pairwise Hadamard products
    """

    def __init__(self, num_fields, output="product_sum"):
        super(InnerProductInteraction, self).__init__()
        self._output_type = output
        if output not in ["product_sum", "bi_interaction", "inner_product", "elementwise_product"]:
            raise ValueError("InnerProductInteraction output={} is not supported.".format(output))
        if output == "inner_product":
            self.interaction_units = int(num_fields * (num_fields - 1) / 2)
            # boolean mask selecting the strict upper triangle of the Gram matrix
            self.triu_mask = nn.Parameter(torch.triu(torch.ones(num_fields, num_fields), 1).bool(),
                                          requires_grad=False)
        elif output == "elementwise_product":
            self.triu_index = nn.Parameter(torch.triu_indices(num_fields, num_fields, offset=1),
                                           requires_grad=False)

    def forward(self, feature_emb):
        output_type = self._output_type
        if output_type in ("product_sum", "bi_interaction"):
            # FM identity: 0.5 * ((sum_i v_i)^2 - sum_i v_i^2)
            sum_then_square = torch.sum(feature_emb, dim=1) ** 2
            square_then_sum = torch.sum(feature_emb ** 2, dim=1)
            bi_interaction = (sum_then_square - square_then_sum) * 0.5
            if output_type == "bi_interaction":
                return bi_interaction
            return bi_interaction.sum(dim=-1, keepdim=True)
        if output_type == "inner_product":
            gram = torch.bmm(feature_emb, feature_emb.transpose(1, 2))
            upper = torch.masked_select(gram, self.triu_mask)
            return upper.view(-1, self.interaction_units)
        # elementwise_product
        emb1 = torch.index_select(feature_emb, 1, self.triu_index[0])
        emb2 = torch.index_select(feature_emb, 1, self.triu_index[1])
        return emb1 * emb2
class InteractionMachine(nn.Module):
    """Interaction Machine (IM): closed-form order-k feature interactions
    derived from power sums of the field embeddings (Newton's identities).
    Supports order 1..5, with an optional BatchNorm before the final linear
    projection to a single logit.
    """

    def __init__(self, embedding_dim, order=2, batch_norm=False):
        super(InteractionMachine, self).__init__()
        assert order < 6, "order={} is not supported.".format(order)
        self.order = order
        self.bn = nn.BatchNorm1d(embedding_dim * order) if batch_norm else None
        self.fc = nn.Linear(order * embedding_dim, 1)

    def second_order(self, p1, p2):
        return (p1.pow(2) - p2) / 2

    def third_order(self, p1, p2, p3):
        return (p1.pow(3) - 3 * p1 * p2 + 2 * p3) / 6

    def fourth_order(self, p1, p2, p3, p4):
        return (p1.pow(4) - 6 * p1.pow(2) * p2 + 3 * p2.pow(2)
                + 8 * p1 * p3 - 6 * p4) / 24

    def fifth_order(self, p1, p2, p3, p4, p5):
        return (p1.pow(5) - 10 * p1.pow(3) * p2 + 20 * p1.pow(2) * p3 - 30 * p1 * p4
                - 20 * p2 * p3 + 15 * p1 * p2.pow(2) + 24 * p5) / 120

    def forward(self, X):
        features = []
        power = X  # running elementwise power X^k
        if self.order >= 1:
            p1 = power.sum(dim=1)
            features.append(p1)
        if self.order >= 2:
            power = power * X
            p2 = power.sum(dim=1)
            features.append(self.second_order(p1, p2))
        if self.order >= 3:
            power = power * X
            p3 = power.sum(dim=1)
            features.append(self.third_order(p1, p2, p3))
        if self.order >= 4:
            power = power * X
            p4 = power.sum(dim=1)
            features.append(self.fourth_order(p1, p2, p3, p4))
        if self.order == 5:
            power = power * X
            p5 = power.sum(dim=1)
            features.append(self.fifth_order(p1, p2, p3, p4, p5))
        concatenated = torch.cat(features, dim=-1)
        if self.bn is not None:
            concatenated = self.bn(concatenated)
        return self.fc(concatenated)
class MaskedAveragePooling(nn.Module):
    """Average-pool a sequence of embeddings while ignoring padded positions.

    When `mask` is not supplied, positions whose embedding sums to zero are
    treated as padding tokens.
    """

    def __init__(self):
        super(MaskedAveragePooling, self).__init__()

    def forward(self, embedding_matrix, mask=None):
        total = torch.sum(embedding_matrix, dim=1)
        if mask is None:
            # all-zero vectors are assumed to be padding
            mask = embedding_matrix.sum(dim=-1) != 0
        valid_count = mask.float().sum(-1, keepdim=True)
        # epsilon guards against division by zero for fully padded sequences
        return total / (valid_count + 1e-12)


class MaskedSumPooling(nn.Module):
    """Sum-pool a sequence of embeddings (zero padding contributes nothing)."""

    def __init__(self):
        super(MaskedSumPooling, self).__init__()

    def forward(self, embedding_matrix):
        # mask by zeros
        return torch.sum(embedding_matrix, dim=1)


class KMaxPooling(nn.Module):
    """Keep the k largest values along `dim`, preserving their original order."""

    def __init__(self, k, dim):
        super(KMaxPooling, self).__init__()
        self.k = k
        self.dim = dim

    def forward(self, X):
        topk_idx = X.topk(self.k, dim=self.dim)[1]
        # re-sort indices so selected values keep their input order
        ordered_idx = topk_idx.sort(dim=self.dim)[0]
        return X.gather(self.dim, ordered_idx)
-------------------------------------------------------------------------------- 1 | from .rank_model import BaseModel 2 | from .multitask_model import MultiTaskModel -------------------------------------------------------------------------------- /fuxictr/pytorch/models/multitask_model.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2022. FuxiCTR Authors. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class MultiTaskModel(BaseModel):
    """Base class for multi-task models.

    Extends BaseModel with one output head per task: a per-task output
    activation, a per-task loss function, and per-task evaluation/prediction
    results keyed by the label names in ``feature_map.labels``.
    """

    def __init__(self, 
                 feature_map, 
                 model_id="MultiTaskModel", 
                 task=["binary_classification"],  # NOTE(review): mutable default argument; shared across instantiations if ever mutated
                 num_tasks=1,
                 loss_weight='EQ',
                 gpu=-1, 
                 monitor="AUC", 
                 save_best_only=True, 
                 monitor_mode="max", 
                 early_stop_patience=2, 
                 eval_steps=None, 
                 embedding_regularizer=None, 
                 net_regularizer=None, 
                 reduce_lr_on_plateau=True, 
                 **kwargs):
        # The parent is always initialized with a single "binary_classification"
        # task; per-task behavior is realized via output_activation/loss_fn below.
        super(MultiTaskModel, self).__init__(feature_map=feature_map, 
                                             model_id=model_id, 
                                             task="binary_classification", 
                                             gpu=gpu, 
                                             loss_weight=loss_weight,
                                             monitor=monitor, 
                                             save_best_only=save_best_only, 
                                             monitor_mode=monitor_mode, 
                                             early_stop_patience=early_stop_patience, 
                                             eval_steps=eval_steps, 
                                             embedding_regularizer=embedding_regularizer, 
                                             net_regularizer=net_regularizer, 
                                             reduce_lr_on_plateau=reduce_lr_on_plateau, 
                                             **kwargs)
        self.device = get_device(gpu)
        self.num_tasks = num_tasks
        self.loss_weight = loss_weight
        # One output activation per task: either one per entry of a task list,
        # or the same task type replicated num_tasks times.
        if isinstance(task, list):
            assert len(task) == num_tasks, "the number of tasks must equal the length of \"task\""
            self.output_activation = nn.ModuleList([self.get_output_activation(str(t)) for t in task])
        else:
            self.output_activation = nn.ModuleList([self.get_output_activation(task) for _ in range(num_tasks)])

    def compile(self, optimizer, loss, lr):
        """Bind the optimizer and one loss function per task."""
        self.optimizer = get_optimizer(optimizer, self.parameters(), lr)
        if isinstance(loss, list):
            self.loss_fn = [get_loss(l) for l in loss]
        else:
            # a single loss spec is replicated for every task
            self.loss_fn = [get_loss(loss) for _ in range(self.num_tasks)]

    def get_labels(self, inputs):
        """Extract per-task label columns from a batch, as (bs, 1) float tensors."""
        labels = self.feature_map.labels
        y = [inputs[:, self.feature_map.get_column_index(labels[i])].to(self.device).float().view(-1, 1)
             for i in range(len(labels))]
        return y

    def compute_loss(self, return_dict, y_true):
        """Sum per-task losses (read from "<label>_pred" keys) plus regularization.

        NOTE(review): when loss_weight != 'EQ' the per-task losses stay a list
        and the "+=" below would not produce a scalar — presumably subclasses
        implementing other weighting schemes override this; confirm.
        """
        labels = self.feature_map.labels
        loss = [self.loss_fn[i](return_dict["{}_pred".format(labels[i])], y_true[i], reduction='mean')
                for i in range(len(labels))]
        if self.loss_weight == 'EQ':
            # Default: All losses are weighted equally
            loss = torch.sum(torch.stack(loss))
        loss += self.regularization_loss()
        return loss

    def evaluate(self, data_generator, metrics=None):
        """Run inference over `data_generator` and compute metrics per task.

        Returns a dict with "<label>_<metric>" entries for every task plus the
        across-task mean for each metric name.
        """
        self.eval()  # set to evaluation mode
        with torch.no_grad():
            y_pred_all = defaultdict(list)
            y_true_all = defaultdict(list)
            labels = self.feature_map.labels
            group_id = []
            if self._verbose > 0:
                data_generator = tqdm(data_generator, disable=False, file=sys.stdout)
            for batch_data in data_generator:
                return_dict = self.forward(batch_data)
                batch_y_true = self.get_labels(batch_data)
                for i in range(len(labels)):
                    y_pred_all[labels[i]].extend(
                        return_dict["{}_pred".format(labels[i])].data.cpu().numpy().reshape(-1))
                    y_true_all[labels[i]].extend(batch_y_true[i].data.cpu().numpy().reshape(-1))
                # group ids enable group-wise metrics (e.g. per-user AUC)
                if self.feature_map.group_id is not None:
                    group_id.extend(self.get_group_id(batch_data).numpy().reshape(-1))
            all_val_logs = {}
            mean_val_logs = defaultdict(list)
            group_id = np.array(group_id) if len(group_id) > 0 else None

            for i in range(len(labels)):
                y_pred = np.array(y_pred_all[labels[i]], np.float64)
                y_true = np.array(y_true_all[labels[i]], np.float64)
                if metrics is not None:
                    val_logs = self.evaluate_metrics(y_true, y_pred, metrics, group_id)
                else:
                    val_logs = self.evaluate_metrics(y_true, y_pred, self.validation_metrics, group_id)
                logging.info('[Metrics] [Task: {}] '.format(labels[i]) + ' - '.join(
                    '{}: {:.6f}'.format(k, v) for k, v in val_logs.items()))
                for k, v in val_logs.items():
                    # keep both the per-task value and the running list for the mean
                    all_val_logs['{}_{}'.format(labels[i], k)] = v
                    mean_val_logs[k].append(v)
            for k, v in mean_val_logs.items():
                mean_val_logs[k] = np.mean(v)
            all_val_logs.update(mean_val_logs)
            return all_val_logs

    def predict(self, data_generator):
        """Return per-task prediction lists keyed by label name."""
        self.eval()  # set to evaluation mode
        with torch.no_grad():
            y_pred_all = defaultdict(list)
            labels = self.feature_map.labels
            if self._verbose > 0:
                data_generator = tqdm(data_generator, disable=False, file=sys.stdout)
            for batch_data in data_generator:
                return_dict = self.forward(batch_data)
                for i in range(len(labels)):
                    y_pred_all[labels[i]].extend(
                        return_dict["{}_pred".format(labels[i])].data.cpu().numpy().reshape(-1))
            return y_pred_all
def seed_everything(seed=1029):
    """Seed Python, NumPy and PyTorch RNGs for reproducible runs."""
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # trade speed for determinism in cuDNN kernels
    torch.backends.cudnn.deterministic = True

def get_device(gpu=-1):
    """Return torch.device("cuda:<gpu>") when requested and available,
    otherwise the CPU device."""
    if gpu >= 0 and torch.cuda.is_available():
        device = torch.device("cuda:" + str(gpu))
    else:
        device = torch.device("cpu")
    return device

def get_optimizer(optimizer, params, lr):
    """Resolve `optimizer` into an instantiated torch optimizer over `params`.

    A string is treated as a torch.optim class name ("adam" is normalized to
    "Adam"). Anything that fails to resolve or instantiate raises
    NotImplementedError.
    """
    if isinstance(optimizer, str):
        if optimizer.lower() == "adam":
            optimizer = "Adam"
    try:
        optimizer = getattr(torch.optim, optimizer)(params, lr=lr)
    # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not
    # swallowed; the cause is chained for easier debugging
    except Exception as e:
        raise NotImplementedError("optimizer={} is not supported.".format(optimizer)) from e
    return optimizer
def get_loss(loss):
    """Resolve a loss spec into a callable loss function.

    String aliases "bce"/"binary_crossentropy"/"binary_cross_entropy" map to
    torch.nn.functional.binary_cross_entropy; any other string is looked up in
    torch.nn.functional, with a legacy fallback to a project-level `losses`
    module.

    Raises:
        NotImplementedError: when no matching loss function is found.
    """
    if isinstance(loss, str):
        if loss in ["bce", "binary_crossentropy", "binary_cross_entropy"]:
            loss = "binary_cross_entropy"
    try:
        # FIX: use the public torch.nn.functional instead of the private
        # torch.functional.F alias (same functions, stable import path)
        loss_fn = getattr(torch.nn.functional, loss)
    except (AttributeError, TypeError):
        try:
            # NOTE(review): eval on a config-supplied string — only safe for
            # trusted configuration files; requires a `losses` module in scope
            loss_fn = eval("losses." + loss)
        except Exception:
            raise NotImplementedError("loss={} is not supported.".format(loss))
    return loss_fn

def get_regularizer(reg):
    """Parse a regularizer spec into a list of (p_norm, weight) tuples.

    Accepted forms: a float (interpreted as an L2 weight), "l1(w)", "l2(w)",
    and "l1_l2(w1,w2)".
    """
    reg_pair = []  # of tuples (p_norm, weight)
    if isinstance(reg, float):
        reg_pair.append((2, reg))
    elif isinstance(reg, str):
        try:
            if reg.startswith("l1(") or reg.startswith("l2("):
                # reg[1] is the norm digit ('1' or '2')
                reg_pair.append((int(reg[1]), float(reg.rstrip(")").split("(")[-1])))
            elif reg.startswith("l1_l2"):
                l1_reg, l2_reg = reg.rstrip(")").split("(")[-1].split(",")
                reg_pair.append((1, float(l1_reg)))
                reg_pair.append((2, float(l2_reg)))
            else:
                raise NotImplementedError
        # narrowed from a bare `except:`; any parse failure maps to the same error
        except Exception:
            raise NotImplementedError("regularizer={} is not supported.".format(reg))
    return reg_pair

def get_activation(activation, hidden_units=None):
    """Resolve an activation spec (string, list of strings, or module) into
    nn.Module instance(s).

    "prelu" and "dice" need `hidden_units` (an int) to size their parameters.
    A list spec returns a list of activations, paired elementwise with
    `hidden_units` when given.
    """
    if isinstance(activation, str):
        if activation.lower() in ["prelu", "dice"]:
            # FIX: isinstance instead of `type(...) == int` comparison
            assert isinstance(hidden_units, int)
        if activation.lower() == "relu":
            return nn.ReLU()
        elif activation.lower() == "sigmoid":
            return nn.Sigmoid()
        elif activation.lower() == "tanh":
            return nn.Tanh()
        elif activation.lower() == "softmax":
            return nn.Softmax(dim=-1)
        elif activation.lower() == "prelu":
            return nn.PReLU(hidden_units, init=0.1)
        elif activation.lower() == "dice":
            from fuxictr.pytorch.layers.activations import Dice
            return Dice(hidden_units)
        else:
            # fall back to any nn module with a matching class name
            return getattr(nn, activation)()
    elif isinstance(activation, list):
        if hidden_units is not None:
            assert len(activation) == len(hidden_units)
            return [get_activation(act, units) for act, units in zip(activation, hidden_units)]
        else:
            return [get_activation(act) for act in activation]
    return activation
supported."\ 118 | .format(initializer)) 119 | return initializer 120 | 121 | def save_init_embs(model, data_path="init_embs.h5"): 122 | emb_dict = dict() 123 | for k, v in model.state_dict().items(): 124 | if "embedding_layers" in k: 125 | if v.size(-1) > 1: 126 | f_name = re.findall(r"embedding_layers.(.*).weight", k)[0] 127 | emb_dict[f_name] = v.cpu().numpy() 128 | with h5py.File(data_path, 'w') as hf: 129 | for key, arr in emb_dict.items(): 130 | hf.create_dataset(key, data=arr) 131 | 132 | def load_init_embs(model, data_path="init_embs.h5"): 133 | state_dict = model.state_dict() 134 | f_name_dict = dict() 135 | for k in state_dict.keys(): 136 | if "embedding_layers" in k and state_dict[k].size(-1) > 1: 137 | f_name = re.findall(r"embedding_layers.(.*).weight", k)[0] 138 | f_name_dict[f_name] = k 139 | with h5py.File(data_path, 'r') as hf: 140 | for key in hf.keys(): 141 | if key in f_name_dict: 142 | state_dict[f_name_dict[key]] = torch.from_numpy(hf[key][:]) 143 | model.load_state_dict(state_dict) -------------------------------------------------------------------------------- /fuxictr/tensorflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiangcaiSu/STEM/769e2af0d0d1a0be9f58b475b95e05f502a20df5/fuxictr/tensorflow/__init__.py -------------------------------------------------------------------------------- /fuxictr/tensorflow/dataloaders/__init__.py: -------------------------------------------------------------------------------- 1 | from .tf_dataloader import TFRecordDataLoader -------------------------------------------------------------------------------- /fuxictr/tensorflow/dataloaders/tf_dataloader.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. 
class TFRecordDataLoader(object):
    """Builds tf.data input pipelines over TFRecord files for train/valid/test.

    The parsing schema is derived from `feature_map`: numeric features are
    float32 scalars, categorical/meta features int64 scalars, sequence
    features int64 vectors of length max_len; labels are float32 scalars.
    """

    def __init__(self, feature_map, stage="both", train_data=None, valid_data=None, test_data=None,
                 batch_size=32, shuffle=True, drop_remainder=False, **kwargs):
        logging.info("Loading data...")
        self.stage = stage
        self.train_data = train_data
        self.valid_data = valid_data
        self.test_data = test_data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_remainder = drop_remainder
        self.schema = dict()
        for feat, feat_spec in feature_map.features.items():
            if feat_spec["type"] == "numeric":
                self.schema[feat] = tf.io.FixedLenFeature(dtype=tf.float32, shape=1)
            elif feat_spec["type"] in ["categorical", "meta"]:
                self.schema[feat] = tf.io.FixedLenFeature(dtype=tf.int64, shape=1)
            elif feat_spec["type"] == "sequence":
                self.schema[feat] = tf.io.FixedLenFeature(dtype=tf.int64, shape=feat_spec["max_len"])
        for label in feature_map.labels:
            self.schema[label] = tf.io.FixedLenFeature(dtype=tf.float32, shape=1)

    def input_fn(self, filenames, batch_size=32, shuffle=True):
        """Build a parsed, batched tf.data.Dataset from TFRecord `filenames`."""
        def parse_example(example):
            example_dict = tf.io.parse_single_example(example, features=self.schema)
            return example_dict
        dataset = tf.data.TFRecordDataset(filenames).map(parse_example, num_parallel_calls=1)
        # FIX: shuffle individual examples BEFORE batching. The previous order
        # (prefetch -> batch -> shuffle) only reshuffled whole batches, which
        # weakens shuffling considerably; prefetch goes last so the pipeline
        # overlaps batch preparation with training.
        if shuffle:
            dataset = dataset.shuffle(batch_size * 10)
        dataset = dataset.batch(batch_size, drop_remainder=self.drop_remainder)
        dataset = dataset.prefetch(buffer_size=1)
        return dataset

    def make_iterator(self):
        """Return datasets by `stage`: (train, valid), test, or all three."""
        if self.stage == "train":
            logging.info("Loading train and validation data done.")
            return self.input_fn(self.train_data, batch_size=self.batch_size, shuffle=self.shuffle), \
                   self.input_fn(self.valid_data, batch_size=self.batch_size, shuffle=False)
        elif self.stage == "test":
            logging.info("Loading test data done.")
            return self.input_fn(self.test_data, batch_size=self.batch_size, shuffle=False)
        else:
            logging.info("Loading data done.")
            return self.input_fn(self.train_data, batch_size=self.batch_size, shuffle=self.shuffle), \
                   self.input_fn(self.valid_data, batch_size=self.batch_size, shuffle=False), \
                   self.input_fn(self.test_data, batch_size=self.batch_size, shuffle=False)
import tensorflow as tf
from tensorflow.keras.layers import Layer
from .logistic_regression import LogisticRegression
from ..interactions import InnerProductInteraction


class FactorizationMachine(Layer):
    """Classic FM scorer: first-order linear term + second-order interaction.

    Args:
        feature_map: project FeatureMap; only `num_fields` is read here.
        regularizer: passed through to the LR embedding weights.
    """
    def __init__(self, feature_map, regularizer=None):
        super(FactorizationMachine, self).__init__()
        # second-order term, reduced to a (batch, 1) scalar per sample
        self.fm_layer = InnerProductInteraction(feature_map.num_fields, output="product_sum")
        # first-order term with its own 1-d embeddings and bias
        self.lr_layer = LogisticRegression(feature_map, use_bias=True, regularizer=regularizer)

    def call(self, X, feature_emb):
        # X: raw feature inputs (for the LR term); feature_emb: field embeddings
        lr_out = self.lr_layer(X)
        fm_out = self.fm_layer(feature_emb)
        output = fm_out + lr_out
        return output
from fuxictr.tensorflow.tf_utils import get_initializer, get_regularizer
from tensorflow.keras.layers import Layer, Dense


class Linear(Layer):
    """A thin wrapper around keras Dense with project-style initializer and
    regularizer specs resolved from strings/floats."""

    def __init__(self,
                 output_dim,
                 use_bias=True,
                 initializer="glorot_normal",
                 regularizer=None):
        super(Linear, self).__init__()
        # resolve the specs once, then build the underlying Dense layer
        weight_init = get_initializer(initializer)
        weight_reg = get_regularizer(regularizer)
        bias_reg = get_regularizer(regularizer)
        self.linear = Dense(
            output_dim,
            use_bias=use_bias,
            kernel_initializer=weight_init,
            kernel_regularizer=weight_reg,
            bias_regularizer=bias_reg,
        )

    def call(self, inputs):
        return self.linear(inputs)
import tensorflow as tf
from tensorflow.keras.layers import Layer
from fuxictr.tensorflow.layers import FeatureEmbedding


class LogisticRegression(Layer):
    """First-order (linear) term: sum of per-feature scalar weights + bias.

    Implemented as a FeatureEmbedding with embedding_dim=1, so each feature
    value contributes a single learned scalar.
    """
    def __init__(self, feature_map, use_bias=True, regularizer=None):
        super(LogisticRegression, self).__init__()
        self.bias = tf.Variable(tf.zeros(1)) if use_bias else None
        # embedding_dim=1 -> one scalar weight per feature value
        self.embedding_layer = FeatureEmbedding(feature_map, 1, use_pretrain=False,
                                                use_sharing=False,
                                                embedding_regularizer=regularizer,
                                                name_prefix="lr_")

    def call(self, X):
        # embed_weights: presumably (batch, num_fields, 1) — summing over the
        # field axis yields the (batch, 1) linear score
        embed_weights = self.embedding_layer(X)
        output = tf.reduce_sum(embed_weights, axis=1)
        if self.bias is not None:
            output += self.bias
        return output
import tensorflow as tf
from fuxictr.tensorflow.tf_utils import get_activation, get_initializer, get_regularizer
from tensorflow.keras.layers import Layer, Dense, BatchNormalization, LayerNormalization, Dropout


class MLP_Block(Layer):
    """Stack of Dense layers with optional normalization, activation, dropout.

    Per hidden layer the order is:
        Dense -> [norm] -> activation -> [norm if not norm_before_activation]
              -> dropout
    An optional output Dense (+ output_activation) is appended when
    output_dim is set.

    Args:
        input_dim: size of the input features.
        hidden_units: list of hidden layer widths.
        hidden_activations: name/list of names, resolved via get_activation.
        dropout_rates: float or per-layer list of dropout rates.
        batch_norm / layer_norm: mutually exclusive normalization choices.
    """
    def __init__(self,
                 input_dim,
                 hidden_units=[],
                 hidden_activations="ReLU",
                 output_dim=None,
                 output_activation=None,
                 dropout_rates=0.0,
                 batch_norm=False,
                 layer_norm=False,
                 norm_before_activation=True,
                 use_bias=True,
                 initializer="glorot_normal",
                 regularizer=None):
        super(MLP_Block, self).__init__()
        self.mlp = tf.keras.Sequential()
        # broadcast scalar settings to one entry per hidden layer
        if not isinstance(dropout_rates, list):
            dropout_rates = [dropout_rates] * len(hidden_units)
        if not isinstance(hidden_activations, list):
            hidden_activations = [hidden_activations] * len(hidden_units)
        hidden_activations = [get_activation(x) for x in hidden_activations]
        hidden_units = [input_dim] + hidden_units
        for idx in range(len(hidden_units) - 1):
            self.mlp.add(Dense(hidden_units[idx + 1], use_bias=use_bias,
                               kernel_initializer=get_initializer(initializer),
                               kernel_regularizer=get_regularizer(regularizer),
                               bias_regularizer=get_regularizer(regularizer)))
            if norm_before_activation:
                if batch_norm:
                    # FIX: keras BatchNormalization's first argument is `axis`,
                    # not a feature count (that is the PyTorch nn.BatchNorm1d
                    # API); passing hidden_units here failed at build time.
                    self.mlp.add(BatchNormalization())
                elif layer_norm:
                    # FIX: same as above — keras LayerNormalization takes `axis`.
                    self.mlp.add(LayerNormalization())
            if hidden_activations[idx]:
                self.mlp.add(hidden_activations[idx])
            if not norm_before_activation:
                if batch_norm:
                    self.mlp.add(BatchNormalization())
                elif layer_norm:
                    self.mlp.add(LayerNormalization())
            if dropout_rates[idx] > 0:
                # FIX: keras Dropout uses `rate=`, not `p=` (PyTorch API);
                # the old call raised TypeError.
                self.mlp.add(Dropout(rate=dropout_rates[idx]))
        if output_dim is not None:
            self.mlp.add(Dense(output_dim, use_bias=use_bias,
                               kernel_initializer=get_initializer(initializer),
                               kernel_regularizer=get_regularizer(regularizer),
                               bias_regularizer=get_regularizer(regularizer)))
        if output_activation is not None:
            self.mlp.add(get_activation(output_activation))

    def call(self, inputs, training=None):
        return self.mlp(inputs, training=training)
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense


class CrossInteraction(Layer):
    """One DCN cross term: x0 * (w . xi) + b, with b of shape (input_dim,)."""
    def __init__(self, input_dim):
        super(CrossInteraction, self).__init__()
        self.weight = Dense(1, use_bias=False)  # projects xi to a per-sample scalar
        self.bias = tf.Variable(tf.zeros(input_dim))

    def call(self, X_0, X_i):
        # (b, 1) scalar gate broadcast over X_0 (b, dim), plus learned bias
        interact_out = self.weight(X_i) * X_0 + self.bias
        return interact_out


class CrossNet(Layer):
    """DCN-v1 cross network: x_{i+1} = x_i + cross(x_0, x_i)."""
    def __init__(self, input_dim, num_layers):
        super(CrossNet, self).__init__()
        self.num_layers = num_layers
        self.cross_net = []
        for _ in range(self.num_layers):
            self.cross_net.append(CrossInteraction(input_dim))

    def call(self, X_0):
        X_i = X_0 # b x dim
        for i in range(self.num_layers):
            # residual update keeps first-order information flowing
            X_i = X_i + self.cross_net[i](X_0, X_i)
        return X_i


class CrossNetV2(Layer):
    """DCN-v2 cross network: x_{i+1} = x_i + x_0 * W_i(x_i) with full-rank W_i."""
    def __init__(self, input_dim, num_layers):
        super(CrossNetV2, self).__init__()
        self.num_layers = num_layers
        self.cross_layers = []
        for _ in range(self.num_layers):
            self.cross_layers.append(Dense(input_dim))

    def call(self, X_0):
        X_i = X_0 # b x dim
        for i in range(self.num_layers):
            X_i = X_i + X_0 * self.cross_layers[i](X_i)
        return X_i
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Layer


class InnerProductInteraction(Layer):
    """Pairwise inner-product interactions over field embeddings.

    output: product_sum (bs x 1),
            bi_interaction (bs x dim),
            inner_product (bs x f(f-1)/2),
            elementwise_product (bs x f(f-1)/2 x emb_dim)
    """
    def __init__(self, num_fields, output="product_sum"):
        super(InnerProductInteraction, self).__init__()
        self.output_type = output
        if output not in ["product_sum", "bi_interaction", "inner_product", "elementwise_product"]:
            raise ValueError("InnerProductInteraction output={} is not supported.".format(output))
        if output == "inner_product":
            self.interaction_units = int(num_fields * (num_fields - 1) / 2)
            # boolean mask selecting the strict upper triangle of the f x f matrix
            self.triu_mask = tf.Variable(np.triu(np.ones((num_fields, num_fields)), 1).astype(bool),
                                         trainable=False)
        elif output == "elementwise_product":
            # index pairs (i, j) with i < j, used to gather field pairs
            self.triu_index = tf.Variable(np.triu_indices(num_fields, 1), trainable=False)

    def call(self, feature_emb):
        if self.output_type in ["product_sum", "bi_interaction"]:
            sum_of_square = tf.reduce_sum(feature_emb, axis=1) ** 2 # sum then square
            square_of_sum = tf.reduce_sum(feature_emb ** 2, axis=1) # square then sum
            bi_interaction = (sum_of_square - square_of_sum) * 0.5
            if self.output_type == "bi_interaction":
                return bi_interaction
            else:
                return tf.reduce_sum(bi_interaction, axis=-1, keepdims=True)
        elif self.output_type == "inner_product":
            # FIX: TF tensors have no .transpose(1, 2) method (that is the
            # PyTorch API), and 'bij,bji->bii' does not yield the pairwise
            # product matrix. Compute <e_i, e_j> for all field pairs directly.
            inner_product_matrix = tf.einsum("bif,bjf->bij", feature_emb, feature_emb)
            # FIX: apply the (f x f) mask across dims 1-2 of the batched
            # matrix, producing (batch, f(f-1)/2) upper-triangle values.
            triu_values = tf.boolean_mask(inner_product_matrix, self.triu_mask, axis=1)
            return tf.reshape(triu_values, (-1, self.interaction_units))
        elif self.output_type == "elementwise_product":
            emb1 = tf.gather(feature_emb, self.triu_index[0], axis=1)
            emb2 = tf.gather(feature_emb, self.triu_index[1], axis=1)
            return emb1 * emb2
import tensorflow as tf
from tensorflow.keras import Model


class MaskedSumPooling(Model):
    """Sum-pool an embedding tensor over axis 1 (the field/sequence axis)."""

    def __init__(self):
        super(MaskedSumPooling, self).__init__()

    def call(self, embedding_matrix):
        # FIX: keras invokes `call`, not `forward` (a PyTorch-ism); with
        # `forward` this layer raised NotImplementedError when called.
        return tf.reduce_sum(embedding_matrix, axis=1)
import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from fuxictr.metrics import evaluate_metrics
from fuxictr.tensorflow.tf_utils import get_optimizer, get_loss
from fuxictr.utils import Monitor
import logging
from tqdm import tqdm


class BaseModel(Model):
    """Training/evaluation harness shared by the TF ranking models.

    Subclasses implement ``call`` and return a dict containing "y_pred".
    Handles the training loop, periodic evaluation, checkpointing of the
    best weights, early stopping, and LR decay on plateau.
    """
    def __init__(self,
                 feature_map,
                 model_id="BaseModel",
                 task="binary_classification",
                 monitor="AUC",
                 save_best_only=True,
                 monitor_mode="max",
                 early_stop_patience=2,
                 eval_steps=None,
                 reduce_lr_on_plateau=True,
                 **kwargs):
        super(BaseModel, self).__init__()
        self.valid_gen = None
        self._monitor_mode = monitor_mode
        self._monitor = Monitor(kv=monitor)
        self._early_stop_patience = early_stop_patience
        self._eval_steps = eval_steps # None default, that is evaluating every epoch
        self._save_best_only = save_best_only
        self._verbose = kwargs["verbose"]
        self._reduce_lr_on_plateau = reduce_lr_on_plateau
        self.feature_map = feature_map
        self.output_activation = self.get_output_activation(task)
        self.model_id = model_id
        self.model_dir = os.path.join(kwargs["model_root"], feature_map.dataset_id)
        self.checkpoint = os.path.abspath(os.path.join(self.model_dir, self.model_id + ".model"))
        self.validation_metrics = kwargs["metrics"]

    def compile(self, optimizer, loss, lr):
        # NOTE(review): intentionally shadows keras Model.compile with a
        # simpler project-specific signature.
        self.optimizer = get_optimizer(optimizer, lr)
        self.loss_fn = get_loss(loss)

    def add_loss(self, inputs):
        """Forward pass + task loss for one batch (shadows keras add_loss)."""
        return_dict = self(inputs, training=True)
        y_true = self.get_labels(inputs)
        # FIX: keras losses are called as loss(y_true, y_pred); the arguments
        # were previously swapped (PyTorch-style loss(input, target)), which
        # computes an incorrect binary cross-entropy.
        loss = self.loss_fn(y_true, return_dict["y_pred"])
        return loss

    def get_total_loss(self, inputs):
        total_loss = self.add_loss(inputs) + sum(self.losses) # with regularization
        return total_loss

    def get_inputs(self, inputs, feature_source=None):
        """Select model inputs, optionally filtered by feature source; meta
        features are always excluded."""
        if feature_source and type(feature_source) == str:
            feature_source = [feature_source]
        X_dict = dict()
        for feature, spec in self.feature_map.features.items():
            if (feature_source is not None) and (spec["source"] not in feature_source):
                continue
            if spec["type"] == "meta":
                continue
            X_dict[feature] = inputs[feature]
        return X_dict

    def get_labels(self, inputs):
        """Return the label tensor for the single configured label."""
        labels = self.feature_map.labels
        # FIX: this assert had been swallowed into the docstring; restore it
        # as executable code so multi-label misuse fails loudly.
        assert len(labels) == 1, "Please override get_labels() when using multiple labels!"
        y = inputs[labels[0]]
        return y

    def get_group_id(self, inputs):
        return inputs[self.feature_map.group_id]

    def lr_decay(self, factor=0.1, min_lr=1e-6):
        """Multiply the learning rate by `factor`, floored at `min_lr`."""
        self.optimizer.learning_rate = max(self.optimizer.learning_rate * factor, min_lr)
        # FIX: read back via `learning_rate`; `optimizer.lr` is a deprecated
        # alias removed in newer keras releases.
        return self.optimizer.learning_rate.numpy()

    def fit(self, data_generator, epochs=1, validation_data=None,
            max_gradient_norm=10., **kwargs):
        """Run up to `epochs` passes over `data_generator`, evaluating on
        `validation_data`, early-stopping on the monitored metric."""
        self.valid_gen = validation_data
        self._max_gradient_norm = max_gradient_norm
        # FIX: np.Inf is a deprecated alias of np.inf
        self._best_metric = np.inf if self._monitor_mode == "min" else -np.inf
        self._stopping_steps = 0
        self._stop_training = False
        self._total_steps = 0
        self._batch_index = 0
        self._epoch_index = 0

        logging.info("************ Epoch=1 start ************")
        for epoch in range(epochs):
            self._epoch_index = epoch
            self.train_epoch(data_generator)
            if self._stop_training:
                break
            else:
                logging.info("************ Epoch={} end ************".format(self._epoch_index + 1))
        logging.info("Training finished.")
        logging.info("Load best model: {}".format(self.checkpoint))
        self.load_weights(self.checkpoint)

    def train_epoch(self, data_generator):
        """One pass over the training data; evaluates every `_eval_steps`
        batches, or once at epoch end when `_eval_steps` is None."""
        self._batch_index = 0
        train_loss = 0
        if self._verbose == 0:
            batch_iterator = data_generator
        else:
            batch_iterator = tqdm(data_generator, disable=False, file=sys.stdout)
        for batch_index, batch_data in enumerate(batch_iterator):
            self._batch_index = batch_index
            self._total_steps += 1
            loss = self.train_step(batch_data)
            train_loss += loss.numpy()
            if (self._eval_steps is not None) and (self._total_steps % self._eval_steps == 0):
                logging.info("Train loss: {:.6f}".format(train_loss / self._eval_steps))
                train_loss = 0
                self.eval_step()
            if self._stop_training:
                break
        if self._eval_steps is None:
            logging.info("Train loss: {:.6f}".format(train_loss / (self._batch_index + 1)))
            self.eval_step()

    @tf.function
    def train_step(self, batch_data):
        """Single optimization step with global-norm gradient clipping."""
        with tf.GradientTape() as tape:
            loss = self.get_total_loss(batch_data)
        grads = tape.gradient(loss, self.trainable_variables)
        grads, _ = tf.clip_by_global_norm(grads, self._max_gradient_norm)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return loss

    def eval_step(self):
        logging.info('Evaluation @epoch {} - batch {}: '.format(self._epoch_index + 1, self._batch_index + 1))
        val_logs = self.evaluate(self.valid_gen, metrics=self._monitor.get_metrics())
        self.checkpoint_and_earlystop(val_logs)

    def checkpoint_and_earlystop(self, logs, min_delta=1e-6):
        """Track the monitored metric; save best weights, decay LR on
        plateau, and flag early stop after `_early_stop_patience` misses."""
        monitor_value = self._monitor.get_value(logs)
        if (self._monitor_mode == "min" and monitor_value > self._best_metric - min_delta) or \
           (self._monitor_mode == "max" and monitor_value < self._best_metric + min_delta):
            self._stopping_steps += 1
            logging.info("Monitor({})={:.6f} STOP!".format(self._monitor_mode, monitor_value))
            if self._reduce_lr_on_plateau:
                current_lr = self.lr_decay()
                logging.info("Reduce learning rate on plateau: {:.6f}".format(current_lr))
        else:
            self._stopping_steps = 0
            self._best_metric = monitor_value
            if self._save_best_only:
                logging.info("Save best model: monitor({})={:.6f}"\
                             .format(self._monitor_mode, monitor_value))
                self.save_weights(self.checkpoint)
        if self._stopping_steps >= self._early_stop_patience:
            self._stop_training = True
            logging.info("********* Epoch=={} early stop *********".format(self._epoch_index + 1))
        if not self._save_best_only:
            self.save_weights(self.checkpoint)

    def evaluate(self, data_generator, metrics=None):
        """Score `data_generator` and compute validation metrics (optionally
        grouped by feature_map.group_id)."""
        y_pred = []
        y_true = []
        group_id = []
        if self._verbose > 0:
            data_generator = tqdm(data_generator, disable=False, file=sys.stdout)
        for batch_data in data_generator:
            # FIX: evaluation must run in inference mode; training=True left
            # dropout/batch-norm in training behavior during validation.
            return_dict = self(batch_data, training=False)
            y_pred.extend(return_dict["y_pred"].numpy().reshape(-1))
            y_true.extend(self.get_labels(batch_data).numpy().reshape(-1))
            if self.feature_map.group_id is not None:
                group_id.extend(self.get_group_id(batch_data).numpy().reshape(-1))
        y_pred = np.array(y_pred, np.float64)
        y_true = np.array(y_true, np.float64)
        group_id = np.array(group_id) if len(group_id) > 0 else None
        if metrics is not None:
            val_logs = self.evaluate_metrics(y_true, y_pred, metrics, group_id)
        else:
            val_logs = self.evaluate_metrics(y_true, y_pred, self.validation_metrics, group_id)
        logging.info('[Metrics] ' + ' - '.join('{}: {:.6f}'.format(k, v) for k, v in val_logs.items()))
        return val_logs

    def evaluate_metrics(self, y_true, y_pred, metrics, group_id=None):
        return evaluate_metrics(y_true, y_pred, metrics, group_id)

    def get_output_activation(self, task):
        """Map task name to the final activation (sigmoid / identity)."""
        if task == "binary_classification":
            return tf.keras.layers.Activation("sigmoid")
        elif task == "regression":
            return tf.identity
        else:
            raise NotImplementedError("task={} is not supported.".format(task))
import random
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import optimizers
# NOTE(review): tensorflow.python.* is a private namespace; these symbols
# also exist under tf.keras.regularizers / tf.keras.initializers — confirm
# before upgrading TF.
from tensorflow.python.keras.regularizers import l2, l1, l1_l2
from tensorflow.python.keras.initializers import *
import logging


def seed_everything(seed=2019):
    """Seed python, numpy and tensorflow RNGs; a negative seed disables seeding."""
    logging.info('Setting random seed={}'.format(seed))
    if seed >= 0:
        random.seed(seed)
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        tf.random.set_seed(seed)

def get_activation(activation):
    """Map an activation name to a keras layer; non-strings pass through unchanged."""
    if isinstance(activation, str):
        if activation.lower() == "relu":
            return tf.keras.layers.Activation("relu")
        elif activation.lower() == "sigmoid":
            return tf.keras.layers.Activation("sigmoid")
        elif activation.lower() == "tanh":
            return tf.keras.layers.Activation("tanh")
        elif activation.lower() == "softmax":
            return tf.keras.layers.Softmax()
        else:
            # fall back to any tf.keras.layers class by its exact name
            return getattr(tf.keras.layers, activation)()
    else:
        return activation

def get_optimizer(optimizer, learning_rate=1.0e-3):
    """Build a keras optimizer from a name; non-strings are returned as-is."""
    if isinstance(optimizer, str):
        if optimizer.lower() == 'adam':
            return optimizers.Adam(learning_rate=learning_rate)
        elif optimizer.lower() == 'ftrl':
            return optimizers.Ftrl(learning_rate=learning_rate, l1_regularization_strength=0.1)
        elif optimizer.lower() == 'adagrad':
            return optimizers.Adagrad(learning_rate=learning_rate)
        else:
            try:
                # any other optimizers class by its exact (case-sensitive) name
                return getattr(optimizers, optimizer)(learning_rate=learning_rate)
            except:
                raise ValueError('optimizer={} is not supported.'.format(optimizer))
    # already an optimizer instance
    return optimizer

def get_loss(loss):
    """Resolve a loss spec; only binary cross-entropy names are recognized."""
    if isinstance(loss, str):
        if loss in ['bce', 'binary_crossentropy', 'binary_cross_entropy']:
            loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        else:
            raise ValueError('loss={} is not supported.'.format(loss))
    return loss

def get_regularizer(reg):
    """Build a keras regularizer: numbers mean L2; strings like "l1(1e-5)"
    or "l1_l2(1e-4, 1e-5)" are evaluated against the imported factories."""
    if type(reg) in [int, float]:
        return l2(reg)
    elif isinstance(reg, str):
        if '(' in reg:
            try:
                # NOTE(review): eval() of config strings — trusted config only
                return eval(reg)
            except:
                raise ValueError('reg={} is not supported.'.format(reg))
    # None or a plain keras identifier string passes through
    return reg

def get_initializer(initializer, seed=20222023):
    """Build a keras initializer from a spec string, injecting a fixed seed.

    "glorot_normal" -> glorot_normal(seed=...);
    "RandomNormal(stddev=0.01)" -> RandomNormal(stddev=0.01, seed=...).
    NOTE(review): eval() of config strings — trusted config only.
    """
    if isinstance(initializer, str):
        try:
            if '(' in initializer:
                return eval(initializer.rstrip(')') + ', seed={})'.format(seed))
            else:
                return eval(initializer)(seed=seed)
        except:
            raise ValueError("initializer={} not supported.".format(initializer))
    return initializer
# =========================================================================

import os
import logging
import logging.config
import yaml
import glob
import json
from collections import OrderedDict
import h5py


def load_config(config_dir, experiment_id):
    """Load the merged experiment params: model config overlaid with its dataset config."""
    params = load_model_config(config_dir, experiment_id)
    data_params = load_dataset_config(config_dir, params['dataset_id'])
    params.update(data_params)
    return params

def load_model_config(config_dir, experiment_id):
    """Find the 'Base' and experiment-specific sections in model_config.yaml
    (or model_config/*.yaml) and merge them, expid overriding Base.

    Raises:
        RuntimeError: if no model config file exists under config_dir.
    """
    model_configs = glob.glob(os.path.join(config_dir, "model_config.yaml"))
    if not model_configs:
        # Fallback layout: one yaml per model under model_config/
        model_configs = glob.glob(os.path.join(config_dir, "model_config/*.yaml"))
    if not model_configs:
        raise RuntimeError('config_dir={} is not valid!'.format(config_dir))
    found_params = dict()
    for config in model_configs:
        with open(config, 'r') as cfg:
            config_dict = yaml.load(cfg, Loader=yaml.FullLoader)
            if 'Base' in config_dict:
                found_params['Base'] = config_dict['Base']
            if experiment_id in config_dict:
                found_params[experiment_id] = config_dict[experiment_id]
        # Stop scanning once both Base and the expid section are found.
        if len(found_params) == 2:
            break
    # Update base and exp_id settings consecutively to allow overwriting when conflicts exist
    params = found_params.get('Base', {})
    params.update(found_params.get(experiment_id, {}))
    assert "dataset_id" in params, f'expid={experiment_id} is not valid in config.'
    params["model_id"] = experiment_id
    return params

def load_dataset_config(config_dir, dataset_id):
    """Look up the dataset_id section across dataset_config.yaml
    (or dataset_config/*.yaml); returns on first match.

    Raises:
        RuntimeError: if dataset_id appears in no config file.
    """
    params = {"dataset_id": dataset_id}
    dataset_configs = glob.glob(os.path.join(config_dir, "dataset_config.yaml"))
    if not dataset_configs:
        dataset_configs = glob.glob(os.path.join(config_dir, "dataset_config/*.yaml"))
    for config in dataset_configs:
        with open(config, "r") as cfg:
            config_dict = yaml.load(cfg, Loader=yaml.FullLoader)
            if dataset_id in config_dict:
                params.update(config_dict[dataset_id])
                return params
    raise RuntimeError(f'dataset_id={dataset_id} is not found in config.')

def set_logger(params):
    """Route logging to both <model_root>/<dataset_id>/<model_id>.log and stderr."""
    dataset_id = params['dataset_id']
    model_id = params.get('model_id', '')
    log_dir = os.path.join(params.get('model_root', './checkpoints'), dataset_id)
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, model_id + '.log')

    # logs will not show in the file without the two lines.
    # Remove any pre-existing handlers so basicConfig below takes effect.
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s P%(process)d %(levelname)s %(message)s',
                        handlers=[logging.FileHandler(log_file, mode='w'),
                                  logging.StreamHandler()])

def print_to_json(data, sort_keys=True):
    """Render a params dict as indented JSON with all values stringified."""
    new_data = dict((k, str(v)) for k, v in data.items())
    if sort_keys:
        new_data = OrderedDict(sorted(new_data.items(), key=lambda x: x[0]))
    return json.dumps(new_data, indent=4)

def print_to_list(data):
    """Render a metrics dict as 'name: value - name: value' with 6 decimals."""
    return ' - '.join('{}: {:.6f}'.format(k, v) for k, v in data.items())

class Monitor(object):
    """Weighted combination of metrics used as a single monitoring score."""

    def __init__(self, kv):
        # A plain string means a single metric with weight 1.
        if isinstance(kv, str):
            kv = {kv: 1}
        self.kv_pairs = kv

    def get_value(self, logs):
        """Weighted sum of the monitored metrics; missing metrics count as 0."""
        value = 0
        for k, v in self.kv_pairs.items():
            value += logs.get(k, 0) * v
        return value

    def get_metrics(self):
        """Names of the metrics this monitor reads from the logs dict."""
        return list(self.kv_pairs.keys())

def load_h5(data_path, verbose=0):
    """Load every dataset in an h5 file into an in-memory dict of arrays."""
    # NOTE(review): logs only when verbose == 0, which looks inverted
    # (usually verbose > 0 means more output) — confirm intended.
    if verbose == 0:
        logging.info('Loading data from h5: ' + data_path)
    data_dict = dict()
    with h5py.File(data_path, 'r') as hf:
        for key in hf.keys():
            data_dict[key] = hf[key][:]
    return data_dict

# --------------------------------------------------------------------------------
# /fuxictr/version.py
# --------------------------------------------------------------------------------
__version__="2.1.2"

# --------------------------------------------------------------------------------
# /models/AITM.py
# --------------------------------------------------------------------------------
import torch
from torch import nn
from fuxictr.pytorch.models import MultiTaskModel
from fuxictr.pytorch.layers import FeatureEmbedding, MLP_Block
import numpy as np

'''
Reference Code: https://github.com/easezyc/Multitask-Recommendation-Library/blob/main/models/aitm.py
'''

class AITM(MultiTaskModel):
    """Adaptive Information Transfer Multi-task model: each task's bottom
    representation receives attention-weighted information from the
    previous task's bottom output (sequential task dependence)."""

    def __init__(self,
                 feature_map,
                 model_id="AITM",
                 gpu=-1,
                 task=["binary_classification"],
                 num_tasks=1,
                 loss_weight='EQ',
                 learning_rate=1e-3,
                 embedding_dim=10,
                 bottom_hidden_units=[64, 64, 64],
                 tower_hidden_units=[64, ],
                 hidden_activations="ReLU",
                 net_dropout=0,
                 batch_norm=False,
                 embedding_regularizer=None,
                 net_regularizer=None,
                 **kwargs):
        super(AITM, self).__init__(feature_map,
                                   task=task,
                                   loss_weight=loss_weight,
                                   num_tasks=num_tasks,
                                   model_id=model_id,
                                   gpu=gpu,
                                   embedding_regularizer=embedding_regularizer,
                                   net_regularizer=net_regularizer,
                                   **kwargs)
        self.embedding_layer = FeatureEmbedding(feature_map, embedding_dim)
        self.hidden_dim = bottom_hidden_units[-1]
        # One transfer projection g per adjacent task pair (task i-1 -> task i).
        self.g = torch.nn.ModuleList([torch.nn.Linear(bottom_hidden_units[-1], bottom_hidden_units[-1]) for i in range(num_tasks - 1)])
        # h1/h2/h3 produce V/K/Q for the 2-token attention over [transferred, own].
        self.h1 = torch.nn.Linear(bottom_hidden_units[-1], bottom_hidden_units[-1])
        self.h2 = torch.nn.Linear(bottom_hidden_units[-1], bottom_hidden_units[-1])
        self.h3 = torch.nn.Linear(bottom_hidden_units[-1], bottom_hidden_units[-1])

        # One independent bottom MLP per task over the flattened embeddings.
        self.bottom = nn.ModuleList([MLP_Block(input_dim=embedding_dim * feature_map.num_fields,
                                               hidden_units=bottom_hidden_units,
                                               hidden_activations=hidden_activations,
                                               output_activation=None,
                                               dropout_rates=net_dropout,
                                               batch_norm=batch_norm) for _ in range(num_tasks)])
        self.tower = nn.ModuleList([MLP_Block(input_dim=bottom_hidden_units[-1],
                                              output_dim=1,
                                              hidden_units=tower_hidden_units,
                                              hidden_activations=hidden_activations,
                                              output_activation=None,
                                              dropout_rates=net_dropout,
                                              batch_norm=batch_norm)
                                    for _ in range(num_tasks)])
        self.compile(kwargs["optimizer"], kwargs["loss"], learning_rate)
        self.reset_parameters()
        self.model_to_device()

    def forward(self, inputs):
        X = self.get_inputs(inputs)
        feature_emb = self.embedding_layer(X)
        bottom_output = [self.bottom[i](feature_emb.flatten(start_dim=1)) for i in range(self.num_tasks)] # [(?, bottom_hidden_units[-1])]
        # Sequentially transfer information from task i-1 into task i via
        # scaled dot-product attention over the pair [transferred, own].
        for i in range(1, self.num_tasks):
            p = self.g[i - 1](bottom_output[i - 1]).unsqueeze(1)
            q = bottom_output[i].unsqueeze(1)
            x = torch.cat([p, q], dim = 1)
            V = self.h1(x)
            K = self.h2(x)
            Q = self.h3(x)
            bottom_output[i] = torch.sum(torch.nn.functional.softmax(torch.sum(K * Q, 2, True) / np.sqrt(self.hidden_dim), dim=1) * V, 1)
        tower_output = [self.tower[i](bottom_output[i]) for i in range(self.num_tasks)]
        y_pred = [self.output_activation[i](tower_output[i]) for i in range(self.num_tasks)]
        return_dict = {}
        labels = self.feature_map.labels
        for i in range(self.num_tasks):
            return_dict["{}_pred".format(labels[i])] = y_pred[i]
        return return_dict

# --------------------------------------------------------------------------------
# /models/ESMM.py
# --------------------------------------------------------------------------------
import torch
from torch import nn
from fuxictr.pytorch.models import MultiTaskModel
from fuxictr.pytorch.layers import FeatureEmbedding, MLP_Block


class ESMM(MultiTaskModel):
    """Entire-Space Multi-task Model: predicts CTR and CTCVR = CTR * CVR
    from a shared embedding with two task towers (requires exactly 2 tasks)."""

    def __init__(self,
                 feature_map,
                 model_id="ESMM",
                 gpu=-1,
                 task=["binary_classification"],
                 num_tasks=1,
                 loss_weight='EQ',
                 learning_rate=1e-3,
                 embedding_dim=10,
                 tower_hidden_units=[64, ],
                 hidden_activations="ReLU",
                 net_dropout=0,
                 batch_norm=False,
                 embedding_regularizer=None,
                 net_regularizer=None,
                 **kwargs):
        super(ESMM, self).__init__(feature_map,
                                   task=task,
                                   loss_weight=loss_weight,
                                   num_tasks=num_tasks,
                                   model_id=model_id,
                                   gpu=gpu,
                                   embedding_regularizer=embedding_regularizer,
                                   net_regularizer=net_regularizer,
                                   **kwargs)
        self.embedding_layer = FeatureEmbedding(feature_map, embedding_dim)
34 | if num_tasks != 2: 35 | raise ValueError("the number of tasks must be equal to 2!") 36 | self.tower = nn.ModuleList([MLP_Block(input_dim=embedding_dim * feature_map.num_fields, 37 | output_dim=1, 38 | hidden_units=tower_hidden_units, 39 | hidden_activations=hidden_activations, 40 | output_activation=None, 41 | dropout_rates=net_dropout, 42 | batch_norm=batch_norm) 43 | for _ in range(num_tasks)]) 44 | self.compile(kwargs["optimizer"], kwargs["loss"], learning_rate) 45 | self.reset_parameters() 46 | self.model_to_device() 47 | 48 | def forward(self, inputs): 49 | X = self.get_inputs(inputs) 50 | feature_emb = self.embedding_layer(X).flatten(start_dim=1) 51 | tower_output = [self.tower[i](feature_emb) for i in range(self.num_tasks)] 52 | cvr_pred = self.output_activation[0](tower_output[0]) 53 | ctr_pred = self.output_activation[1](tower_output[1]) 54 | ctcvr_pred = ctr_pred * cvr_pred 55 | y_pred = [ctr_pred, ctcvr_pred] 56 | return_dict = {} 57 | labels = self.feature_map.labels 58 | for i in range(self.num_tasks): 59 | return_dict["{}_pred".format(labels[i])] = y_pred[i] 60 | return return_dict 61 | -------------------------------------------------------------------------------- /models/MMoE.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from fuxictr.pytorch.models import MultiTaskModel 4 | from fuxictr.pytorch.layers import FeatureEmbedding, MLP_Block 5 | from fuxictr.pytorch.torch_utils import get_activation 6 | import logging 7 | 8 | class MMoE_Layer(nn.Module): 9 | def __init__(self, num_experts, num_tasks, input_dim, expert_hidden_units, gate_hidden_units, hidden_activations, 10 | net_dropout, batch_norm): 11 | super(MMoE_Layer, self).__init__() 12 | self.num_experts = num_experts 13 | self.num_tasks = num_tasks 14 | self.experts = nn.ModuleList([MLP_Block(input_dim=input_dim, 15 | hidden_units=expert_hidden_units, 16 | hidden_activations=hidden_activations, 17 | 
output_activation=None, 18 | dropout_rates=net_dropout, 19 | batch_norm=batch_norm) for _ in range(self.num_experts)]) 20 | self.gate = nn.ModuleList([MLP_Block(input_dim=input_dim, 21 | hidden_units=gate_hidden_units, 22 | output_dim=num_experts, 23 | hidden_activations=hidden_activations, 24 | output_activation=None, 25 | dropout_rates=net_dropout, 26 | batch_norm=batch_norm) for _ in range(self.num_tasks)]) 27 | self.gate_activation = get_activation('softmax') 28 | 29 | def forward(self, x): 30 | experts_output = torch.stack([self.experts[i](x) for i in range(self.num_experts)], 31 | dim=1) # (?, num_experts, dim) 32 | mmoe_output = [] 33 | gate_output_list = [] 34 | for i in range(self.num_tasks): 35 | gate_output = self.gate[i](x) 36 | if self.gate_activation is not None: 37 | gate_output = self.gate_activation(gate_output) # (?, num_experts) 38 | gate_output_list.append(gate_output.mean(dim=0)) 39 | mmoe_output.append(torch.sum(torch.multiply(gate_output.unsqueeze(-1), experts_output), dim=1)) 40 | return mmoe_output 41 | 42 | 43 | class MMoE(MultiTaskModel): 44 | def __init__(self, 45 | feature_map, 46 | task=["binary_classification"], 47 | num_tasks=1, 48 | model_id="MMoE", 49 | gpu=-1, 50 | learning_rate=1e-3, 51 | embedding_dim=10, 52 | num_experts=4, 53 | expert_hidden_units=[512, 256, 128], 54 | gate_hidden_units=[128, 64], 55 | tower_hidden_units=[128, 64], 56 | hidden_activations="ReLU", 57 | net_dropout=0, 58 | batch_norm=False, 59 | embedding_regularizer=None, 60 | net_regularizer=None, 61 | **kwargs): 62 | super(MMoE, self).__init__(feature_map, 63 | task=task, 64 | num_tasks=num_tasks, 65 | model_id=model_id, 66 | gpu=gpu, 67 | embedding_regularizer=embedding_regularizer, 68 | net_regularizer=net_regularizer, 69 | **kwargs) 70 | self.embedding_layer = FeatureEmbedding(feature_map, embedding_dim) 71 | self.num_experts = num_experts 72 | self.mmoe_layer = MMoE_Layer(num_experts=num_experts, 73 | num_tasks=self.num_tasks, 74 | input_dim=embedding_dim 
* feature_map.num_fields, 75 | expert_hidden_units=expert_hidden_units, 76 | gate_hidden_units=gate_hidden_units, 77 | hidden_activations=hidden_activations, 78 | net_dropout=net_dropout, 79 | batch_norm=batch_norm) 80 | self.tower = nn.ModuleList([MLP_Block(input_dim=expert_hidden_units[-1], 81 | output_dim=1, 82 | hidden_units=tower_hidden_units, 83 | hidden_activations=hidden_activations, 84 | output_activation=None, 85 | dropout_rates=net_dropout, 86 | batch_norm=batch_norm) 87 | for _ in range(num_tasks)]) 88 | self.compile(kwargs["optimizer"], kwargs["loss"], learning_rate) 89 | self.reset_parameters() 90 | self.model_to_device() 91 | 92 | def forward(self, inputs): 93 | X = self.get_inputs(inputs) 94 | feature_emb = self.embedding_layer(X) 95 | expert_output = self.mmoe_layer(feature_emb.flatten(start_dim=1)) 96 | tower_output = [self.tower[i](expert_output[i]) for i in range(self.num_tasks)] 97 | y_pred = [self.output_activation[i](tower_output[i]) for i in range(self.num_tasks)] 98 | return_dict = {} 99 | labels = self.feature_map.labels 100 | for i in range(self.num_tasks): 101 | return_dict["{}_pred".format(labels[i])] = y_pred[i] 102 | return return_dict -------------------------------------------------------------------------------- /models/OMoE.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from fuxictr.pytorch.models import MultiTaskModel 4 | from fuxictr.pytorch.layers import FeatureEmbedding, MLP_Block 5 | from fuxictr.pytorch.torch_utils import get_activation 6 | 7 | 8 | class OMoE_Layer(nn.Module): 9 | def __init__(self, num_experts, num_tasks, input_dim, expert_hidden_units, gate_hidden_units, hidden_activations, 10 | net_dropout, batch_norm): 11 | super(OMoE_Layer, self).__init__() 12 | self.num_experts = num_experts 13 | self.num_tasks = num_tasks 14 | self.experts = nn.ModuleList([MLP_Block(input_dim=input_dim, 15 | hidden_units=expert_hidden_units, 16 | 
hidden_activations=hidden_activations, 17 | output_activation=None, 18 | dropout_rates=net_dropout, 19 | batch_norm=batch_norm) for _ in range(self.num_experts)]) 20 | self.gate = MLP_Block(input_dim=input_dim, 21 | hidden_units=gate_hidden_units, 22 | output_dim=num_experts, 23 | hidden_activations=hidden_activations, 24 | output_activation=None, 25 | dropout_rates=net_dropout, 26 | batch_norm=batch_norm) 27 | self.gate_activation = get_activation('softmax') 28 | 29 | def forward(self, x): 30 | experts_output = torch.stack([self.experts[i](x) for i in range(self.num_experts)], 31 | dim=1) # (?, num_experts, dim) 32 | omoe_output = [] 33 | for i in range(self.num_tasks): 34 | gate_output = self.gate(x) 35 | if self.gate_activation is not None: 36 | gate_output = self.gate_activation(gate_output) # (?, num_experts) 37 | omoe_output.append(torch.sum(torch.multiply(gate_output.unsqueeze(-1), experts_output), dim=1)) 38 | return omoe_output 39 | 40 | 41 | class OMoE(MultiTaskModel): 42 | def __init__(self, 43 | feature_map, 44 | task=["binary_classification"], 45 | num_tasks=1, 46 | model_id="OMoE", 47 | gpu=-1, 48 | learning_rate=1e-3, 49 | embedding_dim=10, 50 | num_experts=4, 51 | expert_hidden_units=[512, 256, 128], 52 | gate_hidden_units=[128, 64], 53 | tower_hidden_units=[128, 64], 54 | hidden_activations="ReLU", 55 | net_dropout=0, 56 | batch_norm=False, 57 | embedding_regularizer=None, 58 | net_regularizer=None, 59 | **kwargs): 60 | super(OMoE, self).__init__(feature_map, 61 | task=task, 62 | num_tasks=num_tasks, 63 | model_id=model_id, 64 | gpu=gpu, 65 | embedding_regularizer=embedding_regularizer, 66 | net_regularizer=net_regularizer, 67 | **kwargs) 68 | self.embedding_layer = FeatureEmbedding(feature_map, embedding_dim) 69 | self.omoe_layer = OMoE_Layer(num_experts=num_experts, 70 | num_tasks=self.num_tasks, 71 | input_dim=embedding_dim * feature_map.num_fields, 72 | expert_hidden_units=expert_hidden_units, 73 | gate_hidden_units=gate_hidden_units, 74 | 
hidden_activations=hidden_activations, 75 | net_dropout=net_dropout, 76 | batch_norm=batch_norm) 77 | self.tower = nn.ModuleList([MLP_Block(input_dim=expert_hidden_units[-1], 78 | output_dim=1, 79 | hidden_units=tower_hidden_units, 80 | hidden_activations=hidden_activations, 81 | output_activation=None, 82 | dropout_rates=net_dropout, 83 | batch_norm=batch_norm) 84 | for _ in range(num_tasks)]) 85 | self.compile(kwargs["optimizer"], kwargs["loss"], learning_rate) 86 | self.reset_parameters() 87 | self.model_to_device() 88 | 89 | def forward(self, inputs): 90 | X = self.get_inputs(inputs) 91 | feature_emb = self.embedding_layer(X) 92 | expert_output = self.omoe_layer(feature_emb.flatten(start_dim=1)) 93 | tower_output = [self.tower[i](expert_output[i]) for i in range(self.num_tasks)] 94 | y_pred = [self.output_activation[i](tower_output[i]) for i in range(self.num_tasks)] 95 | return_dict = {} 96 | labels = self.feature_map.labels 97 | for i in range(self.num_tasks): 98 | return_dict["{}_pred".format(labels[i])] = y_pred[i] 99 | return return_dict 100 | -------------------------------------------------------------------------------- /models/PLE.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from fuxictr.pytorch.models import MultiTaskModel 4 | from fuxictr.pytorch.layers import FeatureEmbedding, MLP_Block 5 | from fuxictr.pytorch.torch_utils import get_activation 6 | import logging 7 | 8 | class CGC_Layer(nn.Module): 9 | def __init__(self, num_shared_experts, num_specific_experts, num_tasks, input_dim, expert_hidden_units, gate_hidden_units, hidden_activations, 10 | net_dropout, batch_norm): 11 | super(CGC_Layer, self).__init__() 12 | self.num_shared_experts = num_shared_experts 13 | self.num_specific_experts = num_specific_experts 14 | self.num_tasks = num_tasks 15 | self.shared_experts = nn.ModuleList([MLP_Block(input_dim=input_dim, 16 | hidden_units=expert_hidden_units, 17 | 
hidden_activations=hidden_activations, 18 | output_activation=None, 19 | dropout_rates=net_dropout, 20 | batch_norm=batch_norm) for _ in range(self.num_shared_experts)]) 21 | self.specific_experts = nn.ModuleList([nn.ModuleList([MLP_Block(input_dim=input_dim, 22 | hidden_units=expert_hidden_units, 23 | hidden_activations=hidden_activations, 24 | output_activation=None, 25 | dropout_rates=net_dropout, 26 | batch_norm=batch_norm) for _ in range(self.num_specific_experts)]) for _ in range(num_tasks)]) 27 | self.gate = nn.ModuleList([MLP_Block(input_dim=input_dim, 28 | output_dim=num_specific_experts+num_shared_experts if i < num_tasks else num_shared_experts, 29 | hidden_units=gate_hidden_units, 30 | hidden_activations=hidden_activations, 31 | output_activation=None, 32 | dropout_rates=net_dropout, 33 | batch_norm=batch_norm) for i in range(self.num_tasks+1)]) 34 | self.gate_activation = get_activation('softmax') 35 | def forward(self, x, require_gate=False): 36 | """ 37 | x: list, len(x)==num_tasks+1 38 | """ 39 | specific_expert_outputs = [] 40 | shared_expert_outputs = [] 41 | # specific experts 42 | for i in range(self.num_tasks): 43 | task_expert_outputs = [] 44 | for j in range(self.num_specific_experts): 45 | task_expert_outputs.append(self.specific_experts[i][j](x[i])) 46 | specific_expert_outputs.append(task_expert_outputs) 47 | # shared experts 48 | for i in range(self.num_shared_experts): 49 | shared_expert_outputs.append(self.shared_experts[i](x[-1])) 50 | # gate 51 | cgc_outputs = [] 52 | gates = [] 53 | for i in range(self.num_tasks+1): 54 | if i < self.num_tasks: 55 | # for specific experts 56 | gate_input = torch.stack(specific_expert_outputs[i] + shared_expert_outputs, dim=1) # (?, num_specific_experts+num_shared_experts, dim) 57 | gate = self.gate_activation(self.gate[i](x[i])) # (?, num_specific_experts+num_shared_experts) 58 | gates.append(gate.mean(0)) 59 | cgc_output = torch.sum(gate.unsqueeze(-1) * gate_input, dim=1) # (?, dim) 60 | 
                cgc_outputs.append(cgc_output)
            else:
                # for shared experts
                gate_input = torch.stack(shared_expert_outputs, dim=1) # (?, num_shared_experts, dim)
                gate = self.gate_activation(self.gate[i](x[-1])) # (?, num_shared_experts)
                # gates.append(gate.mean(0))
                cgc_output = torch.sum(gate.unsqueeze(-1) * gate_input, dim=1) # (?, dim)
                cgc_outputs.append(cgc_output)
        if require_gate:
            return cgc_outputs, gates
        else:
            return cgc_outputs

class PLE(MultiTaskModel):
    """Progressive Layered Extraction: stacked CGC layers with task-specific
    and shared experts, then one prediction tower per task."""

    def __init__(self,
                 feature_map,
                 task=["binary_classification"],
                 num_tasks=1,
                 model_id="PLE",
                 gpu=-1,
                 learning_rate=1e-3,
                 embedding_dim=10,
                 num_layers=1,
                 num_shared_experts=1,
                 num_specific_experts=1,
                 expert_hidden_units=[512, 256, 128],
                 gate_hidden_units=[128, 64],
                 tower_hidden_units=[128, 64],
                 hidden_activations="ReLU",
                 net_dropout=0,
                 batch_norm=False,
                 embedding_regularizer=None,
                 net_regularizer=None,
                 **kwargs):
        super(PLE, self).__init__(feature_map,
                                  task=task,
                                  num_tasks=num_tasks,
                                  model_id=model_id,
                                  gpu=gpu,
                                  embedding_regularizer=embedding_regularizer,
                                  net_regularizer=net_regularizer,
                                  **kwargs)
        self.embedding_layer = FeatureEmbedding(feature_map, embedding_dim)
        self.num_layers = num_layers
        self.num_shared_experts = num_shared_experts
        self.num_specific_experts = num_specific_experts
        # First CGC layer consumes flattened embeddings; deeper layers consume
        # the previous layer's expert output dimension.
        self.cgc_layers = nn.ModuleList([CGC_Layer(num_shared_experts,
                                                   num_specific_experts,
                                                   num_tasks,
                                                   input_dim= embedding_dim * feature_map.num_fields if i==0 else expert_hidden_units[-1],
                                                   expert_hidden_units= expert_hidden_units,
                                                   gate_hidden_units=gate_hidden_units,
                                                   hidden_activations=hidden_activations,
                                                   net_dropout=net_dropout,
                                                   batch_norm=batch_norm) for i in range(self.num_layers)])
        self.tower = nn.ModuleList([MLP_Block(input_dim=expert_hidden_units[-1],
                                              output_dim=1,
                                              hidden_units=tower_hidden_units,
                                              hidden_activations=hidden_activations,
                                              output_activation=None,
                                              dropout_rates=net_dropout,
                                              batch_norm=batch_norm)
                                    for _ in range(num_tasks)])
        self.compile(kwargs["optimizer"], kwargs["loss"], learning_rate)
        self.reset_parameters()
        self.model_to_device()

    def forward(self, inputs):
        X = self.get_inputs(inputs)
        feature_emb = self.embedding_layer(X)
        # num_tasks task inputs + 1 shared input, all starting from the same embedding.
        cgc_inputs = [feature_emb.flatten(start_dim=1) for _ in range(self.num_tasks+1)]
        for i in range(self.num_layers):
            cgc_outputs = self.cgc_layers[i](cgc_inputs)
            cgc_inputs = cgc_outputs
        tower_output = [self.tower[i](cgc_outputs[i]) for i in range(self.num_tasks)]
        y_pred = [self.output_activation[i](tower_output[i]) for i in range(self.num_tasks)]
        return_dict = {}
        labels = self.feature_map.labels
        for i in range(self.num_tasks):
            return_dict["{}_pred".format(labels[i])] = y_pred[i]
        return return_dict

# --------------------------------------------------------------------------------
# /models/STEM.py
# --------------------------------------------------------------------------------
import torch
from torch import nn
from fuxictr.pytorch.models import MultiTaskModel
from fuxictr.pytorch.layers import FeatureEmbedding, MLP_Block
from fuxictr.pytorch.torch_utils import get_activation
import numpy as np
import logging

class STEM_Layer(nn.Module):
    """STEM layer: like CGC, but every task gate sees ALL tasks' specific
    experts (other tasks' outputs detached, i.e. stop-gradient) plus the
    shared experts."""

    def __init__(self, num_shared_experts, num_specific_experts, num_tasks, input_dim, expert_hidden_units, gate_hidden_units, hidden_activations,
                 net_dropout, batch_norm):
        super(STEM_Layer, self).__init__()
        self.num_shared_experts = num_shared_experts
        self.num_specific_experts = num_specific_experts
        self.num_tasks = num_tasks
        self.shared_experts = nn.ModuleList([MLP_Block(input_dim=input_dim,
                                                       hidden_units=expert_hidden_units,
                                                       hidden_activations=hidden_activations,
                                                       output_activation=None,
                                                       dropout_rates=net_dropout,
                                                       batch_norm=batch_norm) for _ in range(self.num_shared_experts)])
        self.specific_experts = nn.ModuleList([nn.ModuleList([MLP_Block(input_dim=input_dim,
                                                                        hidden_units=expert_hidden_units,
                                                                        hidden_activations=hidden_activations,
                                                                        output_activation=None,
                                                                        dropout_rates=net_dropout,
                                                                        batch_norm=batch_norm) for _ in range(self.num_specific_experts)]) for _ in range(num_tasks)])
        # Every gate (num_tasks task gates + 1 shared gate) scores all
        # num_specific_experts*num_tasks specific experts plus the shared ones.
        self.gate = nn.ModuleList([MLP_Block(input_dim=input_dim,
                                             output_dim=num_specific_experts*num_tasks+num_shared_experts,
                                             hidden_units=gate_hidden_units,
                                             hidden_activations=hidden_activations,
                                             output_activation=None,
                                             dropout_rates=net_dropout,
                                             batch_norm=batch_norm) for i in range(self.num_tasks+1)])
        self.gate_activation = get_activation('softmax')

    def forward(self, x, return_gate=False):
        """
        x: list, len(x)==num_tasks+1
        """
        specific_expert_outputs = []
        shared_expert_outputs = []
        # specific experts
        for i in range(self.num_tasks):
            task_expert_outputs = []
            for j in range(self.num_specific_experts):
                task_expert_outputs.append(self.specific_experts[i][j](x[i]))
            specific_expert_outputs.append(task_expert_outputs)
        # shared experts
        for i in range(self.num_shared_experts):
            shared_expert_outputs.append(self.shared_experts[i](x[-1]))

        # gate
        stem_outputs = []
        stem_gates = []
        for i in range(self.num_tasks+1):
            if i < self.num_tasks:
                # for specific experts
                gate_input = []
                for j in range(self.num_tasks):
                    if j == i:
                        gate_input.extend(specific_expert_outputs[j])
                    else:
                        # Stop-gradient on other tasks' specific experts so task i
                        # can read but not update them.
                        specific_expert_outputs_j = specific_expert_outputs[j]
                        specific_expert_outputs_j = [out.detach() for out in specific_expert_outputs_j]
                        gate_input.extend(specific_expert_outputs_j)
                gate_input.extend(shared_expert_outputs)
                gate_input = torch.stack(gate_input, dim=1) # (?, num_specific_experts*num_tasks+num_shared_experts, dim)
                # Task gate conditions on task input + shared input.
                gate = self.gate_activation(self.gate[i](x[i]+x[-1])) # (?, num_specific_experts*num_tasks+num_shared_experts)
                if return_gate:
                    # Collapse per-expert weights into per-task (+shared) totals for logging.
                    specific_gate = gate[:,:self.num_specific_experts*self.num_tasks].mean(0)
                    task_gate = torch.chunk(specific_gate, chunks=self.num_tasks)
                    specific_gate_list = []
                    for tg in task_gate:
                        specific_gate_list.append(torch.sum(tg))
                    shared_gate = gate[:,-self.num_shared_experts:].mean(0).sum()
                    target_task_gate = torch.stack(specific_gate_list+[shared_gate],dim=0).view(-1) # (num_task+1,1)
                    assert len(target_task_gate) == self.num_tasks+1
                    stem_gates.append(target_task_gate)
                stem_output = torch.sum(gate.unsqueeze(-1) * gate_input, dim=1) # (?, dim)
                stem_outputs.append(stem_output)
            else:
                # for shared experts
                gate_input = []
                for j in range(self.num_tasks):
                    gate_input.extend(specific_expert_outputs[j])
                gate_input.extend(shared_expert_outputs)
                gate_input = torch.stack(gate_input, dim=1) # (?, num_specific_experts*num_tasks+num_shared_experts, dim)
                gate = self.gate_activation(self.gate[i](x[-1])) # (?, num_specific_experts*num_tasks+num_shared_experts)
                stem_output = torch.sum(gate.unsqueeze(-1) * gate_input, dim=1) # (?, dim)
                stem_outputs.append(stem_output)

        if return_gate:
            return stem_outputs, stem_gates
        else:
            return stem_outputs


class STEM(MultiTaskModel):
    """Shared and Task-specific EMbeddings model: each task (plus the shared
    branch) gets its own embedding slice, refined by stacked STEM layers."""

    def __init__(self,
                 feature_map,
                 task=["binary_classification"],
                 num_tasks=1,
                 model_id="STEM",
                 gpu=-1,
                 learning_rate=1e-3,
                 embedding_dim=10,
                 num_layers=1,
                 num_shared_experts=1,
                 num_specific_experts=1,
                 expert_hidden_units=[512, 256, 128],
                 gate_hidden_units=[128, 64],
                 tower_hidden_units=[128, 64],
                 hidden_activations="ReLU",
                 net_dropout=0,
                 batch_norm=False,
                 embedding_regularizer=None,
                 net_regularizer=None,
                 **kwargs):
        super(STEM, self).__init__(feature_map,
                                   task=task,
                                   num_tasks=num_tasks,
                                   model_id=model_id,
                                   gpu=gpu,
                                   embedding_regularizer=embedding_regularizer,
                                   net_regularizer=net_regularizer,
                                   **kwargs)
        # One embedding slice of size embedding_dim per task plus one shared slice.
        self.embedding_layer = FeatureEmbedding(feature_map, embedding_dim * (self.num_tasks+1))
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.num_specific_experts = num_specific_experts
        self.num_shared_experts = num_shared_experts
        self.stem_layers = nn.ModuleList([STEM_Layer(num_shared_experts,
                                                     num_specific_experts,
                                                     num_tasks,
                                                     input_dim= self.embedding_dim * feature_map.num_fields if i==0 else expert_hidden_units[-1],
                                                     expert_hidden_units= expert_hidden_units,
                                                     gate_hidden_units=gate_hidden_units,
                                                     hidden_activations=hidden_activations,
                                                     net_dropout=net_dropout,
                                                     batch_norm=batch_norm) for i in range(self.num_layers)])
        self.tower = nn.ModuleList([MLP_Block(input_dim=expert_hidden_units[-1],
                                              output_dim=1,
                                              hidden_units=tower_hidden_units,
                                              hidden_activations=hidden_activations,
                                              output_activation=None,
                                              dropout_rates=net_dropout,
                                              batch_norm=batch_norm)
                                    for _ in range(num_tasks)])
        self.compile(kwargs["optimizer"], kwargs["loss"], learning_rate)
        self.reset_parameters()
        self.model_to_device()

    def forward(self, inputs):
        X = self.get_inputs(inputs)
        feature_emb = self.embedding_layer(X) # (?, num_field, D)
        # Split the wide embedding into num_tasks+1 per-branch slices.
        feature_embs = feature_emb.split(self.embedding_dim, dim=2)
        stem_inputs = [feature_embs[i].flatten(start_dim=1) for i in range(self.num_tasks+1)]
        for i in range(self.num_layers):
            stem_outputs = self.stem_layers[i](stem_inputs)
            stem_inputs = stem_outputs
        tower_output = [self.tower[i](stem_outputs[i]) for i in range(self.num_tasks)]
        y_pred = [self.output_activation[i](tower_output[i]) for i in range(self.num_tasks)]
        return_dict = {}
        labels = self.feature_map.labels
        for i in range(self.num_tasks):
            return_dict["{}_pred".format(labels[i])] = y_pred[i]
        return return_dict

# --------------------------------------------------------------------------------
# /models/SharedBottom.py
# --------------------------------------------------------------------------------
import torch
from torch import nn
from fuxictr.pytorch.models import MultiTaskModel
from fuxictr.pytorch.layers import FeatureEmbedding, MLP_Block

class SharedBottom(MultiTaskModel):
    """Hard-parameter-sharing baseline: one bottom MLP shared by all tasks,
    one prediction tower per task."""

    def __init__(self,
                 feature_map,
                 model_id="SharedBottom",
                 gpu=-1,
                 task=["binary_classification"],
                 num_tasks=1,
                 loss_weight='EQ',
                 learning_rate=1e-3,
                 embedding_dim=10,
                 bottom_hidden_units=[64, 64, 64],
                 tower_hidden_units=[64, ],
                 hidden_activations="ReLU",
                 net_dropout=0,
                 batch_norm=False,
                 embedding_regularizer=None,
                 net_regularizer=None,
                 **kwargs):
        super(SharedBottom, self).__init__(feature_map,
                                           task=task,
                                           loss_weight=loss_weight,
                                           num_tasks=num_tasks,
                                           model_id=model_id,
                                           gpu=gpu,
                                           embedding_regularizer=embedding_regularizer,
                                           net_regularizer=net_regularizer,
                                           **kwargs)
        self.embedding_layer = FeatureEmbedding(feature_map, embedding_dim)
        self.bottom = MLP_Block(input_dim=embedding_dim * feature_map.num_fields,
                                hidden_units=bottom_hidden_units,
                                hidden_activations=hidden_activations,
                                output_activation=None,
                                dropout_rates=net_dropout,
                                batch_norm=batch_norm)
        self.tower = nn.ModuleList([MLP_Block(input_dim=bottom_hidden_units[-1],
                                              output_dim=1,
                                              hidden_units=tower_hidden_units,
                                              hidden_activations=hidden_activations,
                                              output_activation=None,
                                              dropout_rates=net_dropout,
                                              batch_norm=batch_norm)
                                    for _ in range(num_tasks)])
self.compile(kwargs["optimizer"], kwargs["loss"], learning_rate) 49 | self.reset_parameters() 50 | self.model_to_device() 51 | 52 | def forward(self, inputs): 53 | X = self.get_inputs(inputs) 54 | feature_emb = self.embedding_layer(X) 55 | bottom_output = self.bottom(feature_emb.flatten(start_dim=1)) # (?, bottom_hidden_units[-1]) 56 | tower_output = [self.tower[i](bottom_output) for i in range(self.num_tasks)] 57 | y_pred = [self.output_activation[i](tower_output[i]) for i in range(self.num_tasks)] 58 | return_dict = {} 59 | labels = self.feature_map.labels 60 | for i in range(self.num_tasks): 61 | return_dict["{}_pred".format(labels[i])] = y_pred[i] 62 | return return_dict -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .SharedBottom import SharedBottom 2 | from .MMoE import MMoE 3 | from .PLE import PLE 4 | from .AITM import AITM 5 | from .ESMM import ESMM 6 | from .OMoE import OMoE 7 | from .STEM import STEM -------------------------------------------------------------------------------- /run_expid.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.chdir(os.path.dirname(os.path.realpath(__file__))) 3 | import sys 4 | import logging 5 | # import fuxictr_version 6 | from fuxictr import datasets 7 | from datetime import datetime 8 | from fuxictr.utils import load_config, set_logger, print_to_json, print_to_list 9 | from fuxictr.features import FeatureMap 10 | from fuxictr.pytorch.torch_utils import seed_everything 11 | from fuxictr.pytorch.dataloaders import H5DataLoader 12 | from fuxictr.preprocess import FeatureProcessor, build_dataset 13 | import models as model_zoo 14 | import gc 15 | import argparse 16 | import os 17 | from pathlib import Path 18 | from fuxictr.pytorch.models import MultiTaskModel, BaseModel 19 | 20 | 21 | 22 | if __name__ == '__main__': 23 | ''' 
Usage: python run_expid.py --config {config_dir} --expid {experiment_id} --gpu {gpu_device_id} 24 | ''' 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--config', type=str, default='./config/', help='The config directory.') 27 | parser.add_argument('--expid', type=str, default='DeepFM_test', help='The experiment id to run.') 28 | parser.add_argument('--gpu', type=int, default=-1, help='The gpu index, -1 for cpu') 29 | args = vars(parser.parse_args()) 30 | 31 | experiment_id = args['expid'] 32 | params = load_config(args['config'], experiment_id) 33 | params['gpu'] = args['gpu'] 34 | set_logger(params) 35 | logging.info("Params: " + print_to_json(params)) 36 | seed_everything(seed=params['seed']) 37 | 38 | data_dir = os.path.join(params['data_root'], params['dataset_id']) 39 | feature_map_json = os.path.join(data_dir, "feature_map.json") 40 | if params["data_format"] == "csv": 41 | # Build feature_map and transform h5 data 42 | feature_encoder = FeatureProcessor(**params) 43 | params["train_data"], params["valid_data"], params["test_data"] = \ 44 | build_dataset(feature_encoder, **params) 45 | 46 | feature_map = FeatureMap(params['dataset_id'], data_dir) 47 | feature_map.load(feature_map_json, params) 48 | logging.info("Feature specs: " + print_to_json(feature_map.features)) 49 | if 'task_labels' in params: 50 | if isinstance(params['task_labels'],list): 51 | task_labels = params['task_labels'] 52 | else: 53 | task_labels = list(params['task_labels']) 54 | feature_map.labels = task_labels 55 | feature_map.set_column_index() 56 | 57 | model_class = getattr(model_zoo, params['model']) 58 | model = model_class(feature_map, **params) 59 | model.count_parameters() # print number of parameters used in model 60 | 61 | train_gen, valid_gen = H5DataLoader(feature_map, stage='train', **params).make_iterator() 62 | model.fit(train_gen, validation_data=valid_gen, **params) 63 | 64 | logging.info('****** Validation evaluation ******') 65 | valid_result = 
model.evaluate(valid_gen) 66 | del train_gen, valid_gen 67 | gc.collect() 68 | 69 | logging.info('******** Test evaluation ********') 70 | test_gen = H5DataLoader(feature_map, stage='test', **params).make_iterator() 71 | test_result = {} 72 | if test_gen: 73 | test_result = model.evaluate(test_gen) 74 | 75 | result_filename = os.path.join('results',Path(args['config']).name.replace(".yaml", "") + '.csv') 76 | with open(result_filename, 'a+') as fw: 77 | fw.write(' {},[command] python {},[exp_id] {},[dataset_id] {},[train] {},[val] {},[test] {}\n' \ 78 | .format(datetime.now().strftime('%Y%m%d-%H%M%S'), 79 | ' '.join(sys.argv), experiment_id, params['dataset_id'], 80 | "N.A.", print_to_list(valid_result), print_to_list(test_result))) 81 | -------------------------------------------------------------------------------- /run_expid_list.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ========================================================================= 16 | 17 | 18 | from datetime import datetime 19 | import gc 20 | import argparse 21 | import fuxictr_version 22 | from fuxictr import autotuner 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--config', type=str, default='../config/tuner_config_LR_avazu_01/', 27 | help='The config file for para tuning.') 28 | parser.add_argument('--gpu', nargs='+', default=[-1], help='The list of gpu indexes, -1 for cpu.') 29 | args = vars(parser.parse_args()) 30 | gpu_list = args['gpu'] 31 | config_dir = args['config'] 32 | 33 | # generate parameter space combinations 34 | autotuner.grid_search(config_dir, gpu_list) 35 | 36 | -------------------------------------------------------------------------------- /run_param_tuner.py: -------------------------------------------------------------------------------- 1 | # ========================================================================= 2 | # Copyright (C) 2022. Huawei Technologies Co., Ltd. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ========================================================================= 16 | 17 | from datetime import datetime 18 | import gc 19 | import argparse 20 | # import fuxictr_version 21 | from fuxictr import autotuner 22 | 23 | if __name__ == '__main__': 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument('--config', type=str, default='../config/tuner_config.yaml', 26 | help='The config file for para tuning.') 27 | parser.add_argument('--tag', type=str, default=None, help='Use the tag to determine which expid to run (e.g. 001 for the first expid).') 28 | parser.add_argument('--gpu', nargs='+', default=[-1], help='The list of gpu indexes, -1 for cpu.') 29 | args = vars(parser.parse_args()) 30 | gpu_list = args['gpu'] 31 | expid_tag = args['tag'] 32 | 33 | # generate parameter space combinations 34 | config_dir = autotuner.enumerate_params(args['config']) 35 | autotuner.grid_search(config_dir, gpu_list, expid_tag) 36 | 37 | --------------------------------------------------------------------------------