├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples ├── amazon │ ├── amazon-lightning.ipynb │ ├── amazon.ipynb │ ├── prepare_neg.ipynb │ └── simple_benchmark.png └── movielens │ ├── ml-1m │ ├── README │ ├── preprocess.ipynb │ ├── test.csv │ └── train.csv │ └── movielens-1m.ipynb ├── prediction_flow ├── __init__.py ├── features │ ├── __init__.py │ ├── base.py │ ├── category_feature.py │ ├── features.py │ ├── number_feature.py │ ├── sequence_feature.py │ └── tests │ │ ├── __init__.py │ │ └── test_features.py ├── metrics │ └── __init__.py ├── pytorch │ ├── __init__.py │ ├── base.py │ ├── data │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_dataset.py │ ├── deepfm.py │ ├── dien.py │ ├── din.py │ ├── dnn.py │ ├── functions.py │ ├── interest_net.py │ ├── nn │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── fm.py │ │ ├── interest.py │ │ ├── mlp.py │ │ ├── pooling.py │ │ ├── rnn.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_attention.py │ │ │ ├── test_fm.py │ │ │ ├── test_interest.py │ │ │ ├── test_mlp.py │ │ │ ├── test_pooling.py │ │ │ └── test_rnn.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_deepfm.py │ │ ├── test_dien.py │ │ ├── test_din.py │ │ ├── test_dnn.py │ │ ├── test_wide_deep.py │ │ └── utils.py │ ├── utils.py │ └── wide_deep.py ├── transformers │ ├── __init__.py │ └── column │ │ ├── __init__.py │ │ ├── base.py │ │ ├── category_encoder.py │ │ ├── column_flow.py │ │ ├── log_transformer.py │ │ ├── sequence_encoder.py │ │ ├── standard_scaler.py │ │ └── tests │ │ ├── __init__.py │ │ ├── test_category_encoder.py │ │ ├── test_column_flow.py │ │ ├── test_log_transformer.py │ │ ├── test_sequence_encoder.py │ │ └── test_standard_scaler.py └── utils │ └── __init__.py ├── requirements.txt ├── setup.cfg └── setup.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: GitHub-HongweiZhang 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: GitHub-HongweiZhang 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | examples/movielens/ml-1m/*.dat 107 | examples/amazon/local* 108 | examples/amazon/*info 109 | tmp/ 110 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | script: 7 | - pytest 8 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | TODO 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Hongwei Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the README 2 | include *.md 3 | 4 | # Include the license file 5 | include LICENSE 6 | 7 | # Include the Requirements 8 | include requirements.txt 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/GitHub-HongweiZhang/prediction-flow.svg?branch=master)](https://travis-ci.org/GitHub-HongweiZhang/prediction-flow) 2 | 3 | [![PyPI version](https://badge.fury.io/py/prediction-flow.svg)](https://badge.fury.io/py/prediction-flow) 4 | 5 | # prediction-flow 6 | **prediction-flow** is a Python package providing modern **Deep-Learning** 7 | based CTR models, implemented with **PyTorch**. 8 | 9 | ## how to use 10 | * Install using pip. 11 | ``` 12 | pip install prediction-flow 13 | ``` 14 | 15 | ## feature 16 | ### how to define feature 17 | Every feature type takes two parameters, name and column_flow. 18 | The name parameter is used to look up the raw column data in the input data frame. 19 | The column_flow parameter is a single transformer or a list of transformers. 20 | Transformers pre-process the column data before the model is trained. 21 | 22 | * dense number feature 23 | ``` 24 | Number('age', StandardScaler()) 25 | Number('ctr', None) 26 | ``` 27 | * sparse category feature 28 | ``` 29 | Category('movieId', CategoryEncoder(min_cnt=1)) 30 | ``` 31 | * variable-length sequence feature 32 | ``` 33 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)) 34 | ``` 35 | 36 | ## transformer 37 | The following transformers are currently provided. 38 | 39 | | transformer | supported feature type | detail | 40 | |--|--|--| 41 | | StandardScaler | Number | Wrapper of scikit-learn's StandardScaler. Null values must be filled in advance. | 42 | | LogTransformer | Number | Log scaler. Null values must be filled in advance. | 43 | | CategoryEncoder | Category | Converts str values to int ids. Null values must be filled in advance using '\_\_UNKNOWN\_\_'. | 44 | | SequenceEncoder | Sequence | Converts sequences of str values to int ids. Null values must be filled in advance using '\_\_UNKNOWN\_\_'. |
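Putting the pieces together, a minimal sketch (the data frame and its values below are made up for illustration; Number, Category, Sequence, the transformers, and the Features container are all provided by this package):
```
import pandas as pd

from prediction_flow.features import Number, Category, Sequence, Features
from prediction_flow.transformers.column import (
    StandardScaler, CategoryEncoder, SequenceEncoder)

# Toy frame; null values must be filled before fitting (see the table above).
df = pd.DataFrame({
    'age': [23, 35, 41],
    'movieId': ['1193', '661', '914'],
    'genres': ['Drama|War', 'Animation|Comedy', 'Musical|Romance']})

features = Features(
    number_features=[Number('age', StandardScaler())],
    category_features=[Category('movieId', CategoryEncoder(min_cnt=1))],
    sequence_features=[Sequence('genres', SequenceEncoder(sep='|', min_cnt=1))])

features.fit(df)
transformed = features.transform(df)  # dict of numpy arrays keyed by feature name
```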
 45 | 46 | ## model 47 | 48 | | model | reference | 49 | |--|--| 50 | | DNN | - | 51 | | Wide & Deep | [DLRS 2016][Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792.pdf) | 52 | | DeepFM | [IJCAI 2017][DeepFM: A Factorization-Machine based Neural Network for CTR Prediction](http://www.ijcai.org/proceedings/2017/0239.pdf) | 53 | | DIN | [KDD 2018][Deep Interest Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1706.06978.pdf) | 54 | | DNN + GRU + GRU + Attention | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 55 | | DNN + GRU + AIGRU | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 56 | | DNN + GRU + AGRU | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 57 | | DNN + GRU + AUGRU | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 58 | | DIEN | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 59 | | OTHER | TODO | 60 | 61 | ## example 62 | ### movielens-1M 63 | **This dataset is only used to verify that the code runs; the 64 | resulting accuracy is not meaningful.** 65 | * Prepare the dataset. [preprocess.ipynb](examples/movielens/ml-1m/preprocess.ipynb) 66 | * Run the model. [movielens-1m.ipynb](examples/movielens/movielens-1m.ipynb) 67 | 68 | ### amazon 69 | * Prepare the dataset. [prepare_neg.ipynb](examples/amazon/prepare_neg.ipynb) 70 | * Run the model. 71 | [amazon.ipynb](examples/amazon/amazon.ipynb) 72 | * An example using [pytorch-lightning](https://github.com/williamFalcon/pytorch-lightning). 73 | [amazon-lightning.ipynb](examples/amazon/amazon-lightning.ipynb) 74 | 75 | **accuracy** 76 | 77 | ![benchmark](examples/amazon/simple_benchmark.png) 78 | 79 | ## acknowledgement and reference 80 | * The feature design follows [DeepCTR](https://github.com/shenweichen/DeepCTR): 81 | features are divided into dense (class Number), sparse (class Category), 82 | and sequence (class Sequence) types. 83 | -------------------------------------------------------------------------------- /examples/amazon/amazon-lightning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Script to run the experiments described in the paper: Deep Interest Evolution Network for Click-Through Rate Prediction\n", 8 | "\n", 9 | "## how to run\n", 10 | "\n", 11 | "1. Please run prepare_neg.ipynb first."
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "SEQ_MAX_LEN = 100 # maximum sequence length\n", 21 | "BATCH_SIZE = 128\n", 22 | "EMBEDDING_DIM = 18\n", 23 | "DNN_HIDDEN_SIZE = [200, 80]\n", 24 | "DNN_DROPOUT = 0.0\n", 25 | "TEST_RUN = False\n", 26 | "EPOCH = 2\n", 27 | "SEED = 10" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "%matplotlib inline\n", 37 | "\n", 38 | "import itertools\n", 39 | "from collections import Counter, OrderedDict\n", 40 | "\n", 41 | "import random\n", 42 | "import numpy as np\n", 43 | "import pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "\n", 46 | "import torch\n", 47 | "import torch.nn as nn\n", 48 | "import torch.optim as optim\n", 49 | "import torch.nn.functional as F\n", 50 | "from sklearn.metrics import roc_auc_score\n", 51 | "\n", 52 | "from prediction_flow.features import Number, Category, Sequence, Features\n", 53 | "from prediction_flow.transformers.column import (\n", 54 | " StandardScaler, CategoryEncoder, SequenceEncoder)\n", 55 | "\n", 56 | "from prediction_flow.pytorch.data import Dataset\n", 57 | "from prediction_flow.pytorch import WideDeep, DeepFM, DNN, DIN, DIEN, AttentionGroup\n", 58 | "\n", 59 | "from prediction_flow.pytorch.functions import fit, predict, create_dataloader_fn" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "" 71 | ] 72 | }, 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "random.seed(SEED)\n", 80 | "np.random.seed(SEED)\n", 81 | "torch.manual_seed(SEED)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "train_df = pd.read_csv(\n", 91 | " \"./local_train.csv\", sep='\\t')\n", 92 | "\n", 93 | "valid_df = pd.read_csv(\n", 94 | " \"./local_test.csv\", sep='\\t')" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "if TEST_RUN:\n", 104 | " train_df = train_df.sample(1000)\n", 105 | " valid_df = valid_df.sample(1000)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/html": [ 116 | "
\n", 117 | "\n", 130 | "\n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
labeluidmidcathist_midshist_catsneg_hist_midsneg_hist_cats
00AZPJ9LUT0FEPYB00AMNNTIALiterature & Fiction0307744434\u00020062248391\u00020470530707\u00020978924622\u000215...Books\u0002Books\u0002Books\u0002Books\u0002Books0786890487\u00020618539069\u0002B001IDZJO0\u00021603421548\u000203...Books\u0002Books\u0002Books\u0002Books\u0002Books
11AZPJ9LUT0FEPY0800731603Books0307744434\u00020062248391\u00020470530707\u00020978924622\u000215...Books\u0002Books\u0002Books\u0002Books\u0002BooksB00BEFIHOG\u00021402245270\u00020670031747\u00020615785182\u000214...Literary\u0002Books\u0002Books\u0002Books\u0002Books
20A2NRV79GKAU726B003NNV10ORussian0814472869\u00020071462074\u00021583942300\u00020812538366\u0002B0...Books\u0002Books\u0002Books\u0002Books\u0002Baking\u0002Books\u0002BooksB00LQABRTG\u0002087830178X\u00020991543009\u0002071533154X\u000203...Neuropsychology\u0002Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002...
31A2NRV79GKAU726B000UWJ91OBooks0814472869\u00020071462074\u00021583942300\u00020812538366\u0002B0...Books\u0002Books\u0002Books\u0002Books\u0002Baking\u0002Books\u0002Books1595328149\u00021591797810\u00020451233018\u00020373771355\u000214...Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002Contempora...
40A2GEQVDX2LL4V30321334094Books0743596870\u00020374280991\u00021439140634\u00020976475731Books\u0002Books\u0002Books\u0002Books0316159735\u0002156718359X\u00020786812400\u00020062506110Books\u0002Books\u0002Books\u0002Books
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " label uid mid cat \\\n", 206 | "0 0 AZPJ9LUT0FEPY B00AMNNTIA Literature & Fiction \n", 207 | "1 1 AZPJ9LUT0FEPY 0800731603 Books \n", 208 | "2 0 A2NRV79GKAU726 B003NNV10O Russian \n", 209 | "3 1 A2NRV79GKAU726 B000UWJ91O Books \n", 210 | "4 0 A2GEQVDX2LL4V3 0321334094 Books \n", 211 | "\n", 212 | " hist_mids \\\n", 213 | "0 0307744434\u00020062248391\u00020470530707\u00020978924622\u000215... \n", 214 | "1 0307744434\u00020062248391\u00020470530707\u00020978924622\u000215... \n", 215 | "2 0814472869\u00020071462074\u00021583942300\u00020812538366\u0002B0... \n", 216 | "3 0814472869\u00020071462074\u00021583942300\u00020812538366\u0002B0... \n", 217 | "4 0743596870\u00020374280991\u00021439140634\u00020976475731 \n", 218 | "\n", 219 | " hist_cats \\\n", 220 | "0 Books\u0002Books\u0002Books\u0002Books\u0002Books \n", 221 | "1 Books\u0002Books\u0002Books\u0002Books\u0002Books \n", 222 | "2 Books\u0002Books\u0002Books\u0002Books\u0002Baking\u0002Books\u0002Books \n", 223 | "3 Books\u0002Books\u0002Books\u0002Books\u0002Baking\u0002Books\u0002Books \n", 224 | "4 Books\u0002Books\u0002Books\u0002Books \n", 225 | "\n", 226 | " neg_hist_mids \\\n", 227 | "0 0786890487\u00020618539069\u0002B001IDZJO0\u00021603421548\u000203... \n", 228 | "1 B00BEFIHOG\u00021402245270\u00020670031747\u00020615785182\u000214... \n", 229 | "2 B00LQABRTG\u0002087830178X\u00020991543009\u0002071533154X\u000203... \n", 230 | "3 1595328149\u00021591797810\u00020451233018\u00020373771355\u000214... \n", 231 | "4 0316159735\u0002156718359X\u00020786812400\u00020062506110 \n", 232 | "\n", 233 | " neg_hist_cats \n", 234 | "0 Books\u0002Books\u0002Books\u0002Books\u0002Books \n", 235 | "1 Literary\u0002Books\u0002Books\u0002Books\u0002Books \n", 236 | "2 Neuropsychology\u0002Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002... \n", 237 | "3 Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002Contempora... \n", 238 | "4 Books\u0002Books\u0002Books\u0002Books " 239 | ] 240 | }, 241 | "execution_count": 6, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "train_df.head()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 7, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/html": [ 258 | "
\n", 259 | "\n", 272 | "\n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | "
labeluidmidcathist_midshist_catsneg_hist_midsneg_hist_cats
00A3BI7R43VUZ1TYB00JNHU0T2Literature & Fiction0989464105\u0002B00B01691C\u00021477809732\u00021608442845Books\u0002Literature & Fiction\u0002Books\u0002Books0899576168\u0002B0056ATROO\u00020446600474\u00020615209459Books\u0002Sleep\u0002Books\u0002Books
11A3BI7R43VUZ1TY0989464121Books0989464105\u0002B00B01691C\u00021477809732\u00021608442845Books\u0002Literature & Fiction\u0002Books\u0002Books0373527721\u00020981854524\u00020470404159\u0002B00BWKBSOYBooks\u0002Books\u0002Books\u0002Literature & Fiction
20A2Z3AHJPXG3ZNPB0072YSPJ0Literature & Fiction1478310960\u00021492231452\u00021477603425\u0002B00FRKLA6QBooks\u0002Books\u0002Books\u0002UrbanB00EQAEA60\u0002B007D64VX6\u0002188547766X\u00021590172477Literature & Fiction\u0002Quran\u0002Books\u0002Books
31A2Z3AHJPXG3ZNPB00G4I4I5UUrban1478310960\u00021492231452\u00021477603425\u0002B00FRKLA6QBooks\u0002Books\u0002Books\u0002Urban1583942475\u00021585678600\u00021570199221\u00020312373090Books\u0002Books\u0002Books\u0002Books
40A2KDDPJUNWC5CA0316228532Books0141326085\u0002031026622X\u00020316077046\u00020988649179\u000214...Books\u0002Books\u0002Books\u0002Books\u0002BooksB0077FOPFC\u00021594744106\u0002B00DFGN1DE\u00020972259112\u0002B0...Ghosts\u0002Books\u0002Erotica\u0002Books\u0002Soups & Stews
\n", 344 | "
" 345 | ], 346 | "text/plain": [ 347 | " label uid mid cat \\\n", 348 | "0 0 A3BI7R43VUZ1TY B00JNHU0T2 Literature & Fiction \n", 349 | "1 1 A3BI7R43VUZ1TY 0989464121 Books \n", 350 | "2 0 A2Z3AHJPXG3ZNP B0072YSPJ0 Literature & Fiction \n", 351 | "3 1 A2Z3AHJPXG3ZNP B00G4I4I5U Urban \n", 352 | "4 0 A2KDDPJUNWC5CA 0316228532 Books \n", 353 | "\n", 354 | " hist_mids \\\n", 355 | "0 0989464105\u0002B00B01691C\u00021477809732\u00021608442845 \n", 356 | "1 0989464105\u0002B00B01691C\u00021477809732\u00021608442845 \n", 357 | "2 1478310960\u00021492231452\u00021477603425\u0002B00FRKLA6Q \n", 358 | "3 1478310960\u00021492231452\u00021477603425\u0002B00FRKLA6Q \n", 359 | "4 0141326085\u0002031026622X\u00020316077046\u00020988649179\u000214... \n", 360 | "\n", 361 | " hist_cats \\\n", 362 | "0 Books\u0002Literature & Fiction\u0002Books\u0002Books \n", 363 | "1 Books\u0002Literature & Fiction\u0002Books\u0002Books \n", 364 | "2 Books\u0002Books\u0002Books\u0002Urban \n", 365 | "3 Books\u0002Books\u0002Books\u0002Urban \n", 366 | "4 Books\u0002Books\u0002Books\u0002Books\u0002Books \n", 367 | "\n", 368 | " neg_hist_mids \\\n", 369 | "0 0899576168\u0002B0056ATROO\u00020446600474\u00020615209459 \n", 370 | "1 0373527721\u00020981854524\u00020470404159\u0002B00BWKBSOY \n", 371 | "2 B00EQAEA60\u0002B007D64VX6\u0002188547766X\u00021590172477 \n", 372 | "3 1583942475\u00021585678600\u00021570199221\u00020312373090 \n", 373 | "4 B0077FOPFC\u00021594744106\u0002B00DFGN1DE\u00020972259112\u0002B0... \n", 374 | "\n", 375 | " neg_hist_cats \n", 376 | "0 Books\u0002Sleep\u0002Books\u0002Books \n", 377 | "1 Books\u0002Books\u0002Books\u0002Literature & Fiction \n", 378 | "2 Literature & Fiction\u0002Quran\u0002Books\u0002Books \n", 379 | "3 Books\u0002Books\u0002Books\u0002Books \n", 380 | "4 Ghosts\u0002Books\u0002Erotica\u0002Books\u0002Soups & Stews " 381 | ] 382 | }, 383 | "execution_count": 7, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "valid_df.head()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "# define features" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 8, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "cat_enc = SequenceEncoder(sep=\"\\x02\", min_cnt=1, max_len=SEQ_MAX_LEN)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 9, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "" 417 | ] 418 | }, 419 | "execution_count": 9, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "cat_enc.fit(train_df.hist_cats.values)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 10, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "cat_word2idx, cat_idx2word = cat_enc.word2idx, cat_enc.idx2word" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 11, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "1602\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "print(len(cat_word2idx))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 12, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "mid_enc = SequenceEncoder(sep=\"\\x02\", min_cnt=1, max_len=SEQ_MAX_LEN)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 13, 466 | 
"metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "" 472 | ] 473 | }, 474 | "execution_count": 13, 475 | "metadata": {}, 476 | "output_type": "execute_result" 477 | } 478 | ], 479 | "source": [ 480 | "mid_enc.fit(np.vstack([train_df.mid.values, train_df.hist_mids.values]))" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 14, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "mid_word2idx, mid_idx2word = mid_enc.word2idx, mid_enc.idx2word" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 15, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "367984\n" 502 | ] 503 | } 504 | ], 505 | "source": [ 506 | "print(len(mid_word2idx))" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 16, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "number_features = []\n", 516 | "\n", 517 | "category_features = [\n", 518 | " Category('mid',\n", 519 | " CategoryEncoder(min_cnt=1, word2idx=mid_word2idx, idx2word=mid_idx2word),\n", 520 | " embedding_name='mid'),\n", 521 | " Category('cat',\n", 522 | " CategoryEncoder(min_cnt=1, word2idx=cat_word2idx, idx2word=cat_idx2word),\n", 523 | " embedding_name='cat'),\n", 524 | "]\n", 525 | "\n", 526 | "sequence_features = [\n", 527 | " Sequence('hist_mids',\n", 528 | " SequenceEncoder(sep=\"\\x02\", min_cnt=1, max_len=SEQ_MAX_LEN,\n", 529 | " word2idx=mid_word2idx, idx2word=mid_idx2word),\n", 530 | " embedding_name='mid'),\n", 531 | " Sequence('hist_cats',\n", 532 | " SequenceEncoder(sep=\"\\x02\", min_cnt=1, max_len=SEQ_MAX_LEN,\n", 533 | " word2idx=cat_word2idx, idx2word=cat_idx2word),\n", 534 | " embedding_name='cat')\n", 535 | "]\n", 536 | "\n", 537 | "features, train_loader, valid_loader = create_dataloader_fn(\n", 538 | " number_features, category_features, sequence_features, BATCH_SIZE, train_df, 'label', valid_df, 4)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 17, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "def evaluation(model, df, dataloader):\n", 548 | " preds = predict(model, dataloader)\n", 549 | " return roc_auc_score(df['label'], preds.ravel())" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 18, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "import pytorch_lightning as pl\n", 559 | "\n", 560 | "class CoolModel(pl.LightningModule):\n", 561 | " def __init__(self):\n", 562 | " super(CoolModel, self).__init__()\n", 563 | " self.model = DNN(\n", 564 | " features,\n", 565 | " 2,\n", 566 | " EMBEDDING_DIM,\n", 567 | " DNN_HIDDEN_SIZE,\n", 568 | " final_activation='sigmoid',\n", 569 | " dropout=DNN_DROPOUT)\n", 570 | " \n", 571 | " def forward(self, x):\n", 572 | " return self.model(x)\n", 573 | "\n", 574 | " def training_step(self, batch, batch_nb):\n", 575 | " # REQUIRED\n", 576 | " y = batch['label']\n", 577 | " y_hat = self.forward(batch)\n", 578 | " loss = F.binary_cross_entropy(y_hat, y)\n", 579 | " return {\n", 580 | " 'loss': loss,\n", 581 | " 'progress_bar':\n", 582 | " {'training_loss': loss}}\n", 583 | "\n", 584 | " def validation_step(self, batch, batch_nb):\n", 585 | " # OPTIONAL\n", 586 | " y = batch['label']\n", 587 | " y_hat = self.forward(batch)\n", 588 | " loss = F.binary_cross_entropy(y_hat, y)\n", 589 | " return {'val_loss': loss}\n", 590 | "\n", 591 | " def validation_end(self, outputs):\n", 592 | " # 
OPTIONAL\n", 593 | " avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()\n", 594 | " return {'progress_bar': {'val_loss': avg_loss}}\n", 595 | "\n", 596 | " def configure_optimizers(self):\n", 597 | " # REQUIRED\n", 598 | " return torch.optim.Adam(self.parameters(), lr=0.003)\n", 599 | "\n", 600 | " @pl.data_loader\n", 601 | " def train_dataloader(self):\n", 602 | " return train_loader\n", 603 | "\n", 604 | " @pl.data_loader\n", 605 | " def val_dataloader(self):\n", 606 | " # OPTIONAL\n", 607 | " # can also return a list of val dataloaders\n", 608 | " return valid_loader" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 22, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "gpu available: True, used: True\n", 621 | "VISIBLE GPUS: 0\n" 622 | ] 623 | } 624 | ], 625 | "source": [ 626 | "from pytorch_lightning import Trainer\n", 627 | "\n", 628 | "model = CoolModel()\n", 629 | "\n", 630 | "# most basic trainer, uses good defaults\n", 631 | "trainer = Trainer(max_nb_epochs=EPOCH, gpus=1) " 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 23, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "name": "stderr", 641 | "output_type": "stream", 642 | "text": [ 643 | " 0%| | 0/5 [00:00] 81,977,637 105MB/s in 0.7s \n", 35 | "\n", 36 | "2019-09-07 17:45:11 (105 MB/s) - ‘data.tar.gz’ saved [81977637/81977637]\n", 37 | "\n", 38 | "--2019-09-07 17:45:11-- https://raw.githubusercontent.com/mouna99/dien/master/data1.tar.gz\n", 39 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133\n", 40 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.\n", 41 | "HTTP request sent, awaiting response... 200 OK\n", 42 | "Length: 104430448 (100M) [application/octet-stream]\n", 43 | "Saving to: ‘data1.tar.gz’\n", 44 | "\n", 45 | "100%[======================================>] 104,430,448 86.5MB/s in 1.2s \n", 46 | "\n", 47 | "2019-09-07 17:45:13 (86.5 MB/s) - ‘data1.tar.gz’ saved [104430448/104430448]\n", 48 | "\n", 49 | "--2019-09-07 17:45:13-- https://raw.githubusercontent.com/mouna99/dien/master/data2.tar.gz\n", 50 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133\n", 51 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.\n", 52 | "HTTP request sent, awaiting response... 200 OK\n", 53 | "Length: 9460706 (9.0M) [application/octet-stream]\n", 54 | "Saving to: ‘data2.tar.gz’\n", 55 | "\n", 56 | "100%[======================================>] 9,460,706 --.-K/s in 0.1s \n", 57 | "\n", 58 | "2019-09-07 17:45:13 (95.0 MB/s) - ‘data2.tar.gz’ saved [9460706/9460706]\n", 59 | "\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "! wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data.tar.gz\n", 65 | "! wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data1.tar.gz\n", 66 | "! 
wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data2.tar.gz" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 15, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "data/\n", 79 | "data/cat_voc.pkl\n", 80 | "data/mid_voc.pkl\n", 81 | "data/uid_voc.pkl\n", 82 | "data/local_train_splitByUser\n", 83 | "data/local_test_splitByUser\n", 84 | "data1/\n", 85 | "data1/reviews-info\n", 86 | "data2/\n", 87 | "data2/item-info\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "! tar jxvf ./data.tar.gz && tar jxvf ./data1.tar.gz && tar jxvf ./data2.tar.gz" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 16, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "TEST_RUN = False" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 17, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "train_df = pd.read_csv(\n", 111 | " \"./data/local_train_splitByUser\", sep='\\t',\n", 112 | " names=['label', 'uid', 'mid', 'cat', 'hist_mids', 'hist_cats'])\n", 113 | "\n", 114 | "valid_df = pd.read_csv(\n", 115 | " \"./data/local_test_splitByUser\", sep='\\t',\n", 116 | " names=['label', 'uid', 'mid', 'cat', 'hist_mids', 'hist_cats'])" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 18, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "item_info_df = pd.read_csv(\"./data2/item-info\", sep='\\t', names=['mid', 'cat'])" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 19, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/html": [ 136 | "
\n", 137 | "\n", 150 | "\n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
midcat
00001048791Books
10001048775Books
20001048236Books
30000401048Books
40001019880Books
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | " mid cat\n", 190 | "0 0001048791 Books\n", 191 | "1 0001048775 Books\n", 192 | "2 0001048236 Books\n", 193 | "3 0000401048 Books\n", 194 | "4 0001019880 Books" 195 | ] 196 | }, 197 | "execution_count": 19, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "item_info_df.head()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 20, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "reviews_info_df = pd.read_csv(\"./data1/reviews-info\", sep='\\t', names=['c1', 'mid', 'c3', 'c4'])" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 21, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/html": [ 223 | "
\n", 224 | "\n", 237 | "\n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | "
c1midc3c4
0A10000012B7CGYKOMPQ4L000100039X5.01355616000
1A2S166WSCFIFP5000100039X5.01071100800
2A1BM81XB4QHOA3000100039X5.01390003200
3A1MOSTXNIO5MPJ000100039X5.01317081600
4A2XQ5LZHTD4AFT000100039X5.01033948800
\n", 285 | "
" 286 | ], 287 | "text/plain": [ 288 | " c1 mid c3 c4\n", 289 | "0 A10000012B7CGYKOMPQ4L 000100039X 5.0 1355616000\n", 290 | "1 A2S166WSCFIFP5 000100039X 5.0 1071100800\n", 291 | "2 A1BM81XB4QHOA3 000100039X 5.0 1390003200\n", 292 | "3 A1MOSTXNIO5MPJ 000100039X 5.0 1317081600\n", 293 | "4 A2XQ5LZHTD4AFT 000100039X 5.0 1033948800" 294 | ] 295 | }, 296 | "execution_count": 21, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "reviews_info_df.head()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 22, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "reviews_info_df = reviews_info_df[['mid']].merge(item_info_df, on='mid', how='inner').drop_duplicates()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 23, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/html": [ 322 | "
\n", 323 | "\n", 336 | "\n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | "
midcat
0000100039XBooks
2060001055178Books
2240001473123Books
2400001473727Books
2470001473905Books
\n", 372 | "
" 373 | ], 374 | "text/plain": [ 375 | " mid cat\n", 376 | "0 000100039X Books\n", 377 | "206 0001055178 Books\n", 378 | "224 0001473123 Books\n", 379 | "240 0001473727 Books\n", 380 | "247 0001473905 Books" 381 | ] 382 | }, 383 | "execution_count": 23, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "reviews_info_df.head()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 24, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "mid_cat_map = reviews_info_df.set_index('mid').to_dict()['cat']" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 25, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "if TEST_RUN:\n", 408 | " train_df = train_df.sample(1000)\n", 409 | " valid_df = valid_df.sample(1000)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 26, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# slow implement\n", 419 | "def prepare_neg(df):\n", 420 | " records = df['hist_mids'].apply(lambda x: x.split(\"\u0002\"))\n", 421 | " candidates = list(mid_cat_map.keys())\n", 422 | " max_len = len(candidates)\n", 423 | "\n", 424 | " def neg_sampling(filters, length): \n", 425 | " mids = []\n", 426 | " cats = []\n", 427 | " for i in range(length):\n", 428 | " while(1):\n", 429 | " c = candidates[np.random.randint(0, max_len)]\n", 430 | " if c not in filters:\n", 431 | " mids.append(c)\n", 432 | " cats.append(mid_cat_map[c])\n", 433 | " filters.add(c)\n", 434 | " break\n", 435 | " return mids, cats\n", 436 | " \n", 437 | " total_neg_mids = []\n", 438 | " total_neg_cats = []\n", 439 | " for record in records:\n", 440 | " neg_mids, neg_cats = neg_sampling(set(record), len(record))\n", 441 | " total_neg_mids.append(\"\u0002\".join(neg_mids))\n", 442 | " total_neg_cats.append(\"\u0002\".join(neg_cats))\n", 443 | " \n", 444 | " return total_neg_mids, total_neg_cats" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 27, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "total_neg_mids, total_neg_cats = prepare_neg(train_df)\n", 454 | "train_df['neg_hist_mids'] = total_neg_mids\n", 455 | "train_df['neg_hist_cats'] = total_neg_cats" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 28, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "total_neg_mids, total_neg_cats = prepare_neg(valid_df)\n", 465 | "valid_df['neg_hist_mids'] = total_neg_mids\n", 466 | "valid_df['neg_hist_cats'] = total_neg_cats" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 29, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "train_df.to_csv('local_train.csv', sep='\\t', index=False)\n", 476 | "valid_df.to_csv('local_test.csv', sep='\\t', index=False)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [] 485 | } 486 | ], 487 | "metadata": { 488 | "kernelspec": { 489 | "display_name": "Python 3", 490 | "language": "python", 491 | "name": "python3" 492 | }, 493 | "language_info": { 494 | "codemirror_mode": { 495 | "name": "ipython", 496 | "version": 3 497 | }, 498 | "file_extension": ".py", 499 | "mimetype": "text/x-python", 500 | "name": "python", 501 | "nbconvert_exporter": "python", 502 | "pygments_lexer": "ipython3", 503 | "version": "3.6.2" 504 | } 505 | }, 506 | "nbformat": 4, 507 | "nbformat_minor": 2 508 | } 509 | 
-------------------------------------------------------------------------------- /examples/amazon/simple_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/examples/amazon/simple_benchmark.png -------------------------------------------------------------------------------- /examples/movielens/ml-1m/README: -------------------------------------------------------------------------------- 1 | SUMMARY 2 | ================================================================================ 3 | 4 | These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 5 | made by 6,040 MovieLens users who joined MovieLens in 2000. 6 | 7 | USAGE LICENSE 8 | ================================================================================ 9 | 10 | Neither the University of Minnesota nor any of the researchers 11 | involved can guarantee the correctness of the data, its suitability 12 | for any particular purpose, or the validity of results based on the 13 | use of the data set. The data set may be used for any research 14 | purposes under the following conditions: 15 | 16 | * The user may not state or imply any endorsement from the 17 | University of Minnesota or the GroupLens Research Group. 18 | 19 | * The user must acknowledge the use of the data set in 20 | publications resulting from the use of the data set 21 | (see below for citation information). 22 | 23 | * The user may not redistribute the data without separate 24 | permission. 25 | 26 | * The user may not use this information for any commercial or 27 | revenue-bearing purposes without first obtaining permission 28 | from a faculty member of the GroupLens Research Project at the 29 | University of Minnesota. 30 | 31 | If you have any further questions or comments, please contact GroupLens 32 | . 33 | 34 | CITATION 35 | ================================================================================ 36 | 37 | To acknowledge use of the dataset in publications, please cite the following 38 | paper: 39 | 40 | F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History 41 | and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, 42 | Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872 43 | 44 | 45 | ACKNOWLEDGEMENTS 46 | ================================================================================ 47 | 48 | Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data 49 | set. 50 | 51 | FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT 52 | ================================================================================ 53 | 54 | The GroupLens Research Project is a research group in the Department of 55 | Computer Science and Engineering at the University of Minnesota. Members of 56 | the GroupLens Research Project are involved in many research projects related 57 | to the fields of information filtering, collaborative filtering, and 58 | recommender systems. The project is lead by professors John Riedl and Joseph 59 | Konstan. The project began to explore automated collaborative filtering in 60 | 1992, but is most well known for its world wide trial of an automated 61 | collaborative filtering system for Usenet news in 1996. 
Since then the project 62 | has expanded its scope to research overall information filtering solutions, 63 | integrating in content-based methods as well as improving current collaborative 64 | filtering technology. 65 | 66 | Further information on the GroupLens Research project, including research 67 | publications, can be found at the following web site: 68 | 69 | http://www.grouplens.org/ 70 | 71 | GroupLens Research currently operates a movie recommender based on 72 | collaborative filtering: 73 | 74 | http://www.movielens.org/ 75 | 76 | RATINGS FILE DESCRIPTION 77 | ================================================================================ 78 | 79 | All ratings are contained in the file "ratings.dat" and are in the 80 | following format: 81 | 82 | UserID::MovieID::Rating::Timestamp 83 | 84 | - UserIDs range between 1 and 6040 85 | - MovieIDs range between 1 and 3952 86 | - Ratings are made on a 5-star scale (whole-star ratings only) 87 | - Timestamp is represented in seconds since the epoch as returned by time(2) 88 | - Each user has at least 20 ratings 89 | 90 | USERS FILE DESCRIPTION 91 | ================================================================================ 92 | 93 | User information is in the file "users.dat" and is in the following 94 | format: 95 | 96 | UserID::Gender::Age::Occupation::Zip-code 97 | 98 | All demographic information is provided voluntarily by the users and is 99 | not checked for accuracy. Only users who have provided some demographic 100 | information are included in this data set. 101 | 102 | - Gender is denoted by a "M" for male and "F" for female 103 | - Age is chosen from the following ranges: 104 | 105 | * 1: "Under 18" 106 | * 18: "18-24" 107 | * 25: "25-34" 108 | * 35: "35-44" 109 | * 45: "45-49" 110 | * 50: "50-55" 111 | * 56: "56+" 112 | 113 | - Occupation is chosen from the following choices: 114 | 115 | * 0: "other" or not specified 116 | * 1: "academic/educator" 117 | * 2: "artist" 118 | * 3: "clerical/admin" 119 | * 4: "college/grad student" 120 | * 5: "customer service" 121 | * 6: "doctor/health care" 122 | * 7: "executive/managerial" 123 | * 8: "farmer" 124 | * 9: "homemaker" 125 | * 10: "K-12 student" 126 | * 11: "lawyer" 127 | * 12: "programmer" 128 | * 13: "retired" 129 | * 14: "sales/marketing" 130 | * 15: "scientist" 131 | * 16: "self-employed" 132 | * 17: "technician/engineer" 133 | * 18: "tradesman/craftsman" 134 | * 19: "unemployed" 135 | * 20: "writer" 136 | 137 | MOVIES FILE DESCRIPTION 138 | ================================================================================ 139 | 140 | Movie information is in the file "movies.dat" and is in the following 141 | format: 142 | 143 | MovieID::Title::Genres 144 | 145 | - Titles are identical to titles provided by the IMDB (including 146 | year of release) 147 | - Genres are pipe-separated and are selected from the following genres: 148 | 149 | * Action 150 | * Adventure 151 | * Animation 152 | * Children's 153 | * Comedy 154 | * Crime 155 | * Documentary 156 | * Drama 157 | * Fantasy 158 | * Film-Noir 159 | * Horror 160 | * Musical 161 | * Mystery 162 | * Romance 163 | * Sci-Fi 164 | * Thriller 165 | * War 166 | * Western 167 | 168 | - Some MovieIDs do not correspond to a movie due to accidental duplicate 169 | entries and/or test entries 170 | - Movies are mostly entered by hand, so errors and inconsistencies may exist 171 | -------------------------------------------------------------------------------- /prediction_flow/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = '0.1.5' 2 | -------------------------------------------------------------------------------- /prediction_flow/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .number_feature import Number 2 | from .category_feature import Category 3 | from .sequence_feature import Sequence 4 | from .features import Features 5 | 6 | 7 | __all__ = [ 8 | 'Number', 9 | 'Category', 10 | 'Sequence', 11 | 'Features' 12 | ] 13 | -------------------------------------------------------------------------------- /prediction_flow/features/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for features. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from abc import ABC, abstractmethod 10 | from ..transformers.column import Column, ColumnFlow 11 | 12 | 13 | class BaseFeature(ABC): 14 | """Base class for all feature classes. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of this feature. 20 | 21 | column_flow : ColumnFlow, list of transformers, or single transformer 22 | Transformer(s) used to pre-process this feature. 23 | 24 | Attributes 25 | ---------- 26 | name : str 27 | Name of this feature. 28 | 29 | column_flow : ColumnFlow 30 | ColumnFlow built from the given transformer(s); a list or 31 | a single transformer is wrapped into a ColumnFlow 32 | automatically. 33 | """ 34 | 35 | @abstractmethod 36 | def __init__(self, name, column_flow=None): 37 | self.name = name 38 | 39 | self.column_flow = None 40 | 41 | if column_flow: 42 | if isinstance(column_flow, ColumnFlow): 43 | self.column_flow = column_flow 44 | elif isinstance(column_flow, list): 45 | self.column_flow = ColumnFlow(column_flow) 46 | elif isinstance(column_flow, Column): 47 | self.column_flow = ColumnFlow([column_flow]) 48 | else: 49 | raise NotImplementedError( 50 | "column_flow should be a ColumnFlow, " 51 | "a list of column transformers, or a single transformer") 52 | -------------------------------------------------------------------------------- /prediction_flow/features/category_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class for category feature. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from .base import BaseFeature 10 | 11 | 12 | class Category(BaseFeature): 13 | """ 14 | Class for category feature. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of this feature. 20 | 21 | column_flow : ColumnFlow 22 | ColumnFlow to transform this feature. 23 | 24 | embedding_name: str 25 | Embedding name for reference. Give the same embedding name to 26 | features that share the same embedding layer. 27 | 28 | embedding_size: int 29 | Dimension of embedding layer. 30 | 31 | vocab_size: int 32 | Provide vocab_size if this feature does not need to be pre-processed. 33 | vocab_size is only used when column_flow is None. 34 | 35 | Attributes 36 | ---------- 37 | name : str 38 | Name of this feature. 39 | 40 | column_flow : ColumnFlow 41 | ColumnFlow to transform this feature. 42 | 43 | embedding_name: str 44 | Embedding name for reference. Give the same embedding name to 45 | features that share the same embedding layer. 46 | 47 | embedding_size: int 48 | Dimension of embedding layer.
49 | """ 50 | def __init__(self, name, column_flow, 51 | embedding_name=None, embedding_size=None, 52 | vocab_size=None): 53 | super().__init__(name=name, column_flow=column_flow) 54 | self.embedding_name = embedding_name if embedding_name else name 55 | self.embedding_size = embedding_size 56 | self._vocab_size = vocab_size 57 | 58 | def dimension(self): 59 | """The dimension (vocab size) of sequence feature is the dimension 60 | of last transformer in ColumnFlow. 61 | """ 62 | if self.column_flow is not None: 63 | return self.column_flow.transformers[-1].dimension() 64 | else: 65 | if self._vocab_size: 66 | return self._vocab_size 67 | else: 68 | raise RuntimeError( 69 | "If param column_flow is not given, " 70 | "vocab_size must be given") 71 | -------------------------------------------------------------------------------- /prediction_flow/features/features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class to manage all features. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from collections import OrderedDict 10 | 11 | 12 | class Features(object): 13 | """Class to manage all features. 14 | 15 | Parameters 16 | ---------- 17 | number_features : array-like 18 | NumberFeature array. 19 | 20 | category_features : array-like 21 | CategoryFeature array. 22 | 23 | sequence_features : array-like 24 | SequenceFeature array. 25 | 26 | Attributes 27 | ---------- 28 | number_features : array-like 29 | NumberFeature array. 30 | 31 | category_features : array-like 32 | CategoryFeature array. 33 | 34 | sequence_features : array-like 35 | SequenceFeature array. 36 | 37 | """ 38 | def __init__( 39 | self, 40 | number_features=[], 41 | category_features=[], 42 | sequence_features=[]): 43 | self.number_features = number_features 44 | self.category_features = category_features 45 | self.sequence_features = sequence_features 46 | 47 | def fit(self, df): 48 | """Fit all transformers. 49 | 50 | Parameters 51 | ---------- 52 | df : pandas.DataFrame 53 | 54 | Returns 55 | ---------- 56 | self : Features 57 | """ 58 | for feature in ( 59 | self.number_features + 60 | self.category_features + 61 | self.sequence_features): 62 | if feature.column_flow: 63 | feature.column_flow.fit(df[feature.name].values) 64 | 65 | return self 66 | 67 | def transform(self, df): 68 | """Transform df using fitted transformers. 
-------------------------------------------------------------------------------- /prediction_flow/features/features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class to manage all features. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from collections import OrderedDict 10 | 11 | 12 | class Features(object): 13 | """Class to manage all features. 14 | 15 | Parameters 16 | ---------- 17 | number_features : array-like 18 | Array of Number features. 19 | 20 | category_features : array-like 21 | Array of Category features. 22 | 23 | sequence_features : array-like 24 | Array of Sequence features. 25 | 26 | Attributes 27 | ---------- 28 | number_features : array-like 29 | Array of Number features. 30 | 31 | category_features : array-like 32 | Array of Category features. 33 | 34 | sequence_features : array-like 35 | Array of Sequence features. 36 | 37 | """ 38 | def __init__( 39 | self, 40 | number_features=[], 41 | category_features=[], 42 | sequence_features=[]): 43 | self.number_features = number_features 44 | self.category_features = category_features 45 | self.sequence_features = sequence_features 46 | 47 | def fit(self, df): 48 | """Fit all transformers. 49 | 50 | Parameters 51 | ---------- 52 | df : pandas.DataFrame 53 | 54 | Returns 55 | ---------- 56 | self : Features 57 | """ 58 | for feature in ( 59 | self.number_features + 60 | self.category_features + 61 | self.sequence_features): 62 | if feature.column_flow: 63 | feature.column_flow.fit(df[feature.name].values) 64 | 65 | return self 66 | 67 | def transform(self, df): 68 | """Transform df using fitted transformers. 69 | 70 | Parameters 71 | ---------- 72 | df : pandas.DataFrame 73 | 74 | Returns 75 | ---------- 76 | transformed_X: dict 77 | 78 | {'feature1': numpy.array([...]), 'feature2': numpy.array([...])} 79 | """ 80 | transformed_X = OrderedDict() 81 | 82 | for feature in ( 83 | self.number_features + 84 | self.category_features + 85 | self.sequence_features): 86 | if feature.column_flow: 87 | transformed_X[feature.name] = feature.column_flow.transform( 88 | df[feature.name].values) 89 | else: 90 | transformed_X[feature.name] = df[feature.name].values 91 | 92 | return transformed_X 93 | 94 | def number_feature_names(self): 95 | return [feature.name for feature in self.number_features] 96 | 97 | def category_feature_names(self): 98 | return [feature.name for feature in self.category_features] 99 | 100 | def sequence_feature_names(self): 101 | return [feature.name for feature in self.sequence_features] 102 | -------------------------------------------------------------------------------- /prediction_flow/features/number_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class for number feature. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from .base import BaseFeature 10 | 11 | 12 | class Number(BaseFeature): 13 | """ 14 | Class for number feature. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of this feature. 20 | 21 | column_flow : ColumnFlow 22 | ColumnFlow to transform this feature. 23 | 24 | Attributes 25 | ---------- 26 | name : str 27 | Name of this feature. 28 | 29 | column_flow : ColumnFlow 30 | ColumnFlow to transform this feature. 31 | """ 32 | def __init__(self, name, column_flow): 33 | super().__init__(name=name, column_flow=column_flow) 34 | -------------------------------------------------------------------------------- /prediction_flow/features/sequence_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class for sequence feature. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from .base import BaseFeature 10 | 11 | 12 | class Sequence(BaseFeature): 13 | """ 14 | Class for sequence feature. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of this feature. 20 | 21 | column_flow : ColumnFlow 22 | ColumnFlow to transform this feature. 23 | 24 | embedding_name: str 25 | Embedding name for reference. Give the same embedding name to 26 | features that share the same embedding layer. 27 | 28 | embedding_size: int 29 | Dimension of embedding layer. 30 | 31 | vocab_size: int 32 | Provide vocab_size if this feature does not need to be pre-processed. 33 | vocab_size is only used when column_flow is None. 34 | 35 | max_length: int 36 | max_length is only used when column_flow is None. 37 | 38 | Attributes 39 | ---------- 40 | name : str 41 | Name of this feature. 42 | 43 | column_flow : ColumnFlow 44 | ColumnFlow to transform this feature. 45 | 46 | embedding_name: str 47 | Embedding name for reference. Give the same embedding name to 48 | features that share the same embedding layer. 49 | 50 | embedding_size: int 51 | Dimension of embedding layer.
52 | """ 53 | def __init__(self, name, column_flow, 54 | embedding_name=None, embedding_size=None, 55 | vocab_size=None, max_length=None): 56 | super().__init__(name=name, column_flow=column_flow) 57 | self.embedding_name = embedding_name if embedding_name else name 58 | self.embedding_size = embedding_size 59 | self._vocab_size = vocab_size 60 | self._max_length = max_length 61 | 62 | def dimension(self): 63 | """The dimension (vocab size) of sequence feature is the dimension 64 | of last transformer in ColumnFlow. 65 | """ 66 | if self.column_flow is not None: 67 | return self.column_flow.transformers[-1].dimension() 68 | else: 69 | if self._vocab_size: 70 | return self._vocab_size 71 | else: 72 | raise RuntimeError( 73 | "If param column_flow is not given, " 74 | "vocab_size must be given") 75 | 76 | def max_length(self): 77 | """The max length of sequence feature is the max length 78 | of last transformer in ColumnFlow. 79 | """ 80 | if self.column_flow is not None: 81 | return self.column_flow.transformers[-1].max_length() 82 | else: 83 | if self._max_length: 84 | return self._max_length 85 | else: 86 | raise RuntimeError( 87 | "If param column_flow is not given, " 88 | "max_length must be given") 89 | -------------------------------------------------------------------------------- /prediction_flow/features/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/features/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/features/tests/test_features.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder, ColumnFlow) 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | __TEST_DATA = pd.DataFrame( 10 | [ 11 | [23, 173, 'male', 'fish vegetable', 1], 12 | [43, 181, 'male', 'fish pork', 0], 13 | [35, 161, 'female', 'beef vegetable', 0], 14 | [41, 171, 'male', 'fish vegetable', 1], 15 | [16, 153, 'female', 'pork chicken vegetable', 0], 16 | [32, 168, 'female', 'fish beef', 1], 17 | [26, 177, 'male', 'chicken vegetable', 0], 18 | [76, 190, 'male', 'fish pork vegetable', 0] 19 | ], 20 | columns=['age', 'height', 'gender', 'likes', 'label']) 21 | 22 | 23 | def test_simple_column_transformer_define(): 24 | number_features = [ 25 | Number('age', None), 26 | Number('height', StandardScaler())] 27 | 28 | category_features = [ 29 | Category('gender', CategoryEncoder(min_cnt=1))] 30 | 31 | sequence_features = [ 32 | Sequence('likes', SequenceEncoder(sep=' ', min_cnt=1))] 33 | 34 | features = Features( 35 | number_features, category_features, sequence_features) 36 | 37 | features.fit(__TEST_DATA) 38 | 39 | actual = features.transform(__TEST_DATA) 40 | 41 | expected_age = np.array([23, 43, 35, 41, 16, 32, 26, 76]) 42 | expected_gender = np.array([2, 2, 1, 2, 1, 1, 2, 2]) 43 | expected_height = np.array( 44 | [0.1159659, 0.85814767, -0.99730676, -0.06957954, -1.73948853, 45 | -0.34789771, 0.48705679, 1.69310217]) 46 | 47 | assert len(actual) == 4 48 | assert features.number_feature_names() == ['age', 'height'] 49 | assert features.category_feature_names() == ['gender'] 50 | assert features.sequence_feature_names() == ['likes'] 51 | 
np.testing.assert_array_equal(actual['age'], expected_age) 52 | np.testing.assert_array_equal(actual['gender'], expected_gender) 53 | np.testing.assert_array_almost_equal(actual['height'], expected_height) 54 | 55 | 56 | def test_column_flow_define(): 57 | number_features = [ 58 | Number('age', None), 59 | Number('height', ColumnFlow([StandardScaler()]))] 60 | 61 | category_features = [ 62 | Category('gender', ColumnFlow([CategoryEncoder(min_cnt=1)])) 63 | ] 64 | 65 | sequence_features = [ 66 | Sequence('likes', ColumnFlow([SequenceEncoder(sep=' ', min_cnt=1)])) 67 | ] 68 | 69 | features = Features( 70 | number_features, category_features, sequence_features) 71 | 72 | features.fit(__TEST_DATA) 73 | 74 | actual = features.transform(__TEST_DATA) 75 | 76 | expected_age = np.array([23, 43, 35, 41, 16, 32, 26, 76]) 77 | expected_gender = np.array([2, 2, 1, 2, 1, 1, 2, 2]) 78 | expected_height = np.array( 79 | [0.1159659, 0.85814767, -0.99730676, -0.06957954, -1.73948853, 80 | -0.34789771, 0.48705679, 1.69310217]) 81 | 82 | assert len(actual) == 4 83 | assert features.number_feature_names() == ['age', 'height'] 84 | assert features.category_feature_names() == ['gender'] 85 | assert features.sequence_feature_names() == ['likes'] 86 | np.testing.assert_array_equal(actual['age'], expected_age) 87 | np.testing.assert_array_equal(actual['gender'], expected_gender) 88 | np.testing.assert_array_almost_equal(actual['height'], expected_height) 89 | -------------------------------------------------------------------------------- /prediction_flow/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/metrics/__init__.py -------------------------------------------------------------------------------- /prediction_flow/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepfm import DeepFM 2 | from .dnn import DNN 3 | from .interest_net import AttentionGroup 4 | from .din import DIN 5 | from .dien import DIEN 6 | from .wide_deep import WideDeep 7 | 8 | 9 | __all__ = [ 10 | 'DeepFM', 11 | 'DNN', 12 | 'AttentionGroup', 13 | 'DIN', 14 | 'DIEN', 15 | 'WideDeep' 16 | ] 17 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/base.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import OrderedDict 3 | 4 | import torch.nn as nn 5 | 6 | 7 | class EmbeddingMixin: 8 | def build_embeddings( 9 | self, default_embedding_size, fixed_embedding_size=False): 10 | embeddings = OrderedDict() 11 | embedding_sizes = OrderedDict() 12 | 13 | for feature in itertools.chain( 14 | self.features.category_features, 15 | self.features.sequence_features): 16 | if feature.embedding_name not in embeddings: 17 | embedding_size = default_embedding_size 18 | if not fixed_embedding_size: 19 | embedding_size = (feature.embedding_size 20 | if feature.embedding_size 21 | else default_embedding_size) 22 | 23 | embeddings[feature.embedding_name] = nn.Embedding( 24 | feature.dimension(), embedding_size, padding_idx=0) 25 | embedding_sizes[feature.embedding_name] = embedding_size 26 | self.add_module( 27 | f"embedding:{feature.embedding_name}", 28 | embeddings[feature.embedding_name]) 29 | 30 | if feature.name != feature.embedding_name: 31 | embeddings[feature.name] = 
embeddings[feature.embedding_name] 32 | embedding_sizes[feature.name] = ( 33 | embedding_sizes[feature.embedding_name]) 34 | if feature.embedding_size and ( 35 | feature.embedding_size != 36 | embedding_sizes[feature.name]): 37 | raise RuntimeWarning( 38 | f"embedding_size of {feature.name} should be " 39 | f"the same with {feature.embedding_name}") 40 | 41 | return (embeddings, embedding_sizes) 42 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import Dataset 2 | 3 | 4 | __all__ = [ 5 | 'Dataset' 6 | ] 7 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/data/dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset for torch. 2 | """ 3 | 4 | # Authors: Hongwei Zhang 5 | # License: MIT 6 | 7 | 8 | from collections import OrderedDict 9 | from itertools import chain 10 | 11 | import numpy as np 12 | import torch.utils.data as data 13 | 14 | 15 | class Dataset(data.Dataset): 16 | """Dataset for torch. 17 | 18 | Parameters 19 | ---------- 20 | features : Features 21 | Fitted Features object. 22 | 23 | X_map : dict 24 | example: 25 | {'feature1': numpy.array([...]), 'feature2': numpy.array([...])} 26 | 27 | y : numpy.array 28 | """ 29 | def __init__(self, features, X_map, y=None): 30 | self.features = features 31 | self.X_map = X_map 32 | self.y = y 33 | if y is not None: 34 | self.y = np.asarray(y, np.float32).reshape(-1, 1) 35 | 36 | self.__data_size = self.__get_data_size() 37 | 38 | def __get_data_size(self): 39 | key = next(iter(self.X_map)) 40 | return self.X_map[key].shape[0] 41 | 42 | def __len__(self): 43 | return self.__data_size 44 | 45 | @staticmethod 46 | def __pad_sequence(sequence_feature, sequence): 47 | # zero is special index for padding 48 | padded_seq = np.zeros(sequence_feature.max_length(), np.int64) 49 | padded_seq[0: sequence.shape[0]] = sequence 50 | 51 | return padded_seq 52 | 53 | def __getitem__(self, idx): 54 | record = OrderedDict() 55 | 56 | for feat in chain( 57 | self.features.number_features, 58 | self.features.category_features): 59 | record[feat.name] = self.X_map[feat.name][idx] 60 | 61 | for feat in self.features.sequence_features: 62 | seq = self.X_map[feat.name][idx] 63 | record[feat.name] = Dataset.__pad_sequence(feat, seq) 64 | record[f"__{feat.name}_length"] = np.int64(seq.shape[0]) 65 | 66 | if self.y is not None: 67 | record['label'] = self.y[idx] 68 | return record 69 | 70 | def get_num_batches(self, batch_size): 71 | return np.ceil(self.__data_size / batch_size) 72 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/data/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/pytorch/data/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/pytorch/data/tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch.data import 
Dataset 5 | 6 | import pandas as pd 7 | 8 | 9 | __SAMPLE_DF = pd.DataFrame({ 10 | 'userId': [11, 11, 11, 11, 11], 11 | 'userAge': [23, 21, 19, 17, 41], 12 | 'movieId': [4226, 5971, 6291, 7153, 30707], 13 | 'rating': [3.0, 2.0, 4.0, 4.6, 5.0], 14 | 'timestamp': [1294796159, 1294796201, 1294796113, 1294796132, 1294796176], 15 | 'title': ['Memento (2000)', 16 | 'My Neighbor Totoro (Tonari no Totoro) (1988)', 17 | 'Lilya 4-Ever (Lilja 4-ever) (2002)', 18 | 'Lord of the Rings: The Return of the King, The (2003)', 19 | 'Million Dollar Baby (2004)'], 20 | 'genres': [ 21 | 'Mystery|Thriller', 22 | 'Animation|Children|Drama|Fantasy', 23 | 'Crime|Drama', 24 | 'Action|Adventure|Drama|Fantasy', 25 | 'Drama'], 26 | 'topGenre': [ 27 | 'Mystery', 28 | 'Animation', 29 | 'Crime', 30 | 'Action', 31 | 'Drama'], 32 | 'clickedMovieIds': [ 33 | '5971|6291', 34 | '3242|42', 35 | '32|43542|3222|3', 36 | '', 37 | '34|23'], 38 | 'clickedMovieTopGenres': [ 39 | 'Animation|Mystery', 40 | 'Drama', 41 | 'Drama', 42 | '', 43 | 'Mystery|Crime'], 44 | 'label': [1, 0, 0, 1, 0]}) 45 | 46 | 47 | def test_normal(): 48 | number_features = [ 49 | Number('userAge', StandardScaler()), 50 | Number('rating', StandardScaler())] 51 | 52 | category_features = [ 53 | Category('userId', CategoryEncoder(min_cnt=1)), 54 | Category('movieId', CategoryEncoder(min_cnt=1)), 55 | Category('topGenre', CategoryEncoder(min_cnt=1))] 56 | 57 | sequence_features = [ 58 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 59 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 60 | Sequence('clickedMovieIds', 61 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 62 | Sequence('clickedMovieTopGenres', 63 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 64 | 65 | features = Features( 66 | number_features=number_features, 67 | category_features=category_features, 68 | sequence_features=sequence_features) 69 | 70 | features.fit(__SAMPLE_DF) 71 | 72 | X_map = features.transform(__SAMPLE_DF) 73 | 74 | dataset = Dataset(features, X_map, __SAMPLE_DF.label.values) 75 | 76 | assert dataset[0]['userId'] == 1 77 | assert dataset[0]['movieId'] == 1 78 | assert dataset[0]['genres'].tolist() == [8, 9, 0, 0] 79 | assert dataset[0]['__genres_length'] == 2 80 | assert dataset[0]['label'] == 1 81 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/deepfm.py: -------------------------------------------------------------------------------- 1 | """ 2 | DeepFM. 3 | """ 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .base import EmbeddingMixin 10 | from .nn import FM, MLP, MaxPooling 11 | from .utils import init_weights 12 | 13 | 14 | class DeepFM(nn.Module, EmbeddingMixin): 15 | """DeepFM. 16 | 17 | Parameters 18 | ---------- 19 | features : Features 20 | 21 | num_classes : int 22 | Number of classes. 23 | 24 | embedding_size : int 25 | Size of embedding. 26 | 27 | hidden_layers : list 28 | Size of hidden layers. 29 | Example: [96, 32] 30 | 31 | activation : str 32 | Activation function. 33 | Example: relu 34 | 35 | final_activation : str 36 | Activation function of output. 37 | 38 | dropout : float 39 | Dropout rate. 
40 | 41 | use_linear : bool 42 | 43 | use_fm : bool 44 | 45 | use_deep : bool 46 | """ 47 | def __init__(self, features, num_classes, embedding_size, hidden_layers, 48 | activation='relu', final_activation=None, dropout=None, 49 | use_linear=True, use_fm=True, use_deep=True): 50 | super(DeepFM, self).__init__() 51 | self.features = features 52 | self.num_classes = num_classes 53 | self.final_activation = final_activation 54 | self.use_linear = use_linear 55 | self.use_fm = use_fm 56 | self.use_deep = use_deep 57 | 58 | self.embeddings, self.embedding_sizes = self.build_embeddings( 59 | embedding_size, fixed_embedding_size=True) 60 | 61 | self._sequence_poolings = OrderedDict() 62 | 63 | total_embedding_sizes = 0 64 | for feature in self.features.number_features: 65 | self.embeddings[feature.name] = nn.Linear( 66 | in_features=1, 67 | out_features=embedding_size, 68 | bias=False) 69 | self.add_module( 70 | f"embedding:{feature.name}", 71 | self.embeddings[feature.name]) 72 | total_embedding_sizes += embedding_size 73 | 74 | for feature in self.features.category_features: 75 | total_embedding_sizes += ( 76 | self.embedding_sizes[feature.name]) 77 | 78 | for feature in self.features.sequence_features: 79 | self._sequence_poolings[feature.name] = MaxPooling(1) 80 | self.add_module( 81 | f"pooling:{feature.name}", 82 | self._sequence_poolings[feature.name]) 83 | total_embedding_sizes += ( 84 | self.embedding_sizes[feature.name]) 85 | 86 | final_layer_input_size = 0 87 | # linear 88 | # This part differs from the DeepFM paper: 89 | # sparse features are not included. 90 | self.linear = None 91 | if self.use_linear and self.features.number_features: 92 | self.linear = MLP( 93 | len(self.features.number_features), 94 | hidden_layers=[1], dropout=None, activation=None) 95 | final_layer_input_size += 1 96 | 97 | # fm 98 | self.fm = None 99 | if use_fm and total_embedding_sizes: 100 | self.fm = FM() 101 | final_layer_input_size += 1 102 | 103 | # deep 104 | self.mlp = None 105 | if use_deep and total_embedding_sizes: 106 | total_input_size = (total_embedding_sizes + 107 | len(self.features.number_features)) 108 | self.mlp = MLP( 109 | total_input_size, hidden_layers, 110 | dropout=dropout, activation=activation) 111 | final_layer_input_size += hidden_layers[-1] 112 | 113 | output_size = self.num_classes 114 | 115 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 116 | output_size -= 1 117 | 118 | self.final_layer = nn.Linear(final_layer_input_size, output_size) 119 | 120 | self.apply(init_weights) 121 | 122 | def forward(self, x): 123 | final_layer_inputs = list() 124 | 125 | number_inputs = list() 126 | for feature in self.features.number_features: 127 | number_inputs.append(x[feature.name].view(-1, 1)) 128 | if self.linear: 129 | # linear 130 | linear_concat = torch.cat(number_inputs, dim=1) 131 | final_layer_inputs.append(self.linear(linear_concat)) 132 | 133 | embeddings = list() 134 | for feature in self.features.number_features: 135 | embeddings.append( 136 | self.embeddings[feature.name]( 137 | x[feature.name].view(-1, 1)).unsqueeze(1)) 138 | for feature in self.features.category_features: 139 | embeddings.append( 140 | self.embeddings[feature.name](x[feature.name]).unsqueeze(1)) 141 | for feature in self.features.sequence_features: 142 | embeddings.append( 143 | self._sequence_poolings[feature.name]( 144 | self.embeddings[feature.name](x[feature.name])).unsqueeze(1)) 145 | 146 | emb_concat = None 147 | if embeddings: 148 | emb_concat = torch.cat(embeddings, dim=1) 149 | b, f, e = 
emb_concat.size() 150 | # fm 151 | if self.fm: 152 | final_layer_inputs.append(self.fm(emb_concat)) 153 | emb_concat = emb_concat.view(b, f * e) 154 | 155 | # deep 156 | if self.mlp: 157 | deep_input = torch.cat(number_inputs + [emb_concat], dim=1) 158 | final_layer_inputs.append(self.mlp(deep_input)) 159 | 160 | final_layer_inputs = torch.cat(final_layer_inputs, dim=1) 161 | 162 | output = self.final_layer(final_layer_inputs) 163 | 164 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 165 | output = torch.sigmoid(output) 166 | elif self.num_classes > 1 and self.final_activation == 'softmax': 167 | output = torch.softmax(output, dim=1) 168 | elif self.final_activation: 169 | raise NotImplementedError( 170 | f"pair (final_activation: {self.final_activation}, " 171 | f"num_classes: {self.num_classes}) is not implemented") 172 | 173 | return output 174 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/dien.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Interest Evolution Network. 3 | """ 4 | 5 | from collections import OrderedDict 6 | 7 | import torch 8 | 9 | from .nn import Interest 10 | from .interest_net import InterestNet 11 | 12 | 13 | class DIEN(InterestNet): 14 | """Deep Interest Evolution Network. 15 | 16 | Parameters 17 | ---------- 18 | features : Features 19 | 20 | attention_groups : list of AttentionGroup 21 | 22 | num_classes : int 23 | Number of classes. 24 | 25 | embedding_size : int 26 | Size of embedding. 27 | 28 | hidden_layers : list 29 | Size of hidden layers. 30 | Example: [96, 32] 31 | 32 | dnn_activation : str 33 | Activation function of deep layers. 34 | Example: relu 35 | 36 | final_activation : str 37 | Activation function of output. 38 | 39 | dropout : float 40 | Dropout rate.
41 | 42 | use_negsampling : bool 43 | """ 44 | def __init__(self, *args, use_negsampling=False, **kwargs): 45 | self.use_negsampling = use_negsampling 46 | super(DIEN, self).__init__(*args, **kwargs) 47 | 48 | def create_attention_fn(self, attention_group): 49 | return Interest( 50 | attention_group.pairs_count * self.embedding_size, 51 | gru_type=attention_group.gru_type, 52 | gru_dropout=attention_group.gru_dropout, 53 | att_hidden_layers=attention_group.hidden_layers, 54 | att_dropout=attention_group.att_dropout, 55 | att_activation=attention_group.activation, 56 | use_negsampling=self.use_negsampling) 57 | 58 | def forward(self, x): 59 | final_layer_inputs = list() 60 | 61 | # linear 62 | number_inputs = list() 63 | for feature in self.features.number_features: 64 | number_inputs.append(x[feature.name].view(-1, 1)) 65 | 66 | embeddings = OrderedDict() 67 | for feature in self.features.category_features: 68 | embeddings[feature.name] = self.embeddings[ 69 | feature.name](x[feature.name]) 70 | 71 | for feature in self.features.sequence_features: 72 | if not self._is_attention_feature(feature): 73 | embeddings[feature.name] = self._sequence_poolings[ 74 | feature.name](self.embeddings[ 75 | feature.name](x[feature.name])) 76 | 77 | auxiliary_losses = [] 78 | for attention_group in self.attention_groups: 79 | query = torch.cat( 80 | [embeddings[pair['ad']] 81 | for pair in attention_group.pairs], 82 | dim=-1) 83 | pos_hist = torch.cat( 84 | [self.embeddings[pair['pos_hist']]( 85 | x[pair['pos_hist']]) for pair in attention_group.pairs], 86 | dim=-1) 87 | keys_length = torch.min(torch.cat( 88 | [x[f"__{pair['pos_hist']}_length"].view(-1, 1) 89 | for pair in attention_group.pairs], 90 | dim=-1), dim=-1)[0] 91 | neg_hist = None 92 | if self.use_negsampling: 93 | neg_hist = torch.cat( 94 | [self.embeddings[pair['neg_hist']]( 95 | x[pair['neg_hist']]) 96 | for pair in attention_group.pairs], 97 | dim=-1) 98 | embeddings[attention_group.name], tmp_loss = ( 99 | self._attention_poolings[attention_group.name]( 100 | query, pos_hist, keys_length, neg_hist)) 101 | if tmp_loss is not None: 102 | auxiliary_losses.append(tmp_loss) 103 | 104 | emb_concat = torch.cat(number_inputs + [ 105 | emb for emb in embeddings.values()], dim=-1) 106 | 107 | final_layer_inputs = self.mlp(emb_concat) 108 | 109 | output = self.final_layer(final_layer_inputs) 110 | 111 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 112 | output = torch.sigmoid(output) 113 | elif self.num_classes > 1 and self.final_activation == 'softmax': 114 | output = torch.softmax(output, dim=1) 115 | elif self.final_activation: 116 | raise NotImplementedError( 117 | f"pair (final_activation: {self.final_activation}, " 118 | f"num_classes: {self.num_classes}) is not implemented") 119 | 120 | auxiliary_avg_loss = None 121 | if auxiliary_losses: 122 | auxiliary_avg_loss = auxiliary_losses[0] 123 | size = len(auxiliary_losses) 124 | for i in range(1, size): 125 | auxiliary_avg_loss += auxiliary_losses[i] 126 | auxiliary_avg_loss /= size 127 | 128 | return output, auxiliary_avg_loss 129 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/din.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Interest Network. 3 | """ 4 | 5 | from .nn import Attention 6 | from .interest_net import InterestNet 7 | 8 | 9 | class DIN(InterestNet): 10 | """Deep Interest Network.
11 | 12 | Parameters 13 | ---------- 14 | features : Features 15 | 16 | attention_groups : list of AttentionGroup 17 | 18 | num_classes : int 19 | Number of classes. 20 | 21 | embedding_size : int 22 | Size of embedding. 23 | 24 | hidden_layers : list 25 | Size of hidden layers. 26 | Example: [96, 32] 27 | 28 | dnn_activation : str 29 | Activation function of deep layers. 30 | Example: relu 31 | 32 | final_activation : str 33 | Activation function of output. 34 | 35 | dropout : float 36 | Dropout rate. 37 | """ 38 | def __init__(self, *args, **kwargs): 39 | super(DIN, self).__init__(*args, **kwargs) 40 | 41 | def create_attention_fn(self, attention_group): 42 | return Attention( 43 | attention_group.pairs_count * self.embedding_size, 44 | hidden_layers=attention_group.hidden_layers, 45 | dropout=attention_group.att_dropout, 46 | activation=attention_group.activation) 47 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/dnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Neural Network. 3 | """ 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .base import EmbeddingMixin 10 | from .nn import MLP, MaxPooling 11 | from .utils import init_weights 12 | 13 | 14 | class DNN(nn.Module, EmbeddingMixin): 15 | """Deep Neural Network. 16 | 17 | Parameters 18 | ---------- 19 | features : Features 20 | 21 | num_classes : int 22 | Number of classes. 23 | 24 | embedding_size : int 25 | Size of embedding. 26 | 27 | hidden_layers : list 28 | Size of hidden layers. 29 | Example: [96, 32] 30 | 31 | activation : str 32 | Activation function. 33 | Example: relu 34 | 35 | final_activation : str 36 | Activation function of output. 37 | 38 | dropout : float 39 | Dropout rate. 
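Examples -------- A minimal sketch, assuming a fitted Features object named features (layer sizes here are illustrative): model = DNN(features, num_classes=2, embedding_size=16, hidden_layers=[64, 32], final_activation='sigmoid', dropout=0.3)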
40 | """ 41 | def __init__(self, features, num_classes, embedding_size, hidden_layers, 42 | activation='relu', final_activation=None, dropout=0.0): 43 | super(DNN, self).__init__() 44 | self.features = features 45 | self.num_classes = num_classes 46 | self.final_activation = final_activation 47 | 48 | self.embeddings, self.embedding_sizes = self.build_embeddings( 49 | embedding_size) 50 | 51 | self._sequence_poolings = OrderedDict() 52 | 53 | total_embedding_sizes = 0 54 | for feature in self.features.category_features: 55 | total_embedding_sizes += ( 56 | self.embedding_sizes[feature.name]) 57 | 58 | for feature in self.features.sequence_features: 59 | self._sequence_poolings[feature.name] = MaxPooling(1) 60 | self.add_module( 61 | f"pooling:{feature.name}", 62 | self._sequence_poolings[feature.name]) 63 | total_embedding_sizes += ( 64 | self.embedding_sizes[feature.name]) 65 | 66 | total_input_size = (total_embedding_sizes + 67 | len(self.features.number_features)) 68 | self.mlp = MLP( 69 | total_input_size, 70 | hidden_layers, 71 | dropout=dropout, batchnorm=True, activation=activation) 72 | final_layer_input_size = hidden_layers[-1] 73 | 74 | output_size = self.num_classes 75 | 76 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 77 | output_size -= 1 78 | 79 | self.final_layer = nn.Linear(final_layer_input_size, output_size) 80 | 81 | self.apply(init_weights) 82 | 83 | def forward(self, x): 84 | final_layer_inputs = list() 85 | 86 | # linear 87 | number_inputs = list() 88 | for feature in self.features.number_features: 89 | number_inputs.append(x[feature.name].view(-1, 1)) 90 | 91 | embeddings = list() 92 | for feature in self.features.category_features: 93 | embeddings.append( 94 | self.embeddings[feature.name](x[feature.name])) 95 | 96 | for feature in self.features.sequence_features: 97 | embeddings.append( 98 | self._sequence_poolings[feature.name]( 99 | self.embeddings[feature.name](x[feature.name]))) 100 | 101 | emb_concat = torch.cat(number_inputs + embeddings, dim=1) 102 | 103 | final_layer_inputs = self.mlp(emb_concat) 104 | 105 | output = self.final_layer(final_layer_inputs) 106 | 107 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 108 | output = torch.sigmoid(output) 109 | elif self.num_classes > 1 and self.final_activation == 'softmax': 110 | output = torch.softmax(output) 111 | elif self.final_activation: 112 | raise NotImplementedError( 113 | f"pair (final_activation: {self.final_activation}, " 114 | f"num_classes: {self.num_classes}) is not implemented") 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/functions.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm_notebook, tqdm 2 | 3 | import numpy as np 4 | 5 | import torch 6 | import torch.utils.data as data 7 | 8 | from prediction_flow.pytorch.data import Dataset 9 | from prediction_flow.features import Features 10 | 11 | 12 | def __to_gpu(device, batch): 13 | for key, tensor in batch.items(): 14 | batch[key] = tensor.to(device) 15 | 16 | 17 | def fit(epochs, model, loss, optimizer, train_loader, 18 | valid_loader=None, notebook=False, 19 | auxiliary_loss_rate=0.0): 20 | if notebook: 21 | epoch_bar = tqdm_notebook( 22 | desc='training routine', total=epochs, position=0) 23 | train_bar = tqdm_notebook( 24 | desc='train', total=len(train_loader), position=1) 25 | if valid_loader: 26 | valid_bar = tqdm_notebook( 27 | desc='valid', 
total=len(valid_loader), position=2) 28 | else: 29 | epoch_bar = tqdm( 30 | desc='training routine', total=epochs, position=0) 31 | train_bar = tqdm( 32 | desc='train', total=len(train_loader), position=1) 33 | if valid_loader: 34 | valid_bar = tqdm( 35 | desc='valid', total=len(valid_loader), position=2) 36 | 37 | use_cuda = torch.cuda.is_available() 38 | device = torch.device("cuda:0" if use_cuda else "cpu") 39 | 40 | if use_cuda: 41 | print("GPU is available, transfer model to GPU.") 42 | model = model.to(device) 43 | 44 | losses = [] 45 | 46 | for epoch in range(epochs): 47 | model.train() 48 | running_loss = 0 49 | auxiliary_running_loss = 0 50 | for index, batch in enumerate(train_loader): 51 | if use_cuda: 52 | __to_gpu(device, batch) 53 | label = batch['label'] 54 | # step 1. zero the gradients 55 | optimizer.zero_grad() 56 | # step 2. compute the output 57 | pred = model(batch) 58 | auxiliary_loss = None 59 | if isinstance(pred, tuple): 60 | pred, auxiliary_loss = pred 61 | if auxiliary_loss is not None: 62 | auxiliary_running_loss += ( 63 | (auxiliary_loss.item() - 64 | auxiliary_running_loss) / (index + 1)) 65 | # step 3. compute the loss (plus the weighted auxiliary loss) 66 | loss_t = loss(pred, label) 67 | if auxiliary_loss is not None: 68 | loss_t += auxiliary_loss_rate * auxiliary_loss 69 | running_loss += (loss_t.item() - running_loss) / (index + 1) 70 | # step 4. use loss to produce gradients 71 | loss_t.backward() 72 | # step 5. use optimizer to take gradient step 73 | optimizer.step() 74 | # update bar 75 | train_bar.set_postfix(loss=running_loss, epoch=epoch) 76 | train_bar.update() 77 | train_bar.reset() 78 | train_loss = running_loss 79 | train_auxiliary_loss = auxiliary_running_loss 80 | 81 | valid_loss = valid_auxiliary_loss = 0 82 | if valid_loader: 83 | model.eval() 84 | running_loss = 0 85 | auxiliary_running_loss = 0 86 | with torch.no_grad(): 87 | for index, batch in enumerate(valid_loader): 88 | if use_cuda: 89 | __to_gpu(device, batch) 90 | label = batch['label'] 91 | # step 1. compute the output 92 | pred = model(batch) 93 | # step 2. compute the loss 94 | # (models such as DIEN return a (pred, aux_loss) tuple) 95 | auxiliary_loss = None 96 | if isinstance(pred, tuple): 97 | pred, auxiliary_loss = pred 98 | if auxiliary_loss is not None: 99 | auxiliary_running_loss += ( 100 | (auxiliary_loss.item() - 101 | auxiliary_running_loss) / (index + 1)) 102 | loss_t = loss(pred, label) 103 | # add the weighted auxiliary loss, as in training 104 | if auxiliary_loss is not None: 105 | loss_t += auxiliary_loss_rate * auxiliary_loss 106 | running_loss += ( 107 | loss_t.item() - running_loss) / (index + 1) 108 | # update bar 109 | valid_bar.set_postfix( 110 | loss=running_loss, epoch=epoch) 111 | valid_bar.update() 112 | valid_loss = running_loss 113 | valid_auxiliary_loss = auxiliary_running_loss 114 | valid_bar.reset() 115 | 116 | epoch_bar.set_postfix( 117 | train_loss=train_loss, valid_loss=valid_loss, epoch=epoch) 118 | epoch_bar.update() 119 | losses.append( 120 | {'train_loss': train_loss, 121 | 'valid_loss': valid_loss, 122 | 'train_auxiliary_loss': train_auxiliary_loss, 123 | 'valid_auxiliary_loss': valid_auxiliary_loss}) 124 | 125 | return losses 126 | 127 | 128 | def predict(model, test_loader): 129 | use_cuda = torch.cuda.is_available() 130 | device = torch.device("cuda:0" if use_cuda else "cpu") 131 | model = model.to(device) 132 | model.zero_grad() 133 | model.eval() 134 | 135 | preds = list() 136 | with torch.no_grad(): 137 | for _, batch in enumerate(test_loader): 138 | if use_cuda: 139 | __to_gpu(device, batch) 140 | # compute the output 141 | pred = model(batch) 142 | if isinstance(pred, tuple): 143 | pred, auxiliary_loss = pred 144 | preds.append(pred.cpu().numpy()) 145 | 146 | return np.vstack(preds) 147 | 148 | 149 | def create_dataloader_fn( 150 | number_features, category_features, sequence_features, batch_size, 151 | train_df, label_col='label', test_df=None, num_workers=0): 152 | 153 | features = Features( 154 | number_features=number_features, 155 | category_features=category_features, 156 | sequence_features=sequence_features) 157 | 158 | features = features.fit(train_df) 159 | 160 | train_X_map = features.transform(train_df) 161 | train_y = train_df[label_col].values 162 | train_dataset = Dataset(features, train_X_map, train_y) 163 | train_loader = data.DataLoader( 164 | train_dataset, batch_size=batch_size, 165 | shuffle=True, num_workers=num_workers) 166 | 167 | test_loader = None 168 | if test_df is not None: 169 | test_X_map = features.transform(test_df) 170 | test_y = None 171 | if label_col in set(test_df.columns): 172 | test_y = test_df[label_col].values 173 | test_dataset = Dataset(features, test_X_map, test_y) 174 | test_loader = data.DataLoader( 175 | test_dataset, batch_size=batch_size, 176 | shuffle=False, num_workers=num_workers) 177 | 178 | return features, train_loader, test_loader 179 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/interest_net.py: -------------------------------------------------------------------------------- 1 | """Interest Net. 2 | """ 3 | 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .base import EmbeddingMixin 10 | from .nn import MLP, MaxPooling 11 | from .utils import init_weights 12 | 13 | 14 | class AttentionGroup(object): 15 | """ This class is used to identify which features should be 16 | processed by attention. All candidate features and all behavior 17 | sequential features must have the same embedding size. All behavior 18 | sequential features must have the same maximum length.
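A typical group ties one candidate feature to its behavior sequence, for example (feature names here are illustrative): AttentionGroup(name='item_group', pairs=[{'ad': 'movieId', 'pos_hist': 'clickedMovieIds'}], hidden_layers=[16, 8]).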
19 | 20 | Parameters 21 | ---------- 22 | name : str 23 | Unique group name. 24 | 25 | pairs : list of dict 26 | Example : 27 | [{'ad': 'item_id', 28 | 'pos_hist': 'clicked_item_ids', 29 | 'neg_hist': 'neg_item_ids'}, 30 | {'ad': 'item_category', 31 | 'pos_hist': 'clicked_item_categories', 32 | 'neg_hist': 'neg_item_categories'}] 33 | 34 | hidden_layers : iterable 35 | Hidden layer sizes of attention. 36 | 37 | activation : str 38 | Activation function of attention. 39 | Example: prelu 40 | 41 | att_dropout : float 42 | Dropout rate of attention. 43 | 44 | gru_type : str 45 | Type of GRU. GRU, AIGRU, AGRU and AUGRU are supported. 46 | 47 | gru_dropout : float 48 | Dropout rate of GRU. 49 | """ 50 | def __init__(self, name, pairs, 51 | hidden_layers, activation='prelu', att_dropout=0.0, 52 | gru_type='GRU', gru_dropout=0.0): 53 | self.name = name 54 | self.pairs = pairs 55 | self.hidden_layers = hidden_layers 56 | self.activation = activation 57 | self.att_dropout = att_dropout 58 | self.gru_type = gru_type 59 | self.gru_dropout = gru_dropout 60 | 61 | self.related_feature_names = set() 62 | self.neg_feature_names = set() 63 | for pair in pairs: 64 | self.related_feature_names.add(pair['ad']) 65 | self.related_feature_names.add(pair['pos_hist']) 66 | if 'neg_hist' in pair: 67 | self.related_feature_names.add(pair['neg_hist']) 68 | self.neg_feature_names.add(pair['neg_hist']) 69 | 70 | def is_attention_feature(self, feature_name): 71 | if feature_name in self.related_feature_names: 72 | return True 73 | return False 74 | 75 | def is_neg_sampling_feature(self, feature_name): 76 | if feature_name in self.neg_feature_names: 77 | return True 78 | return False 79 | 80 | @property 81 | def pairs_count(self): 82 | return len(self.pairs) 83 | 84 | 85 | class InterestNet(nn.Module, EmbeddingMixin): 86 | """Interest Network. 87 | 88 | Parameters 89 | ---------- 90 | features : Features 91 | 92 | attention_groups : list of AttentionGroup 93 | 94 | num_classes : int 95 | Number of classes. 96 | 97 | embedding_size : int 98 | Size of embedding. 99 | 100 | hidden_layers : list 101 | Size of hidden layers. 102 | Example: [96, 32] 103 | 104 | dnn_activation : str 105 | Activation function of deep layers. 106 | Example: relu 107 | 108 | final_activation : str 109 | Activation function of output. 110 | 111 | dropout : float 112 | Dropout rate. 
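Examples -------- InterestNet itself is abstract (create_attention_fn must be implemented), so this sketch uses the DIN subclass; the fitted Features object and feature names are assumed: group = AttentionGroup(name='group1', pairs=[{'ad': 'movieId', 'pos_hist': 'clickedMovieIds'}], hidden_layers=[16, 8]) model = DIN(features, [group], num_classes=2, embedding_size=16, hidden_layers=[32, 16], final_activation='sigmoid')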
113 | """ 114 | def _is_attention_feature(self, feature): 115 | for group in self.attention_groups: 116 | if group.is_attention_feature(feature.name): 117 | return True 118 | return False 119 | 120 | def _is_neg_sampling_feature(self, feature): 121 | for group in self.attention_groups: 122 | if group.is_neg_sampling_feature(feature.name): 123 | return True 124 | return False 125 | 126 | def create_attention_fn(self, attention_group): 127 | raise NotImplementedError( 128 | "Please implement the func to create attention") 129 | 130 | def __init__(self, features, attention_groups, num_classes, embedding_size, 131 | hidden_layers, dnn_activation='prelu', final_activation=None, 132 | dropout=0.0): 133 | super(InterestNet, self).__init__() 134 | self.features = features 135 | self.attention_groups = attention_groups 136 | self.num_classes = num_classes 137 | self.embedding_size = embedding_size 138 | self.hidden_layers = hidden_layers 139 | self.dnn_activation = dnn_activation 140 | self.final_activation = final_activation 141 | self.dropout = dropout 142 | 143 | self.embeddings, self.embedding_sizes = self.build_embeddings( 144 | embedding_size) 145 | 146 | self._sequence_poolings = OrderedDict() 147 | self._attention_poolings = OrderedDict() 148 | 149 | total_embedding_sizes = 0 150 | for feature in self.features.category_features: 151 | total_embedding_sizes += ( 152 | self.embedding_sizes[feature.name]) 153 | 154 | for feature in self.features.sequence_features: 155 | if not self._is_neg_sampling_feature(feature): 156 | total_embedding_sizes += ( 157 | self.embedding_sizes[feature.name]) 158 | if not self._is_attention_feature(feature): 159 | self._sequence_poolings[feature.name] = MaxPooling(1) 160 | self.add_module( 161 | f"pooling:{feature.name}", 162 | self._sequence_poolings[feature.name]) 163 | 164 | # attention 165 | for attention_group in self.attention_groups: 166 | self._attention_poolings[attention_group.name] = ( 167 | self.create_attention_fn(attention_group)) 168 | self.add_module( 169 | f"attention_pooling:{attention_group.name}", 170 | self._attention_poolings[attention_group.name]) 171 | 172 | total_input_size = (total_embedding_sizes + 173 | len(self.features.number_features)) 174 | self.mlp = MLP( 175 | total_input_size, 176 | hidden_layers, 177 | dropout=dropout, batchnorm=True, activation=dnn_activation) 178 | final_layer_input_size = hidden_layers[-1] 179 | 180 | output_size = self.num_classes 181 | 182 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 183 | output_size -= 1 184 | 185 | self.final_layer = nn.Linear(final_layer_input_size, output_size) 186 | 187 | self.apply(init_weights) 188 | 189 | def forward(self, x): 190 | final_layer_inputs = list() 191 | 192 | # linear 193 | number_inputs = list() 194 | for feature in self.features.number_features: 195 | number_inputs.append(x[feature.name].view(-1, 1)) 196 | 197 | embeddings = OrderedDict() 198 | for feature in self.features.category_features: 199 | embeddings[feature.name] = self.embeddings[ 200 | feature.name](x[feature.name]) 201 | 202 | for feature in self.features.sequence_features: 203 | if not self._is_attention_feature(feature): 204 | embeddings[feature.name] = self._sequence_poolings[ 205 | feature.name](self.embeddings[ 206 | feature.name](x[feature.name])) 207 | 208 | for attention_group in self.attention_groups: 209 | query = torch.cat( 210 | [embeddings[pair['ad']] 211 | for pair in attention_group.pairs], 212 | dim=-1) 213 | keys = torch.cat( 214 | 
[self.embeddings[pair['pos_hist']]( 215 | x[pair['pos_hist']]) for pair in attention_group.pairs], 216 | dim=-1) 217 | keys_length = torch.min(torch.cat( 218 | [x[f"__{pair['pos_hist']}_length"].view(-1, 1) 219 | for pair in attention_group.pairs], 220 | dim=-1), dim=-1)[0] 221 | embeddings[attention_group.name] = self._attention_poolings[ 222 | attention_group.name](query, keys, keys_length) 223 | 224 | emb_concat = torch.cat(number_inputs + [ 225 | emb for emb in embeddings.values()], dim=-1) 226 | 227 | final_layer_inputs = self.mlp(emb_concat) 228 | 229 | output = self.final_layer(final_layer_inputs) 230 | 231 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 232 | output = torch.sigmoid(output) 233 | elif self.num_classes > 1 and self.final_activation == 'softmax': 234 | output = torch.softmax(output, dim=1) 235 | elif self.final_activation: 236 | raise NotImplementedError( 237 | f"pair (final_activation: {self.final_activation}, " 238 | f"num_classes: {self.num_classes}) is not implemented") 239 | 240 | return output 241 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import MLP 2 | from .pooling import MaxPooling, SumPooling 3 | from .fm import FM 4 | from .attention import Attention 5 | from .rnn import AttentionGRUCell, AttentionUpdateGateGRUCell, DynamicGRU 6 | from .interest import Interest 7 | 8 | 9 | __all__ = [ 10 | 'MLP', 11 | 'MaxPooling', 12 | 'SumPooling', 13 | 'FM', 14 | 'Attention', 15 | 'AttentionGRUCell', 16 | 'AttentionUpdateGateGRUCell', 17 | 'DynamicGRU', 18 | 'Interest' 19 | ] 20 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Attention used by DIN model. 3 | 4 | Reference: 5 | Deep Interest Network for Click-Through Rate Prediction 6 | https://github.com/zhougr1993/DeepInterestNetwork/blob/master/din/model.py 7 | """ 8 | 9 | # Authors: Hongwei Zhang 10 | # License: MIT 11 | 12 | 13 | import numpy as np 14 | 15 | import torch 16 | import torch.nn as nn 17 | from torch.nn import functional as F 18 | 19 | from .mlp import MLP 20 | 21 | 22 | class Attention(nn.Module): 23 | """Attention layer. 24 | 25 | Parameters 26 | ---------- 27 | input_size : int 28 | Size of input. 29 | 30 | hidden_layers : iterable 31 | Hidden layer sizes. 32 | 33 | dropout : float 34 | Dropout rate. 35 | 36 | activation : str 37 | Name of activation function. relu, prelu and sigmoid are supported. 38 | 39 | return_scores : bool 40 | Return attention scores instead of weighted sum pooling result.
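Examples -------- A shape sketch with illustrative sizes (B=2, T=4, H=3): att = Attention(3, hidden_layers=[8]) query = torch.rand(2, 3) # [B, H] keys = torch.rand(2, 4, 3) # [B, T, H] keys_length = torch.tensor([3, 4]) pooled = att(query, keys, keys_length) # [B, H]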
41 | """ 42 | def __init__( 43 | self, 44 | input_size, 45 | hidden_layers, 46 | dropout=0.0, 47 | batchnorm=True, 48 | activation='prelu', 49 | return_scores=False): 50 | super(Attention, self).__init__() 51 | self.return_scores = return_scores 52 | self.mlp = MLP( 53 | input_size=input_size * 4, 54 | hidden_layers=hidden_layers, 55 | dropout=dropout, 56 | batchnorm=batchnorm, 57 | activation=activation) 58 | self.fc = nn.Linear(hidden_layers[-1], 1) 59 | 60 | def forward(self, query, keys, keys_length): 61 | """ 62 | Parameters 63 | ---------- 64 | query: 2D tensor, [B, H] 65 | keys: 3D tensor, [B, T, H] 66 | keys_length: 1D tensor, [B] 67 | 68 | Returns 69 | ------- 70 | outputs: 2D tensor, if return_scores=False [B, H], otherwise [B, T] 71 | """ 72 | batch_size, max_length, dim = keys.size() 73 | 74 | query = query.unsqueeze(1).expand(-1, max_length, -1) 75 | 76 | din_all = torch.cat( 77 | [query, keys, query - keys, query * keys], dim=-1) 78 | 79 | din_all = din_all.view(batch_size * max_length, -1) 80 | 81 | outputs = self.mlp(din_all) 82 | 83 | outputs = self.fc(outputs).view(batch_size, max_length) # [B, T] 84 | 85 | # Scale 86 | outputs = outputs / (dim ** 0.5) 87 | 88 | # Mask 89 | mask = (torch.arange(max_length, device=keys_length.device).repeat( 90 | batch_size, 1) < keys_length.view(-1, 1)) 91 | outputs[~mask] = -np.inf 92 | 93 | # Activation 94 | outputs = F.softmax(outputs, dim=1) # [B, T] 95 | 96 | if not self.return_scores: 97 | # Weighted sum 98 | outputs = torch.matmul( 99 | outputs.unsqueeze(1), keys).squeeze() # [B, H] 100 | 101 | return outputs 102 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/fm.py: -------------------------------------------------------------------------------- 1 | """ 2 | FM layer. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class FM(nn.Module): 14 | """FM layer. 15 | """ 16 | def __init__(self, reduce_sum=True): 17 | super(FM, self).__init__() 18 | self.reduce_sum = reduce_sum 19 | 20 | def forward(self, x): 21 | sum_squared = torch.pow(torch.sum(x, dim=1), 2) 22 | squared_sum = torch.sum(torch.pow(x, 2), dim=1) 23 | second_order = sum_squared - squared_sum 24 | if self.reduce_sum: 25 | output = torch.sum(second_order, dim=1, keepdim=True) 26 | return 0.5 * output 27 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/interest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Iinterest part used by DIEN model. 3 | 4 | Reference: 5 | Deep Interest Evolution Network for Click-Through Rate Prediction 6 | https://arxiv.org/pdf/1809.03672.pdf 7 | """ 8 | 9 | # Authors: Hongwei Zhang 10 | # License: MIT 11 | 12 | 13 | from collections import OrderedDict 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 19 | 20 | from .attention import Attention 21 | from .rnn import DynamicGRU 22 | 23 | 24 | class AuxiliaryNet(nn.Module): 25 | """ NN for Auxiliary Loss. 26 | 27 | Parameters 28 | ---------- 29 | input_size : int 30 | Size of input. 31 | 32 | hidden_layers : iterable 33 | Hidden layer sizes. 34 | 35 | activation : str 36 | Name of activation function. ReLU, PReLU and Sigmoid are supported. 
37 | """ 38 | def __init__(self, input_size, hidden_layers, activation='sigmoid'): 39 | super(AuxiliaryNet, self).__init__() 40 | modules = OrderedDict() 41 | 42 | previous_size = input_size 43 | for index, hidden_layer in enumerate(hidden_layers): 44 | modules[f"dense{index}"] = nn.Linear(previous_size, hidden_layer) 45 | if activation: 46 | if activation.lower() == 'relu': 47 | modules[f"activation{index}"] = nn.ReLU() 48 | elif activation.lower() == 'prelu': 49 | modules[f"activation{index}"] = nn.PReLU() 50 | elif activation.lower() == 'sigmoid': 51 | modules[f"activation{index}"] = nn.Sigmoid() 52 | else: 53 | raise NotImplementedError(f"{activation} is not supported") 54 | previous_size = hidden_layer 55 | modules["final_layer"] = nn.Linear(previous_size, 1) 56 | self._sequential = nn.Sequential(modules) 57 | 58 | def forward(self, input): 59 | return torch.sigmoid(self._sequential(input)) 60 | 61 | 62 | class Interest(nn.Module): 63 | """Interest layer. 64 | 65 | Parameters 66 | ---------- 67 | input_size : int 68 | Size of input. 69 | 70 | gru_type : str 71 | Type of GRU. GRU, AIGRU, AGRU and AUGRU are supported. 72 | 73 | gru_dropout : float 74 | Dropout rate of GRU. 75 | 76 | att_hidden_layers : iterable 77 | Hidden layer sizes of attention. 78 | 79 | att_dropout : float 80 | Dropout rate of attention. 81 | 82 | att_batchnorm : bool 83 | Batchnorm of attention. 84 | 85 | att_activation : str 86 | Activation function name of attention. 87 | relu, prelu and sigmoid are supported. 88 | 89 | use_negsampling : bool 90 | """ 91 | __SUPPORTED_GRU_TYPE__ = ['GRU', 'AIGRU', 'AGRU', 'AUGRU'] 92 | 93 | def __init__( 94 | self, 95 | input_size, 96 | gru_type='GRU', 97 | gru_dropout=0.0, 98 | att_hidden_layers=[80, 40], 99 | att_dropout=0.0, 100 | att_batchnorm=True, 101 | att_activation='prelu', 102 | use_negsampling=False): 103 | super(Interest, self).__init__() 104 | if gru_type not in Interest.__SUPPORTED_GRU_TYPE__: 105 | raise NotImplementedError(f"gru_type: {gru_type} is not supported") 106 | 107 | self.gru_type = gru_type 108 | self.use_negsampling = use_negsampling 109 | 110 | self.interest_extractor = nn.GRU( 111 | input_size=input_size, 112 | hidden_size=input_size, 113 | batch_first=True, 114 | bidirectional=False) 115 | 116 | if self.use_negsampling: 117 | self.auxiliary_net = AuxiliaryNet( 118 | input_size * 2, hidden_layers=[100, 50]) 119 | 120 | if gru_type == 'GRU': 121 | self.interest_evolution = nn.GRU( 122 | input_size=input_size, 123 | hidden_size=input_size, 124 | batch_first=True, 125 | bidirectional=False) 126 | 127 | self.attention = Attention( 128 | input_size=input_size, 129 | hidden_layers=att_hidden_layers, 130 | dropout=att_dropout, 131 | batchnorm=att_batchnorm, 132 | activation=att_activation) 133 | elif gru_type == 'AIGRU': 134 | self.attention = Attention( 135 | input_size=input_size, 136 | hidden_layers=att_hidden_layers, 137 | dropout=att_dropout, 138 | batchnorm=att_batchnorm, 139 | activation=att_activation, 140 | return_scores=True) 141 | 142 | self.interest_evolution = nn.GRU( 143 | input_size=input_size, 144 | hidden_size=input_size, 145 | batch_first=True, 146 | bidirectional=False) 147 | elif gru_type == 'AGRU' or gru_type == 'AUGRU': 148 | self.attention = Attention( 149 | input_size=input_size, 150 | hidden_layers=att_hidden_layers, 151 | dropout=att_dropout, 152 | batchnorm=att_batchnorm, 153 | activation=att_activation, 154 | return_scores=True) 155 | 156 | self.interest_evolution = DynamicGRU( 157 | input_size=input_size, 158 | 
hidden_size=input_size, 159 | gru_type=gru_type) 160 | 161 | @staticmethod 162 | def _get_last_state(states, keys_length): 163 | # states [B, T, H] 164 | batch_size, max_seq_length, hidden_size = states.size() 165 | 166 | mask = (torch.arange(max_seq_length, device=keys_length.device).repeat( 167 | batch_size, 1) == (keys_length.view(-1, 1) - 1)) 168 | 169 | return states[mask] 170 | 171 | def cal_auxiliary_loss( 172 | self, states, click_seq, noclick_seq, keys_length): 173 | # states [B, T, H] 174 | # click_seq [B, T, H] 175 | # noclick_seq [B, T, H] 176 | # keys_length [B] 177 | batch_size, max_seq_length, embedding_size = states.size() 178 | 179 | mask = (torch.arange(max_seq_length, device=states.device).repeat( 180 | batch_size, 1) < keys_length.view(-1, 1)).float() 181 | 182 | click_input = torch.cat([states, click_seq], dim=-1) 183 | noclick_input = torch.cat([states, noclick_seq], dim=-1) 184 | embedding_size = embedding_size * 2 185 | 186 | click_p = self.auxiliary_net( 187 | click_input.view( 188 | batch_size * max_seq_length, embedding_size)).view( 189 | batch_size, max_seq_length)[mask > 0].view(-1, 1) 190 | click_target = torch.ones( 191 | click_p.size(), dtype=torch.float, device=click_p.device) 192 | 193 | noclick_p = self.auxiliary_net( 194 | noclick_input.view( 195 | batch_size * max_seq_length, embedding_size)).view( 196 | batch_size, max_seq_length)[mask > 0].view(-1, 1) 197 | noclick_target = torch.zeros( 198 | noclick_p.size(), dtype=torch.float, device=noclick_p.device) 199 | 200 | loss = F.binary_cross_entropy( 201 | torch.cat([click_p, noclick_p], dim=0), 202 | torch.cat([click_target, noclick_target], dim=0)) 203 | 204 | return loss 205 | 206 | def forward(self, query, keys, keys_length, neg_keys=None): 207 | """ 208 | Parameters 209 | ---------- 210 | query: 2D tensor, [B, H] 211 | keys: 3D tensor, [B, T, H] 212 | keys_length: 1D tensor, [B] 213 | neg_keys: 3D tensor, [B, T, H] 214 | 215 | Returns 216 | ------- 217 | outputs: 2D tensor, [B, H] 218 | """ 219 | batch_size, max_length, dim = keys.size() 220 | 221 | packed_keys = pack_padded_sequence( 222 | keys, 223 | lengths=keys_length.squeeze(), 224 | batch_first=True, 225 | enforce_sorted=False) 226 | 227 | packed_interests, _ = self.interest_extractor(packed_keys) 228 | 229 | aloss = None 230 | if (self.gru_type != 'GRU') or self.use_negsampling: 231 | interests, _ = pad_packed_sequence( 232 | packed_interests, 233 | batch_first=True, 234 | padding_value=0.0, 235 | total_length=max_length) 236 | 237 | if self.use_negsampling: 238 | aloss = self.cal_auxiliary_loss( 239 | interests[:, :-1, :], 240 | keys[:, 1:, :], 241 | neg_keys[:, 1:, :], 242 | keys_length - 1) 243 | 244 | if self.gru_type == 'GRU': 245 | packed_interests, _ = self.interest_evolution(packed_interests) 246 | 247 | interests, _ = pad_packed_sequence( 248 | packed_interests, 249 | batch_first=True, 250 | padding_value=0.0, 251 | total_length=max_length) 252 | 253 | outputs = self.attention(query, interests, keys_length) 254 | 255 | elif self.gru_type == 'AIGRU': 256 | # attention 257 | scores = self.attention(query, interests, keys_length) 258 | interests = interests * scores.unsqueeze(-1) 259 | 260 | packed_interests = pack_padded_sequence( 261 | interests, 262 | lengths=keys_length.squeeze(), 263 | batch_first=True, 264 | enforce_sorted=False) 265 | _, outputs = self.interest_evolution(packed_interests) 266 | outputs = outputs.squeeze() 267 | 268 | elif self.gru_type == 'AGRU' or self.gru_type == 'AUGRU': 269 | # attention 270 | scores = 
self.attention(query, interests, keys_length) 271 | 272 | packed_interests = pack_padded_sequence( 273 | interests, 274 | lengths=keys_length.squeeze(), 275 | batch_first=True, 276 | enforce_sorted=False) 277 | 278 | packed_scores = pack_padded_sequence( 279 | scores, 280 | lengths=keys_length.squeeze(), 281 | batch_first=True, 282 | enforce_sorted=False) 283 | 284 | outputs, _ = pad_packed_sequence( 285 | self.interest_evolution( 286 | packed_interests, packed_scores), batch_first=True) 287 | # pick last state 288 | outputs = Interest._get_last_state( 289 | outputs, keys_length.squeeze()) 290 | 291 | return outputs, aloss 292 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/mlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Multilayer perceptron torch module. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from collections import OrderedDict 10 | 11 | import torch.nn as nn 12 | 13 | 14 | class MLP(nn.Module): 15 | """Multilayer perceptron torch module. 16 | 17 | Parameters 18 | ---------- 19 | input_size : int 20 | Size of input. 21 | 22 | hidden_layers : iterable 23 | Hidden layer sizes. 24 | 25 | dropout : float 26 | Dropout rate. 27 | 28 | activation : str 29 | Name of activation function. ReLU, PReLU and Sigmoid are supported. 30 | """ 31 | def __init__(self, input_size, hidden_layers, 32 | dropout=0.0, batchnorm=True, activation='relu'): 33 | super(MLP, self).__init__() 34 | modules = OrderedDict() 35 | 36 | previous_size = input_size 37 | for index, hidden_layer in enumerate(hidden_layers): 38 | modules[f"dense{index}"] = nn.Linear(previous_size, hidden_layer) 39 | if batchnorm: 40 | modules[f"batchnorm{index}"] = nn.BatchNorm1d(hidden_layer) 41 | if activation: 42 | if activation.lower() == 'relu': 43 | modules[f"activation{index}"] = nn.ReLU() 44 | elif activation.lower() == 'prelu': 45 | modules[f"activation{index}"] = nn.PReLU() 46 | elif activation.lower() == 'sigmoid': 47 | modules[f"activation{index}"] = nn.Sigmoid() 48 | else: 49 | raise NotImplementedError(f"{activation} is not supported") 50 | if dropout: 51 | modules[f"dropout{index}"] = nn.Dropout(dropout) 52 | previous_size = hidden_layer 53 | self._sequential = nn.Sequential(modules) 54 | 55 | def forward(self, input): 56 | return self._sequential(input) 57 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/pooling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pooling layers. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class MaxPooling(nn.Module): 14 | """Max Pooling. 15 | 16 | Parameters 17 | ---------- 18 | dim : int 19 | The dimension to do pooling. 20 | 21 | Attributes 22 | ---------- 23 | dim : int 24 | The dimension to do pooling. 25 | """ 26 | def __init__(self, dim): 27 | super(MaxPooling, self).__init__() 28 | self.dim = dim 29 | 30 | def forward(self, input): 31 | return torch.max(input, self.dim)[0] 32 | 33 | 34 | class SumPooling(nn.Module): 35 | """Sum Pooling. 36 | 37 | Parameters 38 | ---------- 39 | dim : int 40 | The dimension to do pooling. 41 | 42 | Attributes 43 | ---------- 44 | dim : int 45 | The dimension to do pooling. 
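Examples -------- An illustrative call that reduces the sequence dimension (dim=1): pool = SumPooling(1) emb = torch.rand(2, 4, 8) # [B, T, H] pooled = pool(emb) # [B, H]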
46 | """ 47 | def __init__(self, dim): 48 | super(SumPooling, self).__init__() 49 | self.dim = dim 50 | 51 | def forward(self, input): 52 | return torch.sum(input, self.dim) 53 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/rnn.py: -------------------------------------------------------------------------------- 1 | """AttentionGRU and AttentionUpdateGateGRU. 2 | """ 3 | 4 | # Authors: Hongwei Zhang 5 | # License: MIT 6 | 7 | 8 | import math 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.init as init 13 | import torch.nn.functional as F 14 | from torch.nn.utils.rnn import PackedSequence 15 | 16 | 17 | class AttentionGRUCell(nn.Module): 18 | def __init__(self, input_size, hidden_size, bias=True): 19 | super(AttentionGRUCell, self).__init__() 20 | self.input_size = input_size 21 | self.hidden_size = hidden_size 22 | self.bias = bias 23 | # (W_ir|W_iz|W_in) 24 | self.weight_ih = nn.Parameter( 25 | torch.Tensor(3 * hidden_size, input_size)) 26 | # (W_hr|W_hz|W_hn) 27 | self.weight_hh = nn.Parameter( 28 | torch.Tensor(3 * hidden_size, hidden_size)) 29 | if bias: 30 | # (b_ir|b_iz|b_in) 31 | self.bias_ih = nn.Parameter(torch.Tensor(3 * hidden_size)) 32 | # (b_hr|b_hz|b_hn) 33 | self.bias_hh = nn.Parameter(torch.Tensor(3 * hidden_size)) 34 | else: 35 | self.register_parameter('bias_ih', None) 36 | self.register_parameter('bias_hh', None) 37 | self.reset_parameters() 38 | 39 | def reset_parameters(self): 40 | stdv = 1.0 / math.sqrt(self.hidden_size) 41 | for weight in self.parameters(): 42 | init.uniform_(weight, -stdv, stdv) 43 | 44 | def forward(self, input, hx, att_score): 45 | """ 46 | 47 | References 48 | ---------- 49 | https://github.com/pytorch/pytorch/blob/v0.4.1/torch/nn/_functions/rnn.py#L49 50 | """ 51 | 52 | gi = F.linear(input, self.weight_ih, self.bias_ih) 53 | gh = F.linear(hx, self.weight_hh, self.bias_hh) 54 | i_r, i_z, i_n = gi.chunk(3, 1) 55 | h_r, h_z, h_n = gh.chunk(3, 1) 56 | 57 | resetgate = torch.sigmoid(i_r + h_r) 58 | # updategate = torch.sigmoid(i_z + h_z) 59 | newgate = torch.tanh(i_n + resetgate * h_n) 60 | # hy = newgate + updategate * (hx - newgate) 61 | 62 | att_score = att_score.view(-1, 1) 63 | 64 | hy = (1. 
- att_score) * hx + att_score * newgate 65 | 66 | return hy 67 | 68 | 69 | class AttentionUpdateGateGRUCell(nn.Module): 70 | def __init__(self, input_size, hidden_size, bias=True): 71 | super(AttentionUpdateGateGRUCell, self).__init__() 72 | self.input_size = input_size 73 | self.hidden_size = hidden_size 74 | self.bias = bias 75 | # (W_ir|W_iz|W_in) 76 | self.weight_ih = nn.Parameter( 77 | torch.Tensor(3 * hidden_size, input_size)) 78 | # (W_hr|W_hz|W_hn) 79 | self.weight_hh = nn.Parameter( 80 | torch.Tensor(3 * hidden_size, hidden_size)) 81 | if bias: 82 | # (b_ir|b_iz|b_in) 83 | self.bias_ih = nn.Parameter(torch.Tensor(3 * hidden_size)) 84 | # (b_hr|b_hz|b_hn) 85 | self.bias_hh = nn.Parameter(torch.Tensor(3 * hidden_size)) 86 | else: 87 | self.register_parameter('bias_ih', None) 88 | self.register_parameter('bias_hh', None) 89 | self.reset_parameters() 90 | 91 | def reset_parameters(self): 92 | stdv = 1.0 / math.sqrt(self.hidden_size) 93 | for weight in self.parameters(): 94 | init.uniform_(weight, -stdv, stdv) 95 | 96 | def forward(self, input, hx, att_score): 97 | """ 98 | 99 | References 100 | ---------- 101 | https://github.com/pytorch/pytorch/blob/v0.4.1/torch/nn/_functions/rnn.py#L49 102 | """ 103 | 104 | gi = F.linear(input, self.weight_ih, self.bias_ih) 105 | gh = F.linear(hx, self.weight_hh, self.bias_hh) 106 | i_r, i_z, i_n = gi.chunk(3, 1) 107 | h_r, h_z, h_n = gh.chunk(3, 1) 108 | 109 | resetgate = torch.sigmoid(i_r + h_r) 110 | updategate = torch.sigmoid(i_z + h_z) 111 | newgate = torch.tanh(i_n + resetgate * h_n) 112 | 113 | updategate = att_score.view(-1, 1) * updategate 114 | 115 | hy = newgate + updategate * (hx - newgate) 116 | 117 | return hy 118 | 119 | 120 | class DynamicGRU(nn.Module): 121 | def __init__(self, input_size, hidden_size, bias=True, gru_type='AGRU'): 122 | super(DynamicGRU, self).__init__() 123 | self.input_size = input_size 124 | self.hidden_size = hidden_size 125 | 126 | if gru_type == 'AGRU': 127 | self.rnn = AttentionGRUCell(input_size, hidden_size, bias) 128 | elif gru_type == 'AUGRU': 129 | self.rnn = AttentionUpdateGateGRUCell( 130 | input_size, hidden_size, bias) 131 | 132 | def forward(self, input, att_scores, hx=None): 133 | is_packed_input = isinstance(input, PackedSequence) 134 | if not is_packed_input: 135 | raise NotImplementedError( 136 | "DynamicGRU only supports packed input") 137 | 138 | is_packed_att_scores = isinstance(att_scores, PackedSequence) 139 | if not is_packed_att_scores: 140 | raise NotImplementedError( 141 | "DynamicGRU only supports packed att_scores") 142 | 143 | input, batch_sizes, sorted_indices, unsorted_indices = input 144 | att_scores, _, _, _ = att_scores 145 | 146 | max_batch_size = batch_sizes[0] 147 | max_batch_size = int(max_batch_size) 148 | 149 | if hx is None: 150 | hx = torch.zeros( 151 | max_batch_size, self.hidden_size, 152 | dtype=input.dtype, device=input.device) 153 | 154 | outputs = torch.zeros( 155 | input.size(0), self.hidden_size, 156 | dtype=input.dtype, device=input.device) 157 | 158 | begin = 0 159 | for batch in batch_sizes: 160 | new_hx = self.rnn( 161 | input[begin: begin + batch], 162 | hx[0:batch], 163 | att_scores[begin: begin + batch]) 164 | outputs[begin: begin + batch] = new_hx 165 | hx = new_hx 166 | begin += batch 167 | 168 | return PackedSequence( 169 | outputs, batch_sizes, sorted_indices, unsorted_indices) 170 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/pytorch/nn/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_attention.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import Attention 2 | 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn.init as init 7 | 8 | 9 | def test_attention(): 10 | attention = Attention(3, [8], batchnorm=False, activation=None) 11 | 12 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 13 | 14 | keys = torch.tensor([ 15 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 16 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 17 | ], dtype=torch.float) 18 | 19 | keys_length = torch.tensor([3, 4]) 20 | 21 | for param in attention.mlp.parameters(): 22 | init.constant_(param, 1) 23 | 24 | for param in attention.fc.parameters(): 25 | init.constant_(param, 1) 26 | 27 | output = attention(query, keys, keys_length) 28 | 29 | actual = output.detach().numpy() 30 | assert output.size()[0] == 2 31 | assert output.size()[1] == 3 32 | np.testing.assert_array_almost_equal( 33 | actual, np.array([[1.0, 2.0, 3.0], 34 | [0.989024, 1.969694, 2.959199]], dtype=float)) 35 | 36 | def test_attention_return_scores(): 37 | attention = Attention( 38 | 3, [8], batchnorm=False, activation=None, return_scores=True) 39 | 40 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 41 | 42 | keys = torch.tensor([ 43 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 44 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 45 | ], dtype=torch.float) 46 | 47 | keys_length = torch.tensor([3, 4]) 48 | 49 | for param in attention.mlp.parameters(): 50 | init.constant_(param, 1) 51 | 52 | for param in attention.fc.parameters(): 53 | init.constant_(param, 1) 54 | 55 | output = attention(query, keys, keys_length) 56 | 57 | actual = output.detach().numpy() 58 | 59 | expected = np.array( 60 | [[1.472415e-11, 1.000000e+00, 1.492623e-09, 0.000000e+00], 61 | [2.915521e-03, 9.821462e-01, 8.833572e-03, 6.104673e-03]], 62 | dtype=float) 63 | 64 | assert output.size()[0] == 2 65 | assert output.size()[1] == 4 66 | np.testing.assert_array_almost_equal(actual, expected) 67 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_fm.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import FM 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def test_fm(): 8 | fm = FM() 9 | 10 | x = torch.as_tensor( 11 | [[[1.0, 1.0, 1.0], [1.0, 2.0, 3.0]], 12 | [[1.0, 1.0, 1.0], [4.0, 5.0, 6.0]]]) 13 | actual = fm(x) 14 | 15 | # 6.0 = 1 * 1 + 1 * 2 + 1 * 3 16 | # 15.0 = 1 * 4 + 1 * 5 + 1 * 6 17 | np.testing.assert_array_almost_equal( 18 | actual.numpy(), np.array([[6.0], [15.0]], dtype=float)) 19 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_interest.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import Interest 2 | 3 | 4 | import torch 5 | 6 | 7 | def test_gru_interest_evolution(): 8 | interests = Interest( 9 | input_size=3, 10 |
gru_type='GRU', 11 | gru_dropout=0, 12 | att_hidden_layers=[8], 13 | att_dropout=0, 14 | att_batchnorm=False, 15 | att_activation=None) 16 | 17 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 18 | 19 | keys = torch.tensor([ 20 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 21 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 22 | ], dtype=torch.float) 23 | 24 | keys_length = torch.tensor([3, 4]) 25 | 26 | output, _ = interests(query, keys, keys_length) 27 | 28 | assert output.size()[0] == 2 29 | assert output.size()[1] == 3 30 | 31 | 32 | def test_aigru_interest_evolution(): 33 | interests = Interest( 34 | input_size=3, 35 | gru_type='AIGRU', 36 | gru_dropout=0, 37 | att_hidden_layers=[8], 38 | att_dropout=0, 39 | att_batchnorm=False, 40 | att_activation=None) 41 | 42 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 43 | 44 | keys = torch.tensor([ 45 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 46 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 47 | ], dtype=torch.float) 48 | 49 | keys_length = torch.tensor([3, 4]) 50 | 51 | output, _ = interests(query, keys, keys_length) 52 | 53 | assert output.size()[0] == 2 54 | assert output.size()[1] == 3 55 | 56 | 57 | def test_agru_interest_evolution(): 58 | interests = Interest( 59 | input_size=3, 60 | gru_type='AGRU', 61 | gru_dropout=0, 62 | att_hidden_layers=[8], 63 | att_dropout=0, 64 | att_batchnorm=False, 65 | att_activation=None) 66 | 67 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 68 | 69 | keys = torch.tensor([ 70 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 71 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 72 | ], dtype=torch.float) 73 | 74 | keys_length = torch.tensor([3, 4]) 75 | 76 | output, _ = interests(query, keys, keys_length) 77 | 78 | assert output.size()[0] == 2 79 | assert output.size()[1] == 3 80 | 81 | 82 | def test_augru_interest_evolution(): 83 | interests = Interest( 84 | input_size=3, 85 | gru_type='AUGRU', 86 | gru_dropout=0, 87 | att_hidden_layers=[8], 88 | att_dropout=0, 89 | att_batchnorm=False, 90 | att_activation=None) 91 | 92 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 93 | 94 | keys = torch.tensor([ 95 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 96 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 97 | ], dtype=torch.float) 98 | 99 | keys_length = torch.tensor([3, 4]) 100 | 101 | output, _ = interests(query, keys, keys_length) 102 | 103 | assert output.size()[0] == 2 104 | assert output.size()[1] == 3 105 | 106 | 107 | def test_neg_sampling(): 108 | interests = Interest( 109 | input_size=3, 110 | gru_type='AUGRU', 111 | gru_dropout=0, 112 | att_hidden_layers=[8], 113 | att_dropout=0, 114 | att_batchnorm=False, 115 | att_activation=None, 116 | use_negsampling=True) 117 | 118 | query = torch.tensor( 119 | [[1, 1, 1], [0.1, 0.2, 0.3], [0.3, 0.4, 0.5]], dtype=torch.float) 120 | 121 | keys = torch.tensor([ 122 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 123 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]], 124 | [[0.1, 0.2, 0.3], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] 125 | ], dtype=torch.float) 126 | 127 | neg_keys = torch.tensor([ 128 | [[0.3, 0.2, 0.1], [3, 2, 1], [1, 0.2, 0.4], [0.0, 0.0, 0.0]], 129 | [[0.3, 0.2, 0.1], [3, 2, 1], [1, 0.2, 0.4], [0.5, 0.5, 0.5]], 130 | [[0.3, 0.2, 0.1], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] 131 
| ], dtype=torch.float) 132 | 133 | keys_length = torch.tensor([3, 4, 1]) 134 | 135 | output, _ = interests(query, keys, keys_length, neg_keys) 136 | 137 | assert output.size()[0] == 3 138 | assert output.size()[1] == 3 139 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_mlp.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import MLP 2 | 3 | 4 | def test_simple_creation(): 5 | mlp = MLP(input_size=10, hidden_layers=(16, 4), 6 | activation=None, dropout=0.0) 7 | 8 | assert len(mlp._sequential) == 4 9 | 10 | 11 | def test_creation_with_dropout(): 12 | mlp = MLP(input_size=10, hidden_layers=(16, 4), 13 | activation=None, dropout=0.1) 14 | 15 | assert len(mlp._sequential) == 6 16 | 17 | 18 | def test_creation_with_activation_and_dropout(): 19 | mlp = MLP(input_size=10, hidden_layers=(16, 4), 20 | activation='relu', dropout=0.1) 21 | 22 | assert len(mlp._sequential) == 8 23 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_pooling.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import MaxPooling, SumPooling 2 | 3 | import torch 4 | 5 | 6 | def test_max_pooling(): 7 | x = torch.tensor( 8 | [[[1, 2, 1, 1], 9 | [1, 1, 3, 1]], 10 | [[10, 1, 1, 1], 11 | [1, 1, 4, 1]], 12 | [[2, 8, 9, 0], 13 | [1, 1, 1, 1]]]) 14 | 15 | max_pooling = MaxPooling(dim=1) 16 | 17 | actual = max_pooling(x) 18 | 19 | assert actual.numpy().tolist() == [ 20 | [1, 2, 3, 1], [10, 1, 4, 1], [2, 8, 9, 1]] 21 | 22 | 23 | def test_sum_pooling(): 24 | x = torch.tensor( 25 | [[[1, 2, 1, 1], 26 | [1, 1, 3, 1]], 27 | [[10, 1, 1, 1], 28 | [1, 1, 4, 1]], 29 | [[2, 8, 9, 0], 30 | [1, 1, 1, 1]]]) 31 | 32 | sum_pooling = SumPooling(dim=1) 33 | 34 | actual = sum_pooling(x) 35 | 36 | assert actual.numpy().tolist() == [ 37 | [2, 3, 4, 2], [11, 2, 5, 2], [3, 9, 10, 1]] 38 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_rnn.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import ( 2 | AttentionGRUCell, AttentionUpdateGateGRUCell, DynamicGRU) 3 | 4 | import torch 5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 6 | 7 | 8 | def test_attention_gru_cell(): 9 | gru_cell = AttentionGRUCell(10, 20) 10 | input = torch.randn(6, 3, 10) 11 | hx = torch.randn(3, 20) 12 | att_scores = torch.tensor([ 13 | [0.1, 0.3, 0.6], 14 | [0.2, 0.2, 0.6], 15 | [0.1, 0.6, 0.3], 16 | [1.0, 0., 0.], 17 | [0.2, 0.3, 0.5], 18 | [0.1, 0.3, 0.6], 19 | ]) 20 | 21 | output = [] 22 | for i in range(6): 23 | hx = gru_cell(input[i], hx, att_scores[i]) 24 | output.append(hx) 25 | 26 | assert len(output) == 6 27 | 28 | 29 | def test_attention_update_gate_gru_cell(): 30 | gru_cell = AttentionUpdateGateGRUCell(10, 20) 31 | input = torch.randn(6, 3, 10) 32 | hx = torch.randn(3, 20) 33 | att_scores = torch.tensor([ 34 | [0.1, 0.3, 0.6], 35 | [0.2, 0.2, 0.6], 36 | [0.1, 0.6, 0.3], 37 | [1.0, 0., 0.], 38 | [0.2, 0.3, 0.5], 39 | [0.1, 0.3, 0.6], 40 | ]) 41 | 42 | output = [] 43 | for i in range(6): 44 | hx = gru_cell(input[i], hx, att_scores[i]) 45 | output.append(hx) 46 | 47 | assert len(output) == 6 48 | 49 | 50 | def test_dynamic_gru(): 51 | rnn = DynamicGRU(3, 5) 52 | 53 | keys = torch.tensor([ 54 | [[0.1, 0.2, 0.3], [1, 2, 3], 
[0.4, 0.2, 1], [0.0, 0.0, 0.0]], 55 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]], 56 | [[0.1, 0.2, 0.3], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], 57 | [[0.1, 0.2, 0.3], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], 58 | [[0.1, 0.2, 0.3], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]] 59 | ], dtype=torch.float) 60 | 61 | att_scores = torch.tensor([ 62 | [0.0330, 0.7252, 0.2459, 0.], 63 | [0.2952, 0.8721, 0.4468, 0.0904], 64 | [0.4598, 0., 0., 0.], 65 | [0.0286, 0., 0., 0.], 66 | [0.0561, 0., 0., 0.]]) 67 | 68 | lengths = torch.tensor([3, 4, 1, 1, 1]) 69 | 70 | packed_att_scores = pack_padded_sequence( 71 | att_scores, 72 | lengths, 73 | batch_first=True, enforce_sorted=False) 74 | 75 | packed_keys = pack_padded_sequence( 76 | keys, 77 | lengths, 78 | batch_first=True, enforce_sorted=False) 79 | 80 | actual, actual_lengths = pad_packed_sequence( 81 | rnn(packed_keys, packed_att_scores), batch_first=True) 82 | 83 | assert actual.size() == (5, 4, 5) 84 | assert actual_lengths.numpy().tolist() == [3, 4, 1, 1, 1] 85 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/pytorch/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_deepfm.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch import DeepFM 5 | 6 | from .utils import prepare_dataloader 7 | 8 | 9 | def test_normal(): 10 | number_features = [ 11 | Number('userAge', StandardScaler()), 12 | Number('rating', StandardScaler())] 13 | 14 | category_features = [ 15 | Category('userId', CategoryEncoder(min_cnt=1)), 16 | Category('movieId', CategoryEncoder(min_cnt=1)), 17 | Category('topGenre', CategoryEncoder(min_cnt=1))] 18 | 19 | sequence_features = [ 20 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 21 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 22 | Sequence('clickedMovieIds', 23 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 24 | Sequence('clickedMovieTopGenres', 25 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 26 | 27 | features = Features( 28 | number_features=number_features, 29 | category_features=category_features, 30 | sequence_features=sequence_features) 31 | 32 | dataloader, _ = prepare_dataloader(features) 33 | 34 | deep_fm = DeepFM( 35 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 36 | final_activation='sigmoid', dropout=0.3) 37 | 38 | deep_fm(next(iter(dataloader))) 39 | 40 | 41 | def test_without_number_feature(): 42 | number_features = [] 43 | 44 | category_features = [ 45 | Category('userId', CategoryEncoder(min_cnt=1)), 46 | Category('movieId', CategoryEncoder(min_cnt=1)), 47 | Category('topGenre', CategoryEncoder(min_cnt=1))] 48 | 49 | sequence_features = [ 50 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 51 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 52 | Sequence('clickedMovieIds', 53 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 54 | Sequence('clickedMovieTopGenres', 55 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 
56 | 57 | features = Features( 58 | number_features=number_features, 59 | category_features=category_features, 60 | sequence_features=sequence_features) 61 | 62 | dataloader, _ = prepare_dataloader(features) 63 | 64 | deep_fm = DeepFM( 65 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 66 | final_activation='sigmoid', dropout=0.3) 67 | 68 | deep_fm(next(iter(dataloader))) 69 | 70 | 71 | def test_without_category_feature(): 72 | number_features = [] 73 | 74 | category_features = [] 75 | 76 | sequence_features = [ 77 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 78 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 79 | Sequence('clickedMovieIds', 80 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 81 | Sequence('clickedMovieTopGenres', 82 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 83 | 84 | features = Features( 85 | number_features=number_features, 86 | category_features=category_features, 87 | sequence_features=sequence_features) 88 | 89 | dataloader, _ = prepare_dataloader(features) 90 | 91 | deep_fm = DeepFM( 92 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 93 | final_activation='sigmoid', dropout=0.3) 94 | 95 | deep_fm(next(iter(dataloader))) 96 | 97 | 98 | def test_only_with_number_features(): 99 | number_features = [ 100 | Number('userAge', StandardScaler()), 101 | Number('rating', StandardScaler())] 102 | 103 | category_features = [] 104 | 105 | sequence_features = [] 106 | 107 | features = Features( 108 | number_features=number_features, 109 | category_features=category_features, 110 | sequence_features=sequence_features) 111 | 112 | dataloader, _ = prepare_dataloader(features) 113 | 114 | deep_fm = DeepFM( 115 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 116 | final_activation='sigmoid', dropout=0.3) 117 | 118 | deep_fm(next(iter(dataloader))) 119 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_dien.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch import AttentionGroup, DIEN 5 | 6 | 7 | from .utils import prepare_dataloader 8 | 9 | 10 | def create_test_data(): 11 | number_features = [ 12 | Number('userAge', StandardScaler()), 13 | Number('rating', StandardScaler())] 14 | 15 | category_features = [ 16 | Category('userId', CategoryEncoder(min_cnt=1)), 17 | Category('movieId', CategoryEncoder(min_cnt=1)), 18 | Category('topGenre', CategoryEncoder(min_cnt=1))] 19 | 20 | sequence_features = [ 21 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 22 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 23 | Sequence('clickedMovieIds', 24 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 25 | Sequence('clickedMovieTopGenres', 26 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 27 | Sequence('noClickedMovieIds', 28 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 29 | Sequence('noClickedMovieTopGenres', 30 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 31 | 32 | attention_groups = [ 33 | AttentionGroup( 34 | name='group1', 35 | pairs=[{'ad': 'movieId', 36 | 'pos_hist': 'clickedMovieIds', 37 | 'neg_hist': 'noClickedMovieIds'}, 38 | {'ad': 'topGenre', 39 | 'pos_hist': 'clickedMovieTopGenres', 40 | 'neg_hist': 'noClickedMovieTopGenres'}], 41 | hidden_layers=[8, 
4])] 42 | 43 | features = Features( 44 | number_features=number_features, 45 | category_features=category_features, 46 | sequence_features=sequence_features) 47 | 48 | dataloader, _ = prepare_dataloader(features) 49 | 50 | return dataloader, features, attention_groups 51 | 52 | 53 | def test_gru_gru_att(): 54 | dataloader, features, attention_groups = create_test_data() 55 | 56 | attention_groups[0].gru_type = 'GRU' 57 | 58 | model = DIEN( 59 | features, attention_groups=attention_groups, 60 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 61 | final_activation='sigmoid', dropout=0.3) 62 | 63 | model(next(iter(dataloader))) 64 | 65 | 66 | def test_gru_att_gru(): 67 | dataloader, features, attention_groups = create_test_data() 68 | 69 | attention_groups[0].gru_type = 'AIGRU' 70 | 71 | model = DIEN( 72 | features, attention_groups=attention_groups, 73 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 74 | final_activation='sigmoid', dropout=0.3) 75 | 76 | model(next(iter(dataloader))) 77 | 78 | 79 | def test_gru_agru(): 80 | dataloader, features, attention_groups = create_test_data() 81 | 82 | attention_groups[0].gru_type = 'AGRU' 83 | 84 | model = DIEN( 85 | features, attention_groups=attention_groups, 86 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 87 | final_activation='sigmoid', dropout=0.3) 88 | 89 | model(next(iter(dataloader))) 90 | 91 | 92 | def test_gru_augru(): 93 | dataloader, features, attention_groups = create_test_data() 94 | 95 | attention_groups[0].gru_type = 'AUGRU' 96 | 97 | model = DIEN( 98 | features, attention_groups=attention_groups, 99 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 100 | final_activation='sigmoid', dropout=0.3) 101 | 102 | model(next(iter(dataloader))) 103 | 104 | 105 | def test_gru_augru_neg(): 106 | dataloader, features, attention_groups = create_test_data() 107 | 108 | attention_groups[0].gru_type = 'AUGRU' 109 | 110 | model = DIEN( 111 | features, attention_groups=attention_groups, 112 | use_negsampling=True, 113 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 114 | final_activation='sigmoid', dropout=0.3) 115 | 116 | model(next(iter(dataloader))) 117 | 118 | 119 | def create_test_data_with_sharing_emb(): 120 | number_features = [ 121 | Number('userAge', StandardScaler()), 122 | Number('rating', StandardScaler())] 123 | 124 | # provide word to index mapping 125 | movie_word2idx = { 126 | '__PAD__': 0, 127 | '4226': 1, 128 | '5971': 2, 129 | '6291': 3, 130 | '7153': 4, 131 | '30707': 5, 132 | '3242': 6, 133 | '42': 7, 134 | '32': 8, 135 | '34': 9, 136 | '233': 10, 137 | '291': 11, 138 | '324': 12, 139 | '325': 13, 140 | '3542': 14, 141 | '322': 15, 142 | '33': 16, 143 | '45': 17, 144 | '__UNKNOWN__': 18} 145 | 146 | movie_idx2word = { 147 | index: word for word, index in movie_word2idx.items()} 148 | 149 | category_features = [ 150 | Category('movieId', 151 | CategoryEncoder( 152 | word2idx=movie_word2idx, 153 | idx2word=movie_idx2word), 154 | embedding_name='movieId'), 155 | Category('topGenre', CategoryEncoder(min_cnt=1))] 156 | 157 | sequence_features = [ 158 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 159 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 160 | Sequence('clickedMovieIds', 161 | SequenceEncoder( 162 | sep='|', max_len=5, 163 | word2idx=movie_word2idx, idx2word=movie_idx2word), 164 | embedding_name='movieId'), 165 | Sequence('noClickedMovieIds', 166 | SequenceEncoder( 167 | sep='|', max_len=5, 168 | word2idx=movie_word2idx, idx2word=movie_idx2word), 169 | 
embedding_name='movieId')] 170 | 171 | attention_groups = [ 172 | AttentionGroup( 173 | name='group1', 174 | pairs=[{'ad': 'movieId', 175 | 'pos_hist': 'clickedMovieIds', 176 | 'neg_hist': 'noClickedMovieIds'}], 177 | hidden_layers=[8, 4])] 178 | 179 | features = Features( 180 | number_features=number_features, 181 | category_features=category_features, 182 | sequence_features=sequence_features) 183 | 184 | dataloader, _ = prepare_dataloader(features) 185 | 186 | return dataloader, features, attention_groups 187 | 188 | 189 | def test_gru_augru_neg_with_sharing_emb(): 190 | dataloader, features, attention_groups = ( 191 | create_test_data_with_sharing_emb()) 192 | 193 | attention_groups[0].gru_type = 'AUGRU' 194 | 195 | model = DIEN( 196 | features, attention_groups=attention_groups, 197 | use_negsampling=True, 198 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 199 | final_activation='sigmoid', dropout=0.3) 200 | 201 | model(next(iter(dataloader))) 202 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_din.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch import AttentionGroup, DIN 5 | 6 | 7 | from .utils import prepare_dataloader 8 | 9 | 10 | def test_normal(): 11 | number_features = [ 12 | Number('userAge', StandardScaler()), 13 | Number('rating', StandardScaler())] 14 | 15 | category_features = [ 16 | Category('userId', CategoryEncoder(min_cnt=1)), 17 | Category('movieId', CategoryEncoder(min_cnt=1)), 18 | Category('topGenre', CategoryEncoder(min_cnt=1))] 19 | 20 | sequence_features = [ 21 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 22 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 23 | Sequence('clickedMovieIds', 24 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 25 | Sequence('clickedMovieTopGenres', 26 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 27 | 28 | attention_groups = [ 29 | AttentionGroup( 30 | name='group1', 31 | pairs=[{'ad': 'movieId', 'pos_hist': 'clickedMovieIds'}, 32 | {'ad': 'topGenre', 'pos_hist': 'clickedMovieTopGenres'}], 33 | hidden_layers=[8, 4])] 34 | 35 | features = Features( 36 | number_features=number_features, 37 | category_features=category_features, 38 | sequence_features=sequence_features) 39 | 40 | dataloader, _ = prepare_dataloader(features) 41 | 42 | model = DIN( 43 | features, attention_groups=attention_groups, 44 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 45 | final_activation='sigmoid', dropout=0.3) 46 | 47 | model(next(iter(dataloader))) 48 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_dnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from prediction_flow.features import Number, Category, Sequence, Features 4 | from prediction_flow.transformers.column import ( 5 | StandardScaler, CategoryEncoder, SequenceEncoder) 6 | from prediction_flow.pytorch import DNN 7 | 8 | 9 | from .utils import prepare_dataloader, _SAMPLE_DF 10 | 11 | 12 | def test_normal(): 13 | number_features = [ 14 | Number('userAge', StandardScaler()), 15 | Number('rating', StandardScaler())] 16 | 17 | category_features = [ 18 | Category('userId', 
CategoryEncoder(min_cnt=1)), 19 | Category('movieId', CategoryEncoder(min_cnt=1)), 20 | Category('topGenre', CategoryEncoder(min_cnt=1))] 21 | 22 | sequence_features = [ 23 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 24 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 25 | Sequence('clickedMovieIds', 26 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 27 | Sequence('clickedMovieTopGenres', 28 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 29 | 30 | features = Features( 31 | number_features=number_features, 32 | category_features=category_features, 33 | sequence_features=sequence_features) 34 | 35 | dataloader, _ = prepare_dataloader(features) 36 | 37 | model = DNN( 38 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 39 | final_activation='sigmoid', dropout=0.3) 40 | 41 | model(next(iter(dataloader))) 42 | 43 | 44 | def test_without_number_feature(): 45 | number_features = [] 46 | 47 | category_features = [ 48 | Category('userId', CategoryEncoder(min_cnt=1)), 49 | Category('movieId', CategoryEncoder(min_cnt=1)), 50 | Category('topGenre', CategoryEncoder(min_cnt=1))] 51 | 52 | sequence_features = [ 53 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 54 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 55 | Sequence('clickedMovieIds', 56 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 57 | Sequence('clickedMovieTopGenres', 58 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 59 | 60 | features = Features( 61 | number_features=number_features, 62 | category_features=category_features, 63 | sequence_features=sequence_features) 64 | 65 | dataloader, _ = prepare_dataloader(features) 66 | 67 | model = DNN( 68 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 69 | final_activation='sigmoid', dropout=0.3) 70 | 71 | model(next(iter(dataloader))) 72 | 73 | 74 | def test_without_category_feature(): 75 | number_features = [] 76 | 77 | category_features = [] 78 | 79 | sequence_features = [ 80 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 81 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 82 | Sequence('clickedMovieIds', 83 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 84 | Sequence('clickedMovieTopGenres', 85 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 86 | 87 | features = Features( 88 | number_features=number_features, 89 | category_features=category_features, 90 | sequence_features=sequence_features) 91 | 92 | dataloader, _ = prepare_dataloader(features) 93 | 94 | model = DNN( 95 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 96 | final_activation='sigmoid', dropout=0.3) 97 | 98 | model(next(iter(dataloader))) 99 | 100 | 101 | def test_only_with_number_features(): 102 | number_features = [ 103 | Number('userAge', StandardScaler()), 104 | Number('rating', StandardScaler())] 105 | 106 | category_features = [] 107 | 108 | sequence_features = [] 109 | 110 | features = Features( 111 | number_features=number_features, 112 | category_features=category_features, 113 | sequence_features=sequence_features) 114 | 115 | dataloader, _ = prepare_dataloader(features) 116 | 117 | model = DNN( 118 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 119 | final_activation='sigmoid', dropout=0.3) 120 | 121 | model(next(iter(dataloader))) 122 | 123 | 124 | def test_shared_embedding(): 125 | number_features = [] 126 | 127 | movie_enc = SequenceEncoder(sep='|', min_cnt=1, max_len=5) 128 | genre_enc = SequenceEncoder(sep='|', min_cnt=1, max_len=5) 129 | 130 | 
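# Fit the shared encoders on the union of the ad-side column and the
# clicked-history column so that both features share one vocabulary
# and, through embedding_name, one embedding table.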
movie_enc.fit( 131 | np.concatenate((_SAMPLE_DF.clickedMovieIds.values, 132 | _SAMPLE_DF.movieId.values), axis=None)) 133 | 134 | genre_enc.fit( 135 | np.concatenate((_SAMPLE_DF.clickedMovieTopGenres.values, 136 | _SAMPLE_DF.topGenre.values), axis=None)) 137 | 138 | category_features = [ 139 | Category('userId', CategoryEncoder(min_cnt=1)), 140 | Category('movieId', 141 | CategoryEncoder( 142 | min_cnt=1, 143 | word2idx=movie_enc.word2idx, 144 | idx2word=movie_enc.idx2word), 145 | embedding_name='movieId'), 146 | Category('topGenre', 147 | CategoryEncoder( 148 | min_cnt=1, 149 | word2idx=genre_enc.word2idx, 150 | idx2word=genre_enc.idx2word), 151 | embedding_name='topGenre', embedding_size=8)] 152 | 153 | sequence_features = [ 154 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 155 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 156 | Sequence('clickedMovieIds', 157 | SequenceEncoder( 158 | sep='|', 159 | min_cnt=1, 160 | max_len=5, 161 | word2idx=movie_enc.word2idx, 162 | idx2word=movie_enc.idx2word), 163 | embedding_name='movieId'), 164 | Sequence('clickedMovieTopGenres', 165 | SequenceEncoder( 166 | sep='|', 167 | min_cnt=1, 168 | max_len=5, 169 | word2idx=genre_enc.word2idx, 170 | idx2word=genre_enc.idx2word), 171 | embedding_name='topGenre', embedding_size=8)] 172 | 173 | features = Features( 174 | number_features=number_features, 175 | category_features=category_features, 176 | sequence_features=sequence_features) 177 | 178 | dataloader, _ = prepare_dataloader(features) 179 | 180 | model = DNN( 181 | features, num_classes=2, embedding_size=16, hidden_layers=(8, 4), 182 | final_activation='sigmoid', dropout=0.3) 183 | 184 | model(next(iter(dataloader))) 185 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_wide_deep.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch import WideDeep 5 | 6 | 7 | from .utils import prepare_dataloader 8 | 9 | 10 | def test_normal(): 11 | number_features = [ 12 | Number('userAge', StandardScaler()), 13 | Number('rating', StandardScaler())] 14 | 15 | category_features = [ 16 | Category('userId', CategoryEncoder(min_cnt=1)), 17 | Category('movieId', CategoryEncoder(min_cnt=1)), 18 | Category('topGenre', CategoryEncoder(min_cnt=1))] 19 | 20 | sequence_features = [ 21 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 22 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 23 | Sequence('clickedMovieIds', 24 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 25 | Sequence('clickedMovieTopGenres', 26 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 27 | 28 | features = Features( 29 | number_features=number_features, 30 | category_features=category_features, 31 | sequence_features=sequence_features) 32 | 33 | wide_features = ['rating', 'title', 'genres'] 34 | deep_features = ['userAge', 'rating', 'userId', 'movieId', 'topGenre', 35 | 'clickedMovieIds', 'clickedMovieTopGenres'] 36 | cross_features = [('movieId', 'clickedMovieIds'), 37 | ('topGenre', 'clickedMovieTopGenres')] 38 | 39 | dataloader, _ = prepare_dataloader(features) 40 | 41 | model = WideDeep( 42 | features, wide_features, deep_features, cross_features, 43 | num_classes=2, embedding_size=4, hidden_layers=(8, 4), 44 | final_activation='sigmoid', 
dropout=0.3) 45 | 46 | model(next(iter(dataloader))) 47 | 48 | 49 | def test_without_number_feature(): 50 | number_features = [] 51 | 52 | category_features = [ 53 | Category('userId', CategoryEncoder(min_cnt=1)), 54 | Category('movieId', CategoryEncoder(min_cnt=1)), 55 | Category('topGenre', CategoryEncoder(min_cnt=1))] 56 | 57 | sequence_features = [ 58 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 59 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 60 | Sequence('clickedMovieIds', 61 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 62 | Sequence('clickedMovieTopGenres', 63 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 64 | 65 | features = Features( 66 | number_features=number_features, 67 | category_features=category_features, 68 | sequence_features=sequence_features) 69 | 70 | wide_features = ['title', 'genres'] 71 | deep_features = ['userId', 'movieId', 'topGenre', 72 | 'clickedMovieIds', 'clickedMovieTopGenres'] 73 | cross_features = [('movieId', 'clickedMovieIds'), 74 | ('topGenre', 'clickedMovieTopGenres')] 75 | 76 | dataloader, _ = prepare_dataloader(features) 77 | 78 | model = WideDeep( 79 | features, wide_features, deep_features, cross_features, 80 | num_classes=2, embedding_size=4, hidden_layers=(8, 4), 81 | final_activation='sigmoid', dropout=0.3) 82 | 83 | model(next(iter(dataloader))) 84 | 85 | 86 | def test_without_category_feature(): 87 | number_features = [] 88 | 89 | category_features = [] 90 | 91 | sequence_features = [ 92 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 93 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 94 | Sequence('clickedMovieIds', 95 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 96 | Sequence('clickedMovieTopGenres', 97 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 98 | 99 | features = Features( 100 | number_features=number_features, 101 | category_features=category_features, 102 | sequence_features=sequence_features) 103 | 104 | wide_features = ['title', 'genres'] 105 | deep_features = ['clickedMovieIds', 'clickedMovieTopGenres'] 106 | 107 | dataloader, _ = prepare_dataloader(features) 108 | 109 | model = WideDeep( 110 | features, wide_features, deep_features, [], 111 | num_classes=2, embedding_size=4, hidden_layers=(8, 4), 112 | final_activation='sigmoid', dropout=0.3) 113 | 114 | model(next(iter(dataloader))) 115 | 116 | 117 | def test_only_with_number_features(): 118 | number_features = [ 119 | Number('userAge', StandardScaler()), 120 | Number('rating', StandardScaler())] 121 | 122 | category_features = [] 123 | 124 | sequence_features = [] 125 | 126 | features = Features( 127 | number_features=number_features, 128 | category_features=category_features, 129 | sequence_features=sequence_features) 130 | 131 | wide_features = ['rating', 'userAge'] 132 | 133 | dataloader, _ = prepare_dataloader(features) 134 | 135 | model = WideDeep( 136 | features, wide_features, [], [], 137 | num_classes=2, embedding_size=4, hidden_layers=(8, 4), 138 | final_activation='sigmoid', dropout=0.3) 139 | 140 | model(next(iter(dataloader))) 141 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/utils.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.data import Dataset 2 | 3 | 4 | import pandas as pd 5 | import torch.utils.data as data 6 | 7 | 8 | _SAMPLE_DF = pd.DataFrame({ 9 | 'userId': [11, 11, 11, 11, 11], 10 | 'userAge': [23, 21, 19, 17, 41], 11 | 'movieId': ['4226', '5971', 
'6291', '7153', '30707'], 12 | 'rating': [3.0, 2.0, 4.0, 4.6, 5.0], 13 | 'timestamp': [1294796159, 1294796201, 1294796113, 1294796132, 1294796176], 14 | 'title': ['Memento (2000)', 15 | 'My Neighbor Totoro (Tonari no Totoro) (1988)', 16 | 'Lilya 4-Ever (Lilja 4-ever) (2002)', 17 | 'Lord of the Rings: The Return of the King, The (2003)', 18 | 'Million Dollar Baby (2004)'], 19 | 'genres': [ 20 | 'Mystery|Thriller', 21 | 'Animation|Children|Drama|Fantasy', 22 | 'Crime|Drama', 23 | 'Action|Adventure|Drama|Fantasy', 24 | 'Drama'], 25 | 'topGenre': [ 26 | 'Mystery', 27 | 'Animation', 28 | 'Crime', 29 | 'Action', 30 | 'Drama'], 31 | 'clickedMovieIds': [ 32 | '5971|6291', 33 | '3242|42', 34 | '32|43542|3222|3', 35 | '', 36 | '34|23'], 37 | 'clickedMovieTopGenres': [ 38 | 'Animation|Mystery', 39 | 'Drama', 40 | 'Drama|Drama|Drama|Drama', 41 | '', 42 | 'Mystery|Crime'], 43 | 'noClickedMovieIds': [ 44 | '233|291', 45 | '324|421', 46 | '325|3542|322|33', 47 | '', 48 | '45|48'], 49 | 'noClickedMovieTopGenres': [ 50 | 'Drama|Crime', 51 | 'Animation|Mystery', 52 | 'Mystery|Animation|Crime|Drama', 53 | '', 54 | 'Crime|Mystery'], 55 | 'label': [1, 0, 0, 1, 0]}) 56 | 57 | 58 | def prepare_dataloader(features): 59 | features.fit(_SAMPLE_DF) 60 | 61 | X_map = features.transform(_SAMPLE_DF) 62 | 63 | dataset = Dataset(features, X_map, _SAMPLE_DF.label.values) 64 | 65 | dataloader = data.DataLoader( 66 | dataset, batch_size=_SAMPLE_DF.shape[0], shuffle=False) 67 | 68 | return dataloader, _SAMPLE_DF 69 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.init as init 3 | 4 | 5 | def init_weights(model): 6 | if isinstance(model, nn.Linear): 7 | if model.weight is not None: 8 | init.kaiming_uniform_(model.weight.data) 9 | if model.bias is not None: 10 | init.normal_(model.bias.data) 11 | elif isinstance(model, nn.BatchNorm1d): 12 | if model.weight is not None: 13 | init.normal_(model.weight.data, mean=1, std=0.02) 14 | if model.bias is not None: 15 | init.constant_(model.bias.data, 0) 16 | elif isinstance(model, nn.BatchNorm2d): 17 | if model.weight is not None: 18 | init.normal_(model.weight.data, mean=1, std=0.02) 19 | if model.bias is not None: 20 | init.constant_(model.bias.data, 0) 21 | elif isinstance(model, nn.BatchNorm3d): 22 | if model.weight is not None: 23 | init.normal_(model.weight.data, mean=1, std=0.02) 24 | if model.bias is not None: 25 | init.constant_(model.bias.data, 0) 26 | else: 27 | pass 28 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/wide_deep.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wide&Deep Model. 3 | """ 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .base import EmbeddingMixin 10 | from .nn import MLP, SumPooling 11 | from .utils import init_weights 12 | 13 | 14 | class WideDeep(nn.Module, EmbeddingMixin): 15 | """Wide&Deep Model. 16 | 17 | Parameters 18 | ---------- 19 | features : Features 20 | 21 | wide_features : list of str 22 | Feature names for wide part. 23 | 24 | deep_features : list of str 25 | Feature names for deep part. 26 | 27 | cross_features: list of tuple 28 | Cross sparse feature names for wide part. 29 | 30 | num_classes : int 31 | Number of classes. 32 | 33 | embedding_size : int 34 | Size of embedding. 
35 | 36 | hidden_layers : list 37 | Size of hidden layers. 38 | Example: [96, 32] 39 | 40 | activation : str 41 | Activation function. 42 | Example: relu 43 | 44 | final_activation : str 45 | Activation function of output. 46 | 47 | dropout : float 48 | Dropout rate. 49 | """ 50 | def __init__(self, features, wide_features, deep_features, cross_features, 51 | num_classes, embedding_size, hidden_layers, 52 | activation='relu', final_activation=None, dropout=0.0): 53 | super(WideDeep, self).__init__() 54 | self.features = features 55 | self.wide_features = wide_features 56 | self.deep_features = deep_features 57 | self.cross_features = cross_features 58 | self.num_classes = num_classes 59 | self.final_activation = final_activation 60 | 61 | self.embeddings, self.embedding_sizes = self.build_embeddings( 62 | embedding_size) 63 | 64 | self._sequence_poolings = OrderedDict() 65 | 66 | wide_input_size = 0 67 | deep_input_size = 0 68 | 69 | for feature in self.features.number_features: 70 | if feature.name in self.wide_features: 71 | wide_input_size += 1 72 | if feature.name in self.deep_features: 73 | deep_input_size += 1 74 | 75 | for feature in self.features.category_features: 76 | if feature.name in self.wide_features: 77 | wide_input_size += self.embedding_sizes[feature.name] 78 | if feature.name in self.deep_features: 79 | deep_input_size += self.embedding_sizes[feature.name] 80 | 81 | for feature in self.features.sequence_features: 82 | self._sequence_poolings[feature.name] = SumPooling(1) 83 | self.add_module( 84 | f"pooling:{feature.name}", 85 | self._sequence_poolings[feature.name]) 86 | if feature.name in self.wide_features: 87 | wide_input_size += self.embedding_sizes[feature.name] 88 | if feature.name in self.deep_features: 89 | deep_input_size += self.embedding_sizes[feature.name] 90 | 91 | # plus cross embedding size 92 | wide_input_size += len(self.cross_features) * embedding_size 93 | 94 | final_layer_input_size = wide_input_size 95 | 96 | if deep_input_size: 97 | self.mlp = MLP( 98 | deep_input_size, 99 | hidden_layers, 100 | dropout=dropout, batchnorm=True, activation=activation) 101 | final_layer_input_size += hidden_layers[-1] 102 | 103 | output_size = self.num_classes 104 | 105 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 106 | output_size -= 1 107 | 108 | self.final_layer = nn.Linear(final_layer_input_size, output_size) 109 | 110 | self.apply(init_weights) 111 | 112 | def forward(self, x): 113 | wide_inputs = list() 114 | deep_inputs = list() 115 | cross_inputs = list() 116 | 117 | for feature in self.features.number_features: 118 | if feature.name in self.wide_features: 119 | wide_inputs.append(x[feature.name].view(-1, 1)) 120 | if feature.name in self.deep_features: 121 | deep_inputs.append(x[feature.name].view(-1, 1)) 122 | 123 | for feature in self.features.category_features: 124 | if feature.name in self.wide_features: 125 | wide_inputs.append( 126 | self.embeddings[feature.name](x[feature.name])) 127 | if feature.name in self.deep_features: 128 | deep_inputs.append( 129 | self.embeddings[feature.name](x[feature.name])) 130 | 131 | for feature in self.features.sequence_features: 132 | if feature.name in self.wide_features: 133 | wide_inputs.append( 134 | self._sequence_poolings[feature.name]( 135 | self.embeddings[feature.name]( 136 | x[feature.name]))) 137 | if feature.name in self.deep_features: 138 | deep_inputs.append( 139 | self._sequence_poolings[feature.name]( 140 | self.embeddings[feature.name]( 141 | x[feature.name]))) 142 | 143 | # 
prepare cross features 144 | for x_f, y_f in self.cross_features: 145 | if x_f in self._sequence_poolings: 146 | x_emb = self._sequence_poolings[x_f]( 147 | self.embeddings[x_f](x[x_f])) 148 | else: 149 | x_emb = self.embeddings[x_f](x[x_f]) 150 | 151 | if y_f in self._sequence_poolings: 152 | y_emb = self._sequence_poolings[y_f]( 153 | self.embeddings[y_f](x[y_f])) 154 | else: 155 | y_emb = self.embeddings[y_f](x[y_f]) 156 | cross_inputs.append(x_emb * y_emb) 157 | 158 | final_layer_inputs = list() 159 | if wide_inputs: 160 | final_layer_inputs.append(torch.cat(wide_inputs, dim=1)) 161 | 162 | if cross_inputs: 163 | final_layer_inputs.append(torch.cat(cross_inputs, dim=1)) 164 | 165 | if deep_inputs: 166 | final_layer_inputs.append(self.mlp(torch.cat(deep_inputs, dim=1))) 167 | 168 | output = self.final_layer(torch.cat(final_layer_inputs, dim=1)) 169 | 170 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 171 | output = torch.sigmoid(output) 172 | elif self.num_classes > 1 and self.final_activation == 'softmax': 173 | output = torch.softmax(output, dim=1) 174 | elif self.final_activation: 175 | raise NotImplementedError( 176 | f"pair (final_activation: {self.final_activation}, " 177 | f"num_classes: {self.num_classes}) is not implemented") 178 | 179 | return output 180 | -------------------------------------------------------------------------------- /prediction_flow/transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/transformers/__init__.py -------------------------------------------------------------------------------- /prediction_flow/transformers/column/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Column 2 | from .log_transformer import LogTransformer 3 | from .standard_scaler import StandardScaler 4 | from .category_encoder import CategoryEncoder 5 | from .sequence_encoder import SequenceEncoder 6 | from .column_flow import ColumnFlow 7 | 8 | 9 | __all__ = [ 10 | 'StandardScaler', 11 | 'LogTransformer', 12 | 'CategoryEncoder', 13 | 'SequenceEncoder', 14 | 'ColumnFlow', 15 | 'Column' 16 | ] 17 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for all column-oriented transformer classes 3 | with fit/transform functions. 4 | """ 5 | 6 | # Authors: Hongwei Zhang 7 | # License: MIT 8 | 9 | 10 | from abc import ABC, abstractmethod 11 | from enum import Enum 12 | 13 | 14 | class Column(ABC): 15 | """Base class for all column-oriented transformer classes 16 | with fit/transform functions. 17 | """ 18 | 19 | @abstractmethod 20 | def fit(self, x, y=None): 21 | """Fit this transformer. 22 | 23 | Parameters 24 | ---------- 25 | x : array-like 26 | One column of training data. 27 | y : array-like, default=None 28 | Training targets. 29 | """ 30 | 31 | raise NotImplementedError 32 | 33 | @abstractmethod 34 | def transform(self, x): 35 | """Transform x by this fitted transformer. 36 | 37 | Parameters 38 | ---------- 39 | x : array-like 40 | Column data to be transformed.
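
        Returns
        -------
        transformed_x : array-like
            Transformed data.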
41 | """ 42 | 43 | raise NotImplementedError 44 | 45 | 46 | class ColumnType(Enum): 47 | NUMBER = 1 48 | CATEGORY = 2 49 | SEQUENCE = 3 50 | 51 | 52 | class NumberColumn(Column): 53 | """Base class for all column-orientation number type transformer classes 54 | with fit/transform functions. 55 | """ 56 | column_type = ColumnType.NUMBER 57 | 58 | 59 | class CategoryColumn(Column): 60 | """Base class for all column-orientation category type transformer classes 61 | with fit/transform functions. 62 | """ 63 | column_type = ColumnType.CATEGORY 64 | 65 | @abstractmethod 66 | def dimension(self): 67 | """Number of unique terms. 68 | """ 69 | raise NotImplementedError 70 | 71 | 72 | class SequenceColumn(Column): 73 | """Base class for all column-orientation sequence type transformer classes 74 | with fit/transform functions. 75 | """ 76 | column_type = ColumnType.SEQUENCE 77 | 78 | @abstractmethod 79 | def dimension(self): 80 | """Number of unique terms. 81 | """ 82 | raise NotImplementedError 83 | 84 | @abstractmethod 85 | def max_length(self): 86 | """Maximum length of one sequence. 87 | """ 88 | raise NotImplementedError 89 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/category_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | CatagoryEncoder to convert term to number. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import numpy as np 10 | from collections import Counter 11 | 12 | from .base import CategoryColumn 13 | 14 | 15 | class CategoryEncoder(CategoryColumn): 16 | """Encoder for category type feature. 17 | 18 | Parameters 19 | ---------- 20 | min_cnt : int, default=5 21 | Minimum count of term. 22 | 23 | word2idx : dict 24 | Mappings from term to index. 25 | 26 | idx2word : dict 27 | Mappings from index to term. 28 | 29 | Attributes 30 | ---------- 31 | min_cnt : int, default=5 32 | Minimum count of term. 33 | 34 | word2idx : dict 35 | Mappings from term to index. 36 | 37 | idx2word : dict 38 | Mappings from index to term. 39 | """ 40 | def __init__(self, min_cnt=5, word2idx=None, idx2word=None): 41 | self.min_cnt = min_cnt 42 | 43 | self.word2idx = word2idx if word2idx else dict() 44 | self.idx2word = idx2word if idx2word else dict() 45 | 46 | def fit(self, x, y=None): 47 | """Fit this transformer. 48 | 49 | Parameters 50 | ---------- 51 | x : array-like 52 | One column of training data. 53 | y : array-like, default=None, ignored 54 | Training targets. 55 | 56 | Returns 57 | ------- 58 | self : CategoryEncoder 59 | This CategoryEncoder. 60 | """ 61 | if not self.word2idx: 62 | counter = Counter(np.asarray(x).ravel()) 63 | 64 | selected_terms = sorted( 65 | list(filter(lambda x: counter[x] >= self.min_cnt, counter))) 66 | 67 | self.word2idx = dict( 68 | zip(selected_terms, range(1, len(selected_terms) + 1))) 69 | self.word2idx['__PAD__'] = 0 70 | if '__UNKNOWN__' not in self.word2idx: 71 | self.word2idx['__UNKNOWN__'] = len(self.word2idx) 72 | 73 | if not self.idx2word: 74 | self.idx2word = { 75 | index: word for word, index in self.word2idx.items()} 76 | 77 | return self 78 | 79 | def transform(self, x): 80 | """Transform x by this fitted transformer. 81 | 82 | Parameters 83 | ---------- 84 | x : array-like 85 | Column data to be transformed. 86 | 87 | Returns 88 | ------- 89 | transformed_x : array-like 90 | Transformed data. 
91 | """ 92 | transformed_x = list() 93 | for term in np.asarray(x).ravel(): 94 | try: 95 | transformed_x.append(self.word2idx[term]) 96 | except KeyError: 97 | transformed_x.append(self.word2idx['__UNKNOWN__']) 98 | 99 | return np.asarray(transformed_x, dtype=np.int64) 100 | 101 | def dimension(self): 102 | return len(self.word2idx) 103 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/column_flow.py: -------------------------------------------------------------------------------- 1 | """ColumnFlow contaions a chain of column-orientation 2 | transformers (implementing fit/transform). 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import numpy as np 10 | 11 | 12 | class ColumnFlow(object): 13 | """ColumnFlow contaions a chain of column-orientation 14 | transformers (implementing fit/transform). 15 | 16 | Parameters 17 | ---------- 18 | transformers : list 19 | List of column transformers (implementing fit/transform) that are 20 | chained, in the order in which they are chained. 21 | 22 | verbose : boolean, optional 23 | If True, the log while fitting each transformer will be printed. 24 | 25 | Attributes 26 | ---------- 27 | transformers : list 28 | List of column transformers (implementing fit/transform) that are 29 | chained, in the order in which they are chained. 30 | 31 | verbose : boolean, optional 32 | If True, the log while fitting each transformer will be printed. 33 | """ 34 | 35 | def __init__(self, transformers, verbose=False): 36 | ColumnFlow.__check_transformers(transformers) 37 | self.transformers = transformers 38 | self.verbose = verbose 39 | 40 | @staticmethod 41 | def __check_transformers(transformers): 42 | if not isinstance(transformers, list): 43 | raise TypeError( 44 | "transformers must be list type, not {type(transformers)}") 45 | 46 | types = [ 47 | transformer.column_type for transformer in transformers] 48 | 49 | if len(set(types)) != 1: 50 | raise ValueError("transformers must be the same type, not {types}") 51 | 52 | def fit(self, x, y=None): 53 | """Fit all transformers one after the other. 54 | 55 | Parameters 56 | ---------- 57 | x : array-like 58 | One column of training data. 59 | y : array-like, default=None 60 | Training targets. 61 | 62 | Returns 63 | ------- 64 | self : ColumnFlow 65 | This flow. 66 | """ 67 | transformed_x = np.asarray(x).ravel() 68 | for transformer in self.transformers: 69 | transformer.fit(transformed_x, y) 70 | transformed_x = transformer.transform(transformed_x) 71 | 72 | return self 73 | 74 | def transform(self, x): 75 | """Transform x by all fitted transformers. 76 | 77 | Parameters 78 | ---------- 79 | x : array-like 80 | Column data to be transformed. 81 | 82 | Returns 83 | ------- 84 | transformed_x : array-like 85 | Transformed data. 86 | """ 87 | transformed_x = np.asarray(x).ravel() 88 | for transformer in self.transformers: 89 | transformed_x = transformer.transform(transformed_x) 90 | 91 | return transformed_x 92 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/log_transformer.py: -------------------------------------------------------------------------------- 1 | """ 2 | LogTransformer to convert number feature. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import numpy as np 10 | 11 | from .base import NumberColumn 12 | 13 | 14 | class LogTransformer(NumberColumn): 15 | """LogTransformer to convert number feature. 
16 | """ 17 | def fit(self, x, y=None): 18 | """Fit this transformer. 19 | 20 | Parameters 21 | ---------- 22 | x : array-like 23 | One column of training data. 24 | y : array-like, default=None, ignored 25 | Training targets. 26 | 27 | Returns 28 | ------- 29 | self : LogTransformer 30 | This LogTransformer. 31 | """ 32 | return self 33 | 34 | def transform(self, x): 35 | """ log(1 + x) when x > 0 else x 36 | 37 | Parameters 38 | ---------- 39 | x : array-like 40 | Column data to be transformed. 41 | 42 | Returns 43 | ---------- 44 | res: array-like 45 | """ 46 | res = x.copy().astype(np.float).ravel() 47 | mask = x > 0.0 48 | res[mask] = np.log(1 + x[mask]) 49 | return res 50 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/sequence_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | SequenceEncoder to convert sequence terms to sequence number. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from collections import Counter 10 | import numpy as np 11 | 12 | from .base import SequenceColumn 13 | 14 | 15 | class SequenceEncoder(SequenceColumn): 16 | """Encoder for sequence type feature. Convert terms to numbers. 17 | First index is 1. 18 | 19 | Parameters 20 | ---------- 21 | sep : str, default=' ' 22 | Separator of input sequence. 23 | 24 | min_cnt : int, default=5 25 | Minimum count of term. 26 | 27 | max_len: int, default=None 28 | Maximum length of sequence. If none is given, 29 | the maximum length of training sequence will be used. 30 | 31 | word2idx : dict 32 | Mappings from term to index. 33 | 34 | idx2word : dict 35 | Mappings from index to term. 36 | 37 | Attributes 38 | ---------- 39 | sep : str, default=' ' 40 | Separator of input sequence. 41 | 42 | min_cnt : int, default=5 43 | Minimum count of term. 44 | 45 | max_len: int, default=None 46 | Maximum length of sequence. If none is given, 47 | the maximum length of training sequence will be used. 48 | 49 | word2idx : dict 50 | Mappings from term to index. 51 | 52 | idx2word : dict 53 | Mappings from index to term. 54 | """ 55 | def __init__(self, sep=' ', min_cnt=5, max_len=None, 56 | word2idx=None, idx2word=None): 57 | self.sep = sep 58 | self.min_cnt = min_cnt 59 | self.max_len = max_len 60 | 61 | self.word2idx = word2idx if word2idx else dict() 62 | self.idx2word = idx2word if idx2word else dict() 63 | 64 | def fit(self, x, y=None): 65 | """Fit this transformer. 66 | 67 | Parameters 68 | ---------- 69 | x : array-like 70 | One column of training data. 71 | y : array-like, default=None 72 | Training targets. 73 | 74 | Returns 75 | ------- 76 | self : SequenceEncoder 77 | This SequenceEncoder. 
78 | """ 79 | 80 | if not self.word2idx: 81 | counter = Counter() 82 | 83 | max_len = 0 84 | for sequence in np.array(x).ravel(): 85 | words = sequence.split(self.sep) 86 | counter.update(words) 87 | max_len = max(max_len, len(words)) 88 | 89 | if self.max_len is None: 90 | self.max_len = max_len 91 | 92 | # drop rare words 93 | words = sorted( 94 | list(filter(lambda x: counter[x] >= self.min_cnt, counter))) 95 | 96 | self.word2idx = dict(zip(words, range(1, len(words) + 1))) 97 | self.word2idx['__PAD__'] = 0 98 | if '__UNKNOWN__' not in self.word2idx: 99 | self.word2idx['__UNKNOWN__'] = len(self.word2idx) 100 | 101 | if not self.idx2word: 102 | self.idx2word = { 103 | index: word for word, index in self.word2idx.items()} 104 | 105 | if not self.max_len: 106 | max_len = 0 107 | for sequence in np.array(x).ravel(): 108 | words = sequence.split(self.sep) 109 | max_len = max(max_len, len(words)) 110 | self.max_len = max_len 111 | 112 | return self 113 | 114 | def transform(self, x): 115 | """Transform x by this fitted transformer. 116 | 117 | Parameters 118 | ---------- 119 | x : array-like 120 | Column data to be transformed. 121 | 122 | Returns 123 | ------- 124 | transformed_x : array-like 125 | Transformed data. 126 | """ 127 | transformed_x = list() 128 | 129 | for sequence in np.asarray(x).ravel(): 130 | words = list() 131 | for word in sequence.split(self.sep): 132 | try: 133 | words.append(self.word2idx[word]) 134 | except KeyError: 135 | words.append(self.word2idx['__UNKNOWN__']) 136 | 137 | transformed_x.append( 138 | np.asarray(words[0:self.max_len], dtype=np.int64)) 139 | 140 | return np.asarray(transformed_x, dtype=np.object) 141 | 142 | def dimension(self): 143 | """Number of unique terms. 144 | """ 145 | return len(self.word2idx) 146 | 147 | def max_length(self): 148 | """Maximum length of one sequence. 149 | """ 150 | return self.max_len 151 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/standard_scaler.py: -------------------------------------------------------------------------------- 1 | """ 2 | StandardScaler to convert term to number. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import numpy as np 10 | import sklearn.preprocessing as sk 11 | 12 | from .base import NumberColumn 13 | 14 | 15 | class StandardScaler(NumberColumn): 16 | """Normalize number feature. 17 | """ 18 | def __init__(self): 19 | self.__scaler = sk.StandardScaler() 20 | 21 | def fit(self, x, y=None): 22 | """Fit this transformer. 23 | 24 | Parameters 25 | ---------- 26 | x : array-like 27 | One column of training data. 28 | y : array-like, default=None, ignored 29 | Training targets. 30 | 31 | Returns 32 | ------- 33 | self : StandardScaler 34 | This StandardScaler. 35 | """ 36 | self.__scaler.fit(np.asarray(x, dtype=np.float).reshape(-1, 1)) 37 | return self 38 | 39 | def transform(self, x): 40 | """Transform x by this fitted transformer. 41 | 42 | Parameters 43 | ---------- 44 | x : array-like 45 | Column data to be transformed. 46 | 47 | Returns 48 | ------- 49 | transformed_x : array-like 50 | Transformed data. 
51 | """ 52 | transformed_x = self.__scaler.transform( 53 | np.asarray(x, dtype=np.float32).reshape(-1, 1)) 54 | 55 | return transformed_x.ravel() 56 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/transformers/column/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/transformers/column/tests/test_category_encoder.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.transformers.column import CategoryEncoder 2 | 3 | 4 | def test_str_inputs(): 5 | category_encoder = CategoryEncoder(min_cnt=1) 6 | 7 | input_terms = ['this', 'is', 'a', 'simple', 'test'] 8 | 9 | category_encoder.fit(input_terms) 10 | 11 | transformed = category_encoder.transform(input_terms) 12 | 13 | assert set(transformed) == {1, 2, 3, 4, 5} 14 | assert category_encoder.dimension() == 7 15 | 16 | 17 | def test_int_inputs(): 18 | category_encoder = CategoryEncoder(min_cnt=1) 19 | 20 | input_terms = [345, 3434, 23, 88, 4] 21 | 22 | category_encoder.fit(input_terms) 23 | 24 | transformed = category_encoder.transform(input_terms) 25 | 26 | assert set(transformed) == {1, 2, 3, 4, 5} 27 | assert category_encoder.dimension() == 7 28 | 29 | 30 | def test_unseen_inputs(): 31 | category_encoder = CategoryEncoder(min_cnt=1) 32 | 33 | input_terms = [345, 3434, 23, 88, 4] 34 | 35 | category_encoder.fit(input_terms) 36 | 37 | transformed = category_encoder.transform([345, 5343]) 38 | 39 | assert set(transformed) == {4, 6} 40 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/tests/test_column_flow.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from prediction_flow.transformers.column import ( 4 | LogTransformer, CategoryEncoder, ColumnFlow) 5 | 6 | 7 | def test_wrong_type_transformers(): 8 | with pytest.raises(TypeError): 9 | ColumnFlow({CategoryEncoder()}) 10 | 11 | 12 | def test_multi_type_transformers(): 13 | with pytest.raises(ValueError): 14 | ColumnFlow([LogTransformer(), CategoryEncoder()]) 15 | 16 | 17 | def test_transformers(): 18 | column_flow = ColumnFlow([CategoryEncoder(min_cnt=1)]) 19 | 20 | input_terms = ['this', 'is', 'a', 'simple', 'test'] 21 | 22 | column_flow.fit(input_terms) 23 | 24 | transformed = column_flow.transform(input_terms) 25 | 26 | assert set(transformed) == {1, 2, 3, 4, 5} 27 | assert column_flow.transformers[-1].dimension() == 7 28 | assert isinstance(input_terms, list) == True 29 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/tests/test_log_transformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from prediction_flow.transformers.column import LogTransformer 4 | 5 | 6 | def test_normal(): 7 | log_transformer = LogTransformer() 8 | 9 | x = np.array([100, 10, 32]) 10 | log_transformer.fit(x) 11 | 12 | np.testing.assert_array_almost_equal( 13 | log_transformer.transform(x), np.array([4.615121, 2.397895, 3.496508])) 14 | -------------------------------------------------------------------------------- 
--------------------------------------------------------------------------------
/prediction_flow/transformers/column/tests/test_sequence_encoder.py:
--------------------------------------------------------------------------------
1 | from prediction_flow.transformers.column import SequenceEncoder
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def test_normal():
7 |     sequence_encoder = SequenceEncoder(sep=' ', min_cnt=1, max_len=3)
8 | 
9 |     x = [
10 |         "this is a simple test",
11 |         "this class is work"
12 |     ]
13 | 
14 |     sequence_encoder.fit(x)
15 | 
16 |     actual = sequence_encoder.transform(x)
17 |     assert sequence_encoder.dimension() == 9
18 |     assert sequence_encoder.max_length() == 3
19 |     assert actual.tolist() == [[6, 3, 1], [6, 2, 3]]
20 |     assert isinstance(x, list)
21 | 
22 | 
23 | def test_unseen_inputs():
24 |     sequence_encoder = SequenceEncoder(sep=' ', min_cnt=1, max_len=10)
25 | 
26 |     x = [
27 |         "this is a simple test",
28 |         "this class is work"
29 |     ]
30 | 
31 |     sequence_encoder.fit(x)
32 | 
33 |     actual = sequence_encoder.transform(["this is an unseen test"])
34 |     assert actual.tolist() == [[6, 3, 8, 8, 5]]
35 | 
--------------------------------------------------------------------------------
/prediction_flow/transformers/column/tests/test_standard_scaler.py:
--------------------------------------------------------------------------------
1 | from prediction_flow.transformers.column import StandardScaler
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def test_normal():
7 |     scaler = StandardScaler()
8 | 
9 |     x = np.array([3, 4, 2, 24, 2], dtype=np.float64)
10 | 
11 |     scaler.fit(x)
12 | 
13 |     actual = scaler.transform(x)
14 |     expected = np.array([
15 |         -0.46880723, -0.35160542, -0.58600904, 1.99243073, -0.58600904])
16 | 
17 |     np.testing.assert_array_almost_equal(actual, expected)
18 | 
--------------------------------------------------------------------------------
/prediction_flow/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/utils/__init__.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.16.0
2 | pandas>=0.24.2
3 | torch>=1.1.0
4 | tqdm>=4.32.0
5 | scikit-learn>=0.20.0
6 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | 
4 | from setuptools import setup, find_packages
5 | import prediction_flow
6 | 
7 | 
8 | with open('README.md') as f:
9 |     LONG_DESCRIPTION = f.read()
10 | 
11 | with open('requirements.txt') as f:
12 |     INSTALL_REQUIRES = f.read().splitlines()
13 | 
14 | DISTNAME = 'prediction-flow'
15 | DESCRIPTION = 'Deep learning based CTR models implemented in PyTorch'
16 | MAINTAINER = 'Hongwei Zhang'
17 | MAINTAINER_EMAIL = 'hw_zhang@outlook.com'
18 | URL = 'https://github.com/GitHub-HongweiZhang/prediction-flow'
19 | LICENSE = 'MIT'
20 | VERSION = prediction_flow.__version__
21 | 
22 | 
23 | def setup_package():
24 |     setup(
25 |         name=DISTNAME,
26 |         packages=find_packages(),
27 |         maintainer=MAINTAINER,
28 |         maintainer_email=MAINTAINER_EMAIL,
29 |         description=DESCRIPTION,
30 |         url=URL,
31 |         version=VERSION,
32 |         long_description=LONG_DESCRIPTION,
33 |         long_description_content_type="text/markdown",
34 |         python_requires='>=3.6',
35 |         include_package_data=True,
36 |         install_requires=INSTALL_REQUIRES,
37 |         classifiers=[
38 |             'Development Status :: 3 - Alpha',
39 |             'License :: OSI Approved :: MIT License',
40 |             'Operating System :: OS Independent',
41 |             'Intended Audience :: Science/Research',
42 |             'Intended Audience :: Developers',
43 |             'Intended Audience :: Education',
44 |             'Programming Language :: Python',
45 |             'Programming Language :: Python :: 3',
46 |             'Programming Language :: Python :: 3.6',
47 |             'Topic :: Software Development',
48 |             'Topic :: Scientific/Engineering',
49 |         ],
50 |         license=LICENSE,
51 |         keywords=[
52 |             'torch', 'ctr prediction', 'deep learning',
53 |             'deepfm', 'din', 'dnn', 'deep neural network']
54 |     )
55 | 
56 | 
57 | if __name__ == '__main__':
58 |     setup_package()
59 | 
--------------------------------------------------------------------------------
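setup.py reads VERSION from prediction_flow.__version__, so a one-line sanity check is possible after installing, assuming the package has been installed first (for example with pip install . from a checkout):

import prediction_flow

# Should print the same version string that setup.py packaged.
print(prediction_flow.__version__)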