├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples ├── amazon │ ├── amazon-lightning.ipynb │ ├── amazon.ipynb │ ├── prepare_neg.ipynb │ └── simple_benchmark.png └── movielens │ ├── ml-1m │ ├── README │ ├── preprocess.ipynb │ ├── test.csv │ └── train.csv │ └── movielens-1m.ipynb ├── prediction_flow ├── __init__.py ├── features │ ├── __init__.py │ ├── base.py │ ├── category_feature.py │ ├── features.py │ ├── number_feature.py │ ├── sequence_feature.py │ └── tests │ │ ├── __init__.py │ │ └── test_features.py ├── metrics │ └── __init__.py ├── pytorch │ ├── __init__.py │ ├── base.py │ ├── data │ │ ├── __init__.py │ │ ├── dataset.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_dataset.py │ ├── deepfm.py │ ├── dien.py │ ├── din.py │ ├── dnn.py │ ├── functions.py │ ├── interest_net.py │ ├── nn │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── fm.py │ │ ├── interest.py │ │ ├── mlp.py │ │ ├── pooling.py │ │ ├── rnn.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_attention.py │ │ │ ├── test_fm.py │ │ │ ├── test_interest.py │ │ │ ├── test_mlp.py │ │ │ ├── test_pooling.py │ │ │ └── test_rnn.py │ ├── tests │ │ ├── __init__.py │ │ ├── test_deepfm.py │ │ ├── test_dien.py │ │ ├── test_din.py │ │ ├── test_dnn.py │ │ ├── test_wide_deep.py │ │ └── utils.py │ ├── utils.py │ └── wide_deep.py ├── transformers │ ├── __init__.py │ └── column │ │ ├── __init__.py │ │ ├── base.py │ │ ├── category_encoder.py │ │ ├── column_flow.py │ │ ├── log_transformer.py │ │ ├── sequence_encoder.py │ │ ├── standard_scaler.py │ │ └── tests │ │ ├── __init__.py │ │ ├── test_category_encoder.py │ │ ├── test_column_flow.py │ │ ├── test_log_transformer.py │ │ ├── test_sequence_encoder.py │ │ └── test_standard_scaler.py └── utils │ └── __init__.py ├── requirements.txt ├── setup.cfg └── setup.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: GitHub-HongweiZhang 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: GitHub-HongweiZhang 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | examples/movielens/ml-1m/*.dat 107 | examples/amazon/local* 108 | examples/amazon/*info 109 | tmp/ 110 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.6" 5 | 6 | script: 7 | - pytest 8 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | TODO 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Hongwei Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the README 2 | include *.md 3 | 4 | # Include the license file 5 | include LICENSE 6 | 7 | # Include the Requirements 8 | include requirements.txt 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/GitHub-HongweiZhang/prediction-flow.svg?branch=master)](https://travis-ci.org/GitHub-HongweiZhang/prediction-flow) 2 | 3 | [![PyPI version](https://badge.fury.io/py/prediction-flow.svg)](https://badge.fury.io/py/prediction-flow) 4 | 5 | # prediction-flow 6 | **prediction-flow** is a Python package providing modern **Deep-Learning** 7 | based CTR models, implemented with **PyTorch**. 8 | 9 | ## how to use 10 | * Install using pip. 11 | ``` 12 | pip install prediction-flow 13 | ``` 14 | 15 | ## feature 16 | ### how to define feature 17 | Every feature type takes two parameters, name and column_flow. 18 | The name parameter is used to look up the raw column data in the input data frame. 19 | The column_flow parameter is a single transformer or a list of transformers. 20 | Transformers pre-process the column data before the model is trained. 21 | 22 | * dense number feature 23 | ``` 24 | Number('age', StandardScaler()) 25 | Number('ctr', None) 26 | ``` 27 | * sparse category feature 28 | ``` 29 | Category('movieId', CategoryEncoder(min_cnt=1)) 30 | ``` 31 | * variable-length sequence feature 32 | ``` 33 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)) 34 | ``` 35 | 36 | ## transformer 37 | The following transformers are currently provided. 38 | 39 | | transformer | supported feature type | detail | 40 | |--|--|--| 41 | | StandardScaler | Number | Wrapper of scikit-learn's StandardScaler. Null values must be filled in advance. | 42 | | LogTransformer | Number | Log scaler. Null values must be filled in advance. | 43 | | CategoryEncoder | Category | Converts str values to int ids. Null values must be filled in advance using '\_\_UNKNOWN\_\_'. | 44 | | SequenceEncoder | Sequence | Converts sequences of str values to int ids. Null values must be filled in advance using '\_\_UNKNOWN\_\_'. |
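Putting the pieces together, a minimal sketch (the data frame and its values below are made up for illustration; Number, Category, Sequence, the transformers, and the Features container are all provided by this package):
```
import pandas as pd

from prediction_flow.features import Number, Category, Sequence, Features
from prediction_flow.transformers.column import (
    StandardScaler, CategoryEncoder, SequenceEncoder)

# Toy frame; null values must be filled before fitting (see the table above).
df = pd.DataFrame({
    'age': [23, 35, 41],
    'movieId': ['1193', '661', '914'],
    'genres': ['Drama|War', 'Animation|Comedy', 'Musical|Romance']})

features = Features(
    number_features=[Number('age', StandardScaler())],
    category_features=[Category('movieId', CategoryEncoder(min_cnt=1))],
    sequence_features=[Sequence('genres', SequenceEncoder(sep='|', min_cnt=1))])

features.fit(df)
transformed = features.transform(df)  # dict of numpy arrays keyed by feature name
```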
 45 | 46 | ## model 47 | 48 | | model | reference | 49 | |--|--| 50 | | DNN | - | 51 | | Wide & Deep | [DLRS 2016][Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792.pdf) | 52 | | DeepFM | [IJCAI 2017][DeepFM: A Factorization-Machine based Neural Network for CTR Prediction](http://www.ijcai.org/proceedings/2017/0239.pdf) | 53 | | DIN | [KDD 2018][Deep Interest Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1706.06978.pdf) | 54 | | DNN + GRU + GRU + Attention | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 55 | | DNN + GRU + AIGRU | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 56 | | DNN + GRU + AGRU | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 57 | | DNN + GRU + AUGRU | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 58 | | DIEN | [AAAI 2019][Deep Interest Evolution Network for Click-Through Rate Prediction](https://arxiv.org/pdf/1809.03672.pdf) | 59 | | OTHER | TODO | 60 | 61 | ## example 62 | ### movielens-1M 63 | **This dataset is only used to verify that the code runs; the 64 | resulting accuracy is not meaningful.** 65 | * Prepare the dataset. [preprocess.ipynb](examples/movielens/ml-1m/preprocess.ipynb) 66 | * Run the model. [movielens-1m.ipynb](examples/movielens/movielens-1m.ipynb) 67 | 68 | ### amazon 69 | * Prepare the dataset. [prepare_neg.ipynb](examples/amazon/prepare_neg.ipynb) 70 | * Run the model. 71 | [amazon.ipynb](examples/amazon/amazon.ipynb) 72 | * An example using [pytorch-lightning](https://github.com/williamFalcon/pytorch-lightning). 73 | [amazon-lightning.ipynb](examples/amazon/amazon-lightning.ipynb) 74 | 75 | **accuracy** 76 | 77 | ![benchmark](examples/amazon/simple_benchmark.png) 78 | 79 | ## acknowledgement and reference 80 | * The feature design follows [DeepCTR](https://github.com/shenweichen/DeepCTR): 81 | features are divided into dense (class Number), sparse (class Category), 82 | and sequence (class Sequence) types. 83 | -------------------------------------------------------------------------------- /examples/amazon/amazon-lightning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Script to run the experiments described in the paper: Deep Interest Evolution Network for Click-Through Rate Prediction\n", 8 | "\n", 9 | "## how to run\n", 10 | "\n", 11 | "1. Please run prepare_neg.ipynb first."
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "SEQ_MAX_LEN = 100 # maximum sequence length\n", 21 | "BATCH_SIZE = 128\n", 22 | "EMBEDDING_DIM = 18\n", 23 | "DNN_HIDDEN_SIZE = [200, 80]\n", 24 | "DNN_DROPOUT = 0.0\n", 25 | "TEST_RUN = False\n", 26 | "EPOCH = 2\n", 27 | "SEED = 10" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "%matplotlib inline\n", 37 | "\n", 38 | "import itertools\n", 39 | "from collections import Counter, OrderedDict\n", 40 | "\n", 41 | "import random\n", 42 | "import numpy as np\n", 43 | "import pandas as pd\n", 44 | "import matplotlib.pyplot as plt\n", 45 | "\n", 46 | "import torch\n", 47 | "import torch.nn as nn\n", 48 | "import torch.optim as optim\n", 49 | "import torch.nn.functional as F\n", 50 | "from sklearn.metrics import roc_auc_score\n", 51 | "\n", 52 | "from prediction_flow.features import Number, Category, Sequence, Features\n", 53 | "from prediction_flow.transformers.column import (\n", 54 | " StandardScaler, CategoryEncoder, SequenceEncoder)\n", 55 | "\n", 56 | "from prediction_flow.pytorch.data import Dataset\n", 57 | "from prediction_flow.pytorch import WideDeep, DeepFM, DNN, DIN, DIEN, AttentionGroup\n", 58 | "\n", 59 | "from prediction_flow.pytorch.functions import fit, predict, create_dataloader_fn" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "" 71 | ] 72 | }, 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "random.seed(SEED)\n", 80 | "np.random.seed(SEED)\n", 81 | "torch.manual_seed(SEED)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "train_df = pd.read_csv(\n", 91 | " \"./local_train.csv\", sep='\\t')\n", 92 | "\n", 93 | "valid_df = pd.read_csv(\n", 94 | " \"./local_test.csv\", sep='\\t')" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "if TEST_RUN:\n", 104 | " train_df = train_df.sample(1000)\n", 105 | " valid_df = valid_df.sample(1000)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/html": [ 116 | "
\n", 117 | "\n", 130 | "\n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
labeluidmidcathist_midshist_catsneg_hist_midsneg_hist_cats
00AZPJ9LUT0FEPYB00AMNNTIALiterature & Fiction0307744434\u00020062248391\u00020470530707\u00020978924622\u000215...Books\u0002Books\u0002Books\u0002Books\u0002Books0786890487\u00020618539069\u0002B001IDZJO0\u00021603421548\u000203...Books\u0002Books\u0002Books\u0002Books\u0002Books
11AZPJ9LUT0FEPY0800731603Books0307744434\u00020062248391\u00020470530707\u00020978924622\u000215...Books\u0002Books\u0002Books\u0002Books\u0002BooksB00BEFIHOG\u00021402245270\u00020670031747\u00020615785182\u000214...Literary\u0002Books\u0002Books\u0002Books\u0002Books
20A2NRV79GKAU726B003NNV10ORussian0814472869\u00020071462074\u00021583942300\u00020812538366\u0002B0...Books\u0002Books\u0002Books\u0002Books\u0002Baking\u0002Books\u0002BooksB00LQABRTG\u0002087830178X\u00020991543009\u0002071533154X\u000203...Neuropsychology\u0002Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002...
31A2NRV79GKAU726B000UWJ91OBooks0814472869\u00020071462074\u00021583942300\u00020812538366\u0002B0...Books\u0002Books\u0002Books\u0002Books\u0002Baking\u0002Books\u0002Books1595328149\u00021591797810\u00020451233018\u00020373771355\u000214...Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002Contempora...
40A2GEQVDX2LL4V30321334094Books0743596870\u00020374280991\u00021439140634\u00020976475731Books\u0002Books\u0002Books\u0002Books0316159735\u0002156718359X\u00020786812400\u00020062506110Books\u0002Books\u0002Books\u0002Books
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " label uid mid cat \\\n", 206 | "0 0 AZPJ9LUT0FEPY B00AMNNTIA Literature & Fiction \n", 207 | "1 1 AZPJ9LUT0FEPY 0800731603 Books \n", 208 | "2 0 A2NRV79GKAU726 B003NNV10O Russian \n", 209 | "3 1 A2NRV79GKAU726 B000UWJ91O Books \n", 210 | "4 0 A2GEQVDX2LL4V3 0321334094 Books \n", 211 | "\n", 212 | " hist_mids \\\n", 213 | "0 0307744434\u00020062248391\u00020470530707\u00020978924622\u000215... \n", 214 | "1 0307744434\u00020062248391\u00020470530707\u00020978924622\u000215... \n", 215 | "2 0814472869\u00020071462074\u00021583942300\u00020812538366\u0002B0... \n", 216 | "3 0814472869\u00020071462074\u00021583942300\u00020812538366\u0002B0... \n", 217 | "4 0743596870\u00020374280991\u00021439140634\u00020976475731 \n", 218 | "\n", 219 | " hist_cats \\\n", 220 | "0 Books\u0002Books\u0002Books\u0002Books\u0002Books \n", 221 | "1 Books\u0002Books\u0002Books\u0002Books\u0002Books \n", 222 | "2 Books\u0002Books\u0002Books\u0002Books\u0002Baking\u0002Books\u0002Books \n", 223 | "3 Books\u0002Books\u0002Books\u0002Books\u0002Baking\u0002Books\u0002Books \n", 224 | "4 Books\u0002Books\u0002Books\u0002Books \n", 225 | "\n", 226 | " neg_hist_mids \\\n", 227 | "0 0786890487\u00020618539069\u0002B001IDZJO0\u00021603421548\u000203... \n", 228 | "1 B00BEFIHOG\u00021402245270\u00020670031747\u00020615785182\u000214... \n", 229 | "2 B00LQABRTG\u0002087830178X\u00020991543009\u0002071533154X\u000203... \n", 230 | "3 1595328149\u00021591797810\u00020451233018\u00020373771355\u000214... \n", 231 | "4 0316159735\u0002156718359X\u00020786812400\u00020062506110 \n", 232 | "\n", 233 | " neg_hist_cats \n", 234 | "0 Books\u0002Books\u0002Books\u0002Books\u0002Books \n", 235 | "1 Literary\u0002Books\u0002Books\u0002Books\u0002Books \n", 236 | "2 Neuropsychology\u0002Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002... \n", 237 | "3 Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002Books\u0002Contempora... \n", 238 | "4 Books\u0002Books\u0002Books\u0002Books " 239 | ] 240 | }, 241 | "execution_count": 6, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "train_df.head()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 7, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/html": [ 258 | "
\n", 259 | "\n", 272 | "\n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | "
labeluidmidcathist_midshist_catsneg_hist_midsneg_hist_cats
00A3BI7R43VUZ1TYB00JNHU0T2Literature & Fiction0989464105\u0002B00B01691C\u00021477809732\u00021608442845Books\u0002Literature & Fiction\u0002Books\u0002Books0899576168\u0002B0056ATROO\u00020446600474\u00020615209459Books\u0002Sleep\u0002Books\u0002Books
11A3BI7R43VUZ1TY0989464121Books0989464105\u0002B00B01691C\u00021477809732\u00021608442845Books\u0002Literature & Fiction\u0002Books\u0002Books0373527721\u00020981854524\u00020470404159\u0002B00BWKBSOYBooks\u0002Books\u0002Books\u0002Literature & Fiction
20A2Z3AHJPXG3ZNPB0072YSPJ0Literature & Fiction1478310960\u00021492231452\u00021477603425\u0002B00FRKLA6QBooks\u0002Books\u0002Books\u0002UrbanB00EQAEA60\u0002B007D64VX6\u0002188547766X\u00021590172477Literature & Fiction\u0002Quran\u0002Books\u0002Books
31A2Z3AHJPXG3ZNPB00G4I4I5UUrban1478310960\u00021492231452\u00021477603425\u0002B00FRKLA6QBooks\u0002Books\u0002Books\u0002Urban1583942475\u00021585678600\u00021570199221\u00020312373090Books\u0002Books\u0002Books\u0002Books
40A2KDDPJUNWC5CA0316228532Books0141326085\u0002031026622X\u00020316077046\u00020988649179\u000214...Books\u0002Books\u0002Books\u0002Books\u0002BooksB0077FOPFC\u00021594744106\u0002B00DFGN1DE\u00020972259112\u0002B0...Ghosts\u0002Books\u0002Erotica\u0002Books\u0002Soups & Stews
\n", 344 | "
" 345 | ], 346 | "text/plain": [ 347 | " label uid mid cat \\\n", 348 | "0 0 A3BI7R43VUZ1TY B00JNHU0T2 Literature & Fiction \n", 349 | "1 1 A3BI7R43VUZ1TY 0989464121 Books \n", 350 | "2 0 A2Z3AHJPXG3ZNP B0072YSPJ0 Literature & Fiction \n", 351 | "3 1 A2Z3AHJPXG3ZNP B00G4I4I5U Urban \n", 352 | "4 0 A2KDDPJUNWC5CA 0316228532 Books \n", 353 | "\n", 354 | " hist_mids \\\n", 355 | "0 0989464105\u0002B00B01691C\u00021477809732\u00021608442845 \n", 356 | "1 0989464105\u0002B00B01691C\u00021477809732\u00021608442845 \n", 357 | "2 1478310960\u00021492231452\u00021477603425\u0002B00FRKLA6Q \n", 358 | "3 1478310960\u00021492231452\u00021477603425\u0002B00FRKLA6Q \n", 359 | "4 0141326085\u0002031026622X\u00020316077046\u00020988649179\u000214... \n", 360 | "\n", 361 | " hist_cats \\\n", 362 | "0 Books\u0002Literature & Fiction\u0002Books\u0002Books \n", 363 | "1 Books\u0002Literature & Fiction\u0002Books\u0002Books \n", 364 | "2 Books\u0002Books\u0002Books\u0002Urban \n", 365 | "3 Books\u0002Books\u0002Books\u0002Urban \n", 366 | "4 Books\u0002Books\u0002Books\u0002Books\u0002Books \n", 367 | "\n", 368 | " neg_hist_mids \\\n", 369 | "0 0899576168\u0002B0056ATROO\u00020446600474\u00020615209459 \n", 370 | "1 0373527721\u00020981854524\u00020470404159\u0002B00BWKBSOY \n", 371 | "2 B00EQAEA60\u0002B007D64VX6\u0002188547766X\u00021590172477 \n", 372 | "3 1583942475\u00021585678600\u00021570199221\u00020312373090 \n", 373 | "4 B0077FOPFC\u00021594744106\u0002B00DFGN1DE\u00020972259112\u0002B0... \n", 374 | "\n", 375 | " neg_hist_cats \n", 376 | "0 Books\u0002Sleep\u0002Books\u0002Books \n", 377 | "1 Books\u0002Books\u0002Books\u0002Literature & Fiction \n", 378 | "2 Literature & Fiction\u0002Quran\u0002Books\u0002Books \n", 379 | "3 Books\u0002Books\u0002Books\u0002Books \n", 380 | "4 Ghosts\u0002Books\u0002Erotica\u0002Books\u0002Soups & Stews " 381 | ] 382 | }, 383 | "execution_count": 7, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "valid_df.head()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "# define features" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 8, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "cat_enc = SequenceEncoder(sep=\"\\x02\", min_cnt=1, max_len=SEQ_MAX_LEN)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 9, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "" 417 | ] 418 | }, 419 | "execution_count": 9, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "cat_enc.fit(train_df.hist_cats.values)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 10, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "cat_word2idx, cat_idx2word = cat_enc.word2idx, cat_enc.idx2word" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 11, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "1602\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "print(len(cat_word2idx))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 12, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "mid_enc = SequenceEncoder(sep=\"\\x02\", min_cnt=1, max_len=SEQ_MAX_LEN)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 13, 466 | 
"metadata": {}, 467 | "outputs": [ 468 | { 469 | "data": { 470 | "text/plain": [ 471 | "" 472 | ] 473 | }, 474 | "execution_count": 13, 475 | "metadata": {}, 476 | "output_type": "execute_result" 477 | } 478 | ], 479 | "source": [ 480 | "mid_enc.fit(np.vstack([train_df.mid.values, train_df.hist_mids.values]))" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 14, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "mid_word2idx, mid_idx2word = mid_enc.word2idx, mid_enc.idx2word" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 15, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "367984\n" 502 | ] 503 | } 504 | ], 505 | "source": [ 506 | "print(len(mid_word2idx))" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 16, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "number_features = []\n", 516 | "\n", 517 | "category_features = [\n", 518 | " Category('mid',\n", 519 | " CategoryEncoder(min_cnt=1, word2idx=mid_word2idx, idx2word=mid_idx2word),\n", 520 | " embedding_name='mid'),\n", 521 | " Category('cat',\n", 522 | " CategoryEncoder(min_cnt=1, word2idx=cat_word2idx, idx2word=cat_idx2word),\n", 523 | " embedding_name='cat'),\n", 524 | "]\n", 525 | "\n", 526 | "sequence_features = [\n", 527 | " Sequence('hist_mids',\n", 528 | " SequenceEncoder(sep=\"\\x02\", min_cnt=1, max_len=SEQ_MAX_LEN,\n", 529 | " word2idx=mid_word2idx, idx2word=mid_idx2word),\n", 530 | " embedding_name='mid'),\n", 531 | " Sequence('hist_cats',\n", 532 | " SequenceEncoder(sep=\"\\x02\", min_cnt=1, max_len=SEQ_MAX_LEN,\n", 533 | " word2idx=cat_word2idx, idx2word=cat_idx2word),\n", 534 | " embedding_name='cat')\n", 535 | "]\n", 536 | "\n", 537 | "features, train_loader, valid_loader = create_dataloader_fn(\n", 538 | " number_features, category_features, sequence_features, BATCH_SIZE, train_df, 'label', valid_df, 4)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 17, 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "def evaluation(model, df, dataloader):\n", 548 | " preds = predict(model, dataloader)\n", 549 | " return roc_auc_score(df['label'], preds.ravel())" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 18, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "import pytorch_lightning as pl\n", 559 | "\n", 560 | "class CoolModel(pl.LightningModule):\n", 561 | " def __init__(self):\n", 562 | " super(CoolModel, self).__init__()\n", 563 | " self.model = DNN(\n", 564 | " features,\n", 565 | " 2,\n", 566 | " EMBEDDING_DIM,\n", 567 | " DNN_HIDDEN_SIZE,\n", 568 | " final_activation='sigmoid',\n", 569 | " dropout=DNN_DROPOUT)\n", 570 | " \n", 571 | " def forward(self, x):\n", 572 | " return self.model(x)\n", 573 | "\n", 574 | " def training_step(self, batch, batch_nb):\n", 575 | " # REQUIRED\n", 576 | " y = batch['label']\n", 577 | " y_hat = self.forward(batch)\n", 578 | " loss = F.binary_cross_entropy(y_hat, y)\n", 579 | " return {\n", 580 | " 'loss': loss,\n", 581 | " 'progress_bar':\n", 582 | " {'training_loss': loss}}\n", 583 | "\n", 584 | " def validation_step(self, batch, batch_nb):\n", 585 | " # OPTIONAL\n", 586 | " y = batch['label']\n", 587 | " y_hat = self.forward(batch)\n", 588 | " loss = F.binary_cross_entropy(y_hat, y)\n", 589 | " return {'val_loss': loss}\n", 590 | "\n", 591 | " def validation_end(self, outputs):\n", 592 | " # 
OPTIONAL\n", 593 | " avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()\n", 594 | " return {'progress_bar': {'val_loss': avg_loss}}\n", 595 | "\n", 596 | " def configure_optimizers(self):\n", 597 | " # REQUIRED\n", 598 | " return torch.optim.Adam(self.parameters(), lr=0.003)\n", 599 | "\n", 600 | " @pl.data_loader\n", 601 | " def train_dataloader(self):\n", 602 | " return train_loader\n", 603 | "\n", 604 | " @pl.data_loader\n", 605 | " def val_dataloader(self):\n", 606 | " # OPTIONAL\n", 607 | " # can also return a list of val dataloaders\n", 608 | " return valid_loader" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 22, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "gpu available: True, used: True\n", 621 | "VISIBLE GPUS: 0\n" 622 | ] 623 | } 624 | ], 625 | "source": [ 626 | "from pytorch_lightning import Trainer\n", 627 | "\n", 628 | "model = CoolModel()\n", 629 | "\n", 630 | "# most basic trainer, uses good defaults\n", 631 | "trainer = Trainer(max_nb_epochs=EPOCH, gpus=1) " 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 23, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "name": "stderr", 641 | "output_type": "stream", 642 | "text": [ 643 | " 0%| | 0/5 [00:00] 81,977,637 105MB/s in 0.7s \n", 35 | "\n", 36 | "2019-09-07 17:45:11 (105 MB/s) - ‘data.tar.gz’ saved [81977637/81977637]\n", 37 | "\n", 38 | "--2019-09-07 17:45:11-- https://raw.githubusercontent.com/mouna99/dien/master/data1.tar.gz\n", 39 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133\n", 40 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.\n", 41 | "HTTP request sent, awaiting response... 200 OK\n", 42 | "Length: 104430448 (100M) [application/octet-stream]\n", 43 | "Saving to: ‘data1.tar.gz’\n", 44 | "\n", 45 | "100%[======================================>] 104,430,448 86.5MB/s in 1.2s \n", 46 | "\n", 47 | "2019-09-07 17:45:13 (86.5 MB/s) - ‘data1.tar.gz’ saved [104430448/104430448]\n", 48 | "\n", 49 | "--2019-09-07 17:45:13-- https://raw.githubusercontent.com/mouna99/dien/master/data2.tar.gz\n", 50 | "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133\n", 51 | "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.\n", 52 | "HTTP request sent, awaiting response... 200 OK\n", 53 | "Length: 9460706 (9.0M) [application/octet-stream]\n", 54 | "Saving to: ‘data2.tar.gz’\n", 55 | "\n", 56 | "100%[======================================>] 9,460,706 --.-K/s in 0.1s \n", 57 | "\n", 58 | "2019-09-07 17:45:13 (95.0 MB/s) - ‘data2.tar.gz’ saved [9460706/9460706]\n", 59 | "\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "! wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data.tar.gz\n", 65 | "! wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data1.tar.gz\n", 66 | "! 
wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data2.tar.gz" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 15, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "data/\n", 79 | "data/cat_voc.pkl\n", 80 | "data/mid_voc.pkl\n", 81 | "data/uid_voc.pkl\n", 82 | "data/local_train_splitByUser\n", 83 | "data/local_test_splitByUser\n", 84 | "data1/\n", 85 | "data1/reviews-info\n", 86 | "data2/\n", 87 | "data2/item-info\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "! tar jxvf ./data.tar.gz && tar jxvf ./data1.tar.gz && tar jxvf ./data2.tar.gz" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 16, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "TEST_RUN = False" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 17, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "train_df = pd.read_csv(\n", 111 | " \"./data/local_train_splitByUser\", sep='\\t',\n", 112 | " names=['label', 'uid', 'mid', 'cat', 'hist_mids', 'hist_cats'])\n", 113 | "\n", 114 | "valid_df = pd.read_csv(\n", 115 | " \"./data/local_test_splitByUser\", sep='\\t',\n", 116 | " names=['label', 'uid', 'mid', 'cat', 'hist_mids', 'hist_cats'])" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 18, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "item_info_df = pd.read_csv(\"./data2/item-info\", sep='\\t', names=['mid', 'cat'])" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 19, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/html": [ 136 | "
\n", 137 | "\n", 150 | "\n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
midcat
00001048791Books
10001048775Books
20001048236Books
30000401048Books
40001019880Books
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | " mid cat\n", 190 | "0 0001048791 Books\n", 191 | "1 0001048775 Books\n", 192 | "2 0001048236 Books\n", 193 | "3 0000401048 Books\n", 194 | "4 0001019880 Books" 195 | ] 196 | }, 197 | "execution_count": 19, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "item_info_df.head()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 20, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "reviews_info_df = pd.read_csv(\"./data1/reviews-info\", sep='\\t', names=['c1', 'mid', 'c3', 'c4'])" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 21, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/html": [ 223 | "
\n", 224 | "\n", 237 | "\n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | "
c1midc3c4
0A10000012B7CGYKOMPQ4L000100039X5.01355616000
1A2S166WSCFIFP5000100039X5.01071100800
2A1BM81XB4QHOA3000100039X5.01390003200
3A1MOSTXNIO5MPJ000100039X5.01317081600
4A2XQ5LZHTD4AFT000100039X5.01033948800
\n", 285 | "
" 286 | ], 287 | "text/plain": [ 288 | " c1 mid c3 c4\n", 289 | "0 A10000012B7CGYKOMPQ4L 000100039X 5.0 1355616000\n", 290 | "1 A2S166WSCFIFP5 000100039X 5.0 1071100800\n", 291 | "2 A1BM81XB4QHOA3 000100039X 5.0 1390003200\n", 292 | "3 A1MOSTXNIO5MPJ 000100039X 5.0 1317081600\n", 293 | "4 A2XQ5LZHTD4AFT 000100039X 5.0 1033948800" 294 | ] 295 | }, 296 | "execution_count": 21, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "reviews_info_df.head()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 22, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "reviews_info_df = reviews_info_df[['mid']].merge(item_info_df, on='mid', how='inner').drop_duplicates()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 23, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/html": [ 322 | "
\n", 323 | "\n", 336 | "\n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | "
midcat
0000100039XBooks
2060001055178Books
2240001473123Books
2400001473727Books
2470001473905Books
\n", 372 | "
" 373 | ], 374 | "text/plain": [ 375 | " mid cat\n", 376 | "0 000100039X Books\n", 377 | "206 0001055178 Books\n", 378 | "224 0001473123 Books\n", 379 | "240 0001473727 Books\n", 380 | "247 0001473905 Books" 381 | ] 382 | }, 383 | "execution_count": 23, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "reviews_info_df.head()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 24, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "mid_cat_map = reviews_info_df.set_index('mid').to_dict()['cat']" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 25, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "if TEST_RUN:\n", 408 | " train_df = train_df.sample(1000)\n", 409 | " valid_df = valid_df.sample(1000)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 26, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# slow implement\n", 419 | "def prepare_neg(df):\n", 420 | " records = df['hist_mids'].apply(lambda x: x.split(\"\u0002\"))\n", 421 | " candidates = list(mid_cat_map.keys())\n", 422 | " max_len = len(candidates)\n", 423 | "\n", 424 | " def neg_sampling(filters, length): \n", 425 | " mids = []\n", 426 | " cats = []\n", 427 | " for i in range(length):\n", 428 | " while(1):\n", 429 | " c = candidates[np.random.randint(0, max_len)]\n", 430 | " if c not in filters:\n", 431 | " mids.append(c)\n", 432 | " cats.append(mid_cat_map[c])\n", 433 | " filters.add(c)\n", 434 | " break\n", 435 | " return mids, cats\n", 436 | " \n", 437 | " total_neg_mids = []\n", 438 | " total_neg_cats = []\n", 439 | " for record in records:\n", 440 | " neg_mids, neg_cats = neg_sampling(set(record), len(record))\n", 441 | " total_neg_mids.append(\"\u0002\".join(neg_mids))\n", 442 | " total_neg_cats.append(\"\u0002\".join(neg_cats))\n", 443 | " \n", 444 | " return total_neg_mids, total_neg_cats" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 27, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "total_neg_mids, total_neg_cats = prepare_neg(train_df)\n", 454 | "train_df['neg_hist_mids'] = total_neg_mids\n", 455 | "train_df['neg_hist_cats'] = total_neg_cats" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 28, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "total_neg_mids, total_neg_cats = prepare_neg(valid_df)\n", 465 | "valid_df['neg_hist_mids'] = total_neg_mids\n", 466 | "valid_df['neg_hist_cats'] = total_neg_cats" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 29, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "train_df.to_csv('local_train.csv', sep='\\t', index=False)\n", 476 | "valid_df.to_csv('local_test.csv', sep='\\t', index=False)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [] 485 | } 486 | ], 487 | "metadata": { 488 | "kernelspec": { 489 | "display_name": "Python 3", 490 | "language": "python", 491 | "name": "python3" 492 | }, 493 | "language_info": { 494 | "codemirror_mode": { 495 | "name": "ipython", 496 | "version": 3 497 | }, 498 | "file_extension": ".py", 499 | "mimetype": "text/x-python", 500 | "name": "python", 501 | "nbconvert_exporter": "python", 502 | "pygments_lexer": "ipython3", 503 | "version": "3.6.2" 504 | } 505 | }, 506 | "nbformat": 4, 507 | "nbformat_minor": 2 508 | } 509 | 
-------------------------------------------------------------------------------- /examples/amazon/simple_benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/examples/amazon/simple_benchmark.png -------------------------------------------------------------------------------- /examples/movielens/ml-1m/README: -------------------------------------------------------------------------------- 1 | SUMMARY 2 | ================================================================================ 3 | 4 | These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 5 | made by 6,040 MovieLens users who joined MovieLens in 2000. 6 | 7 | USAGE LICENSE 8 | ================================================================================ 9 | 10 | Neither the University of Minnesota nor any of the researchers 11 | involved can guarantee the correctness of the data, its suitability 12 | for any particular purpose, or the validity of results based on the 13 | use of the data set. The data set may be used for any research 14 | purposes under the following conditions: 15 | 16 | * The user may not state or imply any endorsement from the 17 | University of Minnesota or the GroupLens Research Group. 18 | 19 | * The user must acknowledge the use of the data set in 20 | publications resulting from the use of the data set 21 | (see below for citation information). 22 | 23 | * The user may not redistribute the data without separate 24 | permission. 25 | 26 | * The user may not use this information for any commercial or 27 | revenue-bearing purposes without first obtaining permission 28 | from a faculty member of the GroupLens Research Project at the 29 | University of Minnesota. 30 | 31 | If you have any further questions or comments, please contact GroupLens 32 | . 33 | 34 | CITATION 35 | ================================================================================ 36 | 37 | To acknowledge use of the dataset in publications, please cite the following 38 | paper: 39 | 40 | F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History 41 | and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, 42 | Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872 43 | 44 | 45 | ACKNOWLEDGEMENTS 46 | ================================================================================ 47 | 48 | Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data 49 | set. 50 | 51 | FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT 52 | ================================================================================ 53 | 54 | The GroupLens Research Project is a research group in the Department of 55 | Computer Science and Engineering at the University of Minnesota. Members of 56 | the GroupLens Research Project are involved in many research projects related 57 | to the fields of information filtering, collaborative filtering, and 58 | recommender systems. The project is lead by professors John Riedl and Joseph 59 | Konstan. The project began to explore automated collaborative filtering in 60 | 1992, but is most well known for its world wide trial of an automated 61 | collaborative filtering system for Usenet news in 1996. 
Since then the project 62 | has expanded its scope to research overall information filtering solutions, 63 | integrating in content-based methods as well as improving current collaborative 64 | filtering technology. 65 | 66 | Further information on the GroupLens Research project, including research 67 | publications, can be found at the following web site: 68 | 69 | http://www.grouplens.org/ 70 | 71 | GroupLens Research currently operates a movie recommender based on 72 | collaborative filtering: 73 | 74 | http://www.movielens.org/ 75 | 76 | RATINGS FILE DESCRIPTION 77 | ================================================================================ 78 | 79 | All ratings are contained in the file "ratings.dat" and are in the 80 | following format: 81 | 82 | UserID::MovieID::Rating::Timestamp 83 | 84 | - UserIDs range between 1 and 6040 85 | - MovieIDs range between 1 and 3952 86 | - Ratings are made on a 5-star scale (whole-star ratings only) 87 | - Timestamp is represented in seconds since the epoch as returned by time(2) 88 | - Each user has at least 20 ratings 89 | 90 | USERS FILE DESCRIPTION 91 | ================================================================================ 92 | 93 | User information is in the file "users.dat" and is in the following 94 | format: 95 | 96 | UserID::Gender::Age::Occupation::Zip-code 97 | 98 | All demographic information is provided voluntarily by the users and is 99 | not checked for accuracy. Only users who have provided some demographic 100 | information are included in this data set. 101 | 102 | - Gender is denoted by a "M" for male and "F" for female 103 | - Age is chosen from the following ranges: 104 | 105 | * 1: "Under 18" 106 | * 18: "18-24" 107 | * 25: "25-34" 108 | * 35: "35-44" 109 | * 45: "45-49" 110 | * 50: "50-55" 111 | * 56: "56+" 112 | 113 | - Occupation is chosen from the following choices: 114 | 115 | * 0: "other" or not specified 116 | * 1: "academic/educator" 117 | * 2: "artist" 118 | * 3: "clerical/admin" 119 | * 4: "college/grad student" 120 | * 5: "customer service" 121 | * 6: "doctor/health care" 122 | * 7: "executive/managerial" 123 | * 8: "farmer" 124 | * 9: "homemaker" 125 | * 10: "K-12 student" 126 | * 11: "lawyer" 127 | * 12: "programmer" 128 | * 13: "retired" 129 | * 14: "sales/marketing" 130 | * 15: "scientist" 131 | * 16: "self-employed" 132 | * 17: "technician/engineer" 133 | * 18: "tradesman/craftsman" 134 | * 19: "unemployed" 135 | * 20: "writer" 136 | 137 | MOVIES FILE DESCRIPTION 138 | ================================================================================ 139 | 140 | Movie information is in the file "movies.dat" and is in the following 141 | format: 142 | 143 | MovieID::Title::Genres 144 | 145 | - Titles are identical to titles provided by the IMDB (including 146 | year of release) 147 | - Genres are pipe-separated and are selected from the following genres: 148 | 149 | * Action 150 | * Adventure 151 | * Animation 152 | * Children's 153 | * Comedy 154 | * Crime 155 | * Documentary 156 | * Drama 157 | * Fantasy 158 | * Film-Noir 159 | * Horror 160 | * Musical 161 | * Mystery 162 | * Romance 163 | * Sci-Fi 164 | * Thriller 165 | * War 166 | * Western 167 | 168 | - Some MovieIDs do not correspond to a movie due to accidental duplicate 169 | entries and/or test entries 170 | - Movies are mostly entered by hand, so errors and inconsistencies may exist 171 | -------------------------------------------------------------------------------- /prediction_flow/__init__.py: 
-------------------------------------------------------------------------------- 1 | __version__ = '0.1.5' 2 | -------------------------------------------------------------------------------- /prediction_flow/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .number_feature import Number 2 | from .category_feature import Category 3 | from .sequence_feature import Sequence 4 | from .features import Features 5 | 6 | 7 | __all__ = [ 8 | 'Number', 9 | 'Category', 10 | 'Sequence', 11 | 'Features' 12 | ] 13 | -------------------------------------------------------------------------------- /prediction_flow/features/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for features. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from abc import ABC, abstractmethod 10 | from ..transformers.column import Column, ColumnFlow 11 | 12 | 13 | class BaseFeature(ABC): 14 | """Base class for all feature classes. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of this feature. 20 | 21 | column_flow : ColumnFlow, list of transformers, or single transformer 22 | Transformer(s) used to pre-process this feature. 23 | 24 | Attributes 25 | ---------- 26 | name : str 27 | Name of this feature. 28 | 29 | column_flow : ColumnFlow 30 | ColumnFlow built from the given transformer(s); a list or 31 | a single transformer is wrapped into a ColumnFlow 32 | automatically. 33 | """ 34 | 35 | @abstractmethod 36 | def __init__(self, name, column_flow=None): 37 | self.name = name 38 | 39 | self.column_flow = None 40 | 41 | if column_flow: 42 | if isinstance(column_flow, ColumnFlow): 43 | self.column_flow = column_flow 44 | elif isinstance(column_flow, list): 45 | self.column_flow = ColumnFlow(column_flow) 46 | elif isinstance(column_flow, Column): 47 | self.column_flow = ColumnFlow([column_flow]) 48 | else: 49 | raise NotImplementedError( 50 | "column_flow should be a ColumnFlow, " 51 | "a list of column transformers, or a single transformer") 52 | -------------------------------------------------------------------------------- /prediction_flow/features/category_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class for category feature. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from .base import BaseFeature 10 | 11 | 12 | class Category(BaseFeature): 13 | """ 14 | Class for category feature. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of this feature. 20 | 21 | column_flow : ColumnFlow 22 | ColumnFlow to transform this feature. 23 | 24 | embedding_name: str 25 | Embedding name for reference. Give the same embedding name to 26 | features that share the same embedding layer. 27 | 28 | embedding_size: int 29 | Dimension of embedding layer. 30 | 31 | vocab_size: int 32 | Provide vocab_size if this feature does not need to be pre-processed. 33 | vocab_size is only used when column_flow is None. 34 | 35 | Attributes 36 | ---------- 37 | name : str 38 | Name of this feature. 39 | 40 | column_flow : ColumnFlow 41 | ColumnFlow to transform this feature. 42 | 43 | embedding_name: str 44 | Embedding name for reference. Give the same embedding name to 45 | features that share the same embedding layer. 46 | 47 | embedding_size: int 48 | Dimension of embedding layer.
49 | """ 50 | def __init__(self, name, column_flow, 51 | embedding_name=None, embedding_size=None, 52 | vocab_size=None): 53 | super().__init__(name=name, column_flow=column_flow) 54 | self.embedding_name = embedding_name if embedding_name else name 55 | self.embedding_size = embedding_size 56 | self._vocab_size = vocab_size 57 | 58 | def dimension(self): 59 | """The dimension (vocab size) of sequence feature is the dimension 60 | of last transformer in ColumnFlow. 61 | """ 62 | if self.column_flow is not None: 63 | return self.column_flow.transformers[-1].dimension() 64 | else: 65 | if self._vocab_size: 66 | return self._vocab_size 67 | else: 68 | raise RuntimeError( 69 | "If param column_flow is not given, " 70 | "vocab_size must be given") 71 | -------------------------------------------------------------------------------- /prediction_flow/features/features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class to manage all features. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from collections import OrderedDict 10 | 11 | 12 | class Features(object): 13 | """Class to manage all features. 14 | 15 | Parameters 16 | ---------- 17 | number_features : array-like 18 | NumberFeature array. 19 | 20 | category_features : array-like 21 | CategoryFeature array. 22 | 23 | sequence_features : array-like 24 | SequenceFeature array. 25 | 26 | Attributes 27 | ---------- 28 | number_features : array-like 29 | NumberFeature array. 30 | 31 | category_features : array-like 32 | CategoryFeature array. 33 | 34 | sequence_features : array-like 35 | SequenceFeature array. 36 | 37 | """ 38 | def __init__( 39 | self, 40 | number_features=[], 41 | category_features=[], 42 | sequence_features=[]): 43 | self.number_features = number_features 44 | self.category_features = category_features 45 | self.sequence_features = sequence_features 46 | 47 | def fit(self, df): 48 | """Fit all transformers. 49 | 50 | Parameters 51 | ---------- 52 | df : pandas.DataFrame 53 | 54 | Returns 55 | ---------- 56 | self : Features 57 | """ 58 | for feature in ( 59 | self.number_features + 60 | self.category_features + 61 | self.sequence_features): 62 | if feature.column_flow: 63 | feature.column_flow.fit(df[feature.name].values) 64 | 65 | return self 66 | 67 | def transform(self, df): 68 | """Transform df using fitted transformers. 
-------------------------------------------------------------------------------- /prediction_flow/features/features.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class to manage all features. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from collections import OrderedDict 10 | 11 | 12 | class Features(object): 13 | """Class to manage all features. 14 | 15 | Parameters 16 | ---------- 17 | number_features : array-like 18 | Array of Number features. 19 | 20 | category_features : array-like 21 | Array of Category features. 22 | 23 | sequence_features : array-like 24 | Array of Sequence features. 25 | 26 | Attributes 27 | ---------- 28 | number_features : array-like 29 | Array of Number features. 30 | 31 | category_features : array-like 32 | Array of Category features. 33 | 34 | sequence_features : array-like 35 | Array of Sequence features. 36 | 37 | """ 38 | def __init__( 39 | self, 40 | number_features=[], 41 | category_features=[], 42 | sequence_features=[]): 43 | self.number_features = number_features 44 | self.category_features = category_features 45 | self.sequence_features = sequence_features 46 | 47 | def fit(self, df): 48 | """Fit all transformers. 49 | 50 | Parameters 51 | ---------- 52 | df : pandas.DataFrame 53 | 54 | Returns 55 | ---------- 56 | self : Features 57 | """ 58 | for feature in ( 59 | self.number_features + 60 | self.category_features + 61 | self.sequence_features): 62 | if feature.column_flow: 63 | feature.column_flow.fit(df[feature.name].values) 64 | 65 | return self 66 | 67 | def transform(self, df): 68 | """Transform df using fitted transformers. 69 | 70 | Parameters 71 | ---------- 72 | df : pandas.DataFrame 73 | 74 | Returns 75 | ---------- 76 | transformed_X: dict 77 | 78 | {'feature1': numpy.array([...]), 'feature2': numpy.array([...])} 79 | """ 80 | transformed_X = OrderedDict() 81 | 82 | for feature in ( 83 | self.number_features + 84 | self.category_features + 85 | self.sequence_features): 86 | if feature.column_flow: 87 | transformed_X[feature.name] = feature.column_flow.transform( 88 | df[feature.name].values) 89 | else: 90 | transformed_X[feature.name] = df[feature.name].values 91 | 92 | return transformed_X 93 | 94 | def number_feature_names(self): 95 | return [feature.name for feature in self.number_features] 96 | 97 | def category_feature_names(self): 98 | return [feature.name for feature in self.category_features] 99 | 100 | def sequence_feature_names(self): 101 | return [feature.name for feature in self.sequence_features] 102 | -------------------------------------------------------------------------------- /prediction_flow/features/number_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class for number feature. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from .base import BaseFeature 10 | 11 | 12 | class Number(BaseFeature): 13 | """ 14 | Class for number feature. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of this feature. 20 | 21 | column_flow : ColumnFlow 22 | ColumnFlow to transform this feature. 23 | 24 | Attributes 25 | ---------- 26 | name : str 27 | Name of this feature. 28 | 29 | column_flow : ColumnFlow 30 | ColumnFlow to transform this feature. 31 | """ 32 | def __init__(self, name, column_flow): 33 | super().__init__(name=name, column_flow=column_flow) 34 | -------------------------------------------------------------------------------- /prediction_flow/features/sequence_feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class for sequence feature. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from .base import BaseFeature 10 | 11 | 12 | class Sequence(BaseFeature): 13 | """ 14 | Class for sequence feature. 15 | 16 | Parameters 17 | ---------- 18 | name : str 19 | Name of this feature. 20 | 21 | column_flow : ColumnFlow 22 | ColumnFlow to transform this feature. 23 | 24 | embedding_name: str 25 | Embedding name for reference. Give the same embedding name to 26 | features that share the same embedding layer. 27 | 28 | embedding_size: int 29 | Dimension of embedding layer. 30 | 31 | vocab_size: int 32 | Provide vocab_size if this feature does not need to be pre-processed. 33 | vocab_size is only used when column_flow is None. 34 | 35 | max_length: int 36 | max_length is only used when column_flow is None. 37 | 38 | Attributes 39 | ---------- 40 | name : str 41 | Name of this feature. 42 | 43 | column_flow : ColumnFlow 44 | ColumnFlow to transform this feature. 45 | 46 | embedding_name: str 47 | Embedding name for reference. Give the same embedding name to 48 | features that share the same embedding layer. 49 | 50 | embedding_size: int 51 | Dimension of embedding layer.
52 | """ 53 | def __init__(self, name, column_flow, 54 | embedding_name=None, embedding_size=None, 55 | vocab_size=None, max_length=None): 56 | super().__init__(name=name, column_flow=column_flow) 57 | self.embedding_name = embedding_name if embedding_name else name 58 | self.embedding_size = embedding_size 59 | self._vocab_size = vocab_size 60 | self._max_length = max_length 61 | 62 | def dimension(self): 63 | """The dimension (vocab size) of sequence feature is the dimension 64 | of last transformer in ColumnFlow. 65 | """ 66 | if self.column_flow is not None: 67 | return self.column_flow.transformers[-1].dimension() 68 | else: 69 | if self._vocab_size: 70 | return self._vocab_size 71 | else: 72 | raise RuntimeError( 73 | "If param column_flow is not given, " 74 | "vocab_size must be given") 75 | 76 | def max_length(self): 77 | """The max length of sequence feature is the max length 78 | of last transformer in ColumnFlow. 79 | """ 80 | if self.column_flow is not None: 81 | return self.column_flow.transformers[-1].max_length() 82 | else: 83 | if self._max_length: 84 | return self._max_length 85 | else: 86 | raise RuntimeError( 87 | "If param column_flow is not given, " 88 | "max_length must be given") 89 | -------------------------------------------------------------------------------- /prediction_flow/features/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/features/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/features/tests/test_features.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder, ColumnFlow) 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | __TEST_DATA = pd.DataFrame( 10 | [ 11 | [23, 173, 'male', 'fish vegetable', 1], 12 | [43, 181, 'male', 'fish pork', 0], 13 | [35, 161, 'female', 'beef vegetable', 0], 14 | [41, 171, 'male', 'fish vegetable', 1], 15 | [16, 153, 'female', 'pork chicken vegetable', 0], 16 | [32, 168, 'female', 'fish beef', 1], 17 | [26, 177, 'male', 'chicken vegetable', 0], 18 | [76, 190, 'male', 'fish pork vegetable', 0] 19 | ], 20 | columns=['age', 'height', 'gender', 'likes', 'label']) 21 | 22 | 23 | def test_simple_column_transformer_define(): 24 | number_features = [ 25 | Number('age', None), 26 | Number('height', StandardScaler())] 27 | 28 | category_features = [ 29 | Category('gender', CategoryEncoder(min_cnt=1))] 30 | 31 | sequence_features = [ 32 | Sequence('likes', SequenceEncoder(sep=' ', min_cnt=1))] 33 | 34 | features = Features( 35 | number_features, category_features, sequence_features) 36 | 37 | features.fit(__TEST_DATA) 38 | 39 | actual = features.transform(__TEST_DATA) 40 | 41 | expected_age = np.array([23, 43, 35, 41, 16, 32, 26, 76]) 42 | expected_gender = np.array([2, 2, 1, 2, 1, 1, 2, 2]) 43 | expected_height = np.array( 44 | [0.1159659, 0.85814767, -0.99730676, -0.06957954, -1.73948853, 45 | -0.34789771, 0.48705679, 1.69310217]) 46 | 47 | assert len(actual) == 4 48 | assert features.number_feature_names() == ['age', 'height'] 49 | assert features.category_feature_names() == ['gender'] 50 | assert features.sequence_feature_names() == ['likes'] 51 | 
np.testing.assert_array_equal(actual['age'], expected_age) 52 | np.testing.assert_array_equal(actual['gender'], expected_gender) 53 | np.testing.assert_array_almost_equal(actual['height'], expected_height) 54 | 55 | 56 | def test_column_flow_define(): 57 | number_features = [ 58 | Number('age', None), 59 | Number('height', ColumnFlow([StandardScaler()]))] 60 | 61 | category_features = [ 62 | Category('gender', ColumnFlow([CategoryEncoder(min_cnt=1)])) 63 | ] 64 | 65 | sequence_features = [ 66 | Sequence('likes', ColumnFlow([SequenceEncoder(sep=' ', min_cnt=1)])) 67 | ] 68 | 69 | features = Features( 70 | number_features, category_features, sequence_features) 71 | 72 | features.fit(__TEST_DATA) 73 | 74 | actual = features.transform(__TEST_DATA) 75 | 76 | expected_age = np.array([23, 43, 35, 41, 16, 32, 26, 76]) 77 | expected_gender = np.array([2, 2, 1, 2, 1, 1, 2, 2]) 78 | expected_height = np.array( 79 | [0.1159659, 0.85814767, -0.99730676, -0.06957954, -1.73948853, 80 | -0.34789771, 0.48705679, 1.69310217]) 81 | 82 | assert len(actual) == 4 83 | assert features.number_feature_names() == ['age', 'height'] 84 | assert features.category_feature_names() == ['gender'] 85 | assert features.sequence_feature_names() == ['likes'] 86 | np.testing.assert_array_equal(actual['age'], expected_age) 87 | np.testing.assert_array_equal(actual['gender'], expected_gender) 88 | np.testing.assert_array_almost_equal(actual['height'], expected_height) 89 | -------------------------------------------------------------------------------- /prediction_flow/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/metrics/__init__.py -------------------------------------------------------------------------------- /prediction_flow/pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepfm import DeepFM 2 | from .dnn import DNN 3 | from .interest_net import AttentionGroup 4 | from .din import DIN 5 | from .dien import DIEN 6 | from .wide_deep import WideDeep 7 | 8 | 9 | __all__ = [ 10 | 'DeepFM', 11 | 'DNN', 12 | 'AttentionGroup', 13 | 'DIN', 14 | 'DIEN', 15 | 'WideDeep' 16 | ] 17 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/base.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from collections import OrderedDict 3 | 4 | import torch.nn as nn 5 | 6 | 7 | class EmbeddingMixin: 8 | def build_embeddings( 9 | self, default_embedding_size, fixed_embedding_size=False): 10 | embeddings = OrderedDict() 11 | embedding_sizes = OrderedDict() 12 | 13 | for feature in itertools.chain( 14 | self.features.category_features, 15 | self.features.sequence_features): 16 | if feature.embedding_name not in embeddings: 17 | embedding_size = default_embedding_size 18 | if not fixed_embedding_size: 19 | embedding_size = (feature.embedding_size 20 | if feature.embedding_size 21 | else default_embedding_size) 22 | 23 | embeddings[feature.embedding_name] = nn.Embedding( 24 | feature.dimension(), embedding_size, padding_idx=0) 25 | embedding_sizes[feature.embedding_name] = embedding_size 26 | self.add_module( 27 | f"embedding:{feature.embedding_name}", 28 | embeddings[feature.embedding_name]) 29 | 30 | if feature.name != feature.embedding_name: 31 | embeddings[feature.name] = 
embeddings[feature.embedding_name] 32 | embedding_sizes[feature.name] = ( 33 | embedding_sizes[feature.embedding_name]) 34 | if feature.embedding_size and ( 35 | feature.embedding_size != 36 | embedding_sizes[feature.name]): 37 | raise RuntimeWarning( 38 | f"embedding_size of {feature.name} should be " 39 | f"the same with {feature.embedding_name}") 40 | 41 | return (embeddings, embedding_sizes) 42 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import Dataset 2 | 3 | 4 | __all__ = [ 5 | 'Dataset' 6 | ] 7 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/data/dataset.py: -------------------------------------------------------------------------------- 1 | """Dataset for torch. 2 | """ 3 | 4 | # Authors: Hongwei Zhang 5 | # License: MIT 6 | 7 | 8 | from collections import OrderedDict 9 | from itertools import chain 10 | 11 | import numpy as np 12 | import torch.utils.data as data 13 | 14 | 15 | class Dataset(data.Dataset): 16 | """Dataset for torch. 17 | 18 | Parameters 19 | ---------- 20 | features : Features 21 | Fitted Features object. 22 | 23 | X_map : dict 24 | example: 25 | {'feature1': numpy.array([...]), 'feature2': numpy.array([...])} 26 | 27 | y : numpy.array 28 | """ 29 | def __init__(self, features, X_map, y=None): 30 | self.features = features 31 | self.X_map = X_map 32 | self.y = y 33 | if y is not None: 34 | self.y = np.asarray(y, np.float32).reshape(-1, 1) 35 | 36 | self.__data_size = self.__get_data_size() 37 | 38 | def __get_data_size(self): 39 | key = next(iter(self.X_map)) 40 | return self.X_map[key].shape[0] 41 | 42 | def __len__(self): 43 | return self.__data_size 44 | 45 | @staticmethod 46 | def __pad_sequence(sequence_feature, sequence): 47 | # zero is special index for padding 48 | padded_seq = np.zeros(sequence_feature.max_length(), np.int64) 49 | padded_seq[0: sequence.shape[0]] = sequence 50 | 51 | return padded_seq 52 | 53 | def __getitem__(self, idx): 54 | record = OrderedDict() 55 | 56 | for feat in chain( 57 | self.features.number_features, 58 | self.features.category_features): 59 | record[feat.name] = self.X_map[feat.name][idx] 60 | 61 | for feat in self.features.sequence_features: 62 | seq = self.X_map[feat.name][idx] 63 | record[feat.name] = Dataset.__pad_sequence(feat, seq) 64 | record[f"__{feat.name}_length"] = np.int64(seq.shape[0]) 65 | 66 | if self.y is not None: 67 | record['label'] = self.y[idx] 68 | return record 69 | 70 | def get_num_batches(self, batch_size): 71 | return np.ceil(self.__data_size / batch_size) 72 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/data/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/pytorch/data/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/pytorch/data/tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch.data import 
Dataset 5 | 6 | import pandas as pd 7 | 8 | 9 | __SAMPLE_DF = pd.DataFrame({ 10 | 'userId': [11, 11, 11, 11, 11], 11 | 'userAge': [23, 21, 19, 17, 41], 12 | 'movieId': [4226, 5971, 6291, 7153, 30707], 13 | 'rating': [3.0, 2.0, 4.0, 4.6, 5.0], 14 | 'timestamp': [1294796159, 1294796201, 1294796113, 1294796132, 1294796176], 15 | 'title': ['Memento (2000)', 16 | 'My Neighbor Totoro (Tonari no Totoro) (1988)', 17 | 'Lilya 4-Ever (Lilja 4-ever) (2002)', 18 | 'Lord of the Rings: The Return of the King, The (2003)', 19 | 'Million Dollar Baby (2004)'], 20 | 'genres': [ 21 | 'Mystery|Thriller', 22 | 'Animation|Children|Drama|Fantasy', 23 | 'Crime|Drama', 24 | 'Action|Adventure|Drama|Fantasy', 25 | 'Drama'], 26 | 'topGenre': [ 27 | 'Mystery', 28 | 'Animation', 29 | 'Crime', 30 | 'Action', 31 | 'Drama'], 32 | 'clickedMovieIds': [ 33 | '5971|6291', 34 | '3242|42', 35 | '32|43542|3222|3', 36 | '', 37 | '34|23'], 38 | 'clickedMovieTopGenres': [ 39 | 'Animation|Mystery', 40 | 'Drama', 41 | 'Drama', 42 | '', 43 | 'Mystery|Crime'], 44 | 'label': [1, 0, 0, 1, 0]}) 45 | 46 | 47 | def test_normal(): 48 | number_features = [ 49 | Number('userAge', StandardScaler()), 50 | Number('rating', StandardScaler())] 51 | 52 | category_features = [ 53 | Category('userId', CategoryEncoder(min_cnt=1)), 54 | Category('movieId', CategoryEncoder(min_cnt=1)), 55 | Category('topGenre', CategoryEncoder(min_cnt=1))] 56 | 57 | sequence_features = [ 58 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 59 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 60 | Sequence('clickedMovieIds', 61 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 62 | Sequence('clickedMovieTopGenres', 63 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 64 | 65 | features = Features( 66 | number_features=number_features, 67 | category_features=category_features, 68 | sequence_features=sequence_features) 69 | 70 | features.fit(__SAMPLE_DF) 71 | 72 | X_map = features.transform(__SAMPLE_DF) 73 | 74 | dataset = Dataset(features, X_map, __SAMPLE_DF.label.values) 75 | 76 | assert dataset[0]['userId'] == 1 77 | assert dataset[0]['movieId'] == 1 78 | assert dataset[0]['genres'].tolist() == [8, 9, 0, 0] 79 | assert dataset[0]['__genres_length'] == 2 80 | assert dataset[0]['label'] == 1 81 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/deepfm.py: -------------------------------------------------------------------------------- 1 | """ 2 | DeepFM. 3 | """ 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .base import EmbeddingMixin 10 | from .nn import FM, MLP, MaxPooling 11 | from .utils import init_weights 12 | 13 | 14 | class DeepFM(nn.Module, EmbeddingMixin): 15 | """DeepFM. 16 | 17 | Parameters 18 | ---------- 19 | features : Features 20 | 21 | num_classes : int 22 | Number of classes. 23 | 24 | embedding_size : int 25 | Size of embedding. 26 | 27 | hidden_layers : list 28 | Size of hidden layers. 29 | Example: [96, 32] 30 | 31 | activation : str 32 | Activation function. 33 | Example: relu 34 | 35 | final_activation : str 36 | Activation function of output. 37 | 38 | dropout : float 39 | Dropout rate. 
40 | 41 | use_linear : bool 42 | 43 | use_fm : bool 44 | 45 | use_deep : bool 46 | """ 47 | def __init__(self, features, num_classes, embedding_size, hidden_layers, 48 | activation='relu', final_activation=None, dropout=None, 49 | use_linear=True, use_fm=True, use_deep=True): 50 | super(DeepFM, self).__init__() 51 | self.features = features 52 | self.num_classes = num_classes 53 | self.final_activation = final_activation 54 | self.use_linear = use_linear 55 | self.use_fm = use_fm 56 | self.use_deep = use_deep 57 | 58 | self.embeddings, self.embedding_sizes = self.build_embeddings( 59 | embedding_size, fixed_embedding_size=True) 60 | 61 | self._sequence_poolings = OrderedDict() 62 | 63 | total_embedding_sizes = 0 64 | for feature in self.features.number_features: 65 | self.embeddings[feature.name] = nn.Linear( 66 | in_features=1, 67 | out_features=embedding_size, 68 | bias=False) 69 | self.add_module( 70 | f"embedding:{feature.name}", 71 | self.embeddings[feature.name]) 72 | total_embedding_sizes += embedding_size 73 | 74 | for feature in self.features.category_features: 75 | total_embedding_sizes += ( 76 | self.embedding_sizes[feature.name]) 77 | 78 | for feature in self.features.sequence_features: 79 | self._sequence_poolings[feature.name] = MaxPooling(1) 80 | self.add_module( 81 | f"pooling:{feature.name}", 82 | self._sequence_poolings[feature.name]) 83 | total_embedding_sizes += ( 84 | self.embedding_sizes[feature.name]) 85 | 86 | final_layer_input_size = 0 87 | # linear 88 | # This part differs from the DeepFM paper: 89 | # sparse features are not included. 90 | self.linear = None 91 | if self.use_linear and self.features.number_features: 92 | self.linear = MLP( 93 | len(self.features.number_features), 94 | hidden_layers=[1], dropout=None, activation=None) 95 | final_layer_input_size += 1 96 | 97 | # fm 98 | self.fm = None 99 | if use_fm and total_embedding_sizes: 100 | self.fm = FM() 101 | final_layer_input_size += 1 102 | 103 | # deep 104 | self.mlp = None 105 | if use_deep and total_embedding_sizes: 106 | total_input_size = (total_embedding_sizes + 107 | len(self.features.number_features)) 108 | self.mlp = MLP( 109 | total_input_size, hidden_layers, 110 | dropout=dropout, activation=activation) 111 | final_layer_input_size += hidden_layers[-1] 112 | 113 | output_size = self.num_classes 114 | 115 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 116 | output_size -= 1 117 | 118 | self.final_layer = nn.Linear(final_layer_input_size, output_size) 119 | 120 | self.apply(init_weights) 121 | 122 | def forward(self, x): 123 | final_layer_inputs = list() 124 | 125 | number_inputs = list() 126 | for feature in self.features.number_features: 127 | number_inputs.append(x[feature.name].view(-1, 1)) 128 | if self.linear: 129 | # linear 130 | linear_concat = torch.cat(number_inputs, dim=1) 131 | final_layer_inputs.append(self.linear(linear_concat)) 132 | 133 | embeddings = list() 134 | for feature in self.features.number_features: 135 | embeddings.append( 136 | self.embeddings[feature.name]( 137 | x[feature.name].view(-1, 1)).unsqueeze(1)) 138 | for feature in self.features.category_features: 139 | embeddings.append( 140 | self.embeddings[feature.name](x[feature.name]).unsqueeze(1)) 141 | for feature in self.features.sequence_features: 142 | embeddings.append( 143 | self._sequence_poolings[feature.name]( 144 | self.embeddings[feature.name](x[feature.name])).unsqueeze(1)) 145 | 146 | emb_concat = None 147 | if embeddings: 148 | emb_concat = torch.cat(embeddings, dim=1) 149 | b, f, e = 
emb_concat.size() 150 | # fm 151 | if self.fm: 152 | final_layer_inputs.append(self.fm(emb_concat)) 153 | emb_concat = emb_concat.view(b, f * e) 154 | 155 | # deep 156 | if self.mlp: 157 | deep_input = torch.cat(number_inputs + [emb_concat], dim=1) 158 | final_layer_inputs.append(self.mlp(deep_input)) 159 | 160 | final_layer_inputs = torch.cat(final_layer_inputs, dim=1) 161 | 162 | output = self.final_layer(final_layer_inputs) 163 | 164 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 165 | output = torch.sigmoid(output) 166 | elif self.num_classes > 1 and self.final_activation == 'softmax': 167 | output = torch.softmax(output, dim=1) 168 | elif self.final_activation: 169 | raise NotImplementedError( 170 | f"pair (final_activation: {self.final_activation}, " 171 | f"num_classes: {self.num_classes}) is not implemented") 172 | 173 | return output 174 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/dien.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Interest Evolution Network. 3 | """ 4 | 5 | from collections import OrderedDict 6 | 7 | import torch 8 | 9 | from .nn import Interest 10 | from .interest_net import InterestNet 11 | 12 | 13 | class DIEN(InterestNet): 14 | """Deep Interest Evolution Network. 15 | 16 | Parameters 17 | ---------- 18 | features : Features 19 | 20 | attention_groups : list of AttentionGroup 21 | 22 | num_classes : int 23 | Number of classes. 24 | 25 | embedding_size : int 26 | Size of embedding. 27 | 28 | hidden_layers : list 29 | Size of hidden layers. 30 | Example: [96, 32] 31 | 32 | dnn_activation : str 33 | Activation function of deep layers. 34 | Example: relu 35 | 36 | final_activation : str 37 | Activation function of output. 38 | 39 | dropout : float 40 | Dropout rate.
41 | 42 | use_negsampling : bool 43 | """ 44 | def __init__(self, *args, use_negsampling=False, **kwargs): 45 | self.use_negsampling = use_negsampling 46 | super(DIEN, self).__init__(*args, **kwargs) 47 | 48 | def create_attention_fn(self, attention_group): 49 | return Interest( 50 | attention_group.pairs_count * self.embedding_size, 51 | gru_type=attention_group.gru_type, 52 | gru_dropout=attention_group.gru_dropout, 53 | att_hidden_layers=attention_group.hidden_layers, 54 | att_dropout=attention_group.att_dropout, 55 | att_activation=attention_group.activation, 56 | use_negsampling=self.use_negsampling) 57 | 58 | def forward(self, x): 59 | final_layer_inputs = list() 60 | 61 | # linear 62 | number_inputs = list() 63 | for feature in self.features.number_features: 64 | number_inputs.append(x[feature.name].view(-1, 1)) 65 | 66 | embeddings = OrderedDict() 67 | for feature in self.features.category_features: 68 | embeddings[feature.name] = self.embeddings[ 69 | feature.name](x[feature.name]) 70 | 71 | for feature in self.features.sequence_features: 72 | if not self._is_attention_feature(feature): 73 | embeddings[feature.name] = self._sequence_poolings[ 74 | feature.name](self.embeddings[ 75 | feature.name](x[feature.name])) 76 | 77 | auxiliary_losses = [] 78 | for attention_group in self.attention_groups: 79 | query = torch.cat( 80 | [embeddings[pair['ad']] 81 | for pair in attention_group.pairs], 82 | dim=-1) 83 | pos_hist = torch.cat( 84 | [self.embeddings[pair['pos_hist']]( 85 | x[pair['pos_hist']]) for pair in attention_group.pairs], 86 | dim=-1) 87 | keys_length = torch.min(torch.cat( 88 | [x[f"__{pair['pos_hist']}_length"].view(-1, 1) 89 | for pair in attention_group.pairs], 90 | dim=-1), dim=-1)[0] 91 | neg_hist = None 92 | if self.use_negsampling: 93 | neg_hist = torch.cat( 94 | [self.embeddings[pair['neg_hist']]( 95 | x[pair['neg_hist']]) 96 | for pair in attention_group.pairs], 97 | dim=-1) 98 | embeddings[attention_group.name], tmp_loss = ( 99 | self._attention_poolings[attention_group.name]( 100 | query, pos_hist, keys_length, neg_hist)) 101 | if tmp_loss is not None: 102 | auxiliary_losses.append(tmp_loss) 103 | 104 | emb_concat = torch.cat(number_inputs + [ 105 | emb for emb in embeddings.values()], dim=-1) 106 | 107 | final_layer_inputs = self.mlp(emb_concat) 108 | 109 | output = self.final_layer(final_layer_inputs) 110 | 111 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 112 | output = torch.sigmoid(output) 113 | elif self.num_classes > 1 and self.final_activation == 'softmax': 114 | output = torch.softmax(output, dim=1) 115 | elif self.final_activation: 116 | raise NotImplementedError( 117 | f"pair (final_activation: {self.final_activation}, " 118 | f"num_classes: {self.num_classes}) is not implemented") 119 | 120 | auxiliary_avg_loss = None 121 | if auxiliary_losses: 122 | auxiliary_avg_loss = auxiliary_losses[0] 123 | size = len(auxiliary_losses) 124 | for i in range(1, size): 125 | auxiliary_avg_loss += auxiliary_losses[i] 126 | auxiliary_avg_loss /= size 127 | 128 | return output, auxiliary_avg_loss 129 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/din.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Interest Network. 3 | """ 4 | 5 | from .nn import Attention 6 | from .interest_net import InterestNet 7 | 8 | 9 | class DIN(InterestNet): 10 | """Deep Interest Network.
11 | 12 | Parameters 13 | ---------- 14 | features : Features 15 | 16 | attention_groups : list of AttentionGroup 17 | 18 | num_classes : int 19 | Number of classes. 20 | 21 | embedding_size : int 22 | Size of embedding. 23 | 24 | hidden_layers : list 25 | Size of hidden layers. 26 | Example: [96, 32] 27 | 28 | dnn_activation : str 29 | Activation function of deep layers. 30 | Example: relu 31 | 32 | final_activation : str 33 | Activation function of output. 34 | 35 | dropout : float 36 | Dropout rate. 37 | """ 38 | def __init__(self, *args, **kwargs): 39 | super(DIN, self).__init__(*args, **kwargs) 40 | 41 | def create_attention_fn(self, attention_group): 42 | return Attention( 43 | attention_group.pairs_count * self.embedding_size, 44 | hidden_layers=attention_group.hidden_layers, 45 | dropout=attention_group.att_dropout, 46 | activation=attention_group.activation) 47 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/dnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Neural Network. 3 | """ 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .base import EmbeddingMixin 10 | from .nn import MLP, MaxPooling 11 | from .utils import init_weights 12 | 13 | 14 | class DNN(nn.Module, EmbeddingMixin): 15 | """Deep Neural Network. 16 | 17 | Parameters 18 | ---------- 19 | features : Features 20 | 21 | num_classes : int 22 | Number of classes. 23 | 24 | embedding_size : int 25 | Size of embedding. 26 | 27 | hidden_layers : list 28 | Size of hidden layers. 29 | Example: [96, 32] 30 | 31 | activation : str 32 | Activation function. 33 | Example: relu 34 | 35 | final_activation : str 36 | Activation function of output. 37 | 38 | dropout : float 39 | Dropout rate. 
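Examples -------- A minimal sketch, assuming a fitted Features object named features (layer sizes here are illustrative): model = DNN(features, num_classes=2, embedding_size=16, hidden_layers=[64, 32], final_activation='sigmoid', dropout=0.3)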
40 | """ 41 | def __init__(self, features, num_classes, embedding_size, hidden_layers, 42 | activation='relu', final_activation=None, dropout=0.0): 43 | super(DNN, self).__init__() 44 | self.features = features 45 | self.num_classes = num_classes 46 | self.final_activation = final_activation 47 | 48 | self.embeddings, self.embedding_sizes = self.build_embeddings( 49 | embedding_size) 50 | 51 | self._sequence_poolings = OrderedDict() 52 | 53 | total_embedding_sizes = 0 54 | for feature in self.features.category_features: 55 | total_embedding_sizes += ( 56 | self.embedding_sizes[feature.name]) 57 | 58 | for feature in self.features.sequence_features: 59 | self._sequence_poolings[feature.name] = MaxPooling(1) 60 | self.add_module( 61 | f"pooling:{feature.name}", 62 | self._sequence_poolings[feature.name]) 63 | total_embedding_sizes += ( 64 | self.embedding_sizes[feature.name]) 65 | 66 | total_input_size = (total_embedding_sizes + 67 | len(self.features.number_features)) 68 | self.mlp = MLP( 69 | total_input_size, 70 | hidden_layers, 71 | dropout=dropout, batchnorm=True, activation=activation) 72 | final_layer_input_size = hidden_layers[-1] 73 | 74 | output_size = self.num_classes 75 | 76 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 77 | output_size -= 1 78 | 79 | self.final_layer = nn.Linear(final_layer_input_size, output_size) 80 | 81 | self.apply(init_weights) 82 | 83 | def forward(self, x): 84 | final_layer_inputs = list() 85 | 86 | # linear 87 | number_inputs = list() 88 | for feature in self.features.number_features: 89 | number_inputs.append(x[feature.name].view(-1, 1)) 90 | 91 | embeddings = list() 92 | for feature in self.features.category_features: 93 | embeddings.append( 94 | self.embeddings[feature.name](x[feature.name])) 95 | 96 | for feature in self.features.sequence_features: 97 | embeddings.append( 98 | self._sequence_poolings[feature.name]( 99 | self.embeddings[feature.name](x[feature.name]))) 100 | 101 | emb_concat = torch.cat(number_inputs + embeddings, dim=1) 102 | 103 | final_layer_inputs = self.mlp(emb_concat) 104 | 105 | output = self.final_layer(final_layer_inputs) 106 | 107 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 108 | output = torch.sigmoid(output) 109 | elif self.num_classes > 1 and self.final_activation == 'softmax': 110 | output = torch.softmax(output) 111 | elif self.final_activation: 112 | raise NotImplementedError( 113 | f"pair (final_activation: {self.final_activation}, " 114 | f"num_classes: {self.num_classes}) is not implemented") 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/functions.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm_notebook, tqdm 2 | 3 | import numpy as np 4 | 5 | import torch 6 | import torch.utils.data as data 7 | 8 | from prediction_flow.pytorch.data import Dataset 9 | from prediction_flow.features import Features 10 | 11 | 12 | def __to_gpu(device, batch): 13 | for key, tensor in batch.items(): 14 | batch[key] = tensor.to(device) 15 | 16 | 17 | def fit(epochs, model, loss, optimizer, train_loader, 18 | valid_loader=None, notebook=False, 19 | auxiliary_loss_rate=0.0): 20 | if notebook: 21 | epoch_bar = tqdm_notebook( 22 | desc='training routine', total=epochs, position=0) 23 | train_bar = tqdm_notebook( 24 | desc='train', total=len(train_loader), position=1) 25 | if valid_loader: 26 | valid_bar = tqdm_notebook( 27 | desc='valid', 
total=len(valid_loader), position=2) 28 | else: 29 | epoch_bar = tqdm( 30 | desc='training routine', total=epochs, position=0) 31 | train_bar = tqdm( 32 | desc='train', total=len(train_loader), position=1) 33 | if valid_loader: 34 | valid_bar = tqdm( 35 | desc='valid', total=len(valid_loader), position=2) 36 | 37 | use_cuda = torch.cuda.is_available() 38 | device = torch.device("cuda:0" if use_cuda else "cpu") 39 | 40 | if use_cuda: 41 | print("GPU is available, transfer model to GPU.") 42 | model = model.to(device) 43 | 44 | losses = [] 45 | 46 | for epoch in range(epochs): 47 | model.train() 48 | running_loss = 0 49 | auxiliary_running_loss = 0 50 | for index, batch in enumerate(train_loader): 51 | if use_cuda: 52 | __to_gpu(device, batch) 53 | label = batch['label'] 54 | # step 1. zero the gradients 55 | optimizer.zero_grad() 56 | # step 2. compute the output 57 | pred = model(batch) 58 | auxiliary_loss = None 59 | if isinstance(pred, tuple): 60 | pred, auxiliary_loss = pred 61 | if auxiliary_loss is not None: 62 | auxiliary_running_loss += ( 63 | (auxiliary_loss.item() - 64 | auxiliary_running_loss) / (index + 1)) 65 | # step 3. compute the loss (plus the weighted auxiliary loss) 66 | loss_t = loss(pred, label) 67 | if auxiliary_loss is not None: 68 | loss_t += auxiliary_loss_rate * auxiliary_loss 69 | running_loss += (loss_t.item() - running_loss) / (index + 1) 70 | # step 4. use loss to produce gradients 71 | loss_t.backward() 72 | # step 5. use optimizer to take gradient step 73 | optimizer.step() 74 | # update bar 75 | train_bar.set_postfix(loss=running_loss, epoch=epoch) 76 | train_bar.update() 77 | train_bar.reset() 78 | train_loss = running_loss 79 | train_auxiliary_loss = auxiliary_running_loss 80 | 81 | valid_loss = valid_auxiliary_loss = 0 82 | if valid_loader: 83 | model.eval() 84 | running_loss = 0 85 | auxiliary_running_loss = 0 86 | with torch.no_grad(): 87 | for index, batch in enumerate(valid_loader): 88 | if use_cuda: 89 | __to_gpu(device, batch) 90 | label = batch['label'] 91 | # step 1. compute the output 92 | pred = model(batch) 93 | # step 2. compute the loss 94 | # (models such as DIEN return a (pred, aux_loss) tuple) 95 | auxiliary_loss = None 96 | if isinstance(pred, tuple): 97 | pred, auxiliary_loss = pred 98 | if auxiliary_loss is not None: 99 | auxiliary_running_loss += ( 100 | (auxiliary_loss.item() - 101 | auxiliary_running_loss) / (index + 1)) 102 | loss_t = loss(pred, label) 103 | # add the weighted auxiliary loss, as in training 104 | if auxiliary_loss is not None: 105 | loss_t += auxiliary_loss_rate * auxiliary_loss 106 | running_loss += ( 107 | loss_t.item() - running_loss) / (index + 1) 108 | # update bar 109 | valid_bar.set_postfix( 110 | loss=running_loss, epoch=epoch) 111 | valid_bar.update() 112 | valid_loss = running_loss 113 | valid_auxiliary_loss = auxiliary_running_loss 114 | valid_bar.reset() 115 | 116 | epoch_bar.set_postfix( 117 | train_loss=train_loss, valid_loss=valid_loss, epoch=epoch) 118 | epoch_bar.update() 119 | losses.append( 120 | {'train_loss': train_loss, 121 | 'valid_loss': valid_loss, 122 | 'train_auxiliary_loss': train_auxiliary_loss, 123 | 'valid_auxiliary_loss': valid_auxiliary_loss}) 124 | 125 | return losses 126 | 127 | 128 | def predict(model, test_loader): 129 | use_cuda = torch.cuda.is_available() 130 | device = torch.device("cuda:0" if use_cuda else "cpu") 131 | model = model.to(device) 132 | model.zero_grad() 133 | model.eval() 134 | 135 | preds = list() 136 | with torch.no_grad(): 137 | for _, batch in enumerate(test_loader): 138 | if use_cuda: 139 | __to_gpu(device, batch) 140 | # compute the output 141 | pred = model(batch) 142 | if isinstance(pred, tuple): 143 | pred, auxiliary_loss = pred 144 | preds.append(pred.cpu().numpy()) 145 | 146 | return np.vstack(preds) 147 | 148 | 149 | def create_dataloader_fn( 150 | number_features, category_features, sequence_features, batch_size, 151 | train_df, label_col='label', test_df=None, num_workers=0): 152 | 153 | features = Features( 154 | number_features=number_features, 155 | category_features=category_features, 156 | sequence_features=sequence_features) 157 | 158 | features = features.fit(train_df) 159 | 160 | train_X_map = features.transform(train_df) 161 | train_y = train_df[label_col].values 162 | train_dataset = Dataset(features, train_X_map, train_y) 163 | train_loader = data.DataLoader( 164 | train_dataset, batch_size=batch_size, 165 | shuffle=True, num_workers=num_workers) 166 | 167 | test_loader = None 168 | if test_df is not None: 169 | test_X_map = features.transform(test_df) 170 | test_y = None 171 | if label_col in set(test_df.columns): 172 | test_y = test_df[label_col].values 173 | test_dataset = Dataset(features, test_X_map, test_y) 174 | test_loader = data.DataLoader( 175 | test_dataset, batch_size=batch_size, 176 | shuffle=False, num_workers=num_workers) 177 | 178 | return features, train_loader, test_loader 179 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/interest_net.py: -------------------------------------------------------------------------------- 1 | """Interest Net. 2 | """ 3 | 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .base import EmbeddingMixin 10 | from .nn import MLP, MaxPooling 11 | from .utils import init_weights 12 | 13 | 14 | class AttentionGroup(object): 15 | """ This class is used to identify which features should be 16 | processed by attention. All candidate features and all behavior 17 | sequential features must have the same embedding size. All behavior 18 | sequential features must have the same maximum length.
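A typical group ties one candidate feature to its behavior sequence, for example (feature names here are illustrative): AttentionGroup(name='item_group', pairs=[{'ad': 'movieId', 'pos_hist': 'clickedMovieIds'}], hidden_layers=[16, 8]).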
19 | 20 | Parameters 21 | ---------- 22 | name : str 23 | Unique group name. 24 | 25 | pairs : list of dict 26 | Example : 27 | [{'ad': 'item_id', 28 | 'pos_hist': 'clicked_item_ids', 29 | 'neg_hist': 'neg_item_ids'}, 30 | {'ad': 'item_category', 31 | 'pos_hist': 'clicked_item_categories', 32 | 'neg_hist': 'neg_item_categories'}] 33 | 34 | hidden_layers : iterable 35 | Hidden layer sizes of attention. 36 | 37 | activation : str 38 | Activation function of attention. 39 | Example: prelu 40 | 41 | att_dropout : float 42 | Dropout rate of attention. 43 | 44 | gru_type : str 45 | Type of GRU. GRU, AIGRU, AGRU and AUGRU are supported. 46 | 47 | gru_dropout : float 48 | Dropout rate of GRU. 49 | """ 50 | def __init__(self, name, pairs, 51 | hidden_layers, activation='prelu', att_dropout=0.0, 52 | gru_type='GRU', gru_dropout=0.0): 53 | self.name = name 54 | self.pairs = pairs 55 | self.hidden_layers = hidden_layers 56 | self.activation = activation 57 | self.att_dropout = att_dropout 58 | self.gru_type = gru_type 59 | self.gru_dropout = gru_dropout 60 | 61 | self.related_feature_names = set() 62 | self.neg_feature_names = set() 63 | for pair in pairs: 64 | self.related_feature_names.add(pair['ad']) 65 | self.related_feature_names.add(pair['pos_hist']) 66 | if 'neg_hist' in pair: 67 | self.related_feature_names.add(pair['neg_hist']) 68 | self.neg_feature_names.add(pair['neg_hist']) 69 | 70 | def is_attention_feature(self, feature_name): 71 | if feature_name in self.related_feature_names: 72 | return True 73 | return False 74 | 75 | def is_neg_sampling_feature(self, feature_name): 76 | if feature_name in self.neg_feature_names: 77 | return True 78 | return False 79 | 80 | @property 81 | def pairs_count(self): 82 | return len(self.pairs) 83 | 84 | 85 | class InterestNet(nn.Module, EmbeddingMixin): 86 | """Interest Network. 87 | 88 | Parameters 89 | ---------- 90 | features : Features 91 | 92 | attention_groups : list of AttentionGroup 93 | 94 | num_classes : int 95 | Number of classes. 96 | 97 | embedding_size : int 98 | Size of embedding. 99 | 100 | hidden_layers : list 101 | Size of hidden layers. 102 | Example: [96, 32] 103 | 104 | dnn_activation : str 105 | Activation function of deep layers. 106 | Example: relu 107 | 108 | final_activation : str 109 | Activation function of output. 110 | 111 | dropout : float 112 | Dropout rate. 
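Examples -------- InterestNet itself is abstract (create_attention_fn must be implemented), so this sketch uses the DIN subclass; the fitted Features object and feature names are assumed: group = AttentionGroup(name='group1', pairs=[{'ad': 'movieId', 'pos_hist': 'clickedMovieIds'}], hidden_layers=[16, 8]) model = DIN(features, [group], num_classes=2, embedding_size=16, hidden_layers=[32, 16], final_activation='sigmoid')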
113 | """ 114 | def _is_attention_feature(self, feature): 115 | for group in self.attention_groups: 116 | if group.is_attention_feature(feature.name): 117 | return True 118 | return False 119 | 120 | def _is_neg_sampling_feature(self, feature): 121 | for group in self.attention_groups: 122 | if group.is_neg_sampling_feature(feature.name): 123 | return True 124 | return False 125 | 126 | def create_attention_fn(self, attention_group): 127 | raise NotImplementedError( 128 | "Please implement the func to create attention") 129 | 130 | def __init__(self, features, attention_groups, num_classes, embedding_size, 131 | hidden_layers, dnn_activation='prelu', final_activation=None, 132 | dropout=0.0): 133 | super(InterestNet, self).__init__() 134 | self.features = features 135 | self.attention_groups = attention_groups 136 | self.num_classes = num_classes 137 | self.embedding_size = embedding_size 138 | self.hidden_layers = hidden_layers 139 | self.dnn_activation = dnn_activation 140 | self.final_activation = final_activation 141 | self.dropout = dropout 142 | 143 | self.embeddings, self.embedding_sizes = self.build_embeddings( 144 | embedding_size) 145 | 146 | self._sequence_poolings = OrderedDict() 147 | self._attention_poolings = OrderedDict() 148 | 149 | total_embedding_sizes = 0 150 | for feature in self.features.category_features: 151 | total_embedding_sizes += ( 152 | self.embedding_sizes[feature.name]) 153 | 154 | for feature in self.features.sequence_features: 155 | if not self._is_neg_sampling_feature(feature): 156 | total_embedding_sizes += ( 157 | self.embedding_sizes[feature.name]) 158 | if not self._is_attention_feature(feature): 159 | self._sequence_poolings[feature.name] = MaxPooling(1) 160 | self.add_module( 161 | f"pooling:{feature.name}", 162 | self._sequence_poolings[feature.name]) 163 | 164 | # attention 165 | for attention_group in self.attention_groups: 166 | self._attention_poolings[attention_group.name] = ( 167 | self.create_attention_fn(attention_group)) 168 | self.add_module( 169 | f"attention_pooling:{attention_group.name}", 170 | self._attention_poolings[attention_group.name]) 171 | 172 | total_input_size = (total_embedding_sizes + 173 | len(self.features.number_features)) 174 | self.mlp = MLP( 175 | total_input_size, 176 | hidden_layers, 177 | dropout=dropout, batchnorm=True, activation=dnn_activation) 178 | final_layer_input_size = hidden_layers[-1] 179 | 180 | output_size = self.num_classes 181 | 182 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 183 | output_size -= 1 184 | 185 | self.final_layer = nn.Linear(final_layer_input_size, output_size) 186 | 187 | self.apply(init_weights) 188 | 189 | def forward(self, x): 190 | final_layer_inputs = list() 191 | 192 | # linear 193 | number_inputs = list() 194 | for feature in self.features.number_features: 195 | number_inputs.append(x[feature.name].view(-1, 1)) 196 | 197 | embeddings = OrderedDict() 198 | for feature in self.features.category_features: 199 | embeddings[feature.name] = self.embeddings[ 200 | feature.name](x[feature.name]) 201 | 202 | for feature in self.features.sequence_features: 203 | if not self._is_attention_feature(feature): 204 | embeddings[feature.name] = self._sequence_poolings[ 205 | feature.name](self.embeddings[ 206 | feature.name](x[feature.name])) 207 | 208 | for attention_group in self.attention_groups: 209 | query = torch.cat( 210 | [embeddings[pair['ad']] 211 | for pair in attention_group.pairs], 212 | dim=-1) 213 | keys = torch.cat( 214 | 
[self.embeddings[pair['pos_hist']]( 215 | x[pair['pos_hist']]) for pair in attention_group.pairs], 216 | dim=-1) 217 | keys_length = torch.min(torch.cat( 218 | [x[f"__{pair['pos_hist']}_length"].view(-1, 1) 219 | for pair in attention_group.pairs], 220 | dim=-1), dim=-1)[0] 221 | embeddings[attention_group.name] = self._attention_poolings[ 222 | attention_group.name](query, keys, keys_length) 223 | 224 | emb_concat = torch.cat(number_inputs + [ 225 | emb for emb in embeddings.values()], dim=-1) 226 | 227 | final_layer_inputs = self.mlp(emb_concat) 228 | 229 | output = self.final_layer(final_layer_inputs) 230 | 231 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 232 | output = torch.sigmoid(output) 233 | elif self.num_classes > 1 and self.final_activation == 'softmax': 234 | output = torch.softmax(output, dim=1) 235 | elif self.final_activation: 236 | raise NotImplementedError( 237 | f"pair (final_activation: {self.final_activation}, " 238 | f"num_classes: {self.num_classes}) is not implemented") 239 | 240 | return output 241 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import MLP 2 | from .pooling import MaxPooling, SumPooling 3 | from .fm import FM 4 | from .attention import Attention 5 | from .rnn import AttentionGRUCell, AttentionUpdateGateGRUCell, DynamicGRU 6 | from .interest import Interest 7 | 8 | 9 | __all__ = [ 10 | 'MLP', 11 | 'MaxPooling', 12 | 'SumPooling', 13 | 'FM', 14 | 'Attention', 15 | 'AttentionGRUCell', 16 | 'AttentionUpdateGateGRUCell', 17 | 'DynamicGRU', 18 | 'Interest' 19 | ] 20 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/attention.py: -------------------------------------------------------------------------------- 1 | """ 2 | Attention used by DIN model. 3 | 4 | Reference: 5 | Deep Interest Network for Click-Through Rate Prediction 6 | https://github.com/zhougr1993/DeepInterestNetwork/blob/master/din/model.py 7 | """ 8 | 9 | # Authors: Hongwei Zhang 10 | # License: MIT 11 | 12 | 13 | import numpy as np 14 | 15 | import torch 16 | import torch.nn as nn 17 | from torch.nn import functional as F 18 | 19 | from .mlp import MLP 20 | 21 | 22 | class Attention(nn.Module): 23 | """Attention layer. 24 | 25 | Parameters 26 | ---------- 27 | input_size : int 28 | Size of input. 29 | 30 | hidden_layers : iterable 31 | Hidden layer sizes. 32 | 33 | dropout : float 34 | Dropout rate. 35 | 36 | activation : str 37 | Name of activation function. relu, prelu and sigmoid are supported. 38 | 39 | return_scores : bool 40 | Return attention scores instead of weighted sum pooling result.
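Examples -------- A shape sketch with illustrative sizes (B=2, T=4, H=3): att = Attention(3, hidden_layers=[8]) query = torch.rand(2, 3) # [B, H] keys = torch.rand(2, 4, 3) # [B, T, H] keys_length = torch.tensor([3, 4]) pooled = att(query, keys, keys_length) # [B, H]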
41 | """ 42 | def __init__( 43 | self, 44 | input_size, 45 | hidden_layers, 46 | dropout=0.0, 47 | batchnorm=True, 48 | activation='prelu', 49 | return_scores=False): 50 | super(Attention, self).__init__() 51 | self.return_scores = return_scores 52 | self.mlp = MLP( 53 | input_size=input_size * 4, 54 | hidden_layers=hidden_layers, 55 | dropout=dropout, 56 | batchnorm=batchnorm, 57 | activation=activation) 58 | self.fc = nn.Linear(hidden_layers[-1], 1) 59 | 60 | def forward(self, query, keys, keys_length): 61 | """ 62 | Parameters 63 | ---------- 64 | query: 2D tensor, [B, H] 65 | keys: 3D tensor, [B, T, H] 66 | keys_length: 1D tensor, [B] 67 | 68 | Returns 69 | ------- 70 | outputs: 2D tensor, if return_scores=False [B, H], otherwise [B, T] 71 | """ 72 | batch_size, max_length, dim = keys.size() 73 | 74 | query = query.unsqueeze(1).expand(-1, max_length, -1) 75 | 76 | din_all = torch.cat( 77 | [query, keys, query - keys, query * keys], dim=-1) 78 | 79 | din_all = din_all.view(batch_size * max_length, -1) 80 | 81 | outputs = self.mlp(din_all) 82 | 83 | outputs = self.fc(outputs).view(batch_size, max_length) # [B, T] 84 | 85 | # Scale 86 | outputs = outputs / (dim ** 0.5) 87 | 88 | # Mask 89 | mask = (torch.arange(max_length, device=keys_length.device).repeat( 90 | batch_size, 1) < keys_length.view(-1, 1)) 91 | outputs[~mask] = -np.inf 92 | 93 | # Activation 94 | outputs = F.softmax(outputs, dim=1) # [B, T] 95 | 96 | if not self.return_scores: 97 | # Weighted sum 98 | outputs = torch.matmul( 99 | outputs.unsqueeze(1), keys).squeeze() # [B, H] 100 | 101 | return outputs 102 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/fm.py: -------------------------------------------------------------------------------- 1 | """ 2 | FM layer. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class FM(nn.Module): 14 | """FM layer. 15 | """ 16 | def __init__(self, reduce_sum=True): 17 | super(FM, self).__init__() 18 | self.reduce_sum = reduce_sum 19 | 20 | def forward(self, x): 21 | sum_squared = torch.pow(torch.sum(x, dim=1), 2) 22 | squared_sum = torch.sum(torch.pow(x, 2), dim=1) 23 | second_order = sum_squared - squared_sum 24 | if self.reduce_sum: 25 | output = torch.sum(second_order, dim=1, keepdim=True) 26 | return 0.5 * output 27 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/interest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Iinterest part used by DIEN model. 3 | 4 | Reference: 5 | Deep Interest Evolution Network for Click-Through Rate Prediction 6 | https://arxiv.org/pdf/1809.03672.pdf 7 | """ 8 | 9 | # Authors: Hongwei Zhang 10 | # License: MIT 11 | 12 | 13 | from collections import OrderedDict 14 | 15 | import torch 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 19 | 20 | from .attention import Attention 21 | from .rnn import DynamicGRU 22 | 23 | 24 | class AuxiliaryNet(nn.Module): 25 | """ NN for Auxiliary Loss. 26 | 27 | Parameters 28 | ---------- 29 | input_size : int 30 | Size of input. 31 | 32 | hidden_layers : iterable 33 | Hidden layer sizes. 34 | 35 | activation : str 36 | Name of activation function. ReLU, PReLU and Sigmoid are supported. 
37 | """ 38 | def __init__(self, input_size, hidden_layers, activation='sigmoid'): 39 | super(AuxiliaryNet, self).__init__() 40 | modules = OrderedDict() 41 | 42 | previous_size = input_size 43 | for index, hidden_layer in enumerate(hidden_layers): 44 | modules[f"dense{index}"] = nn.Linear(previous_size, hidden_layer) 45 | if activation: 46 | if activation.lower() == 'relu': 47 | modules[f"activation{index}"] = nn.ReLU() 48 | elif activation.lower() == 'prelu': 49 | modules[f"activation{index}"] = nn.PReLU() 50 | elif activation.lower() == 'sigmoid': 51 | modules[f"activation{index}"] = nn.Sigmoid() 52 | else: 53 | raise NotImplementedError(f"{activation} is not supported") 54 | previous_size = hidden_layer 55 | modules["final_layer"] = nn.Linear(previous_size, 1) 56 | self._sequential = nn.Sequential(modules) 57 | 58 | def forward(self, input): 59 | return torch.sigmoid(self._sequential(input)) 60 | 61 | 62 | class Interest(nn.Module): 63 | """Interest layer. 64 | 65 | Parameters 66 | ---------- 67 | input_size : int 68 | Size of input. 69 | 70 | gru_type : str 71 | Type of GRU. GRU, AIGRU, AGRU and AUGRU are supported. 72 | 73 | gru_dropout : float 74 | Dropout rate of GRU. 75 | 76 | att_hidden_layers : iterable 77 | Hidden layer sizes of attention. 78 | 79 | att_dropout : float 80 | Dropout rate of attention. 81 | 82 | att_batchnorm : bool 83 | Batchnorm of attention. 84 | 85 | att_activation : str 86 | Activation function name of attention. 87 | relu, prelu and sigmoid are supported. 88 | 89 | use_negsampling : bool 90 | """ 91 | __SUPPORTED_GRU_TYPE__ = ['GRU', 'AIGRU', 'AGRU', 'AUGRU'] 92 | 93 | def __init__( 94 | self, 95 | input_size, 96 | gru_type='GRU', 97 | gru_dropout=0.0, 98 | att_hidden_layers=[80, 40], 99 | att_dropout=0.0, 100 | att_batchnorm=True, 101 | att_activation='prelu', 102 | use_negsampling=False): 103 | super(Interest, self).__init__() 104 | if gru_type not in Interest.__SUPPORTED_GRU_TYPE__: 105 | raise NotImplementedError(f"gru_type: {gru_type} is not supported") 106 | 107 | self.gru_type = gru_type 108 | self.use_negsampling = use_negsampling 109 | 110 | self.interest_extractor = nn.GRU( 111 | input_size=input_size, 112 | hidden_size=input_size, 113 | batch_first=True, 114 | bidirectional=False) 115 | 116 | if self.use_negsampling: 117 | self.auxiliary_net = AuxiliaryNet( 118 | input_size * 2, hidden_layers=[100, 50]) 119 | 120 | if gru_type == 'GRU': 121 | self.interest_evolution = nn.GRU( 122 | input_size=input_size, 123 | hidden_size=input_size, 124 | batch_first=True, 125 | bidirectional=False) 126 | 127 | self.attention = Attention( 128 | input_size=input_size, 129 | hidden_layers=att_hidden_layers, 130 | dropout=att_dropout, 131 | batchnorm=att_batchnorm, 132 | activation=att_activation) 133 | elif gru_type == 'AIGRU': 134 | self.attention = Attention( 135 | input_size=input_size, 136 | hidden_layers=att_hidden_layers, 137 | dropout=att_dropout, 138 | batchnorm=att_batchnorm, 139 | activation=att_activation, 140 | return_scores=True) 141 | 142 | self.interest_evolution = nn.GRU( 143 | input_size=input_size, 144 | hidden_size=input_size, 145 | batch_first=True, 146 | bidirectional=False) 147 | elif gru_type == 'AGRU' or gru_type == 'AUGRU': 148 | self.attention = Attention( 149 | input_size=input_size, 150 | hidden_layers=att_hidden_layers, 151 | dropout=att_dropout, 152 | batchnorm=att_batchnorm, 153 | activation=att_activation, 154 | return_scores=True) 155 | 156 | self.interest_evolution = DynamicGRU( 157 | input_size=input_size, 158 | 
hidden_size=input_size, 159 | gru_type=gru_type) 160 | 161 | @staticmethod 162 | def _get_last_state(states, keys_length): 163 | # states [B, T, H] 164 | batch_size, max_seq_length, hidden_size = states.size() 165 | 166 | mask = (torch.arange(max_seq_length, device=keys_length.device).repeat( 167 | batch_size, 1) == (keys_length.view(-1, 1) - 1)) 168 | 169 | return states[mask] 170 | 171 | def cal_auxiliary_loss( 172 | self, states, click_seq, noclick_seq, keys_length): 173 | # states [B, T, H] 174 | # click_seq [B, T, H] 175 | # noclick_seq [B, T, H] 176 | # keys_length [B] 177 | batch_size, max_seq_length, embedding_size = states.size() 178 | 179 | mask = (torch.arange(max_seq_length, device=states.device).repeat( 180 | batch_size, 1) < keys_length.view(-1, 1)).float() 181 | 182 | click_input = torch.cat([states, click_seq], dim=-1) 183 | noclick_input = torch.cat([states, noclick_seq], dim=-1) 184 | embedding_size = embedding_size * 2 185 | 186 | click_p = self.auxiliary_net( 187 | click_input.view( 188 | batch_size * max_seq_length, embedding_size)).view( 189 | batch_size, max_seq_length)[mask > 0].view(-1, 1) 190 | click_target = torch.ones( 191 | click_p.size(), dtype=torch.float, device=click_p.device) 192 | 193 | noclick_p = self.auxiliary_net( 194 | noclick_input.view( 195 | batch_size * max_seq_length, embedding_size)).view( 196 | batch_size, max_seq_length)[mask > 0].view(-1, 1) 197 | noclick_target = torch.zeros( 198 | noclick_p.size(), dtype=torch.float, device=noclick_p.device) 199 | 200 | loss = F.binary_cross_entropy( 201 | torch.cat([click_p, noclick_p], dim=0), 202 | torch.cat([click_target, noclick_target], dim=0)) 203 | 204 | return loss 205 | 206 | def forward(self, query, keys, keys_length, neg_keys=None): 207 | """ 208 | Parameters 209 | ---------- 210 | query: 2D tensor, [B, H] 211 | keys: 3D tensor, [B, T, H] 212 | keys_length: 1D tensor, [B] 213 | neg_keys: 3D tensor, [B, T, H] 214 | 215 | Returns 216 | ------- 217 | outputs: 2D tensor, [B, H] 218 | """ 219 | batch_size, max_length, dim = keys.size() 220 | 221 | packed_keys = pack_padded_sequence( 222 | keys, 223 | lengths=keys_length.squeeze(), 224 | batch_first=True, 225 | enforce_sorted=False) 226 | 227 | packed_interests, _ = self.interest_extractor(packed_keys) 228 | 229 | aloss = None 230 | if (self.gru_type != 'GRU') or self.use_negsampling: 231 | interests, _ = pad_packed_sequence( 232 | packed_interests, 233 | batch_first=True, 234 | padding_value=0.0, 235 | total_length=max_length) 236 | 237 | if self.use_negsampling: 238 | aloss = self.cal_auxiliary_loss( 239 | interests[:, :-1, :], 240 | keys[:, 1:, :], 241 | neg_keys[:, 1:, :], 242 | keys_length - 1) 243 | 244 | if self.gru_type == 'GRU': 245 | packed_interests, _ = self.interest_evolution(packed_interests) 246 | 247 | interests, _ = pad_packed_sequence( 248 | packed_interests, 249 | batch_first=True, 250 | padding_value=0.0, 251 | total_length=max_length) 252 | 253 | outputs = self.attention(query, interests, keys_length) 254 | 255 | elif self.gru_type == 'AIGRU': 256 | # attention 257 | scores = self.attention(query, interests, keys_length) 258 | interests = interests * scores.unsqueeze(-1) 259 | 260 | packed_interests = pack_padded_sequence( 261 | interests, 262 | lengths=keys_length.squeeze(), 263 | batch_first=True, 264 | enforce_sorted=False) 265 | _, outputs = self.interest_evolution(packed_interests) 266 | outputs = outputs.squeeze() 267 | 268 | elif self.gru_type == 'AGRU' or self.gru_type == 'AUGRU': 269 | # attention 270 | scores = 
self.attention(query, interests, keys_length) 271 | 272 | packed_interests = pack_padded_sequence( 273 | interests, 274 | lengths=keys_length.squeeze(), 275 | batch_first=True, 276 | enforce_sorted=False) 277 | 278 | packed_scores = pack_padded_sequence( 279 | scores, 280 | lengths=keys_length.squeeze(), 281 | batch_first=True, 282 | enforce_sorted=False) 283 | 284 | outputs, _ = pad_packed_sequence( 285 | self.interest_evolution( 286 | packed_interests, packed_scores), batch_first=True) 287 | # pick last state 288 | outputs = Interest._get_last_state( 289 | outputs, keys_length.squeeze()) 290 | 291 | return outputs, aloss 292 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/mlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Multilayer perceptron torch module. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from collections import OrderedDict 10 | 11 | import torch.nn as nn 12 | 13 | 14 | class MLP(nn.Module): 15 | """Multilayer perceptron torch module. 16 | 17 | Parameters 18 | ---------- 19 | input_size : int 20 | Size of input. 21 | 22 | hidden_layers : iterable 23 | Hidden layer sizes. 24 | 25 | dropout : float 26 | Dropout rate. 27 | 28 | activation : str 29 | Name of activation function. ReLU, PReLU and Sigmoid are supported. 30 | """ 31 | def __init__(self, input_size, hidden_layers, 32 | dropout=0.0, batchnorm=True, activation='relu'): 33 | super(MLP, self).__init__() 34 | modules = OrderedDict() 35 | 36 | previous_size = input_size 37 | for index, hidden_layer in enumerate(hidden_layers): 38 | modules[f"dense{index}"] = nn.Linear(previous_size, hidden_layer) 39 | if batchnorm: 40 | modules[f"batchnorm{index}"] = nn.BatchNorm1d(hidden_layer) 41 | if activation: 42 | if activation.lower() == 'relu': 43 | modules[f"activation{index}"] = nn.ReLU() 44 | elif activation.lower() == 'prelu': 45 | modules[f"activation{index}"] = nn.PReLU() 46 | elif activation.lower() == 'sigmoid': 47 | modules[f"activation{index}"] = nn.Sigmoid() 48 | else: 49 | raise NotImplementedError(f"{activation} is not supported") 50 | if dropout: 51 | modules[f"dropout{index}"] = nn.Dropout(dropout) 52 | previous_size = hidden_layer 53 | self._sequential = nn.Sequential(modules) 54 | 55 | def forward(self, input): 56 | return self._sequential(input) 57 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/pooling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pooling layers. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | class MaxPooling(nn.Module): 14 | """Max Pooling. 15 | 16 | Parameters 17 | ---------- 18 | dim : int 19 | The dimension to do pooling. 20 | 21 | Attributes 22 | ---------- 23 | dim : int 24 | The dimension to do pooling. 25 | """ 26 | def __init__(self, dim): 27 | super(MaxPooling, self).__init__() 28 | self.dim = dim 29 | 30 | def forward(self, input): 31 | return torch.max(input, self.dim)[0] 32 | 33 | 34 | class SumPooling(nn.Module): 35 | """Sum Pooling. 36 | 37 | Parameters 38 | ---------- 39 | dim : int 40 | The dimension to do pooling. 41 | 42 | Attributes 43 | ---------- 44 | dim : int 45 | The dimension to do pooling. 
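Examples -------- An illustrative call that reduces the sequence dimension (dim=1): pool = SumPooling(1) emb = torch.rand(2, 4, 8) # [B, T, H] pooled = pool(emb) # [B, H]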
46 | """ 47 | def __init__(self, dim): 48 | super(SumPooling, self).__init__() 49 | self.dim = dim 50 | 51 | def forward(self, input): 52 | return torch.sum(input, self.dim) 53 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/rnn.py: -------------------------------------------------------------------------------- 1 | """AttentionGRU and AttentionUpdateGateGRU. 2 | """ 3 | 4 | # Authors: Hongwei Zhang 5 | # License: MIT 6 | 7 | 8 | import math 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.init as init 13 | import torch.nn.functional as F 14 | from torch.nn.utils.rnn import PackedSequence 15 | 16 | 17 | class AttentionGRUCell(nn.Module): 18 | def __init__(self, input_size, hidden_size, bias=True): 19 | super(AttentionGRUCell, self).__init__() 20 | self.input_size = input_size 21 | self.hidden_size = hidden_size 22 | self.bias = bias 23 | # (W_ir|W_iz|W_in) 24 | self.weight_ih = nn.Parameter( 25 | torch.Tensor(3 * hidden_size, input_size)) 26 | # (W_hr|W_hz|W_hn) 27 | self.weight_hh = nn.Parameter( 28 | torch.Tensor(3 * hidden_size, hidden_size)) 29 | if bias: 30 | # (b_ir|b_iz|b_in) 31 | self.bias_ih = nn.Parameter(torch.Tensor(3 * hidden_size)) 32 | # (b_hr|b_hz|b_hn) 33 | self.bias_hh = nn.Parameter(torch.Tensor(3 * hidden_size)) 34 | else: 35 | self.register_parameter('bias_ih', None) 36 | self.register_parameter('bias_hh', None) 37 | self.reset_parameters() 38 | 39 | def reset_parameters(self): 40 | stdv = 1.0 / math.sqrt(self.hidden_size) 41 | for weight in self.parameters(): 42 | init.uniform_(weight, -stdv, stdv) 43 | 44 | def forward(self, input, hx, att_score): 45 | """ 46 | 47 | References 48 | ---------- 49 | https://github.com/pytorch/pytorch/blob/v0.4.1/torch/nn/_functions/rnn.py#L49 50 | """ 51 | 52 | gi = F.linear(input, self.weight_ih, self.bias_ih) 53 | gh = F.linear(hx, self.weight_hh, self.bias_hh) 54 | i_r, i_z, i_n = gi.chunk(3, 1) 55 | h_r, h_z, h_n = gh.chunk(3, 1) 56 | 57 | resetgate = torch.sigmoid(i_r + h_r) 58 | # updategate = torch.sigmoid(i_z + h_z) 59 | newgate = torch.tanh(i_n + resetgate * h_n) 60 | # hy = newgate + updategate * (hx - newgate) 61 | 62 | att_score = att_score.view(-1, 1) 63 | 64 | hy = (1. 
- att_score) * hx + att_score * newgate 65 | 66 | return hy 67 | 68 | 69 | class AttentionUpdateGateGRUCell(nn.Module): 70 | def __init__(self, input_size, hidden_size, bias=True): 71 | super(AttentionUpdateGateGRUCell, self).__init__() 72 | self.input_size = input_size 73 | self.hidden_size = hidden_size 74 | self.bias = bias 75 | # (W_ir|W_iz|W_in) 76 | self.weight_ih = nn.Parameter( 77 | torch.Tensor(3 * hidden_size, input_size)) 78 | # (W_hr|W_hz|W_hn) 79 | self.weight_hh = nn.Parameter( 80 | torch.Tensor(3 * hidden_size, hidden_size)) 81 | if bias: 82 | # (b_ir|b_iz|b_in) 83 | self.bias_ih = nn.Parameter(torch.Tensor(3 * hidden_size)) 84 | # (b_hr|b_hz|b_hn) 85 | self.bias_hh = nn.Parameter(torch.Tensor(3 * hidden_size)) 86 | else: 87 | self.register_parameter('bias_ih', None) 88 | self.register_parameter('bias_hh', None) 89 | self.reset_parameters() 90 | 91 | def reset_parameters(self): 92 | stdv = 1.0 / math.sqrt(self.hidden_size) 93 | for weight in self.parameters(): 94 | init.uniform_(weight, -stdv, stdv) 95 | 96 | def forward(self, input, hx, att_score): 97 | """ 98 | 99 | References 100 | ---------- 101 | https://github.com/pytorch/pytorch/blob/v0.4.1/torch/nn/_functions/rnn.py#L49 102 | """ 103 | 104 | gi = F.linear(input, self.weight_ih, self.bias_ih) 105 | gh = F.linear(hx, self.weight_hh, self.bias_hh) 106 | i_r, i_z, i_n = gi.chunk(3, 1) 107 | h_r, h_z, h_n = gh.chunk(3, 1) 108 | 109 | resetgate = torch.sigmoid(i_r + h_r) 110 | updategate = torch.sigmoid(i_z + h_z) 111 | newgate = torch.tanh(i_n + resetgate * h_n) 112 | 113 | updategate = att_score.view(-1, 1) * updategate 114 | 115 | hy = newgate + updategate * (hx - newgate) 116 | 117 | return hy 118 | 119 | 120 | class DynamicGRU(nn.Module): 121 | def __init__(self, input_size, hidden_size, bias=True, gru_type='AGRU'): 122 | super(DynamicGRU, self).__init__() 123 | self.input_size = input_size 124 | self.hidden_size = hidden_size 125 | 126 | if gru_type == 'AGRU': 127 | self.rnn = AttentionGRUCell(input_size, hidden_size, bias) 128 | elif gru_type == 'AUGRU': 129 | self.rnn = AttentionUpdateGateGRUCell( 130 | input_size, hidden_size, bias) 131 | 132 | def forward(self, input, att_scores, hx=None): 133 | is_packed_input = isinstance(input, PackedSequence) 134 | if not is_packed_input: 135 | raise NotImplementedError( 136 | "DynamicGRU only supports packed input") 137 | 138 | is_packed_att_scores = isinstance(att_scores, PackedSequence) 139 | if not is_packed_att_scores: 140 | raise NotImplementedError( 141 | "DynamicGRU only supports packed att_scores") 142 | 143 | input, batch_sizes, sorted_indices, unsorted_indices = input 144 | att_scores, _, _, _ = att_scores 145 | 146 | max_batch_size = batch_sizes[0] 147 | max_batch_size = int(max_batch_size) 148 | 149 | if hx is None: 150 | hx = torch.zeros( 151 | max_batch_size, self.hidden_size, 152 | dtype=input.dtype, device=input.device) 153 | 154 | outputs = torch.zeros( 155 | input.size(0), self.hidden_size, 156 | dtype=input.dtype, device=input.device) 157 | 158 | begin = 0 159 | for batch in batch_sizes: 160 | new_hx = self.rnn( 161 | input[begin: begin + batch], 162 | hx[0:batch], 163 | att_scores[begin: begin + batch]) 164 | outputs[begin: begin + batch] = new_hx 165 | hx = new_hx 166 | begin += batch 167 | 168 | return PackedSequence( 169 | outputs, batch_sizes, sorted_indices, unsorted_indices) 170 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/pytorch/nn/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_attention.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import Attention 2 | 3 | 4 | import numpy as np 5 | import torch 6 | import torch.nn.init as init 7 | 8 | 9 | def test_attention(): 10 | attention = Attention(3, [8], batchnorm=False, activation=None) 11 | 12 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 13 | 14 | keys = torch.tensor([ 15 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 16 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 17 | ], dtype=torch.float) 18 | 19 | keys_length = torch.tensor([3, 4]) 20 | 21 | for param in attention.mlp.parameters(): 22 | init.constant_(param, 1) 23 | 24 | for param in attention.fc.parameters(): 25 | init.constant_(param, 1) 26 | 27 | output = attention(query, keys, keys_length) 28 | 29 | actual = output.detach().numpy() 30 | assert output.size()[0] == 2 31 | assert output.size()[1] == 3 32 | np.testing.assert_array_almost_equal( 33 | actual, np.array([[1.0, 2.0, 3.0], 34 | [0.989024, 1.969694, 2.959199]], dtype=float)) 35 | 36 | def test_attention_return_scores(): 37 | attention = Attention( 38 | 3, [8], batchnorm=False, activation=None, return_scores=True) 39 | 40 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 41 | 42 | keys = torch.tensor([ 43 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 44 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 45 | ], dtype=torch.float) 46 | 47 | keys_length = torch.tensor([3, 4]) 48 | 49 | for param in attention.mlp.parameters(): 50 | init.constant_(param, 1) 51 | 52 | for param in attention.fc.parameters(): 53 | init.constant_(param, 1) 54 | 55 | output = attention(query, keys, keys_length) 56 | 57 | actual = output.detach().numpy() 58 | 59 | expected = np.array( 60 | [[1.472415e-11, 1.000000e+00, 1.492623e-09, 0.000000e+00], 61 | [2.915521e-03, 9.821462e-01, 8.833572e-03, 6.104673e-03]], 62 | dtype=float) 63 | 64 | assert output.size()[0] == 2 65 | assert output.size()[1] == 4 66 | np.testing.assert_array_almost_equal(actual, expected) 67 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_fm.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import FM 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def test_fm(): 8 | fm = FM() 9 | 10 | x = torch.as_tensor( 11 | [[[1.0, 1.0, 1.0], [1.0, 2.0, 3.0]], 12 | [[1.0, 1.0, 1.0], [4.0, 5.0, 6.0]]]) 13 | actual = fm(x) 14 | 15 | # 6.0 = 1 * 1 + 1 * 2 + 1 * 3 16 | # 15.0 = 1 * 4 + 1 * 5 + 1 * 6 17 | np.testing.assert_array_almost_equal( 18 | actual.numpy(), np.array([[6.0], [15.0]], dtype=float)) 19 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_interest.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import Interest 2 | 3 | 4 | import torch 5 | 6 | 7 | def test_gru_interest_evolution(): 8 | interests = Interest( 9 | input_size=3, 10 |
gru_type='GRU', 11 | gru_dropout=0, 12 | att_hidden_layers=[8], 13 | att_dropout=0, 14 | att_batchnorm=False, 15 | att_activation=None) 16 | 17 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 18 | 19 | keys = torch.tensor([ 20 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 21 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 22 | ], dtype=torch.float) 23 | 24 | keys_length = torch.tensor([3, 4]) 25 | 26 | output, _ = interests(query, keys, keys_length) 27 | 28 | assert output.size()[0] == 2 29 | assert output.size()[1] == 3 30 | 31 | 32 | def test_aigru_interest_evolution(): 33 | interests = Interest( 34 | input_size=3, 35 | gru_type='AIGRU', 36 | gru_dropout=0, 37 | att_hidden_layers=[8], 38 | att_dropout=0, 39 | att_batchnorm=False, 40 | att_activation=None) 41 | 42 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 43 | 44 | keys = torch.tensor([ 45 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 46 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 47 | ], dtype=torch.float) 48 | 49 | keys_length = torch.tensor([3, 4]) 50 | 51 | output, _ = interests(query, keys, keys_length) 52 | 53 | assert output.size()[0] == 2 54 | assert output.size()[1] == 3 55 | 56 | 57 | def test_agru_interest_evolution(): 58 | interests = Interest( 59 | input_size=3, 60 | gru_type='AGRU', 61 | gru_dropout=0, 62 | att_hidden_layers=[8], 63 | att_dropout=0, 64 | att_batchnorm=False, 65 | att_activation=None) 66 | 67 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 68 | 69 | keys = torch.tensor([ 70 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 71 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 72 | ], dtype=torch.float) 73 | 74 | keys_length = torch.tensor([3, 4]) 75 | 76 | output, _ = interests(query, keys, keys_length) 77 | 78 | assert output.size()[0] == 2 79 | assert output.size()[1] == 3 80 | 81 | 82 | def test_augru_interest_evolution(): 83 | interests = Interest( 84 | input_size=3, 85 | gru_type='AUGRU', 86 | gru_dropout=0, 87 | att_hidden_layers=[8], 88 | att_dropout=0, 89 | att_batchnorm=False, 90 | att_activation=None) 91 | 92 | query = torch.tensor([[1, 1, 1], [0.1, 0.2, 0.3]], dtype=torch.float) 93 | 94 | keys = torch.tensor([ 95 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 96 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]] 97 | ], dtype=torch.float) 98 | 99 | keys_length = torch.tensor([3, 4]) 100 | 101 | output, _ = interests(query, keys, keys_length) 102 | 103 | assert output.size()[0] == 2 104 | assert output.size()[1] == 3 105 | 106 | 107 | def test_neg_sampling(): 108 | interests = Interest( 109 | input_size=3, 110 | gru_type='AUGRU', 111 | gru_dropout=0, 112 | att_hidden_layers=[8], 113 | att_dropout=0, 114 | att_batchnorm=False, 115 | att_activation=None, 116 | use_negsampling=True) 117 | 118 | query = torch.tensor( 119 | [[1, 1, 1], [0.1, 0.2, 0.3], [0.3, 0.4, 0.5]], dtype=torch.float) 120 | 121 | keys = torch.tensor([ 122 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.0, 0.0, 0.0]], 123 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]], 124 | [[0.1, 0.2, 0.3], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] 125 | ], dtype=torch.float) 126 | 127 | neg_keys = torch.tensor([ 128 | [[0.3, 0.2, 0.1], [3, 2, 1], [1, 0.2, 0.4], [0.0, 0.0, 0.0]], 129 | [[0.3, 0.2, 0.1], [3, 2, 1], [1, 0.2, 0.4], [0.5, 0.5, 0.5]], 130 | [[0.3, 0.2, 0.1], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] 131 
| ], dtype=torch.float) 132 | 133 | keys_length = torch.tensor([3, 4, 1]) 134 | 135 | output, _ = interests(query, keys, keys_length, neg_keys) 136 | 137 | assert output.size()[0] == 3 138 | assert output.size()[1] == 3 139 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_mlp.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import MLP 2 | 3 | 4 | def test_simple_creation(): 5 | mlp = MLP(input_size=10, hidden_layers=(16, 4), 6 | activation=None, dropout=0.0) 7 | 8 | assert len(mlp._sequential) == 4 9 | 10 | 11 | def test_creation_with_dropout(): 12 | mlp = MLP(input_size=10, hidden_layers=(16, 4), 13 | activation=None, dropout=0.1) 14 | 15 | assert len(mlp._sequential) == 6 16 | 17 | 18 | def test_creation_with_activation_and_dropout(): 19 | mlp = MLP(input_size=10, hidden_layers=(16, 4), 20 | activation='relu', dropout=0.1) 21 | 22 | assert len(mlp._sequential) == 8 23 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_pooling.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import MaxPooling, SumPooling 2 | 3 | import torch 4 | 5 | 6 | def test_max_pooling(): 7 | x = torch.tensor( 8 | [[[1, 2, 1, 1], 9 | [1, 1, 3, 1]], 10 | [[10, 1, 1, 1], 11 | [1, 1, 4, 1]], 12 | [[2, 8, 9, 0], 13 | [1, 1, 1, 1]]]) 14 | 15 | max_pooling = MaxPooling(dim=1) 16 | 17 | actual = max_pooling(x) 18 | 19 | assert actual.numpy().tolist() == [ 20 | [1, 2, 3, 1], [10, 1, 4, 1], [2, 8, 9, 1]] 21 | 22 | 23 | def test_sum_pooling(): 24 | x = torch.tensor( 25 | [[[1, 2, 1, 1], 26 | [1, 1, 3, 1]], 27 | [[10, 1, 1, 1], 28 | [1, 1, 4, 1]], 29 | [[2, 8, 9, 0], 30 | [1, 1, 1, 1]]]) 31 | 32 | sum_pooling = SumPooling(dim=1) 33 | 34 | actual = sum_pooling(x) 35 | 36 | assert actual.numpy().tolist() == [ 37 | [2, 3, 4, 2], [11, 2, 5, 2], [3, 9, 10, 1]] 38 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/nn/tests/test_rnn.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.nn import ( 2 | AttentionGRUCell, AttentionUpdateGateGRUCell, DynamicGRU) 3 | 4 | import torch 5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 6 | 7 | 8 | def test_attention_gru_cell(): 9 | gru_cell = AttentionGRUCell(10, 20) 10 | input = torch.randn(6, 3, 10) 11 | hx = torch.randn(3, 20) 12 | att_scores = torch.tensor([ 13 | [0.1, 0.3, 0.6], 14 | [0.2, 0.2, 0.6], 15 | [0.1, 0.6, 0.3], 16 | [1.0, 0., 0.], 17 | [0.2, 0.3, 0.5], 18 | [0.1, 0.3, 0.6], 19 | ]) 20 | 21 | output = [] 22 | for i in range(6): 23 | hx = gru_cell(input[i], hx, att_scores[i]) 24 | output.append(hx) 25 | 26 | assert len(output) == 6 27 | 28 | 29 | def test_attention_update_gate_gru_cell(): 30 | gru_cell = AttentionUpdateGateGRUCell(10, 20) 31 | input = torch.randn(6, 3, 10) 32 | hx = torch.randn(3, 20) 33 | att_scores = torch.tensor([ 34 | [0.1, 0.3, 0.6], 35 | [0.2, 0.2, 0.6], 36 | [0.1, 0.6, 0.3], 37 | [1.0, 0., 0.], 38 | [0.2, 0.3, 0.5], 39 | [0.1, 0.3, 0.6], 40 | ]) 41 | 42 | output = [] 43 | for i in range(6): 44 | hx = gru_cell(input[i], hx, att_scores[i]) 45 | output.append(hx) 46 | 47 | assert len(output) == 6 48 | 49 | 50 | def test_dynamic_gru(): 51 | rnn = DynamicGRU(3, 5) 52 | 53 | keys = torch.tensor([ 54 | [[0.1, 0.2, 0.3], [1, 2, 3], 
[0.4, 0.2, 1], [0.0, 0.0, 0.0]], 55 | [[0.1, 0.2, 0.3], [1, 2, 3], [0.4, 0.2, 1], [0.5, 0.5, 0.5]], 56 | [[0.1, 0.2, 0.3], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], 57 | [[0.1, 0.2, 0.3], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], 58 | [[0.1, 0.2, 0.3], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]] 59 | ], dtype=torch.float) 60 | 61 | att_scores = torch.tensor([ 62 | [0.0330, 0.7252, 0.2459, 0.], 63 | [0.2952, 0.8721, 0.4468, 0.0904], 64 | [0.4598, 0., 0., 0.], 65 | [0.0286, 0., 0., 0.], 66 | [0.0561, 0., 0., 0.]]) 67 | 68 | lengths = torch.tensor([3, 4, 1, 1, 1]) 69 | 70 | packed_att_scores = pack_padded_sequence( 71 | att_scores, 72 | lengths, 73 | batch_first=True, enforce_sorted=False) 74 | 75 | packed_keys = pack_padded_sequence( 76 | keys, 77 | lengths, 78 | batch_first=True, enforce_sorted=False) 79 | 80 | actual, actual_lengths = pad_packed_sequence( 81 | rnn(packed_keys, packed_att_scores), batch_first=True) 82 | 83 | assert actual.size() == (5, 4, 5) 84 | assert actual_lengths.numpy().tolist() == [3, 4, 1, 1, 1] 85 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/pytorch/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_deepfm.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch import DeepFM 5 | 6 | from .utils import prepare_dataloader 7 | 8 | 9 | def test_normal(): 10 | number_features = [ 11 | Number('userAge', StandardScaler()), 12 | Number('rating', StandardScaler())] 13 | 14 | category_features = [ 15 | Category('userId', CategoryEncoder(min_cnt=1)), 16 | Category('movieId', CategoryEncoder(min_cnt=1)), 17 | Category('topGenre', CategoryEncoder(min_cnt=1))] 18 | 19 | sequence_features = [ 20 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 21 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 22 | Sequence('clickedMovieIds', 23 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 24 | Sequence('clickedMovieTopGenres', 25 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 26 | 27 | features = Features( 28 | number_features=number_features, 29 | category_features=category_features, 30 | sequence_features=sequence_features) 31 | 32 | dataloader, _ = prepare_dataloader(features) 33 | 34 | deep_fm = DeepFM( 35 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 36 | final_activation='sigmoid', dropout=0.3) 37 | 38 | deep_fm(next(iter(dataloader))) 39 | 40 | 41 | def test_without_number_feature(): 42 | number_features = [] 43 | 44 | category_features = [ 45 | Category('userId', CategoryEncoder(min_cnt=1)), 46 | Category('movieId', CategoryEncoder(min_cnt=1)), 47 | Category('topGenre', CategoryEncoder(min_cnt=1))] 48 | 49 | sequence_features = [ 50 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 51 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 52 | Sequence('clickedMovieIds', 53 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 54 | Sequence('clickedMovieTopGenres', 55 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 
56 | 57 | features = Features( 58 | number_features=number_features, 59 | category_features=category_features, 60 | sequence_features=sequence_features) 61 | 62 | dataloader, _ = prepare_dataloader(features) 63 | 64 | deep_fm = DeepFM( 65 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 66 | final_activation='sigmoid', dropout=0.3) 67 | 68 | deep_fm(next(iter(dataloader))) 69 | 70 | 71 | def test_without_category_feature(): 72 | number_features = [] 73 | 74 | category_features = [] 75 | 76 | sequence_features = [ 77 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 78 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 79 | Sequence('clickedMovieIds', 80 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 81 | Sequence('clickedMovieTopGenres', 82 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 83 | 84 | features = Features( 85 | number_features=number_features, 86 | category_features=category_features, 87 | sequence_features=sequence_features) 88 | 89 | dataloader, _ = prepare_dataloader(features) 90 | 91 | deep_fm = DeepFM( 92 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 93 | final_activation='sigmoid', dropout=0.3) 94 | 95 | deep_fm(next(iter(dataloader))) 96 | 97 | 98 | def test_only_with_number_features(): 99 | number_features = [ 100 | Number('userAge', StandardScaler()), 101 | Number('rating', StandardScaler())] 102 | 103 | category_features = [] 104 | 105 | sequence_features = [] 106 | 107 | features = Features( 108 | number_features=number_features, 109 | category_features=category_features, 110 | sequence_features=sequence_features) 111 | 112 | dataloader, _ = prepare_dataloader(features) 113 | 114 | deep_fm = DeepFM( 115 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 116 | final_activation='sigmoid', dropout=0.3) 117 | 118 | deep_fm(next(iter(dataloader))) 119 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_dien.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch import AttentionGroup, DIEN 5 | 6 | 7 | from .utils import prepare_dataloader 8 | 9 | 10 | def create_test_data(): 11 | number_features = [ 12 | Number('userAge', StandardScaler()), 13 | Number('rating', StandardScaler())] 14 | 15 | category_features = [ 16 | Category('userId', CategoryEncoder(min_cnt=1)), 17 | Category('movieId', CategoryEncoder(min_cnt=1)), 18 | Category('topGenre', CategoryEncoder(min_cnt=1))] 19 | 20 | sequence_features = [ 21 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 22 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 23 | Sequence('clickedMovieIds', 24 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 25 | Sequence('clickedMovieTopGenres', 26 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 27 | Sequence('noClickedMovieIds', 28 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 29 | Sequence('noClickedMovieTopGenres', 30 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 31 | 32 | attention_groups = [ 33 | AttentionGroup( 34 | name='group1', 35 | pairs=[{'ad': 'movieId', 36 | 'pos_hist': 'clickedMovieIds', 37 | 'neg_hist': 'noClickedMovieIds'}, 38 | {'ad': 'topGenre', 39 | 'pos_hist': 'clickedMovieTopGenres', 40 | 'neg_hist': 'noClickedMovieTopGenres'}], 41 | hidden_layers=[8, 
4])] 42 | 43 | features = Features( 44 | number_features=number_features, 45 | category_features=category_features, 46 | sequence_features=sequence_features) 47 | 48 | dataloader, _ = prepare_dataloader(features) 49 | 50 | return dataloader, features, attention_groups 51 | 52 | 53 | def test_gru_gru_att(): 54 | dataloader, features, attention_groups = create_test_data() 55 | 56 | attention_groups[0].gru_type = 'GRU' 57 | 58 | model = DIEN( 59 | features, attention_groups=attention_groups, 60 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 61 | final_activation='sigmoid', dropout=0.3) 62 | 63 | model(next(iter(dataloader))) 64 | 65 | 66 | def test_gru_att_gru(): 67 | dataloader, features, attention_groups = create_test_data() 68 | 69 | attention_groups[0].gru_type = 'AIGRU' 70 | 71 | model = DIEN( 72 | features, attention_groups=attention_groups, 73 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 74 | final_activation='sigmoid', dropout=0.3) 75 | 76 | model(next(iter(dataloader))) 77 | 78 | 79 | def test_gru_agru(): 80 | dataloader, features, attention_groups = create_test_data() 81 | 82 | attention_groups[0].gru_type = 'AGRU' 83 | 84 | model = DIEN( 85 | features, attention_groups=attention_groups, 86 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 87 | final_activation='sigmoid', dropout=0.3) 88 | 89 | model(next(iter(dataloader))) 90 | 91 | 92 | def test_gru_augru(): 93 | dataloader, features, attention_groups = create_test_data() 94 | 95 | attention_groups[0].gru_type = 'AUGRU' 96 | 97 | model = DIEN( 98 | features, attention_groups=attention_groups, 99 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 100 | final_activation='sigmoid', dropout=0.3) 101 | 102 | model(next(iter(dataloader))) 103 | 104 | 105 | def test_gru_augru_neg(): 106 | dataloader, features, attention_groups = create_test_data() 107 | 108 | attention_groups[0].gru_type = 'AUGRU' 109 | 110 | model = DIEN( 111 | features, attention_groups=attention_groups, 112 | use_negsampling=True, 113 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 114 | final_activation='sigmoid', dropout=0.3) 115 | 116 | model(next(iter(dataloader))) 117 | 118 | 119 | def create_test_data_with_sharing_emb(): 120 | number_features = [ 121 | Number('userAge', StandardScaler()), 122 | Number('rating', StandardScaler())] 123 | 124 | # provide word to index mapping 125 | movie_word2idx = { 126 | '__PAD__': 0, 127 | '4226': 1, 128 | '5971': 2, 129 | '6291': 3, 130 | '7153': 4, 131 | '30707': 5, 132 | '3242': 6, 133 | '42': 7, 134 | '32': 8, 135 | '34': 9, 136 | '233': 10, 137 | '291': 11, 138 | '324': 12, 139 | '325': 13, 140 | '3542': 14, 141 | '322': 15, 142 | '33': 16, 143 | '45': 17, 144 | '__UNKNOWN__': 18} 145 | 146 | movie_idx2word = { 147 | index: word for word, index in movie_word2idx.items()} 148 | 149 | category_features = [ 150 | Category('movieId', 151 | CategoryEncoder( 152 | word2idx=movie_word2idx, 153 | idx2word=movie_idx2word), 154 | embedding_name='movieId'), 155 | Category('topGenre', CategoryEncoder(min_cnt=1))] 156 | 157 | sequence_features = [ 158 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 159 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 160 | Sequence('clickedMovieIds', 161 | SequenceEncoder( 162 | sep='|', max_len=5, 163 | word2idx=movie_word2idx, idx2word=movie_idx2word), 164 | embedding_name='movieId'), 165 | Sequence('noClickedMovieIds', 166 | SequenceEncoder( 167 | sep='|', max_len=5, 168 | word2idx=movie_word2idx, idx2word=movie_idx2word), 169 | 
embedding_name='movieId')] 170 | 171 | attention_groups = [ 172 | AttentionGroup( 173 | name='group1', 174 | pairs=[{'ad': 'movieId', 175 | 'pos_hist': 'clickedMovieIds', 176 | 'neg_hist': 'noClickedMovieIds'}], 177 | hidden_layers=[8, 4])] 178 | 179 | features = Features( 180 | number_features=number_features, 181 | category_features=category_features, 182 | sequence_features=sequence_features) 183 | 184 | dataloader, _ = prepare_dataloader(features) 185 | 186 | return dataloader, features, attention_groups 187 | 188 | 189 | def test_gru_augru_neg_with_sharing_emb(): 190 | dataloader, features, attention_groups = ( 191 | create_test_data_with_sharing_emb()) 192 | 193 | attention_groups[0].gru_type = 'AUGRU' 194 | 195 | model = DIEN( 196 | features, attention_groups=attention_groups, 197 | use_negsampling=True, 198 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 199 | final_activation='sigmoid', dropout=0.3) 200 | 201 | model(next(iter(dataloader))) 202 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_din.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch import AttentionGroup, DIN 5 | 6 | 7 | from .utils import prepare_dataloader 8 | 9 | 10 | def test_normal(): 11 | number_features = [ 12 | Number('userAge', StandardScaler()), 13 | Number('rating', StandardScaler())] 14 | 15 | category_features = [ 16 | Category('userId', CategoryEncoder(min_cnt=1)), 17 | Category('movieId', CategoryEncoder(min_cnt=1)), 18 | Category('topGenre', CategoryEncoder(min_cnt=1))] 19 | 20 | sequence_features = [ 21 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 22 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 23 | Sequence('clickedMovieIds', 24 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 25 | Sequence('clickedMovieTopGenres', 26 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 27 | 28 | attention_groups = [ 29 | AttentionGroup( 30 | name='group1', 31 | pairs=[{'ad': 'movieId', 'pos_hist': 'clickedMovieIds'}, 32 | {'ad': 'topGenre', 'pos_hist': 'clickedMovieTopGenres'}], 33 | hidden_layers=[8, 4])] 34 | 35 | features = Features( 36 | number_features=number_features, 37 | category_features=category_features, 38 | sequence_features=sequence_features) 39 | 40 | dataloader, _ = prepare_dataloader(features) 41 | 42 | model = DIN( 43 | features, attention_groups=attention_groups, 44 | num_classes=2, embedding_size=4, hidden_layers=(16, 8), 45 | final_activation='sigmoid', dropout=0.3) 46 | 47 | model(next(iter(dataloader))) 48 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_dnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from prediction_flow.features import Number, Category, Sequence, Features 4 | from prediction_flow.transformers.column import ( 5 | StandardScaler, CategoryEncoder, SequenceEncoder) 6 | from prediction_flow.pytorch import DNN 7 | 8 | 9 | from .utils import prepare_dataloader, _SAMPLE_DF 10 | 11 | 12 | def test_normal(): 13 | number_features = [ 14 | Number('userAge', StandardScaler()), 15 | Number('rating', StandardScaler())] 16 | 17 | category_features = [ 18 | Category('userId', 
CategoryEncoder(min_cnt=1)), 19 | Category('movieId', CategoryEncoder(min_cnt=1)), 20 | Category('topGenre', CategoryEncoder(min_cnt=1))] 21 | 22 | sequence_features = [ 23 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 24 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 25 | Sequence('clickedMovieIds', 26 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 27 | Sequence('clickedMovieTopGenres', 28 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 29 | 30 | features = Features( 31 | number_features=number_features, 32 | category_features=category_features, 33 | sequence_features=sequence_features) 34 | 35 | dataloader, _ = prepare_dataloader(features) 36 | 37 | model = DNN( 38 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 39 | final_activation='sigmoid', dropout=0.3) 40 | 41 | model(next(iter(dataloader))) 42 | 43 | 44 | def test_without_number_feature(): 45 | number_features = [] 46 | 47 | category_features = [ 48 | Category('userId', CategoryEncoder(min_cnt=1)), 49 | Category('movieId', CategoryEncoder(min_cnt=1)), 50 | Category('topGenre', CategoryEncoder(min_cnt=1))] 51 | 52 | sequence_features = [ 53 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 54 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 55 | Sequence('clickedMovieIds', 56 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 57 | Sequence('clickedMovieTopGenres', 58 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 59 | 60 | features = Features( 61 | number_features=number_features, 62 | category_features=category_features, 63 | sequence_features=sequence_features) 64 | 65 | dataloader, _ = prepare_dataloader(features) 66 | 67 | model = DNN( 68 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 69 | final_activation='sigmoid', dropout=0.3) 70 | 71 | model(next(iter(dataloader))) 72 | 73 | 74 | def test_without_category_feature(): 75 | number_features = [] 76 | 77 | category_features = [] 78 | 79 | sequence_features = [ 80 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 81 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 82 | Sequence('clickedMovieIds', 83 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 84 | Sequence('clickedMovieTopGenres', 85 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 86 | 87 | features = Features( 88 | number_features=number_features, 89 | category_features=category_features, 90 | sequence_features=sequence_features) 91 | 92 | dataloader, _ = prepare_dataloader(features) 93 | 94 | model = DNN( 95 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 96 | final_activation='sigmoid', dropout=0.3) 97 | 98 | model(next(iter(dataloader))) 99 | 100 | 101 | def test_only_with_number_features(): 102 | number_features = [ 103 | Number('userAge', StandardScaler()), 104 | Number('rating', StandardScaler())] 105 | 106 | category_features = [] 107 | 108 | sequence_features = [] 109 | 110 | features = Features( 111 | number_features=number_features, 112 | category_features=category_features, 113 | sequence_features=sequence_features) 114 | 115 | dataloader, _ = prepare_dataloader(features) 116 | 117 | model = DNN( 118 | features, num_classes=2, embedding_size=4, hidden_layers=(8, 4), 119 | final_activation='sigmoid', dropout=0.3) 120 | 121 | model(next(iter(dataloader))) 122 | 123 | 124 | def test_shared_embedding(): 125 | number_features = [] 126 | 127 | movie_enc = SequenceEncoder(sep='|', min_cnt=1, max_len=5) 128 | genre_enc = SequenceEncoder(sep='|', min_cnt=1, max_len=5) 129 | 130 | 
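# Fit the shared encoders on the union of the ad-side column and the
# clicked-history column so that both features share one vocabulary
# and, through embedding_name, one embedding table.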
movie_enc.fit( 131 | np.concatenate((_SAMPLE_DF.clickedMovieIds.values, 132 | _SAMPLE_DF.movieId.values), axis=None)) 133 | 134 | genre_enc.fit( 135 | np.concatenate((_SAMPLE_DF.clickedMovieTopGenres.values, 136 | _SAMPLE_DF.topGenre.values), axis=None)) 137 | 138 | category_features = [ 139 | Category('userId', CategoryEncoder(min_cnt=1)), 140 | Category('movieId', 141 | CategoryEncoder( 142 | min_cnt=1, 143 | word2idx=movie_enc.word2idx, 144 | idx2word=movie_enc.idx2word), 145 | embedding_name='movieId'), 146 | Category('topGenre', 147 | CategoryEncoder( 148 | min_cnt=1, 149 | word2idx=genre_enc.word2idx, 150 | idx2word=genre_enc.idx2word), 151 | embedding_name='topGenre', embedding_size=8)] 152 | 153 | sequence_features = [ 154 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 155 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 156 | Sequence('clickedMovieIds', 157 | SequenceEncoder( 158 | sep='|', 159 | min_cnt=1, 160 | max_len=5, 161 | word2idx=movie_enc.word2idx, 162 | idx2word=movie_enc.idx2word), 163 | embedding_name='movieId'), 164 | Sequence('clickedMovieTopGenres', 165 | SequenceEncoder( 166 | sep='|', 167 | min_cnt=1, 168 | max_len=5, 169 | word2idx=genre_enc.word2idx, 170 | idx2word=genre_enc.idx2word), 171 | embedding_name='topGenre', embedding_size=8)] 172 | 173 | features = Features( 174 | number_features=number_features, 175 | category_features=category_features, 176 | sequence_features=sequence_features) 177 | 178 | dataloader, _ = prepare_dataloader(features) 179 | 180 | model = DNN( 181 | features, num_classes=2, embedding_size=16, hidden_layers=(8, 4), 182 | final_activation='sigmoid', dropout=0.3) 183 | 184 | model(next(iter(dataloader))) 185 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/test_wide_deep.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.features import Number, Category, Sequence, Features 2 | from prediction_flow.transformers.column import ( 3 | StandardScaler, CategoryEncoder, SequenceEncoder) 4 | from prediction_flow.pytorch import WideDeep 5 | 6 | 7 | from .utils import prepare_dataloader 8 | 9 | 10 | def test_normal(): 11 | number_features = [ 12 | Number('userAge', StandardScaler()), 13 | Number('rating', StandardScaler())] 14 | 15 | category_features = [ 16 | Category('userId', CategoryEncoder(min_cnt=1)), 17 | Category('movieId', CategoryEncoder(min_cnt=1)), 18 | Category('topGenre', CategoryEncoder(min_cnt=1))] 19 | 20 | sequence_features = [ 21 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 22 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 23 | Sequence('clickedMovieIds', 24 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 25 | Sequence('clickedMovieTopGenres', 26 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 27 | 28 | features = Features( 29 | number_features=number_features, 30 | category_features=category_features, 31 | sequence_features=sequence_features) 32 | 33 | wide_features = ['rating', 'title', 'genres'] 34 | deep_features = ['userAge', 'rating', 'userId', 'movieId', 'topGenre', 35 | 'clickedMovieIds', 'clickedMovieTopGenres'] 36 | cross_features = [('movieId', 'clickedMovieIds'), 37 | ('topGenre', 'clickedMovieTopGenres')] 38 | 39 | dataloader, _ = prepare_dataloader(features) 40 | 41 | model = WideDeep( 42 | features, wide_features, deep_features, cross_features, 43 | num_classes=2, embedding_size=4, hidden_layers=(8, 4), 44 | final_activation='sigmoid', 
dropout=0.3) 45 | 46 | model(next(iter(dataloader))) 47 | 48 | 49 | def test_without_number_feature(): 50 | number_features = [] 51 | 52 | category_features = [ 53 | Category('userId', CategoryEncoder(min_cnt=1)), 54 | Category('movieId', CategoryEncoder(min_cnt=1)), 55 | Category('topGenre', CategoryEncoder(min_cnt=1))] 56 | 57 | sequence_features = [ 58 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 59 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 60 | Sequence('clickedMovieIds', 61 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 62 | Sequence('clickedMovieTopGenres', 63 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 64 | 65 | features = Features( 66 | number_features=number_features, 67 | category_features=category_features, 68 | sequence_features=sequence_features) 69 | 70 | wide_features = ['title', 'genres'] 71 | deep_features = ['userId', 'movieId', 'topGenre', 72 | 'clickedMovieIds', 'clickedMovieTopGenres'] 73 | cross_features = [('movieId', 'clickedMovieIds'), 74 | ('topGenre', 'clickedMovieTopGenres')] 75 | 76 | dataloader, _ = prepare_dataloader(features) 77 | 78 | model = WideDeep( 79 | features, wide_features, deep_features, cross_features, 80 | num_classes=2, embedding_size=4, hidden_layers=(8, 4), 81 | final_activation='sigmoid', dropout=0.3) 82 | 83 | model(next(iter(dataloader))) 84 | 85 | 86 | def test_without_category_feature(): 87 | number_features = [] 88 | 89 | category_features = [] 90 | 91 | sequence_features = [ 92 | Sequence('title', SequenceEncoder(sep='|', min_cnt=1)), 93 | Sequence('genres', SequenceEncoder(sep='|', min_cnt=1)), 94 | Sequence('clickedMovieIds', 95 | SequenceEncoder(sep='|', min_cnt=1, max_len=5)), 96 | Sequence('clickedMovieTopGenres', 97 | SequenceEncoder(sep='|', min_cnt=1, max_len=5))] 98 | 99 | features = Features( 100 | number_features=number_features, 101 | category_features=category_features, 102 | sequence_features=sequence_features) 103 | 104 | wide_features = ['title', 'genres'] 105 | deep_features = ['clickedMovieIds', 'clickedMovieTopGenres'] 106 | 107 | dataloader, _ = prepare_dataloader(features) 108 | 109 | model = WideDeep( 110 | features, wide_features, deep_features, [], 111 | num_classes=2, embedding_size=4, hidden_layers=(8, 4), 112 | final_activation='sigmoid', dropout=0.3) 113 | 114 | model(next(iter(dataloader))) 115 | 116 | 117 | def test_only_with_number_features(): 118 | number_features = [ 119 | Number('userAge', StandardScaler()), 120 | Number('rating', StandardScaler())] 121 | 122 | category_features = [] 123 | 124 | sequence_features = [] 125 | 126 | features = Features( 127 | number_features=number_features, 128 | category_features=category_features, 129 | sequence_features=sequence_features) 130 | 131 | wide_features = ['rating', 'userAge'] 132 | 133 | dataloader, _ = prepare_dataloader(features) 134 | 135 | model = WideDeep( 136 | features, wide_features, [], [], 137 | num_classes=2, embedding_size=4, hidden_layers=(8, 4), 138 | final_activation='sigmoid', dropout=0.3) 139 | 140 | model(next(iter(dataloader))) 141 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/tests/utils.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.pytorch.data import Dataset 2 | 3 | 4 | import pandas as pd 5 | import torch.utils.data as data 6 | 7 | 8 | _SAMPLE_DF = pd.DataFrame({ 9 | 'userId': [11, 11, 11, 11, 11], 10 | 'userAge': [23, 21, 19, 17, 41], 11 | 'movieId': ['4226', '5971', 
'6291', '7153', '30707'], 12 | 'rating': [3.0, 2.0, 4.0, 4.6, 5.0], 13 | 'timestamp': [1294796159, 1294796201, 1294796113, 1294796132, 1294796176], 14 | 'title': ['Memento (2000)', 15 | 'My Neighbor Totoro (Tonari no Totoro) (1988)', 16 | 'Lilya 4-Ever (Lilja 4-ever) (2002)', 17 | 'Lord of the Rings: The Return of the King, The (2003)', 18 | 'Million Dollar Baby (2004)'], 19 | 'genres': [ 20 | 'Mystery|Thriller', 21 | 'Animation|Children|Drama|Fantasy', 22 | 'Crime|Drama', 23 | 'Action|Adventure|Drama|Fantasy', 24 | 'Drama'], 25 | 'topGenre': [ 26 | 'Mystery', 27 | 'Animation', 28 | 'Crime', 29 | 'Action', 30 | 'Drama'], 31 | 'clickedMovieIds': [ 32 | '5971|6291', 33 | '3242|42', 34 | '32|43542|3222|3', 35 | '', 36 | '34|23'], 37 | 'clickedMovieTopGenres': [ 38 | 'Animation|Mystery', 39 | 'Drama', 40 | 'Drama|Drama|Drama|Drama', 41 | '', 42 | 'Mystery|Crime'], 43 | 'noClickedMovieIds': [ 44 | '233|291', 45 | '324|421', 46 | '325|3542|322|33', 47 | '', 48 | '45|48'], 49 | 'noClickedMovieTopGenres': [ 50 | 'Drama|Crime', 51 | 'Animation|Mystery', 52 | 'Mystery|Animation|Crime|Drama', 53 | '', 54 | 'Crime|Mystery'], 55 | 'label': [1, 0, 0, 1, 0]}) 56 | 57 | 58 | def prepare_dataloader(features): 59 | features.fit(_SAMPLE_DF) 60 | 61 | X_map = features.transform(_SAMPLE_DF) 62 | 63 | dataset = Dataset(features, X_map, _SAMPLE_DF.label.values) 64 | 65 | dataloader = data.DataLoader( 66 | dataset, batch_size=_SAMPLE_DF.shape[0], shuffle=False) 67 | 68 | return dataloader, _SAMPLE_DF 69 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.init as init 3 | 4 | 5 | def init_weights(model): 6 | if isinstance(model, nn.Linear): 7 | if model.weight is not None: 8 | init.kaiming_uniform_(model.weight.data) 9 | if model.bias is not None: 10 | init.normal_(model.bias.data) 11 | elif isinstance(model, nn.BatchNorm1d): 12 | if model.weight is not None: 13 | init.normal_(model.weight.data, mean=1, std=0.02) 14 | if model.bias is not None: 15 | init.constant_(model.bias.data, 0) 16 | elif isinstance(model, nn.BatchNorm2d): 17 | if model.weight is not None: 18 | init.normal_(model.weight.data, mean=1, std=0.02) 19 | if model.bias is not None: 20 | init.constant_(model.bias.data, 0) 21 | elif isinstance(model, nn.BatchNorm3d): 22 | if model.weight is not None: 23 | init.normal_(model.weight.data, mean=1, std=0.02) 24 | if model.bias is not None: 25 | init.constant_(model.bias.data, 0) 26 | else: 27 | pass 28 | -------------------------------------------------------------------------------- /prediction_flow/pytorch/wide_deep.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wide&Deep Model. 3 | """ 4 | from collections import OrderedDict 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from .base import EmbeddingMixin 10 | from .nn import MLP, SumPooling 11 | from .utils import init_weights 12 | 13 | 14 | class WideDeep(nn.Module, EmbeddingMixin): 15 | """Wide&Deep Model. 16 | 17 | Parameters 18 | ---------- 19 | features : Features 20 | 21 | wide_features : list of str 22 | Feature names for wide part. 23 | 24 | deep_features : list of str 25 | Feature names for deep part. 26 | 27 | cross_features: list of tuple 28 | Cross sparse feature names for wide part. 29 | 30 | num_classes : int 31 | Number of classes. 32 | 33 | embedding_size : int 34 | Size of embedding. 
35 | 36 | hidden_layers : list 37 | Size of hidden layers. 38 | Example: [96, 32] 39 | 40 | activation : str 41 | Activation function. 42 | Example: relu 43 | 44 | final_activation : str 45 | Activation function of output. 46 | 47 | dropout : float 48 | Dropout rate. 49 | """ 50 | def __init__(self, features, wide_features, deep_features, cross_features, 51 | num_classes, embedding_size, hidden_layers, 52 | activation='relu', final_activation=None, dropout=0.0): 53 | super(WideDeep, self).__init__() 54 | self.features = features 55 | self.wide_features = wide_features 56 | self.deep_features = deep_features 57 | self.cross_features = cross_features 58 | self.num_classes = num_classes 59 | self.final_activation = final_activation 60 | 61 | self.embeddings, self.embedding_sizes = self.build_embeddings( 62 | embedding_size) 63 | 64 | self._sequence_poolings = OrderedDict() 65 | 66 | wide_input_size = 0 67 | deep_input_size = 0 68 | 69 | for feature in self.features.number_features: 70 | if feature.name in self.wide_features: 71 | wide_input_size += 1 72 | if feature.name in self.deep_features: 73 | deep_input_size += 1 74 | 75 | for feature in self.features.category_features: 76 | if feature.name in self.wide_features: 77 | wide_input_size += self.embedding_sizes[feature.name] 78 | if feature.name in self.deep_features: 79 | deep_input_size += self.embedding_sizes[feature.name] 80 | 81 | for feature in self.features.sequence_features: 82 | self._sequence_poolings[feature.name] = SumPooling(1) 83 | self.add_module( 84 | f"pooling:{feature.name}", 85 | self._sequence_poolings[feature.name]) 86 | if feature.name in self.wide_features: 87 | wide_input_size += self.embedding_sizes[feature.name] 88 | if feature.name in self.deep_features: 89 | deep_input_size += self.embedding_sizes[feature.name] 90 | 91 | # plus cross embedding size 92 | wide_input_size += len(self.cross_features) * embedding_size 93 | 94 | final_layer_input_size = wide_input_size 95 | 96 | if deep_input_size: 97 | self.mlp = MLP( 98 | deep_input_size, 99 | hidden_layers, 100 | dropout=dropout, batchnorm=True, activation=activation) 101 | final_layer_input_size += hidden_layers[-1] 102 | 103 | output_size = self.num_classes 104 | 105 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 106 | output_size -= 1 107 | 108 | self.final_layer = nn.Linear(final_layer_input_size, output_size) 109 | 110 | self.apply(init_weights) 111 | 112 | def forward(self, x): 113 | wide_inputs = list() 114 | deep_inputs = list() 115 | cross_inputs = list() 116 | 117 | for feature in self.features.number_features: 118 | if feature.name in self.wide_features: 119 | wide_inputs.append(x[feature.name].view(-1, 1)) 120 | if feature.name in self.deep_features: 121 | deep_inputs.append(x[feature.name].view(-1, 1)) 122 | 123 | for feature in self.features.category_features: 124 | if feature.name in self.wide_features: 125 | wide_inputs.append( 126 | self.embeddings[feature.name](x[feature.name])) 127 | if feature.name in self.deep_features: 128 | deep_inputs.append( 129 | self.embeddings[feature.name](x[feature.name])) 130 | 131 | for feature in self.features.sequence_features: 132 | if feature.name in self.wide_features: 133 | wide_inputs.append( 134 | self._sequence_poolings[feature.name]( 135 | self.embeddings[feature.name]( 136 | x[feature.name]))) 137 | if feature.name in self.deep_features: 138 | deep_inputs.append( 139 | self._sequence_poolings[feature.name]( 140 | self.embeddings[feature.name]( 141 | x[feature.name]))) 142 | 143 | # 
prepare cross features 144 | for x_f, y_f in self.cross_features: 145 | if x_f in self._sequence_poolings: 146 | x_emb = self._sequence_poolings[x_f]( 147 | self.embeddings[x_f](x[x_f])) 148 | else: 149 | x_emb = self.embeddings[x_f](x[x_f]) 150 | 151 | if y_f in self._sequence_poolings: 152 | y_emb = self._sequence_poolings[y_f]( 153 | self.embeddings[y_f](x[y_f])) 154 | else: 155 | y_emb = self.embeddings[y_f](x[y_f]) 156 | cross_inputs.append(x_emb * y_emb) 157 | 158 | final_layer_inputs = list() 159 | if wide_inputs: 160 | final_layer_inputs.append(torch.cat(wide_inputs, dim=1)) 161 | 162 | if cross_inputs: 163 | final_layer_inputs.append(torch.cat(cross_inputs, dim=1)) 164 | 165 | if deep_inputs: 166 | final_layer_inputs.append(self.mlp(torch.cat(deep_inputs, dim=1))) 167 | 168 | output = self.final_layer(torch.cat(final_layer_inputs, dim=1)) 169 | 170 | if self.num_classes == 2 and self.final_activation == 'sigmoid': 171 | output = torch.sigmoid(output) 172 | elif self.num_classes > 1 and self.final_activation == 'softmax': 173 | output = torch.softmax(output, dim=1) 174 | elif self.final_activation: 175 | raise NotImplementedError( 176 | f"pair (final_activation: {self.final_activation}, " 177 | f"num_classes: {self.num_classes}) is not implemented") 178 | 179 | return output 180 | -------------------------------------------------------------------------------- /prediction_flow/transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/transformers/__init__.py -------------------------------------------------------------------------------- /prediction_flow/transformers/column/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Column 2 | from .log_transformer import LogTransformer 3 | from .standard_scaler import StandardScaler 4 | from .category_encoder import CategoryEncoder 5 | from .sequence_encoder import SequenceEncoder 6 | from .column_flow import ColumnFlow 7 | 8 | 9 | __all__ = [ 10 | 'StandardScaler', 11 | 'LogTransformer', 12 | 'CategoryEncoder', 13 | 'SequenceEncoder', 14 | 'ColumnFlow', 15 | 'Column' 16 | ] 17 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for all column-oriented transformer classes 3 | with fit/transform functions. 4 | """ 5 | 6 | # Authors: Hongwei Zhang 7 | # License: MIT 8 | 9 | 10 | from abc import ABC, abstractmethod 11 | from enum import Enum 12 | 13 | 14 | class Column(ABC): 15 | """Base class for all column-oriented transformer classes 16 | with fit/transform functions. 17 | """ 18 | 19 | @abstractmethod 20 | def fit(self, x, y=None): 21 | """Fit this transformer. 22 | 23 | Parameters 24 | ---------- 25 | x : array-like 26 | One column of training data. 27 | y : array-like, default=None 28 | Training targets. 29 | """ 30 | 31 | raise NotImplementedError 32 | 33 | @abstractmethod 34 | def transform(self, x): 35 | """Transform x by this fitted transformer. 36 | 37 | Parameters 38 | ---------- 39 | x : array-like 40 | Column data to be transformed.
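
        Returns
        -------
        transformed_x : array-like
            Transformed data.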
41 | """ 42 | 43 | raise NotImplementedError 44 | 45 | 46 | class ColumnType(Enum): 47 | NUMBER = 1 48 | CATEGORY = 2 49 | SEQUENCE = 3 50 | 51 | 52 | class NumberColumn(Column): 53 | """Base class for all column-orientation number type transformer classes 54 | with fit/transform functions. 55 | """ 56 | column_type = ColumnType.NUMBER 57 | 58 | 59 | class CategoryColumn(Column): 60 | """Base class for all column-orientation category type transformer classes 61 | with fit/transform functions. 62 | """ 63 | column_type = ColumnType.CATEGORY 64 | 65 | @abstractmethod 66 | def dimension(self): 67 | """Number of unique terms. 68 | """ 69 | raise NotImplementedError 70 | 71 | 72 | class SequenceColumn(Column): 73 | """Base class for all column-orientation sequence type transformer classes 74 | with fit/transform functions. 75 | """ 76 | column_type = ColumnType.SEQUENCE 77 | 78 | @abstractmethod 79 | def dimension(self): 80 | """Number of unique terms. 81 | """ 82 | raise NotImplementedError 83 | 84 | @abstractmethod 85 | def max_length(self): 86 | """Maximum length of one sequence. 87 | """ 88 | raise NotImplementedError 89 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/category_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | CatagoryEncoder to convert term to number. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import numpy as np 10 | from collections import Counter 11 | 12 | from .base import CategoryColumn 13 | 14 | 15 | class CategoryEncoder(CategoryColumn): 16 | """Encoder for category type feature. 17 | 18 | Parameters 19 | ---------- 20 | min_cnt : int, default=5 21 | Minimum count of term. 22 | 23 | word2idx : dict 24 | Mappings from term to index. 25 | 26 | idx2word : dict 27 | Mappings from index to term. 28 | 29 | Attributes 30 | ---------- 31 | min_cnt : int, default=5 32 | Minimum count of term. 33 | 34 | word2idx : dict 35 | Mappings from term to index. 36 | 37 | idx2word : dict 38 | Mappings from index to term. 39 | """ 40 | def __init__(self, min_cnt=5, word2idx=None, idx2word=None): 41 | self.min_cnt = min_cnt 42 | 43 | self.word2idx = word2idx if word2idx else dict() 44 | self.idx2word = idx2word if idx2word else dict() 45 | 46 | def fit(self, x, y=None): 47 | """Fit this transformer. 48 | 49 | Parameters 50 | ---------- 51 | x : array-like 52 | One column of training data. 53 | y : array-like, default=None, ignored 54 | Training targets. 55 | 56 | Returns 57 | ------- 58 | self : CategoryEncoder 59 | This CategoryEncoder. 60 | """ 61 | if not self.word2idx: 62 | counter = Counter(np.asarray(x).ravel()) 63 | 64 | selected_terms = sorted( 65 | list(filter(lambda x: counter[x] >= self.min_cnt, counter))) 66 | 67 | self.word2idx = dict( 68 | zip(selected_terms, range(1, len(selected_terms) + 1))) 69 | self.word2idx['__PAD__'] = 0 70 | if '__UNKNOWN__' not in self.word2idx: 71 | self.word2idx['__UNKNOWN__'] = len(self.word2idx) 72 | 73 | if not self.idx2word: 74 | self.idx2word = { 75 | index: word for word, index in self.word2idx.items()} 76 | 77 | return self 78 | 79 | def transform(self, x): 80 | """Transform x by this fitted transformer. 81 | 82 | Parameters 83 | ---------- 84 | x : array-like 85 | Column data to be transformed. 86 | 87 | Returns 88 | ------- 89 | transformed_x : array-like 90 | Transformed data. 
91 | """ 92 | transformed_x = list() 93 | for term in np.asarray(x).ravel(): 94 | try: 95 | transformed_x.append(self.word2idx[term]) 96 | except KeyError: 97 | transformed_x.append(self.word2idx['__UNKNOWN__']) 98 | 99 | return np.asarray(transformed_x, dtype=np.int64) 100 | 101 | def dimension(self): 102 | return len(self.word2idx) 103 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/column_flow.py: -------------------------------------------------------------------------------- 1 | """ColumnFlow contaions a chain of column-orientation 2 | transformers (implementing fit/transform). 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import numpy as np 10 | 11 | 12 | class ColumnFlow(object): 13 | """ColumnFlow contaions a chain of column-orientation 14 | transformers (implementing fit/transform). 15 | 16 | Parameters 17 | ---------- 18 | transformers : list 19 | List of column transformers (implementing fit/transform) that are 20 | chained, in the order in which they are chained. 21 | 22 | verbose : boolean, optional 23 | If True, the log while fitting each transformer will be printed. 24 | 25 | Attributes 26 | ---------- 27 | transformers : list 28 | List of column transformers (implementing fit/transform) that are 29 | chained, in the order in which they are chained. 30 | 31 | verbose : boolean, optional 32 | If True, the log while fitting each transformer will be printed. 33 | """ 34 | 35 | def __init__(self, transformers, verbose=False): 36 | ColumnFlow.__check_transformers(transformers) 37 | self.transformers = transformers 38 | self.verbose = verbose 39 | 40 | @staticmethod 41 | def __check_transformers(transformers): 42 | if not isinstance(transformers, list): 43 | raise TypeError( 44 | "transformers must be list type, not {type(transformers)}") 45 | 46 | types = [ 47 | transformer.column_type for transformer in transformers] 48 | 49 | if len(set(types)) != 1: 50 | raise ValueError("transformers must be the same type, not {types}") 51 | 52 | def fit(self, x, y=None): 53 | """Fit all transformers one after the other. 54 | 55 | Parameters 56 | ---------- 57 | x : array-like 58 | One column of training data. 59 | y : array-like, default=None 60 | Training targets. 61 | 62 | Returns 63 | ------- 64 | self : ColumnFlow 65 | This flow. 66 | """ 67 | transformed_x = np.asarray(x).ravel() 68 | for transformer in self.transformers: 69 | transformer.fit(transformed_x, y) 70 | transformed_x = transformer.transform(transformed_x) 71 | 72 | return self 73 | 74 | def transform(self, x): 75 | """Transform x by all fitted transformers. 76 | 77 | Parameters 78 | ---------- 79 | x : array-like 80 | Column data to be transformed. 81 | 82 | Returns 83 | ------- 84 | transformed_x : array-like 85 | Transformed data. 86 | """ 87 | transformed_x = np.asarray(x).ravel() 88 | for transformer in self.transformers: 89 | transformed_x = transformer.transform(transformed_x) 90 | 91 | return transformed_x 92 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/log_transformer.py: -------------------------------------------------------------------------------- 1 | """ 2 | LogTransformer to convert number feature. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import numpy as np 10 | 11 | from .base import NumberColumn 12 | 13 | 14 | class LogTransformer(NumberColumn): 15 | """LogTransformer to convert number feature. 
16 | """ 17 | def fit(self, x, y=None): 18 | """Fit this transformer. 19 | 20 | Parameters 21 | ---------- 22 | x : array-like 23 | One column of training data. 24 | y : array-like, default=None, ignored 25 | Training targets. 26 | 27 | Returns 28 | ------- 29 | self : LogTransformer 30 | This LogTransformer. 31 | """ 32 | return self 33 | 34 | def transform(self, x): 35 | """ log(1 + x) when x > 0 else x 36 | 37 | Parameters 38 | ---------- 39 | x : array-like 40 | Column data to be transformed. 41 | 42 | Returns 43 | ---------- 44 | res: array-like 45 | """ 46 | res = x.copy().astype(np.float).ravel() 47 | mask = x > 0.0 48 | res[mask] = np.log(1 + x[mask]) 49 | return res 50 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/sequence_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | SequenceEncoder to convert sequence terms to sequence number. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | from collections import Counter 10 | import numpy as np 11 | 12 | from .base import SequenceColumn 13 | 14 | 15 | class SequenceEncoder(SequenceColumn): 16 | """Encoder for sequence type feature. Convert terms to numbers. 17 | First index is 1. 18 | 19 | Parameters 20 | ---------- 21 | sep : str, default=' ' 22 | Separator of input sequence. 23 | 24 | min_cnt : int, default=5 25 | Minimum count of term. 26 | 27 | max_len: int, default=None 28 | Maximum length of sequence. If none is given, 29 | the maximum length of training sequence will be used. 30 | 31 | word2idx : dict 32 | Mappings from term to index. 33 | 34 | idx2word : dict 35 | Mappings from index to term. 36 | 37 | Attributes 38 | ---------- 39 | sep : str, default=' ' 40 | Separator of input sequence. 41 | 42 | min_cnt : int, default=5 43 | Minimum count of term. 44 | 45 | max_len: int, default=None 46 | Maximum length of sequence. If none is given, 47 | the maximum length of training sequence will be used. 48 | 49 | word2idx : dict 50 | Mappings from term to index. 51 | 52 | idx2word : dict 53 | Mappings from index to term. 54 | """ 55 | def __init__(self, sep=' ', min_cnt=5, max_len=None, 56 | word2idx=None, idx2word=None): 57 | self.sep = sep 58 | self.min_cnt = min_cnt 59 | self.max_len = max_len 60 | 61 | self.word2idx = word2idx if word2idx else dict() 62 | self.idx2word = idx2word if idx2word else dict() 63 | 64 | def fit(self, x, y=None): 65 | """Fit this transformer. 66 | 67 | Parameters 68 | ---------- 69 | x : array-like 70 | One column of training data. 71 | y : array-like, default=None 72 | Training targets. 73 | 74 | Returns 75 | ------- 76 | self : SequenceEncoder 77 | This SequenceEncoder. 
78 | """ 79 | 80 | if not self.word2idx: 81 | counter = Counter() 82 | 83 | max_len = 0 84 | for sequence in np.array(x).ravel(): 85 | words = sequence.split(self.sep) 86 | counter.update(words) 87 | max_len = max(max_len, len(words)) 88 | 89 | if self.max_len is None: 90 | self.max_len = max_len 91 | 92 | # drop rare words 93 | words = sorted( 94 | list(filter(lambda x: counter[x] >= self.min_cnt, counter))) 95 | 96 | self.word2idx = dict(zip(words, range(1, len(words) + 1))) 97 | self.word2idx['__PAD__'] = 0 98 | if '__UNKNOWN__' not in self.word2idx: 99 | self.word2idx['__UNKNOWN__'] = len(self.word2idx) 100 | 101 | if not self.idx2word: 102 | self.idx2word = { 103 | index: word for word, index in self.word2idx.items()} 104 | 105 | if not self.max_len: 106 | max_len = 0 107 | for sequence in np.array(x).ravel(): 108 | words = sequence.split(self.sep) 109 | max_len = max(max_len, len(words)) 110 | self.max_len = max_len 111 | 112 | return self 113 | 114 | def transform(self, x): 115 | """Transform x by this fitted transformer. 116 | 117 | Parameters 118 | ---------- 119 | x : array-like 120 | Column data to be transformed. 121 | 122 | Returns 123 | ------- 124 | transformed_x : array-like 125 | Transformed data. 126 | """ 127 | transformed_x = list() 128 | 129 | for sequence in np.asarray(x).ravel(): 130 | words = list() 131 | for word in sequence.split(self.sep): 132 | try: 133 | words.append(self.word2idx[word]) 134 | except KeyError: 135 | words.append(self.word2idx['__UNKNOWN__']) 136 | 137 | transformed_x.append( 138 | np.asarray(words[0:self.max_len], dtype=np.int64)) 139 | 140 | return np.asarray(transformed_x, dtype=np.object) 141 | 142 | def dimension(self): 143 | """Number of unique terms. 144 | """ 145 | return len(self.word2idx) 146 | 147 | def max_length(self): 148 | """Maximum length of one sequence. 149 | """ 150 | return self.max_len 151 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/standard_scaler.py: -------------------------------------------------------------------------------- 1 | """ 2 | StandardScaler to convert term to number. 3 | """ 4 | 5 | # Authors: Hongwei Zhang 6 | # License: MIT 7 | 8 | 9 | import numpy as np 10 | import sklearn.preprocessing as sk 11 | 12 | from .base import NumberColumn 13 | 14 | 15 | class StandardScaler(NumberColumn): 16 | """Normalize number feature. 17 | """ 18 | def __init__(self): 19 | self.__scaler = sk.StandardScaler() 20 | 21 | def fit(self, x, y=None): 22 | """Fit this transformer. 23 | 24 | Parameters 25 | ---------- 26 | x : array-like 27 | One column of training data. 28 | y : array-like, default=None, ignored 29 | Training targets. 30 | 31 | Returns 32 | ------- 33 | self : StandardScaler 34 | This StandardScaler. 35 | """ 36 | self.__scaler.fit(np.asarray(x, dtype=np.float).reshape(-1, 1)) 37 | return self 38 | 39 | def transform(self, x): 40 | """Transform x by this fitted transformer. 41 | 42 | Parameters 43 | ---------- 44 | x : array-like 45 | Column data to be transformed. 46 | 47 | Returns 48 | ------- 49 | transformed_x : array-like 50 | Transformed data. 
51 | """ 52 | transformed_x = self.__scaler.transform( 53 | np.asarray(x, dtype=np.float32).reshape(-1, 1)) 54 | 55 | return transformed_x.ravel() 56 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/transformers/column/tests/__init__.py -------------------------------------------------------------------------------- /prediction_flow/transformers/column/tests/test_category_encoder.py: -------------------------------------------------------------------------------- 1 | from prediction_flow.transformers.column import CategoryEncoder 2 | 3 | 4 | def test_str_inputs(): 5 | category_encoder = CategoryEncoder(min_cnt=1) 6 | 7 | input_terms = ['this', 'is', 'a', 'simple', 'test'] 8 | 9 | category_encoder.fit(input_terms) 10 | 11 | transformed = category_encoder.transform(input_terms) 12 | 13 | assert set(transformed) == {1, 2, 3, 4, 5} 14 | assert category_encoder.dimension() == 7 15 | 16 | 17 | def test_int_inputs(): 18 | category_encoder = CategoryEncoder(min_cnt=1) 19 | 20 | input_terms = [345, 3434, 23, 88, 4] 21 | 22 | category_encoder.fit(input_terms) 23 | 24 | transformed = category_encoder.transform(input_terms) 25 | 26 | assert set(transformed) == {1, 2, 3, 4, 5} 27 | assert category_encoder.dimension() == 7 28 | 29 | 30 | def test_unseen_inputs(): 31 | category_encoder = CategoryEncoder(min_cnt=1) 32 | 33 | input_terms = [345, 3434, 23, 88, 4] 34 | 35 | category_encoder.fit(input_terms) 36 | 37 | transformed = category_encoder.transform([345, 5343]) 38 | 39 | assert set(transformed) == {4, 6} 40 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/tests/test_column_flow.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from prediction_flow.transformers.column import ( 4 | LogTransformer, CategoryEncoder, ColumnFlow) 5 | 6 | 7 | def test_wrong_type_transformers(): 8 | with pytest.raises(TypeError): 9 | ColumnFlow({CategoryEncoder()}) 10 | 11 | 12 | def test_multi_type_transformers(): 13 | with pytest.raises(ValueError): 14 | ColumnFlow([LogTransformer(), CategoryEncoder()]) 15 | 16 | 17 | def test_transformers(): 18 | column_flow = ColumnFlow([CategoryEncoder(min_cnt=1)]) 19 | 20 | input_terms = ['this', 'is', 'a', 'simple', 'test'] 21 | 22 | column_flow.fit(input_terms) 23 | 24 | transformed = column_flow.transform(input_terms) 25 | 26 | assert set(transformed) == {1, 2, 3, 4, 5} 27 | assert column_flow.transformers[-1].dimension() == 7 28 | assert isinstance(input_terms, list) == True 29 | -------------------------------------------------------------------------------- /prediction_flow/transformers/column/tests/test_log_transformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from prediction_flow.transformers.column import LogTransformer 4 | 5 | 6 | def test_normal(): 7 | log_transformer = LogTransformer() 8 | 9 | x = np.array([100, 10, 32]) 10 | log_transformer.fit(x) 11 | 12 | np.testing.assert_array_almost_equal( 13 | log_transformer.transform(x), np.array([4.615121, 2.397895, 3.496508])) 14 | -------------------------------------------------------------------------------- 
--------------------------------------------------------------------------------
/prediction_flow/transformers/column/tests/test_sequence_encoder.py:
--------------------------------------------------------------------------------
1 | from prediction_flow.transformers.column import SequenceEncoder
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def test_normal():
7 |     sequence_encoder = SequenceEncoder(sep=' ', min_cnt=1, max_len=3)
8 | 
9 |     x = [
10 |         "this is a simple test",
11 |         "this class is work"
12 |     ]
13 | 
14 |     sequence_encoder.fit(x)
15 | 
16 |     actual = sequence_encoder.transform(x)
17 |     assert sequence_encoder.dimension() == 9
18 |     assert sequence_encoder.max_length() == 3
19 |     assert actual.tolist() == [[6, 3, 1], [6, 2, 3]]
20 |     assert isinstance(x, list)
21 | 
22 | 
23 | def test_unseen_inputs():
24 |     sequence_encoder = SequenceEncoder(sep=' ', min_cnt=1, max_len=10)
25 | 
26 |     x = [
27 |         "this is a simple test",
28 |         "this class is work"
29 |     ]
30 | 
31 |     sequence_encoder.fit(x)
32 | 
33 |     actual = sequence_encoder.transform(["this is an unseen test"])
34 |     assert actual.tolist() == [[6, 3, 8, 8, 5]]
35 | 
--------------------------------------------------------------------------------
/prediction_flow/transformers/column/tests/test_standard_scaler.py:
--------------------------------------------------------------------------------
1 | from prediction_flow.transformers.column import StandardScaler
2 | 
3 | import numpy as np
4 | 
5 | 
6 | def test_normal():
7 |     scaler = StandardScaler()
8 | 
9 |     x = np.array([3, 4, 2, 24, 2], dtype=np.float64)
10 | 
11 |     scaler.fit(x)
12 | 
13 |     actual = scaler.transform(x)
14 |     expected = np.array([
15 |         -0.46880723, -0.35160542, -0.58600904, 1.99243073, -0.58600904])
16 | 
17 |     np.testing.assert_array_almost_equal(actual, expected)
18 | 
--------------------------------------------------------------------------------
/prediction_flow/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitHub-HongweiZhang/prediction-flow/cad8aa1f0830f5540a6d309e542519645850ba11/prediction_flow/utils/__init__.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.16.0
2 | pandas>=0.24.2
3 | torch>=1.1.0
4 | tqdm>=4.32.0
5 | scikit-learn>=0.20.0
6 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | 
4 | from setuptools import setup, find_packages
5 | import prediction_flow
6 | 
7 | 
8 | with open('README.md') as f:
9 |     LONG_DESCRIPTION = f.read()
10 | 
11 | with open('requirements.txt') as f:
12 |     INSTALL_REQUIRES = f.read().splitlines()
13 | 
14 | DISTNAME = 'prediction-flow'
15 | DESCRIPTION = 'Deep learning based CTR models implemented in PyTorch'
16 | MAINTAINER = 'Hongwei Zhang'
17 | MAINTAINER_EMAIL = 'hw_zhang@outlook.com'
18 | URL = 'https://github.com/GitHub-HongweiZhang/prediction-flow'
19 | LICENSE = 'MIT'
20 | VERSION = prediction_flow.__version__
21 | 
22 | 
23 | def setup_package():
24 |     setup(
25 |         name=DISTNAME,
26 |         packages=find_packages(),
27 |         maintainer=MAINTAINER,
28 |         maintainer_email=MAINTAINER_EMAIL,
29 |         description=DESCRIPTION,
30 |         url=URL,
31 |         version=VERSION,
32 |         long_description=LONG_DESCRIPTION,
33 |         long_description_content_type="text/markdown",
34 |         python_requires='>=3.6',
35 |         include_package_data=True,
36 |         install_requires=INSTALL_REQUIRES,
37 |         classifiers=[
38 |             'Development Status :: 3 - Alpha',
39 |             'License :: OSI Approved :: MIT License',
40 |             'Operating System :: OS Independent',
41 |             'Intended Audience :: Science/Research',
42 |             'Intended Audience :: Developers',
43 |             'Intended Audience :: Education',
44 |             'Programming Language :: Python',
45 |             'Programming Language :: Python :: 3',
46 |             'Programming Language :: Python :: 3.6',
47 |             'Topic :: Software Development',
48 |             'Topic :: Scientific/Engineering',
49 |         ],
50 |         license=LICENSE,
51 |         keywords=[
52 |             'torch', 'ctr prediction', 'deep learning',
53 |             'deepfm', 'din', 'dnn', 'deep neural network']
54 |     )
55 | 
56 | 
57 | if __name__ == '__main__':
58 |     setup_package()
59 | 
--------------------------------------------------------------------------------
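setup.py reads VERSION from prediction_flow.__version__, so a one-line sanity check is possible after installing, assuming the package has been installed first (for example with pip install . from a checkout):

import prediction_flow

# Should print the same version string that setup.py packaged.
print(prediction_flow.__version__)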