├── .gitignore
├── README.md
├── images
│   ├── model_retrained.png
│   ├── pipeline_diagram.png
│   └── start_pipeline.gif
├── initialize.py
├── notebooks
│   ├── .ipynb_checkpoints
│   │   └── skopt_vs_hyperopt-checkpoint.ipynb
│   └── skopt_vs_hyperopt.ipynb
├── predictor.py
├── requirements.txt
├── sample_app.py
├── train
│   ├── __init__.py
│   ├── train_hyperopt.py
│   ├── train_hyperopt_mlflow.py
│   ├── train_hyperparameterhunter.py
│   └── train_hyperparameterhunter_mlfow.py
├── trainer.py
└── utils
    ├── __init__.py
    ├── feature_tools.py
    ├── messages_utils.py
    └── preprocess_data.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 |
83 | # pyenv
84 | .python-version
85 |
86 | # pipenv
87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
90 | # install all needed dependencies.
91 | #Pipfile.lock
92 |
93 | # celery beat schedule file
94 | celerybeat-schedule
95 |
96 | # SageMath parsed files
97 | *.sage.py
98 |
99 | # Environments
100 | .env
101 | .venv
102 | env/
103 | venv/
104 | ENV/
105 | env.bak/
106 | venv.bak/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 | .dmypy.json
121 | dmypy.json
122 |
123 | # Pyre type checker
124 | .pyre/
125 |
126 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
127 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
128 | .idea/
129 | # User-specific stuff
130 | .idea/**/workspace.xml
131 | .idea/**/tasks.xml
132 | .idea/**/usage.statistics.xml
133 | .idea/**/dictionaries
134 | .idea/**/shelf
135 |
136 | # Generated files
137 | .idea/**/contentModel.xml
138 |
139 | # Sensitive or high-churn files
140 | .idea/**/dataSources/
141 | .idea/**/dataSources.ids
142 | .idea/**/dataSources.local.xml
143 | .idea/**/sqlDataSources.xml
144 | .idea/**/dynamic.xml
145 | .idea/**/uiDesigner.xml
146 | .idea/**/dbnavigator.xml
147 |
148 | # Gradle
149 | .idea/**/gradle.xml
150 | .idea/**/libraries
151 |
152 | # Gradle and Maven with auto-import
153 | # When using Gradle or Maven with auto-import, you should exclude module files,
154 | # since they will be recreated, and may cause churn. Uncomment if using
155 | # auto-import.
156 | # .idea/modules.xml
157 | # .idea/*.iml
158 | # .idea/modules
159 | # *.iml
160 | # *.ipr
161 |
162 | # CMake
163 | cmake-build-*/
164 |
165 | # Mongo Explorer plugin
166 | .idea/**/mongoSettings.xml
167 |
168 | # File-based project format
169 | *.iws
170 |
171 | # IntelliJ
172 | out/
173 |
174 | # mpeltonen/sbt-idea plugin
175 | .idea_modules/
176 |
177 | # JIRA plugin
178 | atlassian-ide-plugin.xml
179 |
180 | # Cursive Clojure plugin
181 | .idea/replstate.xml
182 |
183 | # Crashlytics plugin (for Android Studio and IntelliJ)
184 | com_crashlytics_export_strings.xml
185 | crashlytics.properties
186 | crashlytics-build.properties
187 | fabric.properties
188 |
189 | # Editor-based Rest Client
190 | .idea/httpRequests
191 |
192 | # Android studio 3.1+ serialized cache file
193 | .idea/caches/build_file_checksums.ser
194 |
195 | data/
196 | mlruns/
197 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Putting ML in Production
2 | This repo contains code that we hope is useful to illustrate how one could productionise a real-time algorithm. The companion Medium posts can be found [here](https://medium.com/@jrzaurin/putting-ml-in-production-i-using-apache-kafka-in-python-ce06b3a395c8) and [here](https://towardsdatascience.com/putting-ml-in-production-ii-logging-and-monitoring-algorithms-91f174044e4e). The code here is meant to be as generic as possible (within certain limits) and is designed to be useful in a scenario similar to the following one.
3 |
4 | ## Scenario
5 |
6 | A company collects data using a series of services that generate events as the users/customers interact with the company’s website or app. As these interactions happen, an algorithm needs to run in real time and some immediate action needs to be taken based on the algorithm’s outputs (or predictions). On top of that, after *N* interactions (or observations) the algorithm needs to be retrained without stopping the prediction service, since users will keep interacting.
7 |
8 | For the exercise here we have used the [Adult](https://archive.ics.uci.edu/ml/datasets/adult) dataset, where the goal is to predict whether individuals earn an income higher/lower than 50k based on their age, native country, etc. To adapt this dataset to the scenario described before, one could assume that age, native country, etc. are collected through an online questionnaire/form and we need to predict whether users have a high/low income in real time. If the income is predicted to be high, for example, we immediately call/email them with some offer. Then, after N new observations we retrain the algorithm while we keep predicting on new users.
9 |
10 | ## Solution
11 |
12 | The online part of our solution is illustrated in the figure below, and uses mainly [Kafka-Python](https://github.com/dpkp/kafka-python), [LightGBM](https://lightgbm.readthedocs.io/en/latest/#) and [Hyperopt](http://hyperopt.github.io/hyperopt/) or [HyperparameterHunter](https://github.com/HunterMcGushion/hyperparameter_hunter).
13 |
14 | ![pipeline diagram](images/pipeline_diagram.png)
15 |
16 | A full description of the solution can be found in the already mentioned Medium posts. Briefly:
17 |
18 | **OFFLINE TRAINING**
19 |
20 | The offline process is fairly standard and is all accomplished by running the `initialize.py` script. This script will download the dataset, set up the directory structure, preprocess the data, train an initial model on the training dataset and optimise the hyperparameters of that model. The results are saved to disk, and from there on we are ready to move to the online stage of the process.
21 |
22 | **ONLINE PREDICTIONS AND RETRAINING**
23 |
24 | 0. The App/Service (`sample_app.py`) will send messages (JSON) into the pipeline. These will be processed, and the App/Service will then receive the prediction results (a minimal sketch of this message exchange is shown right after this list).
25 | 1. 1a) The messages from the App/Service will be published to Kafka and, eventually, received by the Predictor (`predictor.py`).
26 |
27 |     1b) The Predictor will process the data, run the algorithm and publish a message with the prediction result back to Kafka, which will eventually be received by the App/Service.
28 | 2. After N messages the Predictor will publish a "retrain topic" message
29 | 3. The Trainer (`trainer.py`) will receive the "retrain topic" message and start retraining the algorithm. In the meantime, the Predictor will not stop serving predictions.
30 | 4. Once the algorithm is retrained, the Trainer will publish a message with the corresponding information (namely: *"retraining completed"*)
31 | 5. The Predictor will receive the message that retraining is complete, load the new model and proceed as usual.
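
For illustration, here is a minimal sketch (not code from this repo) of the message exchange in steps 0 and 1. It assumes Kafka is running on `localhost:9092` and reuses the `app_messages` topic name from `predictor.py`; the payload fields and the exact response format are simplified.

```
import json

from kafka import KafkaConsumer, KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')
consumer = KafkaConsumer('app_messages', bootstrap_servers='localhost:9092')

# Step 0: the App/Service publishes one observation as a JSON message
observation = {"age": 28, "workclass": "Private", "native_country": "Cuba"}
producer.send('app_messages', json.dumps(observation).encode('utf-8'))
producer.flush()

# Step 1b: the Predictor publishes the result back with a 'prediction' field,
# so the App/Service keeps reading until such a message arrives
for msg in consumer:
    payload = json.loads(msg.value)
    if 'prediction' in payload:
        print("prediction received:", payload['prediction'])
        break
```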
32 |
33 | ## How to run the pipeline
34 |
35 | 1. Run initialize.py
36 | ```
37 | python initialize.py
38 | ```
39 | or
40 | ```
41 | python initialize.py --hyper hyperparameterhunter
42 | ```
43 | HyperparameterHunter is built on top of [Skopt](https://scikit-optimize.github.io/). It is not our goal here to compare hyperparameter optimization packages. Nonetheless, a brief comparison is included in the Medium post and a notebook comparing Skopt and Hyperopt performances is included here, in the notebooks directory.
44 |
45 | 2. Start Zookeeper and Kafka. Assuming these are installed using Homebrew, starting these services is as easy as:
46 | ```
47 | $ brew services start zookeeper
48 | ==> Successfully started `zookeeper` (label: homebrew.mxcl.zookeeper)
49 | $ brew services start kafka
50 | ==> Successfully started `kafka` (label: homebrew.mxcl.kafka)
51 | ```
52 |
53 | 3. In Terminal#1 run the Predictor (or the Trainer):
54 | ```
55 | python predictor.py
56 | ```
57 | 4. In Terminal#2 run the Trainer (or the Predictor):
58 | ```
59 | python trainer.py
60 | ```
61 | or
62 | ```
63 | python trainer.py --hyper hyperparameterhunter
64 | ```
65 | 5. In Terminal#3 run the Sample App
66 | ```
67 | python sample_app.py
68 | ```
69 |
70 | Below we have included a GIF showing steps 3, 4 and 5.
71 |
72 |
73 | ![start pipeline](images/start_pipeline.gif)
74 |
75 | After `RETRAIN_EVERY` messages (a parameter to be set by the user), the user will be able to see in the terminal how the algorithm is retrained, as shown in the figure below. The top-right terminal shows how Hyperopt has run 10 evaluations (in a real exercise these should be a few hundred). Once the model is retrained and optimised, we see in the top-left window how the Predictor loads the new model (after the annoying warning message from the new LightGBM version) and proceeds with the prediction service as usual.
76 |
77 | ![model retrained](images/model_retrained.png)
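
`RETRAIN_EVERY` is simply a module-level constant near the top of `predictor.py` (set to 25 in this repo), so the retraining frequency is changed by editing that value:

```
# near the top of predictor.py
RETRAIN_EVERY = 25   # number of processed messages between retraining rounds
```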
78 |
79 |
80 | ## Logging and monitoring
81 | To log all the information generated from the pipeline as it retrains the algorithm, one could directly use HyperparameterHunter, which is fantastic precisely at that task. In addition, we have also used MLflow, which comes with a very convenient UI. [Our second post](https://towardsdatascience.com/putting-ml-in-production-ii-logging-and-monitoring-algorithms-91f174044e4e) focuses on the interplay between these two tools. All the related code can be found in the `train` module, within `train_hyperopt_mlflow.py` or `train_hyperparameterhunter_mlfow.py`.
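
As a rough illustration (this is not the exact code in those modules), logging a retraining run with MLflow boils down to something like the following, where `best_hyperparams`, `best_loss` and the model path are placeholders:

```
import mlflow

# illustrative sketch only; the real logging lives in the train/ module
with mlflow.start_run():
    mlflow.log_params(best_hyperparams)             # e.g. the dict returned by the optimizer
    mlflow.log_metric("binary_logloss", best_loss)
    mlflow.log_artifact("data/models/model_1_.p")   # hypothetical model filename
```

The MLflow UI (`mlflow ui`) then lets you browse all runs stored under `mlruns/`.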
82 |
83 | Comments or suggestions, please email: jrzaurin@gmail.com
84 |
--------------------------------------------------------------------------------
/images/model_retrained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/model_retrained.png
--------------------------------------------------------------------------------
/images/pipeline_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/pipeline_diagram.png
--------------------------------------------------------------------------------
/images/start_pipeline.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/start_pipeline.gif
--------------------------------------------------------------------------------
/initialize.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import lightgbm as lgb
4 | import pickle
5 | import warnings
6 | import argparse
7 | import os
8 | import pdb
9 |
10 | from pathlib import Path
11 | from utils.preprocess_data import build_train
12 |
13 |
14 | PATH = Path('data/')
15 | TRAIN_PATH = PATH/'train'
16 | DATAPROCESSORS_PATH = PATH/'dataprocessors'
17 | MODELS_PATH = PATH/'models'
18 | MESSAGES_PATH = PATH/'messages'
19 |
20 |
21 | def create_folders():
22 | print("creating directory structure...")
23 | (PATH).mkdir(exist_ok=True)
24 | (TRAIN_PATH).mkdir(exist_ok=True)
25 | (MODELS_PATH).mkdir(exist_ok=True)
26 | (DATAPROCESSORS_PATH).mkdir(exist_ok=True)
27 | (MESSAGES_PATH).mkdir(exist_ok=True)
28 |
29 |
30 | def download_data():
31 | train_path = PATH/'adult.data'
32 | test_path = PATH/'adult.test'
33 |
34 | COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
35 | "marital_status", "occupation", "relationship", "race", "gender",
36 | "capital_gain", "capital_loss", "hours_per_week", "native_country",
37 | "income_bracket"]
38 |
39 | print("downloading training data...")
40 | df_train = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data",
41 | names=COLUMNS, skipinitialspace=True, index_col=0)
42 | df_train.drop("education_num", axis=1, inplace=True)
43 | df_train.to_csv(train_path)
44 | df_train.to_csv(PATH/'train/train.csv')
45 |
46 | print("downloading testing data...")
47 | df_test = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test",
48 | names=COLUMNS, skipinitialspace=True, skiprows=1, index_col=0)
49 | df_test.drop("education_num", axis=1, inplace=True)
50 | df_test.to_csv(test_path)
51 |
52 |
53 | def create_data_processor():
54 | print("creating preprocessor...")
55 | dataprocessor = build_train(TRAIN_PATH/'train.csv', DATAPROCESSORS_PATH)
56 |
57 |
58 | def create_model(hyper):
59 | print("creating model...")
60 | init_dataprocessor = 'dataprocessor_0_.p'
61 | dtrain = pickle.load(open(DATAPROCESSORS_PATH/init_dataprocessor, 'rb'))
62 | if hyper == "hyperopt":
63 | # from train.train_hyperopt import LGBOptimizer
64 | from train.train_hyperopt_mlflow import LGBOptimizer
65 | elif hyper == "hyperparameterhunter":
66 | # from train.train_hyperparameterhunter import LGBOptimizer
67 | from train.train_hyperparameterhunter_mlfow import LGBOptimizer
68 | LGBOpt = LGBOptimizer(dtrain, MODELS_PATH)
69 | LGBOpt.optimize(maxevals=50)
70 |
71 |
72 | if __name__ == '__main__':
73 |
74 | parser = argparse.ArgumentParser()
75 |
76 | parser.add_argument("--hyper", type=str, default="hyperopt")
77 | args = parser.parse_args()
78 | create_folders()
79 | download_data()
80 | create_data_processor()
81 | create_model(args.hyper)
--------------------------------------------------------------------------------
/notebooks/.ipynb_checkpoints/skopt_vs_hyperopt-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Skopt vs Hyperopt\n",
8 | "\n",
9 | "## Importing and preprocessing data"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stderr",
19 | "output_type": "stream",
20 | "text": [
21 | "/usr/local/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n",
22 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
23 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
24 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
25 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "import pandas as pd\n",
31 | "import numpy as np\n",
32 | "import pickle\n",
33 | "import lightgbm as lgb\n",
34 | "import warnings\n",
35 | "\n",
36 | "from time import time\n",
37 | "from hyperopt import hp, tpe, fmin, Trials\n",
38 | "from skopt import BayesSearchCV\n",
39 | "from skopt.space import Real, Categorical, Integer\n",
40 | "from skopt import gbrt_minimize\n",
41 | "from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
42 | "from sklearn.metrics import log_loss\n",
43 | "from utils import FeatureTools\n",
44 | "\n",
45 | "warnings.filterwarnings(\"ignore\")"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 2,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
180 | "text/plain": [
181 | " age workclass fnlwgt education marital_status \\\n",
182 | "0 39 State-gov 77516 Bachelors Never-married \n",
183 | "1 50 Self-emp-not-inc 83311 Bachelors Married-civ-spouse \n",
184 | "2 38 Private 215646 HS-grad Divorced \n",
185 | "3 53 Private 234721 11th Married-civ-spouse \n",
186 | "4 28 Private 338409 Bachelors Married-civ-spouse \n",
187 | "\n",
188 | " occupation relationship race gender capital_gain \\\n",
189 | "0 Adm-clerical Not-in-family White Male 2174 \n",
190 | "1 Exec-managerial Husband White Male 0 \n",
191 | "2 Handlers-cleaners Not-in-family White Male 0 \n",
192 | "3 Handlers-cleaners Husband Black Male 0 \n",
193 | "4 Prof-specialty Wife Black Female 0 \n",
194 | "\n",
195 | " capital_loss hours_per_week native_country target \n",
196 | "0 0 40 United-States 0 \n",
197 | "1 0 13 United-States 0 \n",
198 | "2 0 40 United-States 0 \n",
199 | "3 0 40 United-States 0 \n",
200 | "4 0 40 Cuba 0 "
201 | ]
202 | },
203 | "execution_count": 2,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "df = pd.read_csv(\"data/adult.data\")\n",
210 | "df['target'] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)\n",
211 | "df.drop('income_bracket', axis=1, inplace=True)\n",
212 | "df.head()"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "I have coded a preprocessor class before that does the work for us."
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 3,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stdout",
229 | "output_type": "stream",
230 | "text": [
231 | "the features column names are: ['age', 'workclass', 'fnlwgt', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'education_occupation', 'native_country_occupation']\n",
232 | "the categorical columns are: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'education_occupation', 'native_country_occupation']\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "dataprocessor = pickle.load(open(\"data/dataprocessors/dataprocessor_0_.p\", \"rb\"))\n",
238 | "all_features = dataprocessor.colnames\n",
239 | "categorical_features = dataprocessor.cat_cols + dataprocessor.crossed_columns\n",
240 | "\n",
241 | "print(\"the features column names are: {}\".format(all_features))\n",
242 | "print(\"the categorical columns are: {}\".format(categorical_features))"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 |     "the `dataprocessor` is already trained, so we simply need to `transform`"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 4,
255 | "metadata": {},
256 | "outputs": [
257 | {
258 | "data": {
396 | "text/plain": [
397 | " age workclass fnlwgt education marital_status occupation \\\n",
398 | "0 0.301370 0 0.044302 0 0 0 \n",
399 | "1 0.452055 1 0.048238 0 1 1 \n",
400 | "2 0.287671 2 0.138113 1 2 2 \n",
401 | "3 0.493151 2 0.151068 2 1 2 \n",
402 | "4 0.150685 2 0.221488 0 1 3 \n",
403 | "\n",
404 | " relationship race gender capital_gain capital_loss hours_per_week \\\n",
405 | "0 0 0 0 0.02174 0.0 0.397959 \n",
406 | "1 1 0 0 0.00000 0.0 0.122449 \n",
407 | "2 0 0 0 0.00000 0.0 0.397959 \n",
408 | "3 1 1 0 0.00000 0.0 0.397959 \n",
409 | "4 2 1 1 0.00000 0.0 0.397959 \n",
410 | "\n",
411 | " native_country target education_occupation native_country_occupation \n",
412 | "0 0 0 0 0 \n",
413 | "1 0 0 1 1 \n",
414 | "2 0 0 2 2 \n",
415 | "3 0 0 3 2 \n",
416 | "4 1 0 4 3 "
417 | ]
418 | },
419 | "execution_count": 4,
420 | "metadata": {},
421 | "output_type": "execute_result"
422 | }
423 | ],
424 | "source": [
425 | "train_data = dataprocessor.transform(df)\n",
426 | "\n",
427 | "train_data.head()"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 6,
433 | "metadata": {},
434 | "outputs": [],
435 | "source": [
436 | "# np arrays\n",
437 |     "X_train = train_data[[c for c in train_data.columns if c != 'target']].values\n",
438 | "y_train = train_data['target'].values\n",
439 | "\n",
440 | "# lgb Dataset object\n",
441 | "lgtrain = lgb.Dataset(X_train,\n",
442 | " label=y_train,\n",
443 | " feature_name=all_features,\n",
444 | " categorical_feature=categorical_features,\n",
445 | " free_raw_data=False)"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 7,
451 | "metadata": {},
452 | "outputs": [],
453 | "source": [
454 | "# model and fit params\n",
455 | "params = dict(learning_rate=0.01,\n",
456 | " num_boost_round=300,\n",
457 | " num_leaves = 255,\n",
458 | " verbose=-1,\n",
459 | " is_unbalance=True)\n",
460 | "fit_params = dict(feature_name=all_features,\n",
461 | " categorical_feature=categorical_features)"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {},
467 | "source": [
468 |     "## 1. First experiment: sklearn wrapper vs LightGBM methods"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": 12,
474 | "metadata": {},
475 | "outputs": [
476 | {
477 | "name": "stdout",
478 | "output_type": "stream",
479 | "text": [
480 | "7.932429075241089\n"
481 | ]
482 | }
483 | ],
484 | "source": [
485 | "clf = lgb.LGBMClassifier(**params, silent=True)\n",
486 | "start = time()\n",
487 | "score = cross_val_score(clf,\n",
488 | " X_train, y_train,\n",
489 | " scoring='neg_log_loss',\n",
490 | " cv=StratifiedKFold(random_state=1981),\n",
491 | " fit_params=fit_params)\n",
492 | "sklearn_runtime = time() - start\n",
493 | "print(sklearn_runtime)"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 13,
499 | "metadata": {},
500 | "outputs": [
501 | {
502 | "name": "stdout",
503 | "output_type": "stream",
504 | "text": [
505 | "7.038502931594849\n"
506 | ]
507 | }
508 | ],
509 | "source": [
510 | "start = time()\n",
511 | "cv_result = lgb.cv(params,\n",
512 | " lgtrain,\n",
513 | " metrics='binary_logloss',\n",
514 | " nfold=3,\n",
515 | " stratified=True, \n",
516 | " seed=1981)\n",
517 | "lightgbm_runtime = time() - start\n",
518 | "print(lightgbm_runtime)"
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "metadata": {},
524 | "source": [
525 | "LightGBM methods seem to be a bit faster. Let's now compare `Hyperopt` and `Skopt`"
526 | ]
527 | },
528 | {
529 | "cell_type": "markdown",
530 | "metadata": {},
531 | "source": [
532 | "## Hyperopt vs Skopt"
533 | ]
534 | },
535 | {
536 | "cell_type": "markdown",
537 | "metadata": {},
538 | "source": [
539 |     "The first thing to note is that while Hyperopt offers the `hp.quniform(label, low, high, q)` parameter expression, there is no such thing in Skopt. One has `Categorical`, but you have to pass all the values. In other words, when using Hyperopt one could use:\n",
540 | "\n",
541 | " 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20)\n",
542 | "\n",
543 | "but when using Skopt one would have to do:\n",
544 | "\n",
545 | " Categorical(np.arange(50, 500, 20))\n",
546 | " \n",
547 | "Because I want to keep the comparison as light and direct as possible, I will just use `Real` parameters with uniform distributions."
548 | ]
549 | },
550 | {
551 | "cell_type": "markdown",
552 | "metadata": {},
553 | "source": [
554 | "### 1. Hyperopt\n",
555 | "\n",
556 | "With Hyperopt we will use the [TPE](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) algorithm."
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 14,
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "hp_space = {\n",
566 | " 'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),\n",
567 | " 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),\n",
568 | " 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),\n",
569 | " 'subsample': hp.uniform('subsample', 0.5, 1.),\n",
570 | " }"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": 20,
576 | "metadata": {},
577 | "outputs": [
578 | {
579 | "name": "stdout",
580 | "output_type": "stream",
581 | "text": [
582 | "100%|██████████| 50/50 [00:39<00:00, 1.18it/s, best loss: 0.28343034788381305]\n"
583 | ]
584 | }
585 | ],
586 | "source": [
587 | "def objective(params):\n",
588 | " clf = lgb.LGBMClassifier(**params, is_unbalance=True, verbose=-1, silent=True)\n",
589 | " score = cross_val_score(clf,\n",
590 | " X_train, y_train,\n",
591 | " scoring='f1',\n",
592 | " cv=StratifiedKFold(random_state=3),\n",
593 | " fit_params=fit_params).mean()\n",
594 | " return 1-score\n",
595 | "trials = Trials()\n",
596 | "best = fmin(fn=objective,\n",
597 | " space=hp_space,\n",
598 | " algo=tpe.suggest,\n",
599 | " max_evals=50,\n",
600 | " trials=trials)"
601 | ]
602 | },
603 | {
604 | "cell_type": "markdown",
605 | "metadata": {},
606 | "source": [
607 | "### 2. SKopt\n",
608 | "\n",
609 | "Since TPE is a Bayesian method we will first compare with the `BayesSearchCV` method in `Skopt`"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 16,
615 | "metadata": {},
616 | "outputs": [],
617 | "source": [
618 | "hh_space = dict(\n",
619 | " learning_rate = Real(0.01, 0.3),\n",
620 | " min_child_weight = Real(0.1, 10),\n",
621 | " colsample_bytree= Real(0.5, 1.),\n",
622 | " subsample=Real(0.5, 1.),\n",
623 | " )"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": 17,
629 | "metadata": {},
630 | "outputs": [
631 | {
632 | "name": "stdout",
633 | "output_type": "stream",
634 | "text": [
635 | "63.14497208595276\n"
636 | ]
637 | }
638 | ],
639 | "source": [
640 | "clf = lgb.LGBMClassifier(is_unbalance=True, verbose=-1, silent=True)\n",
641 | "start = time()\n",
642 | "opt = BayesSearchCV(clf,\n",
643 | " search_spaces=hh_space,\n",
644 | " scoring='f1',\n",
645 | " cv=StratifiedKFold(random_state=3),\n",
646 | " fit_params=fit_params,\n",
647 | " n_iter=50,\n",
648 | " n_jobs=-1)\n",
649 | "opt.fit(X_train, y_train)\n",
650 | "skopt_bayes_runtime = time()-start\n",
651 | "print(skopt_bayes_runtime)"
652 | ]
653 | },
654 | {
655 | "cell_type": "markdown",
656 | "metadata": {},
657 | "source": [
658 |     "`Skopt` seems to be significantly slower than Hyperopt, even with no verbosity. Let's see if it performs better:"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 19,
664 | "metadata": {},
665 | "outputs": [
666 | {
667 | "name": "stdout",
668 | "output_type": "stream",
669 | "text": [
670 | "best SKOPT F1 score: 0.7174372197995806\n"
671 | ]
672 | }
673 | ],
674 | "source": [
675 | "print('best SKOPT F1 score: {}'.format(opt.best_score_))"
676 | ]
677 | },
678 | {
679 | "cell_type": "markdown",
680 | "metadata": {},
681 | "source": [
682 | "which is almost identical to the one obtained with `Hyperopt`"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": 25,
688 | "metadata": {},
689 | "outputs": [
690 | {
691 | "name": "stdout",
692 | "output_type": "stream",
693 | "text": [
694 | "best HYPEROPT F1 score: 0.716569652116187\n"
695 | ]
696 | }
697 | ],
698 | "source": [
699 | "# Remember hyperopt minimises 1-score. \n",
700 | "print('best HYPEROPT F1 score: {}'.format(1-trials.best_trial['result']['loss']))"
701 | ]
702 | },
703 | {
704 | "cell_type": "markdown",
705 | "metadata": {},
706 | "source": [
707 | "The conclusion at this stage is that `Hyperopt` is faster than `Skopt` with the same performance. However, the `TPE` algorithm is a tree based algorithm, so let's also compare with the `gbrt_minimize` method (Sequential optimization using gradient boosted trees) in `Skopt`. Here the syntax is a bit different to that of `BayesSearchCV`. "
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": 26,
713 | "metadata": {},
714 | "outputs": [],
715 | "source": [
716 | "# the space has to be tuples like these\n",
717 | "hh_space_gbrt = [Real(0.01, 0.3, 'uniform', name='learning_rate'),\n",
718 | " Real(0.1, 10, 'uniform', name='min_child_weight'),\n",
719 | " Real(0.5, 1., 'uniform', name='colsample_bytree'),\n",
720 | " Real(0.5, 1., 'uniform', name='subsample')]"
721 | ]
722 | },
723 | {
724 | "cell_type": "code",
725 | "execution_count": 30,
726 | "metadata": {},
727 | "outputs": [],
728 | "source": [
729 | "# Let's adapt the objective\n",
730 | "def gbrt_objective(params):\n",
731 | " tmp_params = {}\n",
732 | " tmp_params['learning_rate'], tmp_params['min_child_weight'], \\\n",
733 | " tmp_params['colsample_bytree'], tmp_params['subsample'], = params[0], params[1], params[2], params[3]\n",
734 | " clf = lgb.LGBMClassifier(**tmp_params, is_unbalance=True, verbose=-1, silent=True)\n",
735 | " score = cross_val_score(clf,\n",
736 | " X_train, y_train,\n",
737 | " scoring='f1',\n",
738 | " cv=StratifiedKFold(random_state=3),\n",
739 | " fit_params=fit_params).mean()\n",
740 | " return 1-score"
741 | ]
742 | },
743 | {
744 | "cell_type": "code",
745 | "execution_count": 31,
746 | "metadata": {},
747 | "outputs": [
748 | {
749 | "name": "stdout",
750 | "output_type": "stream",
751 | "text": [
752 | "54.64228296279907\n"
753 | ]
754 | }
755 | ],
756 | "source": [
757 | "start=time()\n",
758 | "sk_best = gbrt_minimize(gbrt_objective,\n",
759 | " hh_space_gbrt,\n",
760 | " n_calls=50,\n",
761 | " verbose=False,\n",
762 | " n_jobs=-1)\n",
763 | "skopt_gbrt_runtime = time()-start\n",
764 | "print(skopt_gbrt_runtime)"
765 | ]
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 |     "Faster than `BayesSearchCV`, but still slower than `Hyperopt`. Let's see if the results are any better."
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": 36,
777 | "metadata": {},
778 | "outputs": [
779 | {
780 | "name": "stdout",
781 | "output_type": "stream",
782 | "text": [
783 | "best SKOPT GBRT F1 score: 0.7173483496134895\n"
784 | ]
785 | }
786 | ],
787 | "source": [
788 | "print('best SKOPT GBRT F1 score: {}'.format(1-sk_best.fun))"
789 | ]
790 | },
791 | {
792 | "cell_type": "markdown",
793 | "metadata": {},
794 | "source": [
795 | "## CONCLUSION"
796 | ]
797 | },
798 | {
799 | "cell_type": "markdown",
800 | "metadata": {},
801 | "source": [
802 |     "`Hyperopt`'s TPE performs as well as Skopt's `gbrt_minimize` and `BayesSearchCV` methods and is significantly faster."
803 | ]
804 | }
805 | ],
806 | "metadata": {
807 | "kernelspec": {
808 | "display_name": "Python 3",
809 | "language": "python",
810 | "name": "python3"
811 | },
812 | "language_info": {
813 | "codemirror_mode": {
814 | "name": "ipython",
815 | "version": 3
816 | },
817 | "file_extension": ".py",
818 | "mimetype": "text/x-python",
819 | "name": "python",
820 | "nbconvert_exporter": "python",
821 | "pygments_lexer": "ipython3",
822 | "version": "3.6.5"
823 | }
824 | },
825 | "nbformat": 4,
826 | "nbformat_minor": 2
827 | }
828 |
--------------------------------------------------------------------------------
/notebooks/skopt_vs_hyperopt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Skopt vs Hyperopt\n",
8 | "\n",
9 | "## Importing and preprocessing data"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stderr",
19 | "output_type": "stream",
20 | "text": [
21 | "/usr/local/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n",
22 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
23 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
24 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
25 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "import pandas as pd\n",
31 | "import numpy as np\n",
32 | "import pickle\n",
33 | "import lightgbm as lgb\n",
34 | "import warnings\n",
35 | "\n",
36 | "from time import time\n",
37 | "from hyperopt import hp, tpe, fmin, Trials\n",
38 | "from skopt import BayesSearchCV\n",
39 | "from skopt.space import Real, Categorical, Integer\n",
40 | "from skopt import gbrt_minimize\n",
41 | "from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
42 | "from sklearn.metrics import log_loss\n",
43 | "from utils import FeatureTools\n",
44 | "\n",
45 | "warnings.filterwarnings(\"ignore\")"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 2,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "data": {
180 | "text/plain": [
181 | " age workclass fnlwgt education marital_status \\\n",
182 | "0 39 State-gov 77516 Bachelors Never-married \n",
183 | "1 50 Self-emp-not-inc 83311 Bachelors Married-civ-spouse \n",
184 | "2 38 Private 215646 HS-grad Divorced \n",
185 | "3 53 Private 234721 11th Married-civ-spouse \n",
186 | "4 28 Private 338409 Bachelors Married-civ-spouse \n",
187 | "\n",
188 | " occupation relationship race gender capital_gain \\\n",
189 | "0 Adm-clerical Not-in-family White Male 2174 \n",
190 | "1 Exec-managerial Husband White Male 0 \n",
191 | "2 Handlers-cleaners Not-in-family White Male 0 \n",
192 | "3 Handlers-cleaners Husband Black Male 0 \n",
193 | "4 Prof-specialty Wife Black Female 0 \n",
194 | "\n",
195 | " capital_loss hours_per_week native_country target \n",
196 | "0 0 40 United-States 0 \n",
197 | "1 0 13 United-States 0 \n",
198 | "2 0 40 United-States 0 \n",
199 | "3 0 40 United-States 0 \n",
200 | "4 0 40 Cuba 0 "
201 | ]
202 | },
203 | "execution_count": 2,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "df = pd.read_csv(\"data/adult.data\")\n",
210 | "df['target'] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)\n",
211 | "df.drop('income_bracket', axis=1, inplace=True)\n",
212 | "df.head()"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "I have coded a preprocessor class before that does the work for us."
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 3,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stdout",
229 | "output_type": "stream",
230 | "text": [
231 | "the features column names are: ['age', 'workclass', 'fnlwgt', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'education_occupation', 'native_country_occupation']\n",
232 | "the categorical columns are: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'education_occupation', 'native_country_occupation']\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "dataprocessor = pickle.load(open(\"data/dataprocessors/dataprocessor_0_.p\", \"rb\"))\n",
238 | "all_features = dataprocessor.colnames\n",
239 | "categorical_features = dataprocessor.cat_cols + dataprocessor.crossed_columns\n",
240 | "\n",
241 | "print(\"the features column names are: {}\".format(all_features))\n",
242 | "print(\"the categorical columns are: {}\".format(categorical_features))"
243 | ]
244 | },
245 | {
246 | "cell_type": "markdown",
247 | "metadata": {},
248 | "source": [
249 |     "the `dataprocessor` is already trained, so we simply need to `transform`"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 4,
255 | "metadata": {},
256 | "outputs": [
257 | {
258 | "data": {
396 | "text/plain": [
397 | " age workclass fnlwgt education marital_status occupation \\\n",
398 | "0 0.301370 0 0.044302 0 0 0 \n",
399 | "1 0.452055 1 0.048238 0 1 1 \n",
400 | "2 0.287671 2 0.138113 1 2 2 \n",
401 | "3 0.493151 2 0.151068 2 1 2 \n",
402 | "4 0.150685 2 0.221488 0 1 3 \n",
403 | "\n",
404 | " relationship race gender capital_gain capital_loss hours_per_week \\\n",
405 | "0 0 0 0 0.02174 0.0 0.397959 \n",
406 | "1 1 0 0 0.00000 0.0 0.122449 \n",
407 | "2 0 0 0 0.00000 0.0 0.397959 \n",
408 | "3 1 1 0 0.00000 0.0 0.397959 \n",
409 | "4 2 1 1 0.00000 0.0 0.397959 \n",
410 | "\n",
411 | " native_country target education_occupation native_country_occupation \n",
412 | "0 0 0 0 0 \n",
413 | "1 0 0 1 1 \n",
414 | "2 0 0 2 2 \n",
415 | "3 0 0 3 2 \n",
416 | "4 1 0 4 3 "
417 | ]
418 | },
419 | "execution_count": 4,
420 | "metadata": {},
421 | "output_type": "execute_result"
422 | }
423 | ],
424 | "source": [
425 | "train_data = dataprocessor.transform(df)\n",
426 | "\n",
427 | "train_data.head()"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 6,
433 | "metadata": {},
434 | "outputs": [],
435 | "source": [
436 | "# np arrays\n",
437 |     "X_train = train_data[[c for c in train_data.columns if c != 'target']].values\n",
438 | "y_train = train_data['target'].values\n",
439 | "\n",
440 | "# lgb Dataset object\n",
441 | "lgtrain = lgb.Dataset(X_train,\n",
442 | " label=y_train,\n",
443 | " feature_name=all_features,\n",
444 | " categorical_feature=categorical_features,\n",
445 | " free_raw_data=False)"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 7,
451 | "metadata": {},
452 | "outputs": [],
453 | "source": [
454 | "# model and fit params\n",
455 | "params = dict(learning_rate=0.01,\n",
456 | " num_boost_round=300,\n",
457 | " num_leaves = 255,\n",
458 | " verbose=-1,\n",
459 | " is_unbalance=True)\n",
460 | "fit_params = dict(feature_name=all_features,\n",
461 | " categorical_feature=categorical_features)"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {},
467 | "source": [
468 |     "## 1. First experiment: sklearn wrapper vs LightGBM methods"
469 | ]
470 | },
471 | {
472 | "cell_type": "code",
473 | "execution_count": 12,
474 | "metadata": {},
475 | "outputs": [
476 | {
477 | "name": "stdout",
478 | "output_type": "stream",
479 | "text": [
480 | "7.932429075241089\n"
481 | ]
482 | }
483 | ],
484 | "source": [
485 | "clf = lgb.LGBMClassifier(**params, silent=True)\n",
486 | "start = time()\n",
487 | "score = cross_val_score(clf,\n",
488 | " X_train, y_train,\n",
489 | " scoring='neg_log_loss',\n",
490 | " cv=StratifiedKFold(random_state=1981),\n",
491 | " fit_params=fit_params)\n",
492 | "sklearn_runtime = time() - start\n",
493 | "print(sklearn_runtime)"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 13,
499 | "metadata": {},
500 | "outputs": [
501 | {
502 | "name": "stdout",
503 | "output_type": "stream",
504 | "text": [
505 | "7.038502931594849\n"
506 | ]
507 | }
508 | ],
509 | "source": [
510 | "start = time()\n",
511 | "cv_result = lgb.cv(params,\n",
512 | " lgtrain,\n",
513 | " metrics='binary_logloss',\n",
514 | " nfold=3,\n",
515 | " stratified=True, \n",
516 | " seed=1981)\n",
517 | "lightgbm_runtime = time() - start\n",
518 | "print(lightgbm_runtime)"
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "metadata": {},
524 | "source": [
525 | "LightGBM methods seem to be a bit faster. Let's now compare `Hyperopt` and `Skopt`"
526 | ]
527 | },
528 | {
529 | "cell_type": "markdown",
530 | "metadata": {},
531 | "source": [
532 | "## Hyperopt vs Skopt"
533 | ]
534 | },
535 | {
536 | "cell_type": "markdown",
537 | "metadata": {},
538 | "source": [
539 |     "The first thing to note is that while Hyperopt offers the `hp.quniform(label, low, high, q)` parameter expression, there is no such thing in Skopt. One has `Categorical`, but you have to pass all the values. In other words, when using Hyperopt one could use:\n",
540 | "\n",
541 | " 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20)\n",
542 | "\n",
543 | "but when using Skopt one would have to do:\n",
544 | "\n",
545 | " Categorical(np.arange(50, 500, 20))\n",
546 | " \n",
547 | "Because I want to keep the comparison as light and direct as possible, I will just use `Real` parameters with uniform distributions."
548 | ]
549 | },
550 | {
551 | "cell_type": "markdown",
552 | "metadata": {},
553 | "source": [
554 | "### 1. Hyperopt\n",
555 | "\n",
556 | "With Hyperopt we will use the [TPE](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) algorithm."
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 14,
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "hp_space = {\n",
566 | " 'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),\n",
567 | " 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),\n",
568 | " 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),\n",
569 | " 'subsample': hp.uniform('subsample', 0.5, 1.),\n",
570 | " }"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": 20,
576 | "metadata": {},
577 | "outputs": [
578 | {
579 | "name": "stdout",
580 | "output_type": "stream",
581 | "text": [
582 | "100%|██████████| 50/50 [00:39<00:00, 1.18it/s, best loss: 0.28343034788381305]\n"
583 | ]
584 | }
585 | ],
586 | "source": [
587 | "def objective(params):\n",
588 | " clf = lgb.LGBMClassifier(**params, is_unbalance=True, verbose=-1, silent=True)\n",
589 | " score = cross_val_score(clf,\n",
590 | " X_train, y_train,\n",
591 | " scoring='f1',\n",
592 | " cv=StratifiedKFold(random_state=3),\n",
593 | " fit_params=fit_params).mean()\n",
594 | " return 1-score\n",
595 | "trials = Trials()\n",
596 | "best = fmin(fn=objective,\n",
597 | " space=hp_space,\n",
598 | " algo=tpe.suggest,\n",
599 | " max_evals=50,\n",
600 | " trials=trials)"
601 | ]
602 | },
603 | {
604 | "cell_type": "markdown",
605 | "metadata": {},
606 | "source": [
607 | "### 2. SKopt\n",
608 | "\n",
609 | "Since TPE is a Bayesian method we will first compare with the `BayesSearchCV` method in `Skopt`"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 16,
615 | "metadata": {},
616 | "outputs": [],
617 | "source": [
618 | "hh_space = dict(\n",
619 | " learning_rate = Real(0.01, 0.3),\n",
620 | " min_child_weight = Real(0.1, 10),\n",
621 | " colsample_bytree= Real(0.5, 1.),\n",
622 | " subsample=Real(0.5, 1.),\n",
623 | " )"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": 17,
629 | "metadata": {},
630 | "outputs": [
631 | {
632 | "name": "stdout",
633 | "output_type": "stream",
634 | "text": [
635 | "63.14497208595276\n"
636 | ]
637 | }
638 | ],
639 | "source": [
640 | "clf = lgb.LGBMClassifier(is_unbalance=True, verbose=-1, silent=True)\n",
641 | "start = time()\n",
642 | "opt = BayesSearchCV(clf,\n",
643 | " search_spaces=hh_space,\n",
644 | " scoring='f1',\n",
645 | " cv=StratifiedKFold(random_state=3),\n",
646 | " fit_params=fit_params,\n",
647 | " n_iter=50,\n",
648 | " n_jobs=-1)\n",
649 | "opt.fit(X_train, y_train)\n",
650 | "skopt_bayes_runtime = time()-start\n",
651 | "print(skopt_bayes_runtime)"
652 | ]
653 | },
654 | {
655 | "cell_type": "markdown",
656 | "metadata": {},
657 | "source": [
658 |     "`Skopt` seems to be significantly slower than Hyperopt, even with no verbosity. Let's see if it performs better:"
659 | ]
660 | },
661 | {
662 | "cell_type": "code",
663 | "execution_count": 19,
664 | "metadata": {},
665 | "outputs": [
666 | {
667 | "name": "stdout",
668 | "output_type": "stream",
669 | "text": [
670 | "best SKOPT F1 score: 0.7174372197995806\n"
671 | ]
672 | }
673 | ],
674 | "source": [
675 | "print('best SKOPT F1 score: {}'.format(opt.best_score_))"
676 | ]
677 | },
678 | {
679 | "cell_type": "markdown",
680 | "metadata": {},
681 | "source": [
682 | "which is almost identical to the one obtained with `Hyperopt`"
683 | ]
684 | },
685 | {
686 | "cell_type": "code",
687 | "execution_count": 25,
688 | "metadata": {},
689 | "outputs": [
690 | {
691 | "name": "stdout",
692 | "output_type": "stream",
693 | "text": [
694 | "best HYPEROPT F1 score: 0.716569652116187\n"
695 | ]
696 | }
697 | ],
698 | "source": [
699 | "# Remember hyperopt minimises 1-score. \n",
700 | "print('best HYPEROPT F1 score: {}'.format(1-trials.best_trial['result']['loss']))"
701 | ]
702 | },
703 | {
704 | "cell_type": "markdown",
705 | "metadata": {},
706 | "source": [
707 | "The conclusion at this stage is that `Hyperopt` is faster than `Skopt` with the same performance. However, the `TPE` algorithm is a tree based algorithm, so let's also compare with the `gbrt_minimize` method (Sequential optimization using gradient boosted trees) in `Skopt`. Here the syntax is a bit different to that of `BayesSearchCV`. "
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": 26,
713 | "metadata": {},
714 | "outputs": [],
715 | "source": [
716 | "# the space has to be tuples like these\n",
717 | "hh_space_gbrt = [Real(0.01, 0.3, 'uniform', name='learning_rate'),\n",
718 | " Real(0.1, 10, 'uniform', name='min_child_weight'),\n",
719 | " Real(0.5, 1., 'uniform', name='colsample_bytree'),\n",
720 | " Real(0.5, 1., 'uniform', name='subsample')]"
721 | ]
722 | },
723 | {
724 | "cell_type": "code",
725 | "execution_count": 30,
726 | "metadata": {},
727 | "outputs": [],
728 | "source": [
729 | "# Let's adapt the objective\n",
730 | "def gbrt_objective(params):\n",
731 | " tmp_params = {}\n",
732 | " tmp_params['learning_rate'], tmp_params['min_child_weight'], \\\n",
733 | " tmp_params['colsample_bytree'], tmp_params['subsample'], = params[0], params[1], params[2], params[3]\n",
734 | " clf = lgb.LGBMClassifier(**tmp_params, is_unbalance=True, verbose=-1, silent=True)\n",
735 | " score = cross_val_score(clf,\n",
736 | " X_train, y_train,\n",
737 | " scoring='f1',\n",
738 | " cv=StratifiedKFold(random_state=3),\n",
739 | " fit_params=fit_params).mean()\n",
740 | " return 1-score"
741 | ]
742 | },
743 | {
744 | "cell_type": "code",
745 | "execution_count": 31,
746 | "metadata": {},
747 | "outputs": [
748 | {
749 | "name": "stdout",
750 | "output_type": "stream",
751 | "text": [
752 | "54.64228296279907\n"
753 | ]
754 | }
755 | ],
756 | "source": [
757 | "start=time()\n",
758 | "sk_best = gbrt_minimize(gbrt_objective,\n",
759 | " hh_space_gbrt,\n",
760 | " n_calls=50,\n",
761 | " verbose=False,\n",
762 | " n_jobs=-1)\n",
763 | "skopt_gbrt_runtime = time()-start\n",
764 | "print(skopt_gbrt_runtime)"
765 | ]
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 | "Faster than `BayesSearchCV`, but still slower than `Hyperopt`. Let's see if the results are any better:"
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": 36,
777 | "metadata": {},
778 | "outputs": [
779 | {
780 | "name": "stdout",
781 | "output_type": "stream",
782 | "text": [
783 | "best SKOPT GBRT F1 score: 0.7173483496134895\n"
784 | ]
785 | }
786 | ],
787 | "source": [
788 | "print('best SKOPT GBRT F1 score: {}'.format(1-sk_best.fun))"
789 | ]
790 | },
791 | {
792 | "cell_type": "markdown",
793 | "metadata": {},
794 | "source": [
795 | "## CONCLUSION"
796 | ]
797 | },
798 | {
799 | "cell_type": "markdown",
800 | "metadata": {},
801 | "source": [
802 | "`Hyperopt`'s TPE performs as well as `Skopt`'s `gbrt_minimize` and `BayesSearchCV` methods and is significantly faster."
803 | ]
804 | }
805 | ],
806 | "metadata": {
807 | "kernelspec": {
808 | "display_name": "Python 3",
809 | "language": "python",
810 | "name": "python3"
811 | },
812 | "language_info": {
813 | "codemirror_mode": {
814 | "name": "ipython",
815 | "version": 3
816 | },
817 | "file_extension": ".py",
818 | "mimetype": "text/x-python",
819 | "name": "python",
820 | "nbconvert_exporter": "python",
821 | "pygments_lexer": "ipython3",
822 | "version": "3.6.5"
823 | }
824 | },
825 | "nbformat": 4,
826 | "nbformat_minor": 2
827 | }
828 |
--------------------------------------------------------------------------------
/predictor.py:
--------------------------------------------------------------------------------
1 | import pdb
2 | import json
3 | import pandas as pd
4 | import pickle
5 |
6 | from pathlib import Path
7 | from kafka import KafkaConsumer
8 | from utils.messages_utils import append_message, read_messages_count, send_retrain_message, publish_prediction
9 |
10 | KAFKA_HOST = 'localhost:9092'
11 | TOPICS = ['app_messages', 'retrain_topic']
12 | PATH = Path('data/')
13 | MODELS_PATH = PATH/'models'
14 | DATAPROCESSORS_PATH = PATH/'dataprocessors'
15 | MESSAGES_PATH = PATH/'messages'
16 | RETRAIN_EVERY = 25
17 | EXTRA_MODELS_TO_KEEP = 1
18 |
19 | column_order = pickle.load(open(DATAPROCESSORS_PATH/'column_order.p', 'rb'))
20 | dataprocessor = None
21 | consumer = None
22 | model = None
23 |
24 |
25 | def reload_model(path):
26 | return pickle.load(open(path, 'rb'))
27 |
28 |
29 | def is_retraining_message(msg):
30 | message = json.loads(msg.value)
31 | return msg.topic == 'retrain_topic' and 'training_completed' in message and message['training_completed']
32 |
33 |
34 | def is_application_message(msg):
35 | message = json.loads(msg.value)
36 | return msg.topic == 'app_messages' and 'prediction' not in message
37 |
38 |
39 | def predict(message, column_order):
40 | row = pd.DataFrame(message, index=[0])
41 | # sanity check
42 | assert row.columns.tolist()[:-1] == column_order
43 | # In the real world we would not have the target (here 'income_bracket').
44 |     # In this example we keep it and retrain the model every RETRAIN_EVERY
45 |     # messages. In the real world, after RETRAIN_EVERY messages have been
46 |     # collected, one would have to wait until the corresponding RETRAIN_EVERY
47 |     # targets are available AND THEN retrain
48 | row.drop('income_bracket', axis=1, inplace=True)
49 | trow = dataprocessor.transform(row)
50 | return model.predict(trow)[0]
51 |
52 |
53 | def start(model_id, messages_count, batch_id):
54 |     global model  # so that predict() picks up the reloaded model after retraining
55 |     for msg in consumer:
56 |         message = json.loads(msg.value)
57 | if is_retraining_message(msg):
58 | model_fname = 'model_{}_.p'.format(model_id)
59 | model = reload_model(MODELS_PATH/model_fname)
60 | print("NEW MODEL RELOADED {}".format(model_id))
61 |
62 | elif is_application_message(msg):
63 | request_id = message['request_id']
64 | pred = predict(message['data'], column_order)
65 | publish_prediction(pred, request_id)
66 |
67 | append_message(message['data'], MESSAGES_PATH, batch_id)
68 | messages_count += 1
69 | if messages_count % RETRAIN_EVERY == 0:
70 | model_id = (model_id + 1) % (EXTRA_MODELS_TO_KEEP + 1)
71 | send_retrain_message(model_id, batch_id)
72 | batch_id += 1
73 |
74 |
75 | if __name__ == '__main__':
76 | dataprocessor_id = 0
77 | dataprocessor_fname = 'dataprocessor_{}_.p'.format(dataprocessor_id)
78 | dataprocessor = pickle.load(open(DATAPROCESSORS_PATH/dataprocessor_fname, 'rb'))
79 |
80 | messages_count = read_messages_count(MESSAGES_PATH, RETRAIN_EVERY)
81 | batch_id = messages_count % RETRAIN_EVERY
82 |
83 | model_id = batch_id % (EXTRA_MODELS_TO_KEEP + 1)
84 | model_fname = 'model_{}_.p'.format(model_id)
85 | model = reload_model(MODELS_PATH/model_fname)
86 |
87 | consumer = KafkaConsumer(bootstrap_servers=KAFKA_HOST)
88 | consumer.subscribe(TOPICS)
89 |
90 | start(model_id, messages_count, batch_id)
91 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | alembic==1.0.11
2 | certifi==2019.6.16
3 | chardet==3.0.4
4 | Click==7.0
5 | cloudpickle==1.2.1
6 | configparser==3.7.4
7 | databricks-cli==0.8.7
8 | decorator==4.4.0
9 | docker==4.0.2
10 | entrypoints==0.3
11 | Flask==1.1.1
12 | future==0.17.1
13 | gitdb2==2.0.5
14 | GitPython==2.1.11
15 | gunicorn==19.9.0
16 | hyperopt==0.1.2
17 | idna==2.8
18 | itsdangerous==1.1.0
19 | Jinja2==2.10.1
20 | joblib==0.13.2
21 | kafka-python==1.4.6
22 | lightgbm==2.2.3
23 | Mako==1.0.13
24 | MarkupSafe==1.1.1
25 | mlflow==1.0.0
26 | networkx==2.3
27 | numpy==1.16.4
28 | pandas==0.24.2
29 | protobuf==3.9.0
30 | pymongo==3.8.0
31 | python-dateutil==2.8.0
32 | python-editor==1.0.4
33 | pytz==2019.1
34 | PyYAML==5.1.1
35 | querystring-parser==1.2.3
36 | requests==2.22.0
37 | scikit-learn==0.21.2
38 | scipy==1.3.0
39 | simplejson==3.16.0
40 | six==1.12.0
41 | sklearn==0.0
42 | smmap2==2.0.5
43 | SQLAlchemy==1.3.5
44 | sqlparse==0.3.0
45 | tabulate==0.8.3
46 | tqdm==4.32.2
47 | urllib3==1.25.3
48 | websocket-client==0.56.0
49 | Werkzeug==0.15.4
50 |
--------------------------------------------------------------------------------
/sample_app.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import json
3 | import threading
4 | import uuid
5 |
6 | from pathlib import Path
7 | from kafka import KafkaProducer, KafkaConsumer
8 | from time import sleep
9 |
10 |
11 | PATH = Path('data/')
12 | KAFKA_HOST = 'localhost:9092'
13 | df_test = pd.read_csv(PATH/'adult.test')
14 | # In the real world, the messages would not come with the target/outcome of
15 | # our actions. Here we will keep it and assume that at some point in the
16 | # future we can collect the outcome and monitor how our algorithm is doing
17 | # df_test.drop('income_bracket', axis=1, inplace=True)
18 | df_test['json'] = df_test.apply(lambda x: x.to_json(), axis=1)
19 | messages = df_test.json.tolist()
20 |
21 |
22 | def start_producing():
23 | producer = KafkaProducer(bootstrap_servers=KAFKA_HOST)
24 | for i in range(200):
25 | message_id = str(uuid.uuid4())
26 | message = {'request_id': message_id, 'data': json.loads(messages[i])}
27 |
28 | producer.send('app_messages', json.dumps(message).encode('utf-8'))
29 | producer.flush()
30 |
31 | print("\033[1;31;40m -- PRODUCER: Sent message with id {}".format(message_id))
32 | sleep(2)
33 |
34 |
35 | def start_consuming():
36 | consumer = KafkaConsumer('app_messages', bootstrap_servers=KAFKA_HOST)
37 |
38 | for msg in consumer:
39 | message = json.loads(msg.value)
40 | if 'prediction' in message:
41 | request_id = message['request_id']
42 | print("\033[1;32;40m ** CONSUMER: Received prediction {} for request id {}".format(message['prediction'], request_id))
43 |
44 |
45 | threads = []
46 | t = threading.Thread(target=start_producing)
47 | t2 = threading.Thread(target=start_consuming)
48 | threads.append(t)
49 | threads.append(t2)
50 | t.start()
51 | t2.start()
52 |
--------------------------------------------------------------------------------
/train/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/train/__init__.py
--------------------------------------------------------------------------------
/train/train_hyperopt.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import lightgbm as lgb
4 | import pickle
5 | import pdb
6 | import warnings
7 |
8 | from pathlib import Path
9 | from sklearn.metrics import f1_score
10 | from hyperopt import hp, tpe, fmin, Trials
11 |
12 |
13 | warnings.filterwarnings("ignore")
14 |
15 |
16 | def best_threshold(y_true, pred_proba, proba_range, verbose=False):
17 | """
18 | Function to find the probability threshold that optimises the f1_score
19 |
20 |     Comment: this function is not used in this exercise, but we include it in
21 | case the reader finds it useful
22 |
23 | Parameters:
24 | -----------
25 | y_true: numpy.ndarray
26 | array with the true labels
27 | pred_proba: numpy.ndarray
28 | array with the predicted probability
29 | proba_range: numpy.ndarray
30 | range of probabilities to explore.
31 | e.g. np.arange(0.1,0.9,0.01)
32 |
33 | Return:
34 | -----------
35 | tuple with the optimal threshold and the corresponding f1_score
36 | """
37 | scores = []
38 | for prob in proba_range:
39 | pred = [int(p>prob) for p in pred_proba]
40 | score = f1_score(y_true,pred)
41 | scores.append(score)
42 | if verbose:
43 | print("INFO: prob threshold: {}. score :{}".format(round(prob,3), round(score,5)))
44 | best_score = scores[np.argmax(scores)]
45 | optimal_threshold = proba_range[np.argmax(scores)]
46 | return (optimal_threshold, best_score)
47 |
48 |
49 | def lgb_f1_score(preds, lgbDataset):
50 | """
51 | Function to compute the f1_score to be used with lightgbm methods.
52 | Comments: output format must be:
53 | (eval_name, eval_result, is_higher_better)
54 |
55 | Parameters:
56 | -----------
57 | preds: np.array or List
58 | lgbDataset: lightgbm.Dataset
59 | """
60 | binary_preds = [int(p>0.5) for p in preds]
61 | y_true = lgbDataset.get_label()
62 | # lightgbm: (eval_name, eval_result, is_higher_better)
63 | return 'f1', f1_score(y_true, binary_preds), True
64 |
65 |
66 | class LGBOptimizer(object):
67 | def __init__(self, trainDataset, out_dir):
68 | """
69 | Hyper Parameter optimization
70 |
71 | Parameters:
72 | -----------
73 | trainDataset: FeatureTools object
74 | The result of running FeatureTools().fit()
75 | out_dir: pathlib.PosixPath
76 | Path to the output directory
77 | """
78 | self.PATH = out_dir
79 | self.early_stop_dict = {}
80 |
81 | self.X = trainDataset.data
82 | self.y = trainDataset.target
83 | self.colnames = trainDataset.colnames
84 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns
85 |
86 | self.lgtrain = lgb.Dataset(self.X,label=self.y,
87 | feature_name=self.colnames,
88 | categorical_feature = self.categorical_columns,
89 | free_raw_data=False)
90 |
91 | def optimize(self, maxevals=200, model_id=0):
92 |
93 | param_space = self.hyperparameter_space()
94 | objective = self.get_objective(self.lgtrain)
95 | objective.i=0
96 | trials = Trials()
97 | best = fmin(fn=objective,
98 | space=param_space,
99 | algo=tpe.suggest,
100 | max_evals=maxevals,
101 | trials=trials)
102 | best['num_boost_round'] = self.early_stop_dict[trials.best_trial['tid']]
103 | best['num_leaves'] = int(best['num_leaves'])
104 | best['verbose'] = -1
105 |
106 | # set the model with the best parameters, fit and save
107 | model = lgb.LGBMClassifier(**best)
108 | model.fit(self.lgtrain.data,
109 | self.lgtrain.label,
110 | feature_name=self.colnames,
111 | categorical_feature=self.categorical_columns)
112 |
113 | model_fname = 'model_{}_.p'.format(model_id)
114 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id)
115 |
116 | pickle.dump(model, open(self.PATH/model_fname, 'wb'))
117 | pickle.dump(best, open(self.PATH/best_experiment_fname, 'wb'))
118 |
119 | self.best = best
120 | self.model = model
121 |
122 |
123 | def get_objective(self, train):
124 |
125 | def objective(params):
126 | """
127 | objective function for lightgbm.
128 | """
129 | # hyperopt casts as float
130 | params['num_boost_round'] = int(params['num_boost_round'])
131 | params['num_leaves'] = int(params['num_leaves'])
132 |
133 | # need to be passed as parameter
134 | params['is_unbalance'] = True
135 | params['verbose'] = -1
136 | params['seed'] = 1
137 |
138 | cv_result = lgb.cv(
139 | params,
140 | train,
141 | num_boost_round=params['num_boost_round'],
142 | metrics='binary_logloss',
143 | # feval = lgb_f1_score,
144 | nfold=3,
145 | stratified=True,
146 | early_stopping_rounds=20)
147 | self.early_stop_dict[objective.i] = len(cv_result['binary_logloss-mean'])
148 | error = round(cv_result['binary_logloss-mean'][-1], 4)
149 | objective.i+=1
150 | return error
151 |
152 | return objective
153 |
154 | def hyperparameter_space(self, param_space=None):
155 |
156 | space = {
157 | 'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
158 | 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20),
159 | 'num_leaves': hp.quniform('num_leaves', 31, 255, 4),
160 | 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),
161 | 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),
162 | 'subsample': hp.uniform('subsample', 0.5, 1.),
163 | 'reg_alpha': hp.uniform('reg_alpha', 0.01, 0.1),
164 | 'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.1),
165 | }
166 |
167 | if param_space:
168 | return param_space
169 | else:
170 | return space
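171 |
172 |
173 | if __name__ == '__main__':
174 |     # Minimal usage sketch, not part of the original pipeline. It assumes the
175 |     # data/ layout used by the rest of the repo (data/train/train.csv plus the
176 |     # dataprocessors and models directories) and that it is run from the repo
177 |     # root, e.g. python -m train.train_hyperopt
178 |     from utils.preprocess_data import build_train
179 |
180 |     PATH = Path('data/')
181 |     dtrain = build_train(PATH/'train/train.csv', PATH/'dataprocessors')
182 |     opt = LGBOptimizer(dtrain, PATH/'models')
183 |     opt.optimize(maxevals=10, model_id=0)
184 |     print(opt.best)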
--------------------------------------------------------------------------------
/train/train_hyperopt_mlflow.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import lightgbm as lgb
4 | import pickle
5 | import pdb
6 | import warnings
7 | import mlflow
8 | import mlflow.sklearn
9 |
10 | from pathlib import Path
11 | from sklearn.metrics import f1_score
12 | from hyperopt import hp, tpe, fmin, Trials
13 | from mlflow.tracking import MlflowClient
14 |
15 |
16 | warnings.filterwarnings("ignore")
17 |
18 |
19 | def best_threshold(y_true, pred_proba, proba_range, verbose=False):
20 | """
21 | Function to find the probability threshold that optimises the f1_score
22 |
23 |     Comment: this function is not used in this exercise, but we include it in
24 | case the reader finds it useful
25 |
26 | Parameters:
27 | -----------
28 | y_true: numpy.ndarray
29 | array with the true labels
30 | pred_proba: numpy.ndarray
31 | array with the predicted probability
32 | proba_range: numpy.ndarray
33 | range of probabilities to explore.
34 | e.g. np.arange(0.1,0.9,0.01)
35 |
36 | Return:
37 | -----------
38 | tuple with the optimal threshold and the corresponding f1_score
39 | """
40 | scores = []
41 | for prob in proba_range:
42 | pred = [int(p>prob) for p in pred_proba]
43 | score = f1_score(y_true,pred)
44 | scores.append(score)
45 | if verbose:
46 | print("INFO: prob threshold: {}. score :{}".format(round(prob,3), round(score,5)))
47 | best_score = scores[np.argmax(scores)]
48 | optimal_threshold = proba_range[np.argmax(scores)]
49 | return (optimal_threshold, best_score)
50 |
51 |
52 | def lgb_f1_score(preds, lgbDataset):
53 | """
54 | Function to compute the f1_score to be used with lightgbm methods.
55 | Comments: output format must be:
56 | (eval_name, eval_result, is_higher_better)
57 |
58 | Parameters:
59 | -----------
60 | preds: np.array or List
61 | lgbDataset: lightgbm.Dataset
62 | """
63 | binary_preds = [int(p>0.5) for p in preds]
64 | y_true = lgbDataset.get_label()
65 | # lightgbm: (eval_name, eval_result, is_higher_better)
66 | return 'f1', f1_score(y_true, binary_preds), True
67 |
68 |
69 | class LGBOptimizer(object):
70 | def __init__(self, trainDataset, out_dir):
71 | """
72 | Hyper Parameter optimization
73 |
74 | Parameters:
75 | -----------
76 | trainDataset: FeatureTools object
77 | The result of running FeatureTools().fit()
78 | out_dir: pathlib.PosixPath
79 | Path to the output directory
80 | """
81 | self.PATH = out_dir
82 | self.early_stop_dict = {}
83 |
84 | self.X = trainDataset.data
85 | self.y = trainDataset.target
86 | self.colnames = trainDataset.colnames
87 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns
88 |
89 | self.lgtrain = lgb.Dataset(self.X,label=self.y,
90 | feature_name=self.colnames,
91 | categorical_feature = self.categorical_columns,
92 | free_raw_data=False)
93 |
94 | def optimize(self, maxevals=200, model_id=0, reuse_experiment=False):
95 |
96 | param_space = self.hyperparameter_space()
97 | objective = self.get_objective(self.lgtrain)
98 | objective.i=0
99 | trials = Trials()
100 | best = fmin(fn=objective,
101 | space=param_space,
102 | algo=tpe.suggest,
103 | max_evals=maxevals,
104 | trials=trials)
105 | best['num_boost_round'] = self.early_stop_dict[trials.best_trial['tid']]
106 | best['num_leaves'] = int(best['num_leaves'])
107 | best['verbose'] = -1
108 |
109 | # The next few lines are the only ones related to mlflow.
110 | if not Path('mlruns').exists():
111 | # here set the tracking_uri. If None then http://localhost:5000
112 | client = MlflowClient()
113 | n_experiments=0
114 | elif not reuse_experiment:
115 | client = MlflowClient()
116 | n_experiments = len(client.list_experiments())
117 | experiment_name = 'experiment_' + str(n_experiments)
118 | client.create_experiment(name=experiment_name)
119 | with mlflow.start_run(experiment_id=n_experiments):
120 | model = lgb.LGBMClassifier(**best)
121 | model.fit(self.lgtrain.data,
122 | self.lgtrain.label,
123 | feature_name=self.colnames,
124 | categorical_feature=self.categorical_columns)
125 | for name, value in best.items():
126 | mlflow.log_param(name, value)
127 | mlflow.log_metric('binary_logloss', trials.best_trial['result']['loss'])
128 | mlflow.sklearn.log_model(model, "model")
129 |
130 | model_fname = 'model_{}_.p'.format(model_id)
131 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id)
132 |
133 | pickle.dump(model, open(self.PATH/model_fname, 'wb'))
134 | pickle.dump(best, open(self.PATH/best_experiment_fname, 'wb'))
135 |
136 | self.best = best
137 | self.model = model
138 |
139 | def get_objective(self, train):
140 |
141 | def objective(params):
142 | """
143 | objective function for lightgbm.
144 | """
145 | # hyperopt casts as float
146 | params['num_boost_round'] = int(params['num_boost_round'])
147 | params['num_leaves'] = int(params['num_leaves'])
148 |
149 | # need to be passed as parameter
150 | params['is_unbalance'] = True
151 | params['verbose'] = -1
152 | params['seed'] = 1
153 |
154 | cv_result = lgb.cv(
155 | params,
156 | train,
157 | num_boost_round=params['num_boost_round'],
158 | metrics='binary_logloss',
159 | # feval = lgb_f1_score,
160 | nfold=3,
161 | stratified=True,
162 | early_stopping_rounds=20)
163 | self.early_stop_dict[objective.i] = len(cv_result['binary_logloss-mean'])
164 | error = cv_result['binary_logloss-mean'][-1]
165 | objective.i+=1
166 | return error
167 |
168 | return objective
169 |
170 | def hyperparameter_space(self, param_space=None):
171 |
172 | space = {
173 | 'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
174 | 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20),
175 | 'num_leaves': hp.quniform('num_leaves', 31, 256, 4),
176 | 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),
177 | 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),
178 | 'subsample': hp.uniform('subsample', 0.5, 1.),
179 | 'reg_alpha': hp.uniform('reg_alpha', 0.01, 0.1),
180 | 'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.1),
181 | }
182 |
183 | if param_space:
184 | return param_space
185 | else:
186 | return space
187 |
--------------------------------------------------------------------------------
/train/train_hyperparameterhunter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import warnings
4 | import pickle
5 | import json
6 | import lightgbm as lgb
7 |
8 | import pdb
9 |
10 | from pathlib import Path
11 | from hyperparameter_hunter import (Environment, CVExperiment,
12 | BayesianOptimization, Integer, Real, Categorical)
13 | from hyperparameter_hunter import optimization as opt
14 | from sklearn.model_selection import StratifiedKFold
15 |
16 | warnings.filterwarnings("ignore")
17 |
18 |
19 | class LGBOptimizer(object):
20 | def __init__(self, trainDataset, out_dir):
21 | """
22 | Hyper Parameter optimization
23 |
24 | Comments: Hyperparameter_hunter (hereafter HH) is a fantastic package
25 | (https://github.com/HunterMcGushion/hyperparameter_hunter) to avoid
26 |     wasting time as you optimise parameters. In the words of its author:
27 | "For so long, hyperparameter optimization has been such a time
28 | consuming process that just pointed you in a direction for further
29 | optimization, then you basically had to start over".
30 |
31 | Parameters:
32 | -----------
33 | trainDataset: FeatureTools object
34 | The result of running FeatureTools().fit()
35 | out_dir: Str
36 | Path to the output directory
37 | """
38 |
39 | self.PATH = str(out_dir)
40 | self.data = trainDataset.data
41 | self.data['target'] = trainDataset.target
42 | self.colnames = trainDataset.colnames
43 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns
44 |
45 | def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold,
46 | maxevals=200, do_predict_proba=None, model_id=0):
47 |
48 | params = self.hyperparameter_space()
49 | extra_params = self.extra_setup()
50 |
51 | env = Environment(
52 | train_dataset=self.data,
53 | results_path='HyperparameterHunterAssets',
54 | # results_path=self.PATH,
55 | metrics=[metrics],
56 | do_predict_proba = do_predict_proba,
57 | cv_type=cv_type,
58 | cv_params=dict(n_splits=n_splits),
59 | )
60 |
61 | # optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals)
62 | optimizer = opt.BayesianOptimization(iterations=maxevals)
63 | optimizer.set_experiment_guidelines(
64 | model_initializer=lgb.LGBMClassifier,
65 | model_init_params=params,
66 | model_extra_params=extra_params
67 | )
68 | optimizer.go()
69 |         # there are a few fixes on their way and the next few lines will soon
70 |         # be just one. At the moment, to access the best parameters one has to
71 |         # read them from disk
72 | best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\
73 | optimizer.best_experiment+'.json'
74 | with open(best_experiment) as best:
75 | best = json.loads(best.read())['hyperparameters']['model_init_params']
76 | model = lgb.LGBMClassifier(**best)
77 | X, y = self.data.drop('target',axis=1), self.data.target
78 | model.fit(X,y,
79 | feature_name=self.colnames,
80 | categorical_feature=self.categorical_columns
81 | )
82 | model_fname = 'model_{}_.p'.format(model_id)
83 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id)
84 | pickle.dump(model, open('/'.join([self.PATH,model_fname]), 'wb'))
85 | pickle.dump(optimizer, open('/'.join([self.PATH,best_experiment_fname]), 'wb'))
86 |
87 |
88 | def hyperparameter_space(self, param_space=None):
89 |
90 | space = dict(
91 | is_unbalance = True,
92 | learning_rate = Real(0.01, 0.3),
93 | num_boost_round=Categorical(np.arange(50, 500, 20)),
94 | num_leaves=Categorical(np.arange(31, 256, 4)),
95 | min_child_weight = Real(0.1, 10),
96 | colsample_bytree= Real(0.5, 1.),
97 | subsample=Real(0.5, 1.),
98 | reg_alpha= Real(0.01, 0.1),
99 | reg_lambda= Real(0.01, 0.1)
100 | )
101 |
102 | if param_space:
103 | return param_space
104 | else:
105 | return space
106 |
107 |
108 | def extra_setup(self, extra_setup=None):
109 |
110 | extra_params = dict(
111 | early_stopping_rounds=20,
112 | feature_name=self.colnames,
113 | categorical_feature=self.categorical_columns
114 | )
115 |
116 | if extra_setup:
117 | return extra_setup
118 | else:
119 | return extra_params
120 |
--------------------------------------------------------------------------------
/train/train_hyperparameterhunter_mlfow.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import warnings
4 | import pickle
5 | import json
6 | import lightgbm as lgb
7 | import mlflow
8 | import mlflow.sklearn
9 |
10 | import pdb
11 |
12 | from pathlib import Path
13 | from hyperparameter_hunter import (Environment, CVExperiment,
14 | BayesianOptimization, Integer, Real, Categorical)
15 | from hyperparameter_hunter import optimization as opt
16 | from sklearn.model_selection import StratifiedKFold
17 | from mlflow.tracking import MlflowClient
18 |
19 |
20 | warnings.filterwarnings("ignore")
21 |
22 |
23 | class LGBOptimizer(object):
24 | def __init__(self, trainDataset, out_dir):
25 | """
26 | Hyper Parameter optimization
27 |
28 | Comments: Hyperparameter_hunter (hereafter HH) is a fantastic package
29 | (https://github.com/HunterMcGushion/hyperparameter_hunter) to avoid
30 |     wasting time as you optimise parameters. In the words of its author:
31 | "For so long, hyperparameter optimization has been such a time
32 | consuming process that just pointed you in a direction for further
33 | optimization, then you basically had to start over".
34 |
35 | Parameters:
36 | -----------
37 | trainDataset: FeatureTools object
38 | The result of running FeatureTools().fit()
39 | out_dir: Str
40 | Path to the output directory
41 | """
42 |
43 | self.PATH = str(out_dir)
44 | self.data = trainDataset.data
45 | self.data['target'] = trainDataset.target
46 | self.colnames = trainDataset.colnames
47 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns
48 |
49 | def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold,
50 | maxevals=200, do_predict_proba=None, model_id=0, reuse_experiment=False):
51 |
52 | params = self.hyperparameter_space()
53 | extra_params = self.extra_setup()
54 |
55 | env = Environment(
56 | train_dataset=self.data,
57 | results_path='HyperparameterHunterAssets',
58 | # results_path=self.PATH,
59 | metrics=[metrics],
60 | do_predict_proba = do_predict_proba,
61 | cv_type=cv_type,
62 | cv_params=dict(n_splits=n_splits),
63 | )
64 |
65 | # optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals)
66 | optimizer = opt.BayesianOptimization(iterations=maxevals)
67 | optimizer.set_experiment_guidelines(
68 | model_initializer=lgb.LGBMClassifier,
69 | model_init_params=params,
70 | model_extra_params=extra_params
71 | )
72 | optimizer.go()
73 |
74 |         # there are a few fixes on their way and the next few lines will soon
75 |         # be just one. At the moment, to access the best parameters one has to
76 |         # read them from disk
77 | best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\
78 | optimizer.best_experiment+'.json'
79 | with open(best_experiment) as best:
80 | best = json.loads(best.read())['hyperparameters']['model_init_params']
81 |
82 | # The next few lines are the only ones related to mlflow
83 | if not Path('mlruns').exists():
84 | # here set the tracking_uri. If None then http://localhost:5000
85 | client = MlflowClient()
86 | n_experiments=0
87 | elif not reuse_experiment:
88 | client = MlflowClient()
89 | n_experiments = len(client.list_experiments())
90 | experiment_name = 'experiment_' + str(n_experiments)
91 | client.create_experiment(name=experiment_name)
92 | with mlflow.start_run(experiment_id=n_experiments):
93 | model = lgb.LGBMClassifier(**best)
94 | X, y = self.data.drop('target',axis=1), self.data.target
95 | model.fit(X,y,
96 | feature_name=self.colnames,
97 | categorical_feature=self.categorical_columns
98 | )
99 | for name, value in best.items():
100 | mlflow.log_param(name, value)
101 | mlflow.log_metric('f1_score', -optimizer.optimizer_result.fun)
102 | mlflow.sklearn.log_model(model, "model")
103 |
104 | model_fname = 'model_{}_.p'.format(model_id)
105 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id)
106 | pickle.dump(model, open('/'.join([self.PATH,model_fname]), 'wb'))
107 | pickle.dump(optimizer, open('/'.join([self.PATH,best_experiment_fname]), 'wb'))
108 |
109 |
110 | def hyperparameter_space(self, param_space=None):
111 |
112 | space = dict(
113 | is_unbalance = True,
114 | learning_rate = Real(0.01, 0.3),
115 | num_boost_round=Integer(50, 500),
116 | num_leaves=Integer(31, 255),
117 | min_child_weight = Real(0.1, 10),
118 | colsample_bytree= Real(0.5, 1.),
119 | subsample=Real(0.5, 1.),
120 | reg_alpha= Real(0.01, 0.1),
121 | reg_lambda= Real(0.01, 0.1)
122 | )
123 |
124 | if param_space:
125 | return param_space
126 | else:
127 | return space
128 |
129 | def extra_setup(self, extra_setup=None):
130 |
131 | extra_params = dict(
132 | early_stopping_rounds=20,
133 | feature_name=self.colnames,
134 | categorical_feature=self.categorical_columns
135 | )
136 |
137 | if extra_setup:
138 | return extra_setup
139 | else:
140 | return extra_params
141 |
--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
1 | import pdb
2 | import json
3 | import pandas as pd
4 | import pickle
5 | import argparse
6 |
7 | from pathlib import Path
8 | from kafka import KafkaConsumer
9 |
10 | from utils.messages_utils import publish_traininig_completed
11 | from utils.preprocess_data import build_train
12 |
13 |
14 | KAFKA_HOST = 'localhost:9092'
15 | RETRAIN_TOPIC = 'retrain_topic'
16 | PATH = Path('data/')
17 | TRAIN_DATA = PATH/'train/train.csv'
18 | DATAPROCESSORS_PATH = PATH/'dataprocessors'
19 | MODELS_PATH = PATH/'models'
20 | MESSAGES_PATH = PATH/'messages'
21 |
22 |
23 | def train(model_id, messages, hyper):
24 | print("RETRAINING STARTED (model id: {})".format(model_id))
25 | dtrain = build_train(TRAIN_DATA, DATAPROCESSORS_PATH, model_id, messages)
26 | if hyper == "hyperopt":
27 | # from train.train_hyperopt import LGBOptimizer
28 | from train.train_hyperopt_mlflow import LGBOptimizer
29 | elif hyper == "hyperparameterhunter":
30 | # from train.train_hyperparameterhunter import LGBOptimizer
31 | from train.train_hyperparameterhunter_mlfow import LGBOptimizer
32 | LGBOpt = LGBOptimizer(dtrain, MODELS_PATH)
33 | LGBOpt.optimize(maxevals=2, model_id=model_id)
34 | print("RETRAINING COMPLETED (model id: {})".format(model_id))
35 |
36 |
37 | def start(hyper):
38 | consumer = KafkaConsumer(RETRAIN_TOPIC, bootstrap_servers=KAFKA_HOST)
39 |
40 | for msg in consumer:
41 | message = json.loads(msg.value)
42 | if 'retrain' in message and message['retrain']:
43 | model_id = message['model_id']
44 | batch_id = message['batch_id']
45 | message_fname = 'messages_{}_.txt'.format(batch_id)
46 | messages = MESSAGES_PATH/message_fname
47 |
48 | train(model_id, messages, hyper)
49 | publish_traininig_completed(model_id)
50 |
51 |
52 | if __name__ == '__main__':
53 | parser = argparse.ArgumentParser()
54 |
55 | parser.add_argument("--hyper", type=str, default="hyperopt")
56 | args = parser.parse_args()
57 |
58 | start(args.hyper)
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/utils/__init__.py
--------------------------------------------------------------------------------
/utils/feature_tools.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import copy
3 |
4 |
5 | class FeatureTools(object):
6 | """Collection of preprocessing methods"""
7 |
8 | @staticmethod
9 | def num_scaler(df_inp, cols, sc, trained=False):
10 | """
11 | Method to scale numeric columns in a dataframe
12 |
13 | Parameters:
14 | -----------
15 | df_inp: Pandas.DataFrame
16 | cols: List
17 | List of numeric columns to be scaled
18 | sc: Scaler object. From sklearn.preprocessing or similar structure
19 | trained: Boolean
20 | If True it will only be used to 'transform'
21 |
22 | Returns:
23 | --------
24 | df: Pandas.DataFrame
25 | transformed/normalised dataframe
26 | sc: trained scaler
27 | """
28 | df = df_inp.copy()
29 | if not trained:
30 | df[cols] = sc.fit_transform(df[cols])
31 | else:
32 | df[cols] = sc.transform(df[cols])
33 | return df, sc
34 |
35 | @staticmethod
36 | def cross_columns(df_inp, x_cols):
37 | """
38 | Method to build crossed columns. These are new columns that are the
39 | cartesian product of the parent columns.
40 |
41 | Parameters:
42 | -----------
43 | df_inp: Pandas.DataFrame
44 | x_cols: List.
45 | List of tuples with the columns to cross
46 | e.g. [('colA', 'colB'),('colC', 'colD')]
47 |
48 | Returns:
49 | --------
50 | df: Pandas.DataFrame
51 | pandas dataframe with the new crossed columns
52 | colnames: List
53 |             list with the new column names
54 | """
55 | df = df_inp.copy()
56 | colnames = ['_'.join(x_c) for x_c in x_cols]
57 | crossed_columns = {k:v for k,v in zip(colnames, x_cols)}
58 |
59 | for k, v in crossed_columns.items():
60 | df[k] = df[v].apply(lambda x: '-'.join(x), axis=1)
61 |
62 | return df, colnames
63 |
64 | @staticmethod
65 | def val2idx(df_inp, cols, val_to_idx=None):
66 | """
67 | This is basically a LabelEncoder that returns a dictionary with the
68 | mapping of the labels.
69 |
70 | Parameters:
71 | -----------
72 | df_inp: Pandas.DataFrame
73 | cols: List
74 | List of categorical columns to encode
75 | val_to_idx: Dict
76 | LabelEncoding dictionary if already exists
77 |
78 | Returns:
79 | --------
80 | df: Pandas.DataFrame
81 | pandas dataframe with the categorical columns encoded
82 | val_to_idx: Dict
83 | dictionary with the encoding mappings
84 | """
85 | df = df_inp.copy()
86 | if not val_to_idx:
87 |
88 | val_types = dict()
89 | for c in cols:
90 | val_types[c] = df[c].unique()
91 |
92 | val_to_idx = dict()
93 | for k, v in val_types.items():
94 | val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}
95 |
96 | for k, v in val_to_idx.items():
97 | df[k] = df[k].apply(lambda x: v[x])
98 |
99 | return df, val_to_idx
100 |
101 | def fit(self, df_inp, target_col, numerical_columns, categorical_columns, x_columns, sc):
102 | """
103 | Parameters:
104 | -----------
105 | df_inp: Pandas.DataFrame
106 | target_col: Str
107 | numerical_columns: List
108 | List with the numerical columns
109 | categorical_columns: List
110 | List with the categorical columns
111 | x_columns: List
112 | List of tuples with the columns to cross
113 | sc: Scaler. From sklearn.preprocessing or object with the same
114 | structure
115 | """
116 | df = df_inp.copy()
117 | self.numerical_columns = numerical_columns
118 | self.categorical_columns = categorical_columns
119 | self.x_columns = x_columns
120 |
121 | df, self.sc = self.num_scaler(df, numerical_columns, sc)
122 | df, self.crossed_columns = self.cross_columns(df, x_columns)
123 | df, self.encoding_d = self.val2idx(df, categorical_columns+self.crossed_columns)
124 |
125 | self.target = df[target_col]
126 | df.drop(target_col, axis=1, inplace=True)
127 | self.data = df
128 | self.colnames = df.columns.tolist()
129 |
130 | return self
131 |
132 | def transform(self, df_inp, trained_sc=None):
133 | """
134 | Parameters:
135 | -----------
136 | df_inp: Pandas.DataFrame
137 |         trained_sc: Scaler. From sklearn.preprocessing or object with the same structure
138 |
139 | Returns:
140 | --------
141 | df: Pandas.DataFrame
142 |             Transformed dataframe: scaled, label-encoded and with crossed columns
143 | """
144 | df = df_inp.copy()
145 | if trained_sc:
146 | sc = copy.deepcopy(trained_sc)
147 | else:
148 | sc = copy.deepcopy(self.sc)
149 |
150 | df, _ = self.num_scaler(df, self.numerical_columns, sc, trained=True)
151 | df, _ = self.cross_columns(df, self.x_columns)
152 | df, _ = self.val2idx(df, self.categorical_columns+self.crossed_columns, self.encoding_d)
153 |
154 | return df
155 |
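156 |
157 | if __name__ == '__main__':
158 |     # Minimal sketch, not part of the pipeline: a toy dataframe with made-up
159 |     # values to illustrate fit() and transform(). Requires scikit-learn.
160 |     from sklearn.preprocessing import MinMaxScaler
161 |
162 |     toy = pd.DataFrame({
163 |         'age': [25, 40, 61],
164 |         'education': ['Bachelors', 'Masters', 'HS-grad'],
165 |         'occupation': ['Sales', 'Tech-support', 'Sales'],
166 |         'income_label': [0, 1, 0]})
167 |     ft = FeatureTools().fit(
168 |         toy, 'income_label',
169 |         numerical_columns=['age'],
170 |         categorical_columns=['education', 'occupation'],
171 |         x_columns=[['education', 'occupation']],
172 |         sc=MinMaxScaler())
173 |     print(ft.data)
174 |     print(ft.transform(toy.drop('income_label', axis=1)))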
--------------------------------------------------------------------------------
/utils/messages_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pickle
3 |
4 | from kafka import KafkaProducer
5 |
6 | producer = KafkaProducer(bootstrap_servers='localhost:9092')
7 |
8 | def publish_prediction(pred, request_id):
9 | producer.send('app_messages', json.dumps({'request_id': request_id, 'prediction': float(pred)}).encode('utf-8'))
10 | producer.flush()
11 |
12 |
13 | def publish_traininig_completed(model_id):
14 | producer.send('retrain_topic', json.dumps({'training_completed': True, 'model_id': model_id}).encode('utf-8'))
15 | producer.flush()
16 |
17 |
18 | def read_messages_count(path, repeat_every):
19 |     file_list = sorted(path.iterdir(), key=lambda f: f.stat().st_mtime)  # oldest to newest
20 | nfiles = len(file_list)
21 | if nfiles==0:
22 | return 0
23 | else:
24 | return ((nfiles-1)*repeat_every) + len(file_list[-1].open().readlines())
25 |
26 |
27 | def append_message(message, path, batch_id):
28 | message_fname = 'messages_{}_.txt'.format(batch_id)
29 | f=open(path/message_fname, "a")
30 | f.write("%s\n" % (json.dumps(message)))
31 | f.close()
32 |
33 |
34 | def send_retrain_message(model_id, batch_id):
35 | producer.send('retrain_topic', json.dumps({'retrain': True, 'model_id': model_id, 'batch_id': batch_id}).encode('utf-8'))
36 | producer.flush()
--------------------------------------------------------------------------------
/utils/preprocess_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pickle
3 | import json
4 | import pdb
5 | import warnings
6 |
7 | from pathlib import Path
8 | from utils.feature_tools import FeatureTools
9 | from sklearn.preprocessing import MinMaxScaler
10 |
11 | warnings.filterwarnings("ignore")
12 |
13 |
14 | def load_new_training_data(path):
15 | data = []
16 | with open(path, "r") as f:
17 | for line in f:
18 | data.append(json.loads(line))
19 | return pd.DataFrame(data)
20 |
21 |
22 | def build_train(train_path, results_path, dataprocessor_id=0, PATH_2=None):
23 | target = 'income_label'
24 | # read initial DataFrame
25 | df = pd.read_csv(train_path)
26 | if PATH_2:
27 | df_tmp = load_new_training_data(PATH_2)
28 | # Let's make sure columns are in the same order
29 | df_tmp = df_tmp[df.columns]
30 | # append new DataFrame
31 | df = pd.concat([df, df_tmp], ignore_index=True)
32 | # Save it to disk
33 | df.to_csv(train_path, index=False)
34 |
35 | df[target] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)
36 | df.drop('income_bracket', axis=1, inplace=True)
37 |
38 | categorical_columns = list(df.select_dtypes(include=['object']).columns)
39 | numerical_columns = [c for c in df.columns if c not in categorical_columns+[target]]
40 | crossed_columns = (['education', 'occupation'], ['native_country', 'occupation'])
41 |
42 | preprocessor = FeatureTools()
43 | dataprocessor = preprocessor.fit(
44 | df,
45 | target,
46 | numerical_columns,
47 | categorical_columns,
48 | crossed_columns,
49 | sc=MinMaxScaler()
50 | )
51 |
52 | dataprocessor_fname = 'dataprocessor_{}_.p'.format(dataprocessor_id)
53 | pickle.dump(dataprocessor, open(results_path/dataprocessor_fname, "wb"))
54 | if dataprocessor_id==0:
55 | pickle.dump(df.columns.tolist()[:-1], open(results_path/'column_order.p', "wb"))
56 |
57 | return dataprocessor
58 |
59 |
60 | # if __name__ == '__main__':
61 |
62 | # PATH = Path('data/')
63 | # TRAIN_PATH = PATH/'train'
64 | # DATAPROCESSORS_PATH = PATH/'dataprocessors'
65 |
66 | # dataprocessor = build_train(TRAIN_PATH/'train.csv', DATAPROCESSORS_PATH)
67 |
68 |
--------------------------------------------------------------------------------