├── .gitignore ├── README.md ├── images ├── model_retrained.png ├── pipeline_diagram.png └── start_pipeline.gif ├── initialize.py ├── notebooks ├── .ipynb_checkpoints │ └── skopt_vs_hyperopt-checkpoint.ipynb └── skopt_vs_hyperopt.ipynb ├── predictor.py ├── requirements.txt ├── sample_app.py ├── train ├── __init__.py ├── train_hyperopt.py ├── train_hyperopt_mlflow.py ├── train_hyperparameterhunter.py └── train_hyperparameterhunter_mlfow.py ├── trainer.py └── utils ├── __init__.py ├── feature_tools.py ├── messages_utils.py └── preprocess_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 127 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 128 | .idea/ 129 | # User-specific stuff 130 | .idea/**/workspace.xml 131 | .idea/**/tasks.xml 132 | .idea/**/usage.statistics.xml 133 | .idea/**/dictionaries 134 | .idea/**/shelf 135 | 136 | # Generated files 137 | .idea/**/contentModel.xml 138 | 139 | # Sensitive or high-churn files 140 | .idea/**/dataSources/ 141 | .idea/**/dataSources.ids 142 | .idea/**/dataSources.local.xml 143 | .idea/**/sqlDataSources.xml 144 | .idea/**/dynamic.xml 145 | .idea/**/uiDesigner.xml 146 | .idea/**/dbnavigator.xml 147 | 148 | # Gradle 149 | .idea/**/gradle.xml 150 | .idea/**/libraries 151 | 152 | # Gradle and Maven with auto-import 153 | # When using Gradle or Maven with auto-import, you should exclude module files, 154 | # since they will be recreated, and may cause churn. Uncomment if using 155 | # auto-import. 156 | # .idea/modules.xml 157 | # .idea/*.iml 158 | # .idea/modules 159 | # *.iml 160 | # *.ipr 161 | 162 | # CMake 163 | cmake-build-*/ 164 | 165 | # Mongo Explorer plugin 166 | .idea/**/mongoSettings.xml 167 | 168 | # File-based project format 169 | *.iws 170 | 171 | # IntelliJ 172 | out/ 173 | 174 | # mpeltonen/sbt-idea plugin 175 | .idea_modules/ 176 | 177 | # JIRA plugin 178 | atlassian-ide-plugin.xml 179 | 180 | # Cursive Clojure plugin 181 | .idea/replstate.xml 182 | 183 | # Crashlytics plugin (for Android Studio and IntelliJ) 184 | com_crashlytics_export_strings.xml 185 | crashlytics.properties 186 | crashlytics-build.properties 187 | fabric.properties 188 | 189 | # Editor-based Rest Client 190 | .idea/httpRequests 191 | 192 | # Android studio 3.1+ serialized cache file 193 | .idea/caches/build_file_checksums.ser 194 | 195 | data/ 196 | mlruns/ 197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Putting ML in Production 2 | This repo contains code that we hope is useful to illustrate how one could productionise a real-time algorithm. The companion Medium posts can be found [here](https://medium.com/@jrzaurin/putting-ml-in-production-i-using-apache-kafka-in-python-ce06b3a395c8) and [here](https://towardsdatascience.com/putting-ml-in-production-ii-logging-and-monitoring-algorithms-91f174044e4e). The code here is meant to be as generic as possible (within certain limits), and is designed to be useful in a scenario similar to the following one. 3 | 4 | ## Scenario 5 | 6 | A company collects data using a series of services that generate events as the users/customers interact with the company’s website or app. As these interactions happen, an algorithm needs to run in real time and some immediate action needs to be taken based on the algorithm’s outputs (or predictions).
On top of that, after *N* interactions (or observations) the algorithm needs to be retrained without stopping the prediction service, since users will keep interacting. 7 | 8 | For the exercise here we have used the [Adult](https://archive.ics.uci.edu/ml/datasets/adult) dataset, where the goal is to predict whether individuals earn an income higher/lower than 50k based on their age, native country, etc. To adapt this dataset to the scenario described before, one could assume that age, native country, etc. are collected through an online questionnaire/form and we need to predict whether users have high/low income in real time. If the income is high, we immediately call/email them with some offer, for example. Then, after N new observations we retrain the algorithm while we keep predicting on new users. 9 | 10 | ## Solution 11 | 12 | The online part of our solution is illustrated in the figure below, and uses mainly [Kafka-Python](https://github.com/dpkp/kafka-python), [LightGBM](https://lightgbm.readthedocs.io/en/latest/#) and [Hyperopt](http://hyperopt.github.io/hyperopt/) or [HyperparameterHunter](https://github.com/HunterMcGushion/hyperparameter_hunter). 13 | 14 | ![Figure 1. Real-time prediction ML pipeline.](images/pipeline_diagram.png) 15 | 16 | A full description of the solution can be found in the already mentioned Medium posts. Briefly: 17 | 18 | **OFFLINE TRAINING** 19 | 20 | The offline process is fairly standard and is all accomplished by running the `initialize.py` script. This script will download the dataset, set up the directory structure, preprocess the data, train an initial model on the training dataset and optimise the hyperparameters of that model. The results will be saved to disk, and from there on we are ready to move to the online stage of the process. 21 | 22 | **ONLINE PREDICTIONS AND RETRAINING** 23 | 24 | 0. The App/Service (`sample_app.py`) will send messages (JSON) into the pipeline. These will be processed, and the App/Service will then get the results of the predictions. 25 | 1. 1a) The messages from the App/Service will be published to Kafka and, eventually, received by the Predictor (`predictor.py`). 26 | 27 | 1b) The Predictor will process the data and run the algorithm, publishing a message with the prediction result back to Kafka, which will eventually be received by the App/Service. 28 | 2. After N messages the Predictor will publish a "retrain topic" message. 29 | 3. The Trainer (`trainer.py`) will receive the "retrain topic" message and start retraining the algorithm. In the meantime, the Predictor will not stop serving predictions. 30 | 4. Once the algorithm is retrained, the Trainer will publish a message with the corresponding information (namely: *"retraining completed"*). 31 | 5. The Predictor will receive the message that retraining is complete, load the new model and proceed as usual. 32 |
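To make the message flow above concrete, here is a stripped-down sketch of the Predictor's consume → predict → publish loop. The topic names, message fields and pickled artifacts match `predictor.py` further down in this repo; the model reload is simplified to a fixed filename (the real code rotates model ids and keeps message counters), and error handling is omitted.

```python
import json
import pickle

import pandas as pd
from kafka import KafkaConsumer, KafkaProducer

KAFKA_HOST = 'localhost:9092'

# artifacts created offline by initialize.py
model = pickle.load(open('data/models/model_0_.p', 'rb'))
dataprocessor = pickle.load(open('data/dataprocessors/dataprocessor_0_.p', 'rb'))

consumer = KafkaConsumer(bootstrap_servers=KAFKA_HOST)
consumer.subscribe(['app_messages', 'retrain_topic'])
producer = KafkaProducer(bootstrap_servers=KAFKA_HOST)

for msg in consumer:
    message = json.loads(msg.value)
    if msg.topic == 'retrain_topic' and message.get('training_completed'):
        # the Trainer finished: hot-swap the freshly trained model
        model = pickle.load(open('data/models/model_1_.p', 'rb'))
    elif msg.topic == 'app_messages' and 'prediction' not in message:
        # incoming observation: preprocess, predict and publish the result
        row = pd.DataFrame(message['data'], index=[0]).drop('income_bracket', axis=1)
        pred = model.predict(dataprocessor.transform(row))[0]
        payload = {'request_id': message['request_id'], 'prediction': float(pred)}
        producer.send('app_messages', json.dumps(payload).encode('utf-8'))
        producer.flush()
```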
33 | ## How to run the pipeline 34 | 35 | 1. Run `initialize.py`: 36 | ``` 37 | python initialize.py 38 | ``` 39 | or 40 | ``` 41 | python initialize.py --hyper hyperparameterhunter 42 | ``` 43 | HyperparameterHunter is built on top of [Skopt](https://scikit-optimize.github.io/). It is not our goal here to compare hyperparameter optimization packages. Nonetheless, a brief comparison is included in the Medium post, and a notebook comparing the performance of Skopt and Hyperopt is included here, in the notebooks directory. 44 | 45 | 2. Start Zookeeper and Kafka. Assuming these are installed using Homebrew, starting these services is as easy as: 46 | ``` 47 | $ brew services start zookeeper 48 | ==> Successfully started `zookeeper` (label: homebrew.mxcl.zookeeper) 49 | $ brew services start kafka 50 | ==> Successfully started `kafka` (label: homebrew.mxcl.kafka) 51 | ``` 52 | 53 | 3. In Terminal #1 run the Predictor (or the Trainer): 54 | ``` 55 | python predictor.py 56 | ``` 57 | 4. In Terminal #2 run the Trainer (or the Predictor): 58 | ``` 59 | python trainer.py 60 | ``` 61 | or 62 | ``` 63 | python trainer.py --hyper hyperparameterhunter 64 | ``` 65 | 5. In Terminal #3 run the Sample App: 66 | ``` 67 | python sample_app.py 68 | ``` 69 | 70 | Below we have included a GIF showing steps 3, 4 and 5. 71 | 72 | 73 | ![Figure 2. How to launch the Pipeline](images/start_pipeline.gif) 74 | 75 | After `RETRAIN_EVERY` messages (a parameter set by the user), you will be able to see how the algorithm is retrained in the terminal, as shown in the figure below. The top-right terminal shows how Hyperopt has run 10 evaluations (in a real exercise this should be a few hundred). Once the model is retrained and optimised, we see in the top-left window how the Predictor has loaded the new model (after the annoying warning message from the new LightGBM version) and proceeds with the prediction service as usual. 76 | 77 | ![Figure 3. Retraining process](images/model_retrained.png) 78 | 79 | 80 | ## Logging and monitoring 81 | To log all the information generated from the pipeline as it retrains the algorithm, one could directly use HyperparameterHunter, which is fantastic precisely for that task. In addition, we have also used MLflow, which comes with a very convenient UI. [Our second post](https://towardsdatascience.com/putting-ml-in-production-ii-logging-and-monitoring-algorithms-91f174044e4e) focuses on the interplay of these two tools. All the related code can be found in the `train` module, within `train_hyperopt_mlflow.py` or `train_hyperparameterhunter_mlfow.py`.
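For illustration only (this is not the exact code in `train/train_hyperopt_mlflow.py`), wrapping each Hyperopt evaluation in an MLflow run, so that every set of hyperparameters and its score shows up in the MLflow UI, looks roughly like this. Here `train_and_score` is a hypothetical helper standing in for the cross-validation the training scripts actually perform.

```python
# Rough sketch: log every Hyperopt evaluation as a separate MLflow run.
# `train_and_score` is a hypothetical stand-in for the repo's cross-validation.
import mlflow
from hyperopt import Trials, fmin, hp, tpe


def objective(params):
    with mlflow.start_run():
        for name, value in params.items():
            mlflow.log_param(name, value)   # hyperparameters of this evaluation
        loss = train_and_score(params)      # e.g. 1 - mean CV F1 score
        mlflow.log_metric('loss', loss)
    return loss


space = {'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
         'num_leaves': hp.quniform('num_leaves', 31, 255, 4)}
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=Trials())
# browse the logged runs with: mlflow ui
```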
82 | 83 | Comments or suggestions, please email: jrzaurin@gmail.com 84 | -------------------------------------------------------------------------------- /images/model_retrained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/model_retrained.png -------------------------------------------------------------------------------- /images/pipeline_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/pipeline_diagram.png -------------------------------------------------------------------------------- /images/start_pipeline.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/start_pipeline.gif -------------------------------------------------------------------------------- /initialize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import lightgbm as lgb 4 | import pickle 5 | import warnings 6 | import argparse 7 | import os 8 | import pdb 9 | 10 | from pathlib import Path 11 | from utils.preprocess_data import build_train 12 | 13 | 14 | PATH = Path('data/') 15 | TRAIN_PATH = PATH/'train' 16 | DATAPROCESSORS_PATH = PATH/'dataprocessors' 17 | MODELS_PATH = PATH/'models' 18 | MESSAGES_PATH = PATH/'messages' 19 | 20 | 21 | def create_folders(): 22 | print("creating directory structure...") 23 | (PATH).mkdir(exist_ok=True) 24 | (TRAIN_PATH).mkdir(exist_ok=True) 25 | (MODELS_PATH).mkdir(exist_ok=True) 26 | (DATAPROCESSORS_PATH).mkdir(exist_ok=True) 27 | (MESSAGES_PATH).mkdir(exist_ok=True) 28 | 29 | 30 | def download_data(): 31 | train_path = PATH/'adult.data' 32 | test_path = PATH/'adult.test' 33 | 34 | COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", 35 | "marital_status", "occupation", "relationship", "race", "gender", 36 | "capital_gain", "capital_loss", "hours_per_week", "native_country", 37 | "income_bracket"] 38 | 39 | print("downloading training data...") 40 | df_train = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data", 41 | names=COLUMNS, skipinitialspace=True, index_col=0) 42 | df_train.drop("education_num", axis=1, inplace=True) 43 | df_train.to_csv(train_path) 44 | df_train.to_csv(PATH/'train/train.csv') 45 | 46 | print("downloading testing data...") 47 | df_test = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test", 48 | names=COLUMNS, skipinitialspace=True, skiprows=1, index_col=0) 49 | df_test.drop("education_num", axis=1, inplace=True) 50 | df_test.to_csv(test_path) 51 | 52 | 53 | def create_data_processor(): 54 | print("creating preprocessor...") 55 | dataprocessor = build_train(TRAIN_PATH/'train.csv', DATAPROCESSORS_PATH) 56 | 57 | 58 | def create_model(hyper): 59 | print("creating model...") 60 | init_dataprocessor = 'dataprocessor_0_.p' 61 | dtrain = pickle.load(open(DATAPROCESSORS_PATH/init_dataprocessor, 'rb')) 62 | if hyper == "hyperopt": 63 | # from train.train_hyperopt import LGBOptimizer 64 | from train.train_hyperopt_mlflow import LGBOptimizer 65 | elif hyper == "hyperparameterhunter": 66 | # from train.train_hyperparameterhunter import LGBOptimizer 67 | from 
train.train_hyperparameterhunter_mlfow import LGBOptimizer 68 | LGBOpt = LGBOptimizer(dtrain, MODELS_PATH) 69 | LGBOpt.optimize(maxevals=50) 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | parser = argparse.ArgumentParser() 75 | 76 | parser.add_argument("--hyper", type=str, default="hyperopt") 77 | args = parser.parse_args() 78 | create_folders() 79 | download_data() 80 | create_data_processor() 81 | create_model(args.hyper) -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/skopt_vs_hyperopt-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Skopt vs Hyperopt\n", 8 | "\n", 9 | "## Importing and preprocessing data" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "/usr/local/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n", 22 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 23 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 24 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 25 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import numpy as np\n", 32 | "import pickle\n", 33 | "import lightgbm as lgb\n", 34 | "import warnings\n", 35 | "\n", 36 | "from time import time\n", 37 | "from hyperopt import hp, tpe, fmin, Trials\n", 38 | "from skopt import BayesSearchCV\n", 39 | "from skopt.space import Real, Categorical, Integer\n", 40 | "from skopt import gbrt_minimize\n", 41 | "from sklearn.model_selection import StratifiedKFold, cross_val_score\n", 42 | "from sklearn.metrics import log_loss\n", 43 | "from utils import FeatureTools\n", 44 | "\n", 45 | "warnings.filterwarnings(\"ignore\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
ageworkclassfnlwgteducationmarital_statusoccupationrelationshipracegendercapital_gaincapital_losshours_per_weeknative_countrytarget
039State-gov77516BachelorsNever-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States0
150Self-emp-not-inc83311BachelorsMarried-civ-spouseExec-managerialHusbandWhiteMale0013United-States0
238Private215646HS-gradDivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States0
353Private23472111thMarried-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States0
428Private338409BachelorsMarried-civ-spouseProf-specialtyWifeBlackFemale0040Cuba0
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " age workclass fnlwgt education marital_status \\\n", 182 | "0 39 State-gov 77516 Bachelors Never-married \n", 183 | "1 50 Self-emp-not-inc 83311 Bachelors Married-civ-spouse \n", 184 | "2 38 Private 215646 HS-grad Divorced \n", 185 | "3 53 Private 234721 11th Married-civ-spouse \n", 186 | "4 28 Private 338409 Bachelors Married-civ-spouse \n", 187 | "\n", 188 | " occupation relationship race gender capital_gain \\\n", 189 | "0 Adm-clerical Not-in-family White Male 2174 \n", 190 | "1 Exec-managerial Husband White Male 0 \n", 191 | "2 Handlers-cleaners Not-in-family White Male 0 \n", 192 | "3 Handlers-cleaners Husband Black Male 0 \n", 193 | "4 Prof-specialty Wife Black Female 0 \n", 194 | "\n", 195 | " capital_loss hours_per_week native_country target \n", 196 | "0 0 40 United-States 0 \n", 197 | "1 0 13 United-States 0 \n", 198 | "2 0 40 United-States 0 \n", 199 | "3 0 40 United-States 0 \n", 200 | "4 0 40 Cuba 0 " 201 | ] 202 | }, 203 | "execution_count": 2, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "df = pd.read_csv(\"data/adult.data\")\n", 210 | "df['target'] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)\n", 211 | "df.drop('income_bracket', axis=1, inplace=True)\n", 212 | "df.head()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "I have coded a preprocessor class before that does the work for us." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 3, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "the features column names are: ['age', 'workclass', 'fnlwgt', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'education_occupation', 'native_country_occupation']\n", 232 | "the categorical columns are: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'education_occupation', 'native_country_occupation']\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "dataprocessor = pickle.load(open(\"data/dataprocessors/dataprocessor_0_.p\", \"rb\"))\n", 238 | "all_features = dataprocessor.colnames\n", 239 | "categorical_features = dataprocessor.cat_cols + dataprocessor.crossed_columns\n", 240 | "\n", 241 | "print(\"the features column names are: {}\".format(all_features))\n", 242 | "print(\"the categorical columns are: {}\".format(categorical_features))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "the `dataprocessor` is already train, so we simply need to `transform`" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 4, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/html": [ 260 | "
\n", 261 | "\n", 274 | "\n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
ageworkclassfnlwgteducationmarital_statusoccupationrelationshipracegendercapital_gaincapital_losshours_per_weeknative_countrytargeteducation_occupationnative_country_occupation
00.30137000.0443020000000.021740.00.3979590000
10.45205510.0482380111000.000000.00.1224490011
20.28767120.1381131220000.000000.00.3979590022
30.49315120.1510682121100.000000.00.3979590032
40.15068520.2214880132110.000000.00.3979591043
\n", 394 | "
" 395 | ], 396 | "text/plain": [ 397 | " age workclass fnlwgt education marital_status occupation \\\n", 398 | "0 0.301370 0 0.044302 0 0 0 \n", 399 | "1 0.452055 1 0.048238 0 1 1 \n", 400 | "2 0.287671 2 0.138113 1 2 2 \n", 401 | "3 0.493151 2 0.151068 2 1 2 \n", 402 | "4 0.150685 2 0.221488 0 1 3 \n", 403 | "\n", 404 | " relationship race gender capital_gain capital_loss hours_per_week \\\n", 405 | "0 0 0 0 0.02174 0.0 0.397959 \n", 406 | "1 1 0 0 0.00000 0.0 0.122449 \n", 407 | "2 0 0 0 0.00000 0.0 0.397959 \n", 408 | "3 1 1 0 0.00000 0.0 0.397959 \n", 409 | "4 2 1 1 0.00000 0.0 0.397959 \n", 410 | "\n", 411 | " native_country target education_occupation native_country_occupation \n", 412 | "0 0 0 0 0 \n", 413 | "1 0 0 1 1 \n", 414 | "2 0 0 2 2 \n", 415 | "3 0 0 3 2 \n", 416 | "4 1 0 4 3 " 417 | ] 418 | }, 419 | "execution_count": 4, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "train_data = dataprocessor.transform(df)\n", 426 | "\n", 427 | "train_data.head()" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 6, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# np arrays\n", 437 | "X_train = train_data[[c for c in train_data.columns if c is not 'target']].values\n", 438 | "y_train = train_data['target'].values\n", 439 | "\n", 440 | "# lgb Dataset object\n", 441 | "lgtrain = lgb.Dataset(X_train,\n", 442 | " label=y_train,\n", 443 | " feature_name=all_features,\n", 444 | " categorical_feature=categorical_features,\n", 445 | " free_raw_data=False)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 7, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# model and fit params\n", 455 | "params = dict(learning_rate=0.01,\n", 456 | " num_boost_round=300,\n", 457 | " num_leaves = 255,\n", 458 | " verbose=-1,\n", 459 | " is_unbalance=True)\n", 460 | "fit_params = dict(feature_name=all_features,\n", 461 | " categorical_feature=categorical_features)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "## 1. First experiment. Sklearn wrap up vs lightgbm methods" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 12, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "7.932429075241089\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "clf = lgb.LGBMClassifier(**params, silent=True)\n", 486 | "start = time()\n", 487 | "score = cross_val_score(clf,\n", 488 | " X_train, y_train,\n", 489 | " scoring='neg_log_loss',\n", 490 | " cv=StratifiedKFold(random_state=1981),\n", 491 | " fit_params=fit_params)\n", 492 | "sklearn_runtime = time() - start\n", 493 | "print(sklearn_runtime)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 13, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "7.038502931594849\n" 506 | ] 507 | } 508 | ], 509 | "source": [ 510 | "start = time()\n", 511 | "cv_result = lgb.cv(params,\n", 512 | " lgtrain,\n", 513 | " metrics='binary_logloss',\n", 514 | " nfold=3,\n", 515 | " stratified=True, \n", 516 | " seed=1981)\n", 517 | "lightgbm_runtime = time() - start\n", 518 | "print(lightgbm_runtime)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "LightGBM methods seem to be a bit faster. 
Let's now compare `Hyperopt` and `Skopt`" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## Hyperopt vs Skopt" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "The first thing to comment is that while Hyperopt offers the `hp.quniform(label, low, high, q)` parameter expressions, there is not such a thing for Skopt. One has `Categorical`, but you have to pass all values. In other words, When using hyperopt one could use:\n", 540 | "\n", 541 | " 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20)\n", 542 | "\n", 543 | "but when using Skopt one would have to do:\n", 544 | "\n", 545 | " Categorical(np.arange(50, 500, 20))\n", 546 | " \n", 547 | "Because I want to keep the comparison as light and direct as possible, I will just use `Real` parameters with uniform distributions." 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "### 1. Hyperopt\n", 555 | "\n", 556 | "With Hyperopt we will use the [TPE](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) algorithm." 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 14, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "hp_space = {\n", 566 | " 'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),\n", 567 | " 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),\n", 568 | " 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),\n", 569 | " 'subsample': hp.uniform('subsample', 0.5, 1.),\n", 570 | " }" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 20, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "100%|██████████| 50/50 [00:39<00:00, 1.18it/s, best loss: 0.28343034788381305]\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "def objective(params):\n", 588 | " clf = lgb.LGBMClassifier(**params, is_unbalance=True, verbose=-1, silent=True)\n", 589 | " score = cross_val_score(clf,\n", 590 | " X_train, y_train,\n", 591 | " scoring='f1',\n", 592 | " cv=StratifiedKFold(random_state=3),\n", 593 | " fit_params=fit_params).mean()\n", 594 | " return 1-score\n", 595 | "trials = Trials()\n", 596 | "best = fmin(fn=objective,\n", 597 | " space=hp_space,\n", 598 | " algo=tpe.suggest,\n", 599 | " max_evals=50,\n", 600 | " trials=trials)" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "### 2. 
SKopt\n", 608 | "\n", 609 | "Since TPE is a Bayesian method we will first compare with the `BayesSearchCV` method in `Skopt`" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 16, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "hh_space = dict(\n", 619 | " learning_rate = Real(0.01, 0.3),\n", 620 | " min_child_weight = Real(0.1, 10),\n", 621 | " colsample_bytree= Real(0.5, 1.),\n", 622 | " subsample=Real(0.5, 1.),\n", 623 | " )" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 17, 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "63.14497208595276\n" 636 | ] 637 | } 638 | ], 639 | "source": [ 640 | "clf = lgb.LGBMClassifier(is_unbalance=True, verbose=-1, silent=True)\n", 641 | "start = time()\n", 642 | "opt = BayesSearchCV(clf,\n", 643 | " search_spaces=hh_space,\n", 644 | " scoring='f1',\n", 645 | " cv=StratifiedKFold(random_state=3),\n", 646 | " fit_params=fit_params,\n", 647 | " n_iter=50,\n", 648 | " n_jobs=-1)\n", 649 | "opt.fit(X_train, y_train)\n", 650 | "skopt_bayes_runtime = time()-start\n", 651 | "print(skopt_bayes_runtime)" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "`Skopt`'s seems to be a significantly slower than hyperopt even with no verbosity. Let's see if performs better:" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 19, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "best SKOPT F1 score: 0.7174372197995806\n" 671 | ] 672 | } 673 | ], 674 | "source": [ 675 | "print('best SKOPT F1 score: {}'.format(opt.best_score_))" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "which is almost identical to the one obtained with `Hyperopt`" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 25, 688 | "metadata": {}, 689 | "outputs": [ 690 | { 691 | "name": "stdout", 692 | "output_type": "stream", 693 | "text": [ 694 | "best HYPEROPT F1 score: 0.716569652116187\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "# Remember hyperopt minimises 1-score. \n", 700 | "print('best HYPEROPT F1 score: {}'.format(1-trials.best_trial['result']['loss']))" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "metadata": {}, 706 | "source": [ 707 | "The conclusion at this stage is that `Hyperopt` is faster than `Skopt` with the same performance. However, the `TPE` algorithm is a tree based algorithm, so let's also compare with the `gbrt_minimize` method (Sequential optimization using gradient boosted trees) in `Skopt`. Here the syntax is a bit different to that of `BayesSearchCV`. 
" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 26, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "# the space has to be tuples like these\n", 717 | "hh_space_gbrt = [Real(0.01, 0.3, 'uniform', name='learning_rate'),\n", 718 | " Real(0.1, 10, 'uniform', name='min_child_weight'),\n", 719 | " Real(0.5, 1., 'uniform', name='colsample_bytree'),\n", 720 | " Real(0.5, 1., 'uniform', name='subsample')]" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 30, 726 | "metadata": {}, 727 | "outputs": [], 728 | "source": [ 729 | "# Let's adapt the objective\n", 730 | "def gbrt_objective(params):\n", 731 | " tmp_params = {}\n", 732 | " tmp_params['learning_rate'], tmp_params['min_child_weight'], \\\n", 733 | " tmp_params['colsample_bytree'], tmp_params['subsample'], = params[0], params[1], params[2], params[3]\n", 734 | " clf = lgb.LGBMClassifier(**tmp_params, is_unbalance=True, verbose=-1, silent=True)\n", 735 | " score = cross_val_score(clf,\n", 736 | " X_train, y_train,\n", 737 | " scoring='f1',\n", 738 | " cv=StratifiedKFold(random_state=3),\n", 739 | " fit_params=fit_params).mean()\n", 740 | " return 1-score" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 31, 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "name": "stdout", 750 | "output_type": "stream", 751 | "text": [ 752 | "54.64228296279907\n" 753 | ] 754 | } 755 | ], 756 | "source": [ 757 | "start=time()\n", 758 | "sk_best = gbrt_minimize(gbrt_objective,\n", 759 | " hh_space_gbrt,\n", 760 | " n_calls=50,\n", 761 | " verbose=False,\n", 762 | " n_jobs=-1)\n", 763 | "skopt_gbrt_runtime = time()-start\n", 764 | "print(skopt_gbrt_runtime)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "Faster than `BayesSearchCV`, but still, slower than `Hyperopt`. Let's see if the results are any better" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 36, 777 | "metadata": {}, 778 | "outputs": [ 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "best SKOPT GBRT F1 score: 0.7173483496134895\n" 784 | ] 785 | } 786 | ], 787 | "source": [ 788 | "print('best SKOPT GBRT F1 score: {}'.format(1-sk_best.fun))" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "## CONCLUSION" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "`Hyperopt`'s TPE performs as good as Skopt `gbrt_minimize` and `BayesSearchCV` methods and is significantly faster." 
803 | ] 804 | } 805 | ], 806 | "metadata": { 807 | "kernelspec": { 808 | "display_name": "Python 3", 809 | "language": "python", 810 | "name": "python3" 811 | }, 812 | "language_info": { 813 | "codemirror_mode": { 814 | "name": "ipython", 815 | "version": 3 816 | }, 817 | "file_extension": ".py", 818 | "mimetype": "text/x-python", 819 | "name": "python", 820 | "nbconvert_exporter": "python", 821 | "pygments_lexer": "ipython3", 822 | "version": "3.6.5" 823 | } 824 | }, 825 | "nbformat": 4, 826 | "nbformat_minor": 2 827 | } 828 | -------------------------------------------------------------------------------- /notebooks/skopt_vs_hyperopt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Skopt vs Hyperopt\n", 8 | "\n", 9 | "## Importing and preprocessing data" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "/usr/local/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n", 22 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 23 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 24 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 25 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import numpy as np\n", 32 | "import pickle\n", 33 | "import lightgbm as lgb\n", 34 | "import warnings\n", 35 | "\n", 36 | "from time import time\n", 37 | "from hyperopt import hp, tpe, fmin, Trials\n", 38 | "from skopt import BayesSearchCV\n", 39 | "from skopt.space import Real, Categorical, Integer\n", 40 | "from skopt import gbrt_minimize\n", 41 | "from sklearn.model_selection import StratifiedKFold, cross_val_score\n", 42 | "from sklearn.metrics import log_loss\n", 43 | "from utils import FeatureTools\n", 44 | "\n", 45 | "warnings.filterwarnings(\"ignore\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
ageworkclassfnlwgteducationmarital_statusoccupationrelationshipracegendercapital_gaincapital_losshours_per_weeknative_countrytarget
039State-gov77516BachelorsNever-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States0
150Self-emp-not-inc83311BachelorsMarried-civ-spouseExec-managerialHusbandWhiteMale0013United-States0
238Private215646HS-gradDivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States0
353Private23472111thMarried-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States0
428Private338409BachelorsMarried-civ-spouseProf-specialtyWifeBlackFemale0040Cuba0
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " age workclass fnlwgt education marital_status \\\n", 182 | "0 39 State-gov 77516 Bachelors Never-married \n", 183 | "1 50 Self-emp-not-inc 83311 Bachelors Married-civ-spouse \n", 184 | "2 38 Private 215646 HS-grad Divorced \n", 185 | "3 53 Private 234721 11th Married-civ-spouse \n", 186 | "4 28 Private 338409 Bachelors Married-civ-spouse \n", 187 | "\n", 188 | " occupation relationship race gender capital_gain \\\n", 189 | "0 Adm-clerical Not-in-family White Male 2174 \n", 190 | "1 Exec-managerial Husband White Male 0 \n", 191 | "2 Handlers-cleaners Not-in-family White Male 0 \n", 192 | "3 Handlers-cleaners Husband Black Male 0 \n", 193 | "4 Prof-specialty Wife Black Female 0 \n", 194 | "\n", 195 | " capital_loss hours_per_week native_country target \n", 196 | "0 0 40 United-States 0 \n", 197 | "1 0 13 United-States 0 \n", 198 | "2 0 40 United-States 0 \n", 199 | "3 0 40 United-States 0 \n", 200 | "4 0 40 Cuba 0 " 201 | ] 202 | }, 203 | "execution_count": 2, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "df = pd.read_csv(\"data/adult.data\")\n", 210 | "df['target'] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)\n", 211 | "df.drop('income_bracket', axis=1, inplace=True)\n", 212 | "df.head()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "I have coded a preprocessor class before that does the work for us." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 3, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "the features column names are: ['age', 'workclass', 'fnlwgt', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'education_occupation', 'native_country_occupation']\n", 232 | "the categorical columns are: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'education_occupation', 'native_country_occupation']\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "dataprocessor = pickle.load(open(\"data/dataprocessors/dataprocessor_0_.p\", \"rb\"))\n", 238 | "all_features = dataprocessor.colnames\n", 239 | "categorical_features = dataprocessor.cat_cols + dataprocessor.crossed_columns\n", 240 | "\n", 241 | "print(\"the features column names are: {}\".format(all_features))\n", 242 | "print(\"the categorical columns are: {}\".format(categorical_features))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "the `dataprocessor` is already train, so we simply need to `transform`" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 4, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/html": [ 260 | "
\n", 261 | "\n", 274 | "\n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
ageworkclassfnlwgteducationmarital_statusoccupationrelationshipracegendercapital_gaincapital_losshours_per_weeknative_countrytargeteducation_occupationnative_country_occupation
00.30137000.0443020000000.021740.00.3979590000
10.45205510.0482380111000.000000.00.1224490011
20.28767120.1381131220000.000000.00.3979590022
30.49315120.1510682121100.000000.00.3979590032
40.15068520.2214880132110.000000.00.3979591043
\n", 394 | "
" 395 | ], 396 | "text/plain": [ 397 | " age workclass fnlwgt education marital_status occupation \\\n", 398 | "0 0.301370 0 0.044302 0 0 0 \n", 399 | "1 0.452055 1 0.048238 0 1 1 \n", 400 | "2 0.287671 2 0.138113 1 2 2 \n", 401 | "3 0.493151 2 0.151068 2 1 2 \n", 402 | "4 0.150685 2 0.221488 0 1 3 \n", 403 | "\n", 404 | " relationship race gender capital_gain capital_loss hours_per_week \\\n", 405 | "0 0 0 0 0.02174 0.0 0.397959 \n", 406 | "1 1 0 0 0.00000 0.0 0.122449 \n", 407 | "2 0 0 0 0.00000 0.0 0.397959 \n", 408 | "3 1 1 0 0.00000 0.0 0.397959 \n", 409 | "4 2 1 1 0.00000 0.0 0.397959 \n", 410 | "\n", 411 | " native_country target education_occupation native_country_occupation \n", 412 | "0 0 0 0 0 \n", 413 | "1 0 0 1 1 \n", 414 | "2 0 0 2 2 \n", 415 | "3 0 0 3 2 \n", 416 | "4 1 0 4 3 " 417 | ] 418 | }, 419 | "execution_count": 4, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "train_data = dataprocessor.transform(df)\n", 426 | "\n", 427 | "train_data.head()" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 6, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# np arrays\n", 437 | "X_train = train_data[[c for c in train_data.columns if c is not 'target']].values\n", 438 | "y_train = train_data['target'].values\n", 439 | "\n", 440 | "# lgb Dataset object\n", 441 | "lgtrain = lgb.Dataset(X_train,\n", 442 | " label=y_train,\n", 443 | " feature_name=all_features,\n", 444 | " categorical_feature=categorical_features,\n", 445 | " free_raw_data=False)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 7, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# model and fit params\n", 455 | "params = dict(learning_rate=0.01,\n", 456 | " num_boost_round=300,\n", 457 | " num_leaves = 255,\n", 458 | " verbose=-1,\n", 459 | " is_unbalance=True)\n", 460 | "fit_params = dict(feature_name=all_features,\n", 461 | " categorical_feature=categorical_features)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "## 1. First experiment. Sklearn wrap up vs lightgbm methods" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 12, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "7.932429075241089\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "clf = lgb.LGBMClassifier(**params, silent=True)\n", 486 | "start = time()\n", 487 | "score = cross_val_score(clf,\n", 488 | " X_train, y_train,\n", 489 | " scoring='neg_log_loss',\n", 490 | " cv=StratifiedKFold(random_state=1981),\n", 491 | " fit_params=fit_params)\n", 492 | "sklearn_runtime = time() - start\n", 493 | "print(sklearn_runtime)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 13, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "7.038502931594849\n" 506 | ] 507 | } 508 | ], 509 | "source": [ 510 | "start = time()\n", 511 | "cv_result = lgb.cv(params,\n", 512 | " lgtrain,\n", 513 | " metrics='binary_logloss',\n", 514 | " nfold=3,\n", 515 | " stratified=True, \n", 516 | " seed=1981)\n", 517 | "lightgbm_runtime = time() - start\n", 518 | "print(lightgbm_runtime)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "LightGBM methods seem to be a bit faster. 
Let's now compare `Hyperopt` and `Skopt`" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## Hyperopt vs Skopt" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "The first thing to comment is that while Hyperopt offers the `hp.quniform(label, low, high, q)` parameter expressions, there is not such a thing for Skopt. One has `Categorical`, but you have to pass all values. In other words, When using hyperopt one could use:\n", 540 | "\n", 541 | " 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20)\n", 542 | "\n", 543 | "but when using Skopt one would have to do:\n", 544 | "\n", 545 | " Categorical(np.arange(50, 500, 20))\n", 546 | " \n", 547 | "Because I want to keep the comparison as light and direct as possible, I will just use `Real` parameters with uniform distributions." 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "### 1. Hyperopt\n", 555 | "\n", 556 | "With Hyperopt we will use the [TPE](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) algorithm." 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 14, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "hp_space = {\n", 566 | " 'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),\n", 567 | " 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),\n", 568 | " 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),\n", 569 | " 'subsample': hp.uniform('subsample', 0.5, 1.),\n", 570 | " }" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 20, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "100%|██████████| 50/50 [00:39<00:00, 1.18it/s, best loss: 0.28343034788381305]\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "def objective(params):\n", 588 | " clf = lgb.LGBMClassifier(**params, is_unbalance=True, verbose=-1, silent=True)\n", 589 | " score = cross_val_score(clf,\n", 590 | " X_train, y_train,\n", 591 | " scoring='f1',\n", 592 | " cv=StratifiedKFold(random_state=3),\n", 593 | " fit_params=fit_params).mean()\n", 594 | " return 1-score\n", 595 | "trials = Trials()\n", 596 | "best = fmin(fn=objective,\n", 597 | " space=hp_space,\n", 598 | " algo=tpe.suggest,\n", 599 | " max_evals=50,\n", 600 | " trials=trials)" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "### 2. 
SKopt\n", 608 | "\n", 609 | "Since TPE is a Bayesian method we will first compare with the `BayesSearchCV` method in `Skopt`" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 16, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "hh_space = dict(\n", 619 | " learning_rate = Real(0.01, 0.3),\n", 620 | " min_child_weight = Real(0.1, 10),\n", 621 | " colsample_bytree= Real(0.5, 1.),\n", 622 | " subsample=Real(0.5, 1.),\n", 623 | " )" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 17, 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "63.14497208595276\n" 636 | ] 637 | } 638 | ], 639 | "source": [ 640 | "clf = lgb.LGBMClassifier(is_unbalance=True, verbose=-1, silent=True)\n", 641 | "start = time()\n", 642 | "opt = BayesSearchCV(clf,\n", 643 | " search_spaces=hh_space,\n", 644 | " scoring='f1',\n", 645 | " cv=StratifiedKFold(random_state=3),\n", 646 | " fit_params=fit_params,\n", 647 | " n_iter=50,\n", 648 | " n_jobs=-1)\n", 649 | "opt.fit(X_train, y_train)\n", 650 | "skopt_bayes_runtime = time()-start\n", 651 | "print(skopt_bayes_runtime)" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "`Skopt`'s seems to be a significantly slower than hyperopt even with no verbosity. Let's see if performs better:" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 19, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "best SKOPT F1 score: 0.7174372197995806\n" 671 | ] 672 | } 673 | ], 674 | "source": [ 675 | "print('best SKOPT F1 score: {}'.format(opt.best_score_))" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "which is almost identical to the one obtained with `Hyperopt`" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 25, 688 | "metadata": {}, 689 | "outputs": [ 690 | { 691 | "name": "stdout", 692 | "output_type": "stream", 693 | "text": [ 694 | "best HYPEROPT F1 score: 0.716569652116187\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "# Remember hyperopt minimises 1-score. \n", 700 | "print('best HYPEROPT F1 score: {}'.format(1-trials.best_trial['result']['loss']))" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "metadata": {}, 706 | "source": [ 707 | "The conclusion at this stage is that `Hyperopt` is faster than `Skopt` with the same performance. However, the `TPE` algorithm is a tree based algorithm, so let's also compare with the `gbrt_minimize` method (Sequential optimization using gradient boosted trees) in `Skopt`. Here the syntax is a bit different to that of `BayesSearchCV`. 
" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 26, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "# the space has to be tuples like these\n", 717 | "hh_space_gbrt = [Real(0.01, 0.3, 'uniform', name='learning_rate'),\n", 718 | " Real(0.1, 10, 'uniform', name='min_child_weight'),\n", 719 | " Real(0.5, 1., 'uniform', name='colsample_bytree'),\n", 720 | " Real(0.5, 1., 'uniform', name='subsample')]" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 30, 726 | "metadata": {}, 727 | "outputs": [], 728 | "source": [ 729 | "# Let's adapt the objective\n", 730 | "def gbrt_objective(params):\n", 731 | " tmp_params = {}\n", 732 | " tmp_params['learning_rate'], tmp_params['min_child_weight'], \\\n", 733 | " tmp_params['colsample_bytree'], tmp_params['subsample'], = params[0], params[1], params[2], params[3]\n", 734 | " clf = lgb.LGBMClassifier(**tmp_params, is_unbalance=True, verbose=-1, silent=True)\n", 735 | " score = cross_val_score(clf,\n", 736 | " X_train, y_train,\n", 737 | " scoring='f1',\n", 738 | " cv=StratifiedKFold(random_state=3),\n", 739 | " fit_params=fit_params).mean()\n", 740 | " return 1-score" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 31, 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "name": "stdout", 750 | "output_type": "stream", 751 | "text": [ 752 | "54.64228296279907\n" 753 | ] 754 | } 755 | ], 756 | "source": [ 757 | "start=time()\n", 758 | "sk_best = gbrt_minimize(gbrt_objective,\n", 759 | " hh_space_gbrt,\n", 760 | " n_calls=50,\n", 761 | " verbose=False,\n", 762 | " n_jobs=-1)\n", 763 | "skopt_gbrt_runtime = time()-start\n", 764 | "print(skopt_gbrt_runtime)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "Faster than `BayesSearchCV`, but still, slower than `Hyperopt`. Let's see if the results are any better" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 36, 777 | "metadata": {}, 778 | "outputs": [ 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "best SKOPT GBRT F1 score: 0.7173483496134895\n" 784 | ] 785 | } 786 | ], 787 | "source": [ 788 | "print('best SKOPT GBRT F1 score: {}'.format(1-sk_best.fun))" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "## CONCLUSION" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "`Hyperopt`'s TPE performs as good as Skopt `gbrt_minimize` and `BayesSearchCV` methods and is significantly faster." 
803 | ] 804 | } 805 | ], 806 | "metadata": { 807 | "kernelspec": { 808 | "display_name": "Python 3", 809 | "language": "python", 810 | "name": "python3" 811 | }, 812 | "language_info": { 813 | "codemirror_mode": { 814 | "name": "ipython", 815 | "version": 3 816 | }, 817 | "file_extension": ".py", 818 | "mimetype": "text/x-python", 819 | "name": "python", 820 | "nbconvert_exporter": "python", 821 | "pygments_lexer": "ipython3", 822 | "version": "3.6.5" 823 | } 824 | }, 825 | "nbformat": 4, 826 | "nbformat_minor": 2 827 | } 828 | -------------------------------------------------------------------------------- /predictor.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import json 3 | import pandas as pd 4 | import pickle 5 | 6 | from pathlib import Path 7 | from kafka import KafkaConsumer 8 | from utils.messages_utils import append_message, read_messages_count, send_retrain_message, publish_prediction 9 | 10 | KAFKA_HOST = 'localhost:9092' 11 | TOPICS = ['app_messages', 'retrain_topic'] 12 | PATH = Path('data/') 13 | MODELS_PATH = PATH/'models' 14 | DATAPROCESSORS_PATH = PATH/'dataprocessors' 15 | MESSAGES_PATH = PATH/'messages' 16 | RETRAIN_EVERY = 25 17 | EXTRA_MODELS_TO_KEEP = 1 18 | 19 | column_order = pickle.load(open(DATAPROCESSORS_PATH/'column_order.p', 'rb')) 20 | dataprocessor = None 21 | consumer = None 22 | model = None 23 | 24 | 25 | def reload_model(path): 26 | return pickle.load(open(path, 'rb')) 27 | 28 | 29 | def is_retraining_message(msg): 30 | message = json.loads(msg.value) 31 | return msg.topic == 'retrain_topic' and 'training_completed' in message and message['training_completed'] 32 | 33 | 34 | def is_application_message(msg): 35 | message = json.loads(msg.value) 36 | return msg.topic == 'app_messages' and 'prediction' not in message 37 | 38 | 39 | def predict(message, column_order): 40 | row = pd.DataFrame(message, index=[0]) 41 | # sanity check 42 | assert row.columns.tolist()[:-1] == column_order 43 | # In the real world we would not have the target (here 'income_bracket'). 44 | # In this example we keep it and we will retrain the model as it reads 45 | # RETRAIN_EVERY number of messages. 
In the real world, after RETRAIN_EVERY 46 | # number of messages have been collected, one would have to wait until we 47 | # can collect RETRAIN_EVERY targets AND THEN retrain 48 | row.drop('income_bracket', axis=1, inplace=True) 49 | trow = dataprocessor.transform(row) 50 | return model.predict(trow)[0] 51 | 52 | 53 | def start(model_id, messages_count, batch_id): 54 | for msg in consumer: 55 | message = json.loads(msg.value) 56 | 57 | if is_retraining_message(msg): 58 | model_fname = 'model_{}_.p'.format(model_id) 59 | model = reload_model(MODELS_PATH/model_fname) 60 | print("NEW MODEL RELOADED {}".format(model_id)) 61 | 62 | elif is_application_message(msg): 63 | request_id = message['request_id'] 64 | pred = predict(message['data'], column_order) 65 | publish_prediction(pred, request_id) 66 | 67 | append_message(message['data'], MESSAGES_PATH, batch_id) 68 | messages_count += 1 69 | if messages_count % RETRAIN_EVERY == 0: 70 | model_id = (model_id + 1) % (EXTRA_MODELS_TO_KEEP + 1) 71 | send_retrain_message(model_id, batch_id) 72 | batch_id += 1 73 | 74 | 75 | if __name__ == '__main__': 76 | dataprocessor_id = 0 77 | dataprocessor_fname = 'dataprocessor_{}_.p'.format(dataprocessor_id) 78 | dataprocessor = pickle.load(open(DATAPROCESSORS_PATH/dataprocessor_fname, 'rb')) 79 | 80 | messages_count = read_messages_count(MESSAGES_PATH, RETRAIN_EVERY) 81 | batch_id = messages_count % RETRAIN_EVERY 82 | 83 | model_id = batch_id % (EXTRA_MODELS_TO_KEEP + 1) 84 | model_fname = 'model_{}_.p'.format(model_id) 85 | model = reload_model(MODELS_PATH/model_fname) 86 | 87 | consumer = KafkaConsumer(bootstrap_servers=KAFKA_HOST) 88 | consumer.subscribe(TOPICS) 89 | 90 | start(model_id, messages_count, batch_id) 91 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.0.11 2 | certifi==2019.6.16 3 | chardet==3.0.4 4 | Click==7.0 5 | cloudpickle==1.2.1 6 | configparser==3.7.4 7 | databricks-cli==0.8.7 8 | decorator==4.4.0 9 | docker==4.0.2 10 | entrypoints==0.3 11 | Flask==1.1.1 12 | future==0.17.1 13 | gitdb2==2.0.5 14 | GitPython==2.1.11 15 | gunicorn==19.9.0 16 | hyperopt==0.1.2 17 | idna==2.8 18 | itsdangerous==1.1.0 19 | Jinja2==2.10.1 20 | joblib==0.13.2 21 | kafka-python==1.4.6 22 | lightgbm==2.2.3 23 | Mako==1.0.13 24 | MarkupSafe==1.1.1 25 | mlflow==1.0.0 26 | networkx==2.3 27 | numpy==1.16.4 28 | pandas==0.24.2 29 | protobuf==3.9.0 30 | pymongo==3.8.0 31 | python-dateutil==2.8.0 32 | python-editor==1.0.4 33 | pytz==2019.1 34 | PyYAML==5.1.1 35 | querystring-parser==1.2.3 36 | requests==2.22.0 37 | scikit-learn==0.21.2 38 | scipy==1.3.0 39 | simplejson==3.16.0 40 | six==1.12.0 41 | sklearn==0.0 42 | smmap2==2.0.5 43 | SQLAlchemy==1.3.5 44 | sqlparse==0.3.0 45 | tabulate==0.8.3 46 | tqdm==4.32.2 47 | urllib3==1.25.3 48 | websocket-client==0.56.0 49 | Werkzeug==0.15.4 50 | -------------------------------------------------------------------------------- /sample_app.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import threading 4 | import uuid 5 | 6 | from pathlib import Path 7 | from kafka import KafkaProducer, KafkaConsumer 8 | from time import sleep 9 | 10 | 11 | PATH = Path('data/') 12 | KAFKA_HOST = 'localhost:9092' 13 | df_test = pd.read_csv(PATH/'adult.test') 14 | # In the real world, the messages would not come with the target/outcome of 15 | # our actions. 
Here we will keep it and assume that at some point in the 16 | # future we can collect the outcome and monitor how our algorithm is doing 17 | # df_test.drop('income_bracket', axis=1, inplace=True) 18 | df_test['json'] = df_test.apply(lambda x: x.to_json(), axis=1) 19 | messages = df_test.json.tolist() 20 | 21 | 22 | def start_producing(): 23 | producer = KafkaProducer(bootstrap_servers=KAFKA_HOST) 24 | for i in range(200): 25 | message_id = str(uuid.uuid4()) 26 | message = {'request_id': message_id, 'data': json.loads(messages[i])} 27 | 28 | producer.send('app_messages', json.dumps(message).encode('utf-8')) 29 | producer.flush() 30 | 31 | print("\033[1;31;40m -- PRODUCER: Sent message with id {}".format(message_id)) 32 | sleep(2) 33 | 34 | 35 | def start_consuming(): 36 | consumer = KafkaConsumer('app_messages', bootstrap_servers=KAFKA_HOST) 37 | 38 | for msg in consumer: 39 | message = json.loads(msg.value) 40 | if 'prediction' in message: 41 | request_id = message['request_id'] 42 | print("\033[1;32;40m ** CONSUMER: Received prediction {} for request id {}".format(message['prediction'], request_id)) 43 | 44 | 45 | threads = [] 46 | t = threading.Thread(target=start_producing) 47 | t2 = threading.Thread(target=start_consuming) 48 | threads.append(t) 49 | threads.append(t2) 50 | t.start() 51 | t2.start() 52 | -------------------------------------------------------------------------------- /train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/train/__init__.py -------------------------------------------------------------------------------- /train/train_hyperopt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import lightgbm as lgb 4 | import pickle 5 | import pdb 6 | import warnings 7 | 8 | from pathlib import Path 9 | from sklearn.metrics import f1_score 10 | from hyperopt import hp, tpe, fmin, Trials 11 | 12 | 13 | warnings.filterwarnings("ignore") 14 | 15 | 16 | def best_threshold(y_true, pred_proba, proba_range, verbose=False): 17 | """ 18 | Function to find the probability threshold that optimises the f1_score 19 | 20 | Comment: this function is not used in this excercise, but we include it in 21 | case the reader finds it useful 22 | 23 | Parameters: 24 | ----------- 25 | y_true: numpy.ndarray 26 | array with the true labels 27 | pred_proba: numpy.ndarray 28 | array with the predicted probability 29 | proba_range: numpy.ndarray 30 | range of probabilities to explore. 31 | e.g. np.arange(0.1,0.9,0.01) 32 | 33 | Return: 34 | ----------- 35 | tuple with the optimal threshold and the corresponding f1_score 36 | """ 37 | scores = [] 38 | for prob in proba_range: 39 | pred = [int(p>prob) for p in pred_proba] 40 | score = f1_score(y_true,pred) 41 | scores.append(score) 42 | if verbose: 43 | print("INFO: prob threshold: {}. score :{}".format(round(prob,3), round(score,5))) 44 | best_score = scores[np.argmax(scores)] 45 | optimal_threshold = proba_range[np.argmax(scores)] 46 | return (optimal_threshold, best_score) 47 | 48 | 49 | def lgb_f1_score(preds, lgbDataset): 50 | """ 51 | Function to compute the f1_score to be used with lightgbm methods. 
52 | Comments: output format must be: 53 | (eval_name, eval_result, is_higher_better) 54 | 55 | Parameters: 56 | ----------- 57 | preds: np.array or List 58 | lgbDataset: lightgbm.Dataset 59 | """ 60 | binary_preds = [int(p>0.5) for p in preds] 61 | y_true = lgbDataset.get_label() 62 | # lightgbm: (eval_name, eval_result, is_higher_better) 63 | return 'f1', f1_score(y_true, binary_preds), True 64 | 65 | 66 | class LGBOptimizer(object): 67 | def __init__(self, trainDataset, out_dir): 68 | """ 69 | Hyper Parameter optimization 70 | 71 | Parameters: 72 | ----------- 73 | trainDataset: FeatureTools object 74 | The result of running FeatureTools().fit() 75 | out_dir: pathlib.PosixPath 76 | Path to the output directory 77 | """ 78 | self.PATH = out_dir 79 | self.early_stop_dict = {} 80 | 81 | self.X = trainDataset.data 82 | self.y = trainDataset.target 83 | self.colnames = trainDataset.colnames 84 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns 85 | 86 | self.lgtrain = lgb.Dataset(self.X,label=self.y, 87 | feature_name=self.colnames, 88 | categorical_feature = self.categorical_columns, 89 | free_raw_data=False) 90 | 91 | def optimize(self, maxevals=200, model_id=0): 92 | 93 | param_space = self.hyperparameter_space() 94 | objective = self.get_objective(self.lgtrain) 95 | objective.i=0 96 | trials = Trials() 97 | best = fmin(fn=objective, 98 | space=param_space, 99 | algo=tpe.suggest, 100 | max_evals=maxevals, 101 | trials=trials) 102 | best['num_boost_round'] = self.early_stop_dict[trials.best_trial['tid']] 103 | best['num_leaves'] = int(best['num_leaves']) 104 | best['verbose'] = -1 105 | 106 | # set the model with the best parameters, fit and save 107 | model = lgb.LGBMClassifier(**best) 108 | model.fit(self.lgtrain.data, 109 | self.lgtrain.label, 110 | feature_name=self.colnames, 111 | categorical_feature=self.categorical_columns) 112 | 113 | model_fname = 'model_{}_.p'.format(model_id) 114 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id) 115 | 116 | pickle.dump(model, open(self.PATH/model_fname, 'wb')) 117 | pickle.dump(best, open(self.PATH/best_experiment_fname, 'wb')) 118 | 119 | self.best = best 120 | self.model = model 121 | 122 | 123 | def get_objective(self, train): 124 | 125 | def objective(params): 126 | """ 127 | objective function for lightgbm. 
128 | """ 129 | # hyperopt casts as float 130 | params['num_boost_round'] = int(params['num_boost_round']) 131 | params['num_leaves'] = int(params['num_leaves']) 132 | 133 | # need to be passed as parameter 134 | params['is_unbalance'] = True 135 | params['verbose'] = -1 136 | params['seed'] = 1 137 | 138 | cv_result = lgb.cv( 139 | params, 140 | train, 141 | num_boost_round=params['num_boost_round'], 142 | metrics='binary_logloss', 143 | # feval = lgb_f1_score, 144 | nfold=3, 145 | stratified=True, 146 | early_stopping_rounds=20) 147 | self.early_stop_dict[objective.i] = len(cv_result['binary_logloss-mean']) 148 | error = round(cv_result['binary_logloss-mean'][-1], 4) 149 | objective.i+=1 150 | return error 151 | 152 | return objective 153 | 154 | def hyperparameter_space(self, param_space=None): 155 | 156 | space = { 157 | 'learning_rate': hp.uniform('learning_rate', 0.01, 0.2), 158 | 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20), 159 | 'num_leaves': hp.quniform('num_leaves', 31, 255, 4), 160 | 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10), 161 | 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.), 162 | 'subsample': hp.uniform('subsample', 0.5, 1.), 163 | 'reg_alpha': hp.uniform('reg_alpha', 0.01, 0.1), 164 | 'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.1), 165 | } 166 | 167 | if param_space: 168 | return param_space 169 | else: 170 | return space -------------------------------------------------------------------------------- /train/train_hyperopt_mlflow.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import lightgbm as lgb 4 | import pickle 5 | import pdb 6 | import warnings 7 | import mlflow 8 | import mlflow.sklearn 9 | 10 | from pathlib import Path 11 | from sklearn.metrics import f1_score 12 | from hyperopt import hp, tpe, fmin, Trials 13 | from mlflow.tracking import MlflowClient 14 | 15 | 16 | warnings.filterwarnings("ignore") 17 | 18 | 19 | def best_threshold(y_true, pred_proba, proba_range, verbose=False): 20 | """ 21 | Function to find the probability threshold that optimises the f1_score 22 | 23 | Comment: this function is not used in this excercise, but we include it in 24 | case the reader finds it useful 25 | 26 | Parameters: 27 | ----------- 28 | y_true: numpy.ndarray 29 | array with the true labels 30 | pred_proba: numpy.ndarray 31 | array with the predicted probability 32 | proba_range: numpy.ndarray 33 | range of probabilities to explore. 34 | e.g. np.arange(0.1,0.9,0.01) 35 | 36 | Return: 37 | ----------- 38 | tuple with the optimal threshold and the corresponding f1_score 39 | """ 40 | scores = [] 41 | for prob in proba_range: 42 | pred = [int(p>prob) for p in pred_proba] 43 | score = f1_score(y_true,pred) 44 | scores.append(score) 45 | if verbose: 46 | print("INFO: prob threshold: {}. score :{}".format(round(prob,3), round(score,5))) 47 | best_score = scores[np.argmax(scores)] 48 | optimal_threshold = proba_range[np.argmax(scores)] 49 | return (optimal_threshold, best_score) 50 | 51 | 52 | def lgb_f1_score(preds, lgbDataset): 53 | """ 54 | Function to compute the f1_score to be used with lightgbm methods. 
55 | Comments: output format must be: 56 | (eval_name, eval_result, is_higher_better) 57 | 58 | Parameters: 59 | ----------- 60 | preds: np.array or List 61 | lgbDataset: lightgbm.Dataset 62 | """ 63 | binary_preds = [int(p>0.5) for p in preds] 64 | y_true = lgbDataset.get_label() 65 | # lightgbm: (eval_name, eval_result, is_higher_better) 66 | return 'f1', f1_score(y_true, binary_preds), True 67 | 68 | 69 | class LGBOptimizer(object): 70 | def __init__(self, trainDataset, out_dir): 71 | """ 72 | Hyper Parameter optimization 73 | 74 | Parameters: 75 | ----------- 76 | trainDataset: FeatureTools object 77 | The result of running FeatureTools().fit() 78 | out_dir: pathlib.PosixPath 79 | Path to the output directory 80 | """ 81 | self.PATH = out_dir 82 | self.early_stop_dict = {} 83 | 84 | self.X = trainDataset.data 85 | self.y = trainDataset.target 86 | self.colnames = trainDataset.colnames 87 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns 88 | 89 | self.lgtrain = lgb.Dataset(self.X,label=self.y, 90 | feature_name=self.colnames, 91 | categorical_feature = self.categorical_columns, 92 | free_raw_data=False) 93 | 94 | def optimize(self, maxevals=200, model_id=0, reuse_experiment=False): 95 | 96 | param_space = self.hyperparameter_space() 97 | objective = self.get_objective(self.lgtrain) 98 | objective.i=0 99 | trials = Trials() 100 | best = fmin(fn=objective, 101 | space=param_space, 102 | algo=tpe.suggest, 103 | max_evals=maxevals, 104 | trials=trials) 105 | best['num_boost_round'] = self.early_stop_dict[trials.best_trial['tid']] 106 | best['num_leaves'] = int(best['num_leaves']) 107 | best['verbose'] = -1 108 | 109 | # The next few lines are the only ones related to mlflow. 110 | if not Path('mlruns').exists(): 111 | # here set the tracking_uri. If None then http://localhost:5000 112 | client = MlflowClient() 113 | n_experiments=0 114 | elif not reuse_experiment: 115 | client = MlflowClient() 116 | n_experiments = len(client.list_experiments()) 117 | experiment_name = 'experiment_' + str(n_experiments) 118 | client.create_experiment(name=experiment_name) 119 | with mlflow.start_run(experiment_id=n_experiments): 120 | model = lgb.LGBMClassifier(**best) 121 | model.fit(self.lgtrain.data, 122 | self.lgtrain.label, 123 | feature_name=self.colnames, 124 | categorical_feature=self.categorical_columns) 125 | for name, value in best.items(): 126 | mlflow.log_param(name, value) 127 | mlflow.log_metric('binary_logloss', trials.best_trial['result']['loss']) 128 | mlflow.sklearn.log_model(model, "model") 129 | 130 | model_fname = 'model_{}_.p'.format(model_id) 131 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id) 132 | 133 | pickle.dump(model, open(self.PATH/model_fname, 'wb')) 134 | pickle.dump(best, open(self.PATH/best_experiment_fname, 'wb')) 135 | 136 | self.best = best 137 | self.model = model 138 | 139 | def get_objective(self, train): 140 | 141 | def objective(params): 142 | """ 143 | objective function for lightgbm. 
144 | """ 145 | # hyperopt casts as float 146 | params['num_boost_round'] = int(params['num_boost_round']) 147 | params['num_leaves'] = int(params['num_leaves']) 148 | 149 | # need to be passed as parameter 150 | params['is_unbalance'] = True 151 | params['verbose'] = -1 152 | params['seed'] = 1 153 | 154 | cv_result = lgb.cv( 155 | params, 156 | train, 157 | num_boost_round=params['num_boost_round'], 158 | metrics='binary_logloss', 159 | # feval = lgb_f1_score, 160 | nfold=3, 161 | stratified=True, 162 | early_stopping_rounds=20) 163 | self.early_stop_dict[objective.i] = len(cv_result['binary_logloss-mean']) 164 | error = cv_result['binary_logloss-mean'][-1] 165 | objective.i+=1 166 | return error 167 | 168 | return objective 169 | 170 | def hyperparameter_space(self, param_space=None): 171 | 172 | space = { 173 | 'learning_rate': hp.uniform('learning_rate', 0.01, 0.2), 174 | 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20), 175 | 'num_leaves': hp.quniform('num_leaves', 31, 256, 4), 176 | 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10), 177 | 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.), 178 | 'subsample': hp.uniform('subsample', 0.5, 1.), 179 | 'reg_alpha': hp.uniform('reg_alpha', 0.01, 0.1), 180 | 'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.1), 181 | } 182 | 183 | if param_space: 184 | return param_space 185 | else: 186 | return space 187 | -------------------------------------------------------------------------------- /train/train_hyperparameterhunter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import warnings 4 | import pickle 5 | import json 6 | import lightgbm as lgb 7 | 8 | import pdb 9 | 10 | from pathlib import Path 11 | from hyperparameter_hunter import (Environment, CVExperiment, 12 | BayesianOptimization, Integer, Real, Categorical) 13 | from hyperparameter_hunter import optimization as opt 14 | from sklearn.model_selection import StratifiedKFold 15 | 16 | warnings.filterwarnings("ignore") 17 | 18 | 19 | class LGBOptimizer(object): 20 | def __init__(self, trainDataset, out_dir): 21 | """ 22 | Hyper Parameter optimization 23 | 24 | Comments: Hyperparameter_hunter (hereafter HH) is a fantastic package 25 | (https://github.com/HunterMcGushion/hyperparameter_hunter) to avoid 26 | wasting time as you optimise parameters. In the words of his author: 27 | "For so long, hyperparameter optimization has been such a time 28 | consuming process that just pointed you in a direction for further 29 | optimization, then you basically had to start over". 
30 | 31 | Parameters: 32 | ----------- 33 | trainDataset: FeatureTools object 34 | The result of running FeatureTools().fit() 35 | out_dir: Str 36 | Path to the output directory 37 | """ 38 | 39 | self.PATH = str(out_dir) 40 | self.data = trainDataset.data 41 | self.data['target'] = trainDataset.target 42 | self.colnames = trainDataset.colnames 43 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns 44 | 45 | def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold, 46 | maxevals=200, do_predict_proba=None, model_id=0): 47 | 48 | params = self.hyperparameter_space() 49 | extra_params = self.extra_setup() 50 | 51 | env = Environment( 52 | train_dataset=self.data, 53 | results_path='HyperparameterHunterAssets', 54 | # results_path=self.PATH, 55 | metrics=[metrics], 56 | do_predict_proba = do_predict_proba, 57 | cv_type=cv_type, 58 | cv_params=dict(n_splits=n_splits), 59 | ) 60 | 61 | # optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals) 62 | optimizer = opt.BayesianOptimization(iterations=maxevals) 63 | optimizer.set_experiment_guidelines( 64 | model_initializer=lgb.LGBMClassifier, 65 | model_init_params=params, 66 | model_extra_params=extra_params 67 | ) 68 | optimizer.go() 69 | # there are a few fixes on its way and the next few lines will soon be 70 | # one. At the moment, to access to the best parameters one has to read 71 | # from disc and access them 72 | best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\ 73 | optimizer.best_experiment+'.json' 74 | with open(best_experiment) as best: 75 | best = json.loads(best.read())['hyperparameters']['model_init_params'] 76 | model = lgb.LGBMClassifier(**best) 77 | X, y = self.data.drop('target',axis=1), self.data.target 78 | model.fit(X,y, 79 | feature_name=self.colnames, 80 | categorical_feature=self.categorical_columns 81 | ) 82 | model_fname = 'model_{}_.p'.format(model_id) 83 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id) 84 | pickle.dump(model, open('/'.join([self.PATH,model_fname]), 'wb')) 85 | pickle.dump(optimizer, open('/'.join([self.PATH,best_experiment_fname]), 'wb')) 86 | 87 | 88 | def hyperparameter_space(self, param_space=None): 89 | 90 | space = dict( 91 | is_unbalance = True, 92 | learning_rate = Real(0.01, 0.3), 93 | num_boost_round=Categorical(np.arange(50, 500, 20)), 94 | num_leaves=Categorical(np.arange(31, 256, 4)), 95 | min_child_weight = Real(0.1, 10), 96 | colsample_bytree= Real(0.5, 1.), 97 | subsample=Real(0.5, 1.), 98 | reg_alpha= Real(0.01, 0.1), 99 | reg_lambda= Real(0.01, 0.1) 100 | ) 101 | 102 | if param_space: 103 | return param_space 104 | else: 105 | return space 106 | 107 | 108 | def extra_setup(self, extra_setup=None): 109 | 110 | extra_params = dict( 111 | early_stopping_rounds=20, 112 | feature_name=self.colnames, 113 | categorical_feature=self.categorical_columns 114 | ) 115 | 116 | if extra_setup: 117 | return extra_setup 118 | else: 119 | return extra_params 120 | -------------------------------------------------------------------------------- /train/train_hyperparameterhunter_mlfow.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import warnings 4 | import pickle 5 | import json 6 | import lightgbm as lgb 7 | import mlflow 8 | import mlflow.sklearn 9 | 10 | import pdb 11 | 12 | from pathlib import Path 13 | from hyperparameter_hunter import (Environment, CVExperiment, 14 | BayesianOptimization, 
Integer, Real, Categorical) 15 | from hyperparameter_hunter import optimization as opt 16 | from sklearn.model_selection import StratifiedKFold 17 | from mlflow.tracking import MlflowClient 18 | 19 | 20 | warnings.filterwarnings("ignore") 21 | 22 | 23 | class LGBOptimizer(object): 24 | def __init__(self, trainDataset, out_dir): 25 | """ 26 | Hyper Parameter optimization 27 | 28 | Comments: Hyperparameter_hunter (hereafter HH) is a fantastic package 29 | (https://github.com/HunterMcGushion/hyperparameter_hunter) to avoid 30 | wasting time as you optimise parameters. In the words of his author: 31 | "For so long, hyperparameter optimization has been such a time 32 | consuming process that just pointed you in a direction for further 33 | optimization, then you basically had to start over". 34 | 35 | Parameters: 36 | ----------- 37 | trainDataset: FeatureTools object 38 | The result of running FeatureTools().fit() 39 | out_dir: Str 40 | Path to the output directory 41 | """ 42 | 43 | self.PATH = str(out_dir) 44 | self.data = trainDataset.data 45 | self.data['target'] = trainDataset.target 46 | self.colnames = trainDataset.colnames 47 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns 48 | 49 | def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold, 50 | maxevals=200, do_predict_proba=None, model_id=0, reuse_experiment=False): 51 | 52 | params = self.hyperparameter_space() 53 | extra_params = self.extra_setup() 54 | 55 | env = Environment( 56 | train_dataset=self.data, 57 | results_path='HyperparameterHunterAssets', 58 | # results_path=self.PATH, 59 | metrics=[metrics], 60 | do_predict_proba = do_predict_proba, 61 | cv_type=cv_type, 62 | cv_params=dict(n_splits=n_splits), 63 | ) 64 | 65 | # optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals) 66 | optimizer = opt.BayesianOptimization(iterations=maxevals) 67 | optimizer.set_experiment_guidelines( 68 | model_initializer=lgb.LGBMClassifier, 69 | model_init_params=params, 70 | model_extra_params=extra_params 71 | ) 72 | optimizer.go() 73 | 74 | # there are a few fixes on its way and the next few lines will soon be 75 | # one. At the moment, to access to the best parameters one has to read 76 | # from disc and access them 77 | best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\ 78 | optimizer.best_experiment+'.json' 79 | with open(best_experiment) as best: 80 | best = json.loads(best.read())['hyperparameters']['model_init_params'] 81 | 82 | # The next few lines are the only ones related to mlflow 83 | if not Path('mlruns').exists(): 84 | # here set the tracking_uri. 
If None then http://localhost:5000 85 | client = MlflowClient() 86 | n_experiments=0 87 | elif not reuse_experiment: 88 | client = MlflowClient() 89 | n_experiments = len(client.list_experiments()) 90 | experiment_name = 'experiment_' + str(n_experiments) 91 | client.create_experiment(name=experiment_name) 92 | with mlflow.start_run(experiment_id=n_experiments): 93 | model = lgb.LGBMClassifier(**best) 94 | X, y = self.data.drop('target',axis=1), self.data.target 95 | model.fit(X,y, 96 | feature_name=self.colnames, 97 | categorical_feature=self.categorical_columns 98 | ) 99 | for name, value in best.items(): 100 | mlflow.log_param(name, value) 101 | mlflow.log_metric('f1_score', -optimizer.optimizer_result.fun) 102 | mlflow.sklearn.log_model(model, "model") 103 | 104 | model_fname = 'model_{}_.p'.format(model_id) 105 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id) 106 | pickle.dump(model, open('/'.join([self.PATH,model_fname]), 'wb')) 107 | pickle.dump(optimizer, open('/'.join([self.PATH,best_experiment_fname]), 'wb')) 108 | 109 | 110 | def hyperparameter_space(self, param_space=None): 111 | 112 | space = dict( 113 | is_unbalance = True, 114 | learning_rate = Real(0.01, 0.3), 115 | num_boost_round=Integer(50, 500), 116 | num_leaves=Integer(31, 255), 117 | min_child_weight = Real(0.1, 10), 118 | colsample_bytree= Real(0.5, 1.), 119 | subsample=Real(0.5, 1.), 120 | reg_alpha= Real(0.01, 0.1), 121 | reg_lambda= Real(0.01, 0.1) 122 | ) 123 | 124 | if param_space: 125 | return param_space 126 | else: 127 | return space 128 | 129 | def extra_setup(self, extra_setup=None): 130 | 131 | extra_params = dict( 132 | early_stopping_rounds=20, 133 | feature_name=self.colnames, 134 | categorical_feature=self.categorical_columns 135 | ) 136 | 137 | if extra_setup: 138 | return extra_setup 139 | else: 140 | return extra_params 141 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import json 3 | import pandas as pd 4 | import pickle 5 | import argparse 6 | 7 | from pathlib import Path 8 | from kafka import KafkaConsumer 9 | 10 | from utils.messages_utils import publish_traininig_completed 11 | from utils.preprocess_data import build_train 12 | 13 | 14 | KAFKA_HOST = 'localhost:9092' 15 | RETRAIN_TOPIC = 'retrain_topic' 16 | PATH = Path('data/') 17 | TRAIN_DATA = PATH/'train/train.csv' 18 | DATAPROCESSORS_PATH = PATH/'dataprocessors' 19 | MODELS_PATH = PATH/'models' 20 | MESSAGES_PATH = PATH/'messages' 21 | 22 | 23 | def train(model_id, messages, hyper): 24 | print("RETRAINING STARTED (model id: {})".format(model_id)) 25 | dtrain = build_train(TRAIN_DATA, DATAPROCESSORS_PATH, model_id, messages) 26 | if hyper == "hyperopt": 27 | # from train.train_hyperopt import LGBOptimizer 28 | from train.train_hyperopt_mlflow import LGBOptimizer 29 | elif hyper == "hyperparameterhunter": 30 | # from train.train_hyperparameterhunter import LGBOptimizer 31 | from train.train_hyperparameterhunter_mlfow import LGBOptimizer 32 | LGBOpt = LGBOptimizer(dtrain, MODELS_PATH) 33 | LGBOpt.optimize(maxevals=2, model_id=model_id) 34 | print("RETRAINING COMPLETED (model id: {})".format(model_id)) 35 | 36 | 37 | def start(hyper): 38 | consumer = KafkaConsumer(RETRAIN_TOPIC, bootstrap_servers=KAFKA_HOST) 39 | 40 | for msg in consumer: 41 | message = json.loads(msg.value) 42 | if 'retrain' in message and message['retrain']: 43 | model_id = message['model_id'] 44 | batch_id = 
message['batch_id'] 45 | message_fname = 'messages_{}_.txt'.format(batch_id) 46 | messages = MESSAGES_PATH/message_fname 47 | 48 | train(model_id, messages, hyper) 49 | publish_traininig_completed(model_id) 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser() 54 | 55 | parser.add_argument("--hyper", type=str, default="hyperopt") 56 | args = parser.parse_args() 57 | 58 | start(args.hyper) -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/utils/__init__.py -------------------------------------------------------------------------------- /utils/feature_tools.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import copy 3 | 4 | 5 | class FeatureTools(object): 6 | """Collection of preprocessing methods""" 7 | 8 | @staticmethod 9 | def num_scaler(df_inp, cols, sc, trained=False): 10 | """ 11 | Method to scale numeric columns in a dataframe 12 | 13 | Parameters: 14 | ----------- 15 | df_inp: Pandas.DataFrame 16 | cols: List 17 | List of numeric columns to be scaled 18 | sc: Scaler object. From sklearn.preprocessing or similar structure 19 | trained: Boolean 20 | If True it will only be used to 'transform' 21 | 22 | Returns: 23 | -------- 24 | df: Pandas.DataFrame 25 | transformed/normalised dataframe 26 | sc: trained scaler 27 | """ 28 | df = df_inp.copy() 29 | if not trained: 30 | df[cols] = sc.fit_transform(df[cols]) 31 | else: 32 | df[cols] = sc.transform(df[cols]) 33 | return df, sc 34 | 35 | @staticmethod 36 | def cross_columns(df_inp, x_cols): 37 | """ 38 | Method to build crossed columns. These are new columns that are the 39 | cartesian product of the parent columns. 40 | 41 | Parameters: 42 | ----------- 43 | df_inp: Pandas.DataFrame 44 | x_cols: List. 45 | List of tuples with the columns to cross 46 | e.g. [('colA', 'colB'),('colC', 'colD')] 47 | 48 | Returns: 49 | -------- 50 | df: Pandas.DataFrame 51 | pandas dataframe with the new crossed columns 52 | colnames: List 53 | list the new column names 54 | """ 55 | df = df_inp.copy() 56 | colnames = ['_'.join(x_c) for x_c in x_cols] 57 | crossed_columns = {k:v for k,v in zip(colnames, x_cols)} 58 | 59 | for k, v in crossed_columns.items(): 60 | df[k] = df[v].apply(lambda x: '-'.join(x), axis=1) 61 | 62 | return df, colnames 63 | 64 | @staticmethod 65 | def val2idx(df_inp, cols, val_to_idx=None): 66 | """ 67 | This is basically a LabelEncoder that returns a dictionary with the 68 | mapping of the labels. 
69 | 70 | Parameters: 71 | ----------- 72 | df_inp: Pandas.DataFrame 73 | cols: List 74 | List of categorical columns to encode 75 | val_to_idx: Dict 76 | LabelEncoding dictionary if already exists 77 | 78 | Returns: 79 | -------- 80 | df: Pandas.DataFrame 81 | pandas dataframe with the categorical columns encoded 82 | val_to_idx: Dict 83 | dictionary with the encoding mappings 84 | """ 85 | df = df_inp.copy() 86 | if not val_to_idx: 87 | 88 | val_types = dict() 89 | for c in cols: 90 | val_types[c] = df[c].unique() 91 | 92 | val_to_idx = dict() 93 | for k, v in val_types.items(): 94 | val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])} 95 | 96 | for k, v in val_to_idx.items(): 97 | df[k] = df[k].apply(lambda x: v[x]) 98 | 99 | return df, val_to_idx 100 | 101 | def fit(self, df_inp, target_col, numerical_columns, categorical_columns, x_columns, sc): 102 | """ 103 | Parameters: 104 | ----------- 105 | df_inp: Pandas.DataFrame 106 | target_col: Str 107 | numerical_columns: List 108 | List with the numerical columns 109 | categorical_columns: List 110 | List with the categorical columns 111 | x_columns: List 112 | List of tuples with the columns to cross 113 | sc: Scaler. From sklearn.preprocessing or object with the same 114 | structure 115 | """ 116 | df = df_inp.copy() 117 | self.numerical_columns = numerical_columns 118 | self.categorical_columns = categorical_columns 119 | self.x_columns = x_columns 120 | 121 | df, self.sc = self.num_scaler(df, numerical_columns, sc) 122 | df, self.crossed_columns = self.cross_columns(df, x_columns) 123 | df, self.encoding_d = self.val2idx(df, categorical_columns+self.crossed_columns) 124 | 125 | self.target = df[target_col] 126 | df.drop(target_col, axis=1, inplace=True) 127 | self.data = df 128 | self.colnames = df.columns.tolist() 129 | 130 | return self 131 | 132 | def transform(self, df_inp, trained_sc=None): 133 | """ 134 | Parameters: 135 | ----------- 136 | df_inp: Pandas.DataFrame 137 | trained_sc: Scaler. 
From sklearn.preprocessing or object with the same 138 | 139 | Returns: 140 | -------- 141 | df: Pandas.DataFrame 142 | Tranformed dataframe: scaled, Labelencoded and with crossed columns 143 | """ 144 | df = df_inp.copy() 145 | if trained_sc: 146 | sc = copy.deepcopy(trained_sc) 147 | else: 148 | sc = copy.deepcopy(self.sc) 149 | 150 | df, _ = self.num_scaler(df, self.numerical_columns, sc, trained=True) 151 | df, _ = self.cross_columns(df, self.x_columns) 152 | df, _ = self.val2idx(df, self.categorical_columns+self.crossed_columns, self.encoding_d) 153 | 154 | return df 155 | -------------------------------------------------------------------------------- /utils/messages_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | 4 | from kafka import KafkaProducer 5 | 6 | producer = KafkaProducer(bootstrap_servers='localhost:9092') 7 | 8 | def publish_prediction(pred, request_id): 9 | producer.send('app_messages', json.dumps({'request_id': request_id, 'prediction': float(pred)}).encode('utf-8')) 10 | producer.flush() 11 | 12 | 13 | def publish_traininig_completed(model_id): 14 | producer.send('retrain_topic', json.dumps({'training_completed': True, 'model_id': model_id}).encode('utf-8')) 15 | producer.flush() 16 | 17 | 18 | def read_messages_count(path, repeat_every): 19 | file_list=list(path.iterdir()) 20 | nfiles = len(file_list) 21 | if nfiles==0: 22 | return 0 23 | else: 24 | return ((nfiles-1)*repeat_every) + len(file_list[-1].open().readlines()) 25 | 26 | 27 | def append_message(message, path, batch_id): 28 | message_fname = 'messages_{}_.txt'.format(batch_id) 29 | f=open(path/message_fname, "a") 30 | f.write("%s\n" % (json.dumps(message))) 31 | f.close() 32 | 33 | 34 | def send_retrain_message(model_id, batch_id): 35 | producer.send('retrain_topic', json.dumps({'retrain': True, 'model_id': model_id, 'batch_id': batch_id}).encode('utf-8')) 36 | producer.flush() -------------------------------------------------------------------------------- /utils/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | import json 4 | import pdb 5 | import warnings 6 | 7 | from pathlib import Path 8 | from utils.feature_tools import FeatureTools 9 | from sklearn.preprocessing import MinMaxScaler 10 | 11 | warnings.filterwarnings("ignore") 12 | 13 | 14 | def load_new_training_data(path): 15 | data = [] 16 | with open(path, "r") as f: 17 | for line in f: 18 | data.append(json.loads(line)) 19 | return pd.DataFrame(data) 20 | 21 | 22 | def build_train(train_path, results_path, dataprocessor_id=0, PATH_2=None): 23 | target = 'income_label' 24 | # read initial DataFrame 25 | df = pd.read_csv(train_path) 26 | if PATH_2: 27 | df_tmp = load_new_training_data(PATH_2) 28 | # Let's make sure columns are in the same order 29 | df_tmp = df_tmp[df.columns] 30 | # append new DataFrame 31 | df = pd.concat([df, df_tmp], ignore_index=True) 32 | # Save it to disk 33 | df.to_csv(train_path, index=False) 34 | 35 | df[target] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int) 36 | df.drop('income_bracket', axis=1, inplace=True) 37 | 38 | categorical_columns = list(df.select_dtypes(include=['object']).columns) 39 | numerical_columns = [c for c in df.columns if c not in categorical_columns+[target]] 40 | crossed_columns = (['education', 'occupation'], ['native_country', 'occupation']) 41 | 42 | preprocessor = FeatureTools() 43 | dataprocessor = 
preprocessor.fit( 44 | df, 45 | target, 46 | numerical_columns, 47 | categorical_columns, 48 | crossed_columns, 49 | sc=MinMaxScaler() 50 | ) 51 | 52 | dataprocessor_fname = 'dataprocessor_{}_.p'.format(dataprocessor_id) 53 | pickle.dump(dataprocessor, open(results_path/dataprocessor_fname, "wb")) 54 | if dataprocessor_id==0: 55 | pickle.dump(df.columns.tolist()[:-1], open(results_path/'column_order.p', "wb")) 56 | 57 | return dataprocessor 58 | 59 | 60 | # if __name__ == '__main__': 61 | 62 | # PATH = Path('data/') 63 | # TRAIN_PATH = PATH/'train' 64 | # DATAPROCESSORS_PATH = PATH/'dataprocessors' 65 | 66 | # dataprocessor = build_train(TRAIN_PATH/'train.csv', DATAPROCESSORS_PATH) 67 | 68 | --------------------------------------------------------------------------------
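As a closing illustration of how the pieces above fit together, here is a sketch of calling `FeatureTools` directly, mirroring what `build_train` does. This is illustrative only: the toy DataFrame and its values are invented for the example and are not part of the repo.

# Illustrative sketch: how FeatureTools.fit()/transform() are meant to be called
# (mirrors build_train in utils/preprocess_data.py). The toy data below is made up.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from utils.feature_tools import FeatureTools

df = pd.DataFrame({
    'age': [25, 40, 31],
    'education': ['Bachelors', 'HS-grad', 'Masters'],
    'occupation': ['Tech-support', 'Sales', 'Exec-managerial'],
    'native_country': ['United-States', 'Mexico', 'India'],
    'income_label': [0, 1, 0],
})

dataprocessor = FeatureTools().fit(
    df,
    target_col='income_label',
    numerical_columns=['age'],
    categorical_columns=['education', 'occupation', 'native_country'],
    x_columns=[['education', 'occupation'], ['native_country', 'occupation']],
    sc=MinMaxScaler(),
)

# dataprocessor.data and dataprocessor.target are what the LGBOptimizer classes in
# train/ consume, and transform() is what predictor.py calls on each incoming row.
new_rows = df.drop('income_label', axis=1)
transformed = dataprocessor.transform(new_rows)
print(transformed.head())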