├── .gitignore ├── README.md ├── images ├── model_retrained.png ├── pipeline_diagram.png └── start_pipeline.gif ├── initialize.py ├── notebooks ├── .ipynb_checkpoints │ └── skopt_vs_hyperopt-checkpoint.ipynb └── skopt_vs_hyperopt.ipynb ├── predictor.py ├── requirements.txt ├── sample_app.py ├── train ├── __init__.py ├── train_hyperopt.py ├── train_hyperopt_mlflow.py ├── train_hyperparameterhunter.py └── train_hyperparameterhunter_mlfow.py ├── trainer.py └── utils ├── __init__.py ├── feature_tools.py ├── messages_utils.py └── preprocess_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 127 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 128 | .idea/ 129 | # User-specific stuff 130 | .idea/**/workspace.xml 131 | .idea/**/tasks.xml 132 | .idea/**/usage.statistics.xml 133 | .idea/**/dictionaries 134 | .idea/**/shelf 135 | 136 | # Generated files 137 | .idea/**/contentModel.xml 138 | 139 | # Sensitive or high-churn files 140 | .idea/**/dataSources/ 141 | .idea/**/dataSources.ids 142 | .idea/**/dataSources.local.xml 143 | .idea/**/sqlDataSources.xml 144 | .idea/**/dynamic.xml 145 | .idea/**/uiDesigner.xml 146 | .idea/**/dbnavigator.xml 147 | 148 | # Gradle 149 | .idea/**/gradle.xml 150 | .idea/**/libraries 151 | 152 | # Gradle and Maven with auto-import 153 | # When using Gradle or Maven with auto-import, you should exclude module files, 154 | # since they will be recreated, and may cause churn. Uncomment if using 155 | # auto-import. 156 | # .idea/modules.xml 157 | # .idea/*.iml 158 | # .idea/modules 159 | # *.iml 160 | # *.ipr 161 | 162 | # CMake 163 | cmake-build-*/ 164 | 165 | # Mongo Explorer plugin 166 | .idea/**/mongoSettings.xml 167 | 168 | # File-based project format 169 | *.iws 170 | 171 | # IntelliJ 172 | out/ 173 | 174 | # mpeltonen/sbt-idea plugin 175 | .idea_modules/ 176 | 177 | # JIRA plugin 178 | atlassian-ide-plugin.xml 179 | 180 | # Cursive Clojure plugin 181 | .idea/replstate.xml 182 | 183 | # Crashlytics plugin (for Android Studio and IntelliJ) 184 | com_crashlytics_export_strings.xml 185 | crashlytics.properties 186 | crashlytics-build.properties 187 | fabric.properties 188 | 189 | # Editor-based Rest Client 190 | .idea/httpRequests 191 | 192 | # Android studio 3.1+ serialized cache file 193 | .idea/caches/build_file_checksums.ser 194 | 195 | data/ 196 | mlruns/ 197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Putting ML in Production 2 | This repo contains code that we hope is useful to illustrate how one could productionise a real-time algorithm. The companion Medium posts can be found [here](https://medium.com/@jrzaurin/putting-ml-in-production-i-using-apache-kafka-in-python-ce06b3a395c8) and [here](https://towardsdatascience.com/putting-ml-in-production-ii-logging-and-monitoring-algorithms-91f174044e4e). The code here is meant to be as generic as possible (within certain limits), and is designed to be useful in a scenario similar to the following one. 3 | 4 | ## Scenario 5 | 6 | A company collects data using a series of services that generate events as the users/customers interact with the company’s website or app. As these interactions happen, an algorithm needs to run in real time and some immediate action needs to be taken based on the algorithm’s outputs (or predictions).
On top of that, after *N* interactions (or observations) the algorithm needs to be retrained without stopping the prediction service, since users will keep interacting. 7 | 8 | For the exercise here we have used the [Adult](https://archive.ics.uci.edu/ml/datasets/adult) dataset, where the goal is to predict whether individuals earn an income higher/lower than 50k based on their age, native country, etc. To adapt this dataset to the scenario described before, one could assume that age, native country, etc. are collected through an online questionnaire/form and we need to predict whether users have high/low income in real time. If the income is high, we immediately call/email them with some offer, for example. Then, after N new observations we retrain the algorithm while we keep predicting on new users. 9 | 10 | ## Solution 11 | 12 | The online part of our solution is illustrated in the figure below, and uses mainly [Kafka-Python](https://github.com/dpkp/kafka-python), [LightGBM](https://lightgbm.readthedocs.io/en/latest/#) and [Hyperopt](http://hyperopt.github.io/hyperopt/) or [HyperparameterHunter](https://github.com/HunterMcGushion/hyperparameter_hunter). 13 | 14 | ![Figure 1. Real-time prediction ML pipeline.](images/pipeline_diagram.png) 15 | 16 | A full description of the solution can be found in the already mentioned Medium posts. Briefly: 17 | 18 | **OFFLINE TRAINING** 19 | 20 | The offline process is fairly standard and is all accomplished by running the `initialize.py` script. This script will download the dataset, set up the directory structure, preprocess the data, train an initial model on the training dataset and optimise the hyperparameters of that model. The results will be saved to disk, and from there on we are ready to move to the online stage of the process. 21 | 22 | **ONLINE PREDICTIONS AND RETRAINING** 23 | 24 | 0. The App/Service (`sample_app.py`) will send messages (JSON) into the pipeline. These will be processed, and the App/Service will then get the results of the predictions. 25 | 1. 1a) The messages from the App/Service will be published to Kafka and, eventually, received by the Predictor (`predictor.py`). 26 | 27 | 1b) The Predictor will process the data and run the algorithm, publishing a message with the prediction result back to Kafka, which will eventually be received by the App/Service. 28 | 2. After N messages the Predictor will publish a "retrain topic" message. 29 | 3. The Trainer (`trainer.py`) will receive the "retrain topic" message and start retraining the algorithm. In the meantime, the Predictor will not stop serving predictions. 30 | 4. Once the algorithm is retrained, the Trainer will publish a message with the corresponding information (namely: *"retraining completed"*). 31 | 5. The Predictor will receive the message that retraining is complete, load the new model and proceed as usual. 32 |
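To make the message flow above concrete, here is a stripped-down sketch of the Predictor's consume → predict → publish loop. The topic names, message fields and pickled artifacts match `predictor.py` further down in this repo; the model reload is simplified to a fixed filename (the real code rotates model ids and keeps message counters), and error handling is omitted.

```python
import json
import pickle

import pandas as pd
from kafka import KafkaConsumer, KafkaProducer

KAFKA_HOST = 'localhost:9092'

# artifacts created offline by initialize.py
model = pickle.load(open('data/models/model_0_.p', 'rb'))
dataprocessor = pickle.load(open('data/dataprocessors/dataprocessor_0_.p', 'rb'))

consumer = KafkaConsumer(bootstrap_servers=KAFKA_HOST)
consumer.subscribe(['app_messages', 'retrain_topic'])
producer = KafkaProducer(bootstrap_servers=KAFKA_HOST)

for msg in consumer:
    message = json.loads(msg.value)
    if msg.topic == 'retrain_topic' and message.get('training_completed'):
        # the Trainer finished: hot-swap the freshly trained model
        model = pickle.load(open('data/models/model_1_.p', 'rb'))
    elif msg.topic == 'app_messages' and 'prediction' not in message:
        # incoming observation: preprocess, predict and publish the result
        row = pd.DataFrame(message['data'], index=[0]).drop('income_bracket', axis=1)
        pred = model.predict(dataprocessor.transform(row))[0]
        payload = {'request_id': message['request_id'], 'prediction': float(pred)}
        producer.send('app_messages', json.dumps(payload).encode('utf-8'))
        producer.flush()
```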
33 | ## How to run the pipeline 34 | 35 | 1. Run `initialize.py`: 36 | ``` 37 | python initialize.py 38 | ``` 39 | or 40 | ``` 41 | python initialize.py --hyper hyperparameterhunter 42 | ``` 43 | HyperparameterHunter is built on top of [Skopt](https://scikit-optimize.github.io/). It is not our goal here to compare hyperparameter optimization packages. Nonetheless, a brief comparison is included in the Medium post, and a notebook comparing the performance of Skopt and Hyperopt is included here, in the notebooks directory. 44 | 45 | 2. Start Zookeeper and Kafka. Assuming these are installed using Homebrew, starting these services is as easy as: 46 | ``` 47 | $ brew services start zookeeper 48 | ==> Successfully started `zookeeper` (label: homebrew.mxcl.zookeeper) 49 | $ brew services start kafka 50 | ==> Successfully started `kafka` (label: homebrew.mxcl.kafka) 51 | ``` 52 | 53 | 3. In Terminal #1 run the Predictor (or the Trainer): 54 | ``` 55 | python predictor.py 56 | ``` 57 | 4. In Terminal #2 run the Trainer (or the Predictor): 58 | ``` 59 | python trainer.py 60 | ``` 61 | or 62 | ``` 63 | python trainer.py --hyper hyperparameterhunter 64 | ``` 65 | 5. In Terminal #3 run the Sample App: 66 | ``` 67 | python sample_app.py 68 | ``` 69 | 70 | Below we have included a GIF showing steps 3, 4 and 5. 71 | 72 | 73 | ![Figure 2. How to launch the Pipeline](images/start_pipeline.gif) 74 | 75 | After `RETRAIN_EVERY` messages (a parameter set by the user), you will be able to see how the algorithm is retrained in the terminal, as shown in the figure below. The top-right terminal shows how Hyperopt has run 10 evaluations (in a real exercise this should be a few hundred). Once the model is retrained and optimised, we see in the top-left window how the Predictor has loaded the new model (after the annoying warning message from the new LightGBM version) and proceeds with the prediction service as usual. 76 | 77 | ![Figure 3. Retraining process](images/model_retrained.png) 78 | 79 | 80 | ## Logging and monitoring 81 | To log all the information generated from the pipeline as it retrains the algorithm, one could directly use HyperparameterHunter, which is fantastic precisely for that task. In addition, we have also used MLflow, which comes with a very convenient UI. [Our second post](https://towardsdatascience.com/putting-ml-in-production-ii-logging-and-monitoring-algorithms-91f174044e4e) focuses on the interplay of these two tools. All the related code can be found in the `train` module, within `train_hyperopt_mlflow.py` or `train_hyperparameterhunter_mlfow.py`.
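For illustration only (this is not the exact code in `train/train_hyperopt_mlflow.py`), wrapping each Hyperopt evaluation in an MLflow run, so that every set of hyperparameters and its score shows up in the MLflow UI, looks roughly like this. Here `train_and_score` is a hypothetical helper standing in for the cross-validation the training scripts actually perform.

```python
# Rough sketch: log every Hyperopt evaluation as a separate MLflow run.
# `train_and_score` is a hypothetical stand-in for the repo's cross-validation.
import mlflow
from hyperopt import Trials, fmin, hp, tpe


def objective(params):
    with mlflow.start_run():
        for name, value in params.items():
            mlflow.log_param(name, value)   # hyperparameters of this evaluation
        loss = train_and_score(params)      # e.g. 1 - mean CV F1 score
        mlflow.log_metric('loss', loss)
    return loss


space = {'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
         'num_leaves': hp.quniform('num_leaves', 31, 255, 4)}
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=Trials())
# browse the logged runs with: mlflow ui
```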
82 | 83 | Comments or suggestions, please email: jrzaurin@gmail.com 84 | -------------------------------------------------------------------------------- /images/model_retrained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/model_retrained.png -------------------------------------------------------------------------------- /images/pipeline_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/pipeline_diagram.png -------------------------------------------------------------------------------- /images/start_pipeline.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/images/start_pipeline.gif -------------------------------------------------------------------------------- /initialize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import lightgbm as lgb 4 | import pickle 5 | import warnings 6 | import argparse 7 | import os 8 | import pdb 9 | 10 | from pathlib import Path 11 | from utils.preprocess_data import build_train 12 | 13 | 14 | PATH = Path('data/') 15 | TRAIN_PATH = PATH/'train' 16 | DATAPROCESSORS_PATH = PATH/'dataprocessors' 17 | MODELS_PATH = PATH/'models' 18 | MESSAGES_PATH = PATH/'messages' 19 | 20 | 21 | def create_folders(): 22 | print("creating directory structure...") 23 | (PATH).mkdir(exist_ok=True) 24 | (TRAIN_PATH).mkdir(exist_ok=True) 25 | (MODELS_PATH).mkdir(exist_ok=True) 26 | (DATAPROCESSORS_PATH).mkdir(exist_ok=True) 27 | (MESSAGES_PATH).mkdir(exist_ok=True) 28 | 29 | 30 | def download_data(): 31 | train_path = PATH/'adult.data' 32 | test_path = PATH/'adult.test' 33 | 34 | COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", 35 | "marital_status", "occupation", "relationship", "race", "gender", 36 | "capital_gain", "capital_loss", "hours_per_week", "native_country", 37 | "income_bracket"] 38 | 39 | print("downloading training data...") 40 | df_train = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data", 41 | names=COLUMNS, skipinitialspace=True, index_col=0) 42 | df_train.drop("education_num", axis=1, inplace=True) 43 | df_train.to_csv(train_path) 44 | df_train.to_csv(PATH/'train/train.csv') 45 | 46 | print("downloading testing data...") 47 | df_test = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test", 48 | names=COLUMNS, skipinitialspace=True, skiprows=1, index_col=0) 49 | df_test.drop("education_num", axis=1, inplace=True) 50 | df_test.to_csv(test_path) 51 | 52 | 53 | def create_data_processor(): 54 | print("creating preprocessor...") 55 | dataprocessor = build_train(TRAIN_PATH/'train.csv', DATAPROCESSORS_PATH) 56 | 57 | 58 | def create_model(hyper): 59 | print("creating model...") 60 | init_dataprocessor = 'dataprocessor_0_.p' 61 | dtrain = pickle.load(open(DATAPROCESSORS_PATH/init_dataprocessor, 'rb')) 62 | if hyper == "hyperopt": 63 | # from train.train_hyperopt import LGBOptimizer 64 | from train.train_hyperopt_mlflow import LGBOptimizer 65 | elif hyper == "hyperparameterhunter": 66 | # from train.train_hyperparameterhunter import LGBOptimizer 67 | from 
train.train_hyperparameterhunter_mlfow import LGBOptimizer 68 | LGBOpt = LGBOptimizer(dtrain, MODELS_PATH) 69 | LGBOpt.optimize(maxevals=50) 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | parser = argparse.ArgumentParser() 75 | 76 | parser.add_argument("--hyper", type=str, default="hyperopt") 77 | args = parser.parse_args() 78 | create_folders() 79 | download_data() 80 | create_data_processor() 81 | create_model(args.hyper) -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/skopt_vs_hyperopt-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Skopt vs Hyperopt\n", 8 | "\n", 9 | "## Importing and preprocessing data" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "/usr/local/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n", 22 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 23 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 24 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 25 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import numpy as np\n", 32 | "import pickle\n", 33 | "import lightgbm as lgb\n", 34 | "import warnings\n", 35 | "\n", 36 | "from time import time\n", 37 | "from hyperopt import hp, tpe, fmin, Trials\n", 38 | "from skopt import BayesSearchCV\n", 39 | "from skopt.space import Real, Categorical, Integer\n", 40 | "from skopt import gbrt_minimize\n", 41 | "from sklearn.model_selection import StratifiedKFold, cross_val_score\n", 42 | "from sklearn.metrics import log_loss\n", 43 | "from utils import FeatureTools\n", 44 | "\n", 45 | "warnings.filterwarnings(\"ignore\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
ageworkclassfnlwgteducationmarital_statusoccupationrelationshipracegendercapital_gaincapital_losshours_per_weeknative_countrytarget
039State-gov77516BachelorsNever-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States0
150Self-emp-not-inc83311BachelorsMarried-civ-spouseExec-managerialHusbandWhiteMale0013United-States0
238Private215646HS-gradDivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States0
353Private23472111thMarried-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States0
428Private338409BachelorsMarried-civ-spouseProf-specialtyWifeBlackFemale0040Cuba0
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " age workclass fnlwgt education marital_status \\\n", 182 | "0 39 State-gov 77516 Bachelors Never-married \n", 183 | "1 50 Self-emp-not-inc 83311 Bachelors Married-civ-spouse \n", 184 | "2 38 Private 215646 HS-grad Divorced \n", 185 | "3 53 Private 234721 11th Married-civ-spouse \n", 186 | "4 28 Private 338409 Bachelors Married-civ-spouse \n", 187 | "\n", 188 | " occupation relationship race gender capital_gain \\\n", 189 | "0 Adm-clerical Not-in-family White Male 2174 \n", 190 | "1 Exec-managerial Husband White Male 0 \n", 191 | "2 Handlers-cleaners Not-in-family White Male 0 \n", 192 | "3 Handlers-cleaners Husband Black Male 0 \n", 193 | "4 Prof-specialty Wife Black Female 0 \n", 194 | "\n", 195 | " capital_loss hours_per_week native_country target \n", 196 | "0 0 40 United-States 0 \n", 197 | "1 0 13 United-States 0 \n", 198 | "2 0 40 United-States 0 \n", 199 | "3 0 40 United-States 0 \n", 200 | "4 0 40 Cuba 0 " 201 | ] 202 | }, 203 | "execution_count": 2, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "df = pd.read_csv(\"data/adult.data\")\n", 210 | "df['target'] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)\n", 211 | "df.drop('income_bracket', axis=1, inplace=True)\n", 212 | "df.head()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "I have coded a preprocessor class before that does the work for us." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 3, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "the features column names are: ['age', 'workclass', 'fnlwgt', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'education_occupation', 'native_country_occupation']\n", 232 | "the categorical columns are: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'education_occupation', 'native_country_occupation']\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "dataprocessor = pickle.load(open(\"data/dataprocessors/dataprocessor_0_.p\", \"rb\"))\n", 238 | "all_features = dataprocessor.colnames\n", 239 | "categorical_features = dataprocessor.cat_cols + dataprocessor.crossed_columns\n", 240 | "\n", 241 | "print(\"the features column names are: {}\".format(all_features))\n", 242 | "print(\"the categorical columns are: {}\".format(categorical_features))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "the `dataprocessor` is already train, so we simply need to `transform`" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 4, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/html": [ 260 | "
\n", 261 | "\n", 274 | "\n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
ageworkclassfnlwgteducationmarital_statusoccupationrelationshipracegendercapital_gaincapital_losshours_per_weeknative_countrytargeteducation_occupationnative_country_occupation
00.30137000.0443020000000.021740.00.3979590000
10.45205510.0482380111000.000000.00.1224490011
20.28767120.1381131220000.000000.00.3979590022
30.49315120.1510682121100.000000.00.3979590032
40.15068520.2214880132110.000000.00.3979591043
\n", 394 | "
" 395 | ], 396 | "text/plain": [ 397 | " age workclass fnlwgt education marital_status occupation \\\n", 398 | "0 0.301370 0 0.044302 0 0 0 \n", 399 | "1 0.452055 1 0.048238 0 1 1 \n", 400 | "2 0.287671 2 0.138113 1 2 2 \n", 401 | "3 0.493151 2 0.151068 2 1 2 \n", 402 | "4 0.150685 2 0.221488 0 1 3 \n", 403 | "\n", 404 | " relationship race gender capital_gain capital_loss hours_per_week \\\n", 405 | "0 0 0 0 0.02174 0.0 0.397959 \n", 406 | "1 1 0 0 0.00000 0.0 0.122449 \n", 407 | "2 0 0 0 0.00000 0.0 0.397959 \n", 408 | "3 1 1 0 0.00000 0.0 0.397959 \n", 409 | "4 2 1 1 0.00000 0.0 0.397959 \n", 410 | "\n", 411 | " native_country target education_occupation native_country_occupation \n", 412 | "0 0 0 0 0 \n", 413 | "1 0 0 1 1 \n", 414 | "2 0 0 2 2 \n", 415 | "3 0 0 3 2 \n", 416 | "4 1 0 4 3 " 417 | ] 418 | }, 419 | "execution_count": 4, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "train_data = dataprocessor.transform(df)\n", 426 | "\n", 427 | "train_data.head()" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 6, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# np arrays\n", 437 | "X_train = train_data[[c for c in train_data.columns if c is not 'target']].values\n", 438 | "y_train = train_data['target'].values\n", 439 | "\n", 440 | "# lgb Dataset object\n", 441 | "lgtrain = lgb.Dataset(X_train,\n", 442 | " label=y_train,\n", 443 | " feature_name=all_features,\n", 444 | " categorical_feature=categorical_features,\n", 445 | " free_raw_data=False)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 7, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# model and fit params\n", 455 | "params = dict(learning_rate=0.01,\n", 456 | " num_boost_round=300,\n", 457 | " num_leaves = 255,\n", 458 | " verbose=-1,\n", 459 | " is_unbalance=True)\n", 460 | "fit_params = dict(feature_name=all_features,\n", 461 | " categorical_feature=categorical_features)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "## 1. First experiment. Sklearn wrap up vs lightgbm methods" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 12, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "7.932429075241089\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "clf = lgb.LGBMClassifier(**params, silent=True)\n", 486 | "start = time()\n", 487 | "score = cross_val_score(clf,\n", 488 | " X_train, y_train,\n", 489 | " scoring='neg_log_loss',\n", 490 | " cv=StratifiedKFold(random_state=1981),\n", 491 | " fit_params=fit_params)\n", 492 | "sklearn_runtime = time() - start\n", 493 | "print(sklearn_runtime)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 13, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "7.038502931594849\n" 506 | ] 507 | } 508 | ], 509 | "source": [ 510 | "start = time()\n", 511 | "cv_result = lgb.cv(params,\n", 512 | " lgtrain,\n", 513 | " metrics='binary_logloss',\n", 514 | " nfold=3,\n", 515 | " stratified=True, \n", 516 | " seed=1981)\n", 517 | "lightgbm_runtime = time() - start\n", 518 | "print(lightgbm_runtime)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "LightGBM methods seem to be a bit faster. 
Let's now compare `Hyperopt` and `Skopt`" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## Hyperopt vs Skopt" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "The first thing to comment is that while Hyperopt offers the `hp.quniform(label, low, high, q)` parameter expressions, there is not such a thing for Skopt. One has `Categorical`, but you have to pass all values. In other words, When using hyperopt one could use:\n", 540 | "\n", 541 | " 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20)\n", 542 | "\n", 543 | "but when using Skopt one would have to do:\n", 544 | "\n", 545 | " Categorical(np.arange(50, 500, 20))\n", 546 | " \n", 547 | "Because I want to keep the comparison as light and direct as possible, I will just use `Real` parameters with uniform distributions." 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "### 1. Hyperopt\n", 555 | "\n", 556 | "With Hyperopt we will use the [TPE](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) algorithm." 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 14, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "hp_space = {\n", 566 | " 'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),\n", 567 | " 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),\n", 568 | " 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),\n", 569 | " 'subsample': hp.uniform('subsample', 0.5, 1.),\n", 570 | " }" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 20, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "100%|██████████| 50/50 [00:39<00:00, 1.18it/s, best loss: 0.28343034788381305]\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "def objective(params):\n", 588 | " clf = lgb.LGBMClassifier(**params, is_unbalance=True, verbose=-1, silent=True)\n", 589 | " score = cross_val_score(clf,\n", 590 | " X_train, y_train,\n", 591 | " scoring='f1',\n", 592 | " cv=StratifiedKFold(random_state=3),\n", 593 | " fit_params=fit_params).mean()\n", 594 | " return 1-score\n", 595 | "trials = Trials()\n", 596 | "best = fmin(fn=objective,\n", 597 | " space=hp_space,\n", 598 | " algo=tpe.suggest,\n", 599 | " max_evals=50,\n", 600 | " trials=trials)" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "### 2. 
SKopt\n", 608 | "\n", 609 | "Since TPE is a Bayesian method we will first compare with the `BayesSearchCV` method in `Skopt`" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 16, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "hh_space = dict(\n", 619 | " learning_rate = Real(0.01, 0.3),\n", 620 | " min_child_weight = Real(0.1, 10),\n", 621 | " colsample_bytree= Real(0.5, 1.),\n", 622 | " subsample=Real(0.5, 1.),\n", 623 | " )" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 17, 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "63.14497208595276\n" 636 | ] 637 | } 638 | ], 639 | "source": [ 640 | "clf = lgb.LGBMClassifier(is_unbalance=True, verbose=-1, silent=True)\n", 641 | "start = time()\n", 642 | "opt = BayesSearchCV(clf,\n", 643 | " search_spaces=hh_space,\n", 644 | " scoring='f1',\n", 645 | " cv=StratifiedKFold(random_state=3),\n", 646 | " fit_params=fit_params,\n", 647 | " n_iter=50,\n", 648 | " n_jobs=-1)\n", 649 | "opt.fit(X_train, y_train)\n", 650 | "skopt_bayes_runtime = time()-start\n", 651 | "print(skopt_bayes_runtime)" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "`Skopt`'s seems to be a significantly slower than hyperopt even with no verbosity. Let's see if performs better:" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 19, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "best SKOPT F1 score: 0.7174372197995806\n" 671 | ] 672 | } 673 | ], 674 | "source": [ 675 | "print('best SKOPT F1 score: {}'.format(opt.best_score_))" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "which is almost identical to the one obtained with `Hyperopt`" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 25, 688 | "metadata": {}, 689 | "outputs": [ 690 | { 691 | "name": "stdout", 692 | "output_type": "stream", 693 | "text": [ 694 | "best HYPEROPT F1 score: 0.716569652116187\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "# Remember hyperopt minimises 1-score. \n", 700 | "print('best HYPEROPT F1 score: {}'.format(1-trials.best_trial['result']['loss']))" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "metadata": {}, 706 | "source": [ 707 | "The conclusion at this stage is that `Hyperopt` is faster than `Skopt` with the same performance. However, the `TPE` algorithm is a tree based algorithm, so let's also compare with the `gbrt_minimize` method (Sequential optimization using gradient boosted trees) in `Skopt`. Here the syntax is a bit different to that of `BayesSearchCV`. 
" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 26, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "# the space has to be tuples like these\n", 717 | "hh_space_gbrt = [Real(0.01, 0.3, 'uniform', name='learning_rate'),\n", 718 | " Real(0.1, 10, 'uniform', name='min_child_weight'),\n", 719 | " Real(0.5, 1., 'uniform', name='colsample_bytree'),\n", 720 | " Real(0.5, 1., 'uniform', name='subsample')]" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 30, 726 | "metadata": {}, 727 | "outputs": [], 728 | "source": [ 729 | "# Let's adapt the objective\n", 730 | "def gbrt_objective(params):\n", 731 | " tmp_params = {}\n", 732 | " tmp_params['learning_rate'], tmp_params['min_child_weight'], \\\n", 733 | " tmp_params['colsample_bytree'], tmp_params['subsample'], = params[0], params[1], params[2], params[3]\n", 734 | " clf = lgb.LGBMClassifier(**tmp_params, is_unbalance=True, verbose=-1, silent=True)\n", 735 | " score = cross_val_score(clf,\n", 736 | " X_train, y_train,\n", 737 | " scoring='f1',\n", 738 | " cv=StratifiedKFold(random_state=3),\n", 739 | " fit_params=fit_params).mean()\n", 740 | " return 1-score" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 31, 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "name": "stdout", 750 | "output_type": "stream", 751 | "text": [ 752 | "54.64228296279907\n" 753 | ] 754 | } 755 | ], 756 | "source": [ 757 | "start=time()\n", 758 | "sk_best = gbrt_minimize(gbrt_objective,\n", 759 | " hh_space_gbrt,\n", 760 | " n_calls=50,\n", 761 | " verbose=False,\n", 762 | " n_jobs=-1)\n", 763 | "skopt_gbrt_runtime = time()-start\n", 764 | "print(skopt_gbrt_runtime)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "Faster than `BayesSearchCV`, but still, slower than `Hyperopt`. Let's see if the results are any better" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 36, 777 | "metadata": {}, 778 | "outputs": [ 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "best SKOPT GBRT F1 score: 0.7173483496134895\n" 784 | ] 785 | } 786 | ], 787 | "source": [ 788 | "print('best SKOPT GBRT F1 score: {}'.format(1-sk_best.fun))" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "## CONCLUSION" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "`Hyperopt`'s TPE performs as good as Skopt `gbrt_minimize` and `BayesSearchCV` methods and is significantly faster." 
803 | ] 804 | } 805 | ], 806 | "metadata": { 807 | "kernelspec": { 808 | "display_name": "Python 3", 809 | "language": "python", 810 | "name": "python3" 811 | }, 812 | "language_info": { 813 | "codemirror_mode": { 814 | "name": "ipython", 815 | "version": 3 816 | }, 817 | "file_extension": ".py", 818 | "mimetype": "text/x-python", 819 | "name": "python", 820 | "nbconvert_exporter": "python", 821 | "pygments_lexer": "ipython3", 822 | "version": "3.6.5" 823 | } 824 | }, 825 | "nbformat": 4, 826 | "nbformat_minor": 2 827 | } 828 | -------------------------------------------------------------------------------- /notebooks/skopt_vs_hyperopt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Skopt vs Hyperopt\n", 8 | "\n", 9 | "## Importing and preprocessing data" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "/usr/local/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.\n", 22 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n", 23 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n", 24 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n", 25 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import pandas as pd\n", 31 | "import numpy as np\n", 32 | "import pickle\n", 33 | "import lightgbm as lgb\n", 34 | "import warnings\n", 35 | "\n", 36 | "from time import time\n", 37 | "from hyperopt import hp, tpe, fmin, Trials\n", 38 | "from skopt import BayesSearchCV\n", 39 | "from skopt.space import Real, Categorical, Integer\n", 40 | "from skopt import gbrt_minimize\n", 41 | "from sklearn.model_selection import StratifiedKFold, cross_val_score\n", 42 | "from sklearn.metrics import log_loss\n", 43 | "from utils import FeatureTools\n", 44 | "\n", 45 | "warnings.filterwarnings(\"ignore\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
ageworkclassfnlwgteducationmarital_statusoccupationrelationshipracegendercapital_gaincapital_losshours_per_weeknative_countrytarget
039State-gov77516BachelorsNever-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States0
150Self-emp-not-inc83311BachelorsMarried-civ-spouseExec-managerialHusbandWhiteMale0013United-States0
238Private215646HS-gradDivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States0
353Private23472111thMarried-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States0
428Private338409BachelorsMarried-civ-spouseProf-specialtyWifeBlackFemale0040Cuba0
\n", 178 | "
" 179 | ], 180 | "text/plain": [ 181 | " age workclass fnlwgt education marital_status \\\n", 182 | "0 39 State-gov 77516 Bachelors Never-married \n", 183 | "1 50 Self-emp-not-inc 83311 Bachelors Married-civ-spouse \n", 184 | "2 38 Private 215646 HS-grad Divorced \n", 185 | "3 53 Private 234721 11th Married-civ-spouse \n", 186 | "4 28 Private 338409 Bachelors Married-civ-spouse \n", 187 | "\n", 188 | " occupation relationship race gender capital_gain \\\n", 189 | "0 Adm-clerical Not-in-family White Male 2174 \n", 190 | "1 Exec-managerial Husband White Male 0 \n", 191 | "2 Handlers-cleaners Not-in-family White Male 0 \n", 192 | "3 Handlers-cleaners Husband Black Male 0 \n", 193 | "4 Prof-specialty Wife Black Female 0 \n", 194 | "\n", 195 | " capital_loss hours_per_week native_country target \n", 196 | "0 0 40 United-States 0 \n", 197 | "1 0 13 United-States 0 \n", 198 | "2 0 40 United-States 0 \n", 199 | "3 0 40 United-States 0 \n", 200 | "4 0 40 Cuba 0 " 201 | ] 202 | }, 203 | "execution_count": 2, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "df = pd.read_csv(\"data/adult.data\")\n", 210 | "df['target'] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int)\n", 211 | "df.drop('income_bracket', axis=1, inplace=True)\n", 212 | "df.head()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "I have coded a preprocessor class before that does the work for us." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 3, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "the features column names are: ['age', 'workclass', 'fnlwgt', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'education_occupation', 'native_country_occupation']\n", 232 | "the categorical columns are: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'education_occupation', 'native_country_occupation']\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "dataprocessor = pickle.load(open(\"data/dataprocessors/dataprocessor_0_.p\", \"rb\"))\n", 238 | "all_features = dataprocessor.colnames\n", 239 | "categorical_features = dataprocessor.cat_cols + dataprocessor.crossed_columns\n", 240 | "\n", 241 | "print(\"the features column names are: {}\".format(all_features))\n", 242 | "print(\"the categorical columns are: {}\".format(categorical_features))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "the `dataprocessor` is already train, so we simply need to `transform`" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 4, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/html": [ 260 | "
\n", 261 | "\n", 274 | "\n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
ageworkclassfnlwgteducationmarital_statusoccupationrelationshipracegendercapital_gaincapital_losshours_per_weeknative_countrytargeteducation_occupationnative_country_occupation
00.30137000.0443020000000.021740.00.3979590000
10.45205510.0482380111000.000000.00.1224490011
20.28767120.1381131220000.000000.00.3979590022
30.49315120.1510682121100.000000.00.3979590032
40.15068520.2214880132110.000000.00.3979591043
\n", 394 | "
" 395 | ], 396 | "text/plain": [ 397 | " age workclass fnlwgt education marital_status occupation \\\n", 398 | "0 0.301370 0 0.044302 0 0 0 \n", 399 | "1 0.452055 1 0.048238 0 1 1 \n", 400 | "2 0.287671 2 0.138113 1 2 2 \n", 401 | "3 0.493151 2 0.151068 2 1 2 \n", 402 | "4 0.150685 2 0.221488 0 1 3 \n", 403 | "\n", 404 | " relationship race gender capital_gain capital_loss hours_per_week \\\n", 405 | "0 0 0 0 0.02174 0.0 0.397959 \n", 406 | "1 1 0 0 0.00000 0.0 0.122449 \n", 407 | "2 0 0 0 0.00000 0.0 0.397959 \n", 408 | "3 1 1 0 0.00000 0.0 0.397959 \n", 409 | "4 2 1 1 0.00000 0.0 0.397959 \n", 410 | "\n", 411 | " native_country target education_occupation native_country_occupation \n", 412 | "0 0 0 0 0 \n", 413 | "1 0 0 1 1 \n", 414 | "2 0 0 2 2 \n", 415 | "3 0 0 3 2 \n", 416 | "4 1 0 4 3 " 417 | ] 418 | }, 419 | "execution_count": 4, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "train_data = dataprocessor.transform(df)\n", 426 | "\n", 427 | "train_data.head()" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 6, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# np arrays\n", 437 | "X_train = train_data[[c for c in train_data.columns if c is not 'target']].values\n", 438 | "y_train = train_data['target'].values\n", 439 | "\n", 440 | "# lgb Dataset object\n", 441 | "lgtrain = lgb.Dataset(X_train,\n", 442 | " label=y_train,\n", 443 | " feature_name=all_features,\n", 444 | " categorical_feature=categorical_features,\n", 445 | " free_raw_data=False)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 7, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# model and fit params\n", 455 | "params = dict(learning_rate=0.01,\n", 456 | " num_boost_round=300,\n", 457 | " num_leaves = 255,\n", 458 | " verbose=-1,\n", 459 | " is_unbalance=True)\n", 460 | "fit_params = dict(feature_name=all_features,\n", 461 | " categorical_feature=categorical_features)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "## 1. First experiment. Sklearn wrap up vs lightgbm methods" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 12, 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "7.932429075241089\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "clf = lgb.LGBMClassifier(**params, silent=True)\n", 486 | "start = time()\n", 487 | "score = cross_val_score(clf,\n", 488 | " X_train, y_train,\n", 489 | " scoring='neg_log_loss',\n", 490 | " cv=StratifiedKFold(random_state=1981),\n", 491 | " fit_params=fit_params)\n", 492 | "sklearn_runtime = time() - start\n", 493 | "print(sklearn_runtime)" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 13, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "7.038502931594849\n" 506 | ] 507 | } 508 | ], 509 | "source": [ 510 | "start = time()\n", 511 | "cv_result = lgb.cv(params,\n", 512 | " lgtrain,\n", 513 | " metrics='binary_logloss',\n", 514 | " nfold=3,\n", 515 | " stratified=True, \n", 516 | " seed=1981)\n", 517 | "lightgbm_runtime = time() - start\n", 518 | "print(lightgbm_runtime)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "LightGBM methods seem to be a bit faster. 
Let's now compare `Hyperopt` and `Skopt`" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "## Hyperopt vs Skopt" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "The first thing to comment is that while Hyperopt offers the `hp.quniform(label, low, high, q)` parameter expressions, there is not such a thing for Skopt. One has `Categorical`, but you have to pass all values. In other words, When using hyperopt one could use:\n", 540 | "\n", 541 | " 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20)\n", 542 | "\n", 543 | "but when using Skopt one would have to do:\n", 544 | "\n", 545 | " Categorical(np.arange(50, 500, 20))\n", 546 | " \n", 547 | "Because I want to keep the comparison as light and direct as possible, I will just use `Real` parameters with uniform distributions." 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "### 1. Hyperopt\n", 555 | "\n", 556 | "With Hyperopt we will use the [TPE](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) algorithm." 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 14, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "hp_space = {\n", 566 | " 'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),\n", 567 | " 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),\n", 568 | " 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),\n", 569 | " 'subsample': hp.uniform('subsample', 0.5, 1.),\n", 570 | " }" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 20, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "name": "stdout", 580 | "output_type": "stream", 581 | "text": [ 582 | "100%|██████████| 50/50 [00:39<00:00, 1.18it/s, best loss: 0.28343034788381305]\n" 583 | ] 584 | } 585 | ], 586 | "source": [ 587 | "def objective(params):\n", 588 | " clf = lgb.LGBMClassifier(**params, is_unbalance=True, verbose=-1, silent=True)\n", 589 | " score = cross_val_score(clf,\n", 590 | " X_train, y_train,\n", 591 | " scoring='f1',\n", 592 | " cv=StratifiedKFold(random_state=3),\n", 593 | " fit_params=fit_params).mean()\n", 594 | " return 1-score\n", 595 | "trials = Trials()\n", 596 | "best = fmin(fn=objective,\n", 597 | " space=hp_space,\n", 598 | " algo=tpe.suggest,\n", 599 | " max_evals=50,\n", 600 | " trials=trials)" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "### 2. 
SKopt\n", 608 | "\n", 609 | "Since TPE is a Bayesian method we will first compare with the `BayesSearchCV` method in `Skopt`" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 16, 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "hh_space = dict(\n", 619 | " learning_rate = Real(0.01, 0.3),\n", 620 | " min_child_weight = Real(0.1, 10),\n", 621 | " colsample_bytree= Real(0.5, 1.),\n", 622 | " subsample=Real(0.5, 1.),\n", 623 | " )" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 17, 629 | "metadata": {}, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "63.14497208595276\n" 636 | ] 637 | } 638 | ], 639 | "source": [ 640 | "clf = lgb.LGBMClassifier(is_unbalance=True, verbose=-1, silent=True)\n", 641 | "start = time()\n", 642 | "opt = BayesSearchCV(clf,\n", 643 | " search_spaces=hh_space,\n", 644 | " scoring='f1',\n", 645 | " cv=StratifiedKFold(random_state=3),\n", 646 | " fit_params=fit_params,\n", 647 | " n_iter=50,\n", 648 | " n_jobs=-1)\n", 649 | "opt.fit(X_train, y_train)\n", 650 | "skopt_bayes_runtime = time()-start\n", 651 | "print(skopt_bayes_runtime)" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "`Skopt`'s seems to be a significantly slower than hyperopt even with no verbosity. Let's see if performs better:" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 19, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "best SKOPT F1 score: 0.7174372197995806\n" 671 | ] 672 | } 673 | ], 674 | "source": [ 675 | "print('best SKOPT F1 score: {}'.format(opt.best_score_))" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "which is almost identical to the one obtained with `Hyperopt`" 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": 25, 688 | "metadata": {}, 689 | "outputs": [ 690 | { 691 | "name": "stdout", 692 | "output_type": "stream", 693 | "text": [ 694 | "best HYPEROPT F1 score: 0.716569652116187\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "# Remember hyperopt minimises 1-score. \n", 700 | "print('best HYPEROPT F1 score: {}'.format(1-trials.best_trial['result']['loss']))" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "metadata": {}, 706 | "source": [ 707 | "The conclusion at this stage is that `Hyperopt` is faster than `Skopt` with the same performance. However, the `TPE` algorithm is a tree based algorithm, so let's also compare with the `gbrt_minimize` method (Sequential optimization using gradient boosted trees) in `Skopt`. Here the syntax is a bit different to that of `BayesSearchCV`. 
" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 26, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "# the space has to be tuples like these\n", 717 | "hh_space_gbrt = [Real(0.01, 0.3, 'uniform', name='learning_rate'),\n", 718 | " Real(0.1, 10, 'uniform', name='min_child_weight'),\n", 719 | " Real(0.5, 1., 'uniform', name='colsample_bytree'),\n", 720 | " Real(0.5, 1., 'uniform', name='subsample')]" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 30, 726 | "metadata": {}, 727 | "outputs": [], 728 | "source": [ 729 | "# Let's adapt the objective\n", 730 | "def gbrt_objective(params):\n", 731 | " tmp_params = {}\n", 732 | " tmp_params['learning_rate'], tmp_params['min_child_weight'], \\\n", 733 | " tmp_params['colsample_bytree'], tmp_params['subsample'], = params[0], params[1], params[2], params[3]\n", 734 | " clf = lgb.LGBMClassifier(**tmp_params, is_unbalance=True, verbose=-1, silent=True)\n", 735 | " score = cross_val_score(clf,\n", 736 | " X_train, y_train,\n", 737 | " scoring='f1',\n", 738 | " cv=StratifiedKFold(random_state=3),\n", 739 | " fit_params=fit_params).mean()\n", 740 | " return 1-score" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 31, 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "name": "stdout", 750 | "output_type": "stream", 751 | "text": [ 752 | "54.64228296279907\n" 753 | ] 754 | } 755 | ], 756 | "source": [ 757 | "start=time()\n", 758 | "sk_best = gbrt_minimize(gbrt_objective,\n", 759 | " hh_space_gbrt,\n", 760 | " n_calls=50,\n", 761 | " verbose=False,\n", 762 | " n_jobs=-1)\n", 763 | "skopt_gbrt_runtime = time()-start\n", 764 | "print(skopt_gbrt_runtime)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "Faster than `BayesSearchCV`, but still, slower than `Hyperopt`. Let's see if the results are any better" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 36, 777 | "metadata": {}, 778 | "outputs": [ 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "best SKOPT GBRT F1 score: 0.7173483496134895\n" 784 | ] 785 | } 786 | ], 787 | "source": [ 788 | "print('best SKOPT GBRT F1 score: {}'.format(1-sk_best.fun))" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "## CONCLUSION" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "`Hyperopt`'s TPE performs as good as Skopt `gbrt_minimize` and `BayesSearchCV` methods and is significantly faster." 
803 | ] 804 | } 805 | ], 806 | "metadata": { 807 | "kernelspec": { 808 | "display_name": "Python 3", 809 | "language": "python", 810 | "name": "python3" 811 | }, 812 | "language_info": { 813 | "codemirror_mode": { 814 | "name": "ipython", 815 | "version": 3 816 | }, 817 | "file_extension": ".py", 818 | "mimetype": "text/x-python", 819 | "name": "python", 820 | "nbconvert_exporter": "python", 821 | "pygments_lexer": "ipython3", 822 | "version": "3.6.5" 823 | } 824 | }, 825 | "nbformat": 4, 826 | "nbformat_minor": 2 827 | } 828 | -------------------------------------------------------------------------------- /predictor.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import json 3 | import pandas as pd 4 | import pickle 5 | 6 | from pathlib import Path 7 | from kafka import KafkaConsumer 8 | from utils.messages_utils import append_message, read_messages_count, send_retrain_message, publish_prediction 9 | 10 | KAFKA_HOST = 'localhost:9092' 11 | TOPICS = ['app_messages', 'retrain_topic'] 12 | PATH = Path('data/') 13 | MODELS_PATH = PATH/'models' 14 | DATAPROCESSORS_PATH = PATH/'dataprocessors' 15 | MESSAGES_PATH = PATH/'messages' 16 | RETRAIN_EVERY = 25 17 | EXTRA_MODELS_TO_KEEP = 1 18 | 19 | column_order = pickle.load(open(DATAPROCESSORS_PATH/'column_order.p', 'rb')) 20 | dataprocessor = None 21 | consumer = None 22 | model = None 23 | 24 | 25 | def reload_model(path): 26 | return pickle.load(open(path, 'rb')) 27 | 28 | 29 | def is_retraining_message(msg): 30 | message = json.loads(msg.value) 31 | return msg.topic == 'retrain_topic' and 'training_completed' in message and message['training_completed'] 32 | 33 | 34 | def is_application_message(msg): 35 | message = json.loads(msg.value) 36 | return msg.topic == 'app_messages' and 'prediction' not in message 37 | 38 | 39 | def predict(message, column_order): 40 | row = pd.DataFrame(message, index=[0]) 41 | # sanity check 42 | assert row.columns.tolist()[:-1] == column_order 43 | # In the real world we would not have the target (here 'income_bracket'). 44 | # In this example we keep it and we will retrain the model as it reads 45 | # RETRAIN_EVERY number of messages. 
In the real world, after RETRAIN_EVERY 46 | # number of messages have been collected, one would have to wait until we 47 | # can collect RETRAIN_EVERY targets AND THEN retrain 48 | row.drop('income_bracket', axis=1, inplace=True) 49 | trow = dataprocessor.transform(row) 50 | return model.predict(trow)[0] 51 | 52 | 53 | def start(model_id, messages_count, batch_id): 54 | for msg in consumer: 55 | message = json.loads(msg.value) 56 | 57 | if is_retraining_message(msg): 58 | model_fname = 'model_{}_.p'.format(model_id) 59 | model = reload_model(MODELS_PATH/model_fname) 60 | print("NEW MODEL RELOADED {}".format(model_id)) 61 | 62 | elif is_application_message(msg): 63 | request_id = message['request_id'] 64 | pred = predict(message['data'], column_order) 65 | publish_prediction(pred, request_id) 66 | 67 | append_message(message['data'], MESSAGES_PATH, batch_id) 68 | messages_count += 1 69 | if messages_count % RETRAIN_EVERY == 0: 70 | model_id = (model_id + 1) % (EXTRA_MODELS_TO_KEEP + 1) 71 | send_retrain_message(model_id, batch_id) 72 | batch_id += 1 73 | 74 | 75 | if __name__ == '__main__': 76 | dataprocessor_id = 0 77 | dataprocessor_fname = 'dataprocessor_{}_.p'.format(dataprocessor_id) 78 | dataprocessor = pickle.load(open(DATAPROCESSORS_PATH/dataprocessor_fname, 'rb')) 79 | 80 | messages_count = read_messages_count(MESSAGES_PATH, RETRAIN_EVERY) 81 | batch_id = messages_count % RETRAIN_EVERY 82 | 83 | model_id = batch_id % (EXTRA_MODELS_TO_KEEP + 1) 84 | model_fname = 'model_{}_.p'.format(model_id) 85 | model = reload_model(MODELS_PATH/model_fname) 86 | 87 | consumer = KafkaConsumer(bootstrap_servers=KAFKA_HOST) 88 | consumer.subscribe(TOPICS) 89 | 90 | start(model_id, messages_count, batch_id) 91 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.0.11 2 | certifi==2019.6.16 3 | chardet==3.0.4 4 | Click==7.0 5 | cloudpickle==1.2.1 6 | configparser==3.7.4 7 | databricks-cli==0.8.7 8 | decorator==4.4.0 9 | docker==4.0.2 10 | entrypoints==0.3 11 | Flask==1.1.1 12 | future==0.17.1 13 | gitdb2==2.0.5 14 | GitPython==2.1.11 15 | gunicorn==19.9.0 16 | hyperopt==0.1.2 17 | idna==2.8 18 | itsdangerous==1.1.0 19 | Jinja2==2.10.1 20 | joblib==0.13.2 21 | kafka-python==1.4.6 22 | lightgbm==2.2.3 23 | Mako==1.0.13 24 | MarkupSafe==1.1.1 25 | mlflow==1.0.0 26 | networkx==2.3 27 | numpy==1.16.4 28 | pandas==0.24.2 29 | protobuf==3.9.0 30 | pymongo==3.8.0 31 | python-dateutil==2.8.0 32 | python-editor==1.0.4 33 | pytz==2019.1 34 | PyYAML==5.1.1 35 | querystring-parser==1.2.3 36 | requests==2.22.0 37 | scikit-learn==0.21.2 38 | scipy==1.3.0 39 | simplejson==3.16.0 40 | six==1.12.0 41 | sklearn==0.0 42 | smmap2==2.0.5 43 | SQLAlchemy==1.3.5 44 | sqlparse==0.3.0 45 | tabulate==0.8.3 46 | tqdm==4.32.2 47 | urllib3==1.25.3 48 | websocket-client==0.56.0 49 | Werkzeug==0.15.4 50 | -------------------------------------------------------------------------------- /sample_app.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import threading 4 | import uuid 5 | 6 | from pathlib import Path 7 | from kafka import KafkaProducer, KafkaConsumer 8 | from time import sleep 9 | 10 | 11 | PATH = Path('data/') 12 | KAFKA_HOST = 'localhost:9092' 13 | df_test = pd.read_csv(PATH/'adult.test') 14 | # In the real world, the messages would not come with the target/outcome of 15 | # our actions. 
Here we will keep it and assume that at some point in the 16 | # future we can collect the outcome and monitor how our algorithm is doing 17 | # df_test.drop('income_bracket', axis=1, inplace=True) 18 | df_test['json'] = df_test.apply(lambda x: x.to_json(), axis=1) 19 | messages = df_test.json.tolist() 20 | 21 | 22 | def start_producing(): 23 | producer = KafkaProducer(bootstrap_servers=KAFKA_HOST) 24 | for i in range(200): 25 | message_id = str(uuid.uuid4()) 26 | message = {'request_id': message_id, 'data': json.loads(messages[i])} 27 | 28 | producer.send('app_messages', json.dumps(message).encode('utf-8')) 29 | producer.flush() 30 | 31 | print("\033[1;31;40m -- PRODUCER: Sent message with id {}".format(message_id)) 32 | sleep(2) 33 | 34 | 35 | def start_consuming(): 36 | consumer = KafkaConsumer('app_messages', bootstrap_servers=KAFKA_HOST) 37 | 38 | for msg in consumer: 39 | message = json.loads(msg.value) 40 | if 'prediction' in message: 41 | request_id = message['request_id'] 42 | print("\033[1;32;40m ** CONSUMER: Received prediction {} for request id {}".format(message['prediction'], request_id)) 43 | 44 | 45 | threads = [] 46 | t = threading.Thread(target=start_producing) 47 | t2 = threading.Thread(target=start_consuming) 48 | threads.append(t) 49 | threads.append(t2) 50 | t.start() 51 | t2.start() 52 | -------------------------------------------------------------------------------- /train/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/train/__init__.py -------------------------------------------------------------------------------- /train/train_hyperopt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import lightgbm as lgb 4 | import pickle 5 | import pdb 6 | import warnings 7 | 8 | from pathlib import Path 9 | from sklearn.metrics import f1_score 10 | from hyperopt import hp, tpe, fmin, Trials 11 | 12 | 13 | warnings.filterwarnings("ignore") 14 | 15 | 16 | def best_threshold(y_true, pred_proba, proba_range, verbose=False): 17 | """ 18 | Function to find the probability threshold that optimises the f1_score 19 | 20 | Comment: this function is not used in this excercise, but we include it in 21 | case the reader finds it useful 22 | 23 | Parameters: 24 | ----------- 25 | y_true: numpy.ndarray 26 | array with the true labels 27 | pred_proba: numpy.ndarray 28 | array with the predicted probability 29 | proba_range: numpy.ndarray 30 | range of probabilities to explore. 31 | e.g. np.arange(0.1,0.9,0.01) 32 | 33 | Return: 34 | ----------- 35 | tuple with the optimal threshold and the corresponding f1_score 36 | """ 37 | scores = [] 38 | for prob in proba_range: 39 | pred = [int(p>prob) for p in pred_proba] 40 | score = f1_score(y_true,pred) 41 | scores.append(score) 42 | if verbose: 43 | print("INFO: prob threshold: {}. score :{}".format(round(prob,3), round(score,5))) 44 | best_score = scores[np.argmax(scores)] 45 | optimal_threshold = proba_range[np.argmax(scores)] 46 | return (optimal_threshold, best_score) 47 | 48 | 49 | def lgb_f1_score(preds, lgbDataset): 50 | """ 51 | Function to compute the f1_score to be used with lightgbm methods. 
52 | Comments: output format must be: 53 | (eval_name, eval_result, is_higher_better) 54 | 55 | Parameters: 56 | ----------- 57 | preds: np.array or List 58 | lgbDataset: lightgbm.Dataset 59 | """ 60 | binary_preds = [int(p>0.5) for p in preds] 61 | y_true = lgbDataset.get_label() 62 | # lightgbm: (eval_name, eval_result, is_higher_better) 63 | return 'f1', f1_score(y_true, binary_preds), True 64 | 65 | 66 | class LGBOptimizer(object): 67 | def __init__(self, trainDataset, out_dir): 68 | """ 69 | Hyper Parameter optimization 70 | 71 | Parameters: 72 | ----------- 73 | trainDataset: FeatureTools object 74 | The result of running FeatureTools().fit() 75 | out_dir: pathlib.PosixPath 76 | Path to the output directory 77 | """ 78 | self.PATH = out_dir 79 | self.early_stop_dict = {} 80 | 81 | self.X = trainDataset.data 82 | self.y = trainDataset.target 83 | self.colnames = trainDataset.colnames 84 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns 85 | 86 | self.lgtrain = lgb.Dataset(self.X,label=self.y, 87 | feature_name=self.colnames, 88 | categorical_feature = self.categorical_columns, 89 | free_raw_data=False) 90 | 91 | def optimize(self, maxevals=200, model_id=0): 92 | 93 | param_space = self.hyperparameter_space() 94 | objective = self.get_objective(self.lgtrain) 95 | objective.i=0 96 | trials = Trials() 97 | best = fmin(fn=objective, 98 | space=param_space, 99 | algo=tpe.suggest, 100 | max_evals=maxevals, 101 | trials=trials) 102 | best['num_boost_round'] = self.early_stop_dict[trials.best_trial['tid']] 103 | best['num_leaves'] = int(best['num_leaves']) 104 | best['verbose'] = -1 105 | 106 | # set the model with the best parameters, fit and save 107 | model = lgb.LGBMClassifier(**best) 108 | model.fit(self.lgtrain.data, 109 | self.lgtrain.label, 110 | feature_name=self.colnames, 111 | categorical_feature=self.categorical_columns) 112 | 113 | model_fname = 'model_{}_.p'.format(model_id) 114 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id) 115 | 116 | pickle.dump(model, open(self.PATH/model_fname, 'wb')) 117 | pickle.dump(best, open(self.PATH/best_experiment_fname, 'wb')) 118 | 119 | self.best = best 120 | self.model = model 121 | 122 | 123 | def get_objective(self, train): 124 | 125 | def objective(params): 126 | """ 127 | objective function for lightgbm. 
128 | """ 129 | # hyperopt casts as float 130 | params['num_boost_round'] = int(params['num_boost_round']) 131 | params['num_leaves'] = int(params['num_leaves']) 132 | 133 | # need to be passed as parameter 134 | params['is_unbalance'] = True 135 | params['verbose'] = -1 136 | params['seed'] = 1 137 | 138 | cv_result = lgb.cv( 139 | params, 140 | train, 141 | num_boost_round=params['num_boost_round'], 142 | metrics='binary_logloss', 143 | # feval = lgb_f1_score, 144 | nfold=3, 145 | stratified=True, 146 | early_stopping_rounds=20) 147 | self.early_stop_dict[objective.i] = len(cv_result['binary_logloss-mean']) 148 | error = round(cv_result['binary_logloss-mean'][-1], 4) 149 | objective.i+=1 150 | return error 151 | 152 | return objective 153 | 154 | def hyperparameter_space(self, param_space=None): 155 | 156 | space = { 157 | 'learning_rate': hp.uniform('learning_rate', 0.01, 0.2), 158 | 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20), 159 | 'num_leaves': hp.quniform('num_leaves', 31, 255, 4), 160 | 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10), 161 | 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.), 162 | 'subsample': hp.uniform('subsample', 0.5, 1.), 163 | 'reg_alpha': hp.uniform('reg_alpha', 0.01, 0.1), 164 | 'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.1), 165 | } 166 | 167 | if param_space: 168 | return param_space 169 | else: 170 | return space -------------------------------------------------------------------------------- /train/train_hyperopt_mlflow.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import lightgbm as lgb 4 | import pickle 5 | import pdb 6 | import warnings 7 | import mlflow 8 | import mlflow.sklearn 9 | 10 | from pathlib import Path 11 | from sklearn.metrics import f1_score 12 | from hyperopt import hp, tpe, fmin, Trials 13 | from mlflow.tracking import MlflowClient 14 | 15 | 16 | warnings.filterwarnings("ignore") 17 | 18 | 19 | def best_threshold(y_true, pred_proba, proba_range, verbose=False): 20 | """ 21 | Function to find the probability threshold that optimises the f1_score 22 | 23 | Comment: this function is not used in this excercise, but we include it in 24 | case the reader finds it useful 25 | 26 | Parameters: 27 | ----------- 28 | y_true: numpy.ndarray 29 | array with the true labels 30 | pred_proba: numpy.ndarray 31 | array with the predicted probability 32 | proba_range: numpy.ndarray 33 | range of probabilities to explore. 34 | e.g. np.arange(0.1,0.9,0.01) 35 | 36 | Return: 37 | ----------- 38 | tuple with the optimal threshold and the corresponding f1_score 39 | """ 40 | scores = [] 41 | for prob in proba_range: 42 | pred = [int(p>prob) for p in pred_proba] 43 | score = f1_score(y_true,pred) 44 | scores.append(score) 45 | if verbose: 46 | print("INFO: prob threshold: {}. score :{}".format(round(prob,3), round(score,5))) 47 | best_score = scores[np.argmax(scores)] 48 | optimal_threshold = proba_range[np.argmax(scores)] 49 | return (optimal_threshold, best_score) 50 | 51 | 52 | def lgb_f1_score(preds, lgbDataset): 53 | """ 54 | Function to compute the f1_score to be used with lightgbm methods. 
55 | Comments: output format must be: 56 | (eval_name, eval_result, is_higher_better) 57 | 58 | Parameters: 59 | ----------- 60 | preds: np.array or List 61 | lgbDataset: lightgbm.Dataset 62 | """ 63 | binary_preds = [int(p>0.5) for p in preds] 64 | y_true = lgbDataset.get_label() 65 | # lightgbm: (eval_name, eval_result, is_higher_better) 66 | return 'f1', f1_score(y_true, binary_preds), True 67 | 68 | 69 | class LGBOptimizer(object): 70 | def __init__(self, trainDataset, out_dir): 71 | """ 72 | Hyper Parameter optimization 73 | 74 | Parameters: 75 | ----------- 76 | trainDataset: FeatureTools object 77 | The result of running FeatureTools().fit() 78 | out_dir: pathlib.PosixPath 79 | Path to the output directory 80 | """ 81 | self.PATH = out_dir 82 | self.early_stop_dict = {} 83 | 84 | self.X = trainDataset.data 85 | self.y = trainDataset.target 86 | self.colnames = trainDataset.colnames 87 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns 88 | 89 | self.lgtrain = lgb.Dataset(self.X,label=self.y, 90 | feature_name=self.colnames, 91 | categorical_feature = self.categorical_columns, 92 | free_raw_data=False) 93 | 94 | def optimize(self, maxevals=200, model_id=0, reuse_experiment=False): 95 | 96 | param_space = self.hyperparameter_space() 97 | objective = self.get_objective(self.lgtrain) 98 | objective.i=0 99 | trials = Trials() 100 | best = fmin(fn=objective, 101 | space=param_space, 102 | algo=tpe.suggest, 103 | max_evals=maxevals, 104 | trials=trials) 105 | best['num_boost_round'] = self.early_stop_dict[trials.best_trial['tid']] 106 | best['num_leaves'] = int(best['num_leaves']) 107 | best['verbose'] = -1 108 | 109 | # The next few lines are the only ones related to mlflow. 110 | if not Path('mlruns').exists(): 111 | # here set the tracking_uri. If None then http://localhost:5000 112 | client = MlflowClient() 113 | n_experiments=0 114 | elif not reuse_experiment: 115 | client = MlflowClient() 116 | n_experiments = len(client.list_experiments()) 117 | experiment_name = 'experiment_' + str(n_experiments) 118 | client.create_experiment(name=experiment_name) 119 | with mlflow.start_run(experiment_id=n_experiments): 120 | model = lgb.LGBMClassifier(**best) 121 | model.fit(self.lgtrain.data, 122 | self.lgtrain.label, 123 | feature_name=self.colnames, 124 | categorical_feature=self.categorical_columns) 125 | for name, value in best.items(): 126 | mlflow.log_param(name, value) 127 | mlflow.log_metric('binary_logloss', trials.best_trial['result']['loss']) 128 | mlflow.sklearn.log_model(model, "model") 129 | 130 | model_fname = 'model_{}_.p'.format(model_id) 131 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id) 132 | 133 | pickle.dump(model, open(self.PATH/model_fname, 'wb')) 134 | pickle.dump(best, open(self.PATH/best_experiment_fname, 'wb')) 135 | 136 | self.best = best 137 | self.model = model 138 | 139 | def get_objective(self, train): 140 | 141 | def objective(params): 142 | """ 143 | objective function for lightgbm. 
144 | """ 145 | # hyperopt casts as float 146 | params['num_boost_round'] = int(params['num_boost_round']) 147 | params['num_leaves'] = int(params['num_leaves']) 148 | 149 | # need to be passed as parameter 150 | params['is_unbalance'] = True 151 | params['verbose'] = -1 152 | params['seed'] = 1 153 | 154 | cv_result = lgb.cv( 155 | params, 156 | train, 157 | num_boost_round=params['num_boost_round'], 158 | metrics='binary_logloss', 159 | # feval = lgb_f1_score, 160 | nfold=3, 161 | stratified=True, 162 | early_stopping_rounds=20) 163 | self.early_stop_dict[objective.i] = len(cv_result['binary_logloss-mean']) 164 | error = cv_result['binary_logloss-mean'][-1] 165 | objective.i+=1 166 | return error 167 | 168 | return objective 169 | 170 | def hyperparameter_space(self, param_space=None): 171 | 172 | space = { 173 | 'learning_rate': hp.uniform('learning_rate', 0.01, 0.2), 174 | 'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20), 175 | 'num_leaves': hp.quniform('num_leaves', 31, 256, 4), 176 | 'min_child_weight': hp.uniform('min_child_weight', 0.1, 10), 177 | 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.), 178 | 'subsample': hp.uniform('subsample', 0.5, 1.), 179 | 'reg_alpha': hp.uniform('reg_alpha', 0.01, 0.1), 180 | 'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.1), 181 | } 182 | 183 | if param_space: 184 | return param_space 185 | else: 186 | return space 187 | -------------------------------------------------------------------------------- /train/train_hyperparameterhunter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import warnings 4 | import pickle 5 | import json 6 | import lightgbm as lgb 7 | 8 | import pdb 9 | 10 | from pathlib import Path 11 | from hyperparameter_hunter import (Environment, CVExperiment, 12 | BayesianOptimization, Integer, Real, Categorical) 13 | from hyperparameter_hunter import optimization as opt 14 | from sklearn.model_selection import StratifiedKFold 15 | 16 | warnings.filterwarnings("ignore") 17 | 18 | 19 | class LGBOptimizer(object): 20 | def __init__(self, trainDataset, out_dir): 21 | """ 22 | Hyper Parameter optimization 23 | 24 | Comments: Hyperparameter_hunter (hereafter HH) is a fantastic package 25 | (https://github.com/HunterMcGushion/hyperparameter_hunter) to avoid 26 | wasting time as you optimise parameters. In the words of his author: 27 | "For so long, hyperparameter optimization has been such a time 28 | consuming process that just pointed you in a direction for further 29 | optimization, then you basically had to start over". 
30 | 31 | Parameters: 32 | ----------- 33 | trainDataset: FeatureTools object 34 | The result of running FeatureTools().fit() 35 | out_dir: Str 36 | Path to the output directory 37 | """ 38 | 39 | self.PATH = str(out_dir) 40 | self.data = trainDataset.data 41 | self.data['target'] = trainDataset.target 42 | self.colnames = trainDataset.colnames 43 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns 44 | 45 | def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold, 46 | maxevals=200, do_predict_proba=None, model_id=0): 47 | 48 | params = self.hyperparameter_space() 49 | extra_params = self.extra_setup() 50 | 51 | env = Environment( 52 | train_dataset=self.data, 53 | results_path='HyperparameterHunterAssets', 54 | # results_path=self.PATH, 55 | metrics=[metrics], 56 | do_predict_proba = do_predict_proba, 57 | cv_type=cv_type, 58 | cv_params=dict(n_splits=n_splits), 59 | ) 60 | 61 | # optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals) 62 | optimizer = opt.BayesianOptimization(iterations=maxevals) 63 | optimizer.set_experiment_guidelines( 64 | model_initializer=lgb.LGBMClassifier, 65 | model_init_params=params, 66 | model_extra_params=extra_params 67 | ) 68 | optimizer.go() 69 | # there are a few fixes on its way and the next few lines will soon be 70 | # one. At the moment, to access to the best parameters one has to read 71 | # from disc and access them 72 | best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\ 73 | optimizer.best_experiment+'.json' 74 | with open(best_experiment) as best: 75 | best = json.loads(best.read())['hyperparameters']['model_init_params'] 76 | model = lgb.LGBMClassifier(**best) 77 | X, y = self.data.drop('target',axis=1), self.data.target 78 | model.fit(X,y, 79 | feature_name=self.colnames, 80 | categorical_feature=self.categorical_columns 81 | ) 82 | model_fname = 'model_{}_.p'.format(model_id) 83 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id) 84 | pickle.dump(model, open('/'.join([self.PATH,model_fname]), 'wb')) 85 | pickle.dump(optimizer, open('/'.join([self.PATH,best_experiment_fname]), 'wb')) 86 | 87 | 88 | def hyperparameter_space(self, param_space=None): 89 | 90 | space = dict( 91 | is_unbalance = True, 92 | learning_rate = Real(0.01, 0.3), 93 | num_boost_round=Categorical(np.arange(50, 500, 20)), 94 | num_leaves=Categorical(np.arange(31, 256, 4)), 95 | min_child_weight = Real(0.1, 10), 96 | colsample_bytree= Real(0.5, 1.), 97 | subsample=Real(0.5, 1.), 98 | reg_alpha= Real(0.01, 0.1), 99 | reg_lambda= Real(0.01, 0.1) 100 | ) 101 | 102 | if param_space: 103 | return param_space 104 | else: 105 | return space 106 | 107 | 108 | def extra_setup(self, extra_setup=None): 109 | 110 | extra_params = dict( 111 | early_stopping_rounds=20, 112 | feature_name=self.colnames, 113 | categorical_feature=self.categorical_columns 114 | ) 115 | 116 | if extra_setup: 117 | return extra_setup 118 | else: 119 | return extra_params 120 | -------------------------------------------------------------------------------- /train/train_hyperparameterhunter_mlfow.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import warnings 4 | import pickle 5 | import json 6 | import lightgbm as lgb 7 | import mlflow 8 | import mlflow.sklearn 9 | 10 | import pdb 11 | 12 | from pathlib import Path 13 | from hyperparameter_hunter import (Environment, CVExperiment, 14 | BayesianOptimization, 
Integer, Real, Categorical) 15 | from hyperparameter_hunter import optimization as opt 16 | from sklearn.model_selection import StratifiedKFold 17 | from mlflow.tracking import MlflowClient 18 | 19 | 20 | warnings.filterwarnings("ignore") 21 | 22 | 23 | class LGBOptimizer(object): 24 | def __init__(self, trainDataset, out_dir): 25 | """ 26 | Hyper Parameter optimization 27 | 28 | Comments: Hyperparameter_hunter (hereafter HH) is a fantastic package 29 | (https://github.com/HunterMcGushion/hyperparameter_hunter) to avoid 30 | wasting time as you optimise parameters. In the words of his author: 31 | "For so long, hyperparameter optimization has been such a time 32 | consuming process that just pointed you in a direction for further 33 | optimization, then you basically had to start over". 34 | 35 | Parameters: 36 | ----------- 37 | trainDataset: FeatureTools object 38 | The result of running FeatureTools().fit() 39 | out_dir: Str 40 | Path to the output directory 41 | """ 42 | 43 | self.PATH = str(out_dir) 44 | self.data = trainDataset.data 45 | self.data['target'] = trainDataset.target 46 | self.colnames = trainDataset.colnames 47 | self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns 48 | 49 | def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold, 50 | maxevals=200, do_predict_proba=None, model_id=0, reuse_experiment=False): 51 | 52 | params = self.hyperparameter_space() 53 | extra_params = self.extra_setup() 54 | 55 | env = Environment( 56 | train_dataset=self.data, 57 | results_path='HyperparameterHunterAssets', 58 | # results_path=self.PATH, 59 | metrics=[metrics], 60 | do_predict_proba = do_predict_proba, 61 | cv_type=cv_type, 62 | cv_params=dict(n_splits=n_splits), 63 | ) 64 | 65 | # optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals) 66 | optimizer = opt.BayesianOptimization(iterations=maxevals) 67 | optimizer.set_experiment_guidelines( 68 | model_initializer=lgb.LGBMClassifier, 69 | model_init_params=params, 70 | model_extra_params=extra_params 71 | ) 72 | optimizer.go() 73 | 74 | # there are a few fixes on its way and the next few lines will soon be 75 | # one. At the moment, to access to the best parameters one has to read 76 | # from disc and access them 77 | best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\ 78 | optimizer.best_experiment+'.json' 79 | with open(best_experiment) as best: 80 | best = json.loads(best.read())['hyperparameters']['model_init_params'] 81 | 82 | # The next few lines are the only ones related to mlflow 83 | if not Path('mlruns').exists(): 84 | # here set the tracking_uri. 
If None then http://localhost:5000 85 | client = MlflowClient() 86 | n_experiments=0 87 | elif not reuse_experiment: 88 | client = MlflowClient() 89 | n_experiments = len(client.list_experiments()) 90 | experiment_name = 'experiment_' + str(n_experiments) 91 | client.create_experiment(name=experiment_name) 92 | with mlflow.start_run(experiment_id=n_experiments): 93 | model = lgb.LGBMClassifier(**best) 94 | X, y = self.data.drop('target',axis=1), self.data.target 95 | model.fit(X,y, 96 | feature_name=self.colnames, 97 | categorical_feature=self.categorical_columns 98 | ) 99 | for name, value in best.items(): 100 | mlflow.log_param(name, value) 101 | mlflow.log_metric('f1_score', -optimizer.optimizer_result.fun) 102 | mlflow.sklearn.log_model(model, "model") 103 | 104 | model_fname = 'model_{}_.p'.format(model_id) 105 | best_experiment_fname = 'best_experiment_{}_.p'.format(model_id) 106 | pickle.dump(model, open('/'.join([self.PATH,model_fname]), 'wb')) 107 | pickle.dump(optimizer, open('/'.join([self.PATH,best_experiment_fname]), 'wb')) 108 | 109 | 110 | def hyperparameter_space(self, param_space=None): 111 | 112 | space = dict( 113 | is_unbalance = True, 114 | learning_rate = Real(0.01, 0.3), 115 | num_boost_round=Integer(50, 500), 116 | num_leaves=Integer(31, 255), 117 | min_child_weight = Real(0.1, 10), 118 | colsample_bytree= Real(0.5, 1.), 119 | subsample=Real(0.5, 1.), 120 | reg_alpha= Real(0.01, 0.1), 121 | reg_lambda= Real(0.01, 0.1) 122 | ) 123 | 124 | if param_space: 125 | return param_space 126 | else: 127 | return space 128 | 129 | def extra_setup(self, extra_setup=None): 130 | 131 | extra_params = dict( 132 | early_stopping_rounds=20, 133 | feature_name=self.colnames, 134 | categorical_feature=self.categorical_columns 135 | ) 136 | 137 | if extra_setup: 138 | return extra_setup 139 | else: 140 | return extra_params 141 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | import json 3 | import pandas as pd 4 | import pickle 5 | import argparse 6 | 7 | from pathlib import Path 8 | from kafka import KafkaConsumer 9 | 10 | from utils.messages_utils import publish_traininig_completed 11 | from utils.preprocess_data import build_train 12 | 13 | 14 | KAFKA_HOST = 'localhost:9092' 15 | RETRAIN_TOPIC = 'retrain_topic' 16 | PATH = Path('data/') 17 | TRAIN_DATA = PATH/'train/train.csv' 18 | DATAPROCESSORS_PATH = PATH/'dataprocessors' 19 | MODELS_PATH = PATH/'models' 20 | MESSAGES_PATH = PATH/'messages' 21 | 22 | 23 | def train(model_id, messages, hyper): 24 | print("RETRAINING STARTED (model id: {})".format(model_id)) 25 | dtrain = build_train(TRAIN_DATA, DATAPROCESSORS_PATH, model_id, messages) 26 | if hyper == "hyperopt": 27 | # from train.train_hyperopt import LGBOptimizer 28 | from train.train_hyperopt_mlflow import LGBOptimizer 29 | elif hyper == "hyperparameterhunter": 30 | # from train.train_hyperparameterhunter import LGBOptimizer 31 | from train.train_hyperparameterhunter_mlfow import LGBOptimizer 32 | LGBOpt = LGBOptimizer(dtrain, MODELS_PATH) 33 | LGBOpt.optimize(maxevals=2, model_id=model_id) 34 | print("RETRAINING COMPLETED (model id: {})".format(model_id)) 35 | 36 | 37 | def start(hyper): 38 | consumer = KafkaConsumer(RETRAIN_TOPIC, bootstrap_servers=KAFKA_HOST) 39 | 40 | for msg in consumer: 41 | message = json.loads(msg.value) 42 | if 'retrain' in message and message['retrain']: 43 | model_id = message['model_id'] 44 | batch_id = 
message['batch_id'] 45 | message_fname = 'messages_{}_.txt'.format(batch_id) 46 | messages = MESSAGES_PATH/message_fname 47 | 48 | train(model_id, messages, hyper) 49 | publish_traininig_completed(model_id) 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser() 54 | 55 | parser.add_argument("--hyper", type=str, default="hyperopt") 56 | args = parser.parse_args() 57 | 58 | start(args.hyper) -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrzaurin/ml-pipeline/af2b338b19a7eda31af7da35db877a9254a02058/utils/__init__.py -------------------------------------------------------------------------------- /utils/feature_tools.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import copy 3 | 4 | 5 | class FeatureTools(object): 6 | """Collection of preprocessing methods""" 7 | 8 | @staticmethod 9 | def num_scaler(df_inp, cols, sc, trained=False): 10 | """ 11 | Method to scale numeric columns in a dataframe 12 | 13 | Parameters: 14 | ----------- 15 | df_inp: Pandas.DataFrame 16 | cols: List 17 | List of numeric columns to be scaled 18 | sc: Scaler object. From sklearn.preprocessing or similar structure 19 | trained: Boolean 20 | If True it will only be used to 'transform' 21 | 22 | Returns: 23 | -------- 24 | df: Pandas.DataFrame 25 | transformed/normalised dataframe 26 | sc: trained scaler 27 | """ 28 | df = df_inp.copy() 29 | if not trained: 30 | df[cols] = sc.fit_transform(df[cols]) 31 | else: 32 | df[cols] = sc.transform(df[cols]) 33 | return df, sc 34 | 35 | @staticmethod 36 | def cross_columns(df_inp, x_cols): 37 | """ 38 | Method to build crossed columns. These are new columns that are the 39 | cartesian product of the parent columns. 40 | 41 | Parameters: 42 | ----------- 43 | df_inp: Pandas.DataFrame 44 | x_cols: List. 45 | List of tuples with the columns to cross 46 | e.g. [('colA', 'colB'),('colC', 'colD')] 47 | 48 | Returns: 49 | -------- 50 | df: Pandas.DataFrame 51 | pandas dataframe with the new crossed columns 52 | colnames: List 53 | list the new column names 54 | """ 55 | df = df_inp.copy() 56 | colnames = ['_'.join(x_c) for x_c in x_cols] 57 | crossed_columns = {k:v for k,v in zip(colnames, x_cols)} 58 | 59 | for k, v in crossed_columns.items(): 60 | df[k] = df[v].apply(lambda x: '-'.join(x), axis=1) 61 | 62 | return df, colnames 63 | 64 | @staticmethod 65 | def val2idx(df_inp, cols, val_to_idx=None): 66 | """ 67 | This is basically a LabelEncoder that returns a dictionary with the 68 | mapping of the labels. 
69 | 70 | Parameters: 71 | ----------- 72 | df_inp: Pandas.DataFrame 73 | cols: List 74 | List of categorical columns to encode 75 | val_to_idx: Dict 76 | LabelEncoding dictionary if already exists 77 | 78 | Returns: 79 | -------- 80 | df: Pandas.DataFrame 81 | pandas dataframe with the categorical columns encoded 82 | val_to_idx: Dict 83 | dictionary with the encoding mappings 84 | """ 85 | df = df_inp.copy() 86 | if not val_to_idx: 87 | 88 | val_types = dict() 89 | for c in cols: 90 | val_types[c] = df[c].unique() 91 | 92 | val_to_idx = dict() 93 | for k, v in val_types.items(): 94 | val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])} 95 | 96 | for k, v in val_to_idx.items(): 97 | df[k] = df[k].apply(lambda x: v[x]) 98 | 99 | return df, val_to_idx 100 | 101 | def fit(self, df_inp, target_col, numerical_columns, categorical_columns, x_columns, sc): 102 | """ 103 | Parameters: 104 | ----------- 105 | df_inp: Pandas.DataFrame 106 | target_col: Str 107 | numerical_columns: List 108 | List with the numerical columns 109 | categorical_columns: List 110 | List with the categorical columns 111 | x_columns: List 112 | List of tuples with the columns to cross 113 | sc: Scaler. From sklearn.preprocessing or object with the same 114 | structure 115 | """ 116 | df = df_inp.copy() 117 | self.numerical_columns = numerical_columns 118 | self.categorical_columns = categorical_columns 119 | self.x_columns = x_columns 120 | 121 | df, self.sc = self.num_scaler(df, numerical_columns, sc) 122 | df, self.crossed_columns = self.cross_columns(df, x_columns) 123 | df, self.encoding_d = self.val2idx(df, categorical_columns+self.crossed_columns) 124 | 125 | self.target = df[target_col] 126 | df.drop(target_col, axis=1, inplace=True) 127 | self.data = df 128 | self.colnames = df.columns.tolist() 129 | 130 | return self 131 | 132 | def transform(self, df_inp, trained_sc=None): 133 | """ 134 | Parameters: 135 | ----------- 136 | df_inp: Pandas.DataFrame 137 | trained_sc: Scaler. 
From sklearn.preprocessing or object with the same 138 | 139 | Returns: 140 | -------- 141 | df: Pandas.DataFrame 142 | Tranformed dataframe: scaled, Labelencoded and with crossed columns 143 | """ 144 | df = df_inp.copy() 145 | if trained_sc: 146 | sc = copy.deepcopy(trained_sc) 147 | else: 148 | sc = copy.deepcopy(self.sc) 149 | 150 | df, _ = self.num_scaler(df, self.numerical_columns, sc, trained=True) 151 | df, _ = self.cross_columns(df, self.x_columns) 152 | df, _ = self.val2idx(df, self.categorical_columns+self.crossed_columns, self.encoding_d) 153 | 154 | return df 155 | -------------------------------------------------------------------------------- /utils/messages_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | 4 | from kafka import KafkaProducer 5 | 6 | producer = KafkaProducer(bootstrap_servers='localhost:9092') 7 | 8 | def publish_prediction(pred, request_id): 9 | producer.send('app_messages', json.dumps({'request_id': request_id, 'prediction': float(pred)}).encode('utf-8')) 10 | producer.flush() 11 | 12 | 13 | def publish_traininig_completed(model_id): 14 | producer.send('retrain_topic', json.dumps({'training_completed': True, 'model_id': model_id}).encode('utf-8')) 15 | producer.flush() 16 | 17 | 18 | def read_messages_count(path, repeat_every): 19 | file_list=list(path.iterdir()) 20 | nfiles = len(file_list) 21 | if nfiles==0: 22 | return 0 23 | else: 24 | return ((nfiles-1)*repeat_every) + len(file_list[-1].open().readlines()) 25 | 26 | 27 | def append_message(message, path, batch_id): 28 | message_fname = 'messages_{}_.txt'.format(batch_id) 29 | f=open(path/message_fname, "a") 30 | f.write("%s\n" % (json.dumps(message))) 31 | f.close() 32 | 33 | 34 | def send_retrain_message(model_id, batch_id): 35 | producer.send('retrain_topic', json.dumps({'retrain': True, 'model_id': model_id, 'batch_id': batch_id}).encode('utf-8')) 36 | producer.flush() -------------------------------------------------------------------------------- /utils/preprocess_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | import json 4 | import pdb 5 | import warnings 6 | 7 | from pathlib import Path 8 | from utils.feature_tools import FeatureTools 9 | from sklearn.preprocessing import MinMaxScaler 10 | 11 | warnings.filterwarnings("ignore") 12 | 13 | 14 | def load_new_training_data(path): 15 | data = [] 16 | with open(path, "r") as f: 17 | for line in f: 18 | data.append(json.loads(line)) 19 | return pd.DataFrame(data) 20 | 21 | 22 | def build_train(train_path, results_path, dataprocessor_id=0, PATH_2=None): 23 | target = 'income_label' 24 | # read initial DataFrame 25 | df = pd.read_csv(train_path) 26 | if PATH_2: 27 | df_tmp = load_new_training_data(PATH_2) 28 | # Let's make sure columns are in the same order 29 | df_tmp = df_tmp[df.columns] 30 | # append new DataFrame 31 | df = pd.concat([df, df_tmp], ignore_index=True) 32 | # Save it to disk 33 | df.to_csv(train_path, index=False) 34 | 35 | df[target] = (df['income_bracket'].apply(lambda x: '>50K' in x)).astype(int) 36 | df.drop('income_bracket', axis=1, inplace=True) 37 | 38 | categorical_columns = list(df.select_dtypes(include=['object']).columns) 39 | numerical_columns = [c for c in df.columns if c not in categorical_columns+[target]] 40 | crossed_columns = (['education', 'occupation'], ['native_country', 'occupation']) 41 | 42 | preprocessor = FeatureTools() 43 | dataprocessor = 
preprocessor.fit( 44 | df, 45 | target, 46 | numerical_columns, 47 | categorical_columns, 48 | crossed_columns, 49 | sc=MinMaxScaler() 50 | ) 51 | 52 | dataprocessor_fname = 'dataprocessor_{}_.p'.format(dataprocessor_id) 53 | pickle.dump(dataprocessor, open(results_path/dataprocessor_fname, "wb")) 54 | if dataprocessor_id==0: 55 | pickle.dump(df.columns.tolist()[:-1], open(results_path/'column_order.p', "wb")) 56 | 57 | return dataprocessor 58 | 59 | 60 | # if __name__ == '__main__': 61 | 62 | # PATH = Path('data/') 63 | # TRAIN_PATH = PATH/'train' 64 | # DATAPROCESSORS_PATH = PATH/'dataprocessors' 65 | 66 | # dataprocessor = build_train(TRAIN_PATH/'train.csv', DATAPROCESSORS_PATH) 67 | 68 | --------------------------------------------------------------------------------
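As a closing illustration of how the pieces above fit together, here is a sketch of calling `FeatureTools` directly, mirroring what `build_train` does. This is illustrative only: the toy DataFrame and its values are invented for the example and are not part of the repo.

# Illustrative sketch: how FeatureTools.fit()/transform() are meant to be called
# (mirrors build_train in utils/preprocess_data.py). The toy data below is made up.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from utils.feature_tools import FeatureTools

df = pd.DataFrame({
    'age': [25, 40, 31],
    'education': ['Bachelors', 'HS-grad', 'Masters'],
    'occupation': ['Tech-support', 'Sales', 'Exec-managerial'],
    'native_country': ['United-States', 'Mexico', 'India'],
    'income_label': [0, 1, 0],
})

dataprocessor = FeatureTools().fit(
    df,
    target_col='income_label',
    numerical_columns=['age'],
    categorical_columns=['education', 'occupation', 'native_country'],
    x_columns=[['education', 'occupation'], ['native_country', 'occupation']],
    sc=MinMaxScaler(),
)

# dataprocessor.data and dataprocessor.target are what the LGBOptimizer classes in
# train/ consume, and transform() is what predictor.py calls on each incoming row.
new_rows = df.drop('income_label', axis=1)
transformed = dataprocessor.transform(new_rows)
print(transformed.head())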