├── README.md
├── LICENSE
├── requirements.txt
└── LeadsBlogNotebook.ipynb

/README.md:
--------------------------------------------------------------------------------
# End-to-End Lead Scoring Example

Example data and code for [the end-to-end lead scoring tutorial](https://towardsdatascience.com/a-true-end-to-end-ml-example-lead-scoring-f5b52e9a3c80) on Towards Data Science.

Test out this [lead scoring model demo on Booklet.ai](https://app.booklet.ai/model/lead-scoring).

## Setup

This example is built with Python `3.8.2`.

After cloning this repo and entering the directory with `cd lead-scoring-demo`, create a virtual environment, install the dependencies, and register the Jupyter kernel:

```
python3 -m venv lead-scoring
source lead-scoring/bin/activate
pip install -r requirements.txt
python -m ipykernel install --user --name=lead-scoring
```

Then start the MLflow server:

```
mlflow server
```

By default the server listens on `http://localhost:5000`, which is the tracking URI the notebook expects.
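
With the server running, open the notebook and pick the `lead-scoring` kernel. A minimal sketch, assuming the classic Jupyter frontend (`requirements.txt` pins `ipykernel` but no notebook frontend, so install one into the same virtualenv first):

```
pip install notebook
jupyter notebook LeadsBlogNotebook.ipynb
```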

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 BookletAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
alembic==1.4.2
appnope==0.1.0
backcall==0.2.0
boto3==1.14.0
botocore==1.17.0
certifi==2020.4.5.2
chardet==3.0.4
click==7.1.2
cloudpickle==1.4.1
databricks-cli==0.11.0
decorator==4.4.2
docker==4.2.1
docutils==0.15.2
entrypoints==0.3
Flask==1.1.2
gitdb==4.0.5
GitPython==3.1.3
gorilla==0.3.0
gunicorn==20.0.4
idna==2.9
ipykernel==5.3.0
ipython==7.15.0
ipython-genutils==0.2.0
itsdangerous==1.1.0
jedi==0.17.0
Jinja2==2.11.2
jmespath==0.10.0
joblib==0.15.1
jupyter-client==6.1.3
jupyter-core==4.6.3
Mako==1.1.3
MarkupSafe==1.1.1
mlflow==1.8.0
numpy==1.18.5
pandas==1.0.4
parso==0.7.0
pexpect==4.8.0
pickleshare==0.7.5
prometheus-client==0.8.0
prometheus-flask-exporter==0.13.0
prompt-toolkit==3.0.5
protobuf==3.12.2
ptyprocess==0.6.0
Pygments==2.6.1
python-dateutil==2.8.1
python-editor==1.0.4
pytz==2020.1
PyYAML==5.3.1
pyzmq==19.0.1
querystring-parser==1.2.4
requests==2.23.0
s3transfer==0.3.3
scikit-learn==0.23.1
scipy==1.4.1
simplejson==3.17.0
six==1.15.0
smmap==3.0.4
SQLAlchemy==1.3.13
sqlparse==0.3.1
tabulate==0.8.7
threadpoolctl==2.1.0
tornado==6.0.4
traitlets==4.3.3
urllib3==1.25.9
wcwidth==0.2.4
websocket-client==0.57.0
Werkzeug==1.0.1

--------------------------------------------------------------------------------
/LeadsBlogNotebook.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "import boto3\n",
    "import mlflow\n",
    "from mlflow import pyfunc as ml_pyfunc\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "########################################################\n",
    "### Import Dataset\n",
    "########################################################"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Import dataset\n",
    "leads_dataset = pd.read_csv('data/leads_cleaned.csv')\n",
    "leads_dataset.columns = map(str.lower, leads_dataset.columns)"
   ]
  },
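  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (editor's addition, not part of the original tutorial):\n",
    "# confirm the dataset loaded as expected and peek at the base conversion rate.\n",
    "print(leads_dataset.shape)\n",
    "print(leads_dataset['converted'].mean())"
   ]
  },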
"source": [ 70 | "# Create data pre-processing steps before plugging into model\n", 71 | "leads_categorical_columns = ['lead origin',\n", 72 | " 'lead source',\n", 73 | " 'last activity',\n", 74 | " 'specialization',\n", 75 | " 'what is your current occupation',\n", 76 | " 'what matters most to you in choosing a course',\n", 77 | " 'city',\n", 78 | " 'last notable activity']\n", 79 | "\n", 80 | "leads_numeric_columns = ['totalvisits',\n", 81 | " 'total time spent on website',\n", 82 | " 'page views per visit']\n", 83 | "\n", 84 | "leads_response_columns = ['converted']" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "#split data for training, remove extras\n", 96 | "\n", 97 | "leads_x = leads_dataset.drop(leads_response_columns, axis=1)\n", 98 | "leads_y = leads_dataset[leads_response_columns]\n", 99 | "\n", 100 | "leads_x_train, leads_x_test, leads_y_train, leads_y_test = train_test_split(leads_x,\n", 101 | " leads_y,\n", 102 | " train_size=0.7,\n", 103 | " test_size=0.3,\n", 104 | " random_state=5050)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "scaler = StandardScaler()\n", 116 | "scaler = scaler.fit(leads_x_train[leads_numeric_columns])" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "def pre_process_leads_data(df,\n", 128 | " numeric_columns,\n", 129 | " categorical_columns,\n", 130 | " fitted_scaler,\n", 131 | " train_df_columns = None):\n", 132 | " ## create new df with selected columns\n", 133 | " df.columns = map(str.lower, df.columns)\n", 134 | " _df = df[set(numeric_columns + categorical_columns)].copy()\n", 135 | " \n", 136 | " ## scale the numeric columns with the pre-built scaler\n", 137 | " _df[numeric_columns] = fitted_scaler.transform(_df[numeric_columns])\n", 138 | " \n", 139 | " # First, make categorical text lowercase\n", 140 | " _df[categorical_columns] = _df[categorical_columns].apply(lambda x: x.str.lower())\n", 141 | " # Next, create one-hot-encoded variables, add to dataframe, drop old columns\n", 142 | " _df_dummies = pd.get_dummies(_df[categorical_columns], drop_first=True)\n", 143 | " _df = pd.concat([_df, _df_dummies], axis=1)\n", 144 | " _df.drop(categorical_columns, axis=1, inplace = True)\n", 145 | "\n", 146 | " if train_df_columns:\n", 147 | " _df = _df.reindex(columns=train_df_columns, fill_value=0)\n", 148 | "\n", 149 | " return _df" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "leads_x_train_clean = pre_process_leads_data(df = leads_x_train,\n", 161 | " numeric_columns = leads_numeric_columns,\n", 162 | " categorical_columns = leads_categorical_columns,\n", 163 | " fitted_scaler = scaler)\n", 164 | "\n", 165 | "leads_x_test_clean = pre_process_leads_data(df = leads_x_test,\n", 166 | " numeric_columns = leads_numeric_columns,\n", 167 | " categorical_columns = leads_categorical_columns,\n", 168 | " fitted_scaler = scaler,\n", 169 | " train_df_columns = leads_x_train_clean.columns.tolist())" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | 
"outputs": [], 179 | "source": [ 180 | "########################################################\n", 181 | "### Train and Evaluate Model\n", 182 | "########################################################" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "## Train the random forest model\n", 192 | "num_estimators = 100\n", 193 | "min_samples = 4\n", 194 | "\n", 195 | "rf = RandomForestClassifier(n_estimators=num_estimators,\n", 196 | " min_samples_split=min_samples)\n", 197 | "rf.fit(leads_x_train_clean, leads_y_train.values.ravel())" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "leads_y_test_predicted = rf.predict(leads_x_test_clean)\n", 207 | "\n", 208 | "accuracy = metrics.accuracy_score(leads_y_test, leads_y_test_predicted)\n", 209 | "auc_score = metrics.roc_auc_score(leads_y_test, leads_y_test_predicted)\n", 210 | "\n", 211 | "print(accuracy)\n", 212 | "print(auc_score)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "########################################################\n", 224 | "### MLflow and environment setup\n", 225 | "########################################################" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# connect to MLflow\n", 235 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 236 | "mlflow.set_experiment(\"LeadScoringProcessed\") # creates an experiment if it doesn't exist" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "# define specific python and package versions for environment\n", 248 | "mlflow_conda_env = {\n", 249 | " 'name': 'mlflow-env',\n", 250 | " 'channels': ['defaults'],\n", 251 | " 'dependencies': ['python=3.8.2', {'pip': ['mlflow==1.8.0','scikit-learn==0.23.1','cloudpickle==1.4.1']}]\n", 252 | "}" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "########################################################\n", 264 | "### Define Model\n", 265 | "########################################################" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "class leadsModel(mlflow.pyfunc.PythonModel):\n", 277 | " \n", 278 | " ## defining objects needed for leadsModel prediction. 
\n", 279 | " def __init__(self,\n", 280 | " train_df_columns,\n", 281 | " model,\n", 282 | " leads_categorical_columns,\n", 283 | " leads_numeric_columns,\n", 284 | " fitted_scaler,\n", 285 | " pre_process_leads_data):\n", 286 | " \n", 287 | " ## Setting up all needed objects\n", 288 | " self.train_df_columns = train_df_columns\n", 289 | " self.model = model\n", 290 | " self.leads_categorical_columns = leads_categorical_columns\n", 291 | " self.leads_numeric_columns = leads_numeric_columns\n", 292 | " self.fitted_scaler = fitted_scaler\n", 293 | " self.pre_process_leads_data = pre_process_leads_data\n", 294 | " \n", 295 | " ## define function with processing and feeding data into prediction at the end\n", 296 | " def predict(self,context,model_input):\n", 297 | " \n", 298 | " # make sure all inputted columns are lowercase\n", 299 | " model_input.columns = map(str.lower, model_input.columns)\n", 300 | " \n", 301 | " # run inputted dataset through our processing function\n", 302 | " # note: we are excluding the response columns here since not needed for deploy\n", 303 | " model_input_processed = self.pre_process_leads_data(\n", 304 | " df = model_input,\n", 305 | " numeric_columns = self.leads_numeric_columns,\n", 306 | " categorical_columns = self.leads_categorical_columns,\n", 307 | " fitted_scaler = self.fitted_scaler,\n", 308 | " train_df_columns = self.train_df_columns) \n", 309 | " \n", 310 | " # finally input the cleaned/adjusted dataset into our model for prediction\n", 311 | " return self.model.predict(model_input_processed)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "# Testing the prediction class before pushing to MLflow\n", 321 | "m = leadsModel(train_df_columns = leads_x_train_clean.columns.tolist(),\n", 322 | " model = rf,\n", 323 | " leads_categorical_columns = leads_categorical_columns,\n", 324 | " leads_numeric_columns = leads_numeric_columns,\n", 325 | " fitted_scaler = scaler,\n", 326 | " pre_process_leads_data = pre_process_leads_data)\n", 327 | "model_input = leads_x.head(1)\n", 328 | "model_output = m.predict(None,model_input)\n", 329 | "print(model_output)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "########################################################\n", 341 | "### Log Model to MLflow\n", 342 | "########################################################" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": true 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "# start mlflow run, log parameters, metrics, and the model\n", 354 | "with mlflow.start_run(run_name=\"Leads Model with Processing\") as run:\n", 355 | " # log the parameters that we defined for the model training\n", 356 | " mlflow.log_param(\"num_estimators\", num_estimators)\n", 357 | " mlflow.log_param(\"min_samples\", min_samples)\n", 358 | " \n", 359 | " # log the performance metrics that we calculated earlier\n", 360 | " mlflow.log_metric(\"accuracy\", accuracy)\n", 361 | " mlflow.log_metric(\"auc_score\", auc_score)\n", 362 | " \n", 363 | " # log model with all objects referenced in the leadsModel class\n", 364 | " ml_pyfunc.log_model(\n", 365 | " artifact_path = \"leads_pyfunc\",\n", 366 | " python_model = leadsModel(train_df_columns = leads_x_train_clean.columns.tolist(),\n", 367 | " 
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "########################################################\n",
    "### Log Model to MLflow\n",
    "########################################################"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# start an MLflow run, then log parameters, metrics, and the model\n",
    "# (the with-block ends the run automatically, so no mlflow.end_run() is needed)\n",
    "with mlflow.start_run(run_name=\"Leads Model with Processing\") as run:\n",
    "    # log the parameters that we defined for the model training\n",
    "    mlflow.log_param(\"num_estimators\", num_estimators)\n",
    "    mlflow.log_param(\"min_samples\", min_samples)\n",
    "    \n",
    "    # log the performance metrics that we calculated earlier\n",
    "    mlflow.log_metric(\"accuracy\", accuracy)\n",
    "    mlflow.log_metric(\"auc_score\", auc_score)\n",
    "    \n",
    "    # log the model with all objects referenced in the leadsModel class\n",
    "    ml_pyfunc.log_model(\n",
    "        artifact_path = \"leads_pyfunc\",\n",
    "        python_model = leadsModel(train_df_columns = leads_x_train_clean.columns.tolist(),\n",
    "                                  model = rf,\n",
    "                                  leads_categorical_columns = leads_categorical_columns,\n",
    "                                  leads_numeric_columns = leads_numeric_columns,\n",
    "                                  fitted_scaler = scaler,\n",
    "                                  pre_process_leads_data = pre_process_leads_data\n",
    "                                  ),\n",
    "        conda_env = mlflow_conda_env\n",
    "    )\n",
    "    \n",
    "    # save the run and experiment IDs for deployment\n",
    "    run_id = run.info.run_id\n",
    "    experiment_id = run.info.experiment_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "########################################################\n",
    "### Test Local Deployment\n",
    "########################################################"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the printed command from this directory to kick off a local SageMaker build\n",
    "\n",
    "sagemaker_local_command = 'mlflow sagemaker run-local -m ./mlruns/{experiment_id}/{run_id}/artifacts/leads_pyfunc -p 5001'. \\\n",
    "    format(experiment_id=experiment_id, run_id=run_id)\n",
    "\n",
    "print(sagemaker_local_command)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# A helper function to test the locally-built SageMaker container\n",
    "def query_local_endpoint(input_json):\n",
    "    response = requests.post('http://localhost:5001/invocations',\n",
    "                             headers={'Content-Type': 'application/json'},\n",
    "                             data=input_json)\n",
    "    print(response)\n",
    "    preds = response.json()\n",
    "    return preds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run a query against the local endpoint and examine the output;\n",
    "# the payload is the input row serialized in pandas' \"split\" orientation\n",
    "model_input = leads_x.head(1)\n",
    "output = query_local_endpoint(model_input.to_json(orient=\"split\"))\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "########################################################\n",
    "### Deploy Model to Sagemaker\n",
    "########################################################"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Note: this requires the MLflow pyfunc Docker image to already exist in AWS ECR\n",
    "## (see the mlflow sagemaker build-and-push-container CLI command)\n",
    "\n",
    "import mlflow.sagemaker as mfs\n",
    "\n",
    "\n",
    "# we pull the run and experiment IDs from above to build the model location\n",
    "model_uri = \"mlruns/%s/%s/artifacts/leads_pyfunc\" % (experiment_id, run_id)\n",
    "\n",
    "# pick the AWS region closest to you or your systems\n",
    "region = \"us-east-1\"\n",
    "# the AWS account id can be found in the console\n",
    "aws_account_id = \"XXXXXXX\"\n",
    "# we use these inputs to reference the pyfunc container in ECR;\n",
    "# the image tag should match the MLflow version used to log the model\n",
    "image_url = aws_account_id \\\n",
    "            + \".dkr.ecr.\" \\\n",
    "            + region \\\n",
    "            + \".amazonaws.com/mlflow-pyfunc:1.8.0\"\n",
    "\n",
    "# now we specify the role that we set up for SageMaker in the previous step\n",
    "sagemaker_arn = \"arn:aws:iam::\" + aws_account_id + \":role/AmazonSageMakerFullAccess\"\n",
    "\n",
    "\n",
    "# finally, we pick a name for our endpoint within SageMaker\n",
    "endpoint_name = \"lead-rf-1\"\n",
    "\n",
    "\n",
    "# with all of the inputs, we run the following to deploy the model to SageMaker\n",
    "mfs.deploy(app_name=endpoint_name,\n",
    "           model_uri=model_uri,\n",
    "           region_name=region,\n",
    "           mode=\"create\",  # change to \"replace\" if the endpoint already exists\n",
    "           execution_role_arn=sagemaker_arn,\n",
    "           image_url=image_url,\n",
    "           instance_type='ml.t2.medium')  # smallest/cheapest instance type SageMaker allows"
   ]
  },
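  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sketch (editor's addition): query the deployed SageMaker endpoint\n",
    "# with boto3 (one use for the boto3 import at the top). Assumes the deploy\n",
    "# above succeeded and AWS credentials are configured locally.\n",
    "smr = boto3.client('sagemaker-runtime', region_name=region)\n",
    "response = smr.invoke_endpoint(EndpointName=endpoint_name,\n",
    "                               ContentType='application/json',\n",
    "                               Body=model_input.to_json(orient=\"split\"))\n",
    "print(response['Body'].read().decode('utf-8'))"
   ]
  },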
| " + \".dkr.ecr.\" \\\n", 477 | " + region \\\n", 478 | " + \".amazonaws.com/mlflow-pyfunc:1.5.0\"\n", 479 | "\n", 480 | "# now we specify the role that we setup for sagemaker in the previous step\n", 481 | "sagemaker_arn = \"arn:aws:iam::\"+aws_account_id+\":role/AmazonSageMakerFullAccess\"\n", 482 | "\n", 483 | "\n", 484 | "# finally, we pick a name for our endpoint within sagemaker\n", 485 | "endpoint_name = \"lead-rf-1\" \n", 486 | "\n", 487 | "\n", 488 | "# with all of the inputs, we run the following to deploy the model it sagemaker\n", 489 | "mfs.deploy(app_name=endpoint_name, \n", 490 | " model_uri=model_uri,\n", 491 | " region_name=region,\n", 492 | " mode=\"create\", #this should change to replace if the endpoint already exists\n", 493 | " execution_role_arn=sagemaker_arn,\n", 494 | " image_url=image_url, \n", 495 | " instance_type='ml.t2.medium') # smallest/cheapest sagemaker allowed size" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": { 502 | "collapsed": true 503 | }, 504 | "outputs": [], 505 | "source": [] 506 | } 507 | ], 508 | "metadata": { 509 | "kernelspec": { 510 | "display_name": "lead-scoring", 511 | "language": "python", 512 | "name": "lead-scoring" 513 | }, 514 | "language_info": { 515 | "codemirror_mode": { 516 | "name": "ipython", 517 | "version": 3 518 | }, 519 | "file_extension": ".py", 520 | "mimetype": "text/x-python", 521 | "name": "python", 522 | "nbconvert_exporter": "python", 523 | "pygments_lexer": "ipython3", 524 | "version": "3.8.2" 525 | } 526 | }, 527 | "nbformat": 4, 528 | "nbformat_minor": 2 529 | } 530 | --------------------------------------------------------------------------------