├── .github └── workflows │ └── deploy_api.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── images ├── 02_video_cover.png ├── 03_video_cover.png ├── 04_video_cover.png ├── 05_video_cover.png ├── header.jpg ├── lecture_1.gif ├── lecture_2.gif ├── lecture_3.gif ├── logo_realworldml.png ├── video_1.png └── video_thumbnail.png ├── lectures ├── 01_model_training.md ├── 02_model_deployment.md └── 03_continuous_deployment_with_webhooks.md ├── poetry.lock ├── pyproject.toml ├── set_environment_variables_template.sh ├── src ├── __init__.py ├── baseline_model.py ├── config.py ├── data.py ├── deploy.py ├── hyperparams.py ├── logger.py ├── model_registry_api.py ├── paths.py ├── predict.py ├── preprocessing.py ├── test_endpoint.py └── train.py └── tests └── __init__.py /.github/workflows/deploy_api.yml: -------------------------------------------------------------------------------- 1 | name: Deploy REST API to Production 2 | run-name: Deployment run - ${{ github.actor }} 3 | on: 4 | repository_dispatch: 5 | types: [webhook_Production] 6 | 7 | workflow_dispatch: 8 | 9 | env: 10 | PYTHON_VERSION: 3.9 11 | POETRY_VERSION: 1.5.1 12 | POETRY_URL: https://install.python-poetry.org 13 | 14 | jobs: 15 | deploy-rest-api-prod: 16 | runs-on: ubuntu-latest 17 | steps: 18 | 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | 22 | # Poetry cache depends on OS, Python version and Poetry version. 23 | - name: Cache Poetry cache 24 | uses: actions/cache@v3 25 | 26 | with: 27 | path: ~/.cache/pypoetry 28 | key: poetry-cache-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.POETRY_VERSION }} 29 | 30 | # virtualenv cache should depends on OS, Python version and `poetry.lock` (and optionally workflow files). 31 | - name: Cache Packages 32 | uses: actions/cache@v3 33 | with: 34 | path: ~/.local 35 | key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/*.yml') }} 36 | 37 | - name: Set up Python ${{ env.PYTHON_VERSION }} 38 | uses: actions/setup-python@v3 39 | with: 40 | python-version: ${{ env.PYTHON_VERSION }} 41 | 42 | - name: Install Poetry 43 | run: | 44 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 45 | echo "$HOME/.local/bin" >> $GITHUB_PATH 46 | 47 | - name: Install Dependencies 48 | run: poetry install 49 | 50 | - name: Deploy endpoint to Cerebrium 51 | env: 52 | COMET_ML_API_KEY: ${{ secrets.COMET_ML_API_KEY }} 53 | COMET_ML_WORKSPACE: ${{ secrets.COMET_ML_WORKSPACE }} 54 | COMET_ML_MODEL_NAME: ${{ secrets.COMET_ML_MODEL_NAME }} 55 | CEREBRIUM_API_KEY: ${{ secrets.CEREBRIUM_API_KEY }} 56 | run: make deploy 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # prefect artifacts 2 | .prefectignore 3 | 4 | # python artifacts 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | *.egg-info/ 9 | *.egg 10 | 11 | # Type checking artifacts 12 | .mypy_cache/ 13 | .dmypy.json 14 | dmypy.json 15 | .pyre/ 16 | 17 | # IPython 18 | profile_default/ 19 | ipython_config.py 20 | *.ipynb_checkpoints/* 21 | 22 | # Environments 23 | .python-version 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | 29 | # MacOS 30 | .DS_Store 31 | 32 | # Dask 33 | dask-worker-space/ 34 | 35 | # Editors 36 | .idea/ 37 | .vscode/ 38 | 39 | # VCS 40 | .git/ 41 | .hg/ 42 | 43 | # data file 44 | *.parquet 45 | *.csv 46 | 47 | deployment_dir/ 48 | *.pkl 49 | set_environment_variables.sh 50 | .links 51 | 
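A note on the two triggers declared in `.github/workflows/deploy_api.yml` above: `workflow_dispatch` lets you launch the deployment manually from the repository's Actions tab, while `repository_dispatch` with event type `webhook_Production` is the hook that the Comet ML Model Registry webhook from lecture 3 will call. As a minimal sketch (assuming a hypothetical `OWNER/REPO` pair pointing at your fork and a personal access token with `repo` scope), anything able to make this HTTP call can fire that event:

```python
import os

import requests

# Hypothetical placeholders: point these at your own fork of this repository.
OWNER, REPO = "your-github-username", "your-fork-of-this-repo"
TOKEN = os.environ["GITHUB_TOKEN"]  # personal access token with `repo` scope

response = requests.post(
    f"https://api.github.com/repos/{OWNER}/{REPO}/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {TOKEN}",
    },
    # event_type must match the `types` filter under `repository_dispatch` in deploy_api.yml
    json={"event_type": "webhook_Production"},
)
print(response.status_code)  # GitHub answers 204 No Content when the event is accepted
```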
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Pau Labarta Bajo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init data baseline train deploy prepare-deployment test-endpoint 2 | 3 | DEPLOYMENT_DIR = deployment_dir 4 | 5 | init: 6 | curl -sSL https://install.python-poetry.org | python3 - 7 | poetry install 8 | 9 | data: 10 | poetry run python src/data.py 11 | 12 | baseline: 13 | poetry run python src/baseline_model.py 14 | 15 | train: 16 | poetry run python src/train.py 17 | 18 | prepare-deployment: 19 | rm -rf $(DEPLOYMENT_DIR) && mkdir $(DEPLOYMENT_DIR) 20 | poetry export -f requirements.txt --output $(DEPLOYMENT_DIR)/requirements.txt --without-hashes 21 | cp -r src/predict.py $(DEPLOYMENT_DIR)/main.py 22 | cp -r src $(DEPLOYMENT_DIR)/src/ 23 | # pip install cerebrium --upgrade # otherwise cerebrium deploy might fail 24 | 25 | deploy: prepare-deployment 26 | cd $(DEPLOYMENT_DIR) && poetry run cerebrium deploy --api-key $(CEREBRIUM_API_KEY) --hardware CPU eth-price-1-hour-predictor 27 | 28 | test-endpoint: 29 | poetry run python src/test_endpoint.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

Train and Deploy a Serverless API to predict crypto prices

4 | 5 |
6 | 7 |
8 | 9 | 12 | 13 | #### Contents 14 | * [The problem](#the-problem) 15 | * [This is what you will learn](#this-is-what-you-will-learn) 16 | * [Tools](#tools) 17 | * [Run the whole thing in 5 minutes](#run-the-whole-thing-in-5-minutes) 18 | * [Part 1. Model training 🏋️](#1-model-training) 19 | * [Step 1. Create your virtual environment](./lectures/01_model_training.md#1-create-the-virtual-environment-with-poetry) 20 | * [Step 2. Generate training data](./lectures/01_model_training.md#2-generate-training-data) 21 | * [Step 3. Build a baseline model](./lectures/01_model_training.md#3-build-a-baseline-model) 22 | * [Step 4. Build Machine Learning models](./lectures/01_model_training.md#4-build-machine-learning-models) 23 | * [Part 2. Model deployment as REST API 🚀](#2-model-deployment-as-rest-api) 24 | * [Step 5. Deploy the model as a Serverless REST API](./lectures/02_model_deployment.md#5-deploy-the-model-as-a-serverless-rest-api) 25 | * [Part 3. Automation with GitHub actions and the Model Registry 🪝](#3-automatic-deployments-with-github-actions-and-model-registry-webhooks) 26 | * [Wanna learn more real-time ML?](#wanna-learn-more-real-time-ml) 27 | 28 | ## The problem 29 | 30 | Predicting crypto price movements is extremely hard. But it is also a great field to show what Machine Learning has to offer. 31 | 32 | In this tutorial you won't build an ML system that will make you rich. But you will master the MLOps frameworks and tools you need to build ML systems that, together with tons of experimentation, can take you there. 33 | 34 | With this hands-on tutorial, I want to help you grow as an ML engineer and go beyond notebooks. 35 | 36 |
37 | 38 | ## This is what you will learn 39 | 40 | You will learn to 41 | - **train** an ML model prototype through careful experimentation, using [CometML](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github). 42 | - **deploy** the model as a REST API, with [Cerebrium](https://www.cerebrium.ai?utm_source=pau&utm_medium=partner&utm_content=github). 43 | - **automate** safe deployments, using GitHub actions and Comet ML Model Registry. 44 | 45 | Without further ado, let's get to work! 46 | 47 |
48 | 49 | ## Tools 50 | We will use a 100% Serverless stack, so you don't need to set up and maintain infrastructure 51 | 52 | * [CometML](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github) as experiment tracker and model registry 53 | * [Cerebrium](https://www.cerebrium.ai?utm_source=pau&utm_medium=partner&utm_content=github) as a deployment platform 54 | * [GitHub actions](https://github.com/features/actions) to automate workflows. 55 | 56 |
57 | 58 | ## Run the whole thing in 5 minutes 59 | 60 | If you only have 5 minutes and want to see the whole system in action, follow these steps: 61 | 62 | 1. Create a Python virtual environment with all project dependencies with 63 | 64 | ``` 65 | $ make init 66 | ``` 67 | 68 | 69 | 2. Set your API keys for [Comet ML](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github) and [Cerebrium](https://www.cerebrium.ai?utm_source=pau&utm_medium=partner&utm_content=github) in `set_environment_variables_template.sh`, rename the file to `set_environment_variables.sh`, and run it 70 | ``` 71 | $ . ./set_environment_variables.sh 72 | ``` 73 | 74 | 3. Download historical data from Coinbase and save it locally to disk 75 | ``` 76 | $ make data 77 | ``` 78 | 79 | 4. Train the ML model 80 | ``` 81 | $ make train 82 | ``` 83 | 84 | 5. Deploy the model 85 | ``` 86 | $ make deploy 87 | ``` 88 | 89 | 6. Take the endpoint URL you get from Cerebrium in the previous step, and set the `CEREBRIUM_ENDPOINT_URL` variable in `set_environment_variables.sh`. Then re-run 90 | ``` 91 | $ . ./set_environment_variables.sh 92 | ``` 93 | 94 | 7. Test that the endpoint works 95 | ``` 96 | $ make test-endpoint 97 | ``` 98 |
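If you prefer to call the endpoint yourself rather than running `make test-endpoint`, the request is a plain JSON POST carrying the last 24 hourly prices. A minimal sketch (based on `src/test_endpoint.py`; depending on your Cerebrium settings you may also need an `Authorization` header with your public API key, and the exact response wrapping may differ):

```python
import os

import requests

# the endpoint URL printed by Cerebrium after `make deploy` (step 6 above)
url = os.environ["CEREBRIUM_ENDPOINT_URL"]

# 24 hourly close prices, keyed exactly as the model expects (see src/predict.py)
payload = {f"price_{i}_hour_ago": 46700.0 for i in range(24, 0, -1)}

response = requests.post(url, json=payload, timeout=30)
print(response.status_code, response.text)
```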
99 | 100 | ## Lectures 101 | 102 | ### 1. Model training 103 | 104 | In this first lecture you will 105 | 106 | - fetch raw data 107 | - transform it into features and targets (see the sketch below) 108 | - build a baseline model 109 | - experiment with several ML models in a fast and reliable way, using Python scripts and [Comet ML experiment tracking](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github) 110 | 111 | In this lecture you won't train an ML model that will make you rich. But you will master the framework and skillset you need if you want to build ML models that, together with tons of experimentation, can take you there. 112 | 113 | Ready to get your hands dirty? **[Start training models 👩‍💻👨🏽‍💻 🏋️](./lectures/01_model_training.md)** 114 | 115 |
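To make the features-and-targets step concrete: `src/preprocessing.py` slides a window over the hourly candles so that each training example holds the previous 24 close prices and the target is the close one hour later. A simplified sketch of that transformation (the real code also pre-computes cutoff indices and keeps the timestamps):

```python
import pandas as pd

def make_features_and_target(closes: pd.Series, input_seq_len: int = 24):
    """Slide a window over hourly close prices: 24 past prices -> next-hour target."""
    rows, targets = [], []
    for end in range(input_seq_len, len(closes)):
        rows.append(closes.iloc[end - input_seq_len:end].to_list())
        targets.append(closes.iloc[end])
    features = pd.DataFrame(
        rows,
        columns=[f"price_{input_seq_len - i}_hour_ago" for i in range(input_seq_len)],
    )
    return features, pd.Series(targets, name="target_price_next_hour")
```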
116 | 117 | ### 2. Model deployment as REST API 118 | 119 | In this second lecture you will 120 | 121 | - deploy the best ML model you found in lecture 1 as a REST API using [Cerebrium](https://www.cerebrium.ai?utm_source=pau&utm_medium=partner&utm_content=github) 122 | 123 | - test that the endpoint works. 124 | 125 | Ready to deploy? **[Start deploying models 👩‍💻👨🏽‍💻 🚀](./lectures/02_model_deployment.md)** 126 | 127 |
128 | 129 | ### 3. Automatic deployments with GitHub actions and Model Registry webhooks 130 | 131 | ML models often need to be re-trained to keep them performant. Hence, automating safe deployments is a must. 132 | 133 | In this third and final lecture you will 134 | 135 | - build a continuous deployment pipeline using GitHub actions 136 | - create a webhook to trigger deployments from the Model Registry. 137 | 138 | Ready for the final round? **[Start automating safe deployments 👩‍💻👨🏽‍💻 🪝](./lectures/03_continuous_deployment_with_webhooks.md)** 139 | 140 |
141 | 142 | ## Wanna learn more Real-Time ML? 143 | 144 | Wanna learn to build a complete ML system that 145 | 146 | - ingests real-time crypto data 147 | - trains predictive ML models, and 148 | - continuously deploys them 149 | 150 | using MLOps best practices? 151 | 152 | I am preparing a new hands-on tutorial where you will learn all this. 153 | 154 | **[Subscribe to The Real-World ML Newsletter](https://paulabartabajo.substack.com/)** to be notified when the tutorial is out. 155 | 156 |
157 | Let's connect 🤗 158 |
159 | Twitter • 160 | LinkedIn • 161 | Newsletter 162 |
163 |
164 | 165 |
166 | 167 |
168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /images/02_video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/02_video_cover.png -------------------------------------------------------------------------------- /images/03_video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/03_video_cover.png -------------------------------------------------------------------------------- /images/04_video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/04_video_cover.png -------------------------------------------------------------------------------- /images/05_video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/05_video_cover.png -------------------------------------------------------------------------------- /images/header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/header.jpg -------------------------------------------------------------------------------- /images/lecture_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/lecture_1.gif -------------------------------------------------------------------------------- /images/lecture_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/lecture_2.gif -------------------------------------------------------------------------------- /images/lecture_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/lecture_3.gif -------------------------------------------------------------------------------- /images/logo_realworldml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/logo_realworldml.png -------------------------------------------------------------------------------- /images/video_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/video_1.png -------------------------------------------------------------------------------- /images/video_thumbnail.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/video_thumbnail.png -------------------------------------------------------------------------------- /lectures/01_model_training.md: -------------------------------------------------------------------------------- 1 |
2 |

Lecture 1

3 |

Model training and experimentation with Comet ML

4 |
5 | 6 |
7 | 8 |

9 | 10 | 11 | 12 |

13 | 14 | #### Steps 15 | 16 | 1. [Create the virtual environment with Poetry](#1-create-the-virtual-environment-with-poetry) 17 | 2. [Generate training data](#2-generate-training-data) 18 | 3. [Build a baseline model](#3-build-a-baseline-model) 19 | 4. [Build Machine Learning models](#4-build-machine-learning-models) 20 | 21 | 22 | 23 | ## 1. Create the virtual environment with [Poetry](https://python-poetry.org/docs/) 24 | 25 |
26 | 27 |

Watch the video 🎬

28 | Create virtual environment with Poetry 29 |
30 |
31 | 32 |
33 | 34 | 1. Create a Python virtual environment with all project dependencies with 35 | ``` 36 | $ curl -sSL https://install.python-poetry.org | python3 - 37 | $ poetry install 38 | ``` 39 | or simply use the `Makefile` 40 | ``` 41 | $ make init 42 | ``` 43 | 44 | 2. Activate the virtual environment you just created 45 | ``` 46 | $ poetry shell 47 | ``` 48 | 49 | 3. [Sign up for Comet ML for FREE](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github), create a workspace and copy your API key from the dashboard. 50 | 51 | 4. Set your API key and workspace name variables in `set_environment_variables_template.sh`, rename the file to `set_environment_variables.sh`, and run it 52 | ``` 53 | $ . ./set_environment_variables.sh 54 | ``` 55 | 56 | Your local development environment is ready. Let's now generate some training data. 57 | 58 | ## 2. Generate training data 59 | 60 |
61 | 62 |

Watch the video 🎬

63 | Generate training data 64 |
65 |
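The download step below is handled by `src/data.py`, which asks Coinbase's public Exchange API for one day of hourly OHLC candles at a time and caches each day as a parquet file under `data/downloads/`. Roughly, every daily request the script makes boils down to this (simplified sketch; `granularity=3600` means one candle per hour):

```python
import pandas as pd
import requests

product_id, day = "BTC-USD", "2023-01-01"  # same defaults/format as src/data.py

url = (
    f"https://api.exchange.coinbase.com/products/{product_id}/candles"
    f"?start={day}T00:00:00&end=2023-01-02T00:00:00&granularity=3600"
)
candles = requests.get(url).json()

# Coinbase returns [time, low, high, open, close, volume] for each candle
df = pd.DataFrame(candles, columns=["time", "low", "high", "open", "close", "volume"])
print(df.head())
```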
66 | 67 | 68 | Download historical data from Coinbase and save it locally to disk 69 | 70 | - Run either `$ python src/data.py`, or 71 | - Simply `$ make data` 72 | 73 | ## 3. Build a baseline model 74 | 75 |
76 | 77 |

Watch the video 🎬

78 | Create a simple, yet powerful, baseline model 79 |
80 |
81 | 82 | 83 | - Establish a baseline performance using a very dummy (yet powerful) baseline model 84 | ``` 85 | $ python src/baseline_model.py 86 | ``` 87 | 88 | ## 4. Build Machine Learning models 89 | 90 |
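The baseline involves no learning at all: it predicts that the next hour's price will simply equal the last observed price, and measures the test MAE of that rule. This is the heart of `src/baseline_model.py`:

```python
from sklearn.metrics import mean_absolute_error

def get_baseline_model_error(X_test, y_test) -> float:
    """Baseline rule: the next-hour price equals the price 1 hour ago."""
    predictions = X_test["price_1_hour_ago"]
    return mean_absolute_error(y_test, predictions)
```

Any ML model trained in the next section has to beat this number to be worth keeping.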
91 | 92 |

Watch the video 🎬

93 | Create a simple, yet powerful, baseline model 94 |
95 |
96 | 97 | - Here is the list of experiments I ran 98 | ``` 99 | $ python src/baseline_model.py 100 | $ python src/train.py --model lasso 101 | $ python src/train.py --model lasso --tune-hyperparams --hyperparam-trials 3 102 | $ python src/train.py --model lightgbm 103 | $ python src/train.py --model lightgbm --tune-hyperparams --hyperparam-trials 3 104 | ``` 105 | 106 | - Feel free to try adding more features, using other technical indicators, or experiment with other ML models. 107 | 108 | 109 | ### [➡️ Go to the next lecture](../lectures/02_model_deployment.md) 110 | 111 | -------------------------------------------------------------------------------- /lectures/02_model_deployment.md: -------------------------------------------------------------------------------- 1 |
2 |

Lecture 2. Model deployment as a REST API

3 | Serverless API with Cerebrium 4 |
5 | 6 |
7 | 8 |

9 | 10 |

11 | 12 | #### Steps 13 | 5. [Deploy the model as a Serverless REST API](#5-deploy-the-model-as-a-serverless-rest-api) 14 | 6. [Test the REST API endpoint](#6-test-the-rest-api-endpoint) 15 | 16 | ## 5. Deploy the model as a Serverless REST API 17 | 18 | Let's deploy our ML model as a REST API using the Serverless platform Cerebrium. 19 | 20 | Forget about Docker, IAM roles, and EC2 instances. Serverless ML is about focusing on what differentiates your ML product, not setting up and maintaining infrastructure. 21 | 22 |
23 | 24 |

Click here to sign up for FREE and get your API key

25 |
26 |
27 | 28 |
29 | 30 | Deploy ML model as REST API using Cerebrium 31 | 32 |
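When the endpoint starts up it pulls the `Production` model from the Comet ML Model Registry, using three secrets you configure in the Cerebrium dashboard. A condensed sketch of what `src/predict.py` does (the real module also validates the payload with pydantic and falls back to environment variables when run locally):

```python
from cerebrium import get_secret  # available inside the Cerebrium runtime

from src.model_registry_api import load_production_model_from_registry

# the three secrets listed below, read once at startup
model = load_production_model_from_registry(
    workspace=get_secret("COMET_ML_WORKSPACE"),
    api_key=get_secret("COMET_ML_API_KEY"),
    model_name=get_secret("COMET_ML_MODEL_NAME"),
)

def predict(item, run_id, logger):
    """Called by Cerebrium on every request; `item` carries the 24 hourly prices."""
    import pandas as pd
    features = pd.DataFrame([item])
    return {"prediction": model.predict(features)[0]}
```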
33 | 34 | Your REST API endpoint needs to preload the ML model from the CometML Model Registry. For that, you need to set the following secrets on your Cerebrium Dashboard: 35 | - `COMET_ML_WORKSPACE` 36 | - `COMET_ML_API_KEY` 37 | - `COMET_ML_MODEL_NAME` 38 | 39 | Then run 40 | ``` 41 | $ make deploy 42 | ``` 43 | 44 | ## 6. Test the REST API endpoint 45 | 46 | [PENDING VIDEO 🎬] 47 | ``` 48 | $ make test-endpoint 49 | ``` 50 | 51 | ### [➡️ Go to the next lecture](../lectures/03_continuous_deployment_with_webhooks.md) 52 | -------------------------------------------------------------------------------- /lectures/03_continuous_deployment_with_webhooks.md: -------------------------------------------------------------------------------- 1 |
2 |

Lecture 3. Automatic deployment with Comet ML webhooks and GitHub actions

3 | Comet ML Model Registry + GitHub action 4 |
5 | 6 |
7 | 8 |

9 | 10 |

11 | 12 | #### Steps 13 | 7. [Create GitHub action to deploy the API](#7-create-githbu-action-to-deploy-the-api) 14 | 8. [Test the GitHub action works](#8-test-the-github-action-works) 15 | 9. [Create webhook to trigger the GitHub action](#9-create-webhook-to-trigger-the-github-action) 16 | 10. [Test the webhook works](#10-test-the-webhook-works) 17 | 18 | ## 7. Create GitHbu action to deploy the API 19 | [PENDING VIDEO 🎬] 20 | 21 | ## 8. Test the GitHub action works 22 | [PENDING VIDEO 🎬] 23 | 24 | ## 9. Create webhook to trigger the GitHub action 25 | [PENDING VIDEO 🎬] 26 | 27 | ## 10. Test the webhook works 28 | [PENDING VIDEO 🎬] -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "src" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Pau "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.9,<3.11" 10 | requests = "^2.30.0" 11 | pandas = "1.5.3" 12 | fire = "^0.5.0" 13 | pyarrow = "^12.0.0" 14 | ta = "^0.10.2" 15 | notebook = "^6.5.4" 16 | scikit-learn = "^1.2.2" 17 | xgboost = "^1.7.5" 18 | lightgbm = "^3.3.5" 19 | optuna = "^3.1.1" 20 | comet-ml = "3.32.6" 21 | cerebrium = "^1.1.1" 22 | torch = "1.13.1" 23 | pydantic = "^2.1.1" 24 | 25 | 26 | [build-system] 27 | requires = ["poetry-core"] 28 | build-backend = "poetry.core.masonry.api" 29 | -------------------------------------------------------------------------------- /set_environment_variables_template.sh: -------------------------------------------------------------------------------- 1 | # replace placeholders and rename this file to `set_environment_variables.sh` 2 | export COMET_ML_API_KEY="YOUR_COMET_ML_API_KEY" 3 | export COMET_ML_WORKSPACE="YOUR_COMET_ML_WORKSPACE" 4 | export COMET_ML_MODEL_NAME="YOUR_COMET_ML_MODEL_NAME" 5 | 6 | export CEREBRIUM_API_KEY="YOUR_CEREBRIUM_API_KEY" 7 | 8 | # you set this after deploying the API for the first time 9 | export CEREBRIUM_ENDPOINT_URL="YOUR_CEREBRIUM_ENDPOINT_URL" -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/src/__init__.py -------------------------------------------------------------------------------- /src/baseline_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union, Optional, Callable 2 | import os 3 | 4 | import pandas as pd 5 | from comet_ml import Experiment 6 | from sklearn.metrics import mean_absolute_error 7 | from sklearn.linear_model import Lasso 8 | 9 | 10 | from src.preprocessing import transform_ts_data_into_features_and_target 11 | from src.logger import get_console_logger 12 | 13 | logger = get_console_logger() 14 | 15 | 16 | def get_baseline_model_error(X_test: pd.DataFrame, y_test: pd.Series) -> float: 17 | """Returns the baseline model error.""" 18 | predictions = X_test['price_1_hour_ago'] 19 | return mean_absolute_error(y_test, predictions) 20 | 21 | 22 | def train( 23 | X: pd.DataFrame, 24 | y: pd.Series, 25 | ) -> None: 26 | """ 27 | Train a boosting tree model using the input features `X` and targets `y`, 28 | possibly running hyperparameter tuning. 
29 | """ 30 | experiment = Experiment( 31 | api_key = os.environ["COMET_ML_API_KEY"], 32 | workspace=os.environ["COMET_ML_WORKSPACE"], 33 | project_name = "hands-on-train-and-deploy-tutorial", 34 | ) 35 | experiment.add_tag('baseline_model') 36 | 37 | # split the data into train and test 38 | train_sample_size = int(0.9 * len(X)) 39 | X_train, X_test = X[:train_sample_size], X[train_sample_size:] 40 | y_train, y_test = y[:train_sample_size], y[train_sample_size:] 41 | logger.info(f'Train sample size: {len(X_train)}') 42 | logger.info(f'Test sample size: {len(X_test)}') 43 | 44 | # baseline model performance 45 | baseline_mae = get_baseline_model_error(X_test, y_test) 46 | logger.info(f'Test MAE: {baseline_mae}') 47 | experiment.log_metrics({'Test_MAE': baseline_mae}) 48 | 49 | 50 | if __name__ == '__main__': 51 | 52 | logger.info('Generating features and targets') 53 | features, target = transform_ts_data_into_features_and_target() 54 | 55 | logger.info('Starting training') 56 | train(features, target) -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | WINDOW_SIZE = 24 -------------------------------------------------------------------------------- /src/data.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import requests 6 | import fire 7 | 8 | from src.paths import DATA_DIR 9 | from src.logger import get_console_logger 10 | 11 | logger = get_console_logger(name='dataset_generation') 12 | 13 | def download_ohlc_data_from_coinbase( 14 | product_id: Optional[str] = "BTC-USD", 15 | from_day: Optional[str] = "2022-01-01", 16 | to_day: Optional[str] = "2023-06-01", 17 | ) -> Path: 18 | """ 19 | Downloads historical candles from Coinbase API and saves data to disk 20 | Reference: https://docs.cloud.coinbase.com/exchange/reference/exchangerestapi_getproductcandles 21 | """ 22 | # create list of days as strings 23 | days = pd.date_range(start=from_day, end=to_day, freq="1D") 24 | days = [day.strftime("%Y-%m-%d") for day in days] 25 | 26 | # create empty dataframe 27 | data = pd.DataFrame() 28 | 29 | # create download dir folder if it doesn't exist 30 | if not (DATA_DIR / 'downloads').exists(): 31 | logger.info('Create directory for downloads') 32 | (DATA_DIR / 'downloads').mkdir(parents=True) 33 | 34 | for day in days: 35 | 36 | # download file if it doesn't exist 37 | file_name = DATA_DIR / 'downloads' / f'{day}.parquet' 38 | if file_name.exists(): 39 | logger.info(f'File {file_name} already exists, skipping') 40 | data_one_day = pd.read_parquet(file_name) 41 | else: 42 | logger.info(f'Downloading data for {day}') 43 | data_one_day = download_data_for_one_day(product_id, day) 44 | data_one_day.to_parquet(file_name, index=False) 45 | 46 | # combine today's file with the rest of the data 47 | data = pd.concat([data, data_one_day]) 48 | 49 | # save data to disk 50 | # data.to_parquet(DATA_DIR / f"ohlc_from_{from_day}_to_{to_day}.parquet", index=False) 51 | data.to_parquet(DATA_DIR / f"ohlc_data.parquet", index=False) 52 | 53 | return DATA_DIR / f"ohlc_data.parquet" 54 | 55 | def download_data_for_one_day(product_id: str, day: str) -> pd.DataFrame: 56 | """ 57 | Downloads one day of data and returns pandas Dataframe 58 | """ 59 | # create start end end date strings 60 | start = f'{day}T00:00:00' 61 | from datetime import datetime, timedelta 62 
| end = (datetime.strptime(day, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d") 63 | end = f'{end}T00:00:00' 64 | 65 | # call API 66 | URL = f'https://api.exchange.coinbase.com/products/{product_id}/candles?start={start}&end={end}&granularity=3600' 67 | r = requests.get(URL) 68 | data = r.json() 69 | 70 | # transform list of lists to pandas dataframe and return 71 | return pd.DataFrame(data, columns=['time', 'low', 'high', 'open', 'close', 'volume']) 72 | 73 | if __name__== '__main__': 74 | fire.Fire(download_ohlc_data_from_coinbase) 75 | 76 | -------------------------------------------------------------------------------- /src/deploy.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import os 3 | 4 | import fire 5 | from cerebrium import deploy, model_type 6 | 7 | from src.logger import get_console_logger 8 | from src.paths import MODELS_DIR 9 | 10 | logger = get_console_logger(name='model_deployment') 11 | 12 | try: 13 | CEREBRIUM_API_KEY = os.environ['CEREBRIUM_API_KEY'] 14 | except KeyError: 15 | logger.error('CEREBRIUM_API_KEY environment variable not set.') 16 | raise 17 | 18 | def deploy( 19 | local_pickle: Optional[str] = None, 20 | from_model_registry: bool = False, 21 | ): 22 | """""" 23 | logger.info('Deploying model...') 24 | 25 | if from_model_registry: 26 | logger.info('Loading model from model registry...') 27 | raise NotImplementedError('TODO') 28 | 29 | elif local_pickle: 30 | logger.info('Deploying model from local pickle...') 31 | model_pickle_file = MODELS_DIR / local_pickle 32 | # TODO: not working. I am just following the docs here 33 | # https://docs.cerebrium.ai/quickstarts/scikit 34 | endpoint = deploy((model_type.SKLEARN, model_pickle_file), "sk-test-model" , CEREBRIUM_API_KEY) 35 | else: 36 | raise ValueError('Must specify either --local-pickle or --from-model-registry.') 37 | 38 | logger.info('Model deployed.') 39 | 40 | if __name__ == '__main__': 41 | fire.Fire(deploy) -------------------------------------------------------------------------------- /src/hyperparams.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple, Callable, Union 2 | import os 3 | 4 | from comet_ml import Experiment 5 | import optuna 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn.model_selection import TimeSeriesSplit 9 | from sklearn.metrics import mean_absolute_error 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.linear_model import Lasso 12 | from lightgbm import LGBMRegressor 13 | 14 | # from src.model_factory import get_preprocessing_and_model_pipeline 15 | from src.preprocessing import get_preprocessing_pipeline 16 | from src.logger import get_console_logger 17 | 18 | logger = get_console_logger() 19 | 20 | def sample_hyperparams( 21 | model_fn: Callable, 22 | trial: optuna.trial.Trial, 23 | ) -> Dict[str, Union[str, int, float]]: 24 | 25 | if model_fn == Lasso: 26 | return { 27 | 'alpha': trial.suggest_float('alpha', 0.01, 1.0, log=True) 28 | } 29 | elif model_fn == LGBMRegressor: 30 | return { 31 | "metric": 'mae', 32 | "verbose": -1, 33 | "num_leaves": trial.suggest_int("num_leaves", 2, 256), 34 | "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0), 35 | "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0), 36 | "min_child_samples": trial.suggest_int("min_child_samples", 3, 100), 37 | } 38 | else: 39 | raise NotImplementedError('TODO: implement other models') 40 | 41 | 
def find_best_hyperparams( 42 | model_fn: Callable, 43 | hyperparam_trials: int, 44 | X: pd.DataFrame, 45 | y: pd.Series, 46 | experiment: Experiment, 47 | ) -> Tuple[Dict, Dict]: 48 | """""" 49 | assert model_fn in {Lasso, LGBMRegressor} 50 | 51 | def objective(trial: optuna.trial.Trial) -> float: 52 | """ 53 | Error function we want to minimize (or maximize) using hyperparameter tuning. 54 | """ 55 | # sample hyper-parameters 56 | preprocessing_hyperparams = { 57 | 'pp_rsi_window': trial.suggest_int('pp_rsi_window', 5, 20), 58 | } 59 | model_hyperparams = sample_hyperparams(model_fn, trial) 60 | 61 | # evaluate the model using TimeSeriesSplit cross-validation 62 | tss = TimeSeriesSplit(n_splits=3) 63 | scores = [] 64 | logger.info(f'{trial.number=}') 65 | for split_number, (train_index, val_index) in enumerate(tss.split(X)): 66 | 67 | # split data for training and validation 68 | X_train, X_val = X.iloc[train_index], X.iloc[val_index] 69 | y_train, y_val = y.iloc[train_index], y.iloc[val_index] 70 | 71 | logger.info(f'{split_number=}') 72 | logger.info(f'{len(X_train)=}') 73 | logger.info(f'{len(X_val)=}') 74 | 75 | # train the model 76 | pipeline = make_pipeline( 77 | get_preprocessing_pipeline(**preprocessing_hyperparams), 78 | model_fn(**model_hyperparams) 79 | ) 80 | pipeline.fit(X_train, y_train) 81 | 82 | # evaluate the model 83 | y_pred = pipeline.predict(X_val) 84 | mae = mean_absolute_error(y_val, y_pred) 85 | scores.append(mae) 86 | 87 | logger.info(f'{mae=}') 88 | 89 | score = np.array(scores).mean() 90 | 91 | # Return the mean score 92 | return score 93 | 94 | logger.info('Starting hyper-parameter search...') 95 | study = optuna.create_study(direction="minimize") 96 | study.optimize(objective, n_trials=hyperparam_trials) 97 | 98 | # Get the best hyperparameters and their values 99 | best_params = study.best_params 100 | best_value = study.best_value 101 | 102 | # split best_params into preprocessing and model hyper-parameters 103 | best_preprocessing_hyperparams = \ 104 | {key: value for key, value in best_params.items() 105 | if key.startswith('pp_')} 106 | 107 | best_model_hyperparams = { 108 | key: value for key, value in best_params.items() 109 | if not key.startswith('pp_')} 110 | 111 | logger.info("Best Parameters:") 112 | for key, value in best_params.items(): 113 | logger.info(f"{key}: {value}") 114 | logger.info(f"Best MAE: {best_value}") 115 | 116 | experiment.log_metric('Cross_validation_MAE', best_value) 117 | 118 | return best_preprocessing_hyperparams, best_model_hyperparams 119 | 120 | -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional 3 | 4 | def get_console_logger(name: Optional[str] = 'tutorial') -> logging.Logger: 5 | 6 | # Create logger if it doesn't exist 7 | logger = logging.getLogger(name) 8 | if not logger.handlers: 9 | logger.setLevel(logging.DEBUG) 10 | 11 | # Create console handler with formatting 12 | console_handler = logging.StreamHandler() 13 | console_handler.setLevel(logging.DEBUG) 14 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 15 | console_handler.setFormatter(formatter) 16 | 17 | # Add console handler to the logger 18 | logger.addHandler(console_handler) 19 | 20 | return logger -------------------------------------------------------------------------------- /src/model_registry_api.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from comet_ml import API 4 | from sklearn.pipeline import Pipeline 5 | 6 | from src.logger import get_console_logger 7 | 8 | logger = get_console_logger() 9 | 10 | def load_production_model_from_registry( 11 | workspace: str, 12 | api_key: str, 13 | model_name: str, 14 | status: str = 'Production', 15 | ) -> Pipeline: 16 | """Loads the production model from the remote model registry""" 17 | 18 | # find model version to deploy 19 | api = API(api_key) 20 | model_details = api.get_registry_model_details(workspace, model_name)['versions'] 21 | model_versions = [md['version'] for md in model_details if md['status'] == status] 22 | if len(model_versions) == 0: 23 | logger.error('No production model found') 24 | raise ValueError('No production model found') 25 | else: 26 | logger.info(f'Found {status} model versions: {model_versions}') 27 | model_version = model_versions[0] 28 | 29 | # download model from comet ml registry to local file 30 | api.download_registry_model( 31 | workspace, 32 | registry_name=model_name, 33 | version=model_version, 34 | output_path='./', 35 | expand=True 36 | ) 37 | 38 | # load model from local file to memory 39 | with open('./model.pkl', "rb") as f: 40 | model = pickle.load(f) 41 | 42 | return model -------------------------------------------------------------------------------- /src/paths.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | PARENT_DIR = Path(__file__).parent.resolve().parent 5 | DATA_DIR = PARENT_DIR / 'data' 6 | MODELS_DIR = PARENT_DIR / 'models' 7 | 8 | if not Path(DATA_DIR).exists(): 9 | os.mkdir(DATA_DIR) 10 | 11 | if not Path(MODELS_DIR).exists(): 12 | os.mkdir(MODELS_DIR) -------------------------------------------------------------------------------- /src/predict.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from src.model_registry_api import load_production_model_from_registry 4 | from src.logger import get_console_logger 5 | 6 | logger = get_console_logger('deployer') 7 | 8 | try: 9 | # this code works when running on Cerebrium 10 | from cerebrium import get_secret 11 | COMET_ML_WORKSPACE = get_secret("COMET_ML_WORKSPACE") 12 | COMET_ML_API_KEY = get_secret("COMET_ML_API_KEY") 13 | COMET_ML_MODEL_NAME = get_secret("COMET_ML_MODEL_NAME") 14 | 15 | except ImportError: 16 | # this code works when running locally 17 | import os 18 | COMET_ML_WORKSPACE = os.environ['COMET_ML_WORKSPACE'] 19 | COMET_ML_API_KEY = os.environ['COMET_ML_API_KEY'] 20 | COMET_ML_MODEL_NAME = os.environ['COMET_ML_MODEL_NAME'] 21 | 22 | model = load_production_model_from_registry( 23 | workspace=COMET_ML_WORKSPACE, 24 | api_key=COMET_ML_API_KEY, 25 | model_name=COMET_ML_MODEL_NAME, 26 | ) 27 | 28 | class Item(BaseModel): 29 | price_24_hour_ago: float 30 | price_23_hour_ago: float 31 | price_22_hour_ago: float 32 | price_21_hour_ago: float 33 | price_20_hour_ago: float 34 | price_19_hour_ago: float 35 | price_18_hour_ago: float 36 | price_17_hour_ago: float 37 | price_16_hour_ago: float 38 | price_15_hour_ago: float 39 | price_14_hour_ago: float 40 | price_13_hour_ago: float 41 | price_12_hour_ago: float 42 | price_11_hour_ago: float 43 | price_10_hour_ago: float 44 | price_9_hour_ago: float 45 | price_8_hour_ago: float 46 | price_7_hour_ago: float 47 | price_6_hour_ago: float 48 | price_5_hour_ago: float 
49 | price_4_hour_ago: float 50 | price_3_hour_ago: float 51 | price_2_hour_ago: float 52 | price_1_hour_ago: float 53 | 54 | def predict(item, run_id, logger): 55 | item = Item(**item) 56 | 57 | # transform item to dataframe 58 | import pandas as pd 59 | df = pd.DataFrame([item.dict()]) 60 | 61 | # predict 62 | prediction = model.predict(df)[0] 63 | 64 | return {"prediction": prediction} 65 | 66 | if __name__ == '__main__': 67 | item = { 68 | 'price_24_hour_ago': 46656.851562, 69 | 'price_23_hour_ago': 46700.535156, 70 | 'price_22_hour_ago': 46700.535156, 71 | 'price_21_hour_ago': 46700.535156, 72 | 'price_20_hour_ago': 46700.535156, 73 | 'price_19_hour_ago': 46700.535156, 74 | 'price_18_hour_ago': 46700.535156, 75 | 'price_17_hour_ago': 46700.535156, 76 | 'price_16_hour_ago': 46700.535156, 77 | 'price_15_hour_ago': 46700.535156, 78 | 'price_14_hour_ago': 46700.535156, 79 | 'price_13_hour_ago': 46700.535156, 80 | 'price_12_hour_ago': 46700.535156, 81 | 'price_11_hour_ago': 46700.535156, 82 | 'price_10_hour_ago': 46700.535156, 83 | 'price_9_hour_ago': 46700.535156, 84 | 'price_8_hour_ago': 46700.535156, 85 | 'price_7_hour_ago': 46700.535156, 86 | 'price_6_hour_ago': 46700.535156, 87 | 'price_5_hour_ago': 46700.535156, 88 | 'price_4_hour_ago': 46700.535156, 89 | 'price_3_hour_ago': 46700.535156, 90 | 'price_2_hour_ago': 46700.535156, 91 | 'price_1_hour_ago': 46700.535156 92 | } 93 | 94 | prediction = predict(item, None, None) 95 | print(f'{prediction=}') -------------------------------------------------------------------------------- /src/preprocessing.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Optional, Union 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import fire 7 | import ta 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | from sklearn.pipeline import make_pipeline, Pipeline 10 | from sklearn.preprocessing import FunctionTransformer 11 | 12 | from src.paths import DATA_DIR 13 | from src.logger import get_console_logger 14 | 15 | logger = get_console_logger() 16 | 17 | def transform_ts_data_into_features_and_target( 18 | # ts_data: pd.DataFrame, 19 | path_to_input: Optional[Path] = DATA_DIR / 'ohlc_data.parquet', 20 | input_seq_len: Optional[int] = 24, 21 | step_size: Optional[int] = 1 22 | ) -> Tuple[pd.DataFrame, pd.Series]: 23 | """ 24 | Slices and transposes data from time-series format into a (features, target) 25 | format that we can use to train Supervised ML models 26 | """ 27 | # load parquet file 28 | ts_data = pd.read_parquet(path_to_input) 29 | ts_data = ts_data[['time', 'close']] 30 | ts_data.sort_values(by=['time'], inplace=True) 31 | 32 | # output features and targets 33 | features = pd.DataFrame() 34 | targets = pd.DataFrame() 35 | 36 | # pre-compute cutoff indices to split dataframe rows 37 | indices = get_cutoff_indices_features_and_target( 38 | ts_data, 39 | input_seq_len, 40 | step_size 41 | ) 42 | 43 | # slice and transpose data into numpy arrays for features and targets 44 | n_examples = len(indices) 45 | x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32) 46 | y = np.ndarray(shape=(n_examples), dtype=np.float32) 47 | times = [] 48 | for i, idx in enumerate(indices): 49 | x[i, :] = ts_data.iloc[idx[0]:idx[1]]['close'].values 50 | y[i] = ts_data.iloc[idx[1]:idx[2]]['close'].values 51 | times.append(ts_data.iloc[idx[1]]['time']) 52 | 53 | # numpy -> pandas 54 | features = pd.DataFrame( 55 | x, 56 | 
columns=[f'price_{i+1}_hour_ago' for i in reversed(range(input_seq_len))] 57 | ) 58 | 59 | # add back column with the time 60 | # features['time'] = times 61 | 62 | # numpy -> pandas 63 | targets = pd.DataFrame(y, columns=[f'target_price_next_hour']) 64 | 65 | return features, targets['target_price_next_hour'] 66 | 67 | def get_cutoff_indices_features_and_target( 68 | data: pd.DataFrame, 69 | input_seq_len: int, 70 | step_size: int 71 | ) -> List[Tuple[int, int, int]]: 72 | 73 | stop_position = len(data) - 1 74 | 75 | # Start the first sub-sequence at index position 0 76 | subseq_first_idx = 0 77 | subseq_mid_idx = input_seq_len 78 | subseq_last_idx = input_seq_len + 1 79 | indices = [] 80 | 81 | while subseq_last_idx <= stop_position: 82 | indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx)) 83 | subseq_first_idx += step_size 84 | subseq_mid_idx += step_size 85 | subseq_last_idx += step_size 86 | 87 | return indices 88 | 89 | def get_price_columns(X: pd.DataFrame) -> List[str]: 90 | """Get the columns of the input DataFrame that contain the price data.""" 91 | return [col for col in X.columns if 'price' in col] 92 | 93 | class RSI(BaseEstimator, TransformerMixin): 94 | """ 95 | Adds RSI to the input DataFrame from the `close` prices 96 | 97 | New columns are: 98 | - 'rsi' 99 | """ 100 | def __init__(self, window: int = 14): 101 | self.window = window 102 | 103 | def fit(self, 104 | X: pd.DataFrame, 105 | y: Optional[Union[pd.DataFrame, pd.Series]] = None) -> "RSI": 106 | """In this scenario, the fit method isn't doing anything. But it must be implemented. This is a scenario of an estimator without parameters.""" 107 | return self 108 | 109 | def _add_indicator(self, row: pd.Series) -> float: 110 | return pd.Series([ta.momentum.rsi(row, window=self.window)[-1]]) 111 | 112 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 113 | """Compute the RSI and add it to the input DataFrame.""" 114 | logger.info('Adding RSI to the input DataFrame') 115 | df = X[get_price_columns(X)].apply(self._add_indicator, axis=1) 116 | df.columns = ['rsi'] 117 | X = pd.concat([X, df], axis=1) 118 | return X 119 | 120 | def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: 121 | """Inverse the log of every cell of the DataFrame.""" 122 | X.drop(columns=['rsi'], inplace=True) 123 | return X 124 | 125 | def get_price_percentage_return(X: pd.DataFrame, hours: int) -> pd.DataFrame: 126 | """Add the price return of the last `hours` to the input DataFrame.""" 127 | X[f'percentage_return_{hours}_hour'] = \ 128 | (X['price_1_hour_ago'] - X[f'price_{hours}_hour_ago'])/ X[f'price_{hours}_hour_ago'] 129 | return X 130 | 131 | def get_subset_of_features(X: pd.DataFrame) -> pd.DataFrame: 132 | return X[['price_1_hour_ago', 'percentage_return_2_hour', 'percentage_return_24_hour', 'rsi']] 133 | 134 | def get_preprocessing_pipeline( 135 | pp_rsi_window: int = 14 136 | ) -> Pipeline: 137 | """Returns the preprocessing pipeline.""" 138 | return make_pipeline( 139 | # trends 140 | FunctionTransformer(get_price_percentage_return, kw_args={'hours': 2}), 141 | FunctionTransformer(get_price_percentage_return, kw_args={'hours': 24}), 142 | 143 | # momentum 144 | RSI(pp_rsi_window), 145 | 146 | # select columns 147 | FunctionTransformer(get_subset_of_features) 148 | ) 149 | 150 | if __name__ == '__main__': 151 | 152 | features, target = fire.Fire(transform_ts_data_into_features_and_target) 153 | 154 | preprocessing_pipeline = get_preprocessing_pipeline() 155 | 156 | preprocessing_pipeline.fit(features) 157 | X 
= preprocessing_pipeline.transform(features) 158 | print(X.head()) -------------------------------------------------------------------------------- /src/test_endpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import requests 5 | 6 | from src.logger import get_console_logger 7 | 8 | logger = get_console_logger(__name__) 9 | 10 | try: 11 | url = os.environ['CEREBRIUM_ENDPOINT_URL'] 12 | except KeyError as e: 13 | logger.error('You need to specify the env variables CEREBRIUM_ENDPOINT_URL in your set_env_variables.sh script') 14 | raise e 15 | 16 | payload = json.dumps({ 17 | 'price_24_hour_ago': 46656.851562, 18 | 'price_23_hour_ago': 46700.535156, 19 | 'price_22_hour_ago': 46700.535156, 20 | 'price_21_hour_ago': 46700.535156, 21 | 'price_20_hour_ago': 46700.535156, 22 | 'price_19_hour_ago': 46700.535156, 23 | 'price_18_hour_ago': 46700.535156, 24 | 'price_17_hour_ago': 46700.535156, 25 | 'price_16_hour_ago': 46700.535156, 26 | 'price_15_hour_ago': 46700.535156, 27 | 'price_14_hour_ago': 46700.535156, 28 | 'price_13_hour_ago': 46700.535156, 29 | 'price_12_hour_ago': 46700.535156, 30 | 'price_11_hour_ago': 46700.535156, 31 | 'price_10_hour_ago': 46700.535156, 32 | 'price_9_hour_ago': 46700.535156, 33 | 'price_8_hour_ago': 46700.535156, 34 | 'price_7_hour_ago': 46700.535156, 35 | 'price_6_hour_ago': 46700.535156, 36 | 'price_5_hour_ago': 46700.535156, 37 | 'price_4_hour_ago': 46700.535156, 38 | 'price_3_hour_ago': 46700.535156, 39 | 'price_2_hour_ago': 46700.535156, 40 | 'price_1_hour_ago': 46700.535156 41 | }) 42 | 43 | headers = { 44 | 'Authorization': 'public-6fb5d32ac9801938a17b', 45 | 'Content-Type': 'application/json' 46 | } 47 | 48 | response = requests.request("POST", url, headers=headers, data=payload) 49 | 50 | print(response.text) -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union, Optional, Callable 2 | import os 3 | 4 | import pandas as pd 5 | from comet_ml import Experiment 6 | from sklearn.pipeline import make_pipeline, Pipeline 7 | from sklearn.metrics import mean_absolute_error 8 | from sklearn.linear_model import Lasso 9 | from xgboost import XGBRegressor 10 | from lightgbm import LGBMRegressor 11 | import pickle 12 | 13 | from src.preprocessing import ( 14 | transform_ts_data_into_features_and_target, 15 | get_preprocessing_pipeline 16 | ) 17 | from src.hyperparams import find_best_hyperparams 18 | from src.logger import get_console_logger 19 | from src.paths import MODELS_DIR 20 | # from src.model_factory import get_preprocessing_and_model_pipeline 21 | 22 | logger = get_console_logger() 23 | 24 | 25 | def get_baseline_model_error(X_test: pd.DataFrame, y_test: pd.Series) -> float: 26 | """Returns the baseline model error.""" 27 | predictions = X_test['price_1_hour_ago'] 28 | return mean_absolute_error(y_test, predictions) 29 | 30 | def get_model_fn_from_name(model_name: str) -> Callable: 31 | """Returns the model function given the model name.""" 32 | if model_name == 'lasso': 33 | return Lasso 34 | elif model_name == 'xgboost': 35 | return XGBRegressor 36 | elif model_name == 'lightgbm': 37 | return LGBMRegressor 38 | else: 39 | raise ValueError(f'Unknown model name: {model_name}') 40 | 41 | def train( 42 | X: pd.DataFrame, 43 | y: pd.Series, 44 | model: str, 45 | tune_hyperparams: Optional[bool] = False, 46 | hyperparam_trials: 
Optional[int] = 10, 47 | ) -> None: 48 | """ 49 | Train a boosting tree model using the input features `X` and targets `y`, 50 | possibly running hyperparameter tuning. 51 | """ 52 | model_fn = get_model_fn_from_name(model) 53 | 54 | experiment = Experiment( 55 | api_key = os.environ["COMET_ML_API_KEY"], 56 | workspace=os.environ["COMET_ML_WORKSPACE"], 57 | project_name = "hands-on-train-and-deploy-tutorial", 58 | ) 59 | experiment.add_tag(model) 60 | 61 | # split the data into train and test 62 | train_sample_size = int(0.9 * len(X)) 63 | X_train, X_test = X[:train_sample_size], X[train_sample_size:] 64 | y_train, y_test = y[:train_sample_size], y[train_sample_size:] 65 | logger.info(f'Train sample size: {len(X_train)}') 66 | logger.info(f'Test sample size: {len(X_test)}') 67 | 68 | if not tune_hyperparams: 69 | # create the full pipeline with default hyperparameters 70 | logger.info('Using default hyperparameters') 71 | pipeline = make_pipeline( 72 | get_preprocessing_pipeline(), 73 | model_fn() 74 | ) 75 | 76 | else: 77 | 78 | # find best hyperparameters using cross-validation 79 | logger.info('Finding best hyperparameters with cross-validation') 80 | best_preprocessing_hyperparams, best_model_hyperparams = \ 81 | find_best_hyperparams(model_fn, hyperparam_trials, X_train, y_train, 82 | experiment) 83 | logger.info(f'Best preprocessing hyperparameters: {best_preprocessing_hyperparams}') 84 | logger.info(f'Best model hyperparameters: {best_model_hyperparams}') 85 | 86 | pipeline = make_pipeline( 87 | get_preprocessing_pipeline(**best_preprocessing_hyperparams), 88 | model_fn(**best_model_hyperparams) 89 | ) 90 | 91 | experiment.add_tag('hyper-parameter-tuning') 92 | 93 | # train the model 94 | logger.info('Fitting model with default hyperparameters') 95 | pipeline.fit(X_train, y_train) 96 | 97 | # compute test MAE 98 | predictions = pipeline.predict(X_test) 99 | test_error = mean_absolute_error(y_test, predictions) 100 | logger.info(f'Test MAE: {test_error}') 101 | experiment.log_metrics({'Test_MAE': test_error}) 102 | 103 | # save the model to disk 104 | logger.info('Saving model to disk') 105 | with open(MODELS_DIR / 'model.pkl', "wb") as f: 106 | pickle.dump(pipeline, f) 107 | 108 | # log model artifact 109 | # experiment.log_model('eth-eur-1h-price-predictor', str(MODELS_DIR / 'model.pkl')) 110 | experiment.log_model(str(model_fn), str(MODELS_DIR / 'model.pkl')) 111 | 112 | # breakpoint() 113 | 114 | # log model to the registry 115 | # experiment.register_model('eth-eur-1h-price-predictor') 116 | 117 | 118 | 119 | if __name__ == '__main__': 120 | 121 | from argparse import ArgumentParser 122 | parser = ArgumentParser() 123 | parser.add_argument('--model', type=str, default='lasso') 124 | parser.add_argument('--tune-hyperparams', action='store_true') 125 | parser.add_argument('--sample-size', type=int, default=None) 126 | parser.add_argument('--hyperparam-trials', type=int, default=10) 127 | args = parser.parse_args() 128 | 129 | logger.info('Generating features and targets') 130 | features, target = transform_ts_data_into_features_and_target() 131 | 132 | if args.sample_size is not None: 133 | # reduce input size to speed up training 134 | features = features.head(args.sample_size) 135 | target = target.head(args.sample_size) 136 | 137 | logger.info('Training model') 138 | train(features, target, 139 | model=args.model, 140 | tune_hyperparams=args.tune_hyperparams, 141 | hyperparam_trials=args.hyperparam_trials 142 | ) 
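One loose end worth noting here: for `load_production_model_from_registry` (and therefore `make deploy` and the webhook flow of lecture 3) to find anything, the pickle logged by `train.py` still has to be registered in the Comet ML Model Registry and promoted to `Production` status; the corresponding `register_model` call is left commented out above. A minimal sketch of doing it explicitly, assuming the registry name matches your `COMET_ML_MODEL_NAME` secret (the name used here is hypothetical):

```python
# continuation of train() in src/train.py, right after the pickle is saved
model_name = "eth-price-1-hour-predictor"  # hypothetical; must match COMET_ML_MODEL_NAME

experiment.log_model(model_name, str(MODELS_DIR / "model.pkl"))
experiment.register_model(model_name)

# Promote the registered version to "Production" from the Comet ML Model Registry UI;
# that status is what load_production_model_from_registry() filters on, and the
# registry is also where the lecture 3 webhook gets attached.
```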
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/tests/__init__.py --------------------------------------------------------------------------------