├── .github └── workflows │ └── deploy_api.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── images ├── 02_video_cover.png ├── 03_video_cover.png ├── 04_video_cover.png ├── 05_video_cover.png ├── header.jpg ├── lecture_1.gif ├── lecture_2.gif ├── lecture_3.gif ├── logo_realworldml.png ├── video_1.png └── video_thumbnail.png ├── lectures ├── 01_model_training.md ├── 02_model_deployment.md └── 03_continuous_deployment_with_webhooks.md ├── poetry.lock ├── pyproject.toml ├── set_environment_variables_template.sh ├── src ├── __init__.py ├── baseline_model.py ├── config.py ├── data.py ├── deploy.py ├── hyperparams.py ├── logger.py ├── model_registry_api.py ├── paths.py ├── predict.py ├── preprocessing.py ├── test_endpoint.py └── train.py └── tests └── __init__.py /.github/workflows/deploy_api.yml: -------------------------------------------------------------------------------- 1 | name: Deploy REST API to Production 2 | run-name: Deployment run - ${{ github.actor }} 3 | on: 4 | repository_dispatch: 5 | types: [webhook_Production] 6 | 7 | workflow_dispatch: 8 | 9 | env: 10 | PYTHON_VERSION: 3.9 11 | POETRY_VERSION: 1.5.1 12 | POETRY_URL: https://install.python-poetry.org 13 | 14 | jobs: 15 | deploy-rest-api-prod: 16 | runs-on: ubuntu-latest 17 | steps: 18 | 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | 22 | # Poetry cache depends on OS, Python version and Poetry version. 23 | - name: Cache Poetry cache 24 | uses: actions/cache@v3 25 | 26 | with: 27 | path: ~/.cache/pypoetry 28 | key: poetry-cache-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ env.POETRY_VERSION }} 29 | 30 | # virtualenv cache should depends on OS, Python version and `poetry.lock` (and optionally workflow files). 31 | - name: Cache Packages 32 | uses: actions/cache@v3 33 | with: 34 | path: ~/.local 35 | key: poetry-${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('**/poetry.lock') }}-${{ hashFiles('.github/workflows/*.yml') }} 36 | 37 | - name: Set up Python ${{ env.PYTHON_VERSION }} 38 | uses: actions/setup-python@v3 39 | with: 40 | python-version: ${{ env.PYTHON_VERSION }} 41 | 42 | - name: Install Poetry 43 | run: | 44 | curl -sSL ${{ env.POETRY_URL }} | python - --version ${{ env.POETRY_VERSION }} 45 | echo "$HOME/.local/bin" >> $GITHUB_PATH 46 | 47 | - name: Install Dependencies 48 | run: poetry install 49 | 50 | - name: Deploy endpoint to Cerebrium 51 | env: 52 | COMET_ML_API_KEY: ${{ secrets.COMET_ML_API_KEY }} 53 | COMET_ML_WORKSPACE: ${{ secrets.COMET_ML_WORKSPACE }} 54 | COMET_ML_MODEL_NAME: ${{ secrets.COMET_ML_MODEL_NAME }} 55 | CEREBRIUM_API_KEY: ${{ secrets.CEREBRIUM_API_KEY }} 56 | run: make deploy 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # prefect artifacts 2 | .prefectignore 3 | 4 | # python artifacts 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | *.egg-info/ 9 | *.egg 10 | 11 | # Type checking artifacts 12 | .mypy_cache/ 13 | .dmypy.json 14 | dmypy.json 15 | .pyre/ 16 | 17 | # IPython 18 | profile_default/ 19 | ipython_config.py 20 | *.ipynb_checkpoints/* 21 | 22 | # Environments 23 | .python-version 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | 29 | # MacOS 30 | .DS_Store 31 | 32 | # Dask 33 | dask-worker-space/ 34 | 35 | # Editors 36 | .idea/ 37 | .vscode/ 38 | 39 | # VCS 40 | .git/ 41 | .hg/ 42 | 43 | # data file 44 | *.parquet 45 | *.csv 46 | 47 | deployment_dir/ 48 | *.pkl 49 | set_environment_variables.sh 50 | .links 51 | 
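A note on the two triggers declared in `.github/workflows/deploy_api.yml` above: `workflow_dispatch` lets you launch the deployment manually from the repository's Actions tab, while `repository_dispatch` with event type `webhook_Production` is the hook that the Comet ML Model Registry webhook from lecture 3 will call. As a minimal sketch (assuming a hypothetical `OWNER/REPO` pair pointing at your fork and a personal access token with `repo` scope), anything able to make this HTTP call can fire that event:

```python
import os

import requests

# Hypothetical placeholders: point these at your own fork of this repository.
OWNER, REPO = "your-github-username", "your-fork-of-this-repo"
TOKEN = os.environ["GITHUB_TOKEN"]  # personal access token with `repo` scope

response = requests.post(
    f"https://api.github.com/repos/{OWNER}/{REPO}/dispatches",
    headers={
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {TOKEN}",
    },
    # event_type must match the `types` filter under `repository_dispatch` in deploy_api.yml
    json={"event_type": "webhook_Production"},
)
print(response.status_code)  # GitHub answers 204 No Content when the event is accepted
```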
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Pau Labarta Bajo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: init data baseline train deploy prepare-deployment test-endpoint 2 | 3 | DEPLOYMENT_DIR = deployment_dir 4 | 5 | init: 6 | curl -sSL https://install.python-poetry.org | python3 - 7 | poetry install 8 | 9 | data: 10 | poetry run python src/data.py 11 | 12 | baseline: 13 | poetry run python src/baseline_model.py 14 | 15 | train: 16 | poetry run python src/train.py 17 | 18 | prepare-deployment: 19 | rm -rf $(DEPLOYMENT_DIR) && mkdir $(DEPLOYMENT_DIR) 20 | poetry export -f requirements.txt --output $(DEPLOYMENT_DIR)/requirements.txt --without-hashes 21 | cp -r src/predict.py $(DEPLOYMENT_DIR)/main.py 22 | cp -r src $(DEPLOYMENT_DIR)/src/ 23 | # pip install cerebrium --upgrade # otherwise cerebrium deploy might fail 24 | 25 | deploy: prepare-deployment 26 | cd $(DEPLOYMENT_DIR) && poetry run cerebrium deploy --api-key $(CEREBRIUM_API_KEY) --hardware CPU eth-price-1-hour-predictor 27 | 28 | test-endpoint: 29 | poetry run python src/test_endpoint.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

Train and Deploy a Serverless API to predict crypto prices

4 | 5 |
6 | 7 |
8 | 9 | 12 | 13 | #### Contents 14 | * [The problem](#the-problem) 15 | * [This is what you will learn](#this-is-what-you-will-learn) 16 | * [Tools](#tools) 17 | * [Run the whole thing in 5 minutes](#run-the-whole-thing-in-5-minutes) 18 | * [Part 1. Model training 🏋️](#1-model-training) 19 | * [Step 1. Create your virtual environment](./lectures/01_model_training.md#1-create-the-virtual-environment-with-poetry) 20 | * [Step 2. Generate training data](./lectures/01_model_training.md#2-generate-training-data) 21 | * [Step 3. Build a baseline model](./lectures/01_model_training.md#3-build-a-baseline-model) 22 | * [Step 4. Build Machine Learning models](./lectures/01_model_training.md#4-build-machine-learning-models) 23 | * [Part 2. Model deployment as REST API 🚀](#2-model-deployment-as-rest-api) 24 | * [Step 5. Deploy the model as a Serverless REST API](./lectures/02_model_deployment.md#5-deploy-the-model-as-a-serverless-rest-api) 25 | * [Part 3. Automation with GitHub actions and the Model Registry 🪝](#3-automatic-deployments-with-github-actions-and-model-registry-webhooks) 26 | * [Wanna learn more real-time ML?](#wanna-learn-more-real-time-ml) 27 | 28 | ## The problem 29 | 30 | Predicting crypto price movements is extremely hard. But it is also a great field to show what Machine Learning has to offer. 31 | 32 | In this tutorial you won't build an ML system that will make you rich. But you will master the MLOps frameworks and tools you need to build ML systems that, together with tons of experimentation, can take you there. 33 | 34 | With this hands-on tutorial, I want to help you grow as an ML engineer and go beyond notebooks. 35 | 36 |
37 | 38 | ## This is what you will learn 39 | 40 | You will learn to 41 | - **train** an ML model prototype through careful experimentation, using [CometML](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github). 42 | - **deploy** the model as a REST API, with [Cerebrium](https://www.cerebrium.ai?utm_source=pau&utm_medium=partner&utm_content=github). 43 | - **automate** safe deployments, using GitHub actions and Comet ML Model Registry. 44 | 45 | Without further ado, let's get to work! 46 | 47 |
48 | 49 | ## Tools 50 | We will use a 100% Serverless stack, so you don't need to set up and maintain infrastructure 51 | 52 | * [CometML](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github) as experiment tracker and model registry 53 | * [Cerebrium](https://www.cerebrium.ai?utm_source=pau&utm_medium=partner&utm_content=github) as a deployment platform 54 | * [GitHub actions](https://github.com/features/actions) to automate workflows. 55 | 56 |
57 | 58 | ## Run the whole thing in 5 minutes 59 | 60 | If you only have 5 minutes and want to see the whole system in action, follow these steps: 61 | 62 | 1. Create a Python virtual environment with all project dependencies with 63 | 64 | ``` 65 | $ make init 66 | ``` 67 | 68 | 69 | 2. Set your API keys for [Comet ML](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github) and [Cerebrium](https://www.cerebrium.ai?utm_source=pau&utm_medium=partner&utm_content=github) in `set_environment_variables_template.sh`, rename the file to `set_environment_variables.sh`, and run it 70 | ``` 71 | $ . ./set_environment_variables.sh 72 | ``` 73 | 74 | 3. Download historical data from Coinbase and save it locally to disk 75 | ``` 76 | $ make data 77 | ``` 78 | 79 | 4. Train the ML model 80 | ``` 81 | $ make train 82 | ``` 83 | 84 | 5. Deploy the model 85 | ``` 86 | $ make deploy 87 | ``` 88 | 89 | 6. Take the endpoint URL you get from Cerebrium in the previous step, and set the `CEREBRIUM_ENDPOINT_URL` variable in `set_environment_variables.sh`. Then re-run 90 | ``` 91 | $ . ./set_environment_variables.sh 92 | ``` 93 | 94 | 7. Test that the endpoint works 95 | ``` 96 | $ make test-endpoint 97 | ``` 98 |
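If you prefer to call the endpoint yourself rather than running `make test-endpoint`, the request is a plain JSON POST carrying the last 24 hourly prices. A minimal sketch (based on `src/test_endpoint.py`; depending on your Cerebrium settings you may also need an `Authorization` header with your public API key, and the exact response wrapping may differ):

```python
import os

import requests

# the endpoint URL printed by Cerebrium after `make deploy` (step 6 above)
url = os.environ["CEREBRIUM_ENDPOINT_URL"]

# 24 hourly close prices, keyed exactly as the model expects (see src/predict.py)
payload = {f"price_{i}_hour_ago": 46700.0 for i in range(24, 0, -1)}

response = requests.post(url, json=payload, timeout=30)
print(response.status_code, response.text)
```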
99 | 100 | ## Lectures 101 | 102 | ### 1. Model training 103 | 104 | In this first lecture you will 105 | 106 | - fetch raw data 107 | - transform it into features and targets (see the sketch below) 108 | - build a baseline model 109 | - experiment with several ML models in a fast and reliable way, using Python scripts and [Comet ML experiment tracking](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github) 110 | 111 | In this lecture you won't train an ML model that will make you rich. But you will master the framework and skillset you need if you want to build ML models that, together with tons of experimentation, can take you there. 112 | 113 | Ready to get your hands dirty? **[Start training models 👩‍💻👨🏽‍💻 🏋️](./lectures/01_model_training.md)** 114 | 115 |
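To make the features-and-targets step concrete: `src/preprocessing.py` slides a window over the hourly candles so that each training example holds the previous 24 close prices and the target is the close one hour later. A simplified sketch of that transformation (the real code also pre-computes cutoff indices and keeps the timestamps):

```python
import pandas as pd

def make_features_and_target(closes: pd.Series, input_seq_len: int = 24):
    """Slide a window over hourly close prices: 24 past prices -> next-hour target."""
    rows, targets = [], []
    for end in range(input_seq_len, len(closes)):
        rows.append(closes.iloc[end - input_seq_len:end].to_list())
        targets.append(closes.iloc[end])
    features = pd.DataFrame(
        rows,
        columns=[f"price_{input_seq_len - i}_hour_ago" for i in range(input_seq_len)],
    )
    return features, pd.Series(targets, name="target_price_next_hour")
```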
116 | 117 | ### 2. Model deployment as REST API 118 | 119 | In this second lecture you will 120 | 121 | - deploy the best ML model you found in lecture 1 as a REST API using [Cerebrium](https://www.cerebrium.ai?utm_source=pau&utm_medium=partner&utm_content=github) 122 | 123 | - test that the endpoint works. 124 | 125 | Ready to deploy? **[Start deploying models 👩‍💻👨🏽‍💻 🚀](./lectures/02_model_deployment.md)** 126 | 127 |
128 | 129 | ### 3. Automatic deployments with GitHub actions and Model Registry webhooks 130 | 131 | ML models often need to be re-trained to keep them performant. Hence, automating safe deployments is a must. 132 | 133 | In this third and final lecture you will 134 | 135 | - build a continuous deployment pipeline using GitHub actions 136 | - create a webhook to trigger deployments from the Model Registry. 137 | 138 | Ready for the final round? **[Start automating safe deployments 👩‍💻👨🏽‍💻 🪝](./lectures/03_continuous_deployment_with_webhooks.md)** 139 | 140 |
141 | 142 | ## Wanna learn more Real-Time ML? 143 | 144 | Wanna learn to build a complete ML system that 145 | 146 | - ingests real-time crypto data 147 | - trains predictive ML models, and 148 | - continuously deploys them 149 | 150 | using MLOps best practices? 151 | 152 | I am preparing a new hands-on tutorial where you will learn all this. 153 | 154 | **[Subscribe to The Real-World ML Newsletter](https://paulabartabajo.substack.com/)** to be notified when the tutorial is out. 155 | 156 |
157 | Let's connect 🤗 158 |
159 | Twitter • 160 | LinkedIn • 161 | Newsletter 162 |
163 |
164 | 165 |
166 | 167 |
168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /images/02_video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/02_video_cover.png -------------------------------------------------------------------------------- /images/03_video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/03_video_cover.png -------------------------------------------------------------------------------- /images/04_video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/04_video_cover.png -------------------------------------------------------------------------------- /images/05_video_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/05_video_cover.png -------------------------------------------------------------------------------- /images/header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/header.jpg -------------------------------------------------------------------------------- /images/lecture_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/lecture_1.gif -------------------------------------------------------------------------------- /images/lecture_2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/lecture_2.gif -------------------------------------------------------------------------------- /images/lecture_3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/lecture_3.gif -------------------------------------------------------------------------------- /images/logo_realworldml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/logo_realworldml.png -------------------------------------------------------------------------------- /images/video_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/video_1.png -------------------------------------------------------------------------------- /images/video_thumbnail.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/images/video_thumbnail.png -------------------------------------------------------------------------------- /lectures/01_model_training.md: -------------------------------------------------------------------------------- 1 |
2 |

Lecture 1

3 |

Model training and experimentation with Comet ML

4 |
5 | 6 |
7 | 8 |

9 | 10 | 11 | 12 |

13 | 14 | #### Steps 15 | 16 | 1. [Create the virtual environment with Poetry](#1-create-the-virtual-environment-with-poetry) 17 | 2. [Generate training data](#2-generate-training-data) 18 | 3. [Build a baseline model](#3-build-a-baseline-model) 19 | 4. [Build Machine Learning models](#4-build-machine-learning-models) 20 | 21 | 22 | 23 | ## 1. Create the virtual environment with [Poetry](https://python-poetry.org/docs/) 24 | 25 |
26 | 27 |

Watch the video 🎬

28 | Create virtual environment with Poetry 29 |
30 |
31 | 32 |
33 | 34 | 1. Create a Python virtual environment with all project dependencies with 35 | ``` 36 | $ curl -sSL https://install.python-poetry.org | python3 - 37 | $ poetry install 38 | ``` 39 | or simply use the `Makefile` 40 | ``` 41 | $ make init 42 | ``` 43 | 44 | 2. Activate the virtual environment you just created 45 | ``` 46 | $ poetry shell 47 | ``` 48 | 49 | 3. [Sign up for Comet ML for FREE](https://www.comet.com/signup?utm_source=pau&utm_medium=partner&utm_content=github), create a workspace and copy your API key from the dashboard. 50 | 51 | 4. Set your API key and workspace name variables in `set_environment_variables_template.sh`, rename the file to `set_environment_variables.sh`, and run it 52 | ``` 53 | $ . ./set_environment_variables.sh 54 | ``` 55 | 56 | Your local development environment is ready. Let's now generate some training data. 57 | 58 | ## 2. Generate training data 59 | 60 |
61 | 62 |

Watch the video 🎬

63 | Generate training data 64 |
65 |
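The download step below is handled by `src/data.py`, which asks Coinbase's public Exchange API for one day of hourly OHLC candles at a time and caches each day as a parquet file under `data/downloads/`. Roughly, every daily request the script makes boils down to this (simplified sketch; `granularity=3600` means one candle per hour):

```python
import pandas as pd
import requests

product_id, day = "BTC-USD", "2023-01-01"  # same defaults/format as src/data.py

url = (
    f"https://api.exchange.coinbase.com/products/{product_id}/candles"
    f"?start={day}T00:00:00&end=2023-01-02T00:00:00&granularity=3600"
)
candles = requests.get(url).json()

# Coinbase returns [time, low, high, open, close, volume] for each candle
df = pd.DataFrame(candles, columns=["time", "low", "high", "open", "close", "volume"])
print(df.head())
```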
66 | 67 | 68 | Download historical data from Coinbase and save it locally to disk 69 | 70 | - Run either `$ python src/data.py`, or 71 | - Simply `$ make data` 72 | 73 | ## 3. Build a baseline model 74 | 75 |
76 | 77 |

Watch the video 🎬

78 | Create a simple, yet powerful, baseline model 79 |
80 |
81 | 82 | 83 | - Establish a baseline performance using a very dummy (yet powerful) baseline model 84 | ``` 85 | $ python src/baseline_model.py 86 | ``` 87 | 88 | ## 4. Build Machine Learning models 89 | 90 |
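The baseline involves no learning at all: it predicts that the next hour's price will simply equal the last observed price, and measures the test MAE of that rule. This is the heart of `src/baseline_model.py`:

```python
from sklearn.metrics import mean_absolute_error

def get_baseline_model_error(X_test, y_test) -> float:
    """Baseline rule: the next-hour price equals the price 1 hour ago."""
    predictions = X_test["price_1_hour_ago"]
    return mean_absolute_error(y_test, predictions)
```

Any ML model trained in the next section has to beat this number to be worth keeping.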
91 | 92 |

Watch the video 🎬

93 | Create a simple, yet powerful, baseline model 94 |
95 |
96 | 97 | - Here is the list of experiments I ran 98 | ``` 99 | $ python src/baseline_model.py 100 | $ python src/train.py --model lasso 101 | $ python src/train.py --model lasso --tune-hyperparams --hyperparam-trials 3 102 | $ python src/train.py --model lightgbm 103 | $ python src/train.py --model lightgbm --tune-hyperparams --hyperparam-trials 3 104 | ``` 105 | 106 | - Feel free to try adding more features, using other technical indicators, or experiment with other ML models. 107 | 108 | 109 | ### [➡️ Go to the next lecture](../lectures/02_model_deployment.md) 110 | 111 | -------------------------------------------------------------------------------- /lectures/02_model_deployment.md: -------------------------------------------------------------------------------- 1 |
2 |

Lecture 2. Model deployment as a REST API

3 | Serverless API with Cerebrium 4 |
5 | 6 |
7 | 8 |

9 | 10 |

11 | 12 | #### Steps 13 | 5. [Deploy the model as a Serverless REST API](#5-deploy-the-model-as-a-serverless-rest-api) 14 | 6. [Test the REST API endpoint](#6-test-the-rest-api-endpoint) 15 | 16 | ## 5. Deploy the model as a Serverless REST API 17 | 18 | Let's deploy our ML model as a REST API using the Serverless platform Cerebrium. 19 | 20 | Forget about Docker, IAM roles, and EC2 instances. Serverless ML is about focusing on what differentiates your ML product, not setting up and maintaining infrastructure. 21 | 22 |
23 | 24 |

Click here to sign up for FREE and get your API key

25 |
26 |
27 | 28 |
29 | 30 | Deploy ML model as REST API using Cerebrium 31 | 32 |
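When the endpoint starts up it pulls the `Production` model from the Comet ML Model Registry, using three secrets you configure in the Cerebrium dashboard. A condensed sketch of what `src/predict.py` does (the real module also validates the payload with pydantic and falls back to environment variables when run locally):

```python
from cerebrium import get_secret  # available inside the Cerebrium runtime

from src.model_registry_api import load_production_model_from_registry

# the three secrets listed below, read once at startup
model = load_production_model_from_registry(
    workspace=get_secret("COMET_ML_WORKSPACE"),
    api_key=get_secret("COMET_ML_API_KEY"),
    model_name=get_secret("COMET_ML_MODEL_NAME"),
)

def predict(item, run_id, logger):
    """Called by Cerebrium on every request; `item` carries the 24 hourly prices."""
    import pandas as pd
    features = pd.DataFrame([item])
    return {"prediction": model.predict(features)[0]}
```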
33 | 34 | Your REST API endpoint needs to preload the ML model from the CometML Model Registry. For that, you need to set the following secrets on your Cerebrium Dashboard: 35 | - `COMET_ML_WORKSPACE` 36 | - `COMET_ML_API_KEY` 37 | - `COMET_ML_MODEL_NAME` 38 | 39 | Then run 40 | ``` 41 | $ make deploy 42 | ``` 43 | 44 | ## 6. Test the REST API endpoint 45 | 46 | [PENDING VIDEO 🎬] 47 | ``` 48 | $ make test-endpoint 49 | ``` 50 | 51 | ### [➡️ Go to the next lecture](../lectures/03_continuous_deployment_with_webhooks.md) 52 | -------------------------------------------------------------------------------- /lectures/03_continuous_deployment_with_webhooks.md: -------------------------------------------------------------------------------- 1 |
2 |

Lecture 3. Automatic deployment with Comet ML webhooks and GitHub actions

3 | Comet ML Model Registry + GitHub action 4 |
5 | 6 |
7 | 8 |

9 | 10 |

11 | 12 | #### Steps 13 | 7. [Create GitHub action to deploy the API](#7-create-githbu-action-to-deploy-the-api) 14 | 8. [Test the GitHub action works](#8-test-the-github-action-works) 15 | 9. [Create webhook to trigger the GitHub action](#9-create-webhook-to-trigger-the-github-action) 16 | 10. [Test the webhook works](#10-test-the-webhook-works) 17 | 18 | ## 7. Create GitHbu action to deploy the API 19 | [PENDING VIDEO 🎬] 20 | 21 | ## 8. Test the GitHub action works 22 | [PENDING VIDEO 🎬] 23 | 24 | ## 9. Create webhook to trigger the GitHub action 25 | [PENDING VIDEO 🎬] 26 | 27 | ## 10. Test the webhook works 28 | [PENDING VIDEO 🎬] -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "src" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Pau "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.9,<3.11" 10 | requests = "^2.30.0" 11 | pandas = "1.5.3" 12 | fire = "^0.5.0" 13 | pyarrow = "^12.0.0" 14 | ta = "^0.10.2" 15 | notebook = "^6.5.4" 16 | scikit-learn = "^1.2.2" 17 | xgboost = "^1.7.5" 18 | lightgbm = "^3.3.5" 19 | optuna = "^3.1.1" 20 | comet-ml = "3.32.6" 21 | cerebrium = "^1.1.1" 22 | torch = "1.13.1" 23 | pydantic = "^2.1.1" 24 | 25 | 26 | [build-system] 27 | requires = ["poetry-core"] 28 | build-backend = "poetry.core.masonry.api" 29 | -------------------------------------------------------------------------------- /set_environment_variables_template.sh: -------------------------------------------------------------------------------- 1 | # replace placeholders and rename this file to `set_environment_variables.sh` 2 | export COMET_ML_API_KEY="YOUR_COMET_ML_API_KEY" 3 | export COMET_ML_WORKSPACE="YOUR_COMET_ML_WORKSPACE" 4 | export COMET_ML_MODEL_NAME="YOUR_COMET_ML_MODEL_NAME" 5 | 6 | export CEREBRIUM_API_KEY="YOUR_CEREBRIUM_API_KEY" 7 | 8 | # you set this after deploying the API for the first time 9 | export CEREBRIUM_ENDPOINT_URL="YOUR_CEREBRIUM_ENDPOINT_URL" -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/src/__init__.py -------------------------------------------------------------------------------- /src/baseline_model.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union, Optional, Callable 2 | import os 3 | 4 | import pandas as pd 5 | from comet_ml import Experiment 6 | from sklearn.metrics import mean_absolute_error 7 | from sklearn.linear_model import Lasso 8 | 9 | 10 | from src.preprocessing import transform_ts_data_into_features_and_target 11 | from src.logger import get_console_logger 12 | 13 | logger = get_console_logger() 14 | 15 | 16 | def get_baseline_model_error(X_test: pd.DataFrame, y_test: pd.Series) -> float: 17 | """Returns the baseline model error.""" 18 | predictions = X_test['price_1_hour_ago'] 19 | return mean_absolute_error(y_test, predictions) 20 | 21 | 22 | def train( 23 | X: pd.DataFrame, 24 | y: pd.Series, 25 | ) -> None: 26 | """ 27 | Train a boosting tree model using the input features `X` and targets `y`, 28 | possibly running hyperparameter tuning. 
29 | """ 30 | experiment = Experiment( 31 | api_key = os.environ["COMET_ML_API_KEY"], 32 | workspace=os.environ["COMET_ML_WORKSPACE"], 33 | project_name = "hands-on-train-and-deploy-tutorial", 34 | ) 35 | experiment.add_tag('baseline_model') 36 | 37 | # split the data into train and test 38 | train_sample_size = int(0.9 * len(X)) 39 | X_train, X_test = X[:train_sample_size], X[train_sample_size:] 40 | y_train, y_test = y[:train_sample_size], y[train_sample_size:] 41 | logger.info(f'Train sample size: {len(X_train)}') 42 | logger.info(f'Test sample size: {len(X_test)}') 43 | 44 | # baseline model performance 45 | baseline_mae = get_baseline_model_error(X_test, y_test) 46 | logger.info(f'Test MAE: {baseline_mae}') 47 | experiment.log_metrics({'Test_MAE': baseline_mae}) 48 | 49 | 50 | if __name__ == '__main__': 51 | 52 | logger.info('Generating features and targets') 53 | features, target = transform_ts_data_into_features_and_target() 54 | 55 | logger.info('Starting training') 56 | train(features, target) -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | WINDOW_SIZE = 24 -------------------------------------------------------------------------------- /src/data.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import requests 6 | import fire 7 | 8 | from src.paths import DATA_DIR 9 | from src.logger import get_console_logger 10 | 11 | logger = get_console_logger(name='dataset_generation') 12 | 13 | def download_ohlc_data_from_coinbase( 14 | product_id: Optional[str] = "BTC-USD", 15 | from_day: Optional[str] = "2022-01-01", 16 | to_day: Optional[str] = "2023-06-01", 17 | ) -> Path: 18 | """ 19 | Downloads historical candles from Coinbase API and saves data to disk 20 | Reference: https://docs.cloud.coinbase.com/exchange/reference/exchangerestapi_getproductcandles 21 | """ 22 | # create list of days as strings 23 | days = pd.date_range(start=from_day, end=to_day, freq="1D") 24 | days = [day.strftime("%Y-%m-%d") for day in days] 25 | 26 | # create empty dataframe 27 | data = pd.DataFrame() 28 | 29 | # create download dir folder if it doesn't exist 30 | if not (DATA_DIR / 'downloads').exists(): 31 | logger.info('Create directory for downloads') 32 | (DATA_DIR / 'downloads').mkdir(parents=True) 33 | 34 | for day in days: 35 | 36 | # download file if it doesn't exist 37 | file_name = DATA_DIR / 'downloads' / f'{day}.parquet' 38 | if file_name.exists(): 39 | logger.info(f'File {file_name} already exists, skipping') 40 | data_one_day = pd.read_parquet(file_name) 41 | else: 42 | logger.info(f'Downloading data for {day}') 43 | data_one_day = download_data_for_one_day(product_id, day) 44 | data_one_day.to_parquet(file_name, index=False) 45 | 46 | # combine today's file with the rest of the data 47 | data = pd.concat([data, data_one_day]) 48 | 49 | # save data to disk 50 | # data.to_parquet(DATA_DIR / f"ohlc_from_{from_day}_to_{to_day}.parquet", index=False) 51 | data.to_parquet(DATA_DIR / f"ohlc_data.parquet", index=False) 52 | 53 | return DATA_DIR / f"ohlc_data.parquet" 54 | 55 | def download_data_for_one_day(product_id: str, day: str) -> pd.DataFrame: 56 | """ 57 | Downloads one day of data and returns pandas Dataframe 58 | """ 59 | # create start end end date strings 60 | start = f'{day}T00:00:00' 61 | from datetime import datetime, timedelta 62 
| end = (datetime.strptime(day, "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d") 63 | end = f'{end}T00:00:00' 64 | 65 | # call API 66 | URL = f'https://api.exchange.coinbase.com/products/{product_id}/candles?start={start}&end={end}&granularity=3600' 67 | r = requests.get(URL) 68 | data = r.json() 69 | 70 | # transform list of lists to pandas dataframe and return 71 | return pd.DataFrame(data, columns=['time', 'low', 'high', 'open', 'close', 'volume']) 72 | 73 | if __name__== '__main__': 74 | fire.Fire(download_ohlc_data_from_coinbase) 75 | 76 | -------------------------------------------------------------------------------- /src/deploy.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import os 3 | 4 | import fire 5 | from cerebrium import deploy, model_type 6 | 7 | from src.logger import get_console_logger 8 | from src.paths import MODELS_DIR 9 | 10 | logger = get_console_logger(name='model_deployment') 11 | 12 | try: 13 | CEREBRIUM_API_KEY = os.environ['CEREBRIUM_API_KEY'] 14 | except KeyError: 15 | logger.error('CEREBRIUM_API_KEY environment variable not set.') 16 | raise 17 | 18 | def deploy( 19 | local_pickle: Optional[str] = None, 20 | from_model_registry: bool = False, 21 | ): 22 | """""" 23 | logger.info('Deploying model...') 24 | 25 | if from_model_registry: 26 | logger.info('Loading model from model registry...') 27 | raise NotImplementedError('TODO') 28 | 29 | elif local_pickle: 30 | logger.info('Deploying model from local pickle...') 31 | model_pickle_file = MODELS_DIR / local_pickle 32 | # TODO: not working. I am just following the docs here 33 | # https://docs.cerebrium.ai/quickstarts/scikit 34 | endpoint = deploy((model_type.SKLEARN, model_pickle_file), "sk-test-model" , CEREBRIUM_API_KEY) 35 | else: 36 | raise ValueError('Must specify either --local-pickle or --from-model-registry.') 37 | 38 | logger.info('Model deployed.') 39 | 40 | if __name__ == '__main__': 41 | fire.Fire(deploy) -------------------------------------------------------------------------------- /src/hyperparams.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple, Callable, Union 2 | import os 3 | 4 | from comet_ml import Experiment 5 | import optuna 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn.model_selection import TimeSeriesSplit 9 | from sklearn.metrics import mean_absolute_error 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.linear_model import Lasso 12 | from lightgbm import LGBMRegressor 13 | 14 | # from src.model_factory import get_preprocessing_and_model_pipeline 15 | from src.preprocessing import get_preprocessing_pipeline 16 | from src.logger import get_console_logger 17 | 18 | logger = get_console_logger() 19 | 20 | def sample_hyperparams( 21 | model_fn: Callable, 22 | trial: optuna.trial.Trial, 23 | ) -> Dict[str, Union[str, int, float]]: 24 | 25 | if model_fn == Lasso: 26 | return { 27 | 'alpha': trial.suggest_float('alpha', 0.01, 1.0, log=True) 28 | } 29 | elif model_fn == LGBMRegressor: 30 | return { 31 | "metric": 'mae', 32 | "verbose": -1, 33 | "num_leaves": trial.suggest_int("num_leaves", 2, 256), 34 | "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0), 35 | "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0), 36 | "min_child_samples": trial.suggest_int("min_child_samples", 3, 100), 37 | } 38 | else: 39 | raise NotImplementedError('TODO: implement other models') 40 | 41 | 
def find_best_hyperparams( 42 | model_fn: Callable, 43 | hyperparam_trials: int, 44 | X: pd.DataFrame, 45 | y: pd.Series, 46 | experiment: Experiment, 47 | ) -> Tuple[Dict, Dict]: 48 | """""" 49 | assert model_fn in {Lasso, LGBMRegressor} 50 | 51 | def objective(trial: optuna.trial.Trial) -> float: 52 | """ 53 | Error function we want to minimize (or maximize) using hyperparameter tuning. 54 | """ 55 | # sample hyper-parameters 56 | preprocessing_hyperparams = { 57 | 'pp_rsi_window': trial.suggest_int('pp_rsi_window', 5, 20), 58 | } 59 | model_hyperparams = sample_hyperparams(model_fn, trial) 60 | 61 | # evaluate the model using TimeSeriesSplit cross-validation 62 | tss = TimeSeriesSplit(n_splits=3) 63 | scores = [] 64 | logger.info(f'{trial.number=}') 65 | for split_number, (train_index, val_index) in enumerate(tss.split(X)): 66 | 67 | # split data for training and validation 68 | X_train, X_val = X.iloc[train_index], X.iloc[val_index] 69 | y_train, y_val = y.iloc[train_index], y.iloc[val_index] 70 | 71 | logger.info(f'{split_number=}') 72 | logger.info(f'{len(X_train)=}') 73 | logger.info(f'{len(X_val)=}') 74 | 75 | # train the model 76 | pipeline = make_pipeline( 77 | get_preprocessing_pipeline(**preprocessing_hyperparams), 78 | model_fn(**model_hyperparams) 79 | ) 80 | pipeline.fit(X_train, y_train) 81 | 82 | # evaluate the model 83 | y_pred = pipeline.predict(X_val) 84 | mae = mean_absolute_error(y_val, y_pred) 85 | scores.append(mae) 86 | 87 | logger.info(f'{mae=}') 88 | 89 | score = np.array(scores).mean() 90 | 91 | # Return the mean score 92 | return score 93 | 94 | logger.info('Starting hyper-parameter search...') 95 | study = optuna.create_study(direction="minimize") 96 | study.optimize(objective, n_trials=hyperparam_trials) 97 | 98 | # Get the best hyperparameters and their values 99 | best_params = study.best_params 100 | best_value = study.best_value 101 | 102 | # split best_params into preprocessing and model hyper-parameters 103 | best_preprocessing_hyperparams = \ 104 | {key: value for key, value in best_params.items() 105 | if key.startswith('pp_')} 106 | 107 | best_model_hyperparams = { 108 | key: value for key, value in best_params.items() 109 | if not key.startswith('pp_')} 110 | 111 | logger.info("Best Parameters:") 112 | for key, value in best_params.items(): 113 | logger.info(f"{key}: {value}") 114 | logger.info(f"Best MAE: {best_value}") 115 | 116 | experiment.log_metric('Cross_validation_MAE', best_value) 117 | 118 | return best_preprocessing_hyperparams, best_model_hyperparams 119 | 120 | -------------------------------------------------------------------------------- /src/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional 3 | 4 | def get_console_logger(name: Optional[str] = 'tutorial') -> logging.Logger: 5 | 6 | # Create logger if it doesn't exist 7 | logger = logging.getLogger(name) 8 | if not logger.handlers: 9 | logger.setLevel(logging.DEBUG) 10 | 11 | # Create console handler with formatting 12 | console_handler = logging.StreamHandler() 13 | console_handler.setLevel(logging.DEBUG) 14 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 15 | console_handler.setFormatter(formatter) 16 | 17 | # Add console handler to the logger 18 | logger.addHandler(console_handler) 19 | 20 | return logger -------------------------------------------------------------------------------- /src/model_registry_api.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from comet_ml import API 4 | from sklearn.pipeline import Pipeline 5 | 6 | from src.logger import get_console_logger 7 | 8 | logger = get_console_logger() 9 | 10 | def load_production_model_from_registry( 11 | workspace: str, 12 | api_key: str, 13 | model_name: str, 14 | status: str = 'Production', 15 | ) -> Pipeline: 16 | """Loads the production model from the remote model registry""" 17 | 18 | # find model version to deploy 19 | api = API(api_key) 20 | model_details = api.get_registry_model_details(workspace, model_name)['versions'] 21 | model_versions = [md['version'] for md in model_details if md['status'] == status] 22 | if len(model_versions) == 0: 23 | logger.error('No production model found') 24 | raise ValueError('No production model found') 25 | else: 26 | logger.info(f'Found {status} model versions: {model_versions}') 27 | model_version = model_versions[0] 28 | 29 | # download model from comet ml registry to local file 30 | api.download_registry_model( 31 | workspace, 32 | registry_name=model_name, 33 | version=model_version, 34 | output_path='./', 35 | expand=True 36 | ) 37 | 38 | # load model from local file to memory 39 | with open('./model.pkl', "rb") as f: 40 | model = pickle.load(f) 41 | 42 | return model -------------------------------------------------------------------------------- /src/paths.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | PARENT_DIR = Path(__file__).parent.resolve().parent 5 | DATA_DIR = PARENT_DIR / 'data' 6 | MODELS_DIR = PARENT_DIR / 'models' 7 | 8 | if not Path(DATA_DIR).exists(): 9 | os.mkdir(DATA_DIR) 10 | 11 | if not Path(MODELS_DIR).exists(): 12 | os.mkdir(MODELS_DIR) -------------------------------------------------------------------------------- /src/predict.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | from src.model_registry_api import load_production_model_from_registry 4 | from src.logger import get_console_logger 5 | 6 | logger = get_console_logger('deployer') 7 | 8 | try: 9 | # this code works when running on Cerebrium 10 | from cerebrium import get_secret 11 | COMET_ML_WORKSPACE = get_secret("COMET_ML_WORKSPACE") 12 | COMET_ML_API_KEY = get_secret("COMET_ML_API_KEY") 13 | COMET_ML_MODEL_NAME = get_secret("COMET_ML_MODEL_NAME") 14 | 15 | except ImportError: 16 | # this code works when running locally 17 | import os 18 | COMET_ML_WORKSPACE = os.environ['COMET_ML_WORKSPACE'] 19 | COMET_ML_API_KEY = os.environ['COMET_ML_API_KEY'] 20 | COMET_ML_MODEL_NAME = os.environ['COMET_ML_MODEL_NAME'] 21 | 22 | model = load_production_model_from_registry( 23 | workspace=COMET_ML_WORKSPACE, 24 | api_key=COMET_ML_API_KEY, 25 | model_name=COMET_ML_MODEL_NAME, 26 | ) 27 | 28 | class Item(BaseModel): 29 | price_24_hour_ago: float 30 | price_23_hour_ago: float 31 | price_22_hour_ago: float 32 | price_21_hour_ago: float 33 | price_20_hour_ago: float 34 | price_19_hour_ago: float 35 | price_18_hour_ago: float 36 | price_17_hour_ago: float 37 | price_16_hour_ago: float 38 | price_15_hour_ago: float 39 | price_14_hour_ago: float 40 | price_13_hour_ago: float 41 | price_12_hour_ago: float 42 | price_11_hour_ago: float 43 | price_10_hour_ago: float 44 | price_9_hour_ago: float 45 | price_8_hour_ago: float 46 | price_7_hour_ago: float 47 | price_6_hour_ago: float 48 | price_5_hour_ago: float 
49 | price_4_hour_ago: float 50 | price_3_hour_ago: float 51 | price_2_hour_ago: float 52 | price_1_hour_ago: float 53 | 54 | def predict(item, run_id, logger): 55 | item = Item(**item) 56 | 57 | # transform item to dataframe 58 | import pandas as pd 59 | df = pd.DataFrame([item.dict()]) 60 | 61 | # predict 62 | prediction = model.predict(df)[0] 63 | 64 | return {"prediction": prediction} 65 | 66 | if __name__ == '__main__': 67 | item = { 68 | 'price_24_hour_ago': 46656.851562, 69 | 'price_23_hour_ago': 46700.535156, 70 | 'price_22_hour_ago': 46700.535156, 71 | 'price_21_hour_ago': 46700.535156, 72 | 'price_20_hour_ago': 46700.535156, 73 | 'price_19_hour_ago': 46700.535156, 74 | 'price_18_hour_ago': 46700.535156, 75 | 'price_17_hour_ago': 46700.535156, 76 | 'price_16_hour_ago': 46700.535156, 77 | 'price_15_hour_ago': 46700.535156, 78 | 'price_14_hour_ago': 46700.535156, 79 | 'price_13_hour_ago': 46700.535156, 80 | 'price_12_hour_ago': 46700.535156, 81 | 'price_11_hour_ago': 46700.535156, 82 | 'price_10_hour_ago': 46700.535156, 83 | 'price_9_hour_ago': 46700.535156, 84 | 'price_8_hour_ago': 46700.535156, 85 | 'price_7_hour_ago': 46700.535156, 86 | 'price_6_hour_ago': 46700.535156, 87 | 'price_5_hour_ago': 46700.535156, 88 | 'price_4_hour_ago': 46700.535156, 89 | 'price_3_hour_ago': 46700.535156, 90 | 'price_2_hour_ago': 46700.535156, 91 | 'price_1_hour_ago': 46700.535156 92 | } 93 | 94 | prediction = predict(item, None, None) 95 | print(f'{prediction=}') -------------------------------------------------------------------------------- /src/preprocessing.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Optional, Union 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import fire 7 | import ta 8 | from sklearn.base import BaseEstimator, TransformerMixin 9 | from sklearn.pipeline import make_pipeline, Pipeline 10 | from sklearn.preprocessing import FunctionTransformer 11 | 12 | from src.paths import DATA_DIR 13 | from src.logger import get_console_logger 14 | 15 | logger = get_console_logger() 16 | 17 | def transform_ts_data_into_features_and_target( 18 | # ts_data: pd.DataFrame, 19 | path_to_input: Optional[Path] = DATA_DIR / 'ohlc_data.parquet', 20 | input_seq_len: Optional[int] = 24, 21 | step_size: Optional[int] = 1 22 | ) -> Tuple[pd.DataFrame, pd.Series]: 23 | """ 24 | Slices and transposes data from time-series format into a (features, target) 25 | format that we can use to train Supervised ML models 26 | """ 27 | # load parquet file 28 | ts_data = pd.read_parquet(path_to_input) 29 | ts_data = ts_data[['time', 'close']] 30 | ts_data.sort_values(by=['time'], inplace=True) 31 | 32 | # output features and targets 33 | features = pd.DataFrame() 34 | targets = pd.DataFrame() 35 | 36 | # pre-compute cutoff indices to split dataframe rows 37 | indices = get_cutoff_indices_features_and_target( 38 | ts_data, 39 | input_seq_len, 40 | step_size 41 | ) 42 | 43 | # slice and transpose data into numpy arrays for features and targets 44 | n_examples = len(indices) 45 | x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32) 46 | y = np.ndarray(shape=(n_examples), dtype=np.float32) 47 | times = [] 48 | for i, idx in enumerate(indices): 49 | x[i, :] = ts_data.iloc[idx[0]:idx[1]]['close'].values 50 | y[i] = ts_data.iloc[idx[1]:idx[2]]['close'].values 51 | times.append(ts_data.iloc[idx[1]]['time']) 52 | 53 | # numpy -> pandas 54 | features = pd.DataFrame( 55 | x, 56 | 
columns=[f'price_{i+1}_hour_ago' for i in reversed(range(input_seq_len))] 57 | ) 58 | 59 | # add back column with the time 60 | # features['time'] = times 61 | 62 | # numpy -> pandas 63 | targets = pd.DataFrame(y, columns=[f'target_price_next_hour']) 64 | 65 | return features, targets['target_price_next_hour'] 66 | 67 | def get_cutoff_indices_features_and_target( 68 | data: pd.DataFrame, 69 | input_seq_len: int, 70 | step_size: int 71 | ) -> List[Tuple[int, int, int]]: 72 | 73 | stop_position = len(data) - 1 74 | 75 | # Start the first sub-sequence at index position 0 76 | subseq_first_idx = 0 77 | subseq_mid_idx = input_seq_len 78 | subseq_last_idx = input_seq_len + 1 79 | indices = [] 80 | 81 | while subseq_last_idx <= stop_position: 82 | indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx)) 83 | subseq_first_idx += step_size 84 | subseq_mid_idx += step_size 85 | subseq_last_idx += step_size 86 | 87 | return indices 88 | 89 | def get_price_columns(X: pd.DataFrame) -> List[str]: 90 | """Get the columns of the input DataFrame that contain the price data.""" 91 | return [col for col in X.columns if 'price' in col] 92 | 93 | class RSI(BaseEstimator, TransformerMixin): 94 | """ 95 | Adds RSI to the input DataFrame from the `close` prices 96 | 97 | New columns are: 98 | - 'rsi' 99 | """ 100 | def __init__(self, window: int = 14): 101 | self.window = window 102 | 103 | def fit(self, 104 | X: pd.DataFrame, 105 | y: Optional[Union[pd.DataFrame, pd.Series]] = None) -> "RSI": 106 | """In this scenario, the fit method isn't doing anything. But it must be implemented. This is a scenario of an estimator without parameters.""" 107 | return self 108 | 109 | def _add_indicator(self, row: pd.Series) -> float: 110 | return pd.Series([ta.momentum.rsi(row, window=self.window)[-1]]) 111 | 112 | def transform(self, X: pd.DataFrame) -> pd.DataFrame: 113 | """Compute the RSI and add it to the input DataFrame.""" 114 | logger.info('Adding RSI to the input DataFrame') 115 | df = X[get_price_columns(X)].apply(self._add_indicator, axis=1) 116 | df.columns = ['rsi'] 117 | X = pd.concat([X, df], axis=1) 118 | return X 119 | 120 | def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: 121 | """Inverse the log of every cell of the DataFrame.""" 122 | X.drop(columns=['rsi'], inplace=True) 123 | return X 124 | 125 | def get_price_percentage_return(X: pd.DataFrame, hours: int) -> pd.DataFrame: 126 | """Add the price return of the last `hours` to the input DataFrame.""" 127 | X[f'percentage_return_{hours}_hour'] = \ 128 | (X['price_1_hour_ago'] - X[f'price_{hours}_hour_ago'])/ X[f'price_{hours}_hour_ago'] 129 | return X 130 | 131 | def get_subset_of_features(X: pd.DataFrame) -> pd.DataFrame: 132 | return X[['price_1_hour_ago', 'percentage_return_2_hour', 'percentage_return_24_hour', 'rsi']] 133 | 134 | def get_preprocessing_pipeline( 135 | pp_rsi_window: int = 14 136 | ) -> Pipeline: 137 | """Returns the preprocessing pipeline.""" 138 | return make_pipeline( 139 | # trends 140 | FunctionTransformer(get_price_percentage_return, kw_args={'hours': 2}), 141 | FunctionTransformer(get_price_percentage_return, kw_args={'hours': 24}), 142 | 143 | # momentum 144 | RSI(pp_rsi_window), 145 | 146 | # select columns 147 | FunctionTransformer(get_subset_of_features) 148 | ) 149 | 150 | if __name__ == '__main__': 151 | 152 | features, target = fire.Fire(transform_ts_data_into_features_and_target) 153 | 154 | preprocessing_pipeline = get_preprocessing_pipeline() 155 | 156 | preprocessing_pipeline.fit(features) 157 | X 
= preprocessing_pipeline.transform(features) 158 | print(X.head()) -------------------------------------------------------------------------------- /src/test_endpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | import requests 5 | 6 | from src.logger import get_console_logger 7 | 8 | logger = get_console_logger(__name__) 9 | 10 | try: 11 | url = os.environ['CEREBRIUM_ENDPOINT_URL'] 12 | except KeyError as e: 13 | logger.error('You need to specify the env variables CEREBRIUM_ENDPOINT_URL in your set_env_variables.sh script') 14 | raise e 15 | 16 | payload = json.dumps({ 17 | 'price_24_hour_ago': 46656.851562, 18 | 'price_23_hour_ago': 46700.535156, 19 | 'price_22_hour_ago': 46700.535156, 20 | 'price_21_hour_ago': 46700.535156, 21 | 'price_20_hour_ago': 46700.535156, 22 | 'price_19_hour_ago': 46700.535156, 23 | 'price_18_hour_ago': 46700.535156, 24 | 'price_17_hour_ago': 46700.535156, 25 | 'price_16_hour_ago': 46700.535156, 26 | 'price_15_hour_ago': 46700.535156, 27 | 'price_14_hour_ago': 46700.535156, 28 | 'price_13_hour_ago': 46700.535156, 29 | 'price_12_hour_ago': 46700.535156, 30 | 'price_11_hour_ago': 46700.535156, 31 | 'price_10_hour_ago': 46700.535156, 32 | 'price_9_hour_ago': 46700.535156, 33 | 'price_8_hour_ago': 46700.535156, 34 | 'price_7_hour_ago': 46700.535156, 35 | 'price_6_hour_ago': 46700.535156, 36 | 'price_5_hour_ago': 46700.535156, 37 | 'price_4_hour_ago': 46700.535156, 38 | 'price_3_hour_ago': 46700.535156, 39 | 'price_2_hour_ago': 46700.535156, 40 | 'price_1_hour_ago': 46700.535156 41 | }) 42 | 43 | headers = { 44 | 'Authorization': 'public-6fb5d32ac9801938a17b', 45 | 'Content-Type': 'application/json' 46 | } 47 | 48 | response = requests.request("POST", url, headers=headers, data=payload) 49 | 50 | print(response.text) -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Union, Optional, Callable 2 | import os 3 | 4 | import pandas as pd 5 | from comet_ml import Experiment 6 | from sklearn.pipeline import make_pipeline, Pipeline 7 | from sklearn.metrics import mean_absolute_error 8 | from sklearn.linear_model import Lasso 9 | from xgboost import XGBRegressor 10 | from lightgbm import LGBMRegressor 11 | import pickle 12 | 13 | from src.preprocessing import ( 14 | transform_ts_data_into_features_and_target, 15 | get_preprocessing_pipeline 16 | ) 17 | from src.hyperparams import find_best_hyperparams 18 | from src.logger import get_console_logger 19 | from src.paths import MODELS_DIR 20 | # from src.model_factory import get_preprocessing_and_model_pipeline 21 | 22 | logger = get_console_logger() 23 | 24 | 25 | def get_baseline_model_error(X_test: pd.DataFrame, y_test: pd.Series) -> float: 26 | """Returns the baseline model error.""" 27 | predictions = X_test['price_1_hour_ago'] 28 | return mean_absolute_error(y_test, predictions) 29 | 30 | def get_model_fn_from_name(model_name: str) -> Callable: 31 | """Returns the model function given the model name.""" 32 | if model_name == 'lasso': 33 | return Lasso 34 | elif model_name == 'xgboost': 35 | return XGBRegressor 36 | elif model_name == 'lightgbm': 37 | return LGBMRegressor 38 | else: 39 | raise ValueError(f'Unknown model name: {model_name}') 40 | 41 | def train( 42 | X: pd.DataFrame, 43 | y: pd.Series, 44 | model: str, 45 | tune_hyperparams: Optional[bool] = False, 46 | hyperparam_trials: 
Optional[int] = 10, 47 | ) -> None: 48 | """ 49 | Train a boosting tree model using the input features `X` and targets `y`, 50 | possibly running hyperparameter tuning. 51 | """ 52 | model_fn = get_model_fn_from_name(model) 53 | 54 | experiment = Experiment( 55 | api_key = os.environ["COMET_ML_API_KEY"], 56 | workspace=os.environ["COMET_ML_WORKSPACE"], 57 | project_name = "hands-on-train-and-deploy-tutorial", 58 | ) 59 | experiment.add_tag(model) 60 | 61 | # split the data into train and test 62 | train_sample_size = int(0.9 * len(X)) 63 | X_train, X_test = X[:train_sample_size], X[train_sample_size:] 64 | y_train, y_test = y[:train_sample_size], y[train_sample_size:] 65 | logger.info(f'Train sample size: {len(X_train)}') 66 | logger.info(f'Test sample size: {len(X_test)}') 67 | 68 | if not tune_hyperparams: 69 | # create the full pipeline with default hyperparameters 70 | logger.info('Using default hyperparameters') 71 | pipeline = make_pipeline( 72 | get_preprocessing_pipeline(), 73 | model_fn() 74 | ) 75 | 76 | else: 77 | 78 | # find best hyperparameters using cross-validation 79 | logger.info('Finding best hyperparameters with cross-validation') 80 | best_preprocessing_hyperparams, best_model_hyperparams = \ 81 | find_best_hyperparams(model_fn, hyperparam_trials, X_train, y_train, 82 | experiment) 83 | logger.info(f'Best preprocessing hyperparameters: {best_preprocessing_hyperparams}') 84 | logger.info(f'Best model hyperparameters: {best_model_hyperparams}') 85 | 86 | pipeline = make_pipeline( 87 | get_preprocessing_pipeline(**best_preprocessing_hyperparams), 88 | model_fn(**best_model_hyperparams) 89 | ) 90 | 91 | experiment.add_tag('hyper-parameter-tuning') 92 | 93 | # train the model 94 | logger.info('Fitting model with default hyperparameters') 95 | pipeline.fit(X_train, y_train) 96 | 97 | # compute test MAE 98 | predictions = pipeline.predict(X_test) 99 | test_error = mean_absolute_error(y_test, predictions) 100 | logger.info(f'Test MAE: {test_error}') 101 | experiment.log_metrics({'Test_MAE': test_error}) 102 | 103 | # save the model to disk 104 | logger.info('Saving model to disk') 105 | with open(MODELS_DIR / 'model.pkl', "wb") as f: 106 | pickle.dump(pipeline, f) 107 | 108 | # log model artifact 109 | # experiment.log_model('eth-eur-1h-price-predictor', str(MODELS_DIR / 'model.pkl')) 110 | experiment.log_model(str(model_fn), str(MODELS_DIR / 'model.pkl')) 111 | 112 | # breakpoint() 113 | 114 | # log model to the registry 115 | # experiment.register_model('eth-eur-1h-price-predictor') 116 | 117 | 118 | 119 | if __name__ == '__main__': 120 | 121 | from argparse import ArgumentParser 122 | parser = ArgumentParser() 123 | parser.add_argument('--model', type=str, default='lasso') 124 | parser.add_argument('--tune-hyperparams', action='store_true') 125 | parser.add_argument('--sample-size', type=int, default=None) 126 | parser.add_argument('--hyperparam-trials', type=int, default=10) 127 | args = parser.parse_args() 128 | 129 | logger.info('Generating features and targets') 130 | features, target = transform_ts_data_into_features_and_target() 131 | 132 | if args.sample_size is not None: 133 | # reduce input size to speed up training 134 | features = features.head(args.sample_size) 135 | target = target.head(args.sample_size) 136 | 137 | logger.info('Training model') 138 | train(features, target, 139 | model=args.model, 140 | tune_hyperparams=args.tune_hyperparams, 141 | hyperparam_trials=args.hyperparam_trials 142 | ) 
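One loose end worth noting here: for `load_production_model_from_registry` (and therefore `make deploy` and the webhook flow of lecture 3) to find anything, the pickle logged by `train.py` still has to be registered in the Comet ML Model Registry and promoted to `Production` status; the corresponding `register_model` call is left commented out above. A minimal sketch of doing it explicitly, assuming the registry name matches your `COMET_ML_MODEL_NAME` secret (the name used here is hypothetical):

```python
# continuation of train() in src/train.py, right after the pickle is saved
model_name = "eth-price-1-hour-predictor"  # hypothetical; must match COMET_ML_MODEL_NAME

experiment.log_model(model_name, str(MODELS_DIR / "model.pkl"))
experiment.register_model(model_name)

# Promote the registered version to "Production" from the Comet ML Model Registry UI;
# that status is what load_production_model_from_registry() filters on, and the
# registry is also where the lecture 3 webhook gets attached.
```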
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Paulescu/hands-on-train-and-deploy-ml/0da555fcc090855896fb454be7dfc4ee74b50798/tests/__init__.py --------------------------------------------------------------------------------