├── LICENSE └── sections ├── 4. Model Management ├── 4.4_mlflow_setup_readme.md ├── 4.5_getting_started.ipynb ├── 4.6_training_loop.ipynb ├── 4.7_mlflow_inference.ipynb └── 4.8_mlflow_authentication.py ├── 5. Advanced Model Deployment Techniques ├── 5.2_batching_and_dynamic_batching.ipynb ├── 5.2_batching_and_dynamic_batching.py ├── 5.3_the_role_of_sorting_batches.ipynb ├── 5.3_the_role_of_sorting_batches.py ├── 5.4_understanding_quantization.ipynb └── 5.4_understanding_quantization.py └── 7. Scheduling and Running Jobs on a Cluster ├── level1.py ├── level2.py ├── level3.py ├── level4.py ├── level_5 ├── consume_results.py ├── produce_prompts.py ├── rabbit.py └── ray_batch_job.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /sections/4. Model Management/4.4_mlflow_setup_readme.md: -------------------------------------------------------------------------------- 1 | # Section 4 - Model Management & ML-Ops 2 | 3 | 4 | Below is a guide and Bash commands to set up MLflow on an Ubuntu system. This setup includes installing MLflow, setting up a backend store for experiments and runs, and launching the MLflow UI. 
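For orientation, here is a condensed sketch of the flow the guide walks through: create an isolated Python environment, install MLflow, and launch a tracking server backed by SQLite with a local artifact store. The version pin matches the course; the use of Python's built-in `venv` (instead of the `virtualenv` tool used in the steps below) and the exact paths are assumptions for illustration only.

```bash
# Minimal sketch, assuming a fresh Ubuntu machine and local-only access
python3 -m venv ~/mlflow_server/mlflow_env
source ~/mlflow_server/mlflow_env/bin/activate
pip install mlflow==2.7.1

# Backend store (SQLite) and artifact store live in local directories
mkdir -p ~/mlflow_server/metrics_store ~/mlflow_server/artifact_store

# Launch the tracking server; the UI becomes available at http://127.0.0.1:5000
mlflow server \
  --backend-store-uri sqlite:///$HOME/mlflow_server/metrics_store/mlflow.db \
  --default-artifact-root ~/mlflow_server/artifact_store \
  --host 127.0.0.1 --port 5000
```

The detailed steps below cover the same ground, plus the optional dedicated `mlflow` user, a MySQL backend, and remote access to the UI.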
5 | 6 | ### MLflow Setup Guide for Ubuntu 7 | 8 | #### Prerequisites: 9 | - Python 3.6 or higher 10 | - Pip (Python package installer) 11 | - Ubuntu system (or a similar Linux distribution) 12 | 13 | 14 | 15 | 16 | ### Step 1: Create custom user (optional) 17 | 18 | Setting up a custom user for MLflow and a dedicated Python environment is a good practice, especially for ensuring that the MLflow service runs securely and isolated from other system processes. Here's how you can set it up on an Ubuntu system: 19 | 20 | 1. **Create the User**: 21 | Open a terminal and run the following command to create a new user called `mlflow`. 22 | ```bash 23 | sudo adduser mlflow 24 | ``` 25 | 26 | 2. **Grant Sudo Privileges (Optional)**: 27 | If this user needs to perform administrative tasks, you can grant it sudo privileges. Otherwise, you can skip this step. 28 | ```bash 29 | sudo usermod -aG sudo mlflow 30 | ``` 31 | 32 | ### Step 2: Install Python and Create an Environment 33 | 34 | 1. **Switch to the mlflow User**: 35 | Switch to the new user account. 36 | ```bash 37 | su - mlflow 38 | ``` 39 | 40 | 2. **Install Python3 and Pip**: 41 | Ensure Python3 and Pip are installed. Most Ubuntu versions come with Python3 by default, but you might need to install pip. 42 | ```bash 43 | sudo apt update 44 | sudo apt install python3 python3-pip 45 | ``` 46 | 47 | 3. **Install Virtualenv**: 48 | Virtualenv is a tool to create isolated Python environments. 49 | ```bash 50 | pip3 install virtualenv 51 | ``` 52 | 53 | 4. **Create a Virtual Environment**: 54 | Create a new directory for the MLflow server and navigate into it. Then create a virtual environment. 55 | ```bash 56 | mkdir ~/mlflow_server 57 | cd ~/mlflow_server 58 | virtualenv mlflow_env 59 | ``` 60 | 61 | 5. **Activate the Virtual Environment**: 62 | Before installing MLflow and other dependencies, activate the virtual environment. 63 | ```bash 64 | source mlflow_env/bin/activate 65 | ``` 66 | 67 | ### Step 3: Install MLflow 68 | With the virtual environment activated, install MLflow. If you want to ensure compatibility, use the same versions I use in the course. If you'd like to use the latest, make sure to use matching versions for the other libraries. 69 | 70 | ```bash 71 | pip install mlflow==2.7.1 72 | ``` 73 | 74 | 75 | ### Step 4: Install Backend Store (Optional) 76 | MLflow uses a tracking server to log experiment data. By default, it logs to the local filesystem, but for more robust use, you may want to set up a database like MySQL or SQLite. 77 | 78 | **For SQLite (Simpler Option):** 79 | - SQLite comes pre-installed on many systems, including Ubuntu. 
80 | - Decide on a directory where you want your SQLite database to reside 81 | ```bash 82 | cd ~/mlflow_server 83 | mkdir metrics_store 84 | ``` 85 | 86 | **For MySQL:** 87 | - Install MySQL Server: 88 | ```bash 89 | sudo apt update 90 | sudo apt install mysql-server 91 | ``` 92 | - Secure your installation and set up your user (follow the prompt after the command): 93 | ```bash 94 | sudo mysql_secure_installation 95 | ``` 96 | - Log into MySQL to create a database for MLflow: 97 | ```bash 98 | sudo mysql -u root -p 99 | ``` 100 | - Once inside MySQL, create a database: 101 | ```mysql 102 | CREATE DATABASE mlflow_db; 103 | EXIT; 104 | ``` 105 | 106 | ### Step 5: Set Backend Store for MLflow 107 | - **For SQLite**, you'll use a URI like: `sqlite:////home/mlflow/mlflow_server/metrics_store/mlflow.db` 108 | - **For MySQL**, the URI will be: `mysql://<username>:<password>@localhost/mlflow_db` 109 | 110 | 111 | ### Step 6: Install Artifact Store 112 | The artifact store is where MLflow saves model artifacts like models and plots. You can use S3, Azure Blob Storage, Google Cloud Storage, or even a shared filesystem. 113 | 114 | - **For local storage (simplest for getting started)**, use a local directory. 115 | ```bash 116 | cd ~/mlflow_server 117 | mkdir artifact_store 118 | ``` 119 | 120 | 121 | 122 | 123 | ### Step 7: Launch MLflow Tracking Server 124 | Open a terminal and run the following command, replacing the URIs with your chosen backend and artifact store paths: 125 | 126 | ```bash 127 | mlflow server --backend-store-uri sqlite:////home/mlflow/mlflow_server/metrics_store/mlflow.db --default-artifact-root /home/mlflow/mlflow_server/artifact_store 128 | ``` 129 | 130 | Replace the SQLite URI with your MySQL URI if you're using MySQL, and adjust `--default-artifact-root` to the path where you want artifacts stored. 131 | 132 | ### Step 8: Accessing the MLflow UI 133 | - Once the tracking server is running, it will display a URL, typically `http://127.0.0.1:5000`. Open this URL in a web browser to access the MLflow UI. 134 | - You can now navigate the UI to see your experiments, runs, metrics, and artifacts. If the server is running on a remote machine, first forward port 5000 to your local machine over SSH (replace `remote` with your SSH host): 135 | ```bash 136 | ssh -L 5000:localhost:5000 remote 137 | ``` 138 | 139 | 140 | #### Additional Tips: 141 | - **Service**: For a more permanent setup, you might want to set up MLflow to run as a service or use a process manager like `supervisor` to manage the server process. 142 | - **Security**: If you're setting this up on a cloud server or an exposed machine, ensure you configure proper security settings, including firewalls and authentication for the MLflow server. 143 | 144 | ### Conclusion 145 | You now have MLflow set up on your Ubuntu system with a backend store for tracking experiments and an artifact store for saving model artifacts. You can start running experiments and tracking them using the MLflow Python library, and all your experiment details will be accessible through the MLflow UI. -------------------------------------------------------------------------------- /sections/4. Model Management/4.5_getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## **1. 
Setting Up MLflow**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import mlflow\n", 17 | "\n", 18 | "# Set the tracking URI for MLflow to the local server\n", 19 | "mlflow.set_tracking_uri(\"http://localhost:5000\")" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "- **What is MLflow?**: MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It includes tools for tracking experiments, packaging code into reproducible runs, and sharing and deploying models.\n", 27 | "- **Setting up MLflow**: The first step in using MLflow is to set up the tracking server, where all the experiment data will be stored. ```mlflow.set_tracking_uri(\"http://localhost:5000\")``` sets the tracking URI to a local server (running on localhost at port 5000). This means all the data from your experiments will be sent to this server for tracking and storage.\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## **2. Creating and Managing Experiments**" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Creating a new experiment\n", 44 | "experiment_id = mlflow.create_experiment(\"My New Experiment\")\n", 45 | "\n", 46 | "# Starting a new run using a context manager\n", 47 | "with mlflow.start_run(experiment_id=experiment_id):\n", 48 | "    # Your ML code goes here\n", 49 | "    pass\n", 50 | "\n", 51 | "\n", 52 | "# Manually creating a custom named run\n", 53 | "run = mlflow.start_run(experiment_id=experiment_id, run_name=\"First run\")\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "- **Creating Experiments**: `mlflow.create_experiment(\"My New Experiment\")` creates a new experiment in MLflow. An experiment is a way to organize and keep track of your machine learning runs. Each experiment contains multiple runs.\n", 61 | "- **Starting Runs**: A \"run\" is a single execution of your machine learning code. MLflow allows you to start a run using two methods:\n", 62 | "  - **Context Manager**: The `with mlflow.start_run()` syntax automatically starts and ends a run. This is useful as it ensures the run is closed properly after the code block is executed.\n", 63 | "  - **Manual Management**: You can also start and end a run manually using `mlflow.start_run()` and `mlflow.end_run()`. This method gives you more control over when the run starts and ends." 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## **3. Logging Parameters**" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Logging multiple parameters\n", 80 | "mlflow.log_param(\"learning_rate\", 0.01)\n", 81 | "mlflow.log_param(\"batch_size\", 32)\n", 82 | "num_epochs = 10\n", 83 | "mlflow.log_param(\"num_epochs\", num_epochs)\n" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "\n", 91 | "- **Purpose of Logging Parameters**: Parameters are the configuration settings used for your machine learning model. 
Logging them helps you keep track of which settings were used in each run, which is crucial for experiment reproducibility and comparison.\n", 92 | "- **How it Works**: The `mlflow.log_param` function logs parameters like learning rate, batch size, and number of epochs. These parameters are then visible in the MLflow UI, allowing you to see how different configurations affect model performance." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## **4. Logging Metrics**" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# !pip install numpy\n", 109 | "import numpy as np\n", 110 | "\n", 111 | "# Logging metrics for each epoch\n", 112 | "for epoch in range(num_epochs):\n", 113 | " mlflow.log_metric(\"accuracy\", np.random.random(), step=epoch)\n", 114 | " mlflow.log_metric(\"loss\", np.random.random(), step=epoch)\n", 115 | "\n", 116 | "# Logging a time-series metric\n", 117 | "for t in range(100):\n", 118 | " metric_value = np.sin(t * np.pi / 50)\n", 119 | " mlflow.log_metric(\"time_series_metric\", metric_value, step=t)\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "\n", 127 | "\n", 128 | "- **Metrics in Machine Learning**: Metrics are values that measure the performance of your model. Common metrics include accuracy and loss.\n", 129 | "- **Logging Metrics with MLflow**: `mlflow.log_metric` allows you to log these metrics during your training process. This is often done for each epoch (a single pass through the entire dataset), or step (a pass of a batch of data) to track how the model improves over time.\n", 130 | "- **Time-Series Metrics**: Besides typical metrics, you can also log custom metrics. In this example, a time-series metric based on a sine function is logged. This demonstrates how you can track any metric over time, which can be useful for more complex analyses.\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## **5. 
Logging Data and Artefacts**" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# Logging datasets\n", 147 | "with open(\"data/dataset.csv\", \"w\") as f:\n", 148 | " f.write(\"x,y\\n\")\n", 149 | " for x in range(100):\n", 150 | " f.write(f\"{x},{x * 2}\\n\")\n", 151 | "\n", 152 | "mlflow.log_artifact(\"data/dataset.csv\", \"data\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Exploring different types of artifacts" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# !pip install plotly pandas\n", 169 | "import pandas as pd\n", 170 | "import plotly.express as px\n", 171 | "\n", 172 | "# Generate a confusion matrix\n", 173 | "confusion_matrix = np.random.randint(0, 100, size=(5, 5)) # 5x5 matrix\n", 174 | "\n", 175 | "labels = [\"Class A\", \"Class B\", \"Class C\", \"Class D\", \"Class E\"]\n", 176 | "df_cm = pd.DataFrame(confusion_matrix, index=labels, columns=labels)\n", 177 | "\n", 178 | "# Plot confusion matrix using Plotly Express\n", 179 | "fig = px.imshow(df_cm, text_auto=True, labels=dict(x=\"Predicted Label\", y=\"True Label\"), x=labels, y=labels, title=\"Confusion Matrix\")\n", 180 | "\n", 181 | "# Save the figure as an HTML file\n", 182 | "html_file = \"confusion_matrix.html\"\n", 183 | "fig.write_html(html_file)\n", 184 | "\n", 185 | "# Log the HTML file with MLflow\n", 186 | "mlflow.log_artifact(html_file)\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "\n", 194 | "- **What are Artifacts?**: In MLflow, an artifact is any file or data that you want to log along with your run. This can include datasets, models, images, or even custom files.\n", 195 | "- **Logging Artifacts**: The `mlflow.log_artifact` function allows you to log these artifacts. In this example, a dataset and a confusion matrix (saved as an HTML file) are logged. Logging artifacts helps in ensuring that all relevant data and outputs are stored and easily accessible for each run." 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## **6. Logging Models**" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# !pip install transformers\n", 212 | "from transformers import AutoModelForSeq2SeqLM\n", 213 | "\n", 214 | "# Initialize a model from Hugging Face Transformers\n", 215 | "model = AutoModelForSeq2SeqLM.from_pretrained(\"TheFuzzyScientist/T5-base_Amazon-product-reviews\")\n", 216 | "\n", 217 | "\n", 218 | "# Log the model in MLflow\n", 219 | "mlflow.pytorch.log_model(model, \"transformer_model\")\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "- **Importance of Logging Models**: Keeping track of the models used in different runs is critical. It helps in model comparison, versioning, and deployment.\n", 227 | "- **How to Log Models**: MLflow provides functions to log models from various machine learning frameworks. In this case, `mlflow.pytorch.log_model` is used to log a PyTorch model. 
This function saves the model in a format that can be easily reloaded for future predictions or analysis.\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## **7. Ending the Run**" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# End run\n", 244 | "mlflow.end_run()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "- **What Does Ending a Run Mean?**: In MLflow, ending a run signifies the completion of a specific machine learning experiment or process. It marks the point where you have finished logging parameters, metrics, and artifacts for that particular execution of your model or script.\n", 252 | "\n", 253 | "- **Why is it Important?**: It helps in keeping your experiments organized. Each run is a separate record in MLflow. By ending a run, you ensure that all the data logged after this point will be part of a new run, keeping your experiment's data clean and segregated.\n", 254 | "\n", 255 | "- **How to End a Run**: You can end a run using `mlflow.end_run()`. This method is particularly important when you start a run without using a context manager (the `with` statement). With a context manager, the run is automatically ended when you exit the block of code inside the `with` statement. However, if you start a run manually using `mlflow.start_run()`, you should always ensure to call `mlflow.end_run()` once all logging is done.\n" 256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": "dev", 262 | "language": "python", 263 | "name": "python3" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "file_extension": ".py", 271 | "mimetype": "text/x-python", 272 | "name": "python", 273 | "nbconvert_exporter": "python", 274 | "pygments_lexer": "ipython3", 275 | "version": "3.10.12" 276 | } 277 | }, 278 | "nbformat": 4, 279 | "nbformat_minor": 2 280 | } 281 | -------------------------------------------------------------------------------- /sections/4. Model Management/4.6_training_loop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "## MLflow Integration for Model Training and Tracking\n", 9 | "\n", 10 | "In this notebook, we're integrating MLflow into a machine learning workflow to track and manage experiments effectively. 
We're focusing on a text classification task using the DistilBert model, emphasizing the importance of experiment tracking, model management, and operational efficiency - core themes of our course.\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Objective:\n", 18 | "\n", 19 | "- Dynamically set up and log parameters to MLflow\n", 20 | "- Understand the purpose and application of each step in the context of MLflow and MLOps principles\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Environment Setup\n", 28 | "\n", 29 | "Ensure all necessary libraries are installed and imported for our workflow.\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "\n", 37 | "### Imports\n", 38 | "\n", 39 | "Import the necessary libraries, focusing on MLflow for tracking, PyTorch for model training, and Transformers for our NLP model." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# !pip install datasets\n", 49 | "import os\n", 50 | "import mlflow\n", 51 | "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n", 52 | "import torch\n", 53 | "from tqdm import tqdm\n", 54 | "from torch.utils.data import DataLoader\n", 55 | "from datasets import load_dataset\n", 56 | "from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Configuration Parameters as an Object\n", 64 | "\n", 65 | "By defining parameters as a dictionary, we can easily iterate through them when logging to MLflow. This method streamlines the process and adheres to best practices in code maintainability and scalability.\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "params = {\n", 75 | " 'model_name': 'distilbert-base-cased',\n", 76 | " 'learning_rate': 5e-5,\n", 77 | " 'batch_size': 16,\n", 78 | " 'num_epochs': 1,\n", 79 | " 'dataset_name': 'ag_news',\n", 80 | " 'task_name': 'sequence_classification',\n", 81 | " 'log_steps': 100,\n", 82 | " 'max_seq_length': 128,\n", 83 | " 'output_dir': 'models/distilbert-base-uncased-ag_news',\n", 84 | "}" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "\n", 92 | "### MLflow Setup\n", 93 | "\n", 94 | "Setting up MLflow is crucial for tracking our experiments, parameters, and results, allowing us to manage and compare different runs effectively - a practice that aligns with the MLOps goal of systematic and efficient model management." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "mlflow.set_tracking_uri(\"http://localhost:5005\")\n", 104 | "mlflow.set_experiment(f\"{params['task_name']}\")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Load and Preprocess Dataset\n", 112 | "\n", 113 | "We're using a well-known NLP dataset to ensure reproducibility and comparability. The preprocessing step is crucial for converting raw text into a format that our model can understand, highlighting the importance of data preparation in the ML pipeline." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Load and preprocess dataset\n", 123 | "dataset = load_dataset(params['dataset_name'], params['task_name'])\n", 124 | "tokenizer = DistilBertTokenizer.from_pretrained(params['model_name'])\n", 125 | "\n", 126 | "def tokenize(batch):\n", 127 | " return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=params['max_seq_length'])\n", 128 | "\n", 129 | "\n", 130 | "train_dataset = dataset[\"train\"].shuffle().select(range(20_000)).map(tokenize, batched=True)\n", 131 | "test_dataset = dataset[\"test\"].shuffle().select(range(2_000)).map(tokenize, batched=True)\n", 132 | "\n", 133 | "# Set format for PyTorch and create data loaders\n", 134 | "train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])\n", 135 | "test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])\n", 136 | "\n", 137 | "train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)\n", 138 | "test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)\n", 139 | "\n", 140 | "# get the labels\n", 141 | "labels = dataset[\"train\"].features['label'].names" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "\n", 149 | "### Model Initialization\n", 150 | "\n", 151 | "Initializing the model is a foundational step, showcasing the practical application of a pre-trained NLP model for a specific task - reflecting the course's focus on real-world applicability of machine learning models." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "model = DistilBertForSequenceClassification.from_pretrained(params['model_name'], num_labels=len(labels))\n", 161 | "model.config.id2label = {i: label for i, label in enumerate(labels)}\n", 162 | "params['id2label'] = model.config.id2label\n", 163 | "\n", 164 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 165 | "model.to(device)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Optimizer Setup\n", 173 | "\n", 174 | "Choosing the right optimizer and learning rate is vital for effective model training. It demonstrates the importance of hyperparameter tuning, a key concept in achieving optimal model performance." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "optimizer = AdamW(model.parameters(), lr=params['learning_rate'])" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Evaluation Function\n", 191 | "\n", 192 | "Evaluating the model on a separate test set helps us understand its performance on unseen data, highlighting the concept of generalization which is crucial for real-world applications." 
193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "def evaluate_model(model, dataloader, device):\n", 202 | " model.eval() # Set model to evaluation mode\n", 203 | " predictions, true_labels = [], []\n", 204 | "\n", 205 | " with torch.no_grad():\n", 206 | " for batch in dataloader:\n", 207 | " inputs, masks, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)\n", 208 | "\n", 209 | " # Forward pass, calculate logit predictions\n", 210 | " outputs = model(inputs, attention_mask=masks)\n", 211 | " logits = outputs.logits\n", 212 | " _, predicted_labels = torch.max(logits, dim=1)\n", 213 | "\n", 214 | " predictions.extend(predicted_labels.cpu().numpy())\n", 215 | " true_labels.extend(labels.cpu().numpy())\n", 216 | "\n", 217 | " # Calculate Evaluation Metrics\n", 218 | " accuracy = accuracy_score(true_labels, predictions)\n", 219 | " precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')\n", 220 | "\n", 221 | " return accuracy, precision, recall, f1\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Training Loop\n", 229 | "\n", 230 | "The training loop is where the actual model training happens. Logging metrics and parameters at each step is crucial for tracking the model's progress, understanding its behavior, and making informed decisions - core aspects of the MLOps lifecycle." 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Start MLflow Run\n", 240 | "with mlflow.start_run(run_name=f\"{params['model_name']}-{params['dataset_name']}\") as run:\n", 241 | "\n", 242 | " # Log all parameters at once\n", 243 | " mlflow.log_params(params)\n", 244 | "\n", 245 | " with tqdm(total=params['num_epochs'] * len(train_loader), desc=f\"Epoch [1/{params['num_epochs']}] - (Loss: N/A) - Steps\") as pbar:\n", 246 | " for epoch in range(params['num_epochs']):\n", 247 | " running_loss = 0.0\n", 248 | " for i, batch in enumerate(train_loader, 0):\n", 249 | " inputs, masks, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)\n", 250 | "\n", 251 | " optimizer.zero_grad()\n", 252 | " outputs = model(inputs, attention_mask=masks, labels=labels)\n", 253 | " loss = outputs.loss\n", 254 | " loss.backward()\n", 255 | " optimizer.step()\n", 256 | "\n", 257 | " running_loss += loss.item()\n", 258 | " if i and i % params['log_steps'] == 0:\n", 259 | " avg_loss = running_loss / params['log_steps']\n", 260 | "\n", 261 | " pbar.set_description(f\"Epoch [{epoch + 1}/{params['num_epochs']}] - (Loss: {avg_loss:.3f}) - Steps\")\n", 262 | " mlflow.log_metric(\"loss\", avg_loss, step=epoch * len(train_loader) + i)\n", 263 | " \n", 264 | " running_loss = 0.0\n", 265 | " pbar.update(1)\n", 266 | "\n", 267 | " # Evaluate Model\n", 268 | " accuracy, precision, recall, f1 = evaluate_model(model, test_loader, device)\n", 269 | " print(f\"Epoch {epoch + 1} Metrics: Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\")\n", 270 | "\n", 271 | " # Log metrics to MLflow\n", 272 | " mlflow.log_metrics({'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}, step=epoch)\n", 273 | "\n", 274 | "\n", 275 | " # Log model to MLflow through built-in PyTorch method\n", 276 | " # 
mlflow.pytorch.log_model(model, \"model\")\n", 277 | "\n", 278 | " # Log model to MLflow through custom method\n", 279 | " os.makedirs(params['output_dir'], exist_ok=True)\n", 280 | " model.save_pretrained(params['output_dir'])\n", 281 | " tokenizer.save_pretrained(params['output_dir'])\n", 282 | "\n", 283 | " mlflow.log_artifacts(params['output_dir'], artifact_path=\"model\")\n", 284 | "\n", 285 | " model_uri = f\"runs:/{run.info.run_id}/model\"\n", 286 | " mlflow.register_model(model_uri, \"agnews-transformer\")\n", 287 | "\n", 288 | "print('Finished Training')" 289 | ] 290 | } 291 | ], 292 | "metadata": { 293 | "kernelspec": { 294 | "display_name": "dev", 295 | "language": "python", 296 | "name": "python3" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.10.12" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 2 313 | } 314 | -------------------------------------------------------------------------------- /sections/4. Model Management/4.7_mlflow_inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## MLflow Integration for Model Serving and Registry Management\n", 8 | "\n", 9 | "In this notebook, we delve into advanced aspects of MLflow, focusing on model serving, inference, and the management of model versions in the MLflow model registry. Our goal is to demonstrate how MLflow supports the operational phase of the machine learning lifecycle, which includes serving models for inference and efficiently managing multiple versions of models.\n", 10 | "\n", 11 | "We will explore the practical application of these concepts using a text classification model. This will include loading models for inference, performing predictions, managing different versions of models, and understanding how to transition models through various stages in the model lifecycle. These skills are essential for operational efficiency and effective model management in real-world machine learning applications, aligning with the core themes of our course on MLops and experiment tracking.\n", 12 | "\n", 13 | "\n", 14 | "### Objective:\n", 15 | "* Loading and Serving Models\n", 16 | "* Inference with the Model\n", 17 | "* Managing Model Versions\n", 18 | "* Deleting Models and Versions" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Environment Setup\n", 26 | "\n", 27 | "Ensure all necessary libraries are installed and imported for our workflow." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "#!pip install mlflow torch transformers" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Imports\n", 44 | "\n", 45 | "Import necessary libraries focusing on MLflow for model retrieval, PyTorch for model operations, and Transformers for data processing." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import mlflow\n", 55 | "import torch\n", 56 | "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n", 57 | "import os" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### Connect to Mlflow Server" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Set MLflow tracking URI\n", 74 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 75 | "client = mlflow.tracking.MlflowClient()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Retrieve the Model from MLflow\n", 83 | "\n", 84 | "In this step, we'll explore two methods to retrieve our trained model from MLflow. Understanding the nuances of each method is key to making an informed choice in a real-life scenario based on the requirements and constraints of your deployment environment.\n", 85 | "\n", 86 | "#### Method 1: Using the Built-in PyTorch Loader\n", 87 | "\n", 88 | "This method is straightforward and uses MLflow's built-in functionality to load PyTorch models. It's user-friendly and works well when you're working within a PyTorch-centric workflow.\n" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# Load a specific model version\n", 98 | "model_name = \"agnews_pt_classifier\"\n", 99 | "model_version = \"1\" # or \"production\", \"staging\"\n", 100 | "\n", 101 | "\n", 102 | "model_uri = f\"models:/{model_name}/{model_version}\"\n", 103 | "model = mlflow.pytorch.load_model(model_uri)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Performing Inference\n", 111 | "\n", 112 | "Here, we define the `predict` function to perform inference using the loaded model. This function takes a list of texts, tokenizes them using a pre-trained tokenizer, and then feeds them into the model. The output is the model's prediction, which can be used for various applications such as text classification, sentiment analysis, etc. 
This step is crucial in demonstrating how a trained model can be utilized for practical applications.\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "\n", 122 | "def predict(texts, model, tokenizer):\n", 123 | "    # Tokenize the texts\n", 124 | "    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\").to(model.device)\n", 125 | "\n", 126 | "    # Pass the inputs to the model\n", 127 | "    with torch.no_grad():\n", 128 | "        outputs = model(**inputs)\n", 129 | "        predictions = torch.argmax(outputs.logits, dim=-1)\n", 130 | "\n", 131 | "    # Convert predictions to text labels\n", 132 | "    predictions = predictions.cpu().numpy()\n", 133 | "    predictions = [model.config.id2label[prediction] for prediction in predictions]\n", 134 | "\n", 135 | "    # Return the predicted labels\n", 136 | "    return predictions\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# Sample texts to predict\n", 146 | "texts = [\n", 147 | "    \"The local high school soccer team triumphed in the state championship, securing victory with a last-second winning goal.\",\n", 148 | "    \"DataCore is set to acquire startup InnovateAI for $2 billion, aiming to enhance its position in the artificial intelligence market.\",\n", 149 | "]\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# The tokenizer needs to be loaded separately for this method\n", 159 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", 160 | "\n", 161 | "print(predict(texts, model, tokenizer))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "\n", 169 | "#### Method 2: Versatile Loading with Custom Handling\n", 170 | "\n", 171 | "This alternate method is more versatile and can handle different types of models. It's particularly useful when you're working with a variety of models or when the environment requires a more customized approach." 
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "\n", 181 | "# Load custom model\n", 182 | "model_name = \"agnews-transformer\"\n", 183 | "model_version = \"1\" # or \"production\", \"staging\"\n", 184 | "model_version_details = client.get_model_version(name=model_name, version=model_version)\n", 185 | "\n", 186 | "run_id = model_version_details.run_id\n", 187 | "artifact_path = model_version_details.source\n", 188 | "\n", 189 | "# Construct the model URI\n", 190 | "model_uri = f\"models:/{model_name}/{model_version}\"\n", 191 | "\n", 192 | "model_path = \"models/agnews_transformer\"\n", 193 | "os.makedirs(model_path, exist_ok=True)\n", 194 | "\n", 195 | "client.download_artifacts(run_id, artifact_path, dst_path=model_path)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# Load the model and tokenizer\n", 205 | "custom_model = AutoModelForSequenceClassification.from_pretrained(\"models/agnews_transformer/custom_model\")\n", 206 | "tokenizer = AutoTokenizer.from_pretrained(\"models/agnews_transformer/custom_model\")\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# Run inference with the custom-loaded model\n", 216 | "print(predict(texts, custom_model, tokenizer))" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Demonstrating Model Versioning with MLflow\n", 224 | "\n", 225 | "One of the powerful features of MLflow is its ability to manage multiple versions of models. In this section, we log new iterations of our model to showcase this versioning capability. By setting a new experiment and logging models under different run names, we effectively create multiple versions of the same model. This is a crucial aspect of MLOps, as it allows for tracking the evolution of models over time, comparing different iterations, and systematically managing the model lifecycle. We demonstrate this by logging two additional iterations of our model, tagged as \"iteration2\" and \"iteration3\".\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# Log some new models for versioning demonstration\n", 235 | "mlflow.set_experiment(\"sequence_classification\")\n", 236 | "\n", 237 | "# Log a new model as iteration 2\n", 238 | "with mlflow.start_run(run_name=\"iteration2\"):\n", 239 | "    mlflow.pytorch.log_model(model, \"model\")\n", 240 | "\n", 241 | "# Log another new model as iteration 3\n", 242 | "with mlflow.start_run(run_name=\"iteration3\"):\n", 243 | "    mlflow.pytorch.log_model(model, \"model\")\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## Managing Model Versions and Stages\n", 251 | "\n", 252 | "Here, we use the MLflow client to list every version registered under our model name and then transition a chosen version to the \"Production\" stage. Stages such as \"Staging\" and \"Production\" make it clear which version of a model should be served and which is still being evaluated. 
This step is crucial in demonstrating how a trained model can be utilized for practical applications.\n" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# Model version management\n", 262 | "model_versions = client.search_model_versions(f\"name='{model_name}'\")\n", 263 | "for version in model_versions:\n", 264 | " print(f\"Version: {version.version}, Stage: {version.current_stage}\")\n", 265 | "\n", 266 | "# Change model stage\n", 267 | "client.transition_model_version_stage(name=model_name, version=model_version, stage=\"Production\")\n" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Cleaning Up: Deleting Models and Versions\n", 275 | "\n", 276 | "In some scenarios, you might need to delete specific model versions or even entire registered models from MLflow. This section covers how to perform these deletions. Note that this should be done cautiously, as it cannot be undone. This is particularly useful for maintaining a clean and efficient model registry by removing outdated or unused models and versions.\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# Delete a specific model version\n", 286 | "client.delete_model_version(name=model_name, version=model_version)\n", 287 | "\n", 288 | "# Delete the entire registered model\n", 289 | "client.delete_registered_model(name=model_name)\n" 290 | ] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "dev", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.10.12" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /sections/4. Model Management/4.8_mlflow_authentication.py: -------------------------------------------------------------------------------- 1 | from mlflow.server import get_app_client 2 | import os 3 | 4 | tracking_uri = "http://localhost:5000" 5 | auth_client = get_app_client("basic-auth", tracking_uri=tracking_uri) 6 | 7 | # Set username and password 8 | os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" 9 | os.environ["MLFLOW_TRACKING_PASSWORD"] = "password" 10 | 11 | # Change password 12 | auth_client.update_user_password(username="admin", password="zG*8!7p@TKmS") 13 | -------------------------------------------------------------------------------- /sections/5. 
Advanced Model Deployment Techniques/5.2_batching_and_dynamic_batching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Generation with Transformers and Dynamic Batching 🚀\n", 8 | "\n", 9 | "**Objectives:**\n", 10 | "- 📚 Learn to use pretrained models/tokenizers from Hugging Face.\n", 11 | "- ✍️ Generate text for prompts.\n", 12 | "- 🧑‍🔬 Explore batch and dynamic batch text generation.\n", 13 | "- 🏎 Optimize text generation efficiency.\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Setup and Imports 🛠\n", 21 | "\n", 22 | "**Imports:**\n", 23 | "- `transformers`: For models & tokenizers.\n", 24 | "- `datasets`: Easy data access.\n", 25 | "- `torch`: Tensor operations.\n", 26 | "- `tqdm`: Progress bars.\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 36 | "from datasets import load_dataset\n", 37 | "import torch\n", 38 | "from tqdm.auto import tqdm" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Loading the Model and Tokenizer 📦\n", 46 | "\n", 47 | "**Key Steps:**\n", 48 | "- Load model (`TheFuzzyScientist/diabloGPT_open-instruct`).\n", 49 | "- Load tokenizer (`microsoft/DialoGPT-medium`).\n", 50 | "- Set tokenizer padding to `eos_token`.\n", 51 | "- Enable GPU acceleration (`cuda`).\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "\n", 61 | "model = AutoModelForCausalLM.from_pretrained(\"TheFuzzyScientist/diabloGPT_open-instruct\").to(\"cuda\")\n", 62 | "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/DialoGPT-medium\", padding_side=\"left\")\n", 63 | "tokenizer.pad_token = tokenizer.eos_token\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Dataset Preparation 📈\n", 71 | "\n", 72 | "**Process:**\n", 73 | "- Use `hakurei/open-instruct-v1` dataset.\n", 74 | "- Convert to pandas DataFrame for easier handling.\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "\n", 84 | "dataset = load_dataset(\"hakurei/open-instruct-v1\", split=\"train\")\n", 85 | "dataset = dataset.to_pandas()\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Text Generation Functions ✒️\n", 93 | "\n", 94 | "**Functions:**\n", 95 | "- `generate_text`: Single prompt text generation.\n", 96 | "- `batch_generate_texts`: Batch prompt text generation for efficiency.\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "\n", 106 | "def generate_text(prompt):\n", 107 | " inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(model.device)\n", 108 | " outputs = model.generate(inputs, max_length=64)\n", 109 | " generated = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", 110 | "\n", 111 | " return generated[: generated.find(\".\") + 1]\n", 112 | "\n", 113 | "\n", 114 | "generate_text(\"What's the best way to cook chiken breast?\")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": 
[ 121 | "## Text Generation Demo 🎭\n", 122 | "\n", 123 | "**Activities:**\n", 124 | "- Generate text from a single prompt.\n", 125 | "- Generate texts in batches to observe efficiency.\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "\n", 135 | "def batch_generate_texts(prompts):\n", 136 | " inputs = tokenizer(prompts, return_tensors=\"pt\", padding=True).to(model.device)[\"input_ids\"]\n", 137 | " outputs = model.generate(inputs, max_length=64, pad_token_id=tokenizer.eos_token_id)\n", 138 | " generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", 139 | "\n", 140 | " return generated\n", 141 | "\n", 142 | "\n", 143 | "batch_generate_texts(dataset[\"instruction\"][:1].tolist())\n", 144 | "batch_generate_texts(dataset[\"instruction\"][:20].tolist())\n", 145 | "batch_generate_texts(dataset[\"instruction\"][:100].tolist())\n", 146 | "batch_generate_texts(dataset[\"instruction\"][:200].tolist())\n", 147 | "# batch_generate_texts(dataset[\"instruction\"].sample(200).tolist()) # this might crash\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## Advanced: Dynamic Batching for Efficiency ⚙️\n", 155 | "\n", 156 | "**Concepts:**\n", 157 | "- Implement dynamic batching for hardware optimization.\n", 158 | "- Utilize different batching techniques for performance improvement.\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "\n", 168 | "def batch_generate_tokens(tokens):\n", 169 | " outputs = model.generate(torch.stack(tokens), max_length=64, pad_token_id=tokenizer.eos_token_id)\n", 170 | "\n", 171 | " return tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", 172 | "\n", 173 | "\n", 174 | "def dynamic_batching(prompts, max_tokens, is_pretokenized=False):\n", 175 | " if not is_pretokenized:\n", 176 | " tokenized_texts = tokenizer(prompts, return_tensors=\"pt\", padding=True)[\"input_ids\"].to(model.device)\n", 177 | " else:\n", 178 | " tokenized_texts = prompts\n", 179 | "\n", 180 | " current_batch = []\n", 181 | " current_batch_size = 0\n", 182 | "\n", 183 | " for tokenized_text in tokenized_texts:\n", 184 | " if current_batch_size + len(tokenized_text) > max_tokens and current_batch:\n", 185 | " yield batch_generate_tokens(current_batch)\n", 186 | "\n", 187 | " current_batch, current_batch_size = [], 0\n", 188 | "\n", 189 | " current_batch.append(tokenized_text)\n", 190 | " current_batch_size += len(tokenized_text)\n", 191 | "\n", 192 | " # Process final batch\n", 193 | " if current_batch:\n", 194 | " yield batch_generate_tokens(current_batch)\n", 195 | " pass\n", 196 | "\n", 197 | "\n", 198 | "generator = dynamic_batching(dataset[\"instruction\"][:40].tolist() * 1000, 3200)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "## Applying Dynamic Batching and Measuring Performance 📊 ⏱\n", 206 | "\n", 207 | "**Steps:**\n", 208 | "- Apply dynamic batching on a large dataset.\n", 209 | "- Track performance and efficiency improvements.\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "\n", 219 | "from contextlib import contextmanager\n", 220 | "import time\n", 221 | "\n", 222 | "\n", 223 | "@contextmanager\n", 224 | "def track_time():\n", 225 | " start = 
time.time() # Record start time\n", 226 | " yield\n", 227 | " end = time.time() # Record end time\n", 228 | " print(f\"Execution time: {end - start} seconds\")\n", 229 | "\n", 230 | "\n", 231 | "with track_time():\n", 232 | " for batch_predictions in tqdm(generator):\n", 233 | " continue\n", 234 | "\n", 235 | "\n", 236 | "def sort_batches(prompts, max_tokens):\n", 237 | " tokenized_texts = tokenizer(prompts, padding=False)[\"input_ids\"]\n", 238 | " sorted_tokens = sorted(tokenized_texts, key=len)\n", 239 | "\n", 240 | " sorted_batches = {}\n", 241 | " for sorted_token in sorted_tokens:\n", 242 | " length = len(sorted_token)\n", 243 | " if length not in sorted_batches:\n", 244 | " sorted_batches[length] = []\n", 245 | "\n", 246 | " sorted_batches[length].append(sorted_token)\n", 247 | "\n", 248 | " for length, sorted_batch in sorted_batches.items():\n", 249 | " tensor_batch = torch.stack([torch.tensor(sorted_token) for sorted_token in sorted_batch]).to(model.device)\n", 250 | " for batch_prediction in dynamic_batching(tensor_batch, max_tokens=max_tokens, is_pretokenized=True):\n", 251 | " yield batch_prediction\n", 252 | "\n", 253 | "\n", 254 | "generator = sort_batches(dataset[\"instruction\"][:40].tolist() * 1000, 3200)\n", 255 | "\n", 256 | "with track_time():\n", 257 | " for batch_predictions in tqdm(generator):\n", 258 | " print(len(batch_predictions))\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "# Conclusion and Next Steps 🌈\n", 266 | "\n", 267 | "**Achievements:**\n", 268 | "- Mastered text generation with Transformers.\n", 269 | "- Learned about batch and dynamic batching efficiencies.\n", 270 | "\n", 271 | "**Explore Further:**\n", 272 | "- Experiment with different models/tokenizers.\n", 273 | "- Test with various datasets.\n", 274 | "- Adjust batch size to see performance differences.\n" 275 | ] 276 | } 277 | ], 278 | "metadata": { 279 | "language_info": { 280 | "name": "python" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 2 285 | } 286 | -------------------------------------------------------------------------------- /sections/5. 
Advanced Model Deployment Techniques/5.2_batching_and_dynamic_batching.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | from datasets import load_dataset 3 | import torch 4 | from tqdm.auto import tqdm 5 | 6 | model = AutoModelForCausalLM.from_pretrained("TheFuzzyScientist/diabloGPT_open-instruct").to("cuda") 7 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium", padding_side="left") 8 | tokenizer.pad_token = tokenizer.eos_token 9 | 10 | 11 | dataset = load_dataset("hakurei/open-instruct-v1", split="train") 12 | dataset = dataset.to_pandas() 13 | 14 | 15 | def generate_text(prompt): 16 | inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device) 17 | outputs = model.generate(inputs, max_length=64) 18 | generated = tokenizer.decode(outputs[0], skip_special_tokens=True) 19 | 20 | return generated[: generated.find(".") + 1] 21 | 22 | 23 | generate_text("What's the best way to cook chicken breast?") 24 | 25 | 26 | def batch_generate_texts(prompts): 27 | inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)["input_ids"] 28 | outputs = model.generate(inputs, max_length=64, pad_token_id=tokenizer.eos_token_id) 29 | generated = tokenizer.batch_decode(outputs, skip_special_tokens=True) 30 | 31 | return generated 32 | 33 | 34 | batch_generate_texts(dataset["instruction"][:1].tolist()) 35 | batch_generate_texts(dataset["instruction"][:20].tolist()) 36 | batch_generate_texts(dataset["instruction"][:100].tolist()) 37 | batch_generate_texts(dataset["instruction"][:200].tolist()) 38 | # batch_generate_texts(dataset["instruction"].sample(200).tolist()) # this might crash 39 | 40 | 41 | # Dynamic batching 42 | 43 | 44 | def batch_generate_tokens(tokens): 45 | outputs = model.generate(torch.stack(tokens), max_length=64, pad_token_id=tokenizer.eos_token_id) 46 | 47 | return tokenizer.batch_decode(outputs, skip_special_tokens=True) 48 | 49 | 50 | def dynamic_batching(prompts, max_tokens, is_pretokenized=False): 51 | if not is_pretokenized: 52 | tokenized_texts = tokenizer(prompts, return_tensors="pt", padding=True)["input_ids"].to(model.device) 53 | else: 54 | tokenized_texts = prompts 55 | 56 | current_batch = [] 57 | current_batch_size = 0 58 | 59 | for tokenized_text in tokenized_texts: 60 | if current_batch_size + len(tokenized_text) > max_tokens and current_batch: 61 | yield batch_generate_tokens(current_batch) 62 | 63 | current_batch, current_batch_size = [], 0 64 | 65 | current_batch.append(tokenized_text) 66 | current_batch_size += len(tokenized_text) 67 | 68 | # Process final batch 69 | if current_batch: 70 | yield batch_generate_tokens(current_batch) 71 | 72 | 73 | 74 | generator = dynamic_batching(dataset["instruction"][:40].tolist() * 1000, 3200) 75 | 76 | 77 | from contextlib import contextmanager 78 | import time 79 | 80 | 81 | @contextmanager 82 | def track_time(): 83 | start = time.time() # Record start time 84 | yield 85 | end = time.time() # Record end time 86 | print(f"Execution time: {end - start} seconds") 87 | 88 | 89 | with track_time(): 90 | for batch_predictions in tqdm(generator): 91 | continue 92 | 93 | 94 | def sort_batches(prompts, max_tokens): 95 | tokenized_texts = tokenizer(prompts, padding=False)["input_ids"] 96 | sorted_tokens = sorted(tokenized_texts, key=len) 97 | 98 | sorted_batches = {} 99 | for sorted_token in sorted_tokens: 100 | length = len(sorted_token) 101 | if length not in sorted_batches: 102 | 
sorted_batches[length] = [] 103 | 104 | sorted_batches[length].append(sorted_token) 105 | 106 | for length, sorted_batch in sorted_batches.items(): 107 | tensor_batch = torch.stack([torch.tensor(sorted_token) for sorted_token in sorted_batch]).to(model.device) 108 | for batch_prediction in dynamic_batching(tensor_batch, max_tokens=max_tokens, is_pretokenized=True): 109 | yield batch_prediction 110 | 111 | 112 | generator = sort_batches(dataset["instruction"][:40].tolist() * 1000, 3200) 113 | 114 | with track_time(): 115 | for batch_predictions in tqdm(generator): 116 | print(len(batch_predictions)) 117 | -------------------------------------------------------------------------------- /sections/5. Advanced Model Deployment Techniques/5.3_the_role_of_sorting_batches.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Advanced Text Generation Techniques with Transformers 🚀\n", 8 | "\n", 9 | "In this advanced lab, we dive deeper into efficient text generation techniques using Transformers. We'll explore two batching strategies: normal batching and sorted batching, to optimize our text generation tasks.\n", 10 | "\n", 11 | "**Objectives:**\n", 12 | "- 🧰 Implement advanced text generation functions.\n", 13 | "- 📊 Compare normal vs. sorted batching efficiency.\n", 14 | "- ⏱ Measure and understand execution time improvements.\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Setup and Imports 🛠\n", 22 | "\n", 23 | "Before diving into the code, let's ensure we have all the necessary tools:\n", 24 | "\n", 25 | "- `transformers` & `datasets`: For our model and data.\n", 26 | "- `torch`: For tensor operations.\n", 27 | "- `tqdm`: For progress tracking.\n", 28 | "- `contextlib` & `time`: For measuring execution time.\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 38 | "from datasets import load_dataset\n", 39 | "import torch\n", 40 | "from tqdm.auto import tqdm\n", 41 | "from contextlib import contextmanager\n", 42 | "import time\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Time Tracking Utility ⏱\n", 50 | "\n", 51 | "To compare the efficiency of our batching strategies, we'll use a context manager to track the execution time:\n", 52 | "\n", 53 | "- **Purpose:** Measure the time it takes to execute a block of code.\n", 54 | "- **Output:** Prints the execution time in seconds.\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "@contextmanager\n", 64 | "def track_time():\n", 65 | " start = time.time()\n", 66 | " yield\n", 67 | " end = time.time()\n", 68 | " print(f\"Execution time: {end - start:.2f}s\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Model and Tokenizer Setup 🧩\n", 76 | " \n", 77 | "Setting up our model and tokenizer is crucial for text generation:\n", 78 | "\n", 79 | "- **Model:** \"TheFuzzyScientist/diabloGPT_open-instruct\" for instructive text generation.\n", 80 | "- **Tokenizer:** \"microsoft/DialoGPT-medium\" with padding adjusted.\n", 81 | "- **Device:** Utilize CUDA for GPU acceleration.\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 
86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "model = AutoModelForCausalLM.from_pretrained(\"TheFuzzyScientist/diabloGPT_open-instruct\").to(\"cuda\")\n", 91 | "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/DialoGPT-medium\", padding_side=\"left\")\n", 92 | "tokenizer.pad_token = tokenizer.eos_token" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Dataset Preparation and Initial Tokenization 📚\n", 100 | "\n", 101 | "We'll work with a sample dataset for text generation tasks:\n", 102 | "\n", 103 | "- **Dataset:** \"hakurei/open-instruct-v1\" converted to a pandas DataFrame.\n", 104 | "- **Initial Tokenization:** Convert a sample of prompts to input IDs.\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "dataset = load_dataset(\"hakurei/open-instruct-v1\", split=\"train\")\n", 114 | "dataset = dataset.to_pandas()\n", 115 | "\n", 116 | "prompts = dataset[\"instruction\"].sample(4).tolist()\n", 117 | "inputs = tokenizer(prompts, padding=True)[\"input_ids\"]\n", 118 | "\n", 119 | "# print('\\n\\n'.join(tokenizer.batch_decode(inputs)))\n", 120 | "print(\"\\n\\n\".join(tokenizer.batch_decode(inputs)).replace(tokenizer.eos_token, \"[PAD]\"))\n" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Normal Batching Method 🔄\n", 128 | "\n", 129 | "Normal batching processes prompts in fixed-size batches:\n", 130 | "\n", 131 | "- **Chunker Function:** Splits our data into specified batch sizes.\n", 132 | "- **Batch Generation:** Generates text for each batch of tokens.\n", 133 | "- **Predict Function:** Orchestrates the batching and generation process.\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "# Normal batching\n", 143 | "def chunker(seq, size):\n", 144 | " return (seq[pos : pos + size] for pos in range(0, len(seq), size))\n", 145 | "\n", 146 | "\n", 147 | "def batch_generate_tokens(tokens):\n", 148 | " outputs = model.generate(tokens, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)\n", 149 | "\n", 150 | " return tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", 151 | "\n", 152 | "\n", 153 | "def predict_batch(prompts, batch_size):\n", 154 | " inputs = tokenizer(prompts, return_tensors=\"pt\", padding=True, truncation=True, max_length=512)[\"input_ids\"]\n", 155 | "\n", 156 | " for batch in chunker(inputs, batch_size):\n", 157 | " yield batch_generate_tokens(batch.to(model.device))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Predicting with Normal Batching ⚡\n", 165 | "\n", 166 | "Let's generate text using the normal batching method:\n", 167 | "\n", 168 | "- **Process:** Tokenize prompts, generate text in batches, and track execution time.\n", 169 | "- **Observation:** Note the time it takes to process 3000 prompts.\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "prompts = dataset[\"instruction\"].sample(3000).tolist()\n", 179 | "\n", 180 | "with track_time():\n", 181 | " for batch_prediction in tqdm(predict_batch(prompts, 32)):\n", 182 | " print(len(batch_prediction))\n", 183 | " \n", 184 | "# Execution time: 137.19s" 185 | ] 186 | }, 187 | { 188 | 
"cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "## Sorted Batching Method 🔢\n", 192 | "\n", 193 | "Sorted batching aims to improve efficiency by grouping prompts of similar lengths:\n", 194 | "\n", 195 | "- **Strategy:** Sort prompts by length and batch accordingly.\n", 196 | "- **Benefits:** Reduces padding, potentially speeding up computation.\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Sorted Batching\n", 206 | "def predict_sorted_batches(prompts, max_batch_size):\n", 207 | " inputs = tokenizer(prompts, padding=False, truncation=True, max_length=512)[\"input_ids\"]\n", 208 | "\n", 209 | " sorted_tokens = sorted(inputs, key=len)\n", 210 | " sorted_batches = {}\n", 211 | " for sorted_input in sorted_tokens:\n", 212 | " if not len(sorted_input):\n", 213 | " continue\n", 214 | "\n", 215 | " length = len(sorted_input)\n", 216 | " if length not in sorted_batches:\n", 217 | " sorted_batches[length] = []\n", 218 | "\n", 219 | " sorted_batches[length].append(sorted_input)\n", 220 | "\n", 221 | " for length, sorted_batch in sorted_batches.items():\n", 222 | " for batch in chunker(sorted_batch, max_batch_size):\n", 223 | " tensor_batch = torch.tensor(batch).to(model.device)\n", 224 | " yield batch_generate_tokens(tensor_batch)\n" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Predicting with Sorted Batching 🚀\n", 232 | "\n", 233 | "Applying the sorted batching method:\n", 234 | "\n", 235 | "- **Execution:** Similar to normal batching but with sorted prompts.\n", 236 | "- **Comparison:** Observe the execution time difference from normal batching.\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "with track_time():\n", 246 | " for batch_prediction in tqdm(predict_sorted_batches(prompts, 32)):\n", 247 | " print(len(batch_prediction))\n", 248 | "\n", 249 | "# Execution time: 72.74s" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Conclusion and Next Steps 🌈\n", 257 | "\n", 258 | "Through this lab, we've explored advanced batching techniques for text generation with Transformers. We saw firsthand how sorted batching can significantly reduce execution time compared to normal batching.\n", 259 | "\n", 260 | "**Encouraged Next Steps:**\n", 261 | "- 🤖 Experiment with different models and datasets.\n", 262 | "- 📐 Adjust batch sizes and observe the impact on performance.\n", 263 | "- 🔄 Explore other optimization techniques for text generation.\n" 264 | ] 265 | } 266 | ], 267 | "metadata": { 268 | "language_info": { 269 | "name": "python" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 2 274 | } 275 | -------------------------------------------------------------------------------- /sections/5. 
Advanced Model Deployment Techniques/5.3_the_role_of_sorting_batches.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | from datasets import load_dataset 3 | import torch 4 | from tqdm.auto import tqdm 5 | from contextlib import contextmanager 6 | import time 7 | 8 | 9 | @contextmanager 10 | def track_time(): 11 | start = time.time() 12 | yield 13 | end = time.time() 14 | print(f"Execution time: {end - start:.2f}s") 15 | 16 | 17 | model = AutoModelForCausalLM.from_pretrained("TheFuzzyScientist/diabloGPT_open-instruct").to("cuda") 18 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium", padding_side="left") 19 | tokenizer.pad_token = tokenizer.eos_token 20 | 21 | 22 | dataset = load_dataset("hakurei/open-instruct-v1", split="train") 23 | dataset = dataset.to_pandas() 24 | 25 | prompts = dataset["instruction"].sample(4).tolist() 26 | inputs = tokenizer(prompts, padding=True)["input_ids"] 27 | 28 | 29 | # print('\n\n'.join(tokenizer.batch_decode(inputs))) 30 | print("\n\n".join(tokenizer.batch_decode(inputs)).replace(tokenizer.eos_token, "[PAD]")) 31 | 32 | 33 | # Normal batching 34 | def chunker(seq, size): 35 | return (seq[pos : pos + size] for pos in range(0, len(seq), size)) 36 | 37 | 38 | def batch_generate_tokens(tokens): 39 | outputs = model.generate(tokens, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id) 40 | 41 | return tokenizer.batch_decode(outputs, skip_special_tokens=True) 42 | 43 | 44 | def predict_batch(prompts, batch_size): 45 | inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)["input_ids"] 46 | 47 | for batch in chunker(inputs, batch_size): 48 | yield batch_generate_tokens(batch.to(model.device)) 49 | 50 | 51 | prompts = dataset["instruction"].sample(3000).tolist() 52 | 53 | with track_time(): 54 | for batch_prediction in tqdm(predict_batch(prompts, 32)): 55 | print(len(batch_prediction)) 56 | # Execution time: 137.19s 57 | 58 | 59 | # Sorted Batching 60 | def predict_sorted_batches(prompts, max_batch_size): 61 | inputs = tokenizer(prompts, padding=False, truncation=True, max_length=512)["input_ids"] 62 | 63 | sorted_tokens = sorted(inputs, key=len) 64 | sorted_batches = {} 65 | for sorted_input in sorted_tokens: 66 | if not len(sorted_input): 67 | continue 68 | 69 | length = len(sorted_input) 70 | if length not in sorted_batches: 71 | sorted_batches[length] = [] 72 | 73 | sorted_batches[length].append(sorted_input) 74 | 75 | for length, sorted_batch in sorted_batches.items(): 76 | for batch in chunker(sorted_batch, max_batch_size): 77 | tensor_batch = torch.tensor(batch).to(model.device) 78 | yield batch_generate_tokens(tensor_batch) 79 | 80 | 81 | with track_time(): 82 | for batch_prediction in tqdm(predict_sorted_batches(prompts, 32)): 83 | print(len(batch_prediction)) 84 | 85 | # Execution time: 72.74s 86 | -------------------------------------------------------------------------------- /sections/5. Advanced Model Deployment Techniques/5.4_understanding_quantization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model Quantization for Efficient Text Generation 🚀\n", 8 | "\n", 9 | "In this lab, we'll explore model quantization using ctranslate2 and its impact on text generation efficiency. 
Quantization reduces model size and speeds up inference, crucial for deploying models in resource-constrained environments.\n", 10 | "\n", 11 | "**Objectives:**\n", 12 | "- 📦 Understand the basics of model quantization.\n", 13 | "- ⚖️ Quantize a pre-trained model for efficient text generation.\n", 14 | "- ⏱ Compare execution times before and after quantization.\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Setup and Imports 🛠\n", 22 | "\n", 23 | "First, let's get our workspace ready with all the necessary tools:\n", 24 | "\n", 25 | "- `ctranslate2`: For model conversion and quantization.\n", 26 | "- `transformers` & `datasets`: For our model, tokenizer, and data.\n", 27 | "- `torch`: For tensor operations.\n", 28 | "- `tqdm`: Visual progress indication.\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# !pip install ctranslate2\n", 38 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 39 | "from datasets import load_dataset\n", 40 | "import torch\n", 41 | "from tqdm.auto import tqdm\n", 42 | "from ctranslate2.converters import TransformersConverter\n", 43 | "from ctranslate2 import Generator\n", 44 | "\n", 45 | "from contextlib import contextmanager\n", 46 | "import time\n", 47 | "\n", 48 | "@contextmanager\n", 49 | "def track_time():\n", 50 | " start = time.time() # Record start time\n", 51 | " yield\n", 52 | " end = time.time() # Record end time\n", 53 | " print(f\"Execution time: {end - start} seconds\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Model and Tokenizer Setup 🧩\n", 61 | "\n", 62 | "Before quantization, we need to load and prepare our model and tokenizer:\n", 63 | "\n", 64 | "- **Model:** \"TheFuzzyScientist/diabloGPT_open-instruct\" for instructive text generation.\n", 65 | "- **Tokenizer:** Adjusted for our model's needs.\n", 66 | "- **Device:** Using CUDA for GPU acceleration.\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "model = AutoModelForCausalLM.from_pretrained(\"TheFuzzyScientist/diabloGPT_open-instruct\").to(\"cuda\")\n", 76 | "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/DialoGPT-medium\", padding_side=\"left\")\n", 77 | "tokenizer.pad_token = tokenizer.eos_token\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## Model Quantization ⚖️\n", 85 | "\n", 86 | "Quantizing our model to reduce its size and improve inference speed:\n", 87 | "\n", 88 | "- **Conversion & Quantization:** Using `TransformersConverter` for ctranslate2 format conversion with float16 quantization.\n", 89 | "- **Output:** Quantized model ready for efficient text generation.\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Convert the model to CTranslate2\n", 99 | "model.save_pretrained(\"models/gpt-instruct\")\n", 100 | "tokenizer.save_pretrained(\"models/gpt-instruct\")\n", 101 | "\n", 102 | "converter = TransformersConverter(\"models/gpt-instruct\")\n", 103 | "out_path = converter.convert(output_dir=\"models/gpt-instruct-quant\", quantization=\"float16\")\n", 104 | "\n", 105 | "generator = Generator(\"models/gpt-instruct-quant\", device=\"cuda\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": 
{}, 111 | "source": [ 112 | "## Dataset Preparation 📚\n", 113 | "\n", 114 | "Loading and preparing a dataset for our text generation tasks:\n", 115 | "\n", 116 | "- **Dataset:** \"hakurei/open-instruct-v1\", a rich source for instructive prompts.\n", 117 | "- **Sampling:** Selecting 3000 random samples for our experiments.\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "dataset = load_dataset(\"hakurei/open-instruct-v1\", split=\"train\")\n", 127 | "dataset = dataset.to_pandas()\n", 128 | "\n", 129 | "prompts = dataset[\"instruction\"].sample(3000, random_state=42).tolist()\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Normal Batching Method 🔄\n", 137 | "\n", 138 | "Using the original model, we'll generate text in batches to establish a baseline for performance:\n", 139 | "\n", 140 | "- **Chunker:** Splits prompts into manageable batch sizes.\n", 141 | "- **Batch Generation:** Generates text for each batch.\n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "\n", 151 | "# Normal batching\n", 152 | "def chunker(seq, size):\n", 153 | " return (seq[pos : pos + size] for pos in range(0, len(seq), size))\n", 154 | "\n", 155 | "\n", 156 | "def batch_generate_tokens(tokens):\n", 157 | " outputs = model.generate(tokens, max_length=256, pad_token_id=tokenizer.eos_token_id, num_beams=2, repetition_penalty=1.5)\n", 158 | "\n", 159 | " return tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", 160 | "\n", 161 | "\n", 162 | "def predict_batch(prompts, batch_size):\n", 163 | " inputs = tokenizer(prompts, return_tensors=\"pt\", padding=True, truncation=True, max_length=128)[\"input_ids\"]\n", 164 | "\n", 165 | " for batch in chunker(inputs, batch_size):\n", 166 | " yield batch_generate_tokens(batch.to(model.device))\n", 167 | "\n", 168 | "\n", 169 | "with track_time():\n", 170 | " for batch_prediction in tqdm(predict_batch(prompts, 32)):\n", 171 | " continue\n", 172 | "\n", 173 | "# Execution time: 242.11289978027344 seconds" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Quantized Model Batching 🎯\n", 181 | "\n", 182 | "Switching to our quantized model for more efficient text generation:\n", 183 | "\n", 184 | "- **CTRANS Tokenization:** Adjusting tokenization for ctranslate2 input.\n", 185 | "- **Batch Generation:** Utilizing the quantized model.\n" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "# CTranslate2 batching with quantized model\n", 195 | "def batch_generate_ctrans(prompts, batch_size):\n", 196 | " inputs = [tokenizer.tokenize(prompt, truncation=True, max_length=128) for prompt in prompts]\n", 197 | "\n", 198 | " results = generator.generate_batch(inputs, max_length=256, max_batch_size=batch_size, beam_size=2, repetition_penalty=1.5)\n", 199 | "\n", 200 | " result_ids = [res.sequences_ids[0] for res in results]\n", 201 | " return tokenizer.batch_decode(result_ids, skip_special_tokens=True)\n", 202 | "\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Predicting with Quantized Model 🚀\n", 210 | "\n", 211 | "Finally, let's see the performance improvement with our quantized model:\n", 212 | "\n", 213 | 
"- **Execution:** Generate text with the quantized model.\n", 214 | "- **Comparison:** Observe the reduction in execution time versus the unquantized model.\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "del model\n", 224 | "torch.cuda.empty_cache()\n", 225 | "with track_time():\n", 226 | " batch_generate_ctrans(prompts, 32)\n", 227 | "\n", 228 | "# Execution time: 150.97192573547363 seconds\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "# Conclusion and Next Steps 🌈\n", 236 | "\n", 237 | "We've successfully quantized a text generation model and demonstrated significant improvements in efficiency. This showcases the power of model quantization for deploying NLP models in production.\n", 238 | "\n", 239 | "**Encouraged Next Steps:**\n", 240 | "- 🤖 Try quantizing different models.\n", 241 | "- 📊 Compare quantization effects on various model sizes.\n", 242 | "- 🔍 Explore further optimizations for deployment.\n" 243 | ] 244 | } 245 | ], 246 | "metadata": { 247 | "language_info": { 248 | "name": "python" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } 254 | -------------------------------------------------------------------------------- /sections/5. Advanced Model Deployment Techniques/5.4_understanding_quantization.py: -------------------------------------------------------------------------------- 1 | # !pip install ctranslate2 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from datasets import load_dataset 4 | import torch 5 | from tqdm.auto import tqdm 6 | from ctranslate2.converters import TransformersConverter 7 | from ctranslate2 import Generator 8 | 9 | from contextlib import contextmanager 10 | import time 11 | 12 | 13 | @contextmanager 14 | def track_time(): 15 | start = time.time() # Record start time 16 | yield 17 | end = time.time() # Record end time 18 | print(f"Execution time: {end - start} seconds") 19 | 20 | 21 | model = AutoModelForCausalLM.from_pretrained("TheFuzzyScientist/diabloGPT_open-instruct").to("cuda") 22 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium", padding_side="left") 23 | tokenizer.pad_token = tokenizer.eos_token 24 | 25 | # Convert the model to CTranslate2 26 | model.save_pretrained("models/gpt-instruct") 27 | tokenizer.save_pretrained("models/gpt-instruct") 28 | 29 | converter = TransformersConverter("models/gpt-instruct") 30 | out_path = converter.convert(output_dir="models/gpt-instruct-quant", quantization="float16") 31 | 32 | generator = Generator("models/gpt-instruct-quant", device="cuda") 33 | 34 | # Dataset 35 | dataset = load_dataset("hakurei/open-instruct-v1", split="train") 36 | dataset = dataset.to_pandas() 37 | 38 | prompts = dataset["instruction"].sample(3000, random_state=42).tolist() 39 | 40 | 41 | # Normal batching 42 | def chunker(seq, size): 43 | return (seq[pos : pos + size] for pos in range(0, len(seq), size)) 44 | 45 | 46 | def batch_generate_tokens(tokens): 47 | outputs = model.generate(tokens, max_length=256, pad_token_id=tokenizer.eos_token_id, num_beams=2, repetition_penalty=1.5) 48 | 49 | return tokenizer.batch_decode(outputs, skip_special_tokens=True) 50 | 51 | 52 | def predict_batch(prompts, batch_size): 53 | inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=128)["input_ids"] 54 | 55 | for batch in chunker(inputs, batch_size): 56 | yield 
batch_generate_tokens(batch.to(model.device)) 57 | 58 | 59 | with track_time(): 60 | for batch_prediction in tqdm(predict_batch(prompts, 32)): 61 | continue 62 | 63 | # Execution time: 242.11289978027344 seconds 64 | 65 | 66 | # CTranslate2 batching with quantized model 67 | def batch_generate_ctrans(prompts, batch_size): 68 | inputs = [tokenizer.tokenize(prompt, truncation=True, max_length=128) for prompt in prompts] 69 | 70 | results = generator.generate_batch(inputs, max_length=256, max_batch_size=batch_size, beam_size=2, repetition_penalty=1.5) 71 | 72 | result_ids = [res.sequences_ids[0] for res in results] 73 | return tokenizer.batch_decode(result_ids, skip_special_tokens=True) 74 | 75 | 76 | del model 77 | torch.cuda.empty_cache() 78 | with track_time(): 79 | batch_generate_ctrans(prompts, 32) 80 | 81 | # Execution time: 150.97192573547363 seconds 82 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level1.py: -------------------------------------------------------------------------------- 1 | # pip install transformers==4.38.1 2 | from transformers import pipeline 3 | from .utils import track_time 4 | 5 | pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="cuda") 6 | 7 | # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating 8 | messages = [ 9 | { 10 | "role": "system", 11 | "content": "You are a friendly chatbot who is always helpful.", 12 | }, 13 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 14 | ] 15 | 16 | prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 17 | prompts = [prompt] * 5 18 | 19 | with track_time(prompts): 20 | outputs = pipe(prompts, max_new_tokens=256, do_sample=True, temperature=0.1, top_k=50, top_p=0.95) 21 | 22 | print(outputs[0][0]["generated_text"]) 23 | 24 | ## cpu 25 | ## latency: 48 s 26 | 27 | ## gpu 28 | ## latency: 2.9073479175567627s 29 | ## throughput: 0.34 inputs/s 30 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level2.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | from .utils import track_time 3 | 4 | 5 | model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="cuda") 6 | tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") 7 | messages = [ 8 | { 9 | "role": "system", 10 | "content": "You are a friendly chatbot who is always helpful.", 11 | }, 12 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 13 | ] 14 | 15 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False) 16 | input_ids = tokenizer([prompt] * 256, return_tensors="pt").to("cuda") 17 | 18 | with track_time(input_ids["input_ids"]): 19 | outputs = model.generate(**input_ids, max_length=256, do_sample=True, temperature=0.1, top_k=50, top_p=0.95) 20 | 21 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 22 | 23 | # latency: 2.6578898429870605s 24 | # throughput: 14.10 inputs/s 25 | -------------------------------------------------------------------------------- /sections/7. 
Scheduling and Running Jobs on a Cluster/level3.py: -------------------------------------------------------------------------------- 1 | # pip install ctranslate2==4.0.0 2 | from .utils import track_time 3 | from transformers import AutoTokenizer 4 | from ctranslate2 import Generator 5 | 6 | tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") 7 | model = Generator("models/TinyLlama-1.1B-Chat-v1.0-ctrans", device="cuda") 8 | 9 | messages = [ 10 | { 11 | "role": "system", 12 | "content": "You are a friendly chatbot who is always helpful.", 13 | }, 14 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 15 | ] 16 | 17 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False) 18 | 19 | input_tokens = [tokenizer.tokenize(prompt)] * 256 20 | 21 | with track_time(input_tokens): 22 | outputs = model.generate_batch(input_tokens) 23 | 24 | results_ids = [res.sequences_ids[0] for res in outputs] 25 | outputs = tokenizer.batch_decode(results_ids, skip_special_tokens=True) 26 | 27 | print(outputs[100]) 28 | 29 | # latency: 1.3013768196105957s 30 | # throughput: 31.77 inputs/s 31 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level4.py: -------------------------------------------------------------------------------- 1 | # !pip install vllm==0.3.3 2 | from .utils import track_time 3 | from vllm import LLM, SamplingParams 4 | 5 | llm = LLM(model="models/TinyLlama-1.1B-Chat-v1.0") 6 | tokenizer = llm.get_tokenizer() 7 | 8 | 9 | messages = [ 10 | { 11 | "role": "system", 12 | "content": "You are a friendly chatbot who is always helpful.", 13 | }, 14 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 15 | ] 16 | 17 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False) 18 | 19 | 20 | sampling = SamplingParams(max_tokens=256, seed=42, temperature=0) 21 | 22 | prompts = [prompt] * 1024 23 | 24 | with track_time(prompts): 25 | outputs = llm.generate(prompts, sampling) 26 | 27 | results = [output.outputs[0].text for output in outputs] 28 | 29 | print(results[1000]) 30 | 31 | # latency: 0.7040367126464844s 32 | # throughput: 68.22 inputs/s 33 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level_5/consume_results.py: -------------------------------------------------------------------------------- 1 | from src.level_five.rabbit import RabbitBuffer 2 | 3 | buffer = RabbitBuffer("llama-results") 4 | 5 | results = buffer.consume(10_000) 6 | len(results) 7 | 8 | print(results[9000].decode()) 9 | -------------------------------------------------------------------------------- /sections/7. 
Scheduling and Running Jobs on a Cluster/level_5/produce_prompts.py: -------------------------------------------------------------------------------- 1 | from src.level_five.rabbit import RabbitBuffer 2 | from transformers import AutoTokenizer 3 | 4 | messages = [ 5 | { 6 | "role": "system", 7 | "content": "You are a friendly chatbot who is always helpful.", 8 | }, 9 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 10 | ] 11 | tokenizer = AutoTokenizer.from_pretrained("models/TinyLlama-1.1B-Chat-v1.0") 12 | 13 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False) 14 | 15 | 16 | buffer = RabbitBuffer("llama-queue") 17 | buffer.produce([prompt] * 100_000) 18 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level_5/rabbit.py: -------------------------------------------------------------------------------- 1 | # pip install pika 2 | import pika 3 | 4 | RABBIT_USER = "rabbitmq_user" 5 | RABBIT_PASS = "pass" 6 | 7 | HEAD_IP = "your_head_node_ip" 8 | RABBIT_PORT = 5672 9 | 10 | 11 | class RabbitBuffer: 12 | def __init__(self, queue_name: str) -> None: 13 | self.queue_name = queue_name 14 | 15 | self.credentials = pika.PlainCredentials(RABBIT_USER, RABBIT_PASS) 16 | 17 | self.connection = pika.BlockingConnection(pika.ConnectionParameters(HEAD_IP, RABBIT_PORT, "/", self.credentials)) 18 | 19 | self.channel = self.connection.channel() 20 | self.queue = self.channel.queue_declare(queue=self.queue_name, durable=True) 21 | 22 | def produce(self, messages: list[str]): 23 | for message in messages: 24 | self.channel.basic_publish( 25 | exchange="", 26 | routing_key=self.queue_name, 27 | body=message, 28 | properties=pika.BasicProperties(delivery_mode=2), # make messages persistent 29 | ) 30 | 31 | def consume(self, num_messages: int): 32 | messages = [] 33 | for _ in range(num_messages): 34 | method_frame, header_frame, body = self.channel.basic_get(queue=self.queue_name) 35 | if method_frame: 36 | messages.append(body) 37 | self.channel.basic_ack(method_frame.delivery_tag) 38 | return messages 39 | -------------------------------------------------------------------------------- /sections/7. 
Scheduling and Running Jobs on a Cluster/level_5/ray_batch_job.py: -------------------------------------------------------------------------------- 1 | # pip install -U "ray[default]" 2 | import ray 3 | from rabbit import RabbitBuffer 4 | 5 | ray.init(address="auto") 6 | from vllm import LLM, SamplingParams 7 | 8 | 9 | @ray.remote 10 | def predict_batch(): 11 | buffer = RabbitBuffer("llama-queue") 12 | 13 | messages = buffer.consume(5000) 14 | prompts = [m.decode() for m in messages] 15 | 16 | sampling = SamplingParams(max_tokens=256, seed=42, temperature=0) 17 | llm = LLM(model="/root/ml-deployment/models/TinyLlama-1.1B-Chat-v1.0") 18 | 19 | outputs = llm.generate(prompts, sampling) 20 | 21 | results = [output.outputs[0].text for output in outputs] 22 | 23 | result_buffer = RabbitBuffer("llama-results") 24 | result_buffer.produce(results) 25 | 26 | return results 27 | 28 | 29 | if __name__ == "__main__": 30 | future = predict_batch.options(num_gpus=1, num_cpus=1).remote() 31 | ray.get(future) 32 | ray.shutdown() 33 | 34 | # submit command 35 | # ray job submit --submission-id llamma-batch1 --working-dir src/level_five/ -- python ray_batch_job.py 36 | 37 | # throughput: 111 inputs/s 38 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import time 3 | 4 | @contextmanager 5 | def track_time(inputs: list = None): 6 | start = time.time() # Record the start time 7 | yield # Pass control back to the context block 8 | duration = time.time() - start # Calculate the duration 9 | 10 | if inputs is None: 11 | print(f"Execution time: {duration:.2f} seconds") 12 | else: 13 | print(f"Took {duration:.2f} seconds to process {len(inputs)} inputs") --------------------------------------------------------------------------------
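Note on local model artifacts: level3.py, level4.py, and the level_5 scripts load TinyLlama from local paths ("models/TinyLlama-1.1B-Chat-v1.0" and "models/TinyLlama-1.1B-Chat-v1.0-ctrans") that no script in this listing creates. The sketch below shows one way those directories could be produced, reusing the save_pretrained / TransformersConverter pattern from 5.4_understanding_quantization.py. It is not part of the original repository; the helper file name and the float16 quantization choice are assumptions.

# prepare_tinyllama.py (hypothetical helper, not in the original repo)
# Downloads TinyLlama, saves it locally for level4.py / level_5, and converts it
# to CTranslate2 with float16 quantization for level3.py.
from transformers import AutoModelForCausalLM, AutoTokenizer
from ctranslate2.converters import TransformersConverter

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LOCAL_DIR = "models/TinyLlama-1.1B-Chat-v1.0"        # path expected by level4.py and level_5 scripts
QUANT_DIR = "models/TinyLlama-1.1B-Chat-v1.0-ctrans" # path expected by level3.py

# Save the Hugging Face model and tokenizer to a local directory
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model.save_pretrained(LOCAL_DIR)
tokenizer.save_pretrained(LOCAL_DIR)

# Convert the saved model to CTranslate2 format with float16 weights,
# mirroring the conversion in 5.4_understanding_quantization.py
converter = TransformersConverter(LOCAL_DIR)
converter.convert(output_dir=QUANT_DIR, quantization="float16")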