├── LICENSE └── sections ├── 4. Model Management ├── 4.4_mlflow_setup_readme.md ├── 4.5_getting_started.ipynb ├── 4.6_training_loop.ipynb ├── 4.7_mlflow_inference.ipynb └── 4.8_mlflow_authentication.py ├── 5. Advanced Model Deployment Techniques ├── 5.2_batching_and_dynamic_batching.ipynb ├── 5.2_batching_and_dynamic_batching.py ├── 5.3_the_role_of_sorting_batches.ipynb ├── 5.3_the_role_of_sorting_batches.py ├── 5.4_understanding_quantization.ipynb └── 5.4_understanding_quantization.py └── 7. Scheduling and Running Jobs on a Cluster ├── level1.py ├── level2.py ├── level3.py ├── level4.py ├── level_5 ├── consume_results.py ├── produce_prompts.py ├── rabbit.py └── ray_batch_job.py └── utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /sections/4. Model Management/4.4_mlflow_setup_readme.md: -------------------------------------------------------------------------------- 1 | # Section 4 - Model Management & ML-Ops 2 | 3 | 4 | Below is a guide and Bash commands to set up MLflow on an Ubuntu system. This setup includes installing MLflow, setting up a backend store for experiments and runs, and launching the MLflow UI. 
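For orientation, here is a condensed sketch of the flow the guide walks through: create an isolated Python environment, install MLflow, and launch a tracking server backed by SQLite with a local artifact store. The version pin matches the course; the use of Python's built-in `venv` (instead of the `virtualenv` tool used in the steps below) and the exact paths are assumptions for illustration only.

```bash
# Minimal sketch, assuming a fresh Ubuntu machine and local-only access
python3 -m venv ~/mlflow_server/mlflow_env
source ~/mlflow_server/mlflow_env/bin/activate
pip install mlflow==2.7.1

# Backend store (SQLite) and artifact store live in local directories
mkdir -p ~/mlflow_server/metrics_store ~/mlflow_server/artifact_store

# Launch the tracking server; the UI becomes available at http://127.0.0.1:5000
mlflow server \
  --backend-store-uri sqlite:///$HOME/mlflow_server/metrics_store/mlflow.db \
  --default-artifact-root ~/mlflow_server/artifact_store \
  --host 127.0.0.1 --port 5000
```

The detailed steps below cover the same ground, plus the optional dedicated `mlflow` user, a MySQL backend, and remote access to the UI.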
5 | 6 | ### MLflow Setup Guide for Ubuntu 7 | 8 | #### Prerequisites: 9 | - Python 3.6 or higher 10 | - Pip (Python package installer) 11 | - Ubuntu system (or a similar Linux distribution) 12 | 13 | 14 | 15 | 16 | ### Step 1: Create custom user (optional) 17 | 18 | Setting up a custom user for MLflow and a dedicated Python environment is a good practice, especially for ensuring that the MLflow service runs securely and isolated from other system processes. Here's how you can set it up on an Ubuntu system: 19 | 20 | 1. **Create the User**: 21 | Open a terminal and run the following command to create a new user called `mlflow`. 22 | ```bash 23 | sudo adduser mlflow 24 | ``` 25 | 26 | 2. **Grant Sudo Privileges (Optional)**: 27 | If this user needs to perform administrative tasks, you can grant it sudo privileges. Otherwise, you can skip this step. 28 | ```bash 29 | sudo usermod -aG sudo mlflow 30 | ``` 31 | 32 | ### Step 2: Install Python and Create an Environment 33 | 34 | 1. **Switch to the mlflow User**: 35 | Switch to the new user account. 36 | ```bash 37 | su - mlflow 38 | ``` 39 | 40 | 2. **Install Python3 and Pip**: 41 | Ensure Python3 and Pip are installed. Most Ubuntu versions come with Python3 by default, but you might need to install pip. 42 | ```bash 43 | sudo apt update 44 | sudo apt install python3 python3-pip 45 | ``` 46 | 47 | 3. **Install Virtualenv**: 48 | Virtualenv is a tool to create isolated Python environments. 49 | ```bash 50 | pip3 install virtualenv 51 | ``` 52 | 53 | 4. **Create a Virtual Environment**: 54 | Create a new directory for the MLflow server and navigate into it. Then create a virtual environment. 55 | ```bash 56 | mkdir ~/mlflow_server 57 | cd ~/mlflow_server 58 | virtualenv mlflow_env 59 | ``` 60 | 61 | 5. **Activate the Virtual Environment**: 62 | Before installing MLflow and other dependencies, activate the virtual environment. 63 | ```bash 64 | source mlflow_env/bin/activate 65 | ``` 66 | 67 | ### Step 3: Install MLflow 68 | With the virtual environment activated, install MLflow. If you want to ensure compatibility, use the same versions I use in the course. If you'd like to use the latest, make sure to use matching versions for the other libraries. 69 | 70 | ```bash 71 | pip install mlflow==2.7.1 72 | ``` 73 | 74 | 75 | ### Step 4: Install Backend Store (Optional) 76 | MLflow uses a tracking server to log experiment data. By default, it logs to the local filesystem, but for more robust use, you may want to set up a database like MySQL or SQLite. 77 | 78 | **For SQLite (Simpler Option):** 79 | - SQLite comes pre-installed on many systems, including Ubuntu. 
80 | - Decide on a directory where you want your SQLite database to reside 81 | ```bash 82 | cd ~/mlflow_server 83 | mkdir metrics_store 84 | ``` 85 | 86 | **For MySQL:** 87 | - Install MySQL Server: 88 | ```bash 89 | sudo apt update 90 | sudo apt install mysql-server 91 | ``` 92 | - Secure your installation and set up your user (follow the prompt after the command): 93 | ```bash 94 | sudo mysql_secure_installation 95 | ``` 96 | - Log into MySQL to create a database for MLflow: 97 | ```bash 98 | sudo mysql -u root -p 99 | ``` 100 | - Once inside MySQL, create a database: 101 | ```mysql 102 | CREATE DATABASE mlflow_db; 103 | EXIT; 104 | ``` 105 | 106 | ### Step 5: Set Backend Store for MLflow 107 | - **For SQLite**, you'll use a URI like: `sqlite:////home/mlflow/mlflow_server/metrics_store/mlflow.db` 108 | - **For MySQL**, the URI will be: `mysql://<username>:<password>@localhost/mlflow_db` 109 | 110 | 111 | ### Step 6: Install Artifact Store 112 | The artifact store is where MLflow saves model artifacts like models and plots. You can use S3, Azure Blob Storage, Google Cloud Storage, or even a shared filesystem. 113 | 114 | - **For local storage (simplest for getting started)**, use a local directory. 115 | ```bash 116 | cd ~/mlflow_server 117 | mkdir artifact_store 118 | ``` 119 | 120 | 121 | 122 | 123 | ### Step 7: Launch MLflow Tracking Server 124 | Open a terminal and run the following command, replacing the URIs with your chosen backend and artifact store paths: 125 | 126 | ```bash 127 | mlflow server --backend-store-uri sqlite:////home/mlflow/mlflow_server/metrics_store/mlflow.db --default-artifact-root /home/mlflow/mlflow_server/artifact_store 128 | ``` 129 | 130 | Replace the SQLite URI with your MySQL URI if you're using MySQL, and adjust `--default-artifact-root` to the path where you want artifacts stored. 131 | 132 | ### Step 8: Accessing the MLflow UI 133 | - Once the tracking server is running, it will display a URL, typically `http://127.0.0.1:5000`. Open this URL in a web browser to access the MLflow UI. 134 | - You can now navigate the UI to see your experiments, runs, metrics, and artifacts. If the server is running on a remote machine, first forward port 5000 to your local machine over SSH (replace `remote` with your SSH host): 135 | ```bash 136 | ssh -L 5000:localhost:5000 remote 137 | ``` 138 | 139 | 140 | #### Additional Tips: 141 | - **Service**: For a more permanent setup, you might want to set up MLflow to run as a service or use a process manager like `supervisor` to manage the server process. 142 | - **Security**: If you're setting this up on a cloud server or an exposed machine, ensure you configure proper security settings, including firewalls and authentication for the MLflow server. 143 | 144 | ### Conclusion 145 | You now have MLflow set up on your Ubuntu system with a backend store for tracking experiments and an artifact store for saving model artifacts. You can start running experiments and tracking them using the MLflow Python library, and all your experiment details will be accessible through the MLflow UI. -------------------------------------------------------------------------------- /sections/4. Model Management/4.5_getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## **1. 
Setting Up MLflow**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import mlflow\n", 17 | "\n", 18 | "# Set the tracking URI for MLflow to the local server\n", 19 | "mlflow.set_tracking_uri(\"http://localhost:5000\")" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "- **What is MLflow?**: MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It includes tools for tracking experiments, packaging code into reproducible runs, and sharing and deploying models.\n", 27 | "- **Setting up MLflow**: The first step in using MLflow is to set up the tracking server, where all the experiment data will be stored. ```mlflow.set_tracking_uri(\"http://localhost:5000\")``` sets the tracking URI to a local server (running on localhost at port 5000). This means all the data from your experiments will be sent to this server for tracking and storage.\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## **2. Creating and Managing Experiments**" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Creating a new experiment\n", 44 | "experiment_id = mlflow.create_experiment(\"My New Experiment\")\n", 45 | "\n", 46 | "# Starting a new run using a context manager\n", 47 | "with mlflow.start_run(experiment_id=experiment_id):\n", 48 | "    # Your ML code goes here\n", 49 | "    pass\n", 50 | "\n", 51 | "\n", 52 | "# Manually creating a custom named run\n", 53 | "run = mlflow.start_run(experiment_id=experiment_id, run_name=\"First run\")\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "- **Creating Experiments**: `mlflow.create_experiment(\"My New Experiment\")` creates a new experiment in MLflow. An experiment is a way to organize and keep track of your machine learning runs. Each experiment contains multiple runs.\n", 61 | "- **Starting Runs**: A \"run\" is a single execution of your machine learning code. MLflow allows you to start a run using two methods:\n", 62 | "  - **Context Manager**: The `with mlflow.start_run()` syntax automatically starts and ends a run. This is useful as it ensures the run is closed properly after the code block is executed.\n", 63 | "  - **Manual Management**: You can also start and end a run manually using `mlflow.start_run()` and `mlflow.end_run()`. This method gives you more control over when the run starts and ends." 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## **3. Logging Parameters**" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Logging multiple parameters\n", 80 | "mlflow.log_param(\"learning_rate\", 0.01)\n", 81 | "mlflow.log_param(\"batch_size\", 32)\n", 82 | "num_epochs = 10\n", 83 | "mlflow.log_param(\"num_epochs\", num_epochs)\n" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "\n", 91 | "- **Purpose of Logging Parameters**: Parameters are the configuration settings used for your machine learning model. 
Logging them helps you keep track of which settings were used in each run, which is crucial for experiment reproducibility and comparison.\n", 92 | "- **How it Works**: The `mlflow.log_param` function logs parameters like learning rate, batch size, and number of epochs. These parameters are then visible in the MLflow UI, allowing you to see how different configurations affect model performance." 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## **4. Logging Metrics**" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# !pip install numpy\n", 109 | "import numpy as np\n", 110 | "\n", 111 | "# Logging metrics for each epoch\n", 112 | "for epoch in range(num_epochs):\n", 113 | " mlflow.log_metric(\"accuracy\", np.random.random(), step=epoch)\n", 114 | " mlflow.log_metric(\"loss\", np.random.random(), step=epoch)\n", 115 | "\n", 116 | "# Logging a time-series metric\n", 117 | "for t in range(100):\n", 118 | " metric_value = np.sin(t * np.pi / 50)\n", 119 | " mlflow.log_metric(\"time_series_metric\", metric_value, step=t)\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "\n", 127 | "\n", 128 | "- **Metrics in Machine Learning**: Metrics are values that measure the performance of your model. Common metrics include accuracy and loss.\n", 129 | "- **Logging Metrics with MLflow**: `mlflow.log_metric` allows you to log these metrics during your training process. This is often done for each epoch (a single pass through the entire dataset), or step (a pass of a batch of data) to track how the model improves over time.\n", 130 | "- **Time-Series Metrics**: Besides typical metrics, you can also log custom metrics. In this example, a time-series metric based on a sine function is logged. This demonstrates how you can track any metric over time, which can be useful for more complex analyses.\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## **5. 
Logging Data and Artefacts**" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# Logging datasets\n", 147 | "with open(\"data/dataset.csv\", \"w\") as f:\n", 148 | " f.write(\"x,y\\n\")\n", 149 | " for x in range(100):\n", 150 | " f.write(f\"{x},{x * 2}\\n\")\n", 151 | "\n", 152 | "mlflow.log_artifact(\"data/dataset.csv\", \"data\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "### Exploring different types of artifacts" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# !pip install plotly pandas\n", 169 | "import pandas as pd\n", 170 | "import plotly.express as px\n", 171 | "\n", 172 | "# Generate a confusion matrix\n", 173 | "confusion_matrix = np.random.randint(0, 100, size=(5, 5)) # 5x5 matrix\n", 174 | "\n", 175 | "labels = [\"Class A\", \"Class B\", \"Class C\", \"Class D\", \"Class E\"]\n", 176 | "df_cm = pd.DataFrame(confusion_matrix, index=labels, columns=labels)\n", 177 | "\n", 178 | "# Plot confusion matrix using Plotly Express\n", 179 | "fig = px.imshow(df_cm, text_auto=True, labels=dict(x=\"Predicted Label\", y=\"True Label\"), x=labels, y=labels, title=\"Confusion Matrix\")\n", 180 | "\n", 181 | "# Save the figure as an HTML file\n", 182 | "html_file = \"confusion_matrix.html\"\n", 183 | "fig.write_html(html_file)\n", 184 | "\n", 185 | "# Log the HTML file with MLflow\n", 186 | "mlflow.log_artifact(html_file)\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "\n", 194 | "- **What are Artifacts?**: In MLflow, an artifact is any file or data that you want to log along with your run. This can include datasets, models, images, or even custom files.\n", 195 | "- **Logging Artifacts**: The `mlflow.log_artifact` function allows you to log these artifacts. In this example, a dataset and a confusion matrix (saved as an HTML file) are logged. Logging artifacts helps in ensuring that all relevant data and outputs are stored and easily accessible for each run." 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## **6. Logging Models**" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# !pip install transformers\n", 212 | "from transformers import AutoModelForSeq2SeqLM\n", 213 | "\n", 214 | "# Initialize a model from Hugging Face Transformers\n", 215 | "model = AutoModelForSeq2SeqLM.from_pretrained(\"TheFuzzyScientist/T5-base_Amazon-product-reviews\")\n", 216 | "\n", 217 | "\n", 218 | "# Log the model in MLflow\n", 219 | "mlflow.pytorch.log_model(model, \"transformer_model\")\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "- **Importance of Logging Models**: Keeping track of the models used in different runs is critical. It helps in model comparison, versioning, and deployment.\n", 227 | "- **How to Log Models**: MLflow provides functions to log models from various machine learning frameworks. In this case, `mlflow.pytorch.log_model` is used to log a PyTorch model. 
This function saves the model in a format that can be easily reloaded for future predictions or analysis.\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## **7. Ending the Run**" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# End run\n", 244 | "mlflow.end_run()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "- **What Does Ending a Run Mean?**: In MLflow, ending a run signifies the completion of a specific machine learning experiment or process. It marks the point where you have finished logging parameters, metrics, and artifacts for that particular execution of your model or script.\n", 252 | "\n", 253 | "- **Why is it Important?**: It helps in keeping your experiments organized. Each run is a separate record in MLflow. By ending a run, you ensure that all the data logged after this point will be part of a new run, keeping your experiment's data clean and segregated.\n", 254 | "\n", 255 | "- **How to End a Run**: You can end a run using `mlflow.end_run()`. This method is particularly important when you start a run without using a context manager (the `with` statement). With a context manager, the run is automatically ended when you exit the block of code inside the `with` statement. However, if you start a run manually using `mlflow.start_run()`, you should always ensure to call `mlflow.end_run()` once all logging is done.\n" 256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": "dev", 262 | "language": "python", 263 | "name": "python3" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "file_extension": ".py", 271 | "mimetype": "text/x-python", 272 | "name": "python", 273 | "nbconvert_exporter": "python", 274 | "pygments_lexer": "ipython3", 275 | "version": "3.10.12" 276 | } 277 | }, 278 | "nbformat": 4, 279 | "nbformat_minor": 2 280 | } 281 | -------------------------------------------------------------------------------- /sections/4. Model Management/4.6_training_loop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "## MLflow Integration for Model Training and Tracking\n", 9 | "\n", 10 | "In this notebook, we're integrating MLflow into a machine learning workflow to track and manage experiments effectively. 
We're focusing on a text classification task using the DistilBert model, emphasizing the importance of experiment tracking, model management, and operational efficiency - core themes of our course.\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Objective:\n", 18 | "\n", 19 | "- Dynamically set up and log parameters to MLflow\n", 20 | "- Understand the purpose and application of each step in the context of MLflow and MLOps principles\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Environment Setup\n", 28 | "\n", 29 | "Ensure all necessary libraries are installed and imported for our workflow.\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "\n", 37 | "### Imports\n", 38 | "\n", 39 | "Import the necessary libraries, focusing on MLflow for tracking, PyTorch for model training, and Transformers for our NLP model." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# !pip install datasets\n", 49 | "import os\n", 50 | "import mlflow\n", 51 | "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n", 52 | "import torch\n", 53 | "from tqdm import tqdm\n", 54 | "from torch.utils.data import DataLoader\n", 55 | "from datasets import load_dataset\n", 56 | "from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, AdamW\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Configuration Parameters as an Object\n", 64 | "\n", 65 | "By defining parameters as a dictionary, we can easily iterate through them when logging to MLflow. This method streamlines the process and adheres to best practices in code maintainability and scalability.\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "params = {\n", 75 | " 'model_name': 'distilbert-base-cased',\n", 76 | " 'learning_rate': 5e-5,\n", 77 | " 'batch_size': 16,\n", 78 | " 'num_epochs': 1,\n", 79 | " 'dataset_name': 'ag_news',\n", 80 | " 'task_name': 'sequence_classification',\n", 81 | " 'log_steps': 100,\n", 82 | " 'max_seq_length': 128,\n", 83 | " 'output_dir': 'models/distilbert-base-uncased-ag_news',\n", 84 | "}" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "\n", 92 | "### MLflow Setup\n", 93 | "\n", 94 | "Setting up MLflow is crucial for tracking our experiments, parameters, and results, allowing us to manage and compare different runs effectively - a practice that aligns with the MLOps goal of systematic and efficient model management." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "mlflow.set_tracking_uri(\"http://localhost:5005\")\n", 104 | "mlflow.set_experiment(f\"{params['task_name']}\")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Load and Preprocess Dataset\n", 112 | "\n", 113 | "We're using a well-known NLP dataset to ensure reproducibility and comparability. The preprocessing step is crucial for converting raw text into a format that our model can understand, highlighting the importance of data preparation in the ML pipeline." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# Load and preprocess dataset\n", 123 | "dataset = load_dataset(params['dataset_name'], params['task_name'])\n", 124 | "tokenizer = DistilBertTokenizer.from_pretrained(params['model_name'])\n", 125 | "\n", 126 | "def tokenize(batch):\n", 127 | " return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=params['max_seq_length'])\n", 128 | "\n", 129 | "\n", 130 | "train_dataset = dataset[\"train\"].shuffle().select(range(20_000)).map(tokenize, batched=True)\n", 131 | "test_dataset = dataset[\"test\"].shuffle().select(range(2_000)).map(tokenize, batched=True)\n", 132 | "\n", 133 | "# Set format for PyTorch and create data loaders\n", 134 | "train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])\n", 135 | "test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])\n", 136 | "\n", 137 | "train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)\n", 138 | "test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], shuffle=False)\n", 139 | "\n", 140 | "# get the labels\n", 141 | "labels = dataset[\"train\"].features['label'].names" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "\n", 149 | "### Model Initialization\n", 150 | "\n", 151 | "Initializing the model is a foundational step, showcasing the practical application of a pre-trained NLP model for a specific task - reflecting the course's focus on real-world applicability of machine learning models." 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "model = DistilBertForSequenceClassification.from_pretrained(params['model_name'], num_labels=len(labels))\n", 161 | "model.config.id2label = {i: label for i, label in enumerate(labels)}\n", 162 | "params['id2label'] = model.config.id2label\n", 163 | "\n", 164 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 165 | "model.to(device)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Optimizer Setup\n", 173 | "\n", 174 | "Choosing the right optimizer and learning rate is vital for effective model training. It demonstrates the importance of hyperparameter tuning, a key concept in achieving optimal model performance." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "optimizer = AdamW(model.parameters(), lr=params['learning_rate'])" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Evaluation Function\n", 191 | "\n", 192 | "Evaluating the model on a separate test set helps us understand its performance on unseen data, highlighting the concept of generalization which is crucial for real-world applications." 
193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "def evaluate_model(model, dataloader, device):\n", 202 | " model.eval() # Set model to evaluation mode\n", 203 | " predictions, true_labels = [], []\n", 204 | "\n", 205 | " with torch.no_grad():\n", 206 | " for batch in dataloader:\n", 207 | " inputs, masks, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)\n", 208 | "\n", 209 | " # Forward pass, calculate logit predictions\n", 210 | " outputs = model(inputs, attention_mask=masks)\n", 211 | " logits = outputs.logits\n", 212 | " _, predicted_labels = torch.max(logits, dim=1)\n", 213 | "\n", 214 | " predictions.extend(predicted_labels.cpu().numpy())\n", 215 | " true_labels.extend(labels.cpu().numpy())\n", 216 | "\n", 217 | " # Calculate Evaluation Metrics\n", 218 | " accuracy = accuracy_score(true_labels, predictions)\n", 219 | " precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')\n", 220 | "\n", 221 | " return accuracy, precision, recall, f1\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Training Loop\n", 229 | "\n", 230 | "The training loop is where the actual model training happens. Logging metrics and parameters at each step is crucial for tracking the model's progress, understanding its behavior, and making informed decisions - core aspects of the MLOps lifecycle." 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Start MLflow Run\n", 240 | "with mlflow.start_run(run_name=f\"{params['model_name']}-{params['dataset_name']}\") as run:\n", 241 | "\n", 242 | " # Log all parameters at once\n", 243 | " mlflow.log_params(params)\n", 244 | "\n", 245 | " with tqdm(total=params['num_epochs'] * len(train_loader), desc=f\"Epoch [1/{params['num_epochs']}] - (Loss: N/A) - Steps\") as pbar:\n", 246 | " for epoch in range(params['num_epochs']):\n", 247 | " running_loss = 0.0\n", 248 | " for i, batch in enumerate(train_loader, 0):\n", 249 | " inputs, masks, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device)\n", 250 | "\n", 251 | " optimizer.zero_grad()\n", 252 | " outputs = model(inputs, attention_mask=masks, labels=labels)\n", 253 | " loss = outputs.loss\n", 254 | " loss.backward()\n", 255 | " optimizer.step()\n", 256 | "\n", 257 | " running_loss += loss.item()\n", 258 | " if i and i % params['log_steps'] == 0:\n", 259 | " avg_loss = running_loss / params['log_steps']\n", 260 | "\n", 261 | " pbar.set_description(f\"Epoch [{epoch + 1}/{params['num_epochs']}] - (Loss: {avg_loss:.3f}) - Steps\")\n", 262 | " mlflow.log_metric(\"loss\", avg_loss, step=epoch * len(train_loader) + i)\n", 263 | " \n", 264 | " running_loss = 0.0\n", 265 | " pbar.update(1)\n", 266 | "\n", 267 | " # Evaluate Model\n", 268 | " accuracy, precision, recall, f1 = evaluate_model(model, test_loader, device)\n", 269 | " print(f\"Epoch {epoch + 1} Metrics: Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}\")\n", 270 | "\n", 271 | " # Log metrics to MLflow\n", 272 | " mlflow.log_metrics({'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}, step=epoch)\n", 273 | "\n", 274 | "\n", 275 | " # Log model to MLflow through built-in PyTorch method\n", 276 | " # 
mlflow.pytorch.log_model(model, \"model\")\n", 277 | "\n", 278 | " # Log model to MLflow through custom method\n", 279 | " os.makedirs(params['output_dir'], exist_ok=True)\n", 280 | " model.save_pretrained(params['output_dir'])\n", 281 | " tokenizer.save_pretrained(params['output_dir'])\n", 282 | "\n", 283 | " mlflow.log_artifacts(params['output_dir'], artifact_path=\"model\")\n", 284 | "\n", 285 | " model_uri = f\"runs:/{run.info.run_id}/model\"\n", 286 | " mlflow.register_model(model_uri, \"agnews-transformer\")\n", 287 | "\n", 288 | "print('Finished Training')" 289 | ] 290 | } 291 | ], 292 | "metadata": { 293 | "kernelspec": { 294 | "display_name": "dev", 295 | "language": "python", 296 | "name": "python3" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.10.12" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 2 313 | } 314 | -------------------------------------------------------------------------------- /sections/4. Model Management/4.7_mlflow_inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## MLflow Integration for Model Serving and Registry Management\n", 8 | "\n", 9 | "In this notebook, we delve into advanced aspects of MLflow, focusing on model serving, inference, and the management of model versions in the MLflow model registry. Our goal is to demonstrate how MLflow supports the operational phase of the machine learning lifecycle, which includes serving models for inference and efficiently managing multiple versions of models.\n", 10 | "\n", 11 | "We will explore the practical application of these concepts using a text classification model. This will include loading models for inference, performing predictions, managing different versions of models, and understanding how to transition models through various stages in the model lifecycle. These skills are essential for operational efficiency and effective model management in real-world machine learning applications, aligning with the core themes of our course on MLops and experiment tracking.\n", 12 | "\n", 13 | "\n", 14 | "### Objective:\n", 15 | "* Loading and Serving Models\n", 16 | "* Inference with the Model\n", 17 | "* Managing Model Versions\n", 18 | "* Deleting Models and Versions" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Environment Setup\n", 26 | "\n", 27 | "Ensure all necessary libraries are installed and imported for our workflow." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "#!pip install mlflow torch transformers" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Imports\n", 44 | "\n", 45 | "Import necessary libraries focusing on MLflow for model retrieval, PyTorch for model operations, and Transformers for data processing." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import mlflow\n", 55 | "import torch\n", 56 | "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n", 57 | "import os" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### Connect to Mlflow Server" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Set MLflow tracking URI\n", 74 | "mlflow.set_tracking_uri(\"http://localhost:5000\")\n", 75 | "client = mlflow.tracking.MlflowClient()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Retrieve the Model from MLflow\n", 83 | "\n", 84 | "In this step, we'll explore two methods to retrieve our trained model from MLflow. Understanding the nuances of each method is key to making an informed choice in a real-life scenario based on the requirements and constraints of your deployment environment.\n", 85 | "\n", 86 | "#### Method 1: Using the Built-in PyTorch Loader\n", 87 | "\n", 88 | "This method is straightforward and uses MLflow's built-in functionality to load PyTorch models. It's user-friendly and works well when you're working within a PyTorch-centric workflow.\n" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# Load a specific model version\n", 98 | "model_name = \"agnews_pt_classifier\"\n", 99 | "model_version = \"1\" # or \"production\", \"staging\"\n", 100 | "\n", 101 | "\n", 102 | "model_uri = f\"models:/{model_name}/{model_version}\"\n", 103 | "model = mlflow.pytorch.load_model(model_uri)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Performing Inference\n", 111 | "\n", 112 | "Here, we define the `predict` function to perform inference using the loaded model. This function takes a list of texts, tokenizes them using a pre-trained tokenizer, and then feeds them into the model. The output is the model's prediction, which can be used for various applications such as text classification, sentiment analysis, etc. 
This step is crucial in demonstrating how a trained model can be utilized for practical applications.\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "\n", 122 | "def predict(texts, model, tokenizer):\n", 123 | "    # Tokenize the texts\n", 124 | "    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\").to(model.device)\n", 125 | "\n", 126 | "    # Pass the inputs to the model\n", 127 | "    with torch.no_grad():\n", 128 | "        outputs = model(**inputs)\n", 129 | "        predictions = torch.argmax(outputs.logits, dim=-1)\n", 130 | "\n", 131 | "    # Convert predictions to text labels\n", 132 | "    predictions = predictions.cpu().numpy()\n", 133 | "    predictions = [model.config.id2label[prediction] for prediction in predictions]\n", 134 | "\n", 135 | "    # Return the predicted labels\n", 136 | "    return predictions\n" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# Sample texts to predict\n", 146 | "texts = [\n", 147 | "    \"The local high school soccer team triumphed in the state championship, securing victory with a last-second winning goal.\",\n", 148 | "    \"DataCore is set to acquire startup InnovateAI for $2 billion, aiming to enhance its position in the artificial intelligence market.\",\n", 149 | "]\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# The tokenizer needs to be loaded separately for this method\n", 159 | "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", 160 | "\n", 161 | "print(predict(texts, model, tokenizer))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "\n", 169 | "#### Method 2: Versatile Loading with Custom Handling\n", 170 | "\n", 171 | "This alternate method is more versatile and can handle different types of models. It's particularly useful when you're working with a variety of models or when the environment requires a more customized approach." 
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "\n", 181 | "# Load custom model\n", 182 | "model_name = \"agnews-transformer\"\n", 183 | "model_version = \"1\" # or \"production\", \"staging\"\n", 184 | "model_version_details = client.get_model_version(name=model_name, version=model_version)\n", 185 | "\n", 186 | "run_id = model_version_details.run_id\n", 187 | "artifact_path = model_version_details.source\n", 188 | "\n", 189 | "# Construct the model URI\n", 190 | "model_uri = f\"models:/{model_name}/{model_version}\"\n", 191 | "\n", 192 | "model_path = \"models/agnews_transformer\"\n", 193 | "os.makedirs(model_path, exist_ok=True)\n", 194 | "\n", 195 | "client.download_artifacts(run_id, artifact_path, dst_path=model_path)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# Load the model and tokenizer\n", 205 | "custom_model = AutoModelForSequenceClassification.from_pretrained(\"models/agnews_transformer/custom_model\")\n", 206 | "tokenizer = AutoTokenizer.from_pretrained(\"models/agnews_transformer/custom_model\")\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# Run inference with the custom-loaded model\n", 216 | "print(predict(texts, custom_model, tokenizer))" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Demonstrating Model Versioning with MLflow\n", 224 | "\n", 225 | "One of the powerful features of MLflow is its ability to manage multiple versions of models. In this section, we log new iterations of our model to showcase this versioning capability. By setting a new experiment and logging models under different run names, we effectively create multiple versions of the same model. This is a crucial aspect of MLOps, as it allows for tracking the evolution of models over time, comparing different iterations, and systematically managing the model lifecycle. We demonstrate this by logging two additional iterations of our model, tagged as \"iteration2\" and \"iteration3\".\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# Log some new models for versioning demonstration\n", 235 | "mlflow.set_experiment(\"sequence_classification\")\n", 236 | "\n", 237 | "# Log a new model as iteration 2\n", 238 | "with mlflow.start_run(run_name=\"iteration2\"):\n", 239 | "    mlflow.pytorch.log_model(model, \"model\")\n", 240 | "\n", 241 | "# Log another new model as iteration 3\n", 242 | "with mlflow.start_run(run_name=\"iteration3\"):\n", 243 | "    mlflow.pytorch.log_model(model, \"model\")\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## Managing Model Versions and Stages\n", 251 | "\n", 252 | "Here, we use the MLflow client to list every version registered under our model name and then transition a chosen version to the \"Production\" stage. Stages such as \"Staging\" and \"Production\" make it clear which version of a model should be served and which is still being evaluated. 
This step is crucial in demonstrating how a trained model can be utilized for practical applications.\n" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# Model version management\n", 262 | "model_versions = client.search_model_versions(f\"name='{model_name}'\")\n", 263 | "for version in model_versions:\n", 264 | " print(f\"Version: {version.version}, Stage: {version.current_stage}\")\n", 265 | "\n", 266 | "# Change model stage\n", 267 | "client.transition_model_version_stage(name=model_name, version=model_version, stage=\"Production\")\n" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## Cleaning Up: Deleting Models and Versions\n", 275 | "\n", 276 | "In some scenarios, you might need to delete specific model versions or even entire registered models from MLflow. This section covers how to perform these deletions. Note that this should be done cautiously, as it cannot be undone. This is particularly useful for maintaining a clean and efficient model registry by removing outdated or unused models and versions.\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# Delete a specific model version\n", 286 | "client.delete_model_version(name=model_name, version=model_version)\n", 287 | "\n", 288 | "# Delete the entire registered model\n", 289 | "client.delete_registered_model(name=model_name)\n" 290 | ] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "dev", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.10.12" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /sections/4. Model Management/4.8_mlflow_authentication.py: -------------------------------------------------------------------------------- 1 | from mlflow.server import get_app_client 2 | import os 3 | 4 | tracking_uri = "http://localhost:5000" 5 | auth_client = get_app_client("basic-auth", tracking_uri=tracking_uri) 6 | 7 | # Set username and password 8 | os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" 9 | os.environ["MLFLOW_TRACKING_PASSWORD"] = "password" 10 | 11 | # Change password 12 | auth_client.update_user_password(username="admin", password="zG*8!7p@TKmS") 13 | -------------------------------------------------------------------------------- /sections/5. 
Advanced Model Deployment Techniques/5.2_batching_and_dynamic_batching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Generation with Transformers and Dynamic Batching 🚀\n", 8 | "\n", 9 | "**Objectives:**\n", 10 | "- 📚 Learn to use pretrained models/tokenizers from Hugging Face.\n", 11 | "- ✍️ Generate text for prompts.\n", 12 | "- 🧑‍🔬 Explore batch and dynamic batch text generation.\n", 13 | "- 🏎 Optimize text generation efficiency.\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Setup and Imports 🛠\n", 21 | "\n", 22 | "**Imports:**\n", 23 | "- `transformers`: For models & tokenizers.\n", 24 | "- `datasets`: Easy data access.\n", 25 | "- `torch`: Tensor operations.\n", 26 | "- `tqdm`: Progress bars.\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 36 | "from datasets import load_dataset\n", 37 | "import torch\n", 38 | "from tqdm.auto import tqdm" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Loading the Model and Tokenizer 📦\n", 46 | "\n", 47 | "**Key Steps:**\n", 48 | "- Load model (`TheFuzzyScientist/diabloGPT_open-instruct`).\n", 49 | "- Load tokenizer (`microsoft/DialoGPT-medium`).\n", 50 | "- Set tokenizer padding to `eos_token`.\n", 51 | "- Enable GPU acceleration (`cuda`).\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "\n", 61 | "model = AutoModelForCausalLM.from_pretrained(\"TheFuzzyScientist/diabloGPT_open-instruct\").to(\"cuda\")\n", 62 | "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/DialoGPT-medium\", padding_side=\"left\")\n", 63 | "tokenizer.pad_token = tokenizer.eos_token\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Dataset Preparation 📈\n", 71 | "\n", 72 | "**Process:**\n", 73 | "- Use `hakurei/open-instruct-v1` dataset.\n", 74 | "- Convert to pandas DataFrame for easier handling.\n" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "\n", 84 | "dataset = load_dataset(\"hakurei/open-instruct-v1\", split=\"train\")\n", 85 | "dataset = dataset.to_pandas()\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Text Generation Functions ✒️\n", 93 | "\n", 94 | "**Functions:**\n", 95 | "- `generate_text`: Single prompt text generation.\n", 96 | "- `batch_generate_texts`: Batch prompt text generation for efficiency.\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "\n", 106 | "def generate_text(prompt):\n", 107 | " inputs = tokenizer.encode(prompt, return_tensors=\"pt\").to(model.device)\n", 108 | " outputs = model.generate(inputs, max_length=64)\n", 109 | " generated = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", 110 | "\n", 111 | " return generated[: generated.find(\".\") + 1]\n", 112 | "\n", 113 | "\n", 114 | "generate_text(\"What's the best way to cook chiken breast?\")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": 
[ 121 | "## Text Generation Demo 🎭\n", 122 | "\n", 123 | "**Activities:**\n", 124 | "- Generate text from a single prompt.\n", 125 | "- Generate texts in batches to observe efficiency.\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "\n", 135 | "def batch_generate_texts(prompts):\n", 136 | " inputs = tokenizer(prompts, return_tensors=\"pt\", padding=True).to(model.device)[\"input_ids\"]\n", 137 | " outputs = model.generate(inputs, max_length=64, pad_token_id=tokenizer.eos_token_id)\n", 138 | " generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", 139 | "\n", 140 | " return generated\n", 141 | "\n", 142 | "\n", 143 | "batch_generate_texts(dataset[\"instruction\"][:1].tolist())\n", 144 | "batch_generate_texts(dataset[\"instruction\"][:20].tolist())\n", 145 | "batch_generate_texts(dataset[\"instruction\"][:100].tolist())\n", 146 | "batch_generate_texts(dataset[\"instruction\"][:200].tolist())\n", 147 | "# batch_generate_texts(dataset[\"instruction\"].sample(200).tolist()) # this might crash\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## Advanced: Dynamic Batching for Efficiency ⚙️\n", 155 | "\n", 156 | "**Concepts:**\n", 157 | "- Implement dynamic batching for hardware optimization.\n", 158 | "- Utilize different batching techniques for performance improvement.\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "\n", 168 | "def batch_generate_tokens(tokens):\n", 169 | " outputs = model.generate(torch.stack(tokens), max_length=64, pad_token_id=tokenizer.eos_token_id)\n", 170 | "\n", 171 | " return tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", 172 | "\n", 173 | "\n", 174 | "def dynamic_batching(prompts, max_tokens, is_pretokenized=False):\n", 175 | " if not is_pretokenized:\n", 176 | " tokenized_texts = tokenizer(prompts, return_tensors=\"pt\", padding=True)[\"input_ids\"].to(model.device)\n", 177 | " else:\n", 178 | " tokenized_texts = prompts\n", 179 | "\n", 180 | " current_batch = []\n", 181 | " current_batch_size = 0\n", 182 | "\n", 183 | " for tokenized_text in tokenized_texts:\n", 184 | " if current_batch_size + len(tokenized_text) > max_tokens and current_batch:\n", 185 | " yield batch_generate_tokens(current_batch)\n", 186 | "\n", 187 | " current_batch, current_batch_size = [], 0\n", 188 | "\n", 189 | " current_batch.append(tokenized_text)\n", 190 | " current_batch_size += len(tokenized_text)\n", 191 | "\n", 192 | " # Process final batch\n", 193 | " if current_batch:\n", 194 | " yield batch_generate_tokens(current_batch)\n", 195 | " pass\n", 196 | "\n", 197 | "\n", 198 | "generator = dynamic_batching(dataset[\"instruction\"][:40].tolist() * 1000, 3200)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "## Applying Dynamic Batching and Measuring Performance 📊 ⏱\n", 206 | "\n", 207 | "**Steps:**\n", 208 | "- Apply dynamic batching on a large dataset.\n", 209 | "- Track performance and efficiency improvements.\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "\n", 219 | "from contextlib import contextmanager\n", 220 | "import time\n", 221 | "\n", 222 | "\n", 223 | "@contextmanager\n", 224 | "def track_time():\n", 225 | " start = 
time.time() # Record start time\n", 226 | " yield\n", 227 | " end = time.time() # Record end time\n", 228 | " print(f\"Execution time: {end - start} seconds\")\n", 229 | "\n", 230 | "\n", 231 | "with track_time():\n", 232 | " for batch_predictions in tqdm(generator):\n", 233 | " continue\n", 234 | "\n", 235 | "\n", 236 | "def sort_batches(prompts, max_tokens):\n", 237 | " tokenized_texts = tokenizer(prompts, padding=False)[\"input_ids\"]\n", 238 | " sorted_tokens = sorted(tokenized_texts, key=len)\n", 239 | "\n", 240 | " sorted_batches = {}\n", 241 | " for sorted_token in sorted_tokens:\n", 242 | " length = len(sorted_token)\n", 243 | " if length not in sorted_batches:\n", 244 | " sorted_batches[length] = []\n", 245 | "\n", 246 | " sorted_batches[length].append(sorted_token)\n", 247 | "\n", 248 | " for length, sorted_batch in sorted_batches.items():\n", 249 | " tensor_batch = torch.stack([torch.tensor(sorted_token) for sorted_token in sorted_batch]).to(model.device)\n", 250 | " for batch_prediction in dynamic_batching(tensor_batch, max_tokens=max_tokens, is_pretokenized=True):\n", 251 | " yield batch_prediction\n", 252 | "\n", 253 | "\n", 254 | "generator = sort_batches(dataset[\"instruction\"][:40].tolist() * 1000, 3200)\n", 255 | "\n", 256 | "with track_time():\n", 257 | " for batch_predictions in tqdm(generator):\n", 258 | " print(len(batch_predictions))\n" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "# Conclusion and Next Steps 🌈\n", 266 | "\n", 267 | "**Achievements:**\n", 268 | "- Mastered text generation with Transformers.\n", 269 | "- Learned about batch and dynamic batching efficiencies.\n", 270 | "\n", 271 | "**Explore Further:**\n", 272 | "- Experiment with different models/tokenizers.\n", 273 | "- Test with various datasets.\n", 274 | "- Adjust batch size to see performance differences.\n" 275 | ] 276 | } 277 | ], 278 | "metadata": { 279 | "language_info": { 280 | "name": "python" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 2 285 | } 286 | -------------------------------------------------------------------------------- /sections/5. 
Advanced Model Deployment Techniques/5.2_batching_and_dynamic_batching.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | from datasets import load_dataset 3 | import torch 4 | from tqdm.auto import tqdm 5 | 6 | model = AutoModelForCausalLM.from_pretrained("TheFuzzyScientist/diabloGPT_open-instruct").to("cuda") 7 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium", padding_side="left") 8 | tokenizer.pad_token = tokenizer.eos_token 9 | 10 | 11 | dataset = load_dataset("hakurei/open-instruct-v1", split="train") 12 | dataset = dataset.to_pandas() 13 | 14 | 15 | def generate_text(prompt): 16 | inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device) 17 | outputs = model.generate(inputs, max_length=64) 18 | generated = tokenizer.decode(outputs[0], skip_special_tokens=True) 19 | 20 | return generated[: generated.find(".") + 1] 21 | 22 | 23 | generate_text("What's the best way to cook chicken breast?") 24 | 25 | 26 | def batch_generate_texts(prompts): 27 | inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)["input_ids"] 28 | outputs = model.generate(inputs, max_length=64, pad_token_id=tokenizer.eos_token_id) 29 | generated = tokenizer.batch_decode(outputs, skip_special_tokens=True) 30 | 31 | return generated 32 | 33 | 34 | batch_generate_texts(dataset["instruction"][:1].tolist()) 35 | batch_generate_texts(dataset["instruction"][:20].tolist()) 36 | batch_generate_texts(dataset["instruction"][:100].tolist()) 37 | batch_generate_texts(dataset["instruction"][:200].tolist()) 38 | # batch_generate_texts(dataset["instruction"].sample(200).tolist()) # this might crash 39 | 40 | 41 | # Dynamic batching 42 | 43 | 44 | def batch_generate_tokens(tokens): 45 | outputs = model.generate(torch.stack(tokens), max_length=64, pad_token_id=tokenizer.eos_token_id) 46 | 47 | return tokenizer.batch_decode(outputs, skip_special_tokens=True) 48 | 49 | 50 | def dynamic_batching(prompts, max_tokens, is_pretokenized=False): 51 | if not is_pretokenized: 52 | tokenized_texts = tokenizer(prompts, return_tensors="pt", padding=True)["input_ids"].to(model.device) 53 | else: 54 | tokenized_texts = prompts 55 | 56 | current_batch = [] 57 | current_batch_size = 0 58 | 59 | for tokenized_text in tokenized_texts: 60 | if current_batch_size + len(tokenized_text) > max_tokens and current_batch: 61 | yield batch_generate_tokens(current_batch) 62 | 63 | current_batch, current_batch_size = [], 0 64 | 65 | current_batch.append(tokenized_text) 66 | current_batch_size += len(tokenized_text) 67 | 68 | # Process final batch 69 | if current_batch: 70 | yield batch_generate_tokens(current_batch) 71 | 72 | 73 | 74 | generator = dynamic_batching(dataset["instruction"][:40].tolist() * 1000, 3200) 75 | 76 | 77 | from contextlib import contextmanager 78 | import time 79 | 80 | 81 | @contextmanager 82 | def track_time(): 83 | start = time.time() # Record start time 84 | yield 85 | end = time.time() # Record end time 86 | print(f"Execution time: {end - start} seconds") 87 | 88 | 89 | with track_time(): 90 | for batch_predictions in tqdm(generator): 91 | continue 92 | 93 | 94 | def sort_batches(prompts, max_tokens): 95 | tokenized_texts = tokenizer(prompts, padding=False)["input_ids"] 96 | sorted_tokens = sorted(tokenized_texts, key=len) 97 | 98 | sorted_batches = {} 99 | for sorted_token in sorted_tokens: 100 | length = len(sorted_token) 101 | if length not in sorted_batches: 102 | 
sorted_batches[length] = [] 103 | 104 | sorted_batches[length].append(sorted_token) 105 | 106 | for length, sorted_batch in sorted_batches.items(): 107 | tensor_batch = torch.stack([torch.tensor(sorted_token) for sorted_token in sorted_batch]).to(model.device) 108 | for batch_prediction in dynamic_batching(tensor_batch, max_tokens=max_tokens, is_pretokenized=True): 109 | yield batch_prediction 110 | 111 | 112 | generator = sort_batches(dataset["instruction"][:40].tolist() * 1000, 3200) 113 | 114 | with track_time(): 115 | for batch_predictions in tqdm(generator): 116 | print(len(batch_predictions)) 117 | -------------------------------------------------------------------------------- /sections/5. Advanced Model Deployment Techniques/5.3_the_role_of_sorting_batches.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Advanced Text Generation Techniques with Transformers 🚀\n", 8 | "\n", 9 | "In this advanced lab, we dive deeper into efficient text generation techniques using Transformers. We'll explore two batching strategies: normal batching and sorted batching, to optimize our text generation tasks.\n", 10 | "\n", 11 | "**Objectives:**\n", 12 | "- 🧰 Implement advanced text generation functions.\n", 13 | "- 📊 Compare normal vs. sorted batching efficiency.\n", 14 | "- ⏱ Measure and understand execution time improvements.\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Setup and Imports 🛠\n", 22 | "\n", 23 | "Before diving into the code, let's ensure we have all the necessary tools:\n", 24 | "\n", 25 | "- `transformers` & `datasets`: For our model and data.\n", 26 | "- `torch`: For tensor operations.\n", 27 | "- `tqdm`: For progress tracking.\n", 28 | "- `contextlib` & `time`: For measuring execution time.\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 38 | "from datasets import load_dataset\n", 39 | "import torch\n", 40 | "from tqdm.auto import tqdm\n", 41 | "from contextlib import contextmanager\n", 42 | "import time\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Time Tracking Utility ⏱\n", 50 | "\n", 51 | "To compare the efficiency of our batching strategies, we'll use a context manager to track the execution time:\n", 52 | "\n", 53 | "- **Purpose:** Measure the time it takes to execute a block of code.\n", 54 | "- **Output:** Prints the execution time in seconds.\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "@contextmanager\n", 64 | "def track_time():\n", 65 | " start = time.time()\n", 66 | " yield\n", 67 | " end = time.time()\n", 68 | " print(f\"Execution time: {end - start:.2f}s\")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Model and Tokenizer Setup 🧩\n", 76 | " \n", 77 | "Setting up our model and tokenizer is crucial for text generation:\n", 78 | "\n", 79 | "- **Model:** \"TheFuzzyScientist/diabloGPT_open-instruct\" for instructive text generation.\n", 80 | "- **Tokenizer:** \"microsoft/DialoGPT-medium\" with padding adjusted.\n", 81 | "- **Device:** Utilize CUDA for GPU acceleration.\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 
86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "model = AutoModelForCausalLM.from_pretrained(\"TheFuzzyScientist/diabloGPT_open-instruct\").to(\"cuda\")\n", 91 | "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/DialoGPT-medium\", padding_side=\"left\")\n", 92 | "tokenizer.pad_token = tokenizer.eos_token" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Dataset Preparation and Initial Tokenization 📚\n", 100 | "\n", 101 | "We'll work with a sample dataset for text generation tasks:\n", 102 | "\n", 103 | "- **Dataset:** \"hakurei/open-instruct-v1\" converted to a pandas DataFrame.\n", 104 | "- **Initial Tokenization:** Convert a sample of prompts to input IDs.\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "dataset = load_dataset(\"hakurei/open-instruct-v1\", split=\"train\")\n", 114 | "dataset = dataset.to_pandas()\n", 115 | "\n", 116 | "prompts = dataset[\"instruction\"].sample(4).tolist()\n", 117 | "inputs = tokenizer(prompts, padding=True)[\"input_ids\"]\n", 118 | "\n", 119 | "# print('\\n\\n'.join(tokenizer.batch_decode(inputs)))\n", 120 | "print(\"\\n\\n\".join(tokenizer.batch_decode(inputs)).replace(tokenizer.eos_token, \"[PAD]\"))\n" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Normal Batching Method 🔄\n", 128 | "\n", 129 | "Normal batching processes prompts in fixed-size batches:\n", 130 | "\n", 131 | "- **Chunker Function:** Splits our data into specified batch sizes.\n", 132 | "- **Batch Generation:** Generates text for each batch of tokens.\n", 133 | "- **Predict Function:** Orchestrates the batching and generation process.\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "# Normal batching\n", 143 | "def chunker(seq, size):\n", 144 | " return (seq[pos : pos + size] for pos in range(0, len(seq), size))\n", 145 | "\n", 146 | "\n", 147 | "def batch_generate_tokens(tokens):\n", 148 | " outputs = model.generate(tokens, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)\n", 149 | "\n", 150 | " return tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", 151 | "\n", 152 | "\n", 153 | "def predict_batch(prompts, batch_size):\n", 154 | " inputs = tokenizer(prompts, return_tensors=\"pt\", padding=True, truncation=True, max_length=512)[\"input_ids\"]\n", 155 | "\n", 156 | " for batch in chunker(inputs, batch_size):\n", 157 | " yield batch_generate_tokens(batch.to(model.device))" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## Predicting with Normal Batching ⚡\n", 165 | "\n", 166 | "Let's generate text using the normal batching method:\n", 167 | "\n", 168 | "- **Process:** Tokenize prompts, generate text in batches, and track execution time.\n", 169 | "- **Observation:** Note the time it takes to process 3000 prompts.\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "prompts = dataset[\"instruction\"].sample(3000).tolist()\n", 179 | "\n", 180 | "with track_time():\n", 181 | " for batch_prediction in tqdm(predict_batch(prompts, 32)):\n", 182 | " print(len(batch_prediction))\n", 183 | " \n", 184 | "# Execution time: 137.19s" 185 | ] 186 | }, 187 | { 188 | 
"cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "## Sorted Batching Method 🔢\n", 192 | "\n", 193 | "Sorted batching aims to improve efficiency by grouping prompts of similar lengths:\n", 194 | "\n", 195 | "- **Strategy:** Sort prompts by length and batch accordingly.\n", 196 | "- **Benefits:** Reduces padding, potentially speeding up computation.\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "# Sorted Batching\n", 206 | "def predict_sorted_batches(prompts, max_batch_size):\n", 207 | " inputs = tokenizer(prompts, padding=False, truncation=True, max_length=512)[\"input_ids\"]\n", 208 | "\n", 209 | " sorted_tokens = sorted(inputs, key=len)\n", 210 | " sorted_batches = {}\n", 211 | " for sorted_input in sorted_tokens:\n", 212 | " if not len(sorted_input):\n", 213 | " continue\n", 214 | "\n", 215 | " length = len(sorted_input)\n", 216 | " if length not in sorted_batches:\n", 217 | " sorted_batches[length] = []\n", 218 | "\n", 219 | " sorted_batches[length].append(sorted_input)\n", 220 | "\n", 221 | " for length, sorted_batch in sorted_batches.items():\n", 222 | " for batch in chunker(sorted_batch, max_batch_size):\n", 223 | " tensor_batch = torch.tensor(batch).to(model.device)\n", 224 | " yield batch_generate_tokens(tensor_batch)\n" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Predicting with Sorted Batching 🚀\n", 232 | "\n", 233 | "Applying the sorted batching method:\n", 234 | "\n", 235 | "- **Execution:** Similar to normal batching but with sorted prompts.\n", 236 | "- **Comparison:** Observe the execution time difference from normal batching.\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "with track_time():\n", 246 | " for batch_prediction in tqdm(predict_sorted_batches(prompts, 32)):\n", 247 | " print(len(batch_prediction))\n", 248 | "\n", 249 | "# Execution time: 72.74s" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Conclusion and Next Steps 🌈\n", 257 | "\n", 258 | "Through this lab, we've explored advanced batching techniques for text generation with Transformers. We saw firsthand how sorted batching can significantly reduce execution time compared to normal batching.\n", 259 | "\n", 260 | "**Encouraged Next Steps:**\n", 261 | "- 🤖 Experiment with different models and datasets.\n", 262 | "- 📐 Adjust batch sizes and observe the impact on performance.\n", 263 | "- 🔄 Explore other optimization techniques for text generation.\n" 264 | ] 265 | } 266 | ], 267 | "metadata": { 268 | "language_info": { 269 | "name": "python" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 2 274 | } 275 | -------------------------------------------------------------------------------- /sections/5. 
Advanced Model Deployment Techniques/5.3_the_role_of_sorting_batches.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | from datasets import load_dataset 3 | import torch 4 | from tqdm.auto import tqdm 5 | from contextlib import contextmanager 6 | import time 7 | 8 | 9 | @contextmanager 10 | def track_time(): 11 | start = time.time() 12 | yield 13 | end = time.time() 14 | print(f"Execution time: {end - start:.2f}s") 15 | 16 | 17 | model = AutoModelForCausalLM.from_pretrained("TheFuzzyScientist/diabloGPT_open-instruct").to("cuda") 18 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium", padding_side="left") 19 | tokenizer.pad_token = tokenizer.eos_token 20 | 21 | 22 | dataset = load_dataset("hakurei/open-instruct-v1", split="train") 23 | dataset = dataset.to_pandas() 24 | 25 | prompts = dataset["instruction"].sample(4).tolist() 26 | inputs = tokenizer(prompts, padding=True)["input_ids"] 27 | 28 | 29 | # print('\n\n'.join(tokenizer.batch_decode(inputs))) 30 | print("\n\n".join(tokenizer.batch_decode(inputs)).replace(tokenizer.eos_token, "[PAD]")) 31 | 32 | 33 | # Normal batching 34 | def chunker(seq, size): 35 | return (seq[pos : pos + size] for pos in range(0, len(seq), size)) 36 | 37 | 38 | def batch_generate_tokens(tokens): 39 | outputs = model.generate(tokens, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id) 40 | 41 | return tokenizer.batch_decode(outputs, skip_special_tokens=True) 42 | 43 | 44 | def predict_batch(prompts, batch_size): 45 | inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)["input_ids"] 46 | 47 | for batch in chunker(inputs, batch_size): 48 | yield batch_generate_tokens(batch.to(model.device)) 49 | 50 | 51 | prompts = dataset["instruction"].sample(3000).tolist() 52 | 53 | with track_time(): 54 | for batch_prediction in tqdm(predict_batch(prompts, 32)): 55 | print(len(batch_prediction)) 56 | # Execution time: 137.19s 57 | 58 | 59 | # Sorted Batching 60 | def predict_sorted_batches(prompts, max_batch_size): 61 | inputs = tokenizer(prompts, padding=False, truncation=True, max_length=512)["input_ids"] 62 | 63 | sorted_tokens = sorted(inputs, key=len) 64 | sorted_batches = {} 65 | for sorted_input in sorted_tokens: 66 | if not len(sorted_input): 67 | continue 68 | 69 | length = len(sorted_input) 70 | if length not in sorted_batches: 71 | sorted_batches[length] = [] 72 | 73 | sorted_batches[length].append(sorted_input) 74 | 75 | for length, sorted_batch in sorted_batches.items(): 76 | for batch in chunker(sorted_batch, max_batch_size): 77 | tensor_batch = torch.tensor(batch).to(model.device) 78 | yield batch_generate_tokens(tensor_batch) 79 | 80 | 81 | with track_time(): 82 | for batch_prediction in tqdm(predict_sorted_batches(prompts, 32)): 83 | print(len(batch_prediction)) 84 | 85 | # Execution time: 72.74s 86 | -------------------------------------------------------------------------------- /sections/5. Advanced Model Deployment Techniques/5.4_understanding_quantization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model Quantization for Efficient Text Generation 🚀\n", 8 | "\n", 9 | "In this lab, we'll explore model quantization using ctranslate2 and its impact on text generation efficiency. 
Quantization reduces model size and speeds up inference, crucial for deploying models in resource-constrained environments.\n", 10 | "\n", 11 | "**Objectives:**\n", 12 | "- 📦 Understand the basics of model quantization.\n", 13 | "- ⚖️ Quantize a pre-trained model for efficient text generation.\n", 14 | "- ⏱ Compare execution times before and after quantization.\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Setup and Imports 🛠\n", 22 | "\n", 23 | "First, let's get our workspace ready with all the necessary tools:\n", 24 | "\n", 25 | "- `ctranslate2`: For model conversion and quantization.\n", 26 | "- `transformers` & `datasets`: For our model, tokenizer, and data.\n", 27 | "- `torch`: For tensor operations.\n", 28 | "- `tqdm`: Visual progress indication.\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# !pip install ctranslate2\n", 38 | "from transformers import AutoModelForCausalLM, AutoTokenizer\n", 39 | "from datasets import load_dataset\n", 40 | "import torch\n", 41 | "from tqdm.auto import tqdm\n", 42 | "from ctranslate2.converters import TransformersConverter\n", 43 | "from ctranslate2 import Generator\n", 44 | "\n", 45 | "from contextlib import contextmanager\n", 46 | "import time\n", 47 | "\n", 48 | "@contextmanager\n", 49 | "def track_time():\n", 50 | " start = time.time() # Record start time\n", 51 | " yield\n", 52 | " end = time.time() # Record end time\n", 53 | " print(f\"Execution time: {end - start} seconds\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Model and Tokenizer Setup 🧩\n", 61 | "\n", 62 | "Before quantization, we need to load and prepare our model and tokenizer:\n", 63 | "\n", 64 | "- **Model:** \"TheFuzzyScientist/diabloGPT_open-instruct\" for instructive text generation.\n", 65 | "- **Tokenizer:** Adjusted for our model's needs.\n", 66 | "- **Device:** Using CUDA for GPU acceleration.\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "model = AutoModelForCausalLM.from_pretrained(\"TheFuzzyScientist/diabloGPT_open-instruct\").to(\"cuda\")\n", 76 | "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/DialoGPT-medium\", padding_side=\"left\")\n", 77 | "tokenizer.pad_token = tokenizer.eos_token\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## Model Quantization ⚖️\n", 85 | "\n", 86 | "Quantizing our model to reduce its size and improve inference speed:\n", 87 | "\n", 88 | "- **Conversion & Quantization:** Using `TransformersConverter` for ctranslate2 format conversion with float16 quantization.\n", 89 | "- **Output:** Quantized model ready for efficient text generation.\n" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# Convert the model to CTranslate2\n", 99 | "model.save_pretrained(\"models/gpt-instruct\")\n", 100 | "tokenizer.save_pretrained(\"models/gpt-instruct\")\n", 101 | "\n", 102 | "converter = TransformersConverter(\"models/gpt-instruct\")\n", 103 | "out_path = converter.convert(output_dir=\"models/gpt-instruct-quant\", quantization=\"float16\")\n", 104 | "\n", 105 | "generator = Generator(\"models/gpt-instruct-quant\", device=\"cuda\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": 
{}, 111 | "source": [ 112 | "## Dataset Preparation 📚\n", 113 | "\n", 114 | "Loading and preparing a dataset for our text generation tasks:\n", 115 | "\n", 116 | "- **Dataset:** \"hakurei/open-instruct-v1\", a rich source for instructive prompts.\n", 117 | "- **Sampling:** Selecting 3000 random samples for our experiments.\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "dataset = load_dataset(\"hakurei/open-instruct-v1\", split=\"train\")\n", 127 | "dataset = dataset.to_pandas()\n", 128 | "\n", 129 | "prompts = dataset[\"instruction\"].sample(3000, random_state=42).tolist()\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Normal Batching Method 🔄\n", 137 | "\n", 138 | "Using the original model, we'll generate text in batches to establish a baseline for performance:\n", 139 | "\n", 140 | "- **Chunker:** Splits prompts into manageable batch sizes.\n", 141 | "- **Batch Generation:** Generates text for each batch.\n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "\n", 151 | "# Normal batching\n", 152 | "def chunker(seq, size):\n", 153 | " return (seq[pos : pos + size] for pos in range(0, len(seq), size))\n", 154 | "\n", 155 | "\n", 156 | "def batch_generate_tokens(tokens):\n", 157 | " outputs = model.generate(tokens, max_length=256, pad_token_id=tokenizer.eos_token_id, num_beams=2, repetition_penalty=1.5)\n", 158 | "\n", 159 | " return tokenizer.batch_decode(outputs, skip_special_tokens=True)\n", 160 | "\n", 161 | "\n", 162 | "def predict_batch(prompts, batch_size):\n", 163 | " inputs = tokenizer(prompts, return_tensors=\"pt\", padding=True, truncation=True, max_length=128)[\"input_ids\"]\n", 164 | "\n", 165 | " for batch in chunker(inputs, batch_size):\n", 166 | " yield batch_generate_tokens(batch.to(model.device))\n", 167 | "\n", 168 | "\n", 169 | "with track_time():\n", 170 | " for batch_prediction in tqdm(predict_batch(prompts, 32)):\n", 171 | " continue\n", 172 | "\n", 173 | "# Execution time: 242.11289978027344 seconds" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Quantized Model Batching 🎯\n", 181 | "\n", 182 | "Switching to our quantized model for more efficient text generation:\n", 183 | "\n", 184 | "- **CTRANS Tokenization:** Adjusting tokenization for ctranslate2 input.\n", 185 | "- **Batch Generation:** Utilizing the quantized model.\n" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "# CTranslate2 batching with quantized model\n", 195 | "def batch_generate_ctrans(prompts, batch_size):\n", 196 | " inputs = [tokenizer.tokenize(prompt, truncation=True, max_length=128) for prompt in prompts]\n", 197 | "\n", 198 | " results = generator.generate_batch(inputs, max_length=256, max_batch_size=batch_size, beam_size=2, repetition_penalty=1.5)\n", 199 | "\n", 200 | " result_ids = [res.sequences_ids[0] for res in results]\n", 201 | " return tokenizer.batch_decode(result_ids, skip_special_tokens=True)\n", 202 | "\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Predicting with Quantized Model 🚀\n", 210 | "\n", 211 | "Finally, let's see the performance improvement with our quantized model:\n", 212 | "\n", 213 | 
"- **Execution:** Generate text with the quantized model.\n", 214 | "- **Comparison:** Observe the reduction in execution time versus the unquantized model.\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "del model\n", 224 | "torch.cuda.empty_cache()\n", 225 | "with track_time():\n", 226 | " batch_generate_ctrans(prompts, 32)\n", 227 | "\n", 228 | "# Execution time: 150.97192573547363 seconds\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "# Conclusion and Next Steps 🌈\n", 236 | "\n", 237 | "We've successfully quantized a text generation model and demonstrated significant improvements in efficiency. This showcases the power of model quantization for deploying NLP models in production.\n", 238 | "\n", 239 | "**Encouraged Next Steps:**\n", 240 | "- 🤖 Try quantizing different models.\n", 241 | "- 📊 Compare quantization effects on various model sizes.\n", 242 | "- 🔍 Explore further optimizations for deployment.\n" 243 | ] 244 | } 245 | ], 246 | "metadata": { 247 | "language_info": { 248 | "name": "python" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 2 253 | } 254 | -------------------------------------------------------------------------------- /sections/5. Advanced Model Deployment Techniques/5.4_understanding_quantization.py: -------------------------------------------------------------------------------- 1 | # !pip install ctranslate2 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from datasets import load_dataset 4 | import torch 5 | from tqdm.auto import tqdm 6 | from ctranslate2.converters import TransformersConverter 7 | from ctranslate2 import Generator 8 | 9 | from contextlib import contextmanager 10 | import time 11 | 12 | 13 | @contextmanager 14 | def track_time(): 15 | start = time.time() # Record start time 16 | yield 17 | end = time.time() # Record end time 18 | print(f"Execution time: {end - start} seconds") 19 | 20 | 21 | model = AutoModelForCausalLM.from_pretrained("TheFuzzyScientist/diabloGPT_open-instruct").to("cuda") 22 | tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium", padding_side="left") 23 | tokenizer.pad_token = tokenizer.eos_token 24 | 25 | # Convert the model to CTranslate2 26 | model.save_pretrained("models/gpt-instruct") 27 | tokenizer.save_pretrained("models/gpt-instruct") 28 | 29 | converter = TransformersConverter("models/gpt-instruct") 30 | out_path = converter.convert(output_dir="models/gpt-instruct-quant", quantization="float16") 31 | 32 | generator = Generator("models/gpt-instruct-quant", device="cuda") 33 | 34 | # Dataset 35 | dataset = load_dataset("hakurei/open-instruct-v1", split="train") 36 | dataset = dataset.to_pandas() 37 | 38 | prompts = dataset["instruction"].sample(3000, random_state=42).tolist() 39 | 40 | 41 | # Normal batching 42 | def chunker(seq, size): 43 | return (seq[pos : pos + size] for pos in range(0, len(seq), size)) 44 | 45 | 46 | def batch_generate_tokens(tokens): 47 | outputs = model.generate(tokens, max_length=256, pad_token_id=tokenizer.eos_token_id, num_beams=2, repetition_penalty=1.5) 48 | 49 | return tokenizer.batch_decode(outputs, skip_special_tokens=True) 50 | 51 | 52 | def predict_batch(prompts, batch_size): 53 | inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=128)["input_ids"] 54 | 55 | for batch in chunker(inputs, batch_size): 56 | yield 
batch_generate_tokens(batch.to(model.device)) 57 | 58 | 59 | with track_time(): 60 | for batch_prediction in tqdm(predict_batch(prompts, 32)): 61 | continue 62 | 63 | # Execution time: 242.11289978027344 seconds 64 | 65 | 66 | # CTranslate2 batching with quantized model 67 | def batch_generate_ctrans(prompts, batch_size): 68 | inputs = [tokenizer.tokenize(prompt, truncation=True, max_length=128) for prompt in prompts] 69 | 70 | results = generator.generate_batch(inputs, max_length=256, max_batch_size=batch_size, beam_size=2, repetition_penalty=1.5) 71 | 72 | result_ids = [res.sequences_ids[0] for res in results] 73 | return tokenizer.batch_decode(result_ids, skip_special_tokens=True) 74 | 75 | 76 | del model 77 | torch.cuda.empty_cache() 78 | with track_time(): 79 | batch_generate_ctrans(prompts, 32) 80 | 81 | # Execution time: 150.97192573547363 seconds 82 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level1.py: -------------------------------------------------------------------------------- 1 | # pip install transformers==4.38.1 2 | from transformers import pipeline 3 | from .utils import track_time 4 | 5 | pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="cuda") 6 | 7 | # We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating 8 | messages = [ 9 | { 10 | "role": "system", 11 | "content": "You are a friendly chatbot who is always helpful.", 12 | }, 13 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 14 | ] 15 | 16 | prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 17 | prompts = [prompt] * 5 18 | 19 | with track_time(prompts): 20 | outputs = pipe(prompts, max_new_tokens=256, do_sample=True, temperature=0.1, top_k=50, top_p=0.95) 21 | 22 | print(outputs[0][0]["generated_text"]) 23 | 24 | ## cpu 25 | ## latency: 48 s 26 | 27 | ## gpu 28 | ## latency: 2.9073479175567627s 29 | ## throughput: 0.34 inputs/s 30 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level2.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | from .utils import track_time 3 | 4 | 5 | model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="cuda") 6 | tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") 7 | messages = [ 8 | { 9 | "role": "system", 10 | "content": "You are a friendly chatbot who is always helpful.", 11 | }, 12 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 13 | ] 14 | 15 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False) 16 | input_ids = tokenizer([prompt] * 256, return_tensors="pt").to("cuda") 17 | 18 | with track_time(input_ids["input_ids"]): 19 | outputs = model.generate(**input_ids, max_length=256, do_sample=True, temperature=0.1, top_k=50, top_p=0.95) 20 | 21 | print(tokenizer.decode(outputs[0], skip_special_tokens=True)) 22 | 23 | # latency: 2.6578898429870605s 24 | # throughput: 14.10 inputs/s 25 | -------------------------------------------------------------------------------- /sections/7. 
Scheduling and Running Jobs on a Cluster/level3.py: -------------------------------------------------------------------------------- 1 | # pip install ctranslate2==4.0.0 2 | from .utils import track_time 3 | from transformers import AutoTokenizer 4 | from ctranslate2 import Generator 5 | 6 | tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") 7 | model = Generator("models/TinyLlama-1.1B-Chat-v1.0-ctrans", device="cuda") 8 | 9 | messages = [ 10 | { 11 | "role": "system", 12 | "content": "You are a friendly chatbot who is always helpful.", 13 | }, 14 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 15 | ] 16 | 17 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False) 18 | 19 | input_tokens = [tokenizer.tokenize(prompt)] * 256 20 | 21 | with track_time(input_tokens): 22 | outputs = model.generate_batch(input_tokens) 23 | 24 | results_ids = [res.sequences_ids[0] for res in outputs] 25 | outputs = tokenizer.batch_decode(results_ids, skip_special_tokens=True) 26 | 27 | print(outputs[100]) 28 | 29 | # latency: 1.3013768196105957s 30 | # throughput: 31.77 inputs/s 31 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level4.py: -------------------------------------------------------------------------------- 1 | # !pip install vllm==0.3.3 2 | from .utils import track_time 3 | from vllm import LLM, SamplingParams 4 | 5 | llm = LLM(model="models/TinyLlama-1.1B-Chat-v1.0") 6 | tokenizer = llm.get_tokenizer() 7 | 8 | 9 | messages = [ 10 | { 11 | "role": "system", 12 | "content": "You are a friendly chatbot who is always helpful.", 13 | }, 14 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 15 | ] 16 | 17 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False) 18 | 19 | 20 | sampling = SamplingParams(max_tokens=256, seed=42, temperature=0) 21 | 22 | prompts = [prompt] * 1024 23 | 24 | with track_time(prompts): 25 | outputs = llm.generate(prompts, sampling) 26 | 27 | results = [output.outputs[0].text for output in outputs] 28 | 29 | print(results[1000]) 30 | 31 | # latency: 0.7040367126464844s 32 | # throughput: 68.22 inputs/s 33 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level_5/consume_results.py: -------------------------------------------------------------------------------- 1 | from src.level_five.rabbit import RabbitBuffer 2 | 3 | buffer = RabbitBuffer("llama-results") 4 | 5 | results = buffer.consume(10_000) 6 | len(results) 7 | 8 | print(results[9000].decode()) 9 | -------------------------------------------------------------------------------- /sections/7. 
Scheduling and Running Jobs on a Cluster/level_5/produce_prompts.py: -------------------------------------------------------------------------------- 1 | from src.level_five.rabbit import RabbitBuffer 2 | from transformers import AutoTokenizer 3 | 4 | messages = [ 5 | { 6 | "role": "system", 7 | "content": "You are a friendly chatbot who is always helpful.", 8 | }, 9 | {"role": "user", "content": "How can I get rid of a llama on my lawn?"}, 10 | ] 11 | tokenizer = AutoTokenizer.from_pretrained("models/TinyLlama-1.1B-Chat-v1.0") 12 | 13 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_special_tokens=False) 14 | 15 | 16 | buffer = RabbitBuffer("llama-queue") 17 | buffer.produce([prompt] * 100_000) 18 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/level_5/rabbit.py: -------------------------------------------------------------------------------- 1 | # pip install pika 2 | import pika 3 | 4 | RABBIT_USER = "rabbitmq_user" 5 | RABBIT_PASS = "pass" 6 | 7 | HEAD_IP = "your_head_node_ip" 8 | RABBIT_PORT = 5672 9 | 10 | 11 | class RabbitBuffer: 12 | def __init__(self, queue_name: str) -> None: 13 | self.queue_name = queue_name 14 | 15 | self.credentials = pika.PlainCredentials(RABBIT_USER, RABBIT_PASS) 16 | 17 | self.connection = pika.BlockingConnection(pika.ConnectionParameters(HEAD_IP, RABBIT_PORT, "/", self.credentials)) 18 | 19 | self.channel = self.connection.channel() 20 | self.queue = self.channel.queue_declare(queue=self.queue_name, durable=True) 21 | 22 | def produce(self, messages: list[str]): 23 | for message in messages: 24 | self.channel.basic_publish( 25 | exchange="", 26 | routing_key=self.queue_name, 27 | body=message, 28 | properties=pika.BasicProperties(delivery_mode=2), # make messages persistent 29 | ) 30 | 31 | def consume(self, num_messages: int): 32 | messages = [] 33 | for _ in range(num_messages): 34 | method_frame, header_frame, body = self.channel.basic_get(queue=self.queue_name) 35 | if method_frame: 36 | messages.append(body) 37 | self.channel.basic_ack(method_frame.delivery_tag) 38 | return messages 39 | -------------------------------------------------------------------------------- /sections/7. 
Scheduling and Running Jobs on a Cluster/level_5/ray_batch_job.py: -------------------------------------------------------------------------------- 1 | # pip install -U "ray[default]" 2 | import ray 3 | from rabbit import RabbitBuffer 4 | 5 | ray.init(address="auto") 6 | from vllm import LLM, SamplingParams 7 | 8 | 9 | @ray.remote 10 | def predict_batch(): 11 | buffer = RabbitBuffer("llama-queue") 12 | 13 | messages = buffer.consume(5000) 14 | prompts = [m.decode() for m in messages] 15 | 16 | sampling = SamplingParams(max_tokens=256, seed=42, temperature=0) 17 | llm = LLM(model="/root/ml-deployment/models/TinyLlama-1.1B-Chat-v1.0") 18 | 19 | outputs = llm.generate(prompts, sampling) 20 | 21 | results = [output.outputs[0].text for output in outputs] 22 | 23 | result_buffer = RabbitBuffer("llama-results") 24 | result_buffer.produce(results) 25 | 26 | return results 27 | 28 | 29 | if __name__ == "__main__": 30 | future = predict_batch.options(num_gpus=1, num_cpus=1).remote() 31 | ray.get(future) 32 | ray.shutdown() 33 | 34 | # submit command 35 | # ray job submit --submission-id llamma-batch1 --working-dir src/level_five/ -- python ray_batch_job.py 36 | 37 | # throughput: 111 inputs/s 38 | -------------------------------------------------------------------------------- /sections/7. Scheduling and Running Jobs on a Cluster/utils.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import time 3 | 4 | @contextmanager 5 | def track_time(inputs: list = None): 6 | start = time.time() # Record the start time 7 | yield # Pass control back to the context block 8 | duration = time.time() - start # Calculate the duration 9 | 10 | if inputs is None: 11 | print(f"Execution time: {duration:.2f} seconds") 12 | else: 13 | print(f"Took {duration:.2f} seconds to process {len(inputs)} inputs") --------------------------------------------------------------------------------
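Note on local model artifacts: level3.py, level4.py, and the level_5 scripts load TinyLlama from local paths ("models/TinyLlama-1.1B-Chat-v1.0" and "models/TinyLlama-1.1B-Chat-v1.0-ctrans") that no script in this listing creates. The sketch below shows one way those directories could be produced, reusing the save_pretrained / TransformersConverter pattern from 5.4_understanding_quantization.py. It is not part of the original repository; the helper file name and the float16 quantization choice are assumptions.

# prepare_tinyllama.py (hypothetical helper, not in the original repo)
# Downloads TinyLlama, saves it locally for level4.py / level_5, and converts it
# to CTranslate2 with float16 quantization for level3.py.
from transformers import AutoModelForCausalLM, AutoTokenizer
from ctranslate2.converters import TransformersConverter

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LOCAL_DIR = "models/TinyLlama-1.1B-Chat-v1.0"        # path expected by level4.py and level_5 scripts
QUANT_DIR = "models/TinyLlama-1.1B-Chat-v1.0-ctrans" # path expected by level3.py

# Save the Hugging Face model and tokenizer to a local directory
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model.save_pretrained(LOCAL_DIR)
tokenizer.save_pretrained(LOCAL_DIR)

# Convert the saved model to CTranslate2 format with float16 weights,
# mirroring the conversion in 5.4_understanding_quantization.py
converter = TransformersConverter(LOCAL_DIR)
converter.convert(output_dir=QUANT_DIR, quantization="float16")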