├── .gitignore ├── LICENSE ├── README.md └── online-prediction ├── 01-build-custom-inference-images.ipynb ├── 02-upload-register-and-deploy-models.ipynb ├── 03-from-hub-to-vertex-ai.ipynb ├── 04-from-hub-to-vertex-ai-gpu.ipynb ├── Dockerfile ├── README.md ├── assets ├── docker-artifact-registry.png ├── endpoint-deployment-vertex-ai.png ├── model-cloud-storage.png └── model-registry-vertex-ai.png ├── huggingface_predictor ├── predictor.py └── requirements.txt └── huggingface_predictor_gpu ├── predictor.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # .DS_Store files
163 | .DS_Store
164 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Alvaro Bartolome
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Vertex AI 🤝🏻 HuggingFace
2 |
3 | Collection of examples on how to train, deploy and monitor 🤗 HuggingFace models in Google Cloud Vertex AI.
4 |
5 | ## Installation
6 |
7 | * `gcloud` CLI needs to be installed and logged into the project that will be used. See the installation notes at https://cloud.google.com/sdk/docs/install
8 |
9 | * `docker` needs to be installed locally, and up and running, since it will be used to build the CPR images before pushing them to the container registry. See the installation notes at https://docs.docker.com/engine/install/
11 | * `google-cloud-aiplatform` Python library is required to programmatically build the CPR image, to define the custom prediction code via a custom `Predictor`, to run the online/batch predictions, etc.
12 |
13 | `pip install google-cloud-aiplatform --upgrade`
14 |
15 | * `git lfs` needs to be installed for pulling / cloning models from the HuggingFace Hub. See the installation notes at https://git-lfs.com/.
16 |
17 | ## Contents
18 |
19 | * [`online-prediction/`](./online-prediction): contains some notebooks to explain the different steps to upload, register and deploy any model from the HuggingFace Hub in Vertex AI, covering both CPU-only and GPU accelerated inference.
20 |
21 | ## What's next?
22 |
23 | This collection will be updated iteratively, and here are some of the things that are currently under development and will be published soon:
24 |
25 | * [ ] `pipelines/`: contains some notebooks to explain how to build custom pipelines, focusing on fine-tuning of HuggingFace models (which could later be deployed within Vertex AI)
26 | * [ ] `batch-predictions/`: contains some notebooks to explain the concept of batch predictions, comparing those to online predictions, and also providing some practical use cases
27 | * [ ] `real-time-inference/sentence-transformers/`: contains some notebooks explaining how to upload, register and deploy `SentenceTransformers` models in Vertex AI
28 |
29 | Additionally, a Python package is currently being developed to remove the hassle of using `google-cloud-aiplatform` with HuggingFace models, providing more extensibility towards HuggingFace's use cases. The interface looks similar to the one defined in `sagemaker`, AWS SageMaker's Python SDK; check it at [`sagemaker.huggingface.HuggingFaceModel`](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/huggingface/model.py).
30 |
--------------------------------------------------------------------------------
/online-prediction/01-build-custom-inference-images.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "db1210e7-a6dc-4aaa-ad05-71a4021e5bef",
6 | "metadata": {},
7 | "source": [
8 | "# Build images for custom inference for 🤗 HuggingFace models"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "49d2dc2a-b14d-4d7f-a2b2-08de387137c4",
14 | "metadata": {},
15 | "source": [
16 | "In order to deploy HuggingFace models in Vertex AI, we will need to make use of Custom Prediction Routines (CPR), which are Docker images built with custom inference code and requirements (among other optional components).\n",
17 | "\n",
18 | "These images need to be built for HuggingFace's libraries, i.e. `transformers`, `diffusers` and more, since there are no official Docker images for HuggingFace yet, while they are available for `sklearn`, `AutoML` and `XGBoost` models.\n",
19 | "\n",
20 | "In this tutorial we will show how to use the Python library `google-cloud-aiplatform` to build and register new CPRs in Google's Container Registry, for later inference usage within Vertex AI."
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "id": "c59b3400-2a4e-4a4c-a2b2-6038a8c8d13f",
26 | "metadata": {},
27 | "source": [
28 | "## Installation"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "id": "7e0d124f-3484-4961-b3a6-c6e8555f3665",
34 | "metadata": {},
35 | "source": [
36 | "* `gcloud` CLI needs to be installed and logged into the project that will be used to push the Docker images.
See the installation notes at https://cloud.google.com/sdk/docs/install\n",
37 | "\n",
38 | "* `docker` needs to be installed locally, and up and running, since it will be used to build the CPR images before pushing them to the container registry. See the installation notes at https://docs.docker.com/engine/install/\n",
39 | "\n",
40 | "* `google-cloud-aiplatform` Python library is required to programmatically build the CPR image and to define the custom prediction code via a custom `Predictor`.\n",
41 | "\n",
42 | " `pip install google-cloud-aiplatform --upgrade`"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "id": "66d9d0f2-70aa-4c9c-a09c-923399a00f29",
48 | "metadata": {},
49 | "source": [
50 | "## Setup"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "id": "dbf982f7-305e-4951-abc2-439accea80c4",
56 | "metadata": {},
57 | "source": [
58 | "To successfully run the code below, you will need to be authenticated into your Google Cloud account and the following variable values must be set in advance:\n",
59 | "\n",
60 | "* `REGION` is the region where the resources will be hosted.\n",
61 | "* `PROJECT_ID` is the identifier of the project in Google Cloud.\n",
62 | "* `REPOSITORY` is the Artifact Registry repository where the Docker images will be uploaded.\n",
63 | "* `IMAGE` is the name of the Docker image (without tag).\n",
64 | "* `TAG` is the tag of the Docker image."
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "id": "c22d929a-aa2e-464a-904d-30d349340591",
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "REGION = \"europe-west9\"\n",
75 | "PROJECT_ID = \"huggingface-cloud\"\n",
76 | "REPOSITORY = \"custom-inference\"\n",
77 | "IMAGE = \"huggingface-pipeline\"\n",
78 | "TAG = \"py310-cpu-torch-2.2.0-transformers-4.37.2\""
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "id": "d42b1686-95ca-4cd2-8f76-b1d53f106404",
84 | "metadata": {},
85 | "source": [
86 | "---"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "id": "5a93acb3-fd28-4f81-a3d9-36492029c712",
92 | "metadata": {},
93 | "source": [
94 | "_**Note**: the following is a CPU-only version for custom inference; to use GPU for inference, the `device_map` should be defined (ideally with `auto` as the value), `accelerate` should be added to `requirements.txt`, and a CUDA image should be used when building the CPR via `build_cpr_model`._\n",
95 | "\n",
96 | "Check the end-to-end notebook suitable for GPU inference at [`04-from-hub-to-vertex-ai-gpu.ipynb`](./04-from-hub-to-vertex-ai-gpu.ipynb)."
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "id": "50f374e4-8d20-46bc-aade-a27ecc3b3b1a",
102 | "metadata": {},
103 | "source": [
104 | "## Custom inference code"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "id": "aaec09db-920b-4aac-a6da-4bcd058de8e8",
110 | "metadata": {},
111 | "source": [
112 | "In order to successfully run the inference over a HuggingFace model, we will need to define a custom class inheriting from the `Predictor` class from `google-cloud-aiplatform`.\n",
113 | "\n",
114 | "To run the inference we will use the `pipeline` method from 🤗 `transformers`, which will be loaded as part of the `Predictor.load` method, controlled by the environment variable `HF_TASK`; then the `pipeline` will run within the `predict` method and will generate the output as a Python dict.\n",
115 | "\n",
116 | "Alternatively, we could implement the inference code ourselves rather than relying on `pipeline`, if our model has needs that `pipeline` cannot cover. That said, the `pipeline` controlled via the `HF_TASK` environment variable keeps the image flexible, since we can build one image and make it work for different models and tasks (as long as those don't have extra requirements), whereas images with very model-specific code will only perform well under certain scenarios."
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "id": "5240dffb-c281-4741-aa54-f9bb67b9d6c2",
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "!mkdir huggingface_predictor"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "id": "a237d6ff-4f97-4f8e-a2ed-63e3c162afd3",
132 | "metadata": {},
133 | "source": [
134 | "_Note that we will use the magic command `%%writefile` to write the Python code of the `HuggingFacePredictor` into the `predictor.py` file, which won't import the code within the Jupyter Notebook, so we will need to load it afterwards using `%load`._"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "26ec7465-be6b-492a-bcde-5df91eb10109",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "%%writefile huggingface_predictor/predictor.py\n",
145 | "import os\n",
146 | "import logging\n",
147 | "import tarfile\n",
148 | "from typing import Any, Dict\n",
149 | "\n",
150 | "from transformers import pipeline\n",
151 | "\n",
152 | "from google.cloud.aiplatform.prediction.predictor import Predictor\n",
153 | "from google.cloud.aiplatform.utils import prediction_utils\n",
154 | "\n",
155 | "logger = logging.getLogger(__name__)\n",
156 | "logger.setLevel(logging.DEBUG)\n",
157 | "\n",
158 | "\n",
159 | "class HuggingFacePredictor(Predictor):\n",
160 | "    def __init__(self) -> None:\n",
161 | "        pass\n",
162 | "\n",
163 | "    def load(self, artifacts_uri: str) -> None:\n",
164 | "        \"\"\"Loads the preprocessor and model artifacts.\"\"\"\n",
165 | "        logger.debug(f\"Downloading artifacts from {artifacts_uri}\")\n",
166 | "        prediction_utils.download_model_artifacts(artifacts_uri)\n",
167 | "        logger.debug(\"Artifacts successfully downloaded!\")\n",
168 | "        os.makedirs(\"./model\", exist_ok=True)\n",
169 | "        with tarfile.open(\"model.tar.gz\", \"r:gz\") as tar:\n",
170 | "            tar.extractall(path=\"./model\")\n",
171 | "        logger.debug(f\"HF_TASK value is {os.getenv('HF_TASK')}\")\n",
172 | "        self._pipeline = pipeline(os.getenv(\"HF_TASK\", None), model=\"./model\")\n",
173 | "        logger.debug(\"`pipeline` successfully loaded!\")\n",
174 | "\n",
175 | "    def predict(self, instances: Dict[str, Any]) ->
Dict[str, Any]:\n",
176 | "        return self._pipeline(**instances)"
177 | ]
178 | },
179 | {
180 | "cell_type": "markdown",
181 | "id": "0e1e588b-7319-481a-9576-dee56c18474b",
182 | "metadata": {},
183 | "source": [
184 | "Besides the `Predictor`, we can also add a `requirements.txt` file containing the requirements needed to run the code snippet above, which will be installed as part of the Custom Prediction Routine (CPR) build that produces a Vertex AI compatible Docker image automatically including both `predictor.py` and `requirements.txt`."
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "id": "52e9a5da-25fc-43cf-988e-da5280c70779",
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "%%writefile huggingface_predictor/requirements.txt\n",
195 | "torch==2.2.0\n",
196 | "transformers==4.37.2"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "id": "6eed9207-d092-43ae-845d-c4eb0cfadcab",
202 | "metadata": {},
203 | "source": [
204 | "## Build Docker image"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "id": "beca09fa-534f-4734-a69f-c209c8d9951a",
210 | "metadata": {},
211 | "source": [
212 | "Before building the image we will need to create the Docker repository in Google Artifact Registry; otherwise the process will fail, not when building the image, but when trying to push it, meaning that we would need to re-run the build. So make sure that the repository exists in advance, or create it with `gcloud` as follows:"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "id": "200d86d1-a0b2-41c7-861c-24454935c9ac",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "!gcloud artifacts repositories create custom-inference --repository-format=docker --location={REGION}"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "id": "bdb1ead9-6f40-4e28-8b47-21015b500a8b",
228 | "metadata": {},
229 | "source": [
230 | "Once we've ensured that the Docker repository exists and that we've packaged the code into the `huggingface_predictor` directory (containing both `predictor.py` and `requirements.txt`), we can build the Custom Prediction Routine (CPR), which will create and build the Docker image for the Google Container Registry.\n",
231 | "\n",
232 | "In this case we need to define the following args:\n",
233 | "* `src_dir` is the path to the local directory including the required files (it will be copied into the image)\n",
234 | "* `output_image_uri` is the URI where the Docker image will be pushed to in Google Cloud\n",
235 | "* `predictor` is the class that inherits from `Predictor`, i.e.
`HuggingFacePredictor`\n",
236 | "* `requirements_path` is the path to the `requirements.txt` file\n",
237 | "* `base_image` is the base image that will be defined within the Docker image"
238 | ]
239 | },
240 | {
241 | "cell_type": "markdown",
242 | "id": "b2cbd437-4b4e-4609-894e-b451e5658b58",
243 | "metadata": {},
244 | "source": [
245 | "_Note that `docker` needs to be installed in advance and running, since it will be used internally to build the image, and it may take a while._"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "id": "a0fed392-5311-4840-9a19-715b543b5788",
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "import os\n",
256 | "from google.cloud.aiplatform.prediction import LocalModel\n",
257 | "\n",
258 | "from huggingface_predictor.predictor import HuggingFacePredictor\n",
259 | "\n",
260 | "local_model = LocalModel.build_cpr_model(\n",
261 | "    \"huggingface_predictor\",\n",
262 | "    f\"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}\",\n",
263 | "    predictor=HuggingFacePredictor,\n",
264 | "    requirements_path=\"huggingface_predictor/requirements.txt\",\n",
265 | "    base_image=\"--platform=linux/amd64 python:3.10 AS build\",\n",
266 | ")"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "id": "681fc0c1-b3fb-4abb-9d42-d3d65d739f83",
272 | "metadata": {},
273 | "source": [
274 | "## Push Docker image"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "id": "81912453-c376-42e2-a472-5f6a6921c00c",
280 | "metadata": {},
281 | "source": [
282 | "Once the image has been built, we can push it to the Google Container Registry via the `push_image` method of the `LocalModel`. But before pushing the image, we will need to ensure that the container registry is configured via the `gcloud auth configure-docker` command."
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "id": "3b4bbe25-62d3-4d41-8b35-c8734fad47cd",
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "!gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet"
293 | ]
294 | },
295 | {
296 | "cell_type": "markdown",
297 | "id": "c71d6eae-0d62-4e98-8c6b-d84ec43771c1",
298 | "metadata": {},
299 | "source": [
300 | "Then we can call the `push_image` method, which will internally run `docker push` to the container registry."
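,
"\n",
"Optionally, before pushing, we can also smoke-test the image locally via the `deploy_to_local_endpoint` method of the `LocalModel`, which runs the container against the local Docker daemon. The sketch below makes two assumptions: that a `model.tar.gz` for a zero-shot classification model has already been uploaded to a Cloud Storage path we can read (covered in the next notebook), and that a service account key is available locally as `credentials.json` (a hypothetical path)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0e1d2c3-local-endpoint-smoke-test",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Sketch: run the CPR image locally and send a test request to it.\n",
"# Assumptions: the GCS path below exists and holds a model.tar.gz for a\n",
"# zero-shot classification model, and credentials.json is a valid key file.\n",
"with local_model.deploy_to_local_endpoint(\n",
"    artifact_uri=\"gs://<BUCKET_NAME>/<MODEL_DIR>\",\n",
"    credential_path=\"credentials.json\",\n",
") as local_endpoint:\n",
"    response = local_endpoint.predict(\n",
"        request=json.dumps(\n",
"            {\n",
"                \"sequences\": \"I love this product!\",\n",
"                \"candidate_labels\": [\"positive\", \"negative\"],\n",
"            }\n",
"        ),\n",
"        headers={\"Content-Type\": \"application/json\"},\n",
"    )\n",
"    print(response.content)"
]
},
{
"cell_type": "markdown",
"id": "a1b2c3d4-push-after-local-test",
"metadata": {},
"source": [
"If the local endpoint responds as expected, we can proceed and push the image:"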
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "id": "d5216ed4-ccbf-4c11-bd03-1082d0ae22b8",
307 | "metadata": {},
308 | "outputs": [],
309 | "source": [
310 | "local_model.push_image()"
311 | ]
312 | },
313 | {
314 | "attachments": {},
315 | "cell_type": "markdown",
316 | "id": "bc4265d7-8e0b-4aff-9183-83fcf3d1e6f8",
317 | "metadata": {},
318 | "source": [
319 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/docker-artifact-registry.png)"
320 | ]
321 | }
322 | ],
323 | "metadata": {
324 | "kernelspec": {
325 | "display_name": "Python 3 (ipykernel)",
326 | "language": "python",
327 | "name": "python3"
328 | },
329 | "language_info": {
330 | "codemirror_mode": {
331 | "name": "ipython",
332 | "version": 3
333 | },
334 | "file_extension": ".py",
335 | "mimetype": "text/x-python",
336 | "name": "python",
337 | "nbconvert_exporter": "python",
338 | "pygments_lexer": "ipython3",
339 | "version": "3.11.3"
340 | }
341 | },
342 | "nbformat": 4,
343 | "nbformat_minor": 5
344 | }
345 |
--------------------------------------------------------------------------------
/online-prediction/02-upload-register-and-deploy-models.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "bc03b2f2-001b-4c39-864d-b01bf4cec602",
6 | "metadata": {},
7 | "source": [
8 | "# Upload, register and deploy models from the 🤗 Hub"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "ddbe85d3-a6c3-4e88-a7d0-fe5e5f9b9852",
14 | "metadata": {},
15 | "source": [
16 | "In order to deploy a model in Vertex AI, we will first need to upload it to a Google Cloud Storage (GCS) bucket, and then register it in Vertex AI; meaning that we cannot deploy straight from the HuggingFace Hub, but must use GCS as the intermediate storage."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "72fff362-11f9-4d4d-bdcb-bc06d743a37a",
22 | "metadata": {},
23 | "source": [
24 | "## Installation"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "a0426d78-bdfb-483d-8fa5-b9cd6c9eca8b",
30 | "metadata": {},
31 | "source": [
32 | "* `gcloud` CLI needs to be installed and logged into the project that will be used. See the installation notes at https://cloud.google.com/sdk/docs/install\n",
33 | "\n",
34 | "* `google-cloud-aiplatform` Python library is required to register the model and deploy it as an endpoint in Vertex AI.\n",
35 | "\n",
36 | " `pip install google-cloud-aiplatform --upgrade`\n",
37 | "\n",
38 | "* `git lfs` needs to be installed for pulling / cloning models from the HuggingFace Hub. See the installation notes at https://git-lfs.com/."
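,
"\n",
"As a quick, optional sanity check, the cell below prints the versions of the tools listed above, so a missing installation surfaces early:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6a7b8c9-tooling-check",
"metadata": {},
"outputs": [],
"source": [
"# Verify that the required tooling is available\n",
"!gcloud --version | head -n 1\n",
"!git lfs version\n",
"!pip show google-cloud-aiplatform | head -n 2"
]
},
{
"cell_type": "markdown",
"id": "a7b8c9d0-tooling-check-note",
"metadata": {},
"source": [
"If any of the commands above fails, revisit the corresponding installation notes before continuing."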
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "id": "24fa6442-2ba5-4899-8e59-0e24223b90b5",
44 | "metadata": {},
45 | "source": [
46 | "## Setup"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "id": "923e0887-a873-4af1-8f5c-235a0b359ac8",
52 | "metadata": {},
53 | "source": [
54 | "To successfully run the code below, you will need to be authenticated into your Google Cloud account and the following variable values must be set in advance:\n",
55 | "\n",
56 | "* `REGION` is the region where the resources will be hosted.\n",
57 | "* `PROJECT_ID` is the identifier of the project in Google Cloud.\n",
58 | "* `REPOSITORY` is the Artifact Registry repository where the Docker images will be uploaded.\n",
59 | "* `IMAGE` is the name of the Docker image (without tag).\n",
60 | "* `TAG` is the tag of the Docker image.\n",
61 | "* `BUCKET_NAME` is the name of the bucket where the model will be / has been uploaded to.\n",
62 | "* `BUCKET_URI` is the full path to the `model.tar.gz` file in Cloud Storage."
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "id": "4e1a8b17-4a27-45d2-9955-3e4d0672c4c1",
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "REGION = \"europe-west9\"\n",
73 | "PROJECT_ID = \"huggingface-cloud\"\n",
74 | "REPOSITORY = \"custom-inference\"\n",
75 | "IMAGE = \"huggingface-pipeline\"\n",
76 | "TAG = \"py310-cpu-torch-2.2.0-transformers-4.38.1\"\n",
77 | "SERVING_CONTAINER_IMAGE_URI = f\"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}\"\n",
78 | "BUCKET_NAME = \"huggingface-cloud\"\n",
79 | "BUCKET_URI = f\"gs://{BUCKET_NAME}/bart-large-mnli/model.tar.gz\""
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "id": "fa4e1243-fdb8-4754-8b27-ade13d4f1b52",
85 | "metadata": {},
86 | "source": [
87 | "## Upload model from the 🤗 Hub"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "id": "46a1d688-44de-4db9-9bc2-67eba21fae2b",
93 | "metadata": {},
94 | "source": [
95 | "First we need to decide which model from the HuggingFace Hub we want to use; in this case, we will be using `facebook/bart-large-mnli`, which is a zero-shot classification model, but it could be any model in the Hub.\n",
96 | "\n",
97 | "In order to do so, we will pull the model from the HuggingFace Hub using `git clone`, which requires `git lfs` to be installed in advance, in order to also pull the large files from the repository."
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "id": "0b77aeb4-a4a9-4086-a5a5-beff8b5f3b60",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "!git lfs install\n",
108 | "!git clone https://huggingface.co/facebook/bart-large-mnli"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "id": "aaa3e90d-930f-416f-b8b0-b979531ed937",
114 | "metadata": {},
115 | "source": [
116 | "Once we clone it, we will need to decide which files we want to package into `model.tar.gz` and which files we want to leave out, i.e. the weights for frameworks other than the one we want to use, e.g. `torch`, and any other file that may be part of the model repository that's not needed to load the model for inference."
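,
"\n",
"To see which files the cloned repository actually contains before deciding what to exclude, we can simply list it:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2c3d4e5-list-cloned-repo",
"metadata": {},
"outputs": [],
"source": [
"# List the cloned repository, with file sizes, to decide what to package\n",
"!ls -lh bart-large-mnli/"
]
},
{
"cell_type": "markdown",
"id": "c3d4e5f6-packaging-note",
"metadata": {},
"source": [
"In this case we will keep the `safetensors` weights plus the tokenizer and configuration files, and exclude the Flax, PyTorch `.bin` and Rust weights, as done in the `tar` command below:"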
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "id": "74b6d294-5feb-4c9c-8d5c-a9c7e94f1793",
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "!cd bart-large-mnli/ && tar zcvf model.tar.gz --exclude flax_model.msgpack --exclude pytorch_model.bin --exclude rust_model.ot * && mv model.tar.gz ../"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "id": "3bbf230f-d372-401c-bc72-233801c2d79e",
132 | "metadata": {},
133 | "source": [
134 | "Once we've packaged all the required files into `model.tar.gz`, we can upload it to Google Cloud Storage, so that the URI pointing to that file can later be provided to Vertex AI to register the model from that location."
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "e7b145c0-e6f2-4466-b0ca-1ef88c8606de",
141 | "metadata": {
142 | "id": "NIq7R4HZCfIc"
143 | },
144 | "outputs": [],
145 | "source": [
146 | "!gcloud config set storage/parallel_composite_upload_enabled True\n",
147 | "!gcloud storage cp model.tar.gz $BUCKET_URI"
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "id": "cc97cf23-cffc-46d1-8d70-6771bea50e0f",
153 | "metadata": {},
154 | "source": [
155 | "Optionally, we can run `gcloud storage ls` to ensure that the `model.tar.gz` file has indeed been uploaded to GCS."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "id": "a39a7ebe-478b-45ad-9b9d-138876feff39",
162 | "metadata": {
163 | "id": "vhOb7YnwClBb"
164 | },
165 | "outputs": [],
166 | "source": [
167 | "!gcloud storage ls --recursive gs://{BUCKET_NAME}"
168 | ]
169 | },
170 | {
171 | "cell_type": "markdown",
172 | "id": "4751d71f-0f8f-46af-92b5-d9d5d3a116d0",
173 | "metadata": {},
174 | "source": [
175 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/model-cloud-storage.png)"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "id": "2753e46d-3659-4a6c-9a68-1c97277edbec",
181 | "metadata": {},
182 | "source": [
183 | "## Register model in Vertex AI"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "id": "001de1b4-a85e-45f4-9c5a-ad3f0bdfb5fa",
189 | "metadata": {},
190 | "source": [
191 | "Once the model is uploaded to GCS, we can register it in Vertex AI, but to do so we will also need to specify, in advance, the container that will run that model.\n",
192 | "\n",
193 | "It could be any container with `python` and `pip` installed and the CPR requirements met, so that the endpoint can be deployed normally; otherwise the deployment will fail.\n",
194 | "\n",
195 | "For more information check [`01-build-custom-inference-images.ipynb`](./01-build-custom-inference-images.ipynb)."
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "id": "c9e49e1b-7e68-46b1-b828-96d742591bca",
202 | "metadata": {
203 | "id": "8d682d8388ec"
204 | },
205 | "outputs": [],
206 | "source": [
207 | "from google.cloud import aiplatform\n",
208 | "\n",
209 | "aiplatform.init(project=PROJECT_ID, location=REGION)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "id": "6d1dcc3d-9174-42c9-8e5b-ced0ecdaad7f",
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "!gcloud auth login\n",
220 | "!gcloud auth application-default login"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "id": "f25184f6-6692-471b-9ba7-5d38470b3d32",
226 | "metadata": {},
227 | "source": [
228 | "We will be using `google-cloud-aiplatform` to register a model from GCS into Vertex AI, which in this case will be the model previously uploaded to GCS.\n",
229 | "\n",
230 | "_**Note**: all the `serving_*` arguments of the classmethod `upload` from `aiplatform.Model` refer to the container that will be pulled from the container registry and used during inference when deploying the endpoint, which must have been created in advance, as explained in `01-build-custom-inference-images.ipynb`._"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "id": "b8d948cf-cf8e-4225-b315-72c8bc744aa0",
237 | "metadata": {
238 | "id": "2738154345d5"
239 | },
240 | "outputs": [],
241 | "source": [
242 | "model = aiplatform.Model.upload(\n",
243 | "    display_name=\"bart-large-mnli\",\n",
244 | "    artifact_uri=\"gs://huggingface-cloud/bart-large-mnli\",\n",
245 | "    serving_container_image_uri=SERVING_CONTAINER_IMAGE_URI,\n",
246 | "    serving_container_environment_variables={\n",
247 | "        \"HF_TASK\": \"zero-shot-classification\",\n",
248 | "        # Optional env var so that `uvicorn` only runs the model in 1 worker\n",
249 | "        # \"VERTEX_CPR_WEB_CONCURRENCY\": 1,\n",
250 | "    },\n",
251 | ")"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "id": "94a1df93-e4a7-4060-a804-b14cf15d4e73",
257 | "metadata": {},
258 | "source": [
259 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/model-registry-vertex-ai.png)"
260 | ]
261 | },
262 | {
263 | "cell_type": "markdown",
264 | "id": "0a47b05a-be73-4b62-bff9-38b061f2fa95",
265 | "metadata": {},
266 | "source": [
267 | "## Deploy endpoint in Vertex AI"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "id": "463bc372-7de3-49e3-be6f-78a14be24498",
273 | "metadata": {},
274 | "source": [
275 | "Finally, we can use the `aiplatform.Model` object returned when the `upload` class method was called, and call its `deploy` method, which will deploy an endpoint using FastAPI (unless the handler in the CPR was overwritten) running on a machine matching the `machine_type` argument.\n",
276 | "\n",
277 | "In this case we will only define the `machine_type` arg, and we will use `e2-standard-4`, which is a VM from Compute Engine with 4 vCPUs and 16GiB of RAM.\n",
278 | "\n",
279 | "To see all the possible args of `deploy` see the source code at [`google.cloud.aiplatform.models`](https://github.com/googleapis/python-aiplatform/blob/f249353b918823b35495b295a75a90528ad652c0/google/cloud/aiplatform/models.py#L3418)."
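,
"\n",
"Although below we will only set `machine_type`, it may help to see some of the other common arguments; the following sketch is illustrative only (the values are examples, not recommendations), which is why it is left commented out:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4e5f6a7-deploy-args-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only; the actual deployment below uses the simpler call\n",
"# endpoint = model.deploy(\n",
"#     machine_type=\"e2-standard-4\",\n",
"#     min_replica_count=1,  # keep at least one replica running\n",
"#     max_replica_count=2,  # allow scaling out under load\n",
"#     traffic_percentage=100,  # route all traffic to this deployed model\n",
"# )"
]
},
{
"cell_type": "markdown",
"id": "e5f6a7b8-deploy-args-note",
"metadata": {},
"source": [
"All of these are regular keyword arguments of `Model.deploy`; the deployment below sticks to the defaults."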
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "id": "307f4dea-dddd-4c14-9f7a-4fa407aa7c2a",
285 | "metadata": {},
286 | "source": [
287 | "_**Note**: the `deploy` method will take a while (~15-20 minutes) in order to deploy the model in Vertex AI as an endpoint._"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "id": "9e7329f2-eb35-40ec-9e5a-388ed052fe6f",
294 | "metadata": {
295 | "id": "62cf66498a28"
296 | },
297 | "outputs": [],
298 | "source": [
299 | "endpoint = model.deploy(machine_type=\"e2-standard-4\")"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "id": "05e35d3a-efa1-42bb-ab75-3099ca182c0a",
305 | "metadata": {},
306 | "source": [
307 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/endpoint-deployment-vertex-ai.png)"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "id": "bbabf289-48f3-4174-9067-eb2d506ea966",
313 | "metadata": {},
314 | "source": [
315 | "## Online predictions in Vertex AI"
316 | ]
317 | },
318 | {
319 | "cell_type": "markdown",
320 | "id": "1d1598b4-3f88-4de8-a585-9de104986e48",
321 | "metadata": {},
322 | "source": [
323 | "Then, once the Vertex AI endpoint is running, we can try it out using the `PredictionServiceClient` from `google-cloud-aiplatform` to send an HTTP request to the `bart-large-mnli` model running the `zero-shot-classification` task."
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "id": "47b335f4-ea96-4c11-a681-4bc98e7540f9",
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "import json\n",
334 | "from google.api import httpbody_pb2\n",
335 | "from google.cloud import aiplatform_v1\n",
336 | "\n",
337 | "prediction_client = aiplatform_v1.PredictionServiceClient(\n",
338 | "    client_options={\"api_endpoint\": f\"{REGION}-aiplatform.googleapis.com\"}\n",
339 | ")\n",
340 | "\n",
341 | "data = {\n",
342 | "    \"sequences\": \"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\",\n",
343 | "    \"candidate_labels\": [\"mobile\", \"website\", \"billing\", \"account access\"],\n",
344 | "}\n",
345 | "\n",
346 | "json_data = json.dumps(data)\n",
347 | "\n",
348 | "http_body = httpbody_pb2.HttpBody(\n",
349 | "    data=json_data.encode(\"utf-8\"),\n",
350 | "    content_type=\"application/json\",\n",
351 | ")\n",
352 | "\n",
353 | "request = aiplatform_v1.RawPredictRequest(\n",
354 | "    endpoint=endpoint.resource_name,\n",
355 | "    http_body=http_body,\n",
356 | ")\n",
357 | "\n",
358 | "response = prediction_client.raw_predict(request)\n",
359 | "json.loads(response.data)"
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "id": "eabfc951-6d6d-420c-9044-cf79c8028ef7",
365 | "metadata": {},
366 | "source": [
367 | "## Resource clean-up"
368 | ]
369 | },
370 | {
371 | "cell_type": "markdown",
372 | "id": "0d85ebec-732b-4446-a4e5-841d4dcf659d",
373 | "metadata": {},
374 | "source": [
375 | "Finally, we clean up the resources used, i.e. the Vertex AI endpoint and the model. In this case we clean them up because we are no longer going to use them, but alternatively we could leave them running."
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "id": "0b628def-8d82-4733-9796-287dd2cb583e",
382 | "metadata": {},
383 | "outputs": [],
384 | "source": [
385 | "endpoint.delete(force=True)\n",
386 | "model.delete()"
387 | ]
388 | }
389 | ],
390 | "metadata": {
391 | "kernelspec": {
392 | "display_name": "Python 3 (ipykernel)",
393 | "language": "python",
394 | "name": "python3"
395 | },
396 | "language_info": {
397 | "codemirror_mode": {
398 | "name": "ipython",
399 | "version": 3
400 | },
401 | "file_extension": ".py",
402 | "mimetype": "text/x-python",
403 | "name": "python",
404 | "nbconvert_exporter": "python",
405 | "pygments_lexer": "ipython3",
406 | "version": "3.11.3"
407 | }
408 | },
409 | "nbformat": 4,
410 | "nbformat_minor": 5
411 | }
412 |
--------------------------------------------------------------------------------
/online-prediction/03-from-hub-to-vertex-ai.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# From the 🤗 HuggingFace Hub to Vertex AI on CPU"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "In this notebook, we will go through the same steps as in the notebooks `01-build-custom-inference-images.ipynb` and `02-upload-register-and-deploy-models.ipynb`, but putting it all together as an end-to-end guide on how to deploy models from the HuggingFace Hub into Vertex AI on CPU-only machines.\n",
15 | "\n",
16 | "For a more detailed explanation of each step, please consider having a look at the notebooks mentioned above."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Installation"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "* `gcloud` CLI needs to be installed and logged into the project that will be used. See the installation notes at https://cloud.google.com/sdk/docs/install\n",
31 | "\n",
32 | "* `docker` needs to be installed locally, and up and running, since it will be used to build the CPR images before pushing them to the container registry. See the installation notes at https://docs.docker.com/engine/install/\n",
33 | "\n",
34 | "* `google-cloud-aiplatform` Python library is required to programmatically build the CPR image, to define the custom prediction code via a custom `Predictor`, to register and deploy the model to an endpoint in Vertex AI, and to run the online prediction on it.\n",
35 | "\n",
36 | " `pip install google-cloud-aiplatform --upgrade`\n",
37 | "\n",
38 | "* `git lfs` needs to be installed for pulling / cloning models from the HuggingFace Hub. See the installation notes at https://git-lfs.com/."
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Setup"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "To successfully run the code below, you will need to be authenticated into your Google Cloud account and the following variable values must be set in advance:\n",
53 | "\n",
54 | "* `REGION` is the region where the resources will be hosted.\n",
55 | "* `PROJECT_ID` is the identifier of the project in Google Cloud.\n",
56 | "* `REPOSITORY` is the Artifact Registry repository where the Docker images will be uploaded.\n",
57 | "* `IMAGE` is the name of the Docker image (without tag).\n",
58 | "* `TAG` is the tag of the Docker image.\n",
59 | "* `BUCKET_NAME` is the name of the bucket where the model will be / has been uploaded to.\n",
60 | "* `BUCKET_URI` is the full path to the `model.tar.gz` file in Cloud Storage."
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "REGION = \"europe-west9\"\n",
70 | "PROJECT_ID = \"huggingface-cloud\"\n",
71 | "REPOSITORY = \"custom-inference\"\n",
72 | "IMAGE = \"huggingface-pipeline\"\n",
73 | "TAG = \"py310-cpu-torch-2.2.0-transformers-4.38.1\"\n",
74 | "BUCKET_NAME = \"huggingface-cloud\"\n",
75 | "BUCKET_URI = f\"gs://{BUCKET_NAME}/bart-large-mnli/model.tar.gz\""
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "---"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## Custom Prediction Routine (CPR) using 🤗 `transformers.pipeline`"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "In order to successfully run the inference over a HuggingFace model, we will need to define a custom class inheriting from the `Predictor` class from `google-cloud-aiplatform`. To run the inference we will use the `pipeline` method from 🤗 `transformers`, which will be loaded as part of the `Predictor.load` method, controlled by the environment variable `HF_TASK`; then the `pipeline` will run within the `predict` method and will generate the output as a Python dict.\n",
97 | "\n",
98 | "For more information check [`01-build-custom-inference-images.ipynb`](./01-build-custom-inference-images.ipynb)."
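,
"\n",
"As a reference for how the `instances` dictionary maps onto the `pipeline` call, the sketch below shows the equivalent invocation locally. It assumes `transformers` and `torch` are installed in the local environment, and note that it downloads the full model from the Hub:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Local equivalent of what the predictor will run inside the container:\n",
"# `pipeline(task, model=...)` is created once, then called with the\n",
"# request payload unpacked as keyword arguments.\n",
"from transformers import pipeline\n",
"\n",
"pipe = pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\")\n",
"pipe(**{\n",
"    \"sequences\": \"My phone overheats whenever I use your app.\",\n",
"    \"candidate_labels\": [\"mobile\", \"website\", \"billing\"],\n",
"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"With that mapping in mind, we can write the predictor:"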
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "!mkdir huggingface_predictor" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "%%writefile huggingface_predictor/predictor.py\n", 117 | "import os\n", 118 | "import logging\n", 119 | "import tarfile\n", 120 | "from typing import Any, Dict\n", 121 | "\n", 122 | "from transformers import pipeline\n", 123 | "\n", 124 | "from google.cloud.aiplatform.prediction.predictor import Predictor\n", 125 | "from google.cloud.aiplatform.utils import prediction_utils\n", 126 | "\n", 127 | "logger = logging.getLogger(__name__)\n", 128 | "logger.setLevel(logging.DEBUG)\n", 129 | "\n", 130 | "\n", 131 | "class HuggingFacePredictor(Predictor):\n", 132 | " def __init__(self) -> None:\n", 133 | " pass\n", 134 | "\n", 135 | " def load(self, artifacts_uri: str) -> None:\n", 136 | " \"\"\"Loads the preprocessor and model artifacts.\"\"\"\n", 137 | " logger.debug(f\"Downloading artifacts from {artifacts_uri}\")\n", 138 | " prediction_utils.download_model_artifacts(artifacts_uri)\n", 139 | " logger.debug(\"Artifacts successfully downloaded!\")\n", 140 | " os.makedirs(\"./model\", exist_ok=True)\n", 141 | " with tarfile.open(\"model.tar.gz\", \"r:gz\") as tar:\n", 142 | " tar.extractall(path=\"./model\")\n", 143 | " logger.debug(f\"HF_TASK value is {os.getenv('HF_TASK')}\")\n", 144 | " self._pipeline = pipeline(os.getenv(\"HF_TASK\", \"\"), model=\"./model\")\n", 145 | " logger.debug(\"`pipeline` successfully loaded!\")\n", 146 | "\n", 147 | " def predict(self, instances: Dict[str, Any]) -> Dict[str, Any]:\n", 148 | " return self._pipeline(**instances)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "%%writefile huggingface_predictor/requirements.txt\n", 158 | "torch==2.2.0\n", 159 | "transformers==4.38.1" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "Then we can build the Docker image for the CPR, and we will be using `python:3.10-slim` as the base image, as it has `python` and `pip` installed which are the main requirements for the Vertex AI endpoint to be deployed, and required to install the rest of the dependencies needed.\n", 167 | "\n", 168 | "Anyway, everyone can create their own custom Docker image, but it should have the following:\n", 169 | "\n", 170 | "* `python` installed under the alias of `python`, not `python3`\n", 171 | "* `pip` installed under the alias of `pip`, not `pip3`\n", 172 | "* Needs to be built with `--platform=linux/amd64` to work on Vertex AI (if not using Linux already)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "import os\n", 182 | "from google.cloud.aiplatform.prediction import LocalModel\n", 183 | "\n", 184 | "from huggingface_predictor.predictor import HuggingFacePredictor\n", 185 | "\n", 186 | "local_model = LocalModel.build_cpr_model(\n", 187 | " \"huggingface_predictor\",\n", 188 | " f\"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}\",\n", 189 | " predictor=HuggingFacePredictor,\n", 190 | " requirements_path=\"huggingface_predictor/requirements.txt\",\n", 191 | " base_image=\"--platform=linux/amd64 python:3.10-slim AS build\",\n", 192 | ")" 193 | ] 194 
| },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "!gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "!gcloud artifacts repositories create custom-inference --repository-format=docker --location={REGION}"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "local_model.push_image()"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/docker-artifact-registry.png)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "---"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## Upload model from the 🤗 Hub"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "First we need to decide which model from the HuggingFace Hub we want to use; in this case, we will be using `facebook/bart-large-mnli`, which is a zero-shot classification model.\n",
248 | "\n",
249 | "In order to do so, we will pull the model from the HuggingFace Hub using `git clone`, which requires `git lfs` to be installed in advance, in order to also pull the large files from the repository.\n",
250 | "\n",
251 | "For more information check [`02-upload-register-and-deploy-models.ipynb`](./02-upload-register-and-deploy-models.ipynb)."
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "!git lfs install\n",
261 | "!git clone https://huggingface.co/facebook/bart-large-mnli"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "!ls"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "!cd bart-large-mnli/ && tar zcvf model.tar.gz --exclude flax_model.msgpack --exclude pytorch_model.bin --exclude rust_model.ot * && mv model.tar.gz ../"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "id": "NIq7R4HZCfIc"
287 | },
288 | "outputs": [],
289 | "source": [
290 | "!gcloud config set storage/parallel_composite_upload_enabled True\n",
291 | "!gcloud storage cp model.tar.gz $BUCKET_URI"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {
298 | "id": "vhOb7YnwClBb"
299 | },
300 | "outputs": [],
301 | "source": [
302 | "!gcloud storage ls --recursive gs://{BUCKET_NAME}/bart-large-mnli"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "metadata": {},
308 | "source": [
309 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/model-cloud-storage.png)"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "## Register model in Vertex AI"
317 | ]
318 | },
319 | {
320 | "cell_type": "markdown",
321 | "metadata": {},
322 | "source": [
323 | "Once the model is uploaded to GCS and the CPR image has been
pushed to Google's Container Registry, we can register the model in Vertex AI.\n",
324 | "\n",
325 | "For more information check [`02-upload-register-and-deploy-models.ipynb`](./02-upload-register-and-deploy-models.ipynb)."
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {
332 | "id": "8d682d8388ec"
333 | },
334 | "outputs": [],
335 | "source": [
336 | "from google.cloud import aiplatform\n",
337 | "\n",
338 | "aiplatform.init(project=PROJECT_ID, location=REGION)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": null,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "!gcloud auth login\n",
348 | "!gcloud auth application-default login"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "id": "2738154345d5"
356 | },
357 | "outputs": [],
358 | "source": [
359 | "model = aiplatform.Model.upload(\n",
360 | "    display_name=\"bart-large-mnli\",\n",
361 | "    artifact_uri=\"gs://huggingface-cloud/bart-large-mnli\",\n",
362 | "    serving_container_image_uri=local_model.get_serving_container_spec().image_uri,\n",
363 | "    serving_container_environment_variables={\n",
364 | "        \"HF_TASK\": \"zero-shot-classification\",\n",
365 | "        # Optional env var so that `uvicorn` only runs the model in 1 worker\n",
366 | "        # \"VERTEX_CPR_WEB_CONCURRENCY\": 1,\n",
367 | "    },\n",
368 | ")"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/model-registry-vertex-ai.png)"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "_**Note**: the `deploy` method will take a while (~15-20 minutes) in order to deploy the model in Vertex AI as an endpoint._"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {
389 | "id": "62cf66498a28"
390 | },
391 | "outputs": [],
392 | "source": [
393 | "endpoint = model.deploy(machine_type=\"e2-standard-4\")"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "metadata": {},
399 | "source": [
400 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/endpoint-deployment-vertex-ai.png)"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 | "---"
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "## Run online predictions on Vertex AI"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {},
420 | "source": [
421 | "Finally, we can proceed to run the online predictions on Vertex AI using their Python client, which will send the requests to the running endpoint, and we will also be able to closely monitor it via the Google Cloud Logging service."
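,
"\n",
"For reference, the same call can also be made with plain HTTP against the `rawPredict` REST method; a sketch with `curl` is shown below, left commented out since the `<ENDPOINT_ID>` placeholder must be replaced first (it can be taken from `endpoint.resource_name`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch of the equivalent raw HTTP request; replace <ENDPOINT_ID> before running\n",
"# !curl -X POST \\\n",
"#     -H \"Authorization: Bearer $(gcloud auth print-access-token)\" \\\n",
"#     -H \"Content-Type: application/json\" \\\n",
"#     \"https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{REGION}/endpoints/<ENDPOINT_ID>:rawPredict\" \\\n",
"#     -d '{\"sequences\": \"...\", \"candidate_labels\": [\"mobile\", \"website\"]}'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we use the Python client instead, which handles the authentication and request wrapping for us:"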
422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "import json\n", 431 | "from google.api import httpbody_pb2\n", 432 | "from google.cloud import aiplatform_v1\n", 433 | "\n", 434 | "prediction_client = aiplatform_v1.PredictionServiceClient(\n", 435 | " client_options={\"api_endpoint\": f\"{REGION}-aiplatform.googleapis.com\"}\n", 436 | ")\n", 437 | "\n", 438 | "data = {\n", 439 | " \"sequences\": \"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\",\n", 440 | " \"candidate_labels\": [\"mobile\", \"website\", \"billing\", \"account access\"],\n", 441 | "}\n", 442 | "\n", 443 | "json_data = json.dumps(data)\n", 444 | "\n", 445 | "http_body = httpbody_pb2.HttpBody(\n", 446 | " data=json_data.encode(\"utf-8\"),\n", 447 | " content_type=\"application/json\",\n", 448 | ")\n", 449 | "\n", 450 | "request = aiplatform_v1.RawPredictRequest(\n", 451 | " endpoint=endpoint.resource_name,\n", 452 | " http_body=http_body,\n", 453 | ")\n", 454 | "\n", 455 | "response = prediction_client.raw_predict(request)\n", 456 | "json.loads(response.data)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "---" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## Resource clean-up" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "Finally, we can release the resources we've allocated / created using the following `delete` methods over both the `endpoint` and the `model` variables in Vertex AI, and also the following `gcloud` CLI commands to remove both the Docker image from the Container Registry and the HuggingFace model from the Cloud Storage." 
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": null,
483 | "metadata": {},
484 | "outputs": [],
485 | "source": [
486 | "endpoint.delete(force=True)\n",
487 | "model.delete()"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": null,
493 | "metadata": {},
494 | "outputs": [],
495 | "source": [
496 | "!gcloud artifacts docker images delete --quiet --delete-tags {REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}\n",
497 | "!gcloud storage rm -r $BUCKET_URI"
498 | ]
499 | }
500 | ],
501 | "metadata": {
502 | "colab": {
503 | "collapsed_sections": [],
504 | "name": "SDK_Custom_Predict_and_Handler_SDK_Integration.ipynb",
505 | "toc_visible": true
506 | },
507 | "kernelspec": {
508 | "display_name": "Python 3 (ipykernel)",
509 | "language": "python",
510 | "name": "python3"
511 | },
512 | "language_info": {
513 | "codemirror_mode": {
514 | "name": "ipython",
515 | "version": 3
516 | },
517 | "file_extension": ".py",
518 | "mimetype": "text/x-python",
519 | "name": "python",
520 | "nbconvert_exporter": "python",
521 | "pygments_lexer": "ipython3",
522 | "version": "3.11.3"
523 | }
524 | },
525 | "nbformat": 4,
526 | "nbformat_minor": 4
527 | }
528 |
--------------------------------------------------------------------------------
/online-prediction/04-from-hub-to-vertex-ai-gpu.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# From the 🤗 HuggingFace Hub to Vertex AI on GPU"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "In this notebook, we will go through the same steps as in the notebooks `01-build-custom-inference-images.ipynb` and `02-upload-register-and-deploy-models.ipynb`. Similarly to `03-from-hub-to-vertex-ai.ipynb`, which is an end-to-end guide on how to deploy models from the HuggingFace Hub into Vertex AI on CPU, this notebook will only add the replacements/modifications needed in order to use the GPU.\n",
15 | "\n",
16 | "For a more detailed explanation of each step, please consider having a look at the notebooks mentioned above."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Installation"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "* `gcloud` CLI needs to be installed and logged into the project that will be used. See the installation notes at https://cloud.google.com/sdk/docs/install\n",
31 | "\n",
32 | "* `docker` needs to be installed locally, and up and running, since it will be used to build the CPR images before pushing them to the container registry. See the installation notes at https://docs.docker.com/engine/install/\n",
33 | "\n",
34 | "* `google-cloud-aiplatform` Python library is required to programmatically build the CPR image, to define the custom prediction code via a custom `Predictor`, to register and deploy the model to an endpoint in Vertex AI, and to run the online prediction on it.\n",
35 | "\n",
36 | " `pip install google-cloud-aiplatform --upgrade`\n",
37 | "\n",
38 | "* `git lfs` needs to be installed for pulling / cloning models from the HuggingFace Hub. See the installation notes at https://git-lfs.com/."
39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Setup" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "To successfully run the code below, you will need to be authenticated into your Google Cloud account and the following variable values must be set in advance:\n", 53 | "\n", 54 | "* `REGION` is the region where the resources will be hosted.\n", 55 | "* `PROJECT_ID` is the identifier of the project in Google Cloud.\n", 56 | "* `REPOSITORY` is the repository where the Docker images will be uploaded to.\n", 57 | "* `IMAGE` is the name of the Docker image (without tag).\n", 58 | "* `TAG` is the tag of the Docker image.\n", 59 | "* `BUCKET_NAME` is the name of the bucket where the model will be / has been uploaded to.\n", 60 | "* `BUCKET_URI` is the full path to the `model.tar.gz` file in Cloud Storage." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "REGION = \"europe-west4\"\n", 70 | "PROJECT_ID = \"huggingface-cloud\"\n", 71 | "REPOSITORY = \"custom-inference-gpu\"\n", 72 | "IMAGE = \"huggingface-pipeline-gpu\"\n", 73 | "TAG = \"py310-cu12.3-torch-2.2.0-transformers-4.38.1\"\n", 74 | "BUCKET_NAME = \"huggingface-cloud\"\n", 75 | "BUCKET_URI = f\"gs://{BUCKET_NAME}/bart-large-mnli/model.tar.gz\"" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "_**Note**: before choosing a region, you should ensure that it has access to machines with GPU accelerators; otherwise the deployment will fail, since some regions don't offer them, e.g. `europe-west4` does, but `europe-west9` doesn't. More information at https://cloud.google.com/vertex-ai/pricing#prediction-prices._" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "---" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Custom Prediction Routine (CPR) using 🤗 `transformers.pipeline`" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "There are some things to take into consideration before developing the `HuggingFacePredictor` that will be in charge of running the inference for the model, as we need to ensure that it uses the GPU accelerator. Since we're using the `pipeline` method from `transformers`, we can simply rely on the `device_map` argument, whose `auto` option (powered by `accelerate`) will automatically define the device mapping for the given model.\n", 104 | "\n", 105 | "For more information check [`01-build-custom-inference-images.ipynb`](./01-build-custom-inference-images.ipynb).",
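"\n",
"To illustrate the behaviour of `device_map=\"auto\"` before baking it into the image, here is a minimal local sketch; it assumes a machine with a GPU and with `torch`, `transformers` and `accelerate` installed, and it downloads the model straight from the Hub, so it may take a while:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import pipeline\n",
"\n",
"# `device_map=\"auto\"` lets `accelerate` decide where to place the model weights\n",
"pipe = pipeline(\"zero-shot-classification\", model=\"facebook/bart-large-mnli\", device_map=\"auto\")\n",
"print(pipe.device)  # e.g. `cuda:0` when a GPU is available"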
106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "!mkdir huggingface_predictor_gpu" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "%%writefile huggingface_predictor_gpu/predictor.py\n", 124 | "import os\n", 125 | "import logging\n", 126 | "import tarfile\n", 127 | "from typing import Any, Dict\n", 128 | "\n", 129 | "from transformers import pipeline\n", 130 | "\n", 131 | "from google.cloud.aiplatform.prediction.predictor import Predictor\n", 132 | "from google.cloud.aiplatform.utils import prediction_utils\n", 133 | "\n", 134 | "logger = logging.getLogger(__name__)\n", 135 | "logger.setLevel(logging.DEBUG)\n", 136 | "\n", 137 | "\n", 138 | "class HuggingFacePredictor(Predictor):\n", 139 | " def __init__(self) -> None:\n", 140 | " pass\n", 141 | " \n", 142 | " def load(self, artifacts_uri: str) -> None:\n", 143 | " \"\"\"Loads the preprocessor and model artifacts.\"\"\"\n", 144 | " logger.debug(f\"Downloading artifacts from {artifacts_uri}\")\n", 145 | " prediction_utils.download_model_artifacts(artifacts_uri)\n", 146 | " logger.debug(\"Artifacts successfully downloaded!\")\n", 147 | " os.makedirs(\"./model\", exist_ok=True)\n", 148 | " with tarfile.open(\"model.tar.gz\", \"r:gz\") as tar:\n", 149 | " tar.extractall(path=\"./model\")\n", 150 | " logger.debug(f\"HF_TASK value is {os.getenv('HF_TASK')}\")\n", 151 | " self._pipeline = pipeline(os.getenv(\"HF_TASK\", None), model=\"./model\", device_map=\"auto\")\n", 152 | " logger.debug(\"`pipeline` successfully loaded!\")\n", 153 | " logger.debug(f\"`pipeline` is using device={self._pipeline.device}\")\n", 154 | "\n", 155 | " def predict(self, instances: Dict[str, Any]) -> Dict[str, Any]:\n", 156 | " return self._pipeline(**instances)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "Also note that since we're using `device_map` from 🤗 `accelerate`, we will need to also add `accelerate` as a dependency within the `requirements.txt` file." 
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "%%writefile huggingface_predictor_gpu/requirements.txt\n", 173 | "torch==2.2.0\n", 174 | "transformers==4.38.1\n", 175 | "accelerate==0.27.0" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Then we can build the new Docker image with support for GPU inference. We will use a custom image from the Docker Hub, `alvarobartt/torch-gpu:py310-cu12.3-torch-2.2.0`, which has been put together to make it easier to use within Vertex AI: the user just needs to add the dependencies to `requirements.txt`, without worrying about the CUDA drivers and such.\n", 183 | "\n", 184 | "Alternatively, you can create your own custom Docker image, but it should meet the following requirements:\n", 185 | "\n", 186 | "* `python` installed under the alias of `python`, not `python3`\n", 187 | "* `pip` installed under the alias of `pip`, not `pip3`\n", 188 | "* Needs to be built with `--platform=linux/amd64` to work on Vertex AI (if not using Linux already)\n", 189 | "\n", 190 | "_**Note**: I will soon add another tutorial on how to create custom Docker images and directly push those to Google's Artifact Registry without having to rely on `LocalModel.build_cpr_model`, in an easier and more flexible way._" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "import os\n", 200 | "from google.cloud.aiplatform.prediction import LocalModel\n", 201 | "\n", 202 | "from huggingface_predictor_gpu.predictor import HuggingFacePredictor\n", 203 | "\n", 204 | "local_model = LocalModel.build_cpr_model(\n", 205 | " \"huggingface_predictor_gpu\",\n", 206 | " f\"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}:{TAG}\",\n", 207 | " predictor=HuggingFacePredictor,\n", 208 | " requirements_path=\"huggingface_predictor_gpu/requirements.txt\",\n", 209 | " base_image=\"--platform=linux/amd64 alvarobartt/torch-gpu:py310-cu12.3-torch-2.2.0 AS build\",\n", 210 | ")" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "!gcloud auth configure-docker {REGION}-docker.pkg.dev --quiet" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "!gcloud artifacts repositories create custom-inference-gpu --repository-format=docker --location={REGION}" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "local_model.push_image()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/docker-artifact-registry.png)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "---" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## Upload model from the 🤗 Hub" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "First we need to decide which model from the HuggingFace Hub we want to use; in this case, we will be using `facebook/bart-large-mnli`, which is a zero-shot
classification model.\n", 266 | "\n", 267 | "In order to do so, we will clone the model from the HuggingFace Hub using `git clone`, which requires `git lfs` to be installed in advance so that the large files in the repository are also pulled.\n", 268 | "\n", 269 | "For more information check [`02-upload-register-and-deploy-models.ipynb`](./02-upload-register-and-deploy-models.ipynb)." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "!git lfs install\n", 279 | "!git clone https://huggingface.co/facebook/bart-large-mnli" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "!ls" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "!cd bart-large-mnli/ && tar zcvf model.tar.gz --exclude flax_model.msgpack --exclude pytorch_model.bin --exclude rust_model.ot * && mv model.tar.gz ../" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "id": "NIq7R4HZCfIc" 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "!gcloud config set storage/parallel_composite_upload_enabled True\n", 309 | "!gcloud storage cp model.tar.gz $BUCKET_URI" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "id": "vhOb7YnwClBb" 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "!gcloud storage ls --recursive gs://{BUCKET_NAME}/bart-large-mnli" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/model-cloud-storage.png)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "## Register model in Vertex AI" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "Once the model has been uploaded to GCS and the CPR image has been pushed to Google's Artifact Registry, we can register the model in Vertex AI.\n", 342 | "\n", 343 | "For more information check [`02-upload-register-and-deploy-models.ipynb`](./02-upload-register-and-deploy-models.ipynb)."
344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "id": "8d682d8388ec" 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "from google.cloud import aiplatform\n", 355 | "\n", 356 | "aiplatform.init(project=PROJECT_ID, location=REGION)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "!gcloud auth login\n", 366 | "!gcloud auth application-default login" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "id": "2738154345d5" 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "model = aiplatform.Model.upload(\n", 378 | " display_name=\"bart-large-mnli\",\n", 379 | " artifact_uri=\"gs://huggingface-cloud/bart-large-mnli\",\n", 380 | " serving_container_image_uri=local_model.get_serving_container_spec().image_uri,\n", 381 | " serving_container_environment_variables={\n", 382 | " \"HF_TASK\": \"zero-shot-classification\",\n", 383 | " # Optional env var so that `uvicorn` only runs the model in 1 worker\n", 384 | " # \"VERTEX_CPR_WEB_CONCURRENCY\": 1,\n", 385 | " },\n", 386 | ")" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/model-registry-vertex-ai.png)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "_**Note**: the `deploy` method will take a while (~15-20 minutes) to deploy the model in Vertex AI as an endpoint._" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "In this case, since we want to run the inference on GPU, we need to ensure that the `machine_type` is one that comes with a GPU attached, i.e. one from the G2 series. Also, as already mentioned at the beginning of the notebook, some regions/zones may not have access to GPU accelerators, so double-check that the region/zone you're using offers the selected machine type." 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": { 414 | "id": "62cf66498a28" 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "endpoint = model.deploy(\n", 419 | " machine_type=\"g2-standard-4\",\n", 420 | " accelerator_type=\"NVIDIA_L4\",\n", 421 | " accelerator_count=1,\n", 422 | ")" 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "![](https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/main/online-prediction/assets/endpoint-deployment-vertex-ai.png)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": [ 436 | "---" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "## Run online predictions on Vertex AI" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "Finally, we can run the online predictions on Vertex AI using its Python client, which sends the requests to the running endpoint, and we can also closely monitor the endpoint via the Google Cloud Logging service, as sketched below.",
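"\n",
"For instance, once the endpoint has served some requests, its prediction logs can be read from the notebook itself; a minimal sketch, assuming the endpoint logs are published under the default `aiplatform.googleapis.com/Endpoint` resource type:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Read the latest log entries produced by the Vertex AI endpoint\n",
"!gcloud logging read 'resource.type=\"aiplatform.googleapis.com/Endpoint\"' --project {PROJECT_ID} --limit 10"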
451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [ 459 | "import json\n", 460 | "from google.api import httpbody_pb2\n", 461 | "from google.cloud import aiplatform_v1\n", 462 | "\n", 463 | "prediction_client = aiplatform_v1.PredictionServiceClient(\n", 464 | " client_options={\"api_endpoint\": f\"{REGION}-aiplatform.googleapis.com\"}\n", 465 | ")\n", 466 | "\n", 467 | "data = {\n", 468 | " \"sequences\": \"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\",\n", 469 | " \"candidate_labels\": [\"mobile\", \"website\", \"billing\", \"account access\"],\n", 470 | "}\n", 471 | "\n", 472 | "json_data = json.dumps(data)\n", 473 | "\n", 474 | "http_body = httpbody_pb2.HttpBody(\n", 475 | " data=json_data.encode(\"utf-8\"),\n", 476 | " content_type=\"application/json\",\n", 477 | ")\n", 478 | "\n", 479 | "request = aiplatform_v1.RawPredictRequest(\n", 480 | " endpoint=endpoint.resource_name,\n", 481 | " http_body=http_body,\n", 482 | ")\n", 483 | "\n", 484 | "response = prediction_client.raw_predict(request)\n", 485 | "json.loads(response.data)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "---" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "## Resource clean-up" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "Finally, we can release the resources we've created: the `delete` methods below remove both the `endpoint` and the `model` from Vertex AI, while the `gcloud` CLI commands that follow remove the Docker image from the Artifact Registry and the HuggingFace model from Cloud Storage.",
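"\n",
"Additionally, since the `custom-inference-gpu` repository in the Artifact Registry was created at the beginning of this notebook, it can be removed too once the image has been deleted; a minimal sketch, where `--quiet` skips the confirmation prompt:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove the Artifact Registry repository created earlier in this notebook\n",
"!gcloud artifacts repositories delete {REPOSITORY} --location={REGION} --quiet"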
507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "endpoint.delete(force=True)\n", 516 | "model.delete()" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "!gcloud artifacts docker images delete --quiet --delete-tags {REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY}/{IMAGE}\n", 526 | "!gcloud storage rm -r $BUCKET_URI" 527 | ] 528 | } 529 | ], 530 | "metadata": { 531 | "colab": { 532 | "collapsed_sections": [], 533 | "name": "SDK_Custom_Predict_and_Handler_SDK_Integration.ipynb", 534 | "toc_visible": true 535 | }, 536 | "kernelspec": { 537 | "display_name": "Python 3 (ipykernel)", 538 | "language": "python", 539 | "name": "python3" 540 | }, 541 | "language_info": { 542 | "codemirror_mode": { 543 | "name": "ipython", 544 | "version": 3 545 | }, 546 | "file_extension": ".py", 547 | "mimetype": "text/x-python", 548 | "name": "python", 549 | "nbconvert_exporter": "python", 550 | "pygments_lexer": "ipython3", 551 | "version": "3.11.3" 552 | } 553 | }, 554 | "nbformat": 4, 555 | "nbformat_minor": 4 556 | } 557 | -------------------------------------------------------------------------------- /online-prediction/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.3.0-base-ubuntu22.04 AS build 2 | LABEL maintainer="Alvaro Bartolome" 3 | 4 | ARG DEBIAN_FRONTEND=noninteractive 5 | 6 | RUN apt-get update && \ 7 | apt-get install python3 python3-pip -y 8 | 9 | RUN ln -s /usr/bin/python3 /usr/bin/python 10 | ENV PYTHON=/usr/bin/python 11 | 12 | ARG TORCH="2.2.0" 13 | 14 | RUN python -m pip install --no-cache-dir --upgrade pip && \ 15 | python -m pip install --no-cache-dir torch==${TORCH} -------------------------------------------------------------------------------- /online-prediction/README.md: -------------------------------------------------------------------------------- 1 | # Online Prediction 2 | 3 | This directory contains all the notebooks related to the online (real-time) prediction service offered by Vertex AI, and how to use it to deploy 🤗 HuggingFace models from the Hub. 4 | 5 | For more information on Vertex AI please check [Google Cloud - Vertex AI](https://cloud.google.com/vertex-ai). 6 | 7 | ## Contents 8 | 9 | ### Initial guides 10 | 11 | * [`01-build-custom-inference-images.ipynb`](./01-build-custom-inference-images.ipynb): contains the steps to build custom prediction containers to run inference on HuggingFace models using 🤗 `transformers.pipeline` for both CPU and GPU, and how to push those to the Google Cloud Artifact Registry. 12 | 13 | * [`02-upload-register-and-deploy-models.ipynb`](./02-upload-register-and-deploy-models.ipynb): contains the steps to upload, register and deploy any model from the HuggingFace Hub in Vertex AI, covering both CPU-only and GPU-accelerated inference. 14 | 15 | ### End-to-end guides (recommended) 16 | 17 | * [`03-from-hub-to-vertex-ai.ipynb`](./03-from-hub-to-vertex-ai.ipynb): contains a complete guide on how to define a custom container for HuggingFace models, how to upload, register and deploy those in Vertex AI, and how to use the resulting endpoints for online predictions. 
18 | 19 | * [`04-from-hub-to-vertex-ai-gpu.ipynb`](./04-from-hub-to-vertex-ai-gpu.ipynb): contains a complete guide on how to define a custom container for HuggingFace models, how to upload, register and deploy those in Vertex AI, and how to use the resulting endpoints for online predictions, this time using GPU-accelerated inference. 20 | 21 | -------------------------------------------------------------------------------- /online-prediction/assets/docker-artifact-registry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/d0bded48643a25ab46837c92747c767847941099/online-prediction/assets/docker-artifact-registry.png -------------------------------------------------------------------------------- /online-prediction/assets/endpoint-deployment-vertex-ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/d0bded48643a25ab46837c92747c767847941099/online-prediction/assets/endpoint-deployment-vertex-ai.png -------------------------------------------------------------------------------- /online-prediction/assets/model-cloud-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/d0bded48643a25ab46837c92747c767847941099/online-prediction/assets/model-cloud-storage.png -------------------------------------------------------------------------------- /online-prediction/assets/model-registry-vertex-ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alvarobartt/vertex-ai-huggingface/d0bded48643a25ab46837c92747c767847941099/online-prediction/assets/model-registry-vertex-ai.png -------------------------------------------------------------------------------- /online-prediction/huggingface_predictor/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import tarfile 4 | from typing import Any, Dict 5 | 6 | from transformers import pipeline 7 | 8 | from google.cloud.aiplatform.prediction.predictor import Predictor 9 | from google.cloud.aiplatform.utils import prediction_utils 10 | 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.DEBUG) 13 | 14 | 15 | class HuggingFacePredictor(Predictor): 16 | def __init__(self) -> None: 17 | pass 18 | 19 | def load(self, artifacts_uri: str) -> None: 20 | """Loads the preprocessor and model artifacts.""" 21 | logger.debug(f"Downloading artifacts from {artifacts_uri}") 22 | prediction_utils.download_model_artifacts(artifacts_uri) 23 | logger.debug("Artifacts successfully downloaded!") 24 | os.makedirs("./model", exist_ok=True) 25 | with tarfile.open("model.tar.gz", "r:gz") as tar: 26 | tar.extractall(path="./model") 27 | logger.debug(f"HF_TASK value is {os.getenv('HF_TASK')}") 28 | self._pipeline = pipeline(os.getenv("HF_TASK", ""), model="./model") 29 | logger.debug("`pipeline` successfully loaded!") 30 | 31 | def predict(self, instances: Dict[str, Any]) -> Dict[str, Any]: 32 | return self._pipeline(**instances) 33 | -------------------------------------------------------------------------------- /online-prediction/huggingface_predictor/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.2.0 2 | transformers==4.38.1 3 | 
-------------------------------------------------------------------------------- /online-prediction/huggingface_predictor_gpu/predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import tarfile 4 | from typing import Any, Dict 5 | 6 | from transformers import pipeline 7 | 8 | from google.cloud.aiplatform.prediction.predictor import Predictor 9 | from google.cloud.aiplatform.utils import prediction_utils 10 | 11 | logger = logging.getLogger(__name__) 12 | logger.setLevel(logging.DEBUG) 13 | 14 | 15 | class HuggingFacePredictor(Predictor): 16 | def __init__(self) -> None: 17 | pass 18 | 19 | def load(self, artifacts_uri: str) -> None: 20 | """Loads the preprocessor and model artifacts.""" 21 | logger.debug(f"Downloading artifacts from {artifacts_uri}") 22 | prediction_utils.download_model_artifacts(artifacts_uri) 23 | logger.debug("Artifacts successfully downloaded!") 24 | os.makedirs("./model", exist_ok=True) 25 | with tarfile.open("model.tar.gz", "r:gz") as tar: 26 | tar.extractall(path="./model") 27 | logger.debug(f"HF_TASK value is {os.getenv('HF_TASK')}") 28 | self._pipeline = pipeline(os.getenv("HF_TASK", ""), model="./model", device_map="auto") 29 | logger.debug("`pipeline` successfully loaded!") 30 | logger.debug(f"`pipeline` is using device={self._pipeline.device}") 31 | 32 | def predict(self, instances: Dict[str, Any]) -> Dict[str, Any]: 33 | return self._pipeline(**instances) 34 | -------------------------------------------------------------------------------- /online-prediction/huggingface_predictor_gpu/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.2.0 2 | transformers==4.38.1 3 | accelerate==0.27.0 4 | --------------------------------------------------------------------------------