├── .gitignore ├── 01-dataset-management.ipynb ├── 02-experimentation.ipynb ├── 03-training-formalization.ipynb ├── 04-pipeline-deployment.ipynb ├── 05-continuous-training.ipynb ├── 06-model-deployment.ipynb ├── 07-prediction-serving.ipynb ├── 08-model-monitoring.ipynb ├── Dockerfile ├── LICENSE ├── README.md ├── build ├── Dockerfile ├── model-deployment.yaml ├── pipeline-deployment.yaml ├── serving_resources_spec.json └── utils.py ├── mlops.png ├── provision ├── README.md └── terraform │ ├── gcs-bucket.tf │ ├── main.tf │ ├── notebook-instance.tf │ ├── service-accounts.tf │ ├── services.tf │ ├── terraform.tfvars │ └── variables.tf ├── requirements.txt ├── setup.py └── src ├── __init__.py ├── common ├── __init__.py ├── datasource_utils.py └── features.py ├── model_training ├── __init__.py ├── data.py ├── defaults.py ├── exporter.py ├── model.py ├── runner.py ├── task.py └── trainer.py ├── pipeline_triggering ├── __init__.py ├── main.py └── requirements.txt ├── preprocessing ├── __init__.py ├── etl.py └── transformations.py ├── raw_schema └── schema.pbtxt ├── tests ├── __init__.py ├── datasource_utils_tests.py ├── etl_tests.py ├── model_deployment_tests.py ├── model_tests.py └── pipeline_deployment_tests.py └── tfx_pipelines ├── __init__.py ├── components.py ├── config.py ├── prediction_pipeline.py ├── runner.py └── training_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | .idea/ 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | _workspace/ 132 | *.tar.gz 133 | .egg-info/ 134 | *.whl 135 | mlpipeline-ui-metadata.json 136 | *.csv 137 | *.sqllite 138 | model.png 139 | *-pipeline.json 140 | *.DS_Store -------------------------------------------------------------------------------- /01-dataset-management.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e8ba7f4f", 6 | "metadata": {}, 7 | "source": [ 8 | "# 01 - Data Analysis and Preparation\n", 9 | "\n", 10 | "This notebook covers the following tasks:\n", 11 | "\n", 12 | "1. Perform exploratory data analysis and visualization.\n", 13 | "2. Prepare the data for the ML task in BigQuery.\n", 14 | "3. Generate and fix a ` TFDV schema` for the source data.\n", 15 | "4. Create a `Vertex Dataset resource` dataset.\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "b481a247", 21 | "metadata": {}, 22 | "source": [ 23 | "## Dataset\n", 24 | "\n", 25 | "The [Chicago Taxi Trips](https://pantheon.corp.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips) dataset is one of [public datasets hosted with BigQuery](https://cloud.google.com/bigquery/public-data/), which includes taxi trips from 2013 to the present, reported to the City of Chicago in its role as a regulatory agency. The `taxi_trips` table size is 70.72 GB and includes more than 195 million records. The dataset includes information about the trips, like pickup and dropoff datetime and location, passengers count, miles travelled, and trip toll. \n", 26 | "\n", 27 | "The ML task is to predict whether a given trip will result in a tip > 20%." 
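As a quick illustration of the ML task, the label can be derived directly from the `tips` and `fare` columns. The sketch below mirrors the `IF((tips/fare >= 0.2), 1, 0) AS tip_bin` expression used in the data-preparation SQL later in this notebook; it is for intuition only and is not part of the pipeline code.

```python
# Illustration of the binary label used throughout this example:
# a trip is labeled 1 when the tip is at least 20% of the fare, else 0.

def tip_bin(tips: float, fare: float) -> int:
    """Return 1 if the tip is >= 20% of the fare, else 0."""
    return int(tips / fare >= 0.2)

assert tip_bin(tips=3.0, fare=10.0) == 1  # 30% tip -> positive class
assert tip_bin(tips=1.0, fare=10.0) == 0  # 10% tip -> negative class
```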
28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "id": "4fedd0ac", 33 | "metadata": {}, 34 | "source": [ 35 | "## Setup" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "b25967c9", 41 | "metadata": {}, 42 | "source": [ 43 | "### Import libraries" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "fa2cf3f1", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import os\n", 54 | "import pandas as pd\n", 55 | "import tensorflow as tf\n", 56 | "import tensorflow_data_validation as tfdv\n", 57 | "from google.cloud import bigquery\n", 58 | "import matplotlib.pyplot as plt\n", 59 | "\n", 60 | "from google.cloud import aiplatform as vertex_ai" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "9bd0ee37", 66 | "metadata": {}, 67 | "source": [ 68 | "### Setup Google Cloud project" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "c7e4712e", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "PROJECT = '[your-project-id]' # Change to your project id.\n", 79 | "REGION = 'us-central1' # Change to your region.\n", 80 | "\n", 81 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n", 82 | " # Get your GCP project id from gcloud\n", 83 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 84 | " PROJECT = shell_output[0]\n", 85 | " \n", 86 | "print(\"Project ID:\", PROJECT)\n", 87 | "print(\"Region:\", REGION)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "id": "384a817b", 93 | "metadata": {}, 94 | "source": [ 95 | "### Set configurations" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "b71d0738", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "BQ_DATASET_NAME = 'playground_us' # Change to your BQ dataset name.\n", 106 | "BQ_TABLE_NAME = 'chicago_taxitrips_prep'\n", 107 | "BQ_LOCATION = 'US'\n", 108 | "\n", 109 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 110 | "\n", 111 | "RAW_SCHEMA_DIR = 'src/raw_schema'" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "39395512", 117 | "metadata": {}, 118 | "source": [ 119 | "## 1. Explore the data in BigQuery" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "9e4300d3", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "%%bigquery data\n", 130 | "\n", 131 | "SELECT \n", 132 | " CAST(EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS string) AS trip_dayofweek, \n", 133 | " FORMAT_DATE('%A',cast(trip_start_timestamp as date)) AS trip_dayname,\n", 134 | " COUNT(*) as trip_count,\n", 135 | "FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n", 136 | "WHERE\n", 137 | " EXTRACT(YEAR FROM trip_start_timestamp) = 2015 \n", 138 | "GROUP BY\n", 139 | " trip_dayofweek,\n", 140 | " trip_dayname\n", 141 | "ORDER BY\n", 142 | " trip_dayofweek\n", 143 | ";" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "699804c5", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "data" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "a7f2447e", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "data.plot(kind='bar', x='trip_dayname', y='trip_count')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "a7782c69", 169 | "metadata": {}, 170 | "source": [ 171 | "## 2. 
Create data for the ML task\n", 172 | "\n", 173 | "We add a `ML_use` column for pre-splitting the data, where 80% of the datsa items are set to `UNASSIGNED` while the other 20% is set to `TEST`.\n", 174 | "\n", 175 | "This column is used during training to split the dataset for training and test.\n", 176 | "\n", 177 | "In the training phase, the `UNASSIGNED` are split into `train` and `eval`. The `TEST` split is will be used for the final model validation." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "7987d132", 183 | "metadata": {}, 184 | "source": [ 185 | "### Create destination BigQuery dataset" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "56a7f6d6", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "!bq --location=$BQ_LOCATION mk -d \\\n", 196 | "$PROJECT:$BQ_DATASET_NAME" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "832f99ba", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "sample_size = 1000000\n", 207 | "year = 2020" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "1b19789f", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "sql_script = '''\n", 218 | "CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@TABLE` \n", 219 | "AS (\n", 220 | " WITH\n", 221 | " taxitrips AS (\n", 222 | " SELECT\n", 223 | " trip_start_timestamp,\n", 224 | " trip_seconds,\n", 225 | " trip_miles,\n", 226 | " payment_type,\n", 227 | " pickup_longitude,\n", 228 | " pickup_latitude,\n", 229 | " dropoff_longitude,\n", 230 | " dropoff_latitude,\n", 231 | " tips,\n", 232 | " fare\n", 233 | " FROM\n", 234 | " `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n", 235 | " WHERE 1=1 \n", 236 | " AND pickup_longitude IS NOT NULL\n", 237 | " AND pickup_latitude IS NOT NULL\n", 238 | " AND dropoff_longitude IS NOT NULL\n", 239 | " AND dropoff_latitude IS NOT NULL\n", 240 | " AND trip_miles > 0\n", 241 | " AND trip_seconds > 0\n", 242 | " AND fare > 0\n", 243 | " AND EXTRACT(YEAR FROM trip_start_timestamp) = @YEAR\n", 244 | " )\n", 245 | "\n", 246 | " SELECT\n", 247 | " trip_start_timestamp,\n", 248 | " EXTRACT(MONTH from trip_start_timestamp) as trip_month,\n", 249 | " EXTRACT(DAY from trip_start_timestamp) as trip_day,\n", 250 | " EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,\n", 251 | " EXTRACT(HOUR from trip_start_timestamp) as trip_hour,\n", 252 | " trip_seconds,\n", 253 | " trip_miles,\n", 254 | " payment_type,\n", 255 | " ST_AsText(\n", 256 | " ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)\n", 257 | " ) AS pickup_grid,\n", 258 | " ST_AsText(\n", 259 | " ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)\n", 260 | " ) AS dropoff_grid,\n", 261 | " ST_Distance(\n", 262 | " ST_GeogPoint(pickup_longitude, pickup_latitude), \n", 263 | " ST_GeogPoint(dropoff_longitude, dropoff_latitude)\n", 264 | " ) AS euclidean,\n", 265 | " CONCAT(\n", 266 | " ST_AsText(ST_SnapToGrid(ST_GeogPoint(pickup_longitude,\n", 267 | " pickup_latitude), 0.1)), \n", 268 | " ST_AsText(ST_SnapToGrid(ST_GeogPoint(dropoff_longitude,\n", 269 | " dropoff_latitude), 0.1))\n", 270 | " ) AS loc_cross,\n", 271 | " IF((tips/fare >= 0.2), 1, 0) AS tip_bin,\n", 272 | " IF(RAND() <= 0.8, 'UNASSIGNED', 'TEST') AS ML_use\n", 273 | " FROM\n", 274 | " taxitrips\n", 275 | " LIMIT @LIMIT\n", 276 | ")\n", 277 | "'''" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | 
"execution_count": null, 283 | "id": "6f1d2837", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "sql_script = sql_script.replace(\n", 288 | " '@PROJECT', PROJECT).replace(\n", 289 | " '@DATASET', BQ_DATASET_NAME).replace(\n", 290 | " '@TABLE', BQ_TABLE_NAME).replace(\n", 291 | " '@YEAR', str(year)).replace(\n", 292 | " '@LIMIT', str(sample_size))" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "27acb6b5", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "print(sql_script)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "id": "88f31e8c", 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "bq_client = bigquery.Client(project=PROJECT, location=BQ_LOCATION)\n", 313 | "job = bq_client.query(sql_script)\n", 314 | "_ = job.result()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "id": "b32eea6f", 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "%%bigquery --project {PROJECT}\n", 325 | "\n", 326 | "SELECT ML_use, COUNT(*)\n", 327 | "FROM playground_us.chicago_taxitrips_prep # Change to your BQ dataset and table names.\n", 328 | "GROUP BY ML_use" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "id": "7fcc75fc", 334 | "metadata": {}, 335 | "source": [ 336 | "### Load a sample data to a Pandas DataFrame" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "id": "f252a846", 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "%%bigquery sample_data --project {PROJECT}\n", 347 | "\n", 348 | "SELECT * EXCEPT (trip_start_timestamp, ML_use)\n", 349 | "FROM playground_us.chicago_taxitrips_prep # Change to your BQ dataset and table names." 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "798809d2", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "sample_data.head().T" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "id": "23595838", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "sample_data.tip_bin.value_counts()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "id": "3d06bb64", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "sample_data.euclidean.hist()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "id": "10aae180", 385 | "metadata": {}, 386 | "source": [ 387 | "## 3. Generate raw data schema\n", 388 | "\n", 389 | "The [TensorFlow Data Validation (TFDV)](https://www.tensorflow.org/tfx/data_validation/get_started) data schema will be used in:\n", 390 | "1. Identify the raw data types and shapes in the data transformation.\n", 391 | "2. Create the serving input signature for the custom model.\n", 392 | "3. Validate the new raw training data in the TFX pipeline." 
393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "id": "4087d5fa", 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "stats = tfdv.generate_statistics_from_dataframe(\n", 403 | " dataframe=sample_data,\n", 404 | " stats_options=tfdv.StatsOptions(\n", 405 | " label_feature='tip_bin',\n", 406 | " weight_feature=None,\n", 407 | " sample_rate=1,\n", 408 | " num_top_values=50\n", 409 | " )\n", 410 | ")" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "id": "091fbd77", 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "tfdv.visualize_statistics(stats)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "id": "d251e09b", 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "schema = tfdv.infer_schema(statistics=stats)\n", 431 | "tfdv.display_schema(schema=schema)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "id": "502c49f1", 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "raw_schema_location = os.path.join(RAW_SCHEMA_DIR, 'schema.pbtxt')\n", 442 | "tfdv.write_schema_text(schema, raw_schema_location)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "id": "59df0723", 448 | "metadata": {}, 449 | "source": [ 450 | "## 4. Create Vertex Dataset resource" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "id": "90d9b605", 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "vertex_ai.init(\n", 461 | " project=PROJECT,\n", 462 | " location=REGION\n", 463 | ")" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "id": "464ab920", 469 | "metadata": {}, 470 | "source": [ 471 | "### Create the dataset resource" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "a0a1707a", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "bq_uri = f\"bq://{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}\"\n", 482 | "\n", 483 | "dataset = vertex_ai.TabularDataset.create(\n", 484 | " display_name=DATASET_DISPLAY_NAME, bq_source=bq_uri)\n", 485 | "\n", 486 | "dataset.gca_resource" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "id": "c761fadb", 492 | "metadata": {}, 493 | "source": [ 494 | "### Get the dataset resource\n", 495 | "\n", 496 | "The dataset resource is retrieved by display name. Because multiple datasets can have the same display name, we retrieve the most recent updated one." 
497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "id": "d78b7f4d", 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "dataset = vertex_ai.TabularDataset.list(\n", 507 | " filter=f\"display_name={DATASET_DISPLAY_NAME}\", \n", 508 | " order_by=\"update_time\")[-1]\n", 509 | "\n", 510 | "print(\"Dataset resource name:\", dataset.resource_name)\n", 511 | "print(\"Dataset BigQuery source:\", dataset.gca_resource.metadata['inputConfig']['bigquerySource']['uri'])" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "id": "569249de", 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [] 521 | } 522 | ], 523 | "metadata": { 524 | "environment": { 525 | "name": "common-cpu.m79", 526 | "type": "gcloud", 527 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m79" 528 | }, 529 | "kernelspec": { 530 | "display_name": "Python 3", 531 | "language": "python", 532 | "name": "python3" 533 | }, 534 | "language_info": { 535 | "codemirror_mode": { 536 | "name": "ipython", 537 | "version": 3 538 | }, 539 | "file_extension": ".py", 540 | "mimetype": "text/x-python", 541 | "name": "python", 542 | "nbconvert_exporter": "python", 543 | "pygments_lexer": "ipython3", 544 | "version": "3.7.10" 545 | } 546 | }, 547 | "nbformat": 4, 548 | "nbformat_minor": 5 549 | } 550 | -------------------------------------------------------------------------------- /05-continuous-training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5043bfb9", 6 | "metadata": {}, 7 | "source": [ 8 | "# 05 - Continuous Training\n", 9 | "\n", 10 | "After testing, compiling, and uploading the pipeline definition to Cloud Storage, the pipeline is executed with respect to a trigger. We use [Cloud Functions](https://cloud.google.com/functions) and [Cloud Pub/Sub](https://cloud.google.com/pubsub) as a triggering mechanism. The triggering can be scheduled using [Cloud Scheduler](https://cloud.google.com/scheduler). The trigger source sends a message to a Cloud Pub/Sub topic that the Cloud Function listens to, and then it submits the pipeline to AI Platform Managed Pipelines to be executed.\n", 11 | "\n", 12 | "This notebook covers the following steps:\n", 13 | "1. Create the Cloud Pub/Sub topic.\n", 14 | "2. Deploy the Cloud Function \n", 15 | "3. Test triggering a pipeline.\n", 16 | "4. Extracting pipeline run metadata." 
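The Cloud Function that listens to the Pub/Sub topic lives in `src/pipeline_triggering/main.py` and is deployed in step 2 below. A rough sketch of such an entry point is shown here, assuming a background function that decodes the Pub/Sub message into pipeline parameter values and submits the compiled pipeline spec with the KFP `AIPlatformClient`; the file in the repository is authoritative.

```python
# A sketch of a Pub/Sub-triggered entry point (assumption -- see
# src/pipeline_triggering/main.py for the actual implementation).
import base64
import json
import os

from kfp.v2.google.client import AIPlatformClient


def trigger_pipeline(event, context):
    """Cloud Function entry point, invoked with a Pub/Sub message."""
    project = os.environ['PROJECT']
    region = os.environ['REGION']
    gcs_pipeline_file_location = os.environ['GCS_PIPELINE_FILE_LOCATION']

    # Pub/Sub delivers the message payload base64-encoded in event['data'].
    message = base64.b64decode(event['data']).decode('utf-8')
    parameter_values = json.loads(message)

    client = AIPlatformClient(project_id=project, region=region)
    client.create_run_from_job_spec(
        job_spec_path=gcs_pipeline_file_location,
        parameter_values=parameter_values,
    )
```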
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "6f2f9013", 22 | "metadata": {}, 23 | "source": [ 24 | "## Setup" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "a0e71c08", 30 | "metadata": {}, 31 | "source": [ 32 | "### Import libraries" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "37cefa26", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import json\n", 43 | "import os\n", 44 | "import logging\n", 45 | "import tensorflow as tf\n", 46 | "import tfx\n", 47 | "import IPython \n", 48 | "\n", 49 | "logging.getLogger().setLevel(logging.INFO)\n", 50 | "\n", 51 | "print(\"Tensorflow Version:\", tfx.__version__)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "id": "ab763d6d", 57 | "metadata": {}, 58 | "source": [ 59 | "### Setup Google Cloud project" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "5260d069", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "PROJECT = '[your-project-id]' # Change to your project id.\n", 70 | "REGION = 'us-central1' # Change to your region.\n", 71 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 72 | "\n", 73 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n", 74 | " # Get your GCP project id from gcloud\n", 75 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 76 | " PROJECT = shell_output[0]\n", 77 | " \n", 78 | "if BUCKET == \"\" or BUCKET is None or BUCKET == \"[your-bucket-name]\":\n", 79 | " # Get your bucket name to GCP projet id\n", 80 | " BUCKET = PROJECT\n", 81 | "\n", 82 | "print(\"Project ID:\", PROJECT)\n", 83 | "print(\"Region:\", REGION)\n", 84 | "print(\"Bucket name:\", BUCKET)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "85dd5e16", 90 | "metadata": {}, 91 | "source": [ 92 | "### Set configurations" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "171a800f", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "VERSION = 'v01'\n", 103 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 104 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 105 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", 106 | "\n", 107 | "PIPELINES_STORE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/compiled_pipelines/'\n", 108 | "GCS_PIPELINE_FILE_LOCATION = os.path.join(PIPELINES_STORE, f'{PIPELINE_NAME}.json')\n", 109 | "PUBSUB_TOPIC = f'trigger-{PIPELINE_NAME}'\n", 110 | "CLOUD_FUNCTION_NAME = f'trigger-{PIPELINE_NAME}-fn'" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "016df25c", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "!gsutil ls {GCS_PIPELINE_FILE_LOCATION}" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "76d82223", 126 | "metadata": {}, 127 | "source": [ 128 | "## 1. Create a Pub/Sub topic" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "0c1032c6", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "!gcloud pubsub topics create {PUBSUB_TOPIC}" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "08de54ef", 144 | "metadata": {}, 145 | "source": [ 146 | "## 2. 
Deploy the Cloud Function" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "8597ad8d", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "ENV_VARS=f\"\"\"\\\n", 157 | "PROJECT={PROJECT},\\\n", 158 | "REGION={REGION},\\\n", 159 | "GCS_PIPELINE_FILE_LOCATION={GCS_PIPELINE_FILE_LOCATION}\n", 160 | "\"\"\"\n", 161 | "\n", 162 | "!echo {ENV_VARS}" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "01a3d62a", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "!rm -r src/pipeline_triggering/.ipynb_checkpoints" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "id": "b5acdb73", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "!gcloud functions deploy {CLOUD_FUNCTION_NAME} \\\n", 183 | " --region={REGION} \\\n", 184 | " --trigger-topic={PUBSUB_TOPIC} \\\n", 185 | " --runtime=python37 \\\n", 186 | " --source=src/pipeline_triggering\\\n", 187 | " --entry-point=trigger_pipeline\\\n", 188 | " --stage-bucket={BUCKET}\\\n", 189 | " --update-env-vars={ENV_VARS}" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "5a5c41af", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "cloud_fn_url = f\"https://console.cloud.google.com/functions/details/{REGION}/{CLOUD_FUNCTION_NAME}\"\n", 200 | "html = f'See the Cloud Function details here.'\n", 201 | "IPython.display.display(IPython.display.HTML(html))" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "ebbe047a", 207 | "metadata": {}, 208 | "source": [ 209 | "## 3. Trigger the pipeline" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "id": "0c30028d", 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "from google.cloud import pubsub\n", 220 | "\n", 221 | "publish_client = pubsub.PublisherClient()\n", 222 | "topic = f'projects/{PROJECT}/topics/{PUBSUB_TOPIC}'\n", 223 | "data = {\n", 224 | " 'num_epochs': 7,\n", 225 | " 'learning_rate': 0.0015,\n", 226 | " 'batch_size': 512,\n", 227 | " 'hidden_units': '256,126'\n", 228 | "}\n", 229 | "message = json.dumps(data)\n", 230 | "\n", 231 | "_ = publish_client.publish(topic, message.encode())" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "7ba049fe", 237 | "metadata": {}, 238 | "source": [ 239 | "Wait for a few seconds for the pipeline run to be submitted, then you can see the run in the Cloud Console" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "0dc29797", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "from kfp.v2.google.client import AIPlatformClient\n", 250 | "\n", 251 | "pipeline_client = AIPlatformClient(\n", 252 | " project_id=PROJECT, region=REGION)\n", 253 | " \n", 254 | "job_display_name = pipeline_client.list_jobs()['pipelineJobs'][0]['displayName']\n", 255 | "job_url = f\"https://console.cloud.google.com/vertex-ai/locations/{REGION}/pipelines/runs/{job_display_name}\"\n", 256 | "html = f'See the Pipeline job here.'\n", 257 | "IPython.display.display(IPython.display.HTML(html))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "b4b3ff42", 263 | "metadata": {}, 264 | "source": [ 265 | "## 4. 
Extracting pipeline runs metadata" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "id": "b13c1b19", 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "from google.cloud import aiplatform as vertex_ai\n", 276 | "\n", 277 | "pipeline_df = vertex_ai.get_pipeline_df(PIPELINE_NAME)\n", 278 | "pipeline_df = pipeline_df[pipeline_df.pipeline_name == PIPELINE_NAME]\n", 279 | "pipeline_df.T" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "9254cbc3", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [] 289 | } 290 | ], 291 | "metadata": { 292 | "environment": { 293 | "name": "common-cpu.m73", 294 | "type": "gcloud", 295 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 296 | }, 297 | "kernelspec": { 298 | "display_name": "Python 3", 299 | "language": "python", 300 | "name": "python3" 301 | }, 302 | "language_info": { 303 | "codemirror_mode": { 304 | "name": "ipython", 305 | "version": 3 306 | }, 307 | "file_extension": ".py", 308 | "mimetype": "text/x-python", 309 | "name": "python", 310 | "nbconvert_exporter": "python", 311 | "pygments_lexer": "ipython3", 312 | "version": "3.7.10" 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 5 317 | } 318 | -------------------------------------------------------------------------------- /06-model-deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "ee01c81b", 6 | "metadata": {}, 7 | "source": [ 8 | "# 06 - Model Deployment\n", 9 | "\n", 10 | "The purpose of this notebook is to execute a CI/CD routine to test and deploy the trained model to `Vertex AI` as an `Endpoint` for online prediction serving. The notebook covers the following steps:\n", 11 | "1. Run the test steps locally.\n", 12 | "2. 
Execute the model deployment `CI/CD` steps using `Cloud Build`.\n", 13 | "\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "0da8290c", 19 | "metadata": {}, 20 | "source": [ 21 | "## Setup" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "4873f8cf", 27 | "metadata": {}, 28 | "source": [ 29 | "### Import libraries" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "59085129", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import os\n", 40 | "import logging\n", 41 | "\n", 42 | "logging.getLogger().setLevel(logging.INFO)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "e37fb189", 48 | "metadata": {}, 49 | "source": [ 50 | "### Setup Google Cloud project" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "e45be804", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "PROJECT = '[your-project-id]' # Change to your project id.\n", 61 | "REGION = 'us-central1' # Change to your region.\n", 62 | "\n", 63 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n", 64 | " # Get your GCP project id from gcloud\n", 65 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 66 | " PROJECT = shell_output[0]\n", 67 | "\n", 68 | "print(\"Project ID:\", PROJECT)\n", 69 | "print(\"Region:\", REGION)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "1574964f", 75 | "metadata": {}, 76 | "source": [ 77 | "### Set configurations" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "4a01278c", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "VERSION = 'v01'\n", 88 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 89 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 90 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n", 91 | "\n", 92 | "CICD_IMAGE_NAME = 'cicd:latest'\n", 93 | "CICD_IMAGE_URI = f\"gcr.io/{PROJECT}/{CICD_IMAGE_NAME}\"" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "87f6f1e0", 99 | "metadata": {}, 100 | "source": [ 101 | "## 1. 
Run CI/CD steps locally" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "a223cdf6", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "os.environ['PROJECT'] = PROJECT\n", 112 | "os.environ['REGION'] = REGION\n", 113 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 114 | "os.environ['ENDPOINT_DISPLAY_NAME'] = ENDPOINT_DISPLAY_NAME" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "b6546ac1", 120 | "metadata": {}, 121 | "source": [ 122 | "### Run the model artifact testing" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "74c0f8a8", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "!py.test src/tests/model_deployment_tests.py::test_model_artifact -s" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "77885b24", 138 | "metadata": {}, 139 | "source": [ 140 | "### Run create endpoint" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "0efe73b5", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "!python build/utils.py \\\n", 151 | " --mode=create-endpoint\\\n", 152 | " --project={PROJECT}\\\n", 153 | " --region={REGION}\\\n", 154 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "id": "3eb28c6f", 160 | "metadata": {}, 161 | "source": [ 162 | "### Run deploy model" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "9cb3f19d", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "!python build/utils.py \\\n", 173 | " --mode=deploy-model\\\n", 174 | " --project={PROJECT}\\\n", 175 | " --region={REGION}\\\n", 176 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}\\\n", 177 | " --model-display-name={MODEL_DISPLAY_NAME}" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "ee492355", 183 | "metadata": {}, 184 | "source": [ 185 | "### Test deployed model endpoint" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "3d4bce50", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "!py.test src/tests/model_deployment_tests.py::test_model_endpoint" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "id": "37b150c9", 201 | "metadata": {}, 202 | "source": [ 203 | "## 2. Execute the Model Deployment CI/CD routine in Cloud Build\n", 204 | "\n", 205 | "The CI/CD routine is defined in the [model-deployment.yaml](model-deployment.yaml) file, and consists of the following steps:\n", 206 | "1. Load and test the the trained model interface.\n", 207 | "2. Create and endpoint in Vertex AI if it doesn't exists.\n", 208 | "3. Deploy the model to the endpoint.\n", 209 | "4. Test the endpoint." 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "839e540c", 215 | "metadata": {}, 216 | "source": [ 217 | "### Build CI/CD container Image for Cloud Build\n", 218 | "\n", 219 | "This is the runtime environment where the steps of testing and deploying model will be executed." 
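The testing and deployment steps that run inside this container call `build/utils.py`, which was also invoked locally above. A rough sketch of what its `create-endpoint` and `deploy-model` modes are expected to do with the Vertex AI SDK follows; the machine type is illustrative, and `build/utils.py` in the repository is authoritative.

```python
# A sketch of the create-endpoint / deploy-model logic (assumption -- see
# build/utils.py for the actual implementation used by the CI/CD routine).
from google.cloud import aiplatform as vertex_ai


def create_endpoint(project, region, endpoint_display_name):
    vertex_ai.init(project=project, location=region)
    endpoints = vertex_ai.Endpoint.list(
        filter=f'display_name={endpoint_display_name}', order_by='update_time')
    if endpoints:
        # Reuse the endpoint if it already exists.
        return endpoints[-1]
    return vertex_ai.Endpoint.create(display_name=endpoint_display_name)


def deploy_model(project, region, endpoint_display_name, model_display_name):
    vertex_ai.init(project=project, location=region)
    endpoint = create_endpoint(project, region, endpoint_display_name)
    model = vertex_ai.Model.list(
        filter=f'display_name={model_display_name}', order_by='update_time')[-1]
    model.deploy(
        endpoint=endpoint,
        traffic_percentage=100,
        machine_type='n1-standard-2',  # Illustrative machine type.
    )
```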
220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "a7f9bf4e", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "!echo $CICD_IMAGE_URI" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "id": "3855daae", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "!gcloud builds submit --tag $CICD_IMAGE_URI build/. --timeout=15m" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "90fbd4b9", 245 | "metadata": {}, 246 | "source": [ 247 | "### Run CI/CD from model deployment using Cloud Build" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "id": "e1aec70c", 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "REPO_URL = \"https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai.git\" # Change to your github repo.\n", 258 | "BRANCH = \"main\" " 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "id": "01995fa5", 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "SUBSTITUTIONS=f\"\"\"\\\n", 269 | "_REPO_URL='{REPO_URL}',\\\n", 270 | "_BRANCH={BRANCH},\\\n", 271 | "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", 272 | "_PROJECT={PROJECT},\\\n", 273 | "_REGION={REGION},\\\n", 274 | "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", 275 | "_ENDPOINT_DISPLAY_NAME={ENDPOINT_DISPLAY_NAME},\\\n", 276 | "\"\"\"\n", 277 | "\n", 278 | "!echo $SUBSTITUTIONS" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "id": "8849d3e4", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "!gcloud builds submit --no-source --config build/model-deployment.yaml --substitutions {SUBSTITUTIONS} --timeout=30m" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "01831724", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "id": "4418b01e", 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [] 306 | } 307 | ], 308 | "metadata": { 309 | "environment": { 310 | "name": "common-cpu.m79", 311 | "type": "gcloud", 312 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m79" 313 | }, 314 | "kernelspec": { 315 | "display_name": "Python 3", 316 | "language": "python", 317 | "name": "python3" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 3 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython3", 329 | "version": "3.7.10" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 5 334 | } 335 | -------------------------------------------------------------------------------- /07-prediction-serving.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4004af63", 6 | "metadata": {}, 7 | "source": [ 8 | "# 07 - Prediction Serving\n", 9 | "\n", 10 | "The purpose of the notebook is to show how to use the deployed model for online and batch prediction.\n", 11 | "The notebook covers the following tasks:\n", 12 | "1. Test the endpoints for online prediction.\n", 13 | "2. Use the uploaded custom model for batch prediction.\n", 14 | "3. Run a the batch prediction pipeline using `Vertex Pipelines`." 
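The batch prediction steps later in this notebook extract serving data with `datasource_utils.get_serving_source_query` from `src/common/datasource_utils.py` (not shown here). A rough sketch of the kind of query it is expected to build follows; the exact column list and template are assumptions. Note that the serving query excludes the `tip_bin` target and the `ML_use` column.

```python
# A sketch of a serving-source query builder (assumption -- see
# src/common/datasource_utils.py for the actual implementation).

def get_serving_source_query(bq_dataset_name: str,
                             bq_table_name: str,
                             limit: int = None) -> str:
    # Select raw features only: no tip_bin target and no ML_use split column.
    query = f"""
    SELECT
      trip_month, trip_day, trip_day_of_week, trip_hour,
      trip_seconds, trip_miles, payment_type,
      pickup_grid, dropoff_grid, euclidean, loc_cross
    FROM `{bq_dataset_name}.{bq_table_name}`
    """
    if limit:
        query += f"LIMIT {limit}"
    return query
```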
15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "3dad1f75", 20 | "metadata": {}, 21 | "source": [ 22 | "## Setup" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "6d02a9d5", 28 | "metadata": {}, 29 | "source": [ 30 | "### Import libraries" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "c7f3ce81", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "from datetime import datetime\n", 42 | "import tensorflow as tf\n", 43 | "\n", 44 | "from google.cloud import aiplatform as vertex_ai" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "8e73bc25", 50 | "metadata": {}, 51 | "source": [ 52 | "### Setup Google Cloud project" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "29ea9b0a", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "PROJECT = '[your-project-id]' # Change to your project id.\n", 63 | "REGION = 'us-central1' # Change to your region.\n", 64 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 65 | "\n", 66 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n", 67 | " # Get your GCP project id from gcloud\n", 68 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 69 | " PROJECT = shell_output[0]\n", 70 | " \n", 71 | "if BUCKET == \"\" or BUCKET is None or BUCKET == \"[your-bucket-name]\":\n", 72 | " # Get your bucket name to GCP project id\n", 73 | " BUCKET = PROJECT\n", 74 | " # Try to create the bucket if it doesn't exists\n", 75 | " ! gsutil mb -l $REGION gs://$BUCKET\n", 76 | " print(\"\")\n", 77 | " \n", 78 | "print(\"Project ID:\", PROJECT)\n", 79 | "print(\"Region:\", REGION)\n", 80 | "print(\"Bucket name:\", BUCKET)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "ecba79b0", 86 | "metadata": {}, 87 | "source": [ 88 | "### Set configurations" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "537732be", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "VERSION = 'v01'\n", 99 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 100 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 101 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n", 102 | "\n", 103 | "SERVE_BQ_DATASET_NAME = 'playground_us' # Change to your serving BigQuery dataset name.\n", 104 | "SERVE_BQ_TABLE_NAME = 'chicago_taxitrips_prep' # Change to your serving BigQuery table name." 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "4e508dd0", 110 | "metadata": {}, 111 | "source": [ 112 | "## 1. 
Making Online Predicitons\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "38be76f5", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "vertex_ai.init(\n", 123 | " project=PROJECT,\n", 124 | " location=REGION,\n", 125 | " staging_bucket=BUCKET\n", 126 | ")\n", 127 | "\n", 128 | "endpoint_name = vertex_ai.Endpoint.list(\n", 129 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n", 130 | " order_by=\"update_time\")[-1].gca_resource.name\n", 131 | "\n", 132 | "endpoint = vertex_ai.Endpoint(endpoint_name)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "f6b8053d", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "test_instances = [ \n", 143 | " {\n", 144 | " \"dropoff_grid\": [\"POINT(-87.6 41.9)\"],\n", 145 | " \"euclidean\": [2064.2696],\n", 146 | " \"loc_cross\": [\"\"],\n", 147 | " \"payment_type\": [\"Credit Card\"],\n", 148 | " \"pickup_grid\": [\"POINT(-87.6 41.9)\"],\n", 149 | " \"trip_miles\": [1.37],\n", 150 | " \"trip_day\": [12],\n", 151 | " \"trip_hour\": [16],\n", 152 | " \"trip_month\": [2],\n", 153 | " \"trip_day_of_week\": [4],\n", 154 | " \"trip_seconds\": [555]\n", 155 | " }\n", 156 | "]" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "f7cb447e", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "predictions = endpoint.predict(test_instances).predictions\n", 167 | "\n", 168 | "for prediction in predictions:\n", 169 | " print(prediction)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "330d9dfc", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "explanations = endpoint.explain(test_instances).explanations\n", 180 | "\n", 181 | "for explanation in explanations:\n", 182 | " print(explanation)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "ddc90ffa", 188 | "metadata": {}, 189 | "source": [ 190 | "## 2. 
Batch Prediction" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "046757e2", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "WORKSPACE = f\"gs://{BUCKET}/{DATASET_DISPLAY_NAME}/\"\n", 201 | "SERVING_DATA_DIR = os.path.join(WORKSPACE, 'serving_data')\n", 202 | "SERVING_INPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'input_data')\n", 203 | "SERVING_OUTPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'output_predictions')" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "0e8fbc4d", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "if tf.io.gfile.exists(SERVING_DATA_DIR):\n", 214 | " print(\"Removing previous serving data...\")\n", 215 | " tf.io.gfile.rmtree(SERVING_DATA_DIR)\n", 216 | " \n", 217 | "print(\"Creating serving data directory...\")\n", 218 | "tf.io.gfile.mkdir(SERVING_DATA_DIR)\n", 219 | "print(\"Serving data directory is ready.\")" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "id": "6f7b60fa", 225 | "metadata": {}, 226 | "source": [ 227 | "### Extract serving data to Cloud Storage as JSONL" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "04bb69ff", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from src.common import datasource_utils\n", 238 | "from src.preprocessing import etl" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "id": "dfd4cf91", 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "LIMIT = 10000\n", 249 | "\n", 250 | "sql_query = datasource_utils.get_serving_source_query(\n", 251 | " bq_dataset_name=SERVE_BQ_DATASET_NAME, \n", 252 | " bq_table_name=SERVE_BQ_TABLE_NAME,\n", 253 | " limit=LIMIT\n", 254 | ")\n", 255 | "\n", 256 | "print(sql_query)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "id": "5f5afb73", 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "job_name = f\"extract-{DATASET_DISPLAY_NAME}-serving-{datetime.now().strftime('%Y%m%d%H%M%S')}\"\n", 267 | "\n", 268 | "args = {\n", 269 | " 'job_name': job_name,\n", 270 | " #'runner': 'DataflowRunner',\n", 271 | " 'sql_query': sql_query,\n", 272 | " 'exported_data_prefix': os.path.join(SERVING_INPUT_DATA_DIR, \"data-\"),\n", 273 | " 'temporary_dir': os.path.join(WORKSPACE, 'tmp'),\n", 274 | " 'gcs_location': os.path.join(WORKSPACE, 'bq_tmp'),\n", 275 | " 'project': PROJECT,\n", 276 | " 'region': REGION,\n", 277 | " 'setup_file': './setup.py'\n", 278 | "}" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "id": "588e1949", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "tf.get_logger().setLevel('ERROR')\n", 289 | "\n", 290 | "print(\"Data extraction started...\")\n", 291 | "etl.run_extract_pipeline(args)\n", 292 | "print(\"Data extraction completed.\")" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "a036944a", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "!gsutil ls {SERVING_INPUT_DATA_DIR}" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "id": "5ff0d72b", 308 | "metadata": {}, 309 | "source": [ 310 | "### Submit the batch prediction job" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "eb72b16e", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "model_name = 
vertex_ai.Model.list(\n", 321 | " filter=f'display_name={MODEL_DISPLAY_NAME}',\n", 322 | " order_by=\"update_time\")[-1].gca_resource.name" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "id": "dac58bf2", 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "job_resources = {\n", 333 | " \"machine_type\": 'n1-standard-2',\n", 334 | " #'accelerator_count': 1,\n", 335 | " #'accelerator_type': 'NVIDIA_TESLA_T4'\n", 336 | " \"starting_replica_count\": 1,\n", 337 | " \"max_replica_count\": 10,\n", 338 | "}\n", 339 | "\n", 340 | "job_display_name = f\"{MODEL_DISPLAY_NAME}-prediction-job-{datetime.now().strftime('%Y%m%d%H%M%S')}\"\n", 341 | "\n", 342 | "vertex_ai.BatchPredictionJob.create(\n", 343 | " job_display_name=job_display_name,\n", 344 | " model_name=model_name,\n", 345 | " gcs_source=SERVING_INPUT_DATA_DIR + '/*.jsonl',\n", 346 | " gcs_destination_prefix=SERVING_OUTPUT_DATA_DIR,\n", 347 | " instances_format='jsonl',\n", 348 | " predictions_format='jsonl',\n", 349 | " sync=True,\n", 350 | " **job_resources,\n", 351 | ")" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "id": "a4f281a9", 357 | "metadata": {}, 358 | "source": [ 359 | "## 3. Run the batch prediction pipeline using Vertex Pipelines" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "id": "809ba028", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "WORKSPACE = f\"gs://{BUCKET}/{DATASET_DISPLAY_NAME}/\"\n", 370 | "ARTIFACT_STORE = os.path.join(WORKSPACE, 'tfx_artifacts')\n", 371 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-predict-pipeline'" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "769a1d9e", 377 | "metadata": {}, 378 | "source": [ 379 | "### Set the pipeline configurations for the Vertex AI run" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "id": "c5add19d", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "os.environ[\"PROJECT\"] = PROJECT\n", 390 | "os.environ[\"REGION\"] = REGION\n", 391 | "os.environ[\"GCS_LOCATION\"] = f\"gs://{BUCKET}/{DATASET_DISPLAY_NAME}\"\n", 392 | "os.environ[\"MODEL_DISPLAY_NAME\"] = MODEL_DISPLAY_NAME\n", 393 | "os.environ[\"PIPELINE_NAME\"] = PIPELINE_NAME\n", 394 | "os.environ[\"ARTIFACT_STORE_URI\"] = ARTIFACT_STORE\n", 395 | "os.environ[\"BATCH_PREDICTION_BQ_DATASET_NAME\"] = SERVE_BQ_DATASET_NAME\n", 396 | "os.environ[\"BATCH_PREDICTION_BQ_TABLE_NAME\"] = SERVE_BQ_TABLE_NAME\n", 397 | "os.environ[\"SERVE_LIMIT\"] = \"1000\"\n", 398 | "os.environ[\"BEAM_RUNNER\"] = \"DirectRunner\"\n", 399 | "os.environ[\"TFX_IMAGE_URI\"] = f\"gcr.io/{PROJECT}/{DATASET_DISPLAY_NAME}:{VERSION}\"" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "id": "f6d0e2ec", 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "import importlib\n", 410 | "from src.tfx_pipelines import config\n", 411 | "importlib.reload(config)\n", 412 | "\n", 413 | "for key, value in config.__dict__.items():\n", 414 | " if key.isupper(): print(f'{key}: {value}')" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "f128b46e", 420 | "metadata": {}, 421 | "source": [ 422 | "### (Optional) Build the ML container image\n", 423 | "\n", 424 | "This is the `TFX` runtime environment for the training pipeline steps." 
425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "id": "f24fa5e6", 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "!echo $TFX_IMAGE_URI" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "id": "3949cc7e", 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "!gcloud builds submit --tag $TFX_IMAGE_URI . --timeout=15m --machine-type=e2-highcpu-8" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "id": "98a9890d", 450 | "metadata": {}, 451 | "source": [ 452 | "### Compile pipeline" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "id": "09c8a3a0", 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "from src.tfx_pipelines import runner\n", 463 | "\n", 464 | "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n", 465 | "pipeline_definition = runner.compile_prediction_pipeline(pipeline_definition_file)" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "id": "2bc2792a", 471 | "metadata": {}, 472 | "source": [ 473 | "### Submit run to Vertex Pipelines" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "id": "37dcc92d", 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "from kfp.v2.google.client import AIPlatformClient\n", 484 | "\n", 485 | "pipeline_client = AIPlatformClient(\n", 486 | " project_id=PROJECT, region=REGION)\n", 487 | " \n", 488 | "pipeline_client.create_run_from_job_spec(\n", 489 | " job_spec_path=pipeline_definition_file\n", 490 | ")" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "id": "5e0d5bef", 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [] 500 | } 501 | ], 502 | "metadata": { 503 | "environment": { 504 | "name": "common-cpu.m79", 505 | "type": "gcloud", 506 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m79" 507 | }, 508 | "kernelspec": { 509 | "display_name": "Python 3", 510 | "language": "python", 511 | "name": "python3" 512 | }, 513 | "language_info": { 514 | "codemirror_mode": { 515 | "name": "ipython", 516 | "version": 3 517 | }, 518 | "file_extension": ".py", 519 | "mimetype": "text/x-python", 520 | "name": "python", 521 | "nbconvert_exporter": "python", 522 | "pygments_lexer": "ipython3", 523 | "version": "3.7.10" 524 | } 525 | }, 526 | "nbformat": 4, 527 | "nbformat_minor": 5 528 | } 529 | -------------------------------------------------------------------------------- /08-model-monitoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "441c8b66", 6 | "metadata": {}, 7 | "source": [ 8 | "# 08 - Model Monitoring\n", 9 | "\n", 10 | "This notebook covers configuring model monitoring jobs for skew and drift detection:\n", 11 | "1. Set skew and drift threshold.\n", 12 | "2. Create a monitoring job for all the models under and endpoint.\n", 13 | "3. List the monitoring jobs.\n", 14 | "4. Simulate skewed prediction requests.\n", 15 | "5. Pause and delete the monitoring job." 
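The skew and drift thresholds set in step 1 are compared against distances between the training (baseline) and serving feature distributions; per the Vertex AI model monitoring documentation, categorical features use L-infinity distance and numerical features use Jensen-Shannon divergence. The snippet below is a self-contained illustration of how such a distance relates to the 0.3 threshold used in this notebook; it is conceptual only and not part of the monitoring job configuration.

```python
# Conceptual illustration: L-infinity distance between a baseline and a serving
# distribution of a categorical feature (e.g. payment_type), compared against a
# skew/drift threshold of 0.3.

def linf_distance(baseline: dict, serving: dict) -> float:
    """Maximum absolute difference between category proportions."""
    categories = set(baseline) | set(serving)
    return max(abs(baseline.get(c, 0.0) - serving.get(c, 0.0)) for c in categories)

baseline = {'Credit Card': 0.55, 'Cash': 0.40, 'Mobile': 0.05}
serving = {'Credit Card': 0.20, 'Cash': 0.70, 'Mobile': 0.10}

distance = linf_distance(baseline, serving)
print(f'L-infinity distance: {distance:.2f}')     # 0.35
print('Alert' if distance > 0.3 else 'No alert')  # Alert, since 0.35 > 0.3
```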
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "b2a0d93e", 21 | "metadata": {}, 22 | "source": [ 23 | "## Setup" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "c95c73cf", 29 | "metadata": {}, 30 | "source": [ 31 | "### Import libraries" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "aee62910", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import copy\n", 42 | "from datetime import datetime\n", 43 | "import time\n", 44 | "\n", 45 | "from google.protobuf.duration_pb2 import Duration\n", 46 | "from google.cloud import aiplatform as vertex_ai\n", 47 | "from google.cloud import aiplatform_v1beta1 as vertex_ai_beta" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "33eb5433", 53 | "metadata": {}, 54 | "source": [ 55 | "### Setup Google Cloud project" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "c9e34ea5", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "PROJECT = '[your-project-id]' # Change to your project id.\n", 66 | "REGION = 'us-central1' # Change to your region.\n", 67 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 68 | "\n", 69 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n", 70 | " # Get your GCP project id from gcloud\n", 71 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 72 | " PROJECT = shell_output[0]\n", 73 | " \n", 74 | "if BUCKET == \"\" or BUCKET is None or BUCKET == \"[your-bucket-name]\":\n", 75 | " # Get your bucket name to GCP project id\n", 76 | " BUCKET = PROJECT\n", 77 | " # Try to create the bucket if it doesn't exists\n", 78 | " ! gsutil mb -l $REGION gs://$BUCKET\n", 79 | " print(\"\")\n", 80 | "\n", 81 | "PARENT = f\"projects/{PROJECT}/locations/{REGION}\"\n", 82 | "\n", 83 | "print(\"Project ID:\", PROJECT)\n", 84 | "print(\"Region:\", REGION)\n", 85 | "print(\"Bucket name:\", BUCKET)\n", 86 | "print(\"Vertex API Parent URI:\", PARENT)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "id": "0051e6da", 92 | "metadata": {}, 93 | "source": [ 94 | "### Set configurations" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "c4ffa4e8", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 105 | "ENDPOINT_DISPLAY_NAME = 'chicago-taxi-tips-classifier'\n", 106 | "MONITORING_JOB_NAME = f\"monitor-{ENDPOINT_DISPLAY_NAME}\"\n", 107 | "NOTIFY_EMAILS = [\"\"] # Change to your email address.\n", 108 | "\n", 109 | "LOG_SAMPLE_RATE = 0.8\n", 110 | "MONITOR_INTERVAL = 3600\n", 111 | "TARGET_FEATURE_NAME = 'tip_bin'" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "09153445", 117 | "metadata": {}, 118 | "source": [ 119 | "## Create Job Service Client" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "3c8c8872", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "job_client_beta = vertex_ai_beta.JobServiceClient(\n", 130 | " client_options={\"api_endpoint\": f\"{REGION}-aiplatform.googleapis.com\"}\n", 131 | ")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "250e521b", 137 | "metadata": {}, 138 | "source": [ 139 | "## 1. 
Set Skew and Drift Thresholds" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "10bd314f", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "SKEW_THRESHOLDS = {\n", 150 | " 'trip_month': 0.3,\n", 151 | " 'trip_day': 0.3,\n", 152 | " 'trip_day_of_week': 0.3,\n", 153 | " 'trip_hour': 0.3,\n", 154 | " 'trip_seconds': 0.3,\n", 155 | " 'trip_miles': 0.3,\n", 156 | " 'payment_type': 0.3,\n", 157 | " 'pickup_grid': 0.3,\n", 158 | " 'dropoff_grid': 0.3,\n", 159 | " 'euclidean': 0.3,\n", 160 | " 'loc_cross': 0.3, \n", 161 | "}\n", 162 | "\n", 163 | "DRIFT_THRESHOLDS = {\n", 164 | " 'trip_month': 0.3,\n", 165 | " 'trip_day': 0.3,\n", 166 | " 'trip_day_of_week': 0.3,\n", 167 | " 'trip_hour': 0.3,\n", 168 | " 'trip_seconds': 0.3,\n", 169 | " 'trip_miles': 0.3,\n", 170 | " 'payment_type': 0.3,\n", 171 | " 'pickup_grid': 0.3,\n", 172 | " 'dropoff_grid': 0.3,\n", 173 | " 'euclidean': 0.3,\n", 174 | " 'loc_cross': 0.3, \n", 175 | "}" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "1579ae9d", 181 | "metadata": {}, 182 | "source": [ 183 | "## 2. Create Monitoring Job" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "32756e32", 189 | "metadata": {}, 190 | "source": [ 191 | "### Retrieve the Vertex dataset and endpoint models to monitor" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "1f17f0d7", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "dataset = vertex_ai.TabularDataset.list(\n", 202 | " filter=f\"display_name={DATASET_DISPLAY_NAME}\", \n", 203 | " order_by=\"update_time\")[-1]\n", 204 | "\n", 205 | "bq_source_uri = dataset.gca_resource.metadata[\"inputConfig\"][\"bigquerySource\"][\"uri\"]\n", 206 | " \n", 207 | "endpoint = vertex_ai.Endpoint.list(\n", 208 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n", 209 | " order_by=\"update_time\")[-1]\n", 210 | "\n", 211 | "endpoint_uri = endpoint.gca_resource.name\n", 212 | "\n", 213 | "model_ids = [model.id for model in endpoint.list_models()]" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "f8f3315d", 219 | "metadata": {}, 220 | "source": [ 221 | "### Configure the monitoring job" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "id": "d2998243", 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "skew_thresholds = {\n", 232 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n", 233 | " for feature, value in SKEW_THRESHOLDS.items()\n", 234 | "}\n", 235 | "\n", 236 | "drift_thresholds = {\n", 237 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n", 238 | " for feature, value in DRIFT_THRESHOLDS.items()\n", 239 | "}\n", 240 | "\n", 241 | "skew_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(\n", 242 | " skew_thresholds=skew_thresholds\n", 243 | ")\n", 244 | "\n", 245 | "drift_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(\n", 246 | " drift_thresholds=drift_thresholds\n", 247 | ")\n", 248 | "\n", 249 | "sampling_config = vertex_ai_beta.SamplingStrategy(\n", 250 | " random_sample_config=vertex_ai_beta.SamplingStrategy.RandomSampleConfig(\n", 251 | " sample_rate=LOG_SAMPLE_RATE\n", 252 | " )\n", 253 | ")\n", 254 | "\n", 255 | "schedule_config = vertex_ai_beta.ModelDeploymentMonitoringScheduleConfig(\n", 256 | " monitor_interval=Duration(seconds=MONITOR_INTERVAL)\n", 257 | ")\n", 258 | "\n", 259 | 
"training_dataset = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingDataset(\n", 260 | " target_field=TARGET_FEATURE_NAME,\n", 261 | " bigquery_source = vertex_ai_beta.types.io.BigQuerySource(\n", 262 | " input_uri=bq_source_uri\n", 263 | " )\n", 264 | ")\n", 265 | "\n", 266 | "\n", 267 | "objective_template = vertex_ai_beta.ModelDeploymentMonitoringObjectiveConfig(\n", 268 | " objective_config=vertex_ai_beta.ModelMonitoringObjectiveConfig(\n", 269 | " training_dataset=training_dataset,\n", 270 | " training_prediction_skew_detection_config=skew_config,\n", 271 | " prediction_drift_detection_config=drift_config,\n", 272 | " )\n", 273 | ")\n", 274 | "\n", 275 | "deployment_objective_configs = []\n", 276 | "for model_id in model_ids:\n", 277 | " objective_config = copy.deepcopy(objective_template)\n", 278 | " objective_config.deployed_model_id = model_id\n", 279 | " deployment_objective_configs.append(objective_config)\n", 280 | "\n", 281 | "alerting_config = vertex_ai_beta.ModelMonitoringAlertConfig(\n", 282 | " email_alert_config=vertex_ai_beta.ModelMonitoringAlertConfig.EmailAlertConfig(\n", 283 | " user_emails=NOTIFY_EMAILS\n", 284 | " )\n", 285 | ")\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "id": "7311422c", 291 | "metadata": {}, 292 | "source": [ 293 | "### Instantiate a monitoring job" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "id": "7b414c32", 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "job = vertex_ai_beta.ModelDeploymentMonitoringJob(\n", 304 | " display_name=MONITORING_JOB_NAME,\n", 305 | " endpoint=endpoint_uri,\n", 306 | " model_deployment_monitoring_objective_configs=deployment_objective_configs,\n", 307 | " logging_sampling_strategy=sampling_config,\n", 308 | " model_deployment_monitoring_schedule_config=schedule_config,\n", 309 | " model_monitoring_alert_config=alerting_config,\n", 310 | ")" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "id": "8e87fd5c", 316 | "metadata": {}, 317 | "source": [ 318 | "### Submit the job for creation" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "a7d54b6f", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "response = job_client_beta.create_model_deployment_monitoring_job(\n", 329 | " parent=PARENT, model_deployment_monitoring_job=job\n", 330 | ")\n", 331 | "response" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "id": "7c8d2120", 337 | "metadata": {}, 338 | "source": [ 339 | "## 3. List Monitoring Jobs" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "ef38d00d", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "monitoring_jobs = job_client_beta.list_model_deployment_monitoring_jobs(parent=PARENT)\n", 350 | "monitoring_job = [entry for entry in monitoring_jobs if entry.display_name == MONITORING_JOB_NAME][0]\n", 351 | "monitoring_job" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "id": "fb136f64", 357 | "metadata": {}, 358 | "source": [ 359 | "## 4. 
Simulate skewed prediction requests" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "id": "07ff9ab8", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "num_requests = 100\n", 370 | "\n", 371 | "print(\"Simulation started...\")\n", 372 | "for idx in range(num_requests):\n", 373 | " request = [{\n", 374 | " \"dropoff_grid\": [\"POINT(-87.6 41.9)\"],\n", 375 | " \"euclidean\": [2064.2696],\n", 376 | " \"loc_cross\": [\"\"],\n", 377 | " \"payment_type\": [\"Credit Card\"],\n", 378 | " \"pickup_grid\": [\"POINT(-87.6 41.9)\"],\n", 379 | " \"trip_miles\": [1.37],\n", 380 | " \"trip_day\": [int(random.uniform(10, 50))],\n", 381 | " \"trip_hour\": [int(random.uniform(10, 50))],\n", 382 | " \"trip_month\": [int(random.uniform(1, 10))],\n", 383 | " \"trip_day_of_week\": [int(random.uniform(1, 7))],\n", 384 | " \"trip_seconds\": [int(random.uniform(60, 600))]\n", 385 | " }]\n", 386 | " \n", 387 | " endpoint.predict(request)\n", 388 | " time.sleep(0.5)\n", 389 | " \n", 390 | " if idx % 10 == 0:\n", 391 | " print(f'{idx + 1} of {num_requests} prediction requests were invoked.')\n", 392 | "print(\"Simulation finished.\")" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "id": "06a03835", 398 | "metadata": {}, 399 | "source": [ 400 | "## 5. Pause Monitoring Job" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "id": "6e4ba104", 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "job_client_beta.pause_model_deployment_monitoring_job(name=monitoring_job.name)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "id": "8fb6f259", 416 | "metadata": {}, 417 | "source": [ 418 | "## Delete Monitoring Job" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "id": "4668f9dd", 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "job_client_beta.delete_model_deployment_monitoring_job(name=monitoring_job.name)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "id": "ac101746", 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [] 438 | } 439 | ], 440 | "metadata": { 441 | "environment": { 442 | "name": "common-cpu.m79", 443 | "type": "gcloud", 444 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m79" 445 | }, 446 | "kernelspec": { 447 | "display_name": "Python 3", 448 | "language": "python", 449 | "name": "python3" 450 | }, 451 | "language_info": { 452 | "codemirror_mode": { 453 | "name": "ipython", 454 | "version": 3 455 | }, 456 | "file_extension": ".py", 457 | "mimetype": "text/x-python", 458 | "name": "python", 459 | "nbconvert_exporter": "python", 460 | "pygments_lexer": "ipython3", 461 | "version": "3.7.10" 462 | } 463 | }, 464 | "nbformat": 4, 465 | "nbformat_minor": 5 466 | } 467 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/tfx-oss-public/tfx:1.2.0 2 | 3 | COPY requirements.txt requirements.txt 4 | 5 | RUN pip install -r requirements.txt 6 | 7 | COPY src/ src/ 8 | 9 | ENV PYTHONPATH="/pipeline:${PYTHONPATH}" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 
DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLOps with Vertex AI 2 | 3 | This example implements the end-to-end [MLOps process](https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf) using [Vertex AI](https://cloud.google.com/vertex-ai) platform and [Smart Analytics](https://cloud.google.com/solutions/smart-analytics) technology capabilities. The example uses [Keras](https://keras.io/) to implement the ML model, [TFX](https://www.tensorflow.org/tfx) to implement the training pipeline, and [Model Builder SDK](https://github.com/googleapis/python-aiplatform/tree/569d4cd03e888fde0171f7b0060695a14f99b072/google/cloud/aiplatform) to interact with Vertex AI. 4 | 5 |

6 | <img src="mlops.png" alt="MLOps lifecycle"/> 7 |

8 | 9 | 10 | ## Getting started 11 | 12 | 1. [Set up your MLOps environment](provision) on Google Cloud. 13 | 2. Start your AI Notebook instance. 14 | 3. Open JupyterLab, then open a new Terminal. 15 | 4. Clone the repository to your AI Notebook instance: 16 | ``` 17 | git clone https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai.git 18 | cd mlops-with-vertex-ai 19 | ``` 20 | 5. Install the required Python packages: 21 | ``` 22 | pip install tfx==1.2.0 --user 23 | pip install -r requirements.txt 24 | ``` 25 | --- 26 | **NOTE**: You can ignore the pip dependency issues. These will be fixed when upgrading to a subsequent TFX version. 27 | 28 | --- 29 | 6. Upgrade the `gcloud` components: 30 | ``` 31 | sudo apt-get install google-cloud-sdk 32 | gcloud components update 33 | ``` 34 | 35 | ## Dataset Management 36 | 37 | The [Chicago Taxi Trips](https://pantheon.corp.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips) dataset is one of the [public datasets hosted with BigQuery](https://cloud.google.com/bigquery/public-data/), which includes taxi trips from 2013 to the present, reported to the City of Chicago in its role as a regulatory agency. The task is to predict whether a given trip will result in a tip > 20%. 38 | 39 | The [01-dataset-management](01-dataset-management.ipynb) notebook covers: 40 | 41 | 1. Performing exploratory data analysis on the data in `BigQuery`. 42 | 2. Creating a `Vertex AI` Dataset resource using the Python SDK. 43 | 3. Generating the schema for the raw data using [TensorFlow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv). 44 | 45 | 46 | ## ML Development 47 | 48 | We experiment with creating a [Custom Model](https://cloud.google.com/ai-platform-unified/docs/training/create-model-custom-training) using the [02-experimentation](02-experimentation.ipynb) notebook, which covers: 49 | 50 | 1. Preparing the data using `Dataflow`. 51 | 2. Implementing a `Keras` classification model. 52 | 3. Training the `Keras` model with `Vertex AI` using a [pre-built container](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers). 53 | 4. Uploading the exported model from `Cloud Storage` to `Vertex AI`. 54 | 5. Extracting and visualizing experiment parameters from [Vertex AI Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction). 55 | 6. Using `Vertex AI` for [hyperparameter tuning](https://cloud.google.com/vertex-ai/docs/training/hyperparameter-tuning-overview). 56 | 57 | We use [Vertex TensorBoard](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview) 58 | and [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction) to track, visualize, and compare ML experiments. 59 | 60 | In addition, the training steps are formalized by implementing a [TFX pipeline](https://www.tensorflow.org/tfx). 61 | The [03-training-formalization](03-training-formalization.ipynb) notebook covers implementing and testing the pipeline components interactively. 62 | 63 | ## Training Operationalization 64 | 65 | The [04-pipeline-deployment](04-pipeline-deployment.ipynb) notebook covers executing the CI/CD steps for the training pipeline deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [pipeline-deployment.yaml](build/pipeline-deployment.yaml) file, and consists of the following steps: 66 | 67 | 1. Clone the repository to the build environment. 68 | 2. Run unit tests. 69 | 3. Run a local e2e test of the `TFX` pipeline. 70 | 4. 
Build the ML container image for the pipeline steps. 71 | 5. Compile the pipeline. 72 | 6. Upload the pipeline to `Cloud Storage`. 73 | 74 | ## Continuous Training 75 | 76 | After testing, compiling, and uploading the pipeline definition to `Cloud Storage`, the pipeline is executed in response to a trigger. 77 | We use [Cloud Functions](https://cloud.google.com/functions) and [Cloud Pub/Sub](https://cloud.google.com/pubsub) as the triggering mechanism. 78 | The `Cloud Function` listens to the `Pub/Sub` topic and runs the training pipeline when a message is sent to the topic. 79 | The `Cloud Function` is implemented in [src/pipeline_triggering](src/pipeline_triggering). 80 | 81 | The [05-continuous-training](05-continuous-training.ipynb) notebook covers: 82 | 83 | 1. Creating a Cloud `Pub/Sub` topic. 84 | 2. Deploying a `Cloud Function`. 85 | 3. Triggering the pipeline. 86 | 87 | The end-to-end TFX training pipeline implementation is in the [src/tfx_pipelines](src/tfx_pipelines) directory, which covers the following steps: 88 | 89 | 1. Receive hyperparameters using the `hyperparam_gen` custom Python component. 90 | 2. Extract data from `BigQuery` using the `BigQueryExampleGen` component. 91 | 3. Validate the raw data using the `StatisticsGen` and `ExampleValidator` components. 92 | 4. Process the data on `Dataflow` using the `Transform` component. 93 | 5. Train a custom model with `Vertex AI` using the `Trainer` component. 94 | 6. Evaluate and validate the custom model using the `ModelEvaluator` component. 95 | 7. Save the blessed model to the model registry location in `Cloud Storage` using the `Pusher` component. 96 | 8. Upload the model to `Vertex AI` using the `vertex_model_pusher` custom Python component. 97 | 98 | 99 | ## Model Deployment 100 | 101 | The [06-model-deployment](06-model-deployment.ipynb) notebook covers executing the CI/CD steps for the model deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [build/model-deployment.yaml](build/model-deployment.yaml) 102 | file, and consists of the following steps: 103 | 104 | 1. Test the model interface. 105 | 2. Create an endpoint in `Vertex AI`. 106 | 3. Deploy the model to the endpoint. 107 | 4. Test the `Vertex AI` endpoint. 108 | 109 | ## Prediction Serving 110 | 111 | We serve the deployed model for prediction. 112 | The [07-prediction-serving](07-prediction-serving.ipynb) notebook covers: 113 | 114 | 1. Using the `Vertex AI` endpoint for online prediction. 115 | 2. Using the uploaded `Vertex AI` model for batch prediction. 116 | 3. Running batch prediction using `Vertex Pipelines`. 117 | 118 | ## Model Monitoring 119 | 120 | After a model is deployed for prediction serving, continuous monitoring is set up to ensure that the model continues to perform as expected. 121 | The [08-model-monitoring](08-model-monitoring.ipynb) notebook covers configuring [Vertex AI Model Monitoring](https://cloud.google.com/vertex-ai/docs/model-monitoring/overview) for skew and drift detection: 122 | 123 | 1. Setting skew and drift thresholds. 124 | 2. Creating a monitoring job for all the models under an endpoint. 125 | 3. Listing the monitoring jobs. 126 | 4. Listing the artifacts produced by the monitoring job. 127 | 5. Pausing and deleting the monitoring job. 128 | 129 | 130 | ## Metadata Tracking 131 | 132 | You can view the parameters and metrics logged by your experiments, as well as the artifacts and metadata stored by 133 | your `Vertex Pipelines` in [Cloud Console](https://console.cloud.google.com/vertex-ai/metadata). 
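The logged parameters and metrics can also be retrieved programmatically with the Vertex AI SDK. The snippet below is a minimal sketch and is not part of the original notebooks; the project ID, region, and experiment name are placeholder assumptions to replace with your own values.

```python
from google.cloud import aiplatform as vertex_ai

# Assumptions for illustration only: replace with your project, region,
# and the experiment name used in the experimentation notebook.
vertex_ai.init(
    project="[your-project-id]",
    location="us-central1",
    experiment="chicago-taxi-tips-experiment",
)

# Returns a pandas DataFrame with one row per experiment run,
# including the logged parameters and metrics.
experiment_df = vertex_ai.get_experiment_df()
print(experiment_df.head())
```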
134 | 135 | ## Disclaimer 136 | 137 | This is not an official Google product but sample code provided for an educational purpose. 138 | 139 | --- 140 | 141 | Copyright 2021 Google LLC. 142 | 143 | Licensed under the Apache License, Version 2.0 (the "License"); 144 | you may not use this file except in compliance with the License. 145 | You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0 146 | 147 | Unless required by applicable law or agreed to in writing, software 148 | distributed under the License is distributed on an "AS IS" BASIS, 149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 150 | See the License for the specific language governing permissions and 151 | limitations under the License. 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/tfx-oss-public/tfx:1.2.0 2 | 3 | RUN pip install -U pip 4 | RUN pip install google-cloud-aiplatform==1.4.2 google-cloud-aiplatform[tensorboard] 5 | RUN pip install pytest kfp==1.8.1 google-cloud-bigquery==2.26.0 google-cloud-bigquery-storage==2.7.0 google-cloud-aiplatform==1.4.2 -------------------------------------------------------------------------------- /build/model-deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ###################################################################### 16 | # CI/CD steps for Cloud Build to test and deploy a model to Vertex AI. 17 | ###################################################################### 18 | 19 | steps: 20 | 21 | # Clone the repository. 22 | - name: 'gcr.io/cloud-builders/git' 23 | args: ['clone', '--single-branch', '--branch', 24 | '$_BRANCH', '$_REPO_URL', 25 | '--depth', '1', 26 | '--verbose'] 27 | id: 'Clone Repository' 28 | 29 | # Test uploaded model artifact. 30 | - name: '$_CICD_IMAGE_URI' 31 | entrypoint: 'pytest' 32 | args: ['src/tests/model_deployment_tests.py::test_model_artifact'] 33 | dir: 'mlops-with-vertex-ai' 34 | env: 35 | - 'PROJECT=$_PROJECT' 36 | - 'REGION=$_REGION' 37 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 38 | id: 'Test Model Artifact' 39 | waitFor: ['Clone Repository'] 40 | 41 | # Create an endpoint. 42 | - name: '$_CICD_IMAGE_URI' 43 | entrypoint: 'python' 44 | args: ['build/utils.py', 45 | '--mode', 'create-endpoint', 46 | '--project', '$_PROJECT', 47 | '--region', '$_REGION', 48 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME'] 49 | dir: 'mlops-with-vertex-ai' 50 | id: 'Create Endpoint' 51 | waitFor: ['Test Model Artifact'] 52 | 53 | # Deploy the model. 
54 | - name: '$_CICD_IMAGE_URI' 55 | entrypoint: 'python' 56 | args: ['build/utils.py', 57 | '--mode', 'deploy-model', 58 | '--project', '$_PROJECT', 59 | '--region', '$_REGION', 60 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME', 61 | '--model-display-name', '$_MODEL_DISPLAY_NAME' 62 | ] 63 | dir: 'mlops-with-vertex-ai' 64 | id: 'Deploy Model' 65 | waitFor: ['Create Endpoint'] 66 | 67 | # Test deployed model endpoint. 68 | - name: '$_CICD_IMAGE_URI' 69 | entrypoint: 'pytest' 70 | args: ['src/tests/model_deployment_tests.py::test_model_endpoint'] 71 | dir: 'mlops-with-vertex-ai' 72 | env: 73 | - 'PROJECT=$_PROJECT' 74 | - 'REGION=$_REGION' 75 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 76 | - 'ENDPOINT_DISPLAY_NAME=$_ENDPOINT_DISPLAY_NAME' 77 | id: 'Test Model Endpoint' 78 | waitFor: ['Deploy Model'] 79 | -------------------------------------------------------------------------------- /build/pipeline-deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ############################################################################# 16 | # CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI. 17 | ############################################################################# 18 | 19 | steps: 20 | 21 | # Clone the repository. 22 | - name: 'gcr.io/cloud-builders/git' 23 | args: ['clone', '--single-branch', '--branch', 24 | '$_BRANCH', '$_REPO_URL', 25 | '--depth', '1', 26 | '--verbose'] 27 | id: 'Clone Repository' 28 | 29 | 30 | # Run datasource_utils unit tests. 31 | - name: '$_CICD_IMAGE_URI' 32 | entrypoint: 'pytest' 33 | args: ['src/tests/datasource_utils_tests.py', '-s'] 34 | dir: 'mlops-with-vertex-ai' 35 | env: 36 | - 'PROJECT=$_PROJECT' 37 | - 'BQ_LOCATION=$_BQ_LOCATION' 38 | - 'BQ_DATASET_NAME=$_BQ_DATASET_NAME' 39 | - 'BQ_TABLE_NAME=$_BQ_TABLE_NAME' 40 | id: 'Unit Test Datasource Utils' 41 | waitFor: ['Clone Repository'] 42 | 43 | 44 | # Run model unit tests. 45 | - name: '$_CICD_IMAGE_URI' 46 | entrypoint: 'pytest' 47 | args: ['src/tests/model_tests.py', '-s'] 48 | dir: 'mlops-with-vertex-ai' 49 | id: 'Unit Test Model' 50 | waitFor: ['Clone Repository'] 51 | timeout: 1800s 52 | 53 | 54 | # Test e2e pipeline using local runner. 
55 | - name: '$_CICD_IMAGE_URI' 56 | entrypoint: 'pytest' 57 | args: ['src/tests/pipeline_deployment_tests.py::test_e2e_pipeline', '-s'] 58 | dir: 'mlops-with-vertex-ai' 59 | env: 60 | - 'PROJECT=$_PROJECT' 61 | - 'REGION=$_REGION' 62 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 63 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME' 64 | - 'GCS_LOCATION=$_TEST_GCS_LOCATION' 65 | - 'TRAIN_LIMIT=$_CI_TRAIN_LIMIT' 66 | - 'TEST_LIMIT=$_CI_TEST_LIMIT' 67 | - 'UPLOAD_MODEL=$_CI_UPLOAD_MODEL' 68 | - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD' 69 | id: 'Local Test E2E Pipeline' 70 | waitFor: ['Unit Test Datasource Utils', 'Unit Test Model'] 71 | timeout: 1800s 72 | 73 | 74 | # Build the image that encapsulates the pipeline. 75 | - name: 'gcr.io/cloud-builders/docker' 76 | args: ['build', '-t', '$_TFX_IMAGE_URI', '.'] 77 | dir: 'mlops-with-vertex-ai' 78 | id: 'Build TFX Image' 79 | waitFor: ['Local Test E2E Pipeline'] 80 | 81 | 82 | # Compile the pipeline. 83 | - name: '$_CICD_IMAGE_URI' 84 | entrypoint: 'python' 85 | args: ['build/utils.py', 86 | '--mode', 'compile-pipeline', 87 | '--pipeline-name', '$_PIPELINE_NAME' 88 | ] 89 | dir: 'mlops-with-vertex-ai' 90 | env: 91 | - 'PROJECT=$_PROJECT' 92 | - 'REGION=$_REGION' 93 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 94 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME' 95 | - 'GCS_LOCATION=$_GCS_LOCATION' 96 | - 'TFX_IMAGE_URI=$_TFX_IMAGE_URI' 97 | - 'BEAM_RUNNER=$_BEAM_RUNNER' 98 | - 'TRAINING_RUNNER=$_TRAINING_RUNNER' 99 | id: 'Compile Pipeline' 100 | waitFor: ['Local Test E2E Pipeline'] 101 | 102 | 103 | # Upload compiled pipeline to GCS. 104 | - name: 'gcr.io/cloud-builders/gsutil' 105 | args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE'] 106 | dir: 'mlops-with-vertex-ai' 107 | id: 'Upload Pipeline to GCS' 108 | waitFor: ['Compile Pipeline'] 109 | 110 | 111 | # Push TFX Image to Container Registy. 112 | images: ['$_TFX_IMAGE_URI'] 113 | -------------------------------------------------------------------------------- /build/serving_resources_spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "traffic_percentage": 100, 3 | "machine_type": "n1-standard-2", 4 | "min_replica_count": 1, 5 | "max_replica_count": 1, 6 | "accelerator_type": null, 7 | "accelerator_count": null 8 | } -------------------------------------------------------------------------------- /build/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for deploying pipelines and models to Vertex AI.""" 15 | 16 | 17 | import argparse 18 | import os 19 | import sys 20 | import logging 21 | import json 22 | 23 | from google.cloud import aiplatform as vertex_ai 24 | 25 | 26 | SCRIPT_DIR = os.path.dirname( 27 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 28 | ) 29 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 30 | 31 | SERVING_SPEC_FILEPATH = 'build/serving_resources_spec.json' 32 | 33 | def get_args(): 34 | parser = argparse.ArgumentParser() 35 | 36 | parser.add_argument( 37 | '--mode', 38 | type=str, 39 | ) 40 | 41 | parser.add_argument( 42 | '--project', 43 | type=str, 44 | ) 45 | 46 | parser.add_argument( 47 | '--region', 48 | type=str, 49 | ) 50 | 51 | parser.add_argument( 52 | '--endpoint-display-name', 53 | type=str, 54 | ) 55 | 56 | parser.add_argument( 57 | '--model-display-name', 58 | type=str, 59 | ) 60 | 61 | parser.add_argument( 62 | '--pipeline-name', 63 | type=str, 64 | ) 65 | 66 | parser.add_argument( 67 | '--pipelines-store', 68 | type=str, 69 | ) 70 | 71 | return parser.parse_args() 72 | 73 | 74 | def create_endpoint(project, region, endpoint_display_name): 75 | logging.info(f"Creating endpoint {endpoint_display_name}") 76 | vertex_ai.init( 77 | project=project, 78 | location=region 79 | ) 80 | 81 | endpoints = vertex_ai.Endpoint.list( 82 | filter=f'display_name={endpoint_display_name}', 83 | order_by="update_time") 84 | 85 | if len(endpoints) > 0: 86 | logging.info(f"Endpoint {endpoint_display_name} already exists.") 87 | endpoint = endpoints[-1] 88 | else: 89 | endpoint = vertex_ai.Endpoint.create(endpoint_display_name) 90 | logging.info(f"Endpoint is ready.") 91 | logging.info(endpoint.gca_resource) 92 | return endpoint 93 | 94 | 95 | def deploy_model(project, region, endpoint_display_name, model_display_name, serving_resources_spec): 96 | logging.info(f"Deploying model {model_display_name} to endpoint {endpoint_display_name}") 97 | vertex_ai.init( 98 | project=project, 99 | location=region 100 | ) 101 | 102 | model = vertex_ai.Model.list( 103 | filter=f'display_name={model_display_name}', 104 | order_by="update_time" 105 | )[-1] 106 | 107 | endpoint = vertex_ai.Endpoint.list( 108 | filter=f'display_name={endpoint_display_name}', 109 | order_by="update_time" 110 | )[-1] 111 | 112 | deployed_model = endpoint.deploy(model=model, **serving_resources_spec) 113 | logging.info(f"Model is deployed.") 114 | logging.info(deployed_model) 115 | return deployed_model 116 | 117 | 118 | def compile_pipeline(pipeline_name): 119 | from src.tfx_pipelines import runner 120 | pipeline_definition_file = f"{pipeline_name}.json" 121 | pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file) 122 | return pipeline_definition 123 | 124 | 125 | 126 | def main(): 127 | args = get_args() 128 | 129 | if args.mode == 'create-endpoint': 130 | if not args.project: 131 | raise ValueError("project must be supplied.") 132 | if not args.region: 133 | raise ValueError("region must be supplied.") 134 | if not args.endpoint_display_name: 135 | raise ValueError("endpoint_display_name must be supplied.") 136 | 137 | result = create_endpoint( 138 | args.project, 139 | args.region, 140 | args.endpoint_display_name 141 | ) 142 | 143 | elif args.mode == 'deploy-model': 144 | if not args.project: 145 | raise ValueError("project must be supplied.") 146 | if not args.region: 147 | raise ValueError("region must be supplied.") 148 | if not args.endpoint_display_name: 149 | 
raise ValueError("endpoint-display-name must be supplied.") 150 | if not args.model_display_name: 151 | raise ValueError("model-display-name must be supplied.") 152 | 153 | with open(SERVING_SPEC_FILEPATH) as json_file: 154 | serving_resources_spec = json.load(json_file) 155 | logging.info(f"serving resources: {serving_resources_spec}") 156 | result = deploy_model( 157 | args.project, 158 | args.region, 159 | args.endpoint_display_name, 160 | args.model_display_name, 161 | serving_resources_spec 162 | ) 163 | 164 | elif args.mode == 'compile-pipeline': 165 | if not args.pipeline_name: 166 | raise ValueError("pipeline-name must be supplied.") 167 | 168 | result = compile_pipeline(args.pipeline_name) 169 | 170 | else: 171 | raise ValueError(f"Invalid mode {args.mode}.") 172 | 173 | logging.info(result) 174 | 175 | 176 | if __name__ == "__main__": 177 | main() 178 | -------------------------------------------------------------------------------- /mlops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/mlops.png -------------------------------------------------------------------------------- /provision/README.md: -------------------------------------------------------------------------------- 1 | # Creating a Vertex environment 2 | 3 | You can use the [Terraform](https://www.terraform.io/) scripts in the `terraform` folder to automatically provision the environment required by the samples. 4 | 5 | The scripts perform the following actions: 6 | 7 | 1. Enable the required Cloud APIs 8 | * **Essentials**: compute, iam, iamcredentials 9 | * **ML**: notebooks, aiplatform 10 | * **Data**: dataflow, bigquery, bigquerydatatransfer 11 | * **CI/CD**: cloudbuild, container, artifactregistry 12 | * **Operations**: cloudtrace, monitoring, logging, cloudresourcemanager 13 | 2. Create a regional GCS bucket. 14 | 3. Create an instance of Vertex Notebooks. 15 | 4. Create service accounts for Vertex Training and Vertex Pipelines. 16 | 17 | You can customize your configuration using the following variables: 18 | 19 | |Variable|Required|Default|Description| 20 | |--------|--------|-------|-----------| 21 | |name_prefix|Yes||Prefix added to the names of provisioned resources. **The prefix should start with a letter and include letters and digits only**.| 22 | |project_id|Yes||GCP project ID| 23 | |network_name|No|default|Name of the network for the Notebook instance. The network must already exist.| 24 | |subnet_name|No|default|Name of the subnet for the Notebook instance. The subnet must already exist.| 25 | |subnet_region|No|us-central1|Region where the subnet was created.| 26 | |zone|Yes||GCP zone for the Notebook instance. The zone must be in the region defined in the `subnet_region` variable| 27 | |machine_type|No|n1-standard-4|Machine type of the Notebook instance| 28 | |boot_disk_size|No|200GB|Size of the Notebook instance's boot disk| 29 | |image_family|No|base-cpu|Image family for the Notebook instance| 30 | |gpu_type|No|null|GPU type of the Notebook instance. By default, the Notebook instance will be provisioned without a GPU| 31 | |gpu_count|No|null|GPU count of the Notebook instance| 32 | |install_gpu_driver|No|false|Whether to install a GPU driver| 33 | |region|No|Set to subnet_region.|GCP region for the GCS bucket and Artifact Registry. It is recommended that the same region is used for all: the bucket, the registry and the Notebook instance. 
If not provided the `region` will be set to `subnet_region`.| 34 | |force_destroy|No|false|Whether to force the removal of the bucket on terraform destroy. **Note that by default the bucket will not be destroyed**.| 35 | 36 | 37 | To provision the environment: 38 | 39 | 1. Open [Cloud Shell](https://cloud.google.com/shell/docs/launching-cloud-shell) 40 | 41 | 2. Download the installation scripts 42 | ``` 43 | SRC_REPO=https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai.git 44 | LOCAL_DIR=provision 45 | kpt pkg get $SRC_REPO/provision@main $LOCAL_DIR 46 | cd $LOCAL_DIR/terraform 47 | ``` 48 | 49 | 3. Update the `terraform.tfvars` file with the values reflecting your environment. Alternatively, you can provide the values using the Terraform CLI `-var` options when you execute `terraform apply` in the next step 50 | 51 | 4. Execute the following commands. : 52 | ``` 53 | terraform init 54 | terraform apply 55 | ``` 56 | 57 | 58 | To destroy the environment, execute: 59 | ``` 60 | terraform destroy 61 | ``` 62 | -------------------------------------------------------------------------------- /provision/terraform/gcs-bucket.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | resource "google_storage_bucket" "artifact_repo" { 18 | project = module.project-services.project_id 19 | name = "${var.name_prefix}-bucket" 20 | location = local.region 21 | storage_class = local.bucket_type 22 | force_destroy = var.force_destroy 23 | uniform_bucket_level_access = true 24 | } -------------------------------------------------------------------------------- /provision/terraform/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | terraform { 16 | required_version = ">= 0.14" 17 | required_providers { 18 | google = "~> 3.6" 19 | } 20 | } 21 | 22 | provider "google" { 23 | project = var.project_id 24 | } 25 | 26 | data "google_project" "project" { 27 | project_id = var.project_id 28 | } 29 | 30 | locals { 31 | bucket_type = "REGIONAL" 32 | region = var.region == null ? 
var.subnet_region : var.region 33 | } 34 | 35 | 36 | -------------------------------------------------------------------------------- /provision/terraform/notebook-instance.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | image_project = "deeplearning-platform-release" 17 | } 18 | 19 | data "google_compute_network" "vm_network" { 20 | project = module.project-services.project_id 21 | name = var.network_name 22 | 23 | depends_on = [ 24 | module.project-services 25 | ] 26 | } 27 | 28 | data "google_compute_subnetwork" "vm_subnetwork" { 29 | project = module.project-services.project_id 30 | name = var.subnet_name 31 | region = var.subnet_region 32 | 33 | depends_on = [ 34 | module.project-services 35 | ] 36 | } 37 | 38 | resource "google_notebooks_instance" "notebook_instance" { 39 | project = module.project-services.project_id 40 | name = "${var.name_prefix}-notebook" 41 | machine_type = var.machine_type 42 | location = var.zone 43 | 44 | network = data.google_compute_network.vm_network.id 45 | subnet = data.google_compute_subnetwork.vm_subnetwork.id 46 | 47 | vm_image { 48 | project = local.image_project 49 | image_family = var.image_family 50 | } 51 | 52 | dynamic accelerator_config { 53 | for_each = var.gpu_type != null ? [1] : [] 54 | content { 55 | type = var.gpu_type 56 | core_count = var.gpu_count 57 | } 58 | } 59 | 60 | install_gpu_driver = var.install_gpu_driver 61 | 62 | boot_disk_size_gb = var.boot_disk_size 63 | } 64 | -------------------------------------------------------------------------------- /provision/terraform/service-accounts.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # Create Vertex Training service account 16 | resource "google_service_account" "training_sa" { 17 | project = module.project-services.project_id 18 | account_id = var.training_sa_name 19 | display_name = "Vertex Training service account" 20 | } 21 | 22 | # Create Vertex Training SA role bindings 23 | resource "google_project_iam_member" "training_sa_role_bindings" { 24 | project = module.project-services.project_id 25 | for_each = toset(var.training_sa_roles) 26 | member = "serviceAccount:${google_service_account.training_sa.email}" 27 | role = "roles/${each.value}" 28 | } 29 | 30 | # Create Vertex Pipelines service account 31 | resource "google_service_account" "pipelines_sa" { 32 | project = module.project-services.project_id 33 | account_id = var.pipelines_sa_name 34 | display_name = "Vertex Pipelines account name" 35 | } 36 | 37 | # Create Vertex Pipelines SA role bindings 38 | resource "google_project_iam_member" "role_bindings" { 39 | project = module.project-services.project_id 40 | for_each = toset(var.pipelines_sa_roles) 41 | member = "serviceAccount:${google_service_account.pipelines_sa.email}" 42 | role = "roles/${each.value}" 43 | } 44 | -------------------------------------------------------------------------------- /provision/terraform/services.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | module "project-services" { 17 | source = "terraform-google-modules/project-factory/google//modules/project_services" 18 | 19 | project_id = data.google_project.project.project_id 20 | 21 | disable_services_on_destroy = false 22 | activate_apis = [ 23 | "compute.googleapis.com", 24 | "iam.googleapis.com", 25 | "container.googleapis.com", 26 | "artifactregistry.googleapis.com", 27 | "cloudresourcemanager.googleapis.com", 28 | "cloudtrace.googleapis.com", 29 | "iamcredentials.googleapis.com", 30 | "monitoring.googleapis.com", 31 | "logging.googleapis.com", 32 | "notebooks.googleapis.com", 33 | "aiplatform.googleapis.com", 34 | "dataflow.googleapis.com", 35 | "bigquery.googleapis.com", 36 | "cloudbuild.googleapis.com", 37 | "bigquerydatatransfer.googleapis.com", 38 | "cloudfunctions.googleapis.com" 39 | ] 40 | } -------------------------------------------------------------------------------- /provision/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "vertex-mlops" 2 | subnet_region = "us-central1" 3 | zone = "us-central1-a" 4 | name_prefix = "vertex-mlops" 5 | machine_type = "n1-standard-8" 6 | #gpu_type = "NVIDIA_TESLA_T4" 7 | #gpu_count = 1 8 | #install_gpu_driver = true 9 | #image_family = "common-gpu" 10 | 11 | 12 | -------------------------------------------------------------------------------- /provision/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | variable "project_id" { 17 | description = "The GCP project ID" 18 | type = string 19 | } 20 | 21 | variable "region" { 22 | description = "The region for the GCS bucket and Artifact Registry" 23 | type = string 24 | default = null 25 | } 26 | 27 | variable "zone" { 28 | description = "The zone for a Vertex Notebook instance" 29 | type = string 30 | } 31 | 32 | variable "name_prefix" { 33 | description = "The name prefix to add to the resource names" 34 | type = string 35 | } 36 | 37 | variable "machine_type" { 38 | description = "The Notebook instance's machine type" 39 | type = string 40 | } 41 | 42 | variable "network_name" { 43 | description = "The network name for the Notebook instance" 44 | type = string 45 | default = "default" 46 | } 47 | 48 | variable "subnet_name" { 49 | description = "The subnet name for the Notebook instance" 50 | type = string 51 | default = "default" 52 | } 53 | 54 | variable "subnet_region" { 55 | description = "The region for the Notebook subnet" 56 | type = string 57 | default = "us-central1" 58 | } 59 | 60 | variable "boot_disk_size" { 61 | description = "The size of the boot disk" 62 | default = 200 63 | } 64 | 65 | variable "image_family" { 66 | description = "A Deep Learning image family for the Notebook instance" 67 | type = string 68 | default = "common-cpu" 69 | } 70 | 71 | variable "gpu_type" { 72 | description = "A GPU type for the Notebook instance" 73 | type = string 74 | default = null 75 | } 76 | 77 | variable "gpu_count" { 78 | description = "A GPU count for the Notebook instance" 79 | type = string 80 | default = null 81 | } 82 | 83 | variable "install_gpu_driver" { 84 | description = "Whether to install GPU driver" 85 | type = bool 86 | default = true 87 | } 88 | 89 | variable "force_destroy" { 90 | description = "Whether to remove the bucket on destroy" 91 | type = bool 92 | default = false 93 | } 94 | 95 | variable "training_sa_roles" { 96 | description = "The roles to assign to the Vertex Training service account" 97 | default = [ 98 | "storage.admin", 99 | "aiplatform.user", 100 | "bigquery.admin" 101 | ] 102 | } 103 | 104 | variable "pipelines_sa_roles" { 105 | description = "The roles to assign to the Vertex Pipelines service account" 106 | default = [ 107 | "storage.admin", 108 | "bigquery.admin", 109 | "aiplatform.user" 110 | ] 111 | } 112 | 113 | variable "training_sa_name" { 114 | description = "Vertex training service account name." 115 | default = "training-sa" 116 | } 117 | 118 | variable "pipelines_sa_name" { 119 | description = "Vertex pipelines service account name." 
120 | default = "pipelines-sa" 121 | } 122 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | kfp==1.8.1 2 | google-cloud-bigquery==2.26.0 3 | google-cloud-bigquery-storage==2.7.0 4 | google-cloud-aiplatform==1.4.2 5 | cloudml-hypertune==0.1.0.dev6 6 | pytest -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | REQUIRED_PACKAGES = [ 4 | "google-cloud-aiplatform==1.4.2", 5 | "tensorflow-transform==1.2.0", 6 | "tensorflow-data-validation==1.2.0", 7 | "cloudml-hypertune==0.1.0.dev6" 8 | ] 9 | 10 | setuptools.setup( 11 | name="executor", 12 | version="0.0.1", 13 | install_requires=REQUIRED_PACKAGES, 14 | packages=setuptools.find_packages(), 15 | include_package_data=True, 16 | package_data={"src": ["raw_schema/schema.pbtxt"]}, 17 | ) 18 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/__init__.py -------------------------------------------------------------------------------- /src/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/common/__init__.py -------------------------------------------------------------------------------- /src/common/datasource_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for generating BigQuery data querying scirpts.""" 15 | 16 | 17 | from google.cloud import aiplatform as vertex_ai 18 | 19 | 20 | def _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit=None): 21 | query = f""" 22 | SELECT 23 | IF(trip_month IS NULL, -1, trip_month) trip_month, 24 | IF(trip_day IS NULL, -1, trip_day) trip_day, 25 | IF(trip_day_of_week IS NULL, -1, trip_day_of_week) trip_day_of_week, 26 | IF(trip_hour IS NULL, -1, trip_hour) trip_hour, 27 | IF(trip_seconds IS NULL, -1, trip_seconds) trip_seconds, 28 | IF(trip_miles IS NULL, -1, trip_miles) trip_miles, 29 | IF(payment_type IS NULL, 'NA', payment_type) payment_type, 30 | IF(pickup_grid IS NULL, 'NA', pickup_grid) pickup_grid, 31 | IF(dropoff_grid IS NULL, 'NA', dropoff_grid) dropoff_grid, 32 | IF(euclidean IS NULL, -1, euclidean) euclidean, 33 | IF(loc_cross IS NULL, 'NA', loc_cross) loc_cross""" 34 | if ml_use: 35 | query += f""", 36 | tip_bin 37 | FROM {bq_dataset_name}.{bq_table_name} 38 | WHERE ML_use = '{ml_use}' 39 | """ 40 | else: 41 | query += f""" 42 | FROM {bq_dataset_name}.{bq_table_name} 43 | """ 44 | if limit: 45 | query += f"LIMIT {limit}" 46 | 47 | return query 48 | 49 | 50 | def get_training_source_query( 51 | project, region, dataset_display_name, ml_use, limit=None 52 | ): 53 | vertex_ai.init(project=project, location=region) 54 | 55 | dataset = vertex_ai.TabularDataset.list( 56 | filter=f"display_name={dataset_display_name}", order_by="update_time" 57 | )[-1] 58 | bq_source_uri = dataset.gca_resource.metadata["inputConfig"]["bigquerySource"][ 59 | "uri" 60 | ] 61 | _, bq_dataset_name, bq_table_name = bq_source_uri.replace("g://", "").split(".") 62 | 63 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit) 64 | 65 | 66 | def get_serving_source_query(bq_dataset_name, bq_table_name, limit=None): 67 | 68 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use=None, limit=limit) 69 | -------------------------------------------------------------------------------- /src/common/features.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Model features metadata utils.""" 15 | 16 | 17 | FEATURE_NAMES = [ 18 | "trip_month", 19 | "trip_day", 20 | "trip_day_of_week", 21 | "trip_hour", 22 | "trip_seconds", 23 | "trip_miles", 24 | "payment_type", 25 | "pickup_grid", 26 | "dropoff_grid", 27 | "euclidean", 28 | "loc_cross", 29 | ] 30 | 31 | TARGET_FEATURE_NAME = "tip_bin" 32 | 33 | TARGET_LABELS = ["tip<20%", "tip>=20%"] 34 | 35 | NUMERICAL_FEATURE_NAMES = [ 36 | "trip_seconds", 37 | "trip_miles", 38 | "euclidean", 39 | ] 40 | 41 | EMBEDDING_CATEGORICAL_FEATURES = { 42 | "trip_month": 2, 43 | "trip_day": 4, 44 | "trip_hour": 3, 45 | "pickup_grid": 3, 46 | "dropoff_grid": 3, 47 | "loc_cross": 10, 48 | } 49 | 50 | ONEHOT_CATEGORICAL_FEATURE_NAMES = ["payment_type", "trip_day_of_week"] 51 | 52 | 53 | def transformed_name(key: str) -> str: 54 | """Generate the name of the transformed feature from original name.""" 55 | return f"{key}_xf" 56 | 57 | 58 | def original_name(key: str) -> str: 59 | """Generate the name of the original feature from transformed name.""" 60 | return key.replace("_xf", "") 61 | 62 | 63 | def vocabulary_name(key: str) -> str: 64 | """Generate the name of the vocabulary feature from original name.""" 65 | return f"{key}_vocab" 66 | 67 | 68 | def categorical_feature_names() -> list: 69 | return ( 70 | list(EMBEDDING_CATEGORICAL_FEATURES.keys()) + ONEHOT_CATEGORICAL_FEATURE_NAMES 71 | ) 72 | 73 | 74 | def generate_explanation_config(): 75 | explanation_config = { 76 | "inputs": {}, 77 | "outputs": {}, 78 | "params": {"sampled_shapley_attribution": {"path_count": 10}}, 79 | } 80 | 81 | for feature_name in FEATURE_NAMES: 82 | if feature_name in NUMERICAL_FEATURE_NAMES: 83 | explanation_config["inputs"][feature_name] = { 84 | "input_tensor_name": feature_name, 85 | "modality": "numeric", 86 | } 87 | else: 88 | explanation_config["inputs"][feature_name] = { 89 | "input_tensor_name": feature_name, 90 | "encoding": 'IDENTITY', 91 | "modality": "categorical", 92 | } 93 | 94 | explanation_config["outputs"] = {"scores": {"output_tensor_name": "scores"}} 95 | 96 | return explanation_config 97 | -------------------------------------------------------------------------------- /src/model_training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/model_training/__init__.py -------------------------------------------------------------------------------- /src/model_training/data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Functions for reading data as tf.data.Dataset.""" 15 | 16 | import tensorflow as tf 17 | 18 | from src.common import features 19 | 20 | 21 | def _gzip_reader_fn(filenames): 22 | """Small utility returning a record reader that can read gzip'ed files.""" 23 | return tf.data.TFRecordDataset(filenames, compression_type="GZIP") 24 | 25 | 26 | def get_dataset(file_pattern, feature_spec, batch_size=200): 27 | """Generates features and label for tuning/training. 28 | Args: 29 | file_pattern: input tfrecord file pattern. 30 | feature_spec: a dictionary of feature specifications. 31 | batch_size: representing the number of consecutive elements of returned 32 | dataset to combine in a single batch 33 | Returns: 34 | A dataset that contains (features, indices) tuple where features is a 35 | dictionary of Tensors, and indices is a single Tensor of label indices. 36 | """ 37 | 38 | dataset = tf.data.experimental.make_batched_features_dataset( 39 | file_pattern=file_pattern, 40 | batch_size=batch_size, 41 | features=feature_spec, 42 | label_key=features.TARGET_FEATURE_NAME, 43 | reader=_gzip_reader_fn, 44 | num_epochs=1, 45 | drop_final_batch=True, 46 | ) 47 | 48 | return dataset 49 | -------------------------------------------------------------------------------- /src/model_training/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Defaults for the model. 15 | 16 | These values can be tweaked to affect model training performance. 17 | """ 18 | 19 | 20 | HIDDEN_UNITS = [64, 32] 21 | LEARNING_RATE = 0.0001 22 | BATCH_SIZE = 512 23 | NUM_EPOCHS = 10 24 | NUM_EVAL_STEPS = 100 25 | 26 | 27 | def update_hyperparams(hyperparams: dict) -> dict: 28 | if "hidden_units" not in hyperparams: 29 | hyperparams["hidden_units"] = HIDDEN_UNITS 30 | else: 31 | if not isinstance(hyperparams["hidden_units"], list): 32 | hyperparams["hidden_units"] = [ 33 | int(v) for v in hyperparams["hidden_units"].split(",") 34 | ] 35 | if "learning_rate" not in hyperparams: 36 | hyperparams["learning_rate"] = LEARNING_RATE 37 | if "batch_size" not in hyperparams: 38 | hyperparams["batch_size"] = BATCH_SIZE 39 | if "num_epochs" not in hyperparams: 40 | hyperparams["num_epochs"] = NUM_EPOCHS 41 | return hyperparams 42 | -------------------------------------------------------------------------------- /src/model_training/exporter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Functions for exporting the model for serving.""" 15 | 16 | import logging 17 | 18 | import tensorflow as tf 19 | import tensorflow_transform as tft 20 | import tensorflow_data_validation as tfdv 21 | from tensorflow_transform.tf_metadata import schema_utils 22 | import tensorflow.keras as keras 23 | 24 | from src.common import features 25 | 26 | 27 | def _get_serve_tf_examples_fn(classifier, tft_output, raw_feature_spec): 28 | """Returns a function that parses a serialized tf.Example and applies TFT.""" 29 | 30 | classifier.tft_layer = tft_output.transform_features_layer() 31 | 32 | @tf.function 33 | def serve_tf_examples_fn(serialized_tf_examples): 34 | """Returns the output to be used in the serving signature.""" 35 | for key in list(raw_feature_spec.keys()): 36 | if key not in features.FEATURE_NAMES: 37 | raw_feature_spec.pop(key) 38 | 39 | parsed_features = tf.io.parse_example(serialized_tf_examples, raw_feature_spec) 40 | 41 | transformed_features = classifier.tft_layer(parsed_features) 42 | logits = classifier(transformed_features) 43 | probabilities = keras.activations.sigmoid(logits) 44 | return {"probabilities": probabilities} 45 | 46 | return serve_tf_examples_fn 47 | 48 | 49 | def _get_serve_features_fn(classifier, tft_output): 50 | """Returns a function that accept a dictionary of features and applies TFT.""" 51 | 52 | classifier.tft_layer = tft_output.transform_features_layer() 53 | 54 | @tf.function 55 | def serve_features_fn(raw_features): 56 | """Returns the output to be used in the serving signature.""" 57 | 58 | transformed_features = classifier.tft_layer(raw_features) 59 | logits = classifier(transformed_features) 60 | neg_probabilities = keras.activations.sigmoid(logits) 61 | pos_probabilities = 1 - neg_probabilities 62 | probabilities = tf.concat([neg_probabilities, pos_probabilities], -1) 63 | batch_size = tf.shape(probabilities)[0] 64 | classes = tf.repeat([features.TARGET_LABELS], [batch_size], axis=0) 65 | return {"classes": classes, "scores": probabilities} 66 | 67 | return serve_features_fn 68 | 69 | 70 | def export_serving_model( 71 | classifier, serving_model_dir, raw_schema_location, tft_output_dir 72 | ): 73 | 74 | raw_schema = tfdv.load_schema_text(raw_schema_location) 75 | raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec 76 | 77 | tft_output = tft.TFTransformOutput(tft_output_dir) 78 | 79 | features_input_signature = { 80 | feature_name: tf.TensorSpec( 81 | shape=(None, 1), dtype=spec.dtype, name=feature_name 82 | ) 83 | for feature_name, spec in raw_feature_spec.items() 84 | if feature_name in features.FEATURE_NAMES 85 | } 86 | 87 | signatures = { 88 | "serving_default": _get_serve_features_fn( 89 | classifier, tft_output 90 | ).get_concrete_function(features_input_signature), 91 | "serving_tf_example": _get_serve_tf_examples_fn( 92 | classifier, tft_output, raw_feature_spec 93 | ).get_concrete_function( 94 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples") 95 | ), 96 | } 97 | 98 | logging.info("Model export started...") 99 | classifier.save(serving_model_dir, 
signatures=signatures) 100 | logging.info("Model export completed.") 101 | -------------------------------------------------------------------------------- /src/model_training/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """A DNN keras classification model.""" 15 | 16 | import tensorflow as tf 17 | from tensorflow import keras 18 | 19 | from src.common import features 20 | 21 | 22 | def create_model_inputs(): 23 | inputs = {} 24 | for feature_name in features.FEATURE_NAMES: 25 | name = features.transformed_name(feature_name) 26 | if feature_name in features.NUMERICAL_FEATURE_NAMES: 27 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.float32) 28 | elif feature_name in features.categorical_feature_names(): 29 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.int64) 30 | else: 31 | pass 32 | return inputs 33 | 34 | 35 | def _create_binary_classifier(feature_vocab_sizes, hyperparams): 36 | input_layers = create_model_inputs() 37 | 38 | layers = [] 39 | for key in input_layers: 40 | feature_name = features.original_name(key) 41 | if feature_name in features.EMBEDDING_CATEGORICAL_FEATURES: 42 | vocab_size = feature_vocab_sizes[feature_name] 43 | embedding_size = features.EMBEDDING_CATEGORICAL_FEATURES[feature_name] 44 | embedding_output = keras.layers.Embedding( 45 | input_dim=vocab_size + 1, 46 | output_dim=embedding_size, 47 | name=f"{key}_embedding", 48 | )(input_layers[key]) 49 | layers.append(embedding_output) 50 | elif feature_name in features.ONEHOT_CATEGORICAL_FEATURE_NAMES: 51 | vocab_size = feature_vocab_sizes[feature_name] 52 | onehot_layer = keras.layers.experimental.preprocessing.CategoryEncoding( 53 | max_tokens=vocab_size, 54 | output_mode="binary", 55 | name=f"{key}_onehot", 56 | )(input_layers[key]) 57 | layers.append(onehot_layer) 58 | elif feature_name in features.NUMERICAL_FEATURE_NAMES: 59 | numeric_layer = tf.expand_dims(input_layers[key], -1) 60 | layers.append(numeric_layer) 61 | else: 62 | pass 63 | 64 | joined = keras.layers.Concatenate(name="combines_inputs")(layers) 65 | feedforward_output = keras.Sequential( 66 | [ 67 | keras.layers.Dense(units, activation="relu") 68 | for units in hyperparams["hidden_units"] 69 | ], 70 | name="feedforward_network", 71 | )(joined) 72 | logits = keras.layers.Dense(units=1, name="logits")(feedforward_output) 73 | 74 | model = keras.Model(inputs=input_layers, outputs=[logits]) 75 | return model 76 | 77 | 78 | def create_binary_classifier(tft_output, hyperparams): 79 | feature_vocab_sizes = dict() 80 | for feature_name in features.categorical_feature_names(): 81 | feature_vocab_sizes[feature_name] = tft_output.vocabulary_size_by_name( 82 | feature_name 83 | ) 84 | 85 | return _create_binary_classifier(feature_vocab_sizes, hyperparams) 86 | -------------------------------------------------------------------------------- 
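For orientation before the Trainer entrypoints that follow, the sketch below wires the pieces of `model.py` and `exporter.py` together by hand, assuming the TensorFlow Transform artifacts already exist. The GCS paths and the hyperparameter override are illustrative assumptions, not values defined in this repository.

import tensorflow_transform as tft

from src.model_training import defaults, exporter, model

# Assumed artifact locations; in the pipeline these arrive via fn_args (TFX)
# or command-line arguments (custom Vertex training job).
TFT_OUTPUT_DIR = "gs://example-bucket/transform_artifacts"  # assumption
SERVING_MODEL_DIR = "gs://example-bucket/serving_model"     # assumption

tft_output = tft.TFTransformOutput(TFT_OUTPUT_DIR)
hyperparams = defaults.update_hyperparams({"hidden_units": [64, 32]})

# Build the binary classifier using the vocabulary sizes recorded in the transform output.
classifier = model.create_binary_classifier(tft_output, hyperparams)

# Attach the two serving signatures defined in exporter.py and save the SavedModel.
exporter.export_serving_model(
    classifier=classifier,
    serving_model_dir=SERVING_MODEL_DIR,
    raw_schema_location="src/raw_schema/schema.pbtxt",
    tft_output_dir=TFT_OUTPUT_DIR,
)

In the project itself this wiring (plus the actual training) is performed by `run_fn` in runner.py and `main` in task.py, which follow.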
/src/model_training/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """A run_fn method called by the TFX Trainer component.""" 15 | 16 | import os 17 | import logging 18 | 19 | from src.model_training import trainer, exporter, defaults 20 | 21 | 22 | # TFX Trainer will call this function. 23 | def run_fn(fn_args): 24 | """Train the model based on given args. 25 | Args: 26 | fn_args: Holds args used to train the model as name/value pairs. 27 | """ 28 | logging.info("Runner started...") 29 | logging.info(f"fn_args: {fn_args}") 30 | logging.info("") 31 | 32 | try: 33 | log_dir = fn_args.model_run_dir 34 | except KeyError: 35 | log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs") 36 | 37 | hyperparams = fn_args.hyperparameters 38 | if not hyperparams: 39 | hyperparams = dict() 40 | 41 | hyperparams = defaults.update_hyperparams(hyperparams) 42 | logging.info("Hyperparameter:") 43 | logging.info(hyperparams) 44 | logging.info("") 45 | 46 | logging.info("Runner executing trainer...") 47 | classifier = trainer.train( 48 | train_data_dir=fn_args.train_files, 49 | eval_data_dir=fn_args.eval_files, 50 | tft_output_dir=fn_args.transform_output, 51 | hyperparams=hyperparams, 52 | log_dir=log_dir, 53 | base_model_dir=fn_args.base_model, 54 | ) 55 | 56 | logging.info("Runner executing exporter...") 57 | exporter.export_serving_model( 58 | classifier=classifier, 59 | serving_model_dir=fn_args.serving_model_dir, 60 | raw_schema_location=fn_args.schema_path, 61 | tft_output_dir=fn_args.transform_output, 62 | ) 63 | logging.info("Runner completed.") 64 | -------------------------------------------------------------------------------- /src/model_training/task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """The entrypoint for the Vertex training job.""" 15 | 16 | import os 17 | import sys 18 | from datetime import datetime 19 | import logging 20 | import tensorflow as tf 21 | from tensorflow.python.client import device_lib 22 | import argparse 23 | 24 | from google.cloud import aiplatform as vertex_ai 25 | import hypertune 26 | 27 | from src.model_training import defaults, trainer, exporter 28 | 29 | 30 | dirname = os.path.dirname(__file__) 31 | dirname = dirname.replace("/model_training", "") 32 | RAW_SCHEMA_LOCATION = os.path.join(dirname, "raw_schema/schema.pbtxt") 33 | HYPERTUNE_METRIC_NAME = 'ACCURACY' 34 | 35 | 36 | def get_args(): 37 | parser = argparse.ArgumentParser() 38 | 39 | parser.add_argument( 40 | "--model-dir", 41 | default=os.getenv("AIP_MODEL_DIR"), 42 | type=str, 43 | ) 44 | 45 | parser.add_argument( 46 | "--log-dir", 47 | default=os.getenv("AIP_TENSORBOARD_LOG_DIR"), 48 | type=str, 49 | ) 50 | 51 | parser.add_argument( 52 | "--train-data-dir", 53 | type=str, 54 | ) 55 | 56 | parser.add_argument( 57 | "--eval-data-dir", 58 | type=str, 59 | ) 60 | 61 | parser.add_argument( 62 | "--tft-output-dir", 63 | type=str, 64 | ) 65 | 66 | parser.add_argument("--learning-rate", default=0.001, type=float) 67 | parser.add_argument("--batch-size", default=512, type=float) 68 | parser.add_argument("--hidden-units", default="64,32", type=str) 69 | parser.add_argument("--num-epochs", default=10, type=int) 70 | 71 | parser.add_argument("--project", type=str) 72 | parser.add_argument("--region", type=str) 73 | parser.add_argument("--staging-bucket", type=str) 74 | parser.add_argument("--experiment-name", type=str) 75 | parser.add_argument("--run-name", type=str) 76 | 77 | return parser.parse_args() 78 | 79 | 80 | def main(): 81 | args = get_args() 82 | 83 | hyperparams = vars(args) 84 | hyperparams = defaults.update_hyperparams(hyperparams) 85 | logging.info(f"Hyperparameter: {hyperparams}") 86 | 87 | if args.experiment_name: 88 | vertex_ai.init( 89 | project=args.project, 90 | staging_bucket=args.staging_bucket, 91 | experiment=args.experiment_name, 92 | ) 93 | 94 | logging.info(f"Using Vertex AI experiment: {args.experiment_name}") 95 | 96 | run_id = args.run_name 97 | if not run_id: 98 | run_id = f"run-gcp-{datetime.now().strftime('%Y%m%d%H%M%S')}" 99 | 100 | vertex_ai.start_run(run_id) 101 | logging.info(f"Run {run_id} started.") 102 | 103 | vertex_ai.log_params(hyperparams) 104 | 105 | classifier = trainer.train( 106 | train_data_dir=args.train_data_dir, 107 | eval_data_dir=args.eval_data_dir, 108 | tft_output_dir=args.tft_output_dir, 109 | hyperparams=hyperparams, 110 | log_dir=args.log_dir, 111 | ) 112 | 113 | val_loss, val_accuracy = trainer.evaluate( 114 | model=classifier, 115 | data_dir=args.eval_data_dir, 116 | raw_schema_location=RAW_SCHEMA_LOCATION, 117 | tft_output_dir=args.tft_output_dir, 118 | hyperparams=hyperparams, 119 | ) 120 | 121 | 122 | # Report val_accuracy to Vertex hypertuner. 123 | logging.info(f'Reporting metric {HYPERTUNE_METRIC_NAME}={val_accuracy} to Vertex hypertuner...') 124 | hpt = hypertune.HyperTune() 125 | hpt.report_hyperparameter_tuning_metric( 126 | hyperparameter_metric_tag=HYPERTUNE_METRIC_NAME, 127 | metric_value=val_accuracy, 128 | global_step=args.num_epochs * args.batch_size 129 | ) 130 | 131 | # Log metrics in Vertex Experiments. 
132 |     logging.info("Logging metrics to Vertex Experiments...")
133 |     if args.experiment_name:
134 |         vertex_ai.log_metrics({"val_loss": val_loss, "val_accuracy": val_accuracy})
135 |
136 |     try:
137 |         exporter.export_serving_model(
138 |             classifier=classifier,
139 |             serving_model_dir=args.model_dir,
140 |             raw_schema_location=RAW_SCHEMA_LOCATION,
141 |             tft_output_dir=args.tft_output_dir,
142 |         )
143 |     except Exception:
144 |         # Do not fail the job if exporting the model raises an error.
145 |         pass
146 |
147 |
148 | if __name__ == "__main__":
149 |     logging.getLogger().setLevel(logging.INFO)
150 |     logging.info(f"Python Version = {sys.version}")
151 |     logging.info(f"TensorFlow Version = {tf.__version__}")
152 |     logging.info(f'TF_CONFIG = {os.environ.get("TF_CONFIG", "Not found")}')
153 |     logging.info(f"DEVICES = {device_lib.list_local_devices()}")
154 |     logging.info("Task started...")
155 |     main()
156 |     logging.info("Task completed.")
157 |
--------------------------------------------------------------------------------
/src/model_training/trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Train and evaluate the model."""
15 |
16 | import logging
17 | import tensorflow as tf
18 | import tensorflow_transform as tft
19 | from tensorflow import keras
20 |
21 |
22 | from src.model_training import data, model
23 |
24 |
25 | def train(
26 |     train_data_dir,
27 |     eval_data_dir,
28 |     tft_output_dir,
29 |     hyperparams,
30 |     log_dir,
31 |     base_model_dir=None,
32 | ):
33 |
34 |     logging.info(f"Loading tft output from {tft_output_dir}")
35 |     tft_output = tft.TFTransformOutput(tft_output_dir)
36 |     transformed_feature_spec = tft_output.transformed_feature_spec()
37 |
38 |     train_dataset = data.get_dataset(
39 |         train_data_dir,
40 |         transformed_feature_spec,
41 |         hyperparams["batch_size"],
42 |     )
43 |
44 |     eval_dataset = data.get_dataset(
45 |         eval_data_dir,
46 |         transformed_feature_spec,
47 |         hyperparams["batch_size"],
48 |     )
49 |
50 |     optimizer = keras.optimizers.Adam(learning_rate=hyperparams["learning_rate"])
51 |     loss = keras.losses.BinaryCrossentropy(from_logits=True)
52 |     metrics = [keras.metrics.BinaryAccuracy(name="accuracy")]
53 |
54 |     early_stopping = tf.keras.callbacks.EarlyStopping(
55 |         monitor="val_loss", patience=5, restore_best_weights=True
56 |     )
57 |     tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
58 |
59 |     classifier = model.create_binary_classifier(tft_output, hyperparams)
60 |     if base_model_dir:
61 |         try:
62 |             classifier = keras.models.load_model(base_model_dir)
63 |         except Exception:
64 |             pass  # Fall back to the freshly created classifier if the base model cannot be loaded.
65 |
66 |     classifier.compile(optimizer=optimizer, loss=loss, metrics=metrics)
67 |
68 |     logging.info("Model training started...")
69 |     classifier.fit(
70 |         train_dataset,
71 |         epochs=hyperparams["num_epochs"],
72 |         validation_data=eval_dataset,
73 |         callbacks=[early_stopping, tensorboard_callback],
74 |     )
75 |     logging.info("Model training completed.")
76 | 77 | return classifier 78 | 79 | 80 | def evaluate(model, data_dir, raw_schema_location, tft_output_dir, hyperparams): 81 | logging.info(f"Loading raw schema from {raw_schema_location}") 82 | 83 | logging.info(f"Loading tft output from {tft_output_dir}") 84 | tft_output = tft.TFTransformOutput(tft_output_dir) 85 | transformed_feature_spec = tft_output.transformed_feature_spec() 86 | 87 | logging.info("Model evaluation started...") 88 | eval_dataset = data.get_dataset( 89 | data_dir, 90 | transformed_feature_spec, 91 | hyperparams["batch_size"], 92 | ) 93 | 94 | evaluation_metrics = model.evaluate(eval_dataset) 95 | logging.info("Model evaluation completed.") 96 | 97 | return evaluation_metrics 98 | -------------------------------------------------------------------------------- /src/pipeline_triggering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/pipeline_triggering/__init__.py -------------------------------------------------------------------------------- /src/pipeline_triggering/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Cloud Function to be triggered by Pub/Sub.""" 15 | 16 | import os 17 | import json 18 | import logging 19 | from kfp.v2.google.client import AIPlatformClient 20 | from google.cloud import storage 21 | import base64 22 | 23 | 24 | def trigger_pipeline(event, context): 25 | 26 | project = os.getenv("PROJECT") 27 | region = os.getenv("REGION") 28 | gcs_pipeline_file_location = os.getenv("GCS_PIPELINE_FILE_LOCATION") 29 | 30 | if not project: 31 | raise ValueError("Environment variable PROJECT is not set.") 32 | if not region: 33 | raise ValueError("Environment variable REGION is not set.") 34 | if not gcs_pipeline_file_location: 35 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.") 36 | 37 | storage_client = storage.Client() 38 | 39 | if not gcs_pipeline_file_location: 40 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.") 41 | 42 | path_parts = gcs_pipeline_file_location.replace("gs://", "").split("/") 43 | bucket_name = path_parts[0] 44 | blob_name = "/".join(path_parts[1:]) 45 | 46 | bucket = storage_client.bucket(bucket_name) 47 | blob = storage.Blob(bucket=bucket, name=blob_name) 48 | 49 | if not blob.exists(storage_client): 50 | raise ValueError(f"{gcs_pipeline_file_location} does not exist.") 51 | 52 | data = base64.b64decode(event["data"]).decode("utf-8") 53 | logging.info(f"Event data: {data}") 54 | 55 | parameter_values = json.loads(data) 56 | 57 | api_client = AIPlatformClient(project_id=project, region=region) 58 | 59 | response = api_client.create_run_from_job_spec( 60 | job_spec_path=gcs_pipeline_file_location, parameter_values=parameter_values 61 | ) 62 | 63 | logging.info(response) 64 | -------------------------------------------------------------------------------- /src/pipeline_triggering/requirements.txt: -------------------------------------------------------------------------------- 1 | kfp==1.6.2 2 | google-cloud-aiplatform 3 | google-cloud-storage -------------------------------------------------------------------------------- /src/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/preprocessing/etl.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Data preprocessing pipelines.""" 15 | 16 | import os 17 | 18 | import tensorflow_transform as tft 19 | import tensorflow_data_validation as tfdv 20 | import apache_beam as beam 21 | from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore 22 | import tensorflow_transform.beam as tft_beam 23 | from tensorflow_transform.tf_metadata import dataset_metadata 24 | from tensorflow_transform.tf_metadata import schema_utils 25 | 26 | 27 | from src.preprocessing import transformations 28 | 29 | RAW_SCHEMA_LOCATION = "src/raw_schema/schema.pbtxt" 30 | 31 | 32 | def parse_bq_record(bq_record): 33 | output = {} 34 | for key in bq_record: 35 | output[key] = [bq_record[key]] 36 | return output 37 | 38 | 39 | def split_dataset(bq_row, num_partitions, ratio): 40 | import json 41 | 42 | assert num_partitions == len(ratio) 43 | bucket = sum(map(ord, json.dumps(bq_row))) % sum(ratio) 44 | total = 0 45 | for i, part in enumerate(ratio): 46 | total += part 47 | if bucket < total: 48 | return i 49 | return len(ratio) - 1 50 | 51 | 52 | def run_transform_pipeline(args): 53 | 54 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) 55 | 56 | raw_data_query = args["raw_data_query"] 57 | write_raw_data = args["write_raw_data"] 58 | exported_data_prefix = args["exported_data_prefix"] 59 | transformed_data_prefix = args["transformed_data_prefix"] 60 | transform_artifact_dir = args["transform_artifact_dir"] 61 | temporary_dir = args["temporary_dir"] 62 | gcs_location = args["gcs_location"] 63 | project = args["project"] 64 | 65 | source_raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION) 66 | raw_feature_spec = schema_utils.schema_as_feature_spec( 67 | source_raw_schema 68 | ).feature_spec 69 | 70 | raw_metadata = dataset_metadata.DatasetMetadata( 71 | schema_utils.schema_from_feature_spec(raw_feature_spec) 72 | ) 73 | 74 | with beam.Pipeline(options=pipeline_options) as pipeline: 75 | with tft_beam.Context(temporary_dir): 76 | 77 | # Read raw BigQuery data. 78 | raw_train_data, raw_eval_data = ( 79 | pipeline 80 | | "Read Raw Data" 81 | >> beam.io.ReadFromBigQuery( 82 | query=raw_data_query, 83 | project=project, 84 | use_standard_sql=True, 85 | gcs_location=gcs_location, 86 | ) 87 | | "Parse Data" >> beam.Map(parse_bq_record) 88 | | "Split" >> beam.Partition(split_dataset, 2, ratio=[8, 2]) 89 | ) 90 | 91 | # Create a train_dataset from the data and schema. 92 | raw_train_dataset = (raw_train_data, raw_metadata) 93 | 94 | # Analyze and transform raw_train_dataset to produced transformed_train_dataset and transform_fn. 95 | transformed_train_dataset, transform_fn = ( 96 | raw_train_dataset 97 | | "Analyze & Transform" 98 | >> tft_beam.AnalyzeAndTransformDataset(transformations.preprocessing_fn) 99 | ) 100 | 101 | # Get data and schema separately from the transformed_dataset. 102 | transformed_train_data, transformed_metadata = transformed_train_dataset 103 | 104 | # write transformed train data. 105 | _ = ( 106 | transformed_train_data 107 | | "Write Transformed Train Data" 108 | >> beam.io.tfrecordio.WriteToTFRecord( 109 | file_path_prefix=os.path.join( 110 | transformed_data_prefix, "train/data" 111 | ), 112 | file_name_suffix=".gz", 113 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema), 114 | ) 115 | ) 116 | 117 | # Create a eval_dataset from the data and schema. 118 | raw_eval_dataset = (raw_eval_data, raw_metadata) 119 | 120 | # Transform raw_eval_dataset to produced transformed_eval_dataset using transform_fn. 
121 | transformed_eval_dataset = ( 122 | raw_eval_dataset, 123 | transform_fn, 124 | ) | "Transform" >> tft_beam.TransformDataset() 125 | 126 | # Get data from the transformed_eval_dataset. 127 | transformed_eval_data, _ = transformed_eval_dataset 128 | 129 | # write transformed train data. 130 | _ = ( 131 | transformed_eval_data 132 | | "Write Transformed Eval Data" 133 | >> beam.io.tfrecordio.WriteToTFRecord( 134 | file_path_prefix=os.path.join(transformed_data_prefix, "eval/data"), 135 | file_name_suffix=".gz", 136 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema), 137 | ) 138 | ) 139 | 140 | # Write transform_fn. 141 | _ = transform_fn | "Write Transform Artifacts" >> tft_beam.WriteTransformFn( 142 | transform_artifact_dir 143 | ) 144 | 145 | if write_raw_data: 146 | # write raw eval data. 147 | _ = ( 148 | raw_eval_data 149 | | "Write Raw Eval Data" 150 | >> beam.io.tfrecordio.WriteToTFRecord( 151 | file_path_prefix=os.path.join(exported_data_prefix, "data"), 152 | file_name_suffix=".tfrecord", 153 | coder=tft.coders.ExampleProtoCoder(raw_metadata.schema), 154 | ) 155 | ) 156 | 157 | 158 | def convert_to_jsonl(bq_record): 159 | import json 160 | 161 | output = {} 162 | for key in bq_record: 163 | output[key] = [bq_record[key]] 164 | return json.dumps(output) 165 | 166 | 167 | def run_extract_pipeline(args): 168 | 169 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) 170 | 171 | sql_query = args["sql_query"] 172 | exported_data_prefix = args["exported_data_prefix"] 173 | temporary_dir = args["temporary_dir"] 174 | gcs_location = args["gcs_location"] 175 | project = args["project"] 176 | 177 | with beam.Pipeline(options=pipeline_options) as pipeline: 178 | with tft_beam.Context(temporary_dir): 179 | 180 | # Read BigQuery data. 181 | raw_data = ( 182 | pipeline 183 | | "Read Data" 184 | >> beam.io.ReadFromBigQuery( 185 | query=sql_query, 186 | project=project, 187 | use_standard_sql=True, 188 | gcs_location=gcs_location, 189 | ) 190 | | "Parse Data" >> beam.Map(convert_to_jsonl) 191 | ) 192 | 193 | # Write raw data to GCS as JSONL files. 
194 | _ = raw_data | "Write Data" >> beam.io.WriteToText( 195 | file_path_prefix=exported_data_prefix, file_name_suffix=".jsonl" 196 | ) 197 | 198 | 199 | def parse_prediction_results(jsonl): 200 | import uuid 201 | import json 202 | 203 | prediction_results = json.loads(jsonl)["prediction"] 204 | prediction_id = str(uuid.uuid4()) 205 | scores = prediction_results["scores"] 206 | classes = prediction_results["classes"] 207 | 208 | return {"prediction_id": prediction_id, "scores": scores, "classes": classes} 209 | 210 | 211 | def create_datastore_entity(prediction_response, kind): 212 | from apache_beam.io.gcp.datastore.v1new.types import Entity 213 | from apache_beam.io.gcp.datastore.v1new.types import Key 214 | 215 | user_id = prediction_response.pop("prediction_id") 216 | key = Key([kind, user_id]) 217 | prediction_entity = Entity(key) 218 | prediction_entity.set_properties(prediction_response) 219 | return prediction_entity 220 | 221 | 222 | def run_store_predictions_pipeline(args): 223 | 224 | project = args["project"] 225 | datastore_kind = args["datastore_kind"] 226 | prediction_results_uri = args["prediction_results_uri"] 227 | 228 | pipeline_options = beam.options.pipeline_options.PipelineOptions(args) 229 | with beam.Pipeline(options=pipeline_options) as pipeline: 230 | _ = ( 231 | pipeline 232 | | "ReadFromJSONL" >> beam.io.ReadFromText(prediction_results_uri) 233 | | "ParsePredictionResults" >> beam.Map(parse_prediction_results) 234 | | "ConvertToDatastoreEntity" 235 | >> beam.Map(create_datastore_entity, datastore_kind) 236 | | "WriteToDatastore" >> WriteToDatastore(project=project) 237 | ) 238 | -------------------------------------------------------------------------------- /src/preprocessing/transformations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TensorFlow Transform preprocessing function.""" 15 | 16 | import tensorflow as tf 17 | import tensorflow_transform as tft 18 | 19 | from src.common import features 20 | 21 | 22 | def preprocessing_fn(inputs): 23 | """tf.transform's callback function for preprocessing inputs. 24 | Args: 25 | inputs: map from feature keys to raw not-yet-transformed features. 26 | Returns: 27 | Map from string feature key to transformed feature operations. 
28 | """ 29 | 30 | outputs = {} 31 | 32 | for key in features.FEATURE_NAMES: 33 | if key in features.NUMERICAL_FEATURE_NAMES: 34 | outputs[features.transformed_name(key)] = tft.scale_to_z_score(inputs[key]) 35 | 36 | elif key in features.categorical_feature_names(): 37 | outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary( 38 | inputs[key], 39 | num_oov_buckets=1, 40 | vocab_filename=key, 41 | ) 42 | 43 | outputs[features.TARGET_FEATURE_NAME] = inputs[features.TARGET_FEATURE_NAME] 44 | 45 | for key in outputs: 46 | outputs[key] = tf.squeeze(outputs[key], -1) 47 | 48 | return outputs 49 | -------------------------------------------------------------------------------- /src/raw_schema/schema.pbtxt: -------------------------------------------------------------------------------- 1 | feature { 2 | name: "trip_month" 3 | type: INT 4 | presence { 5 | min_fraction: 1.0 6 | min_count: 1 7 | } 8 | shape { 9 | dim { 10 | size: 1 11 | } 12 | } 13 | } 14 | feature { 15 | name: "trip_day" 16 | type: INT 17 | presence { 18 | min_fraction: 1.0 19 | min_count: 1 20 | } 21 | shape { 22 | dim { 23 | size: 1 24 | } 25 | } 26 | } 27 | feature { 28 | name: "trip_day_of_week" 29 | type: INT 30 | presence { 31 | min_fraction: 1.0 32 | min_count: 1 33 | } 34 | shape { 35 | dim { 36 | size: 1 37 | } 38 | } 39 | } 40 | feature { 41 | name: "trip_hour" 42 | type: INT 43 | presence { 44 | min_fraction: 1.0 45 | min_count: 1 46 | } 47 | shape { 48 | dim { 49 | size: 1 50 | } 51 | } 52 | } 53 | feature { 54 | name: "trip_seconds" 55 | type: INT 56 | presence { 57 | min_fraction: 1.0 58 | min_count: 1 59 | } 60 | shape { 61 | dim { 62 | size: 1 63 | } 64 | } 65 | } 66 | feature { 67 | name: "trip_miles" 68 | type: FLOAT 69 | presence { 70 | min_fraction: 1.0 71 | min_count: 1 72 | } 73 | shape { 74 | dim { 75 | size: 1 76 | } 77 | } 78 | } 79 | feature { 80 | name: "payment_type" 81 | type: BYTES 82 | domain: "payment_type" 83 | presence { 84 | min_fraction: 1.0 85 | min_count: 1 86 | } 87 | shape { 88 | dim { 89 | size: 1 90 | } 91 | } 92 | } 93 | feature { 94 | name: "pickup_grid" 95 | type: BYTES 96 | domain: "pickup_grid" 97 | presence { 98 | min_fraction: 1.0 99 | min_count: 1 100 | } 101 | shape { 102 | dim { 103 | size: 1 104 | } 105 | } 106 | } 107 | feature { 108 | name: "dropoff_grid" 109 | type: BYTES 110 | domain: "dropoff_grid" 111 | presence { 112 | min_fraction: 1.0 113 | min_count: 1 114 | } 115 | shape { 116 | dim { 117 | size: 1 118 | } 119 | } 120 | } 121 | feature { 122 | name: "euclidean" 123 | type: FLOAT 124 | presence { 125 | min_fraction: 1.0 126 | min_count: 1 127 | } 128 | shape { 129 | dim { 130 | size: 1 131 | } 132 | } 133 | } 134 | feature { 135 | name: "loc_cross" 136 | type: BYTES 137 | presence { 138 | min_fraction: 1.0 139 | min_count: 1 140 | } 141 | shape { 142 | dim { 143 | size: 1 144 | } 145 | } 146 | } 147 | feature { 148 | name: "tip_bin" 149 | type: INT 150 | bool_domain { 151 | } 152 | presence { 153 | min_fraction: 1.0 154 | min_count: 1 155 | } 156 | shape { 157 | dim { 158 | size: 1 159 | } 160 | } 161 | } 162 | string_domain { 163 | name: "payment_type" 164 | value: "Cash" 165 | value: "Credit Card" 166 | value: "Dispute" 167 | value: "Mobile" 168 | value: "No Charge" 169 | value: "Prcard" 170 | value: "Prepaid" 171 | value: "Unknown" 172 | } 173 | string_domain { 174 | name: "pickup_grid" 175 | value: "POINT(-87.5 41.7)" 176 | value: "POINT(-87.6 41.7)" 177 | value: "POINT(-87.6 41.8)" 178 | value: "POINT(-87.6 41.9)" 179 | value: "POINT(-87.6 42)" 
180 | value: "POINT(-87.7 41.7)" 181 | value: "POINT(-87.7 41.8)" 182 | value: "POINT(-87.7 41.9)" 183 | value: "POINT(-87.7 42)" 184 | value: "POINT(-87.8 41.8)" 185 | value: "POINT(-87.8 41.9)" 186 | value: "POINT(-87.8 42)" 187 | value: "POINT(-87.9 42)" 188 | } 189 | string_domain { 190 | name: "dropoff_grid" 191 | value: "POINT(-87.5 41.7)" 192 | value: "POINT(-87.6 41.7)" 193 | value: "POINT(-87.6 41.8)" 194 | value: "POINT(-87.6 41.9)" 195 | value: "POINT(-87.6 42)" 196 | value: "POINT(-87.7 41.7)" 197 | value: "POINT(-87.7 41.8)" 198 | value: "POINT(-87.7 41.9)" 199 | value: "POINT(-87.7 42)" 200 | value: "POINT(-87.8 41.8)" 201 | value: "POINT(-87.8 41.9)" 202 | value: "POINT(-87.8 42)" 203 | value: "POINT(-87.9 42)" 204 | } 205 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/tests/__init__.py -------------------------------------------------------------------------------- /src/tests/datasource_utils_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Test utilities for generating BigQuery data querying scirpts.""" 15 | 16 | import sys 17 | import os 18 | import logging 19 | from google.cloud import bigquery 20 | 21 | from src.common import datasource_utils 22 | 23 | root = logging.getLogger() 24 | root.setLevel(logging.INFO) 25 | handler = logging.StreamHandler(sys.stdout) 26 | handler.setLevel(logging.INFO) 27 | root.addHandler(handler) 28 | 29 | LIMIT = 100 30 | 31 | TARGET_COLUMN = "tip_bin" 32 | 33 | EXPECTED_TRAINING_COLUMNS = [ 34 | "trip_month", 35 | "trip_day", 36 | "trip_day_of_week", 37 | "trip_hour", 38 | "trip_seconds", 39 | "trip_miles", 40 | "payment_type", 41 | "pickup_grid", 42 | "dropoff_grid", 43 | "euclidean", 44 | "loc_cross", 45 | "tip_bin", 46 | ] 47 | 48 | 49 | def test_training_query(): 50 | 51 | project = os.getenv("PROJECT") 52 | location = os.getenv("BQ_LOCATION") 53 | bq_dataset_name = os.getenv("BQ_DATASET_NAME") 54 | bq_table_name = os.getenv("BQ_TABLE_NAME") 55 | 56 | assert project, "Environment variable PROJECT is None!" 57 | assert location, "Environment variable BQ_LOCATION is None!" 58 | assert bq_dataset_name, "Environment variable BQ_DATASET_NAME is None!" 59 | assert bq_table_name, "Environment variable BQ_TABLE_NAME is None!" 
60 | 61 | logging.info(f"BigQuery Source: {project}.{bq_dataset_name}.{bq_table_name}") 62 | 63 | query = datasource_utils._get_source_query( 64 | bq_dataset_name=bq_dataset_name, 65 | bq_table_name=bq_table_name, 66 | ml_use="UNASSIGNED", 67 | limit=LIMIT, 68 | ) 69 | 70 | bq_client = bigquery.Client(project=project, location=location) 71 | df = bq_client.query(query).to_dataframe() 72 | columns = set(df.columns) 73 | assert columns == set(EXPECTED_TRAINING_COLUMNS) 74 | assert df.shape == (LIMIT, 12) 75 | 76 | 77 | def test_serving_query(): 78 | 79 | project = os.getenv("PROJECT") 80 | location = os.getenv("BQ_LOCATION") 81 | bq_dataset_name = os.getenv("BQ_DATASET_NAME") 82 | bq_table_name = os.getenv("BQ_TABLE_NAME") 83 | 84 | assert project, "Environment variable PROJECT is None!" 85 | assert location, "Environment variable BQ_LOCATION is None!" 86 | assert bq_dataset_name, "Environment variable BQ_DATASET_NAME is None!" 87 | assert bq_table_name, "Environment variable BQ_TABLE_NAME is None!" 88 | 89 | logging.info(f"BigQuery Source: {project}.{bq_dataset_name}.{bq_table_name}") 90 | 91 | query = datasource_utils._get_source_query( 92 | bq_dataset_name=bq_dataset_name, 93 | bq_table_name=bq_table_name, 94 | ml_use=None, 95 | limit=LIMIT, 96 | ) 97 | 98 | bq_client = bigquery.Client(project=project, location=location) 99 | df = bq_client.query(query).to_dataframe() 100 | columns = set(df.columns) 101 | expected_serving_columns = EXPECTED_TRAINING_COLUMNS 102 | expected_serving_columns.remove(TARGET_COLUMN) 103 | assert columns == set(expected_serving_columns) 104 | assert df.shape == (LIMIT, 11) 105 | -------------------------------------------------------------------------------- /src/tests/etl_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Test data processing.""" 15 | 16 | import sys 17 | import os 18 | import logging 19 | import tensorflow_transform as tft 20 | import tensorflow as tf 21 | from tensorflow.io import FixedLenFeature 22 | 23 | from src.preprocessing import etl 24 | from src.comm import datasource_utils 25 | 26 | root = logging.getLogger() 27 | root.setLevel(logging.INFO) 28 | handler = logging.StreamHandler(sys.stdout) 29 | handler.setLevel(logging.INFO) 30 | root.addHandler(handler) 31 | 32 | OUTPUT_DIR = "test_etl_output_dir" 33 | ML_USE = "UNASSIGNED" 34 | LIMIT = 100 35 | 36 | EXPECTED_FEATURE_SPEC = { 37 | "dropoff_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 38 | "euclidean_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 39 | "loc_cross_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 40 | "payment_type_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 41 | "pickup_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 42 | "tip_bin": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 43 | "trip_day_of_week_xf": FixedLenFeature( 44 | shape=[], dtype=tf.int64, default_value=None 45 | ), 46 | "trip_day_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 47 | "trip_hour_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 48 | "trip_miles_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 49 | "trip_month_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 50 | "trip_seconds_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 51 | } 52 | 53 | 54 | def test_transform_pipeline(): 55 | 56 | project = os.getenv("PROJECT") 57 | region = os.getenv("REGION") 58 | bucket = os.getenv("BUCKET") 59 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 60 | 61 | assert project, "Environment variable PROJECT is None!" 62 | assert region, "Environment variable REGION is None!" 63 | assert bucket, "Environment variable BUCKET is None!" 64 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 
65 |
66 |     os.mkdir(OUTPUT_DIR)
67 |
68 |     exported_data_dir = os.path.join(OUTPUT_DIR, "exported_data")
69 |     transformed_data_dir = os.path.join(OUTPUT_DIR, "transformed_data")
70 |     transform_artifacts_dir = os.path.join(OUTPUT_DIR, "transform_artifacts")
71 |     temporary_dir = os.path.join(OUTPUT_DIR, "tmp")
72 |
73 |     raw_data_query = datasource_utils.get_training_source_query(
74 |         project=project,
75 |         region=region,
76 |         dataset_display_name=dataset_display_name,
77 |         ml_use=ML_USE,
78 |         limit=LIMIT,
79 |     )
80 |
81 |     args = {
82 |         "runner": "DirectRunner",
83 |         "raw_data_query": raw_data_query,
84 |         "write_raw_data": False,
85 |         "exported_data_prefix": exported_data_dir,
86 |         "transformed_data_prefix": transformed_data_dir,
87 |         "transform_artifact_dir": transform_artifacts_dir,
88 |         "temporary_dir": temporary_dir,
89 |         "gcs_location": f"gs://{bucket}/bq_tmp",
90 |         "project": project,
91 |     }
92 |
93 |     logging.info(f"Transform pipeline args: {args}")
94 |     etl.run_transform_pipeline(args)
95 |     logging.info("Transform pipeline finished.")
96 |
97 |     tft_output = tft.TFTransformOutput(transform_artifacts_dir)
98 |     transform_feature_spec = tft_output.transformed_feature_spec()
99 |     assert transform_feature_spec == EXPECTED_FEATURE_SPEC
100 |
--------------------------------------------------------------------------------
/src/tests/model_deployment_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test an uploaded model to Vertex AI.""" 15 | 16 | import os 17 | import logging 18 | import tensorflow as tf 19 | 20 | test_instance = { 21 | "dropoff_grid": ["POINT(-87.6 41.9)"], 22 | "euclidean": [2064.2696], 23 | "loc_cross": [""], 24 | "payment_type": ["Credit Card"], 25 | "pickup_grid": ["POINT(-87.6 41.9)"], 26 | "trip_miles": [1.37], 27 | "trip_day": [12], 28 | "trip_hour": [16], 29 | "trip_month": [2], 30 | "trip_day_of_week": [4], 31 | "trip_seconds": [555], 32 | } 33 | 34 | SERVING_DEFAULT_SIGNATURE_NAME = "serving_default" 35 | 36 | from google.cloud import aiplatform as vertex_ai 37 | 38 | 39 | def test_model_artifact(): 40 | 41 | feature_types = { 42 | "dropoff_grid": tf.dtypes.string, 43 | "euclidean": tf.dtypes.float32, 44 | "loc_cross": tf.dtypes.string, 45 | "payment_type": tf.dtypes.string, 46 | "pickup_grid": tf.dtypes.string, 47 | "trip_miles": tf.dtypes.float32, 48 | "trip_day": tf.dtypes.int64, 49 | "trip_hour": tf.dtypes.int64, 50 | "trip_month": tf.dtypes.int64, 51 | "trip_day_of_week": tf.dtypes.int64, 52 | "trip_seconds": tf.dtypes.int64, 53 | } 54 | 55 | new_test_instance = dict() 56 | for key in test_instance: 57 | new_test_instance[key] = tf.constant( 58 | [test_instance[key]], dtype=feature_types[key] 59 | ) 60 | 61 | print(new_test_instance) 62 | 63 | project = os.getenv("PROJECT") 64 | region = os.getenv("REGION") 65 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 66 | 67 | assert project, "Environment variable PROJECT is None!" 68 | assert region, "Environment variable REGION is None!" 69 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 70 | 71 | vertex_ai.init(project=project, location=region,) 72 | 73 | models = vertex_ai.Model.list( 74 | filter=f'display_name={model_display_name}', 75 | order_by="update_time" 76 | ) 77 | 78 | assert ( 79 | models 80 | ), f"No model with display name {model_display_name} exists!" 81 | 82 | model = models[-1] 83 | artifact_uri = model.gca_resource.artifact_uri 84 | logging.info(f"Model artifact uri:{artifact_uri}") 85 | assert tf.io.gfile.exists( 86 | artifact_uri 87 | ), f"Model artifact uri {artifact_uri} does not exist!" 88 | 89 | saved_model = tf.saved_model.load(artifact_uri) 90 | logging.info("Model loaded successfully.") 91 | 92 | assert ( 93 | SERVING_DEFAULT_SIGNATURE_NAME in saved_model.signatures 94 | ), f"{SERVING_DEFAULT_SIGNATURE_NAME} not in model signatures!" 95 | 96 | prediction_fn = saved_model.signatures["serving_default"] 97 | predictions = prediction_fn(**new_test_instance) 98 | logging.info("Model produced predictions.") 99 | 100 | keys = ["classes", "scores"] 101 | for key in keys: 102 | assert key in predictions, f"{key} in prediction outputs!" 103 | 104 | assert predictions["classes"].shape == ( 105 | 1, 106 | 2, 107 | ), f"Invalid output classes shape: {predictions['classes'].shape}!" 108 | assert predictions["scores"].shape == ( 109 | 1, 110 | 2, 111 | ), f"Invalid output scores shape: {predictions['scores'].shape}!" 112 | logging.info(f"Prediction output: {predictions}") 113 | 114 | 115 | def test_model_endpoint(): 116 | 117 | project = os.getenv("PROJECT") 118 | region = os.getenv("REGION") 119 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 120 | endpoint_display_name = os.getenv("ENDPOINT_DISPLAY_NAME") 121 | 122 | assert project, "Environment variable PROJECT is None!" 123 | assert region, "Environment variable REGION is None!" 124 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 
125 |     assert endpoint_display_name, "Environment variable ENDPOINT_DISPLAY_NAME is None!"
126 |
127 |     endpoints = vertex_ai.Endpoint.list(
128 |         filter=f'display_name={endpoint_display_name}',
129 |         order_by="update_time"
130 |     )
131 |     assert (
132 |         endpoints
133 |     ), f"Endpoint with display name {endpoint_display_name} does not exist in region {region}!"
134 |
135 |     endpoint = endpoints[-1]
136 |     logging.info(f"Calling endpoint: {endpoint}.")
137 |
138 |     prediction = endpoint.predict([test_instance]).predictions[0]
139 |
140 |     keys = ["classes", "scores"]
141 |     for key in keys:
142 |         assert key in prediction, f"{key} not in prediction outputs!"
143 |
144 |     assert (
145 |         len(prediction["classes"]) == 2
146 |     ), f"Invalid number of output classes: {len(prediction['classes'])}!"
147 |     assert (
148 |         len(prediction["scores"]) == 2
149 |     ), f"Invalid number of output scores: {len(prediction['scores'])}!"
150 |
151 |     logging.info(f"Prediction output: {prediction}")
152 |
--------------------------------------------------------------------------------
/src/tests/model_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test model functions.""" 15 | 16 | import sys 17 | import logging 18 | import tensorflow as tf 19 | 20 | from src.common import features 21 | from src.model_training import model, defaults 22 | 23 | root = logging.getLogger() 24 | root.setLevel(logging.INFO) 25 | handler = logging.StreamHandler(sys.stdout) 26 | handler.setLevel(logging.INFO) 27 | root.addHandler(handler) 28 | 29 | EXPECTED_HYPERPARAMS_KEYS = [ 30 | "hidden_units", 31 | "learning_rate", 32 | "batch_size", 33 | "num_epochs", 34 | ] 35 | 36 | 37 | def test_hyperparams_defaults(): 38 | hyperparams = {"hidden_units": [64, 32]} 39 | 40 | hyperparams = defaults.update_hyperparams(hyperparams) 41 | assert set(hyperparams.keys()) == set(EXPECTED_HYPERPARAMS_KEYS) 42 | 43 | 44 | def test_create_binary_classifier(): 45 | 46 | hyperparams = hyperparams = defaults.update_hyperparams(dict()) 47 | 48 | model_inputs = { 49 | "dropoff_grid_xf": tf.convert_to_tensor([0, 0, 0]), 50 | "euclidean_xf": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), 51 | "loc_cross_xf": tf.convert_to_tensor([0, 0, 0]), 52 | "payment_type_xf": tf.convert_to_tensor([1, 0, 0]), 53 | "pickup_grid_xf": tf.convert_to_tensor([0, 0, 0]), 54 | "trip_day_of_week_xf": tf.convert_to_tensor([5, 4, 4]), 55 | "trip_day_xf": tf.convert_to_tensor([26, 24, 1]), 56 | "trip_hour_xf": tf.convert_to_tensor([0, 4, 2]), 57 | "trip_miles_xf": tf.convert_to_tensor([5.9717827, -0.7121308, -0.7601589]), 58 | "trip_month_xf": tf.convert_to_tensor([4, 3, 4]), 59 | "trip_seconds_xf": tf.convert_to_tensor([4.9029775, -0.34146854, -0.34479955]), 60 | } 61 | 62 | feature_vocab_sizes = { 63 | feature_name: 100 for feature_name in features.categorical_feature_names() 64 | } 65 | classifier = model._create_binary_classifier(feature_vocab_sizes, hyperparams) 66 | model_outputs = classifier(model_inputs) # .numpy() 67 | assert model_outputs.shape == (3, 1) 68 | assert model_outputs.dtype == "float32" 69 | -------------------------------------------------------------------------------- /src/tests/pipeline_deployment_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Test training pipeline using local runner.""" 15 | 16 | import sys 17 | import os 18 | from tfx.orchestration.local.local_dag_runner import LocalDagRunner 19 | import tensorflow as tf 20 | from ml_metadata.proto import metadata_store_pb2 21 | import logging 22 | 23 | from src.tfx_pipelines import config 24 | from src.tfx_pipelines import training_pipeline 25 | 26 | root = logging.getLogger() 27 | root.setLevel(logging.INFO) 28 | handler = logging.StreamHandler(sys.stdout) 29 | handler.setLevel(logging.INFO) 30 | root.addHandler(handler) 31 | 32 | MLMD_SQLLITE = "mlmd.sqllite" 33 | NUM_EPOCHS = 1 34 | BATCH_SIZE = 512 35 | LEARNING_RATE = 0.001 36 | HIDDEN_UNITS = "128,128" 37 | 38 | 39 | def test_e2e_pipeline(): 40 | 41 | project = os.getenv("PROJECT") 42 | region = os.getenv("REGION") 43 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 44 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 45 | gcs_location = os.getenv("GCS_LOCATION") 46 | model_registry = os.getenv("MODEL_REGISTRY_URI") 47 | upload_model = os.getenv("UPLOAD_MODEL") 48 | 49 | assert project, "Environment variable PROJECT is None!" 50 | assert region, "Environment variable REGION is None!" 51 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 52 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 53 | assert gcs_location, "Environment variable GCS_LOCATION is None!" 54 | assert model_registry, "Environment variable MODEL_REGISTRY_URI is None!" 55 | 56 | logging.info(f"upload_model: {upload_model}") 57 | if tf.io.gfile.exists(gcs_location): 58 | tf.io.gfile.rmtree(gcs_location) 59 | logging.info(f"Pipeline e2e test artifacts stored in: {gcs_location}") 60 | 61 | if tf.io.gfile.exists(MLMD_SQLLITE): 62 | tf.io.gfile.remove(MLMD_SQLLITE) 63 | 64 | metadata_connection_config = metadata_store_pb2.ConnectionConfig() 65 | metadata_connection_config.sqlite.filename_uri = MLMD_SQLLITE 66 | metadata_connection_config.sqlite.connection_mode = 3 67 | logging.info("ML metadata store is ready.") 68 | 69 | pipeline_root = os.path.join( 70 | config.ARTIFACT_STORE_URI, 71 | config.PIPELINE_NAME, 72 | ) 73 | 74 | runner = LocalDagRunner() 75 | 76 | pipeline = training_pipeline.create_pipeline( 77 | pipeline_root=pipeline_root, 78 | num_epochs=NUM_EPOCHS, 79 | batch_size=BATCH_SIZE, 80 | learning_rate=LEARNING_RATE, 81 | hidden_units=HIDDEN_UNITS, 82 | metadata_connection_config=metadata_connection_config, 83 | ) 84 | 85 | runner.run(pipeline) 86 | 87 | logging.info(f"Model output: {os.path.join(model_registry, model_display_name)}") 88 | assert tf.io.gfile.exists(os.path.join(model_registry, model_display_name)) 89 | -------------------------------------------------------------------------------- /src/tfx_pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/tfx_pipelines/__init__.py -------------------------------------------------------------------------------- /src/tfx_pipelines/components.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX Custom Python Components.""" 15 | 16 | 17 | import sys 18 | import os 19 | import json 20 | import logging 21 | from datetime import datetime 22 | import tensorflow as tf 23 | 24 | from tfx.types import artifact_utils 25 | from tfx.utils import io_utils 26 | from tfx.components.util import model_utils 27 | from tfx.dsl.component.experimental.decorators import component 28 | from tfx.dsl.component.experimental.annotations import ( 29 | InputArtifact, 30 | OutputArtifact, 31 | Parameter, 32 | ) 33 | from tfx.types.standard_artifacts import HyperParameters, ModelBlessing 34 | from tfx.types.experimental.simple_artifacts import File as UploadedModel 35 | from tfx.types.experimental.simple_artifacts import Dataset 36 | 37 | from google.cloud import aiplatform as vertex_ai 38 | 39 | SCRIPT_DIR = os.path.dirname( 40 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 41 | ) 42 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 43 | 44 | from src.preprocessing import etl 45 | 46 | 47 | HYPERPARAM_FILENAME = "hyperparameters.json" 48 | SERVING_DATA_PREFIX = "serving-data-" 49 | PREDICTION_RESULTS_PREFIX = "prediction.results-*" 50 | 51 | 52 | @component 53 | def hyperparameters_gen( 54 | num_epochs: Parameter[int], 55 | batch_size: Parameter[int], 56 | learning_rate: Parameter[float], 57 | hidden_units: Parameter[str], 58 | hyperparameters: OutputArtifact[HyperParameters], 59 | ): 60 | 61 | hp_dict = dict() 62 | hp_dict["num_epochs"] = num_epochs 63 | hp_dict["batch_size"] = batch_size 64 | hp_dict["learning_rate"] = learning_rate 65 | hp_dict["hidden_units"] = [int(units) for units in hidden_units.split(",")] 66 | logging.info(f"Hyperparameters: {hp_dict}") 67 | 68 | hyperparams_uri = os.path.join( 69 | artifact_utils.get_single_uri([hyperparameters]), HYPERPARAM_FILENAME 70 | ) 71 | io_utils.write_string_file(hyperparams_uri, json.dumps(hp_dict)) 72 | logging.info(f"Hyperparameters are written to: {hyperparams_uri}") 73 | 74 | 75 | @component 76 | def vertex_model_uploader( 77 | project: Parameter[str], 78 | region: Parameter[str], 79 | model_display_name: Parameter[str], 80 | pushed_model_location: Parameter[str], 81 | serving_image_uri: Parameter[str], 82 | model_blessing: InputArtifact[ModelBlessing], 83 | uploaded_model: OutputArtifact[UploadedModel], 84 | explanation_config: Parameter[str]="", 85 | labels: Parameter[str]="", 86 | ): 87 | 88 | vertex_ai.init(project=project, location=region) 89 | 90 | blessing = artifact_utils.get_single_instance([model_blessing]) 91 | if not model_utils.is_model_blessed(blessing): 92 | logging.info(f"Model is not uploaded to Vertex AI because it was not blessed by the evaluator.") 93 | uploaded_model.set_int_custom_property("uploaded", 0) 94 | return 95 | 96 | pushed_model_dir = os.path.join( 97 | pushed_model_location, tf.io.gfile.listdir(pushed_model_location)[-1] 98 | ) 99 | 100 | logging.info(f"Model registry location: {pushed_model_dir}") 101 | 102 | try: 103 | explanation_config = json.loads(explanation_config) 104 | explanation_metadata = 
vertex_ai.explain.ExplanationMetadata( 105 | inputs=explanation_config["inputs"], 106 | outputs=explanation_config["outputs"], 107 | ) 108 | explanation_parameters = vertex_ai.explain.ExplanationParameters( 109 | explanation_config["params"] 110 | ) 111 | except Exception: 112 | explanation_metadata = None 113 | explanation_parameters = None 114 | 115 | try: 116 | labels = json.loads(labels) 117 | except Exception: 118 | labels = None 119 | 120 | vertex_model = vertex_ai.Model.upload( 121 | display_name=model_display_name, 122 | artifact_uri=pushed_model_dir, 123 | serving_container_image_uri=serving_image_uri, 124 | parameters_schema_uri=None, 125 | instance_schema_uri=None, 126 | explanation_metadata=explanation_metadata, 127 | explanation_parameters=explanation_parameters, 128 | labels=labels 129 | ) 130 | 131 | model_uri = vertex_model.gca_resource.name 132 | logging.info(f"Model uploaded to Vertex AI: {model_uri}") 133 | uploaded_model.set_string_custom_property("model_uri", model_uri) 134 | uploaded_model.set_int_custom_property("uploaded", 1) 135 | 136 | 137 | @component 138 | def bigquery_data_gen( 139 | sql_query: Parameter[str], 140 | output_data_format: Parameter[str], 141 | beam_args: Parameter[str], 142 | serving_dataset: OutputArtifact[Dataset], 143 | ): 144 | 145 | output_dir = os.path.join( 146 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX 147 | ) 148 | 149 | pipeline_args = json.loads(beam_args) 150 | pipeline_args["sql_query"] = sql_query 151 | pipeline_args["exported_data_prefix"] = output_dir 152 | pipeline_args["output_data_format"] = output_data_format 153 | 154 | logging.info("Data extraction started. Source query:") 155 | logging.info(f"{sql_query}") 156 | etl.run_extract_pipeline(pipeline_args) 157 | logging.info("Data extraction completed.") 158 | 159 | 160 | @component 161 | def vertex_batch_prediction( 162 | project: Parameter[str], 163 | region: Parameter[str], 164 | model_display_name: Parameter[str], 165 | instances_format: Parameter[str], 166 | predictions_format: Parameter[str], 167 | job_resources: Parameter[str], 168 | serving_dataset: InputArtifact[Dataset], 169 | prediction_results: OutputArtifact[Dataset], 170 | ): 171 | 172 | job_resources = json.loads(job_resources) 173 | gcs_source_pattern = ( 174 | os.path.join( 175 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX 176 | ) 177 | + "*.jsonl" 178 | ) 179 | gcs_destination_prefix = artifact_utils.get_single_uri([prediction_results]) 180 | job_name = f"extract-{model_display_name}-serving-{datetime.now().strftime('%Y%m%d%H%M%S')}" 181 | 182 | vertex_ai.init(project=project, location=region) 183 | 184 | logging.info("Submitting Vertex AI batch prediction job...") 185 | batch_prediction_job = vertex_ai.BatchPredictionJob.create( 186 | job_display_name=job_name, 187 | model_name=model_display_name, 188 | gcs_source=gcs_source_pattern, 189 | gcs_destination_prefix=gcs_destination_prefix, 190 | instances_format=instances_format, 191 | predictions_format=predictions_format, 192 | sync=True, 193 | **job_resources, 194 | ) 195 | logging.info("Batch prediction job completed.") 196 | 197 | prediction_results.set_string_custom_property( 198 | "batch_prediction_job", batch_prediction_job.gca_resource.name 199 | ) 200 | 201 | 202 | @component 203 | def datastore_prediction_writer( 204 | datastore_kind: Parameter[str], 205 | predictions_format: Parameter[str], 206 | beam_args: Parameter[str], 207 | prediction_results: InputArtifact[Dataset], 208 | ): 209 | 210 | prediction_results_dir = 
os.path.join( 211 | artifact_utils.get_single_uri([prediction_results]) 212 | ) 213 | prediction_results_dir = os.path.join( 214 | prediction_results_dir, tf.io.gfile.listdir(prediction_results_dir)[0] 215 | ) 216 | prediction_results_uri = os.path.join( 217 | prediction_results_dir, PREDICTION_RESULTS_PREFIX 218 | ) 219 | 220 | pipeline_args = json.loads(beam_args) 221 | pipeline_args["prediction_results_uri"] = prediction_results_uri 222 | pipeline_args["datastore_kind"] = datastore_kind 223 | pipeline_args["predictions_format"] = predictions_format 224 | 225 | logging.info(f"Storing predictions to Datastore kind: {datastore_kind}") 226 | etl.run_store_predictions_pipeline(pipeline_args) 227 | logging.info("Predictions are stored.") 228 | -------------------------------------------------------------------------------- /src/tfx_pipelines/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX pipeline configurations.""" 15 | 16 | import os 17 | from tfx import v1 as tfx 18 | 19 | PROJECT = os.getenv("PROJECT", "") 20 | REGION = os.getenv("REGION", "") 21 | GCS_LOCATION = os.getenv("GCS_LOCATION", "") 22 | 23 | ARTIFACT_STORE_URI = os.path.join(GCS_LOCATION, "tfx_artifacts") 24 | MODEL_REGISTRY_URI = os.getenv( 25 | "MODEL_REGISTRY_URI", 26 | os.path.join(GCS_LOCATION, "model_registry"), 27 | ) 28 | 29 | DATASET_DISPLAY_NAME = os.getenv("DATASET_DISPLAY_NAME", "chicago-taxi-tips") 30 | MODEL_DISPLAY_NAME = os.getenv( 31 | "MODEL_DISPLAY_NAME", f"{DATASET_DISPLAY_NAME}-classifier" 32 | ) 33 | PIPELINE_NAME = os.getenv("PIPELINE_NAME", f"{MODEL_DISPLAY_NAME}-train-pipeline") 34 | 35 | ML_USE_COLUMN = "ml_use" 36 | EXCLUDE_COLUMNS = ",".join(["trip_start_timestamp"]) 37 | TRAIN_LIMIT = os.getenv("TRAIN_LIMIT", "0") 38 | TEST_LIMIT = os.getenv("TEST_LIMIT", "0") 39 | SERVE_LIMIT = os.getenv("SERVE_LIMIT", "0") 40 | 41 | NUM_TRAIN_SPLITS = os.getenv("NUM_TRAIN_SPLITS", "4") 42 | NUM_EVAL_SPLITS = os.getenv("NUM_EVAL_SPLITS", "1") 43 | ACCURACY_THRESHOLD = os.getenv("ACCURACY_THRESHOLD", "0.8") 44 | 45 | USE_KFP_SA = os.getenv("USE_KFP_SA", "False") 46 | 47 | TFX_IMAGE_URI = os.getenv( 48 | "TFX_IMAGE_URI", f"gcr.io/{PROJECT}/tfx-{DATASET_DISPLAY_NAME}:latest" 49 | ) 50 | 51 | BEAM_RUNNER = os.getenv("BEAM_RUNNER", "DirectRunner") 52 | BEAM_DIRECT_PIPELINE_ARGS = [ 53 | f"--project={PROJECT}", 54 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}", 55 | ] 56 | BEAM_DATAFLOW_PIPELINE_ARGS = [ 57 | f"--project={PROJECT}", 58 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}", 59 | f"--region={REGION}", 60 | f"--runner={BEAM_RUNNER}", 61 | ] 62 | 63 | TRAINING_RUNNER = os.getenv("TRAINING_RUNNER", "local") 64 | VERTEX_TRAINING_ARGS = { 65 | 'project': PROJECT, 66 | 'worker_pool_specs': [{ 67 | 'machine_spec': { 68 | 'machine_type': 'n1-standard-4', 69 | # 'accelerator_type': 'NVIDIA_TESLA_K80', 70 | # 'accelerator_count': 1 
71 | }, 72 | 'replica_count': 1, 73 | 'container_spec': { 74 | 'image_uri': TFX_IMAGE_URI, 75 | }, 76 | }], 77 | } 78 | VERTEX_TRAINING_CONFIG = { 79 | tfx.extensions.google_cloud_ai_platform.ENABLE_UCAIP_KEY: True, 80 | tfx.extensions.google_cloud_ai_platform.UCAIP_REGION_KEY: REGION, 81 | tfx.extensions.google_cloud_ai_platform.TRAINING_ARGS_KEY: VERTEX_TRAINING_ARGS, 82 | 'use_gpu': False, 83 | } 84 | 85 | SERVING_RUNTIME = os.getenv("SERVING_RUNTIME", "tf2-cpu.2-5") 86 | SERVING_IMAGE_URI = f"us-docker.pkg.dev/vertex-ai/prediction/{SERVING_RUNTIME}:latest" 87 | 88 | BATCH_PREDICTION_BQ_DATASET_NAME = os.getenv( 89 | "BATCH_PREDICTION_BQ_DATASET_NAME", "playground_us" 90 | ) 91 | BATCH_PREDICTION_BQ_TABLE_NAME = os.getenv( 92 | "BATCH_PREDICTION_BQ_TABLE_NAME", "chicago_taxitrips_prep" 93 | ) 94 | BATCH_PREDICTION_BEAM_ARGS = { 95 | "runner": f"{BEAM_RUNNER}", 96 | "temporary_dir": os.path.join(GCS_LOCATION, "temp"), 97 | "gcs_location": os.path.join(GCS_LOCATION, "temp"), 98 | "project": PROJECT, 99 | "region": REGION, 100 | "setup_file": "./setup.py", 101 | } 102 | BATCH_PREDICTION_JOB_RESOURCES = { 103 | "machine_type": "n1-standard-2", 104 | #'accelerator_count': 1, 105 | #'accelerator_type': 'NVIDIA_TESLA_T4' 106 | "starting_replica_count": 1, 107 | "max_replica_count": 10, 108 | } 109 | DATASTORE_PREDICTION_KIND = f"{MODEL_DISPLAY_NAME}-predictions" 110 | 111 | ENABLE_CACHE = os.getenv("ENABLE_CACHE", "0") 112 | UPLOAD_MODEL = os.getenv("UPLOAD_MODEL", "1") 113 | 114 | os.environ["PROJECT"] = PROJECT 115 | os.environ["PIPELINE_NAME"] = PIPELINE_NAME 116 | os.environ["TFX_IMAGE_URI"] = TFX_IMAGE_URI 117 | os.environ["MODEL_REGISTRY_URI"] = MODEL_REGISTRY_URI 118 | -------------------------------------------------------------------------------- /src/tfx_pipelines/prediction_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX prediction pipeline definition.""" 15 | 16 | import os 17 | import sys 18 | import json 19 | import logging 20 | 21 | from tfx.orchestration import pipeline, data_types 22 | from ml_metadata.proto import metadata_store_pb2 23 | 24 | SCRIPT_DIR = os.path.dirname( 25 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 26 | ) 27 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 28 | 29 | from src.tfx_pipelines import config 30 | from src.tfx_pipelines import components as custom_components 31 | from src.common import datasource_utils 32 | 33 | 34 | def create_pipeline( 35 | pipeline_root: str, 36 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None, 37 | ): 38 | 39 | # Get source query. 
40 | sql_query = datasource_utils.get_serving_source_query( 41 | bq_dataset_name=config.BATCH_PREDICTION_BQ_DATASET_NAME, 42 | bq_table_name=config.BATCH_PREDICTION_BQ_TABLE_NAME, 43 | limit=int(config.SERVE_LIMIT), 44 | ) 45 | 46 | bigquery_data_gen = custom_components.bigquery_data_gen( 47 | sql_query=sql_query, 48 | output_data_format="jsonl", 49 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS), 50 | ) 51 | 52 | vertex_batch_prediction = custom_components.vertex_batch_prediction( 53 | project=config.PROJECT, 54 | region=config.REGION, 55 | model_display_name=config.MODEL_DISPLAY_NAME, 56 | instances_format="jsonl", 57 | predictions_format="jsonl", 58 | job_resources=json.dumps(config.BATCH_PREDICTION_JOB_RESOURCES), 59 | serving_dataset=bigquery_data_gen.outputs["serving_dataset"], 60 | ) 61 | 62 | datastore_prediction_writer = custom_components.datastore_prediction_writer( 63 | datastore_kind=config.DATASTORE_PREDICTION_KIND, 64 | predictions_format="jsonl", 65 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS), 66 | prediction_results=vertex_batch_prediction.outputs["prediction_results"], 67 | ) 68 | 69 | pipeline_components = [ 70 | bigquery_data_gen, 71 | vertex_batch_prediction, 72 | datastore_prediction_writer, 73 | ] 74 | 75 | logging.info( 76 | f"Pipeline components: {[component.id for component in pipeline_components]}" 77 | ) 78 | 79 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS 80 | if config.BEAM_RUNNER == "DataflowRunner": 81 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS 82 | 83 | logging.info(f"Beam pipeline args: {beam_pipeline_args}") 84 | 85 | return pipeline.Pipeline( 86 | pipeline_name=config.PIPELINE_NAME, 87 | pipeline_root=pipeline_root, 88 | components=pipeline_components, 89 | beam_pipeline_args=beam_pipeline_args, 90 | metadata_connection_config=metadata_connection_config, 91 | enable_cache=int(config.ENABLE_CACHE), 92 | ) 93 | -------------------------------------------------------------------------------- /src/tfx_pipelines/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Define KubeflowV2DagRunner to run the training pipeline using Managed Pipelines.""" 15 | 16 | 17 | import os 18 | from kfp.v2.google.client import AIPlatformClient 19 | from tfx.orchestration import data_types 20 | from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner 21 | 22 | 23 | from src.tfx_pipelines import config, training_pipeline, prediction_pipeline 24 | from src.model_training import defaults 25 | 26 | 27 | def compile_training_pipeline(pipeline_definition_file): 28 | 29 | pipeline_root = os.path.join( 30 | config.ARTIFACT_STORE_URI, 31 | config.PIPELINE_NAME, 32 | ) 33 | 34 | managed_pipeline = training_pipeline.create_pipeline( 35 | pipeline_root=pipeline_root, 36 | num_epochs=data_types.RuntimeParameter( 37 | name="num_epochs", 38 | default=defaults.NUM_EPOCHS, 39 | ptype=int, 40 | ), 41 | batch_size=data_types.RuntimeParameter( 42 | name="batch_size", 43 | default=defaults.BATCH_SIZE, 44 | ptype=int, 45 | ), 46 | learning_rate=data_types.RuntimeParameter( 47 | name="learning_rate", 48 | default=defaults.LEARNING_RATE, 49 | ptype=float, 50 | ), 51 | hidden_units=data_types.RuntimeParameter( 52 | name="hidden_units", 53 | default=",".join(str(u) for u in defaults.HIDDEN_UNITS), 54 | ptype=str, 55 | ), 56 | ) 57 | 58 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner( 59 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( 60 | default_image=config.TFX_IMAGE_URI 61 | ), 62 | output_filename=pipeline_definition_file, 63 | ) 64 | 65 | return runner.run(managed_pipeline, write_out=True) 66 | 67 | 68 | def compile_prediction_pipeline(pipeline_definition_file): 69 | 70 | pipeline_root = os.path.join( 71 | config.ARTIFACT_STORE_URI, 72 | config.PIPELINE_NAME, 73 | ) 74 | 75 | managed_pipeline = prediction_pipeline.create_pipeline( 76 | pipeline_root=pipeline_root, 77 | ) 78 | 79 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner( 80 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( 81 | default_image=config.TFX_IMAGE_URI 82 | ), 83 | output_filename=pipeline_definition_file, 84 | ) 85 | 86 | return runner.run(managed_pipeline, write_out=True) 87 | 88 | 89 | def submit_pipeline(pipeline_definition_file): 90 | 91 | pipeline_client = AIPlatformClient(project_id=config.PROJECT, region=config.REGION) 92 | pipeline_client.create_run_from_job_spec(pipeline_definition_file) 93 | -------------------------------------------------------------------------------- /src/tfx_pipelines/training_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """TFX training pipeline definition.""" 15 | 16 | import os 17 | import sys 18 | import logging 19 | import json 20 | 21 | import tensorflow_model_analysis as tfma 22 | 23 | from ml_metadata.proto import metadata_store_pb2 24 | from tfx.proto import example_gen_pb2, transform_pb2, pusher_pb2 25 | from tfx.types import Channel, standard_artifacts 26 | from tfx.orchestration import pipeline, data_types 27 | from tfx.dsl.components.common.importer import Importer 28 | from tfx.dsl.components.common.resolver import Resolver 29 | from tfx.dsl.experimental import latest_artifacts_resolver 30 | from tfx.dsl.experimental import latest_blessed_model_resolver 31 | from tfx.v1.extensions.google_cloud_big_query import BigQueryExampleGen 32 | from tfx.v1.extensions.google_cloud_ai_platform import Trainer as VertexTrainer 33 | from tfx.v1.components import ( 34 | StatisticsGen, 35 | ExampleValidator, 36 | Transform, 37 | Trainer, 38 | Evaluator, 39 | Pusher, 40 | ) 41 | 42 | SCRIPT_DIR = os.path.dirname( 43 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 44 | ) 45 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 46 | 47 | from src.tfx_pipelines import config 48 | from src.tfx_pipelines import components as custom_components 49 | from src.common import features, datasource_utils 50 | 51 | RAW_SCHEMA_DIR = "src/raw_schema" 52 | TRANSFORM_MODULE_FILE = "src/preprocessing/transformations.py" 53 | TRAIN_MODULE_FILE = "src/model_training/runner.py" 54 | 55 | 56 | def create_pipeline( 57 | pipeline_root: str, 58 | num_epochs: data_types.RuntimeParameter, 59 | batch_size: data_types.RuntimeParameter, 60 | learning_rate: data_types.RuntimeParameter, 61 | hidden_units: data_types.RuntimeParameter, 62 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None, 63 | ): 64 | 65 | # Hyperparameter generation. 66 | hyperparams_gen = custom_components.hyperparameters_gen( 67 | num_epochs=num_epochs, 68 | batch_size=batch_size, 69 | learning_rate=learning_rate, 70 | hidden_units=hidden_units, 71 | ).with_id("HyperparamsGen") 72 | 73 | # Get train source query. 74 | train_sql_query = datasource_utils.get_training_source_query( 75 | config.PROJECT, 76 | config.REGION, 77 | config.DATASET_DISPLAY_NAME, 78 | ml_use="UNASSIGNED", 79 | limit=int(config.TRAIN_LIMIT), 80 | ) 81 | 82 | train_output_config = example_gen_pb2.Output( 83 | split_config=example_gen_pb2.SplitConfig( 84 | splits=[ 85 | example_gen_pb2.SplitConfig.Split( 86 | name="train", hash_buckets=int(config.NUM_TRAIN_SPLITS) 87 | ), 88 | example_gen_pb2.SplitConfig.Split( 89 | name="eval", hash_buckets=int(config.NUM_EVAL_SPLITS) 90 | ), 91 | ] 92 | ) 93 | ) 94 | 95 | # Train example generation. 96 | train_example_gen = BigQueryExampleGen( 97 | query=train_sql_query, 98 | output_config=train_output_config, 99 | ).with_id("TrainDataGen") 100 | 101 | # Get test source query. 102 | test_sql_query = datasource_utils.get_training_source_query( 103 | config.PROJECT, 104 | config.REGION, 105 | config.DATASET_DISPLAY_NAME, 106 | ml_use="TEST", 107 | limit=int(config.TEST_LIMIT), 108 | ) 109 | 110 | test_output_config = example_gen_pb2.Output( 111 | split_config=example_gen_pb2.SplitConfig( 112 | splits=[ 113 | example_gen_pb2.SplitConfig.Split(name="test", hash_buckets=1), 114 | ] 115 | ) 116 | ) 117 | 118 | # Test example generation. 119 | test_example_gen = BigQueryExampleGen( 120 | query=test_sql_query, 121 | output_config=test_output_config, 122 | ).with_id("TestDataGen") 123 | 124 | # Schema importer. 
125 | schema_importer = Importer( 126 | source_uri=RAW_SCHEMA_DIR, 127 | artifact_type=standard_artifacts.Schema, 128 | ).with_id("SchemaImporter") 129 | 130 | # Statistics generation. 131 | statistics_gen = StatisticsGen(examples=train_example_gen.outputs["examples"]).with_id( 132 | "StatisticsGen" 133 | ) 134 | 135 | # Example validation. 136 | example_validator = ExampleValidator( 137 | statistics=statistics_gen.outputs["statistics"], 138 | schema=schema_importer.outputs["result"], 139 | ).with_id("ExampleValidator") 140 | 141 | # Data transformation. 142 | transform = Transform( 143 | examples=train_example_gen.outputs["examples"], 144 | schema=schema_importer.outputs["result"], 145 | module_file=TRANSFORM_MODULE_FILE, 146 | # This is a temporary workaround to run on Dataflow. 147 | force_tf_compat_v1=config.BEAM_RUNNER == "DataflowRunner", 148 | splits_config=transform_pb2.SplitsConfig( 149 | analyze=["train"], transform=["train", "eval"] 150 | ), 151 | ).with_id("DataTransformer") 152 | 153 | # Add dependency from example_validator to transform. 154 | transform.add_upstream_node(example_validator) 155 | 156 | # Get the latest model to warmstart 157 | warmstart_model_resolver = Resolver( 158 | strategy_class=latest_artifacts_resolver.LatestArtifactsResolver, 159 | latest_model=Channel(type=standard_artifacts.Model), 160 | ).with_id("WarmstartModelResolver") 161 | 162 | # Model training. 163 | trainer = Trainer( 164 | module_file=TRAIN_MODULE_FILE, 165 | examples=transform.outputs["transformed_examples"], 166 | schema=schema_importer.outputs["result"], 167 | base_model=warmstart_model_resolver.outputs["latest_model"], 168 | transform_graph=transform.outputs["transform_graph"], 169 | hyperparameters=hyperparams_gen.outputs["hyperparameters"], 170 | ).with_id("ModelTrainer") 171 | 172 | if config.TRAINING_RUNNER == "vertex": 173 | trainer = VertexTrainer( 174 | module_file=TRAIN_MODULE_FILE, 175 | examples=transform.outputs["transformed_examples"], 176 | schema=schema_importer.outputs["result"], 177 | base_model=warmstart_model_resolver.outputs["latest_model"], 178 | transform_graph=transform.outputs["transform_graph"], 179 | hyperparameters=hyperparams_gen.outputs["hyperparameters"], 180 | custom_config=config.VERTEX_TRAINING_CONFIG 181 | ).with_id("ModelTrainer") 182 | 183 | 184 | # Get the latest blessed model (baseline) for model validation. 185 | baseline_model_resolver = Resolver( 186 | strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver, 187 | model=Channel(type=standard_artifacts.Model), 188 | model_blessing=Channel(type=standard_artifacts.ModelBlessing), 189 | ).with_id("BaselineModelResolver") 190 | 191 | # Prepare evaluation config. 192 | eval_config = tfma.EvalConfig( 193 | model_specs=[ 194 | tfma.ModelSpec( 195 | signature_name="serving_tf_example", 196 | label_key=features.TARGET_FEATURE_NAME, 197 | prediction_key="probabilities", 198 | ) 199 | ], 200 | slicing_specs=[ 201 | tfma.SlicingSpec(), 202 | ], 203 | metrics_specs=[ 204 | tfma.MetricsSpec( 205 | metrics=[ 206 | tfma.MetricConfig(class_name="ExampleCount"), 207 | tfma.MetricConfig( 208 | class_name="BinaryAccuracy", 209 | threshold=tfma.MetricThreshold( 210 | value_threshold=tfma.GenericValueThreshold( 211 | lower_bound={"value": float(config.ACCURACY_THRESHOLD)} 212 | ), 213 | # Change threshold will be ignored if there is no 214 | # baseline model resolved from MLMD (first run). 
215 | change_threshold=tfma.GenericChangeThreshold( 216 | direction=tfma.MetricDirection.HIGHER_IS_BETTER, 217 | absolute={"value": -1e-10}, 218 | ), 219 | ), 220 | ), 221 | ] 222 | ) 223 | ], 224 | ) 225 | 226 | # Model evaluation. 227 | evaluator = Evaluator( 228 | examples=test_example_gen.outputs["examples"], 229 | example_splits=["test"], 230 | model=trainer.outputs["model"], 231 | baseline_model=baseline_model_resolver.outputs["model"], 232 | eval_config=eval_config, 233 | schema=schema_importer.outputs["result"], 234 | ).with_id("ModelEvaluator") 235 | 236 | exported_model_location = os.path.join( 237 | config.MODEL_REGISTRY_URI, config.MODEL_DISPLAY_NAME 238 | ) 239 | push_destination = pusher_pb2.PushDestination( 240 | filesystem=pusher_pb2.PushDestination.Filesystem( 241 | base_directory=exported_model_location 242 | ) 243 | ) 244 | 245 | # Push custom model to model registry. 246 | pusher = Pusher( 247 | model=trainer.outputs["model"], 248 | model_blessing=evaluator.outputs["blessing"], 249 | push_destination=push_destination, 250 | ).with_id("ModelPusher") 251 | 252 | # Upload custom trained model to Vertex AI. 253 | labels = { 254 | "dataset_name": config.DATASET_DISPLAY_NAME, 255 | "pipeline_name": config.PIPELINE_NAME, 256 | "pipeline_root": pipeline_root 257 | } 258 | labels = json.dumps(labels) 259 | explanation_config = json.dumps(features.generate_explanation_config()) 260 | 261 | vertex_model_uploader = custom_components.vertex_model_uploader( 262 | project=config.PROJECT, 263 | region=config.REGION, 264 | model_display_name=config.MODEL_DISPLAY_NAME, 265 | pushed_model_location=exported_model_location, 266 | serving_image_uri=config.SERVING_IMAGE_URI, 267 | model_blessing=evaluator.outputs["blessing"], 268 | explanation_config=explanation_config, 269 | labels=labels 270 | ).with_id("VertexUploader") 271 | 272 | pipeline_components = [ 273 | hyperparams_gen, 274 | train_example_gen, 275 | test_example_gen, 276 | statistics_gen, 277 | schema_importer, 278 | example_validator, 279 | transform, 280 | warmstart_model_resolver, 281 | trainer, 282 | baseline_model_resolver, 283 | evaluator, 284 | pusher, 285 | ] 286 | 287 | if int(config.UPLOAD_MODEL): 288 | pipeline_components.append(vertex_model_uploader) 289 | # Add dependency from pusher to aip_model_uploader. 290 | vertex_model_uploader.add_upstream_node(pusher) 291 | 292 | logging.info( 293 | f"Pipeline components: {[component.id for component in pipeline_components]}" 294 | ) 295 | 296 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS 297 | if config.BEAM_RUNNER == "DataflowRunner": 298 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS 299 | 300 | logging.info(f"Beam pipeline args: {beam_pipeline_args}") 301 | 302 | return pipeline.Pipeline( 303 | pipeline_name=config.PIPELINE_NAME, 304 | pipeline_root=pipeline_root, 305 | components=pipeline_components, 306 | beam_pipeline_args=beam_pipeline_args, 307 | metadata_connection_config=metadata_connection_config, 308 | enable_cache=int(config.ENABLE_CACHE), 309 | ) 310 | --------------------------------------------------------------------------------
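Usage sketch: the snippet below illustrates one way the compile-and-submit helpers defined in src/tfx_pipelines/runner.py above might be driven from a notebook or shell. It is a minimal illustration rather than part of the repository source: the project id, region, bucket, and the training-pipeline.json file name are placeholder values, and it assumes the environment variables read by src/tfx_pipelines/config.py are exported before that module is imported.

import os

# Placeholder environment values; config.py reads these at import time,
# so they must be set before importing anything from src.tfx_pipelines.
os.environ["PROJECT"] = "my-gcp-project"
os.environ["REGION"] = "us-central1"
os.environ["GCS_LOCATION"] = "gs://my-bucket/chicago-taxi-tips"

from src.tfx_pipelines import runner

# Compile the TFX training pipeline into a Vertex AI pipeline job spec,
# then submit it using the AIPlatformClient wrapper defined in runner.py.
pipeline_definition_file = "training-pipeline.json"
runner.compile_training_pipeline(pipeline_definition_file)
runner.submit_pipeline(pipeline_definition_file)

The same pattern applies to compile_prediction_pipeline for the batch prediction pipeline defined in src/tfx_pipelines/prediction_pipeline.py.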