├── .gitignore
├── 01-dataset-management.ipynb
├── 02-experimentation.ipynb
├── 03-training-formalization.ipynb
├── 04-pipeline-deployment.ipynb
├── 05-continuous-training.ipynb
├── 06-model-deployment.ipynb
├── 07-prediction-serving.ipynb
├── 08-model-monitoring.ipynb
├── Dockerfile
├── LICENSE
├── README.md
├── build
│   ├── Dockerfile
│   ├── model-deployment.yaml
│   ├── pipeline-deployment.yaml
│   ├── serving_resources_spec.json
│   └── utils.py
├── mlops.png
├── provision
│   ├── README.md
│   └── terraform
│       ├── gcs-bucket.tf
│       ├── main.tf
│       ├── notebook-instance.tf
│       ├── service-accounts.tf
│       ├── services.tf
│       ├── terraform.tfvars
│       └── variables.tf
├── requirements.txt
├── setup.py
└── src
    ├── __init__.py
    ├── common
    │   ├── __init__.py
    │   ├── datasource_utils.py
    │   └── features.py
    ├── model_training
    │   ├── __init__.py
    │   ├── data.py
    │   ├── defaults.py
    │   ├── exporter.py
    │   ├── model.py
    │   ├── runner.py
    │   ├── task.py
    │   └── trainer.py
    ├── pipeline_triggering
    │   ├── __init__.py
    │   ├── main.py
    │   └── requirements.txt
    ├── preprocessing
    │   ├── __init__.py
    │   ├── etl.py
    │   └── transformations.py
    ├── raw_schema
    │   └── schema.pbtxt
    ├── tests
    │   ├── __init__.py
    │   ├── datasource_utils_tests.py
    │   ├── etl_tests.py
    │   ├── model_deployment_tests.py
    │   ├── model_tests.py
    │   └── pipeline_deployment_tests.py
    └── tfx_pipelines
        ├── __init__.py
        ├── components.py
        ├── config.py
        ├── prediction_pipeline.py
        ├── runner.py
        └── training_pipeline.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | develop-eggs/
12 | dist/
13 | downloads/
14 | eggs/
15 | .eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | wheels/
22 | pip-wheel-metadata/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 | .idea/
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | _workspace/
132 | *.tar.gz
133 | .egg-info/
134 | *.whl
135 | mlpipeline-ui-metadata.json
136 | *.csv
137 | *.sqllite
138 | model.png
139 | *-pipeline.json
140 | *.DS_Store
--------------------------------------------------------------------------------
/01-dataset-management.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "e8ba7f4f",
6 | "metadata": {},
7 | "source": [
8 | "# 01 - Data Analysis and Preparation\n",
9 | "\n",
10 | "This notebook covers the following tasks:\n",
11 | "\n",
12 | "1. Perform exploratory data analysis and visualization.\n",
13 | "2. Prepare the data for the ML task in BigQuery.\n",
14 | "3. Generate and fix a ` TFDV schema` for the source data.\n",
15 | "4. Create a `Vertex Dataset resource` dataset.\n"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "id": "b481a247",
21 | "metadata": {},
22 | "source": [
23 | "## Dataset\n",
24 | "\n",
25 | "The [Chicago Taxi Trips](https://pantheon.corp.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips) dataset is one of [public datasets hosted with BigQuery](https://cloud.google.com/bigquery/public-data/), which includes taxi trips from 2013 to the present, reported to the City of Chicago in its role as a regulatory agency. The `taxi_trips` table size is 70.72 GB and includes more than 195 million records. The dataset includes information about the trips, like pickup and dropoff datetime and location, passengers count, miles travelled, and trip toll. \n",
26 | "\n",
27 | "The ML task is to predict whether a given trip will result in a tip > 20%."
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "id": "4fedd0ac",
33 | "metadata": {},
34 | "source": [
35 | "## Setup"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "id": "b25967c9",
41 | "metadata": {},
42 | "source": [
43 | "### Import libraries"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "id": "fa2cf3f1",
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import os\n",
54 | "import pandas as pd\n",
55 | "import tensorflow as tf\n",
56 | "import tensorflow_data_validation as tfdv\n",
57 | "from google.cloud import bigquery\n",
58 | "import matplotlib.pyplot as plt\n",
59 | "\n",
60 | "from google.cloud import aiplatform as vertex_ai"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "id": "9bd0ee37",
66 | "metadata": {},
67 | "source": [
68 | "### Setup Google Cloud project"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "id": "c7e4712e",
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "PROJECT = '[your-project-id]' # Change to your project id.\n",
79 | "REGION = 'us-central1' # Change to your region.\n",
80 | "\n",
81 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n",
82 | " # Get your GCP project id from gcloud\n",
83 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
84 | " PROJECT = shell_output[0]\n",
85 | " \n",
86 | "print(\"Project ID:\", PROJECT)\n",
87 | "print(\"Region:\", REGION)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "id": "384a817b",
93 | "metadata": {},
94 | "source": [
95 | "### Set configurations"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "id": "b71d0738",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "BQ_DATASET_NAME = 'playground_us' # Change to your BQ dataset name.\n",
106 | "BQ_TABLE_NAME = 'chicago_taxitrips_prep'\n",
107 | "BQ_LOCATION = 'US'\n",
108 | "\n",
109 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
110 | "\n",
111 | "RAW_SCHEMA_DIR = 'src/raw_schema'"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "id": "39395512",
117 | "metadata": {},
118 | "source": [
119 | "## 1. Explore the data in BigQuery"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "id": "9e4300d3",
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "%%bigquery data\n",
130 | "\n",
131 | "SELECT \n",
132 | " CAST(EXTRACT(DAYOFWEEK FROM trip_start_timestamp) AS string) AS trip_dayofweek, \n",
133 | " FORMAT_DATE('%A',cast(trip_start_timestamp as date)) AS trip_dayname,\n",
134 | " COUNT(*) as trip_count,\n",
135 | "FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n",
136 | "WHERE\n",
137 | " EXTRACT(YEAR FROM trip_start_timestamp) = 2015 \n",
138 | "GROUP BY\n",
139 | " trip_dayofweek,\n",
140 | " trip_dayname\n",
141 | "ORDER BY\n",
142 | " trip_dayofweek\n",
143 | ";"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "id": "699804c5",
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "data"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "id": "a7f2447e",
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "data.plot(kind='bar', x='trip_dayname', y='trip_count')"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "id": "a7782c69",
169 | "metadata": {},
170 | "source": [
171 | "## 2. Create data for the ML task\n",
172 | "\n",
173 | "We add a `ML_use` column for pre-splitting the data, where 80% of the datsa items are set to `UNASSIGNED` while the other 20% is set to `TEST`.\n",
174 | "\n",
175 | "This column is used during training to split the dataset for training and test.\n",
176 | "\n",
177 | "In the training phase, the `UNASSIGNED` are split into `train` and `eval`. The `TEST` split is will be used for the final model validation."
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "id": "7987d132",
183 | "metadata": {},
184 | "source": [
185 | "### Create destination BigQuery dataset"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "56a7f6d6",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "!bq --location=$BQ_LOCATION mk -d \\\n",
196 | "$PROJECT:$BQ_DATASET_NAME"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "832f99ba",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "sample_size = 1000000\n",
207 | "year = 2020"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "id": "1b19789f",
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "sql_script = '''\n",
218 | "CREATE OR REPLACE TABLE `@PROJECT.@DATASET.@TABLE` \n",
219 | "AS (\n",
220 | " WITH\n",
221 | " taxitrips AS (\n",
222 | " SELECT\n",
223 | " trip_start_timestamp,\n",
224 | " trip_seconds,\n",
225 | " trip_miles,\n",
226 | " payment_type,\n",
227 | " pickup_longitude,\n",
228 | " pickup_latitude,\n",
229 | " dropoff_longitude,\n",
230 | " dropoff_latitude,\n",
231 | " tips,\n",
232 | " fare\n",
233 | " FROM\n",
234 | " `bigquery-public-data.chicago_taxi_trips.taxi_trips`\n",
235 | " WHERE 1=1 \n",
236 | " AND pickup_longitude IS NOT NULL\n",
237 | " AND pickup_latitude IS NOT NULL\n",
238 | " AND dropoff_longitude IS NOT NULL\n",
239 | " AND dropoff_latitude IS NOT NULL\n",
240 | " AND trip_miles > 0\n",
241 | " AND trip_seconds > 0\n",
242 | " AND fare > 0\n",
243 | " AND EXTRACT(YEAR FROM trip_start_timestamp) = @YEAR\n",
244 | " )\n",
245 | "\n",
246 | " SELECT\n",
247 | " trip_start_timestamp,\n",
248 | " EXTRACT(MONTH from trip_start_timestamp) as trip_month,\n",
249 | " EXTRACT(DAY from trip_start_timestamp) as trip_day,\n",
250 | " EXTRACT(DAYOFWEEK from trip_start_timestamp) as trip_day_of_week,\n",
251 | " EXTRACT(HOUR from trip_start_timestamp) as trip_hour,\n",
252 | " trip_seconds,\n",
253 | " trip_miles,\n",
254 | " payment_type,\n",
255 | " ST_AsText(\n",
256 | " ST_SnapToGrid(ST_GeogPoint(pickup_longitude, pickup_latitude), 0.1)\n",
257 | " ) AS pickup_grid,\n",
258 | " ST_AsText(\n",
259 | " ST_SnapToGrid(ST_GeogPoint(dropoff_longitude, dropoff_latitude), 0.1)\n",
260 | " ) AS dropoff_grid,\n",
261 | " ST_Distance(\n",
262 | " ST_GeogPoint(pickup_longitude, pickup_latitude), \n",
263 | " ST_GeogPoint(dropoff_longitude, dropoff_latitude)\n",
264 | " ) AS euclidean,\n",
265 | " CONCAT(\n",
266 | " ST_AsText(ST_SnapToGrid(ST_GeogPoint(pickup_longitude,\n",
267 | " pickup_latitude), 0.1)), \n",
268 | " ST_AsText(ST_SnapToGrid(ST_GeogPoint(dropoff_longitude,\n",
269 | " dropoff_latitude), 0.1))\n",
270 | " ) AS loc_cross,\n",
271 | " IF((tips/fare >= 0.2), 1, 0) AS tip_bin,\n",
272 | " IF(RAND() <= 0.8, 'UNASSIGNED', 'TEST') AS ML_use\n",
273 | " FROM\n",
274 | " taxitrips\n",
275 | " LIMIT @LIMIT\n",
276 | ")\n",
277 | "'''"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "id": "6f1d2837",
284 | "metadata": {},
285 | "outputs": [],
286 | "source": [
287 | "sql_script = sql_script.replace(\n",
288 | " '@PROJECT', PROJECT).replace(\n",
289 | " '@DATASET', BQ_DATASET_NAME).replace(\n",
290 | " '@TABLE', BQ_TABLE_NAME).replace(\n",
291 | " '@YEAR', str(year)).replace(\n",
292 | " '@LIMIT', str(sample_size))"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "id": "27acb6b5",
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "print(sql_script)"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "id": "88f31e8c",
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "bq_client = bigquery.Client(project=PROJECT, location=BQ_LOCATION)\n",
313 | "job = bq_client.query(sql_script)\n",
314 | "_ = job.result()"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "id": "b32eea6f",
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "%%bigquery --project {PROJECT}\n",
325 | "\n",
326 | "SELECT ML_use, COUNT(*)\n",
327 | "FROM playground_us.chicago_taxitrips_prep # Change to your BQ dataset and table names.\n",
328 | "GROUP BY ML_use"
329 | ]
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "id": "7fcc75fc",
334 | "metadata": {},
335 | "source": [
336 | "### Load a sample data to a Pandas DataFrame"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "id": "f252a846",
343 | "metadata": {},
344 | "outputs": [],
345 | "source": [
346 | "%%bigquery sample_data --project {PROJECT}\n",
347 | "\n",
348 | "SELECT * EXCEPT (trip_start_timestamp, ML_use)\n",
349 | "FROM playground_us.chicago_taxitrips_prep # Change to your BQ dataset and table names."
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "id": "798809d2",
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "sample_data.head().T"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "id": "23595838",
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "sample_data.tip_bin.value_counts()"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "id": "3d06bb64",
376 | "metadata": {},
377 | "outputs": [],
378 | "source": [
379 | "sample_data.euclidean.hist()"
380 | ]
381 | },
382 | {
383 | "cell_type": "markdown",
384 | "id": "10aae180",
385 | "metadata": {},
386 | "source": [
387 | "## 3. Generate raw data schema\n",
388 | "\n",
389 | "The [TensorFlow Data Validation (TFDV)](https://www.tensorflow.org/tfx/data_validation/get_started) data schema will be used in:\n",
390 | "1. Identify the raw data types and shapes in the data transformation.\n",
391 | "2. Create the serving input signature for the custom model.\n",
392 | "3. Validate the new raw training data in the TFX pipeline."
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "id": "4087d5fa",
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "stats = tfdv.generate_statistics_from_dataframe(\n",
403 | " dataframe=sample_data,\n",
404 | " stats_options=tfdv.StatsOptions(\n",
405 | " label_feature='tip_bin',\n",
406 | " weight_feature=None,\n",
407 | " sample_rate=1,\n",
408 | " num_top_values=50\n",
409 | " )\n",
410 | ")"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "id": "091fbd77",
417 | "metadata": {},
418 | "outputs": [],
419 | "source": [
420 | "tfdv.visualize_statistics(stats)"
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "execution_count": null,
426 | "id": "d251e09b",
427 | "metadata": {},
428 | "outputs": [],
429 | "source": [
430 | "schema = tfdv.infer_schema(statistics=stats)\n",
431 | "tfdv.display_schema(schema=schema)"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "id": "502c49f1",
438 | "metadata": {},
439 | "outputs": [],
440 | "source": [
441 | "raw_schema_location = os.path.join(RAW_SCHEMA_DIR, 'schema.pbtxt')\n",
442 | "tfdv.write_schema_text(schema, raw_schema_location)"
443 | ]
444 | },
445 | {
446 | "cell_type": "markdown",
447 | "id": "59df0723",
448 | "metadata": {},
449 | "source": [
450 | "## 4. Create Vertex Dataset resource"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "id": "90d9b605",
457 | "metadata": {},
458 | "outputs": [],
459 | "source": [
460 | "vertex_ai.init(\n",
461 | " project=PROJECT,\n",
462 | " location=REGION\n",
463 | ")"
464 | ]
465 | },
466 | {
467 | "cell_type": "markdown",
468 | "id": "464ab920",
469 | "metadata": {},
470 | "source": [
471 | "### Create the dataset resource"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "id": "a0a1707a",
478 | "metadata": {},
479 | "outputs": [],
480 | "source": [
481 | "bq_uri = f\"bq://{PROJECT}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}\"\n",
482 | "\n",
483 | "dataset = vertex_ai.TabularDataset.create(\n",
484 | " display_name=DATASET_DISPLAY_NAME, bq_source=bq_uri)\n",
485 | "\n",
486 | "dataset.gca_resource"
487 | ]
488 | },
489 | {
490 | "cell_type": "markdown",
491 | "id": "c761fadb",
492 | "metadata": {},
493 | "source": [
494 | "### Get the dataset resource\n",
495 | "\n",
496 | "The dataset resource is retrieved by display name. Because multiple datasets can have the same display name, we retrieve the most recent updated one."
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": null,
502 | "id": "d78b7f4d",
503 | "metadata": {},
504 | "outputs": [],
505 | "source": [
506 | "dataset = vertex_ai.TabularDataset.list(\n",
507 | " filter=f\"display_name={DATASET_DISPLAY_NAME}\", \n",
508 | " order_by=\"update_time\")[-1]\n",
509 | "\n",
510 | "print(\"Dataset resource name:\", dataset.resource_name)\n",
511 | "print(\"Dataset BigQuery source:\", dataset.gca_resource.metadata['inputConfig']['bigquerySource']['uri'])"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "id": "569249de",
518 | "metadata": {},
519 | "outputs": [],
520 | "source": []
521 | }
522 | ],
523 | "metadata": {
524 | "environment": {
525 | "name": "common-cpu.m79",
526 | "type": "gcloud",
527 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m79"
528 | },
529 | "kernelspec": {
530 | "display_name": "Python 3",
531 | "language": "python",
532 | "name": "python3"
533 | },
534 | "language_info": {
535 | "codemirror_mode": {
536 | "name": "ipython",
537 | "version": 3
538 | },
539 | "file_extension": ".py",
540 | "mimetype": "text/x-python",
541 | "name": "python",
542 | "nbconvert_exporter": "python",
543 | "pygments_lexer": "ipython3",
544 | "version": "3.7.10"
545 | }
546 | },
547 | "nbformat": 4,
548 | "nbformat_minor": 5
549 | }
550 |
--------------------------------------------------------------------------------
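The schema saved above to src/raw_schema/schema.pbtxt is reused later to validate new raw data before training. A minimal sketch of that round trip, assuming it runs in the same notebook session so the sample_data DataFrame is still in memory; tfdv.load_schema_text, tfdv.validate_statistics, and tfdv.display_anomalies are standard TFDV calls:

import os

import tensorflow_data_validation as tfdv

RAW_SCHEMA_DIR = 'src/raw_schema'

# Load the schema that was written with tfdv.write_schema_text() above.
schema = tfdv.load_schema_text(os.path.join(RAW_SCHEMA_DIR, 'schema.pbtxt'))

# Recompute statistics for the data sample and check it against the schema.
new_stats = tfdv.generate_statistics_from_dataframe(dataframe=sample_data)
anomalies = tfdv.validate_statistics(statistics=new_stats, schema=schema)

# An empty anomalies result means the sample conforms to the stored schema.
tfdv.display_anomalies(anomalies)

The TFX training pipeline performs the same kind of check when it validates new raw training data, as noted in the task list at the top of the notebook.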
/05-continuous-training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "5043bfb9",
6 | "metadata": {},
7 | "source": [
8 | "# 05 - Continuous Training\n",
9 | "\n",
10 | "After testing, compiling, and uploading the pipeline definition to Cloud Storage, the pipeline is executed with respect to a trigger. We use [Cloud Functions](https://cloud.google.com/functions) and [Cloud Pub/Sub](https://cloud.google.com/pubsub) as a triggering mechanism. The triggering can be scheduled using [Cloud Scheduler](https://cloud.google.com/scheduler). The trigger source sends a message to a Cloud Pub/Sub topic that the Cloud Function listens to, and then it submits the pipeline to AI Platform Managed Pipelines to be executed.\n",
11 | "\n",
12 | "This notebook covers the following steps:\n",
13 | "1. Create the Cloud Pub/Sub topic.\n",
14 | "2. Deploy the Cloud Function \n",
15 | "3. Test triggering a pipeline.\n",
16 | "4. Extracting pipeline run metadata."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "6f2f9013",
22 | "metadata": {},
23 | "source": [
24 | "## Setup"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "a0e71c08",
30 | "metadata": {},
31 | "source": [
32 | "### Import libraries"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "37cefa26",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import json\n",
43 | "import os\n",
44 | "import logging\n",
45 | "import tensorflow as tf\n",
46 | "import tfx\n",
47 | "import IPython \n",
48 | "\n",
49 | "logging.getLogger().setLevel(logging.INFO)\n",
50 | "\n",
51 | "print(\"Tensorflow Version:\", tfx.__version__)"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "id": "ab763d6d",
57 | "metadata": {},
58 | "source": [
59 | "### Setup Google Cloud project"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "id": "5260d069",
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "PROJECT = '[your-project-id]' # Change to your project id.\n",
70 | "REGION = 'us-central1' # Change to your region.\n",
71 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n",
72 | "\n",
73 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n",
74 | " # Get your GCP project id from gcloud\n",
75 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
76 | " PROJECT = shell_output[0]\n",
77 | " \n",
78 | "if BUCKET == \"\" or BUCKET is None or BUCKET == \"[your-bucket-name]\":\n",
79 | " # Get your bucket name to GCP projet id\n",
80 | " BUCKET = PROJECT\n",
81 | "\n",
82 | "print(\"Project ID:\", PROJECT)\n",
83 | "print(\"Region:\", REGION)\n",
84 | "print(\"Bucket name:\", BUCKET)"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "id": "85dd5e16",
90 | "metadata": {},
91 | "source": [
92 | "### Set configurations"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "id": "171a800f",
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "VERSION = 'v01'\n",
103 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
104 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n",
105 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n",
106 | "\n",
107 | "PIPELINES_STORE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/compiled_pipelines/'\n",
108 | "GCS_PIPELINE_FILE_LOCATION = os.path.join(PIPELINES_STORE, f'{PIPELINE_NAME}.json')\n",
109 | "PUBSUB_TOPIC = f'trigger-{PIPELINE_NAME}'\n",
110 | "CLOUD_FUNCTION_NAME = f'trigger-{PIPELINE_NAME}-fn'"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "016df25c",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "!gsutil ls {GCS_PIPELINE_FILE_LOCATION}"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "id": "76d82223",
126 | "metadata": {},
127 | "source": [
128 | "## 1. Create a Pub/Sub topic"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "id": "0c1032c6",
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "!gcloud pubsub topics create {PUBSUB_TOPIC}"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "id": "08de54ef",
144 | "metadata": {},
145 | "source": [
146 | "## 2. Deploy the Cloud Function"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "id": "8597ad8d",
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "ENV_VARS=f\"\"\"\\\n",
157 | "PROJECT={PROJECT},\\\n",
158 | "REGION={REGION},\\\n",
159 | "GCS_PIPELINE_FILE_LOCATION={GCS_PIPELINE_FILE_LOCATION}\n",
160 | "\"\"\"\n",
161 | "\n",
162 | "!echo {ENV_VARS}"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "id": "01a3d62a",
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "!rm -r src/pipeline_triggering/.ipynb_checkpoints"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "id": "b5acdb73",
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "!gcloud functions deploy {CLOUD_FUNCTION_NAME} \\\n",
183 | " --region={REGION} \\\n",
184 | " --trigger-topic={PUBSUB_TOPIC} \\\n",
185 | " --runtime=python37 \\\n",
186 | " --source=src/pipeline_triggering\\\n",
187 | " --entry-point=trigger_pipeline\\\n",
188 | " --stage-bucket={BUCKET}\\\n",
189 | " --update-env-vars={ENV_VARS}"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "id": "5a5c41af",
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "cloud_fn_url = f\"https://console.cloud.google.com/functions/details/{REGION}/{CLOUD_FUNCTION_NAME}\"\n",
200 | "html = f'See the Cloud Function details here.'\n",
201 | "IPython.display.display(IPython.display.HTML(html))"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "id": "ebbe047a",
207 | "metadata": {},
208 | "source": [
209 | "## 3. Trigger the pipeline"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "id": "0c30028d",
216 | "metadata": {},
217 | "outputs": [],
218 | "source": [
219 | "from google.cloud import pubsub\n",
220 | "\n",
221 | "publish_client = pubsub.PublisherClient()\n",
222 | "topic = f'projects/{PROJECT}/topics/{PUBSUB_TOPIC}'\n",
223 | "data = {\n",
224 | " 'num_epochs': 7,\n",
225 | " 'learning_rate': 0.0015,\n",
226 | " 'batch_size': 512,\n",
227 | " 'hidden_units': '256,126'\n",
228 | "}\n",
229 | "message = json.dumps(data)\n",
230 | "\n",
231 | "_ = publish_client.publish(topic, message.encode())"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "id": "7ba049fe",
237 | "metadata": {},
238 | "source": [
239 | "Wait for a few seconds for the pipeline run to be submitted, then you can see the run in the Cloud Console"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "id": "0dc29797",
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "from kfp.v2.google.client import AIPlatformClient\n",
250 | "\n",
251 | "pipeline_client = AIPlatformClient(\n",
252 | " project_id=PROJECT, region=REGION)\n",
253 | " \n",
254 | "job_display_name = pipeline_client.list_jobs()['pipelineJobs'][0]['displayName']\n",
255 | "job_url = f\"https://console.cloud.google.com/vertex-ai/locations/{REGION}/pipelines/runs/{job_display_name}\"\n",
256 | "html = f'See the Pipeline job here.'\n",
257 | "IPython.display.display(IPython.display.HTML(html))"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "id": "b4b3ff42",
263 | "metadata": {},
264 | "source": [
265 | "## 4. Extracting pipeline runs metadata"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "id": "b13c1b19",
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "from google.cloud import aiplatform as vertex_ai\n",
276 | "\n",
277 | "pipeline_df = vertex_ai.get_pipeline_df(PIPELINE_NAME)\n",
278 | "pipeline_df = pipeline_df[pipeline_df.pipeline_name == PIPELINE_NAME]\n",
279 | "pipeline_df.T"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "id": "9254cbc3",
286 | "metadata": {},
287 | "outputs": [],
288 | "source": []
289 | }
290 | ],
291 | "metadata": {
292 | "environment": {
293 | "name": "common-cpu.m73",
294 | "type": "gcloud",
295 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73"
296 | },
297 | "kernelspec": {
298 | "display_name": "Python 3",
299 | "language": "python",
300 | "name": "python3"
301 | },
302 | "language_info": {
303 | "codemirror_mode": {
304 | "name": "ipython",
305 | "version": 3
306 | },
307 | "file_extension": ".py",
308 | "mimetype": "text/x-python",
309 | "name": "python",
310 | "nbconvert_exporter": "python",
311 | "pygments_lexer": "ipython3",
312 | "version": "3.7.10"
313 | }
314 | },
315 | "nbformat": 4,
316 | "nbformat_minor": 5
317 | }
318 |
--------------------------------------------------------------------------------
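The Cloud Function deployed above with --entry-point=trigger_pipeline lives in src/pipeline_triggering/main.py, which is not reproduced in this excerpt. A minimal sketch of such an entry point, assuming it reads PROJECT, REGION, and GCS_PIPELINE_FILE_LOCATION from the environment variables set at deploy time and treats the decoded Pub/Sub payload as the pipeline parameter values (matching the hyperparameter dictionary published in the notebook):

import base64
import json
import os

from kfp.v2.google.client import AIPlatformClient


def trigger_pipeline(event, context):
    """Pub/Sub-triggered entry point that submits the compiled pipeline."""
    project = os.environ['PROJECT']
    region = os.environ['REGION']
    gcs_pipeline_file_location = os.environ['GCS_PIPELINE_FILE_LOCATION']

    # Pub/Sub delivers the message body base64-encoded in event['data'].
    payload = base64.b64decode(event['data']).decode('utf-8')
    parameter_values = json.loads(payload)

    client = AIPlatformClient(project_id=project, region=region)
    return client.create_run_from_job_spec(
        job_spec_path=gcs_pipeline_file_location,
        parameter_values=parameter_values,
    )

With this shape, every message published to the trigger topic results in one pipeline run, which is also how a Cloud Scheduler job can drive scheduled retraining.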
/06-model-deployment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "ee01c81b",
6 | "metadata": {},
7 | "source": [
8 | "# 06 - Model Deployment\n",
9 | "\n",
10 | "The purpose of this notebook is to execute a CI/CD routine to test and deploy the trained model to `Vertex AI` as an `Endpoint` for online prediction serving. The notebook covers the following steps:\n",
11 | "1. Run the test steps locally.\n",
12 | "2. Execute the model deployment `CI/CD` steps using `Cloud Build`.\n",
13 | "\n"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "0da8290c",
19 | "metadata": {},
20 | "source": [
21 | "## Setup"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "4873f8cf",
27 | "metadata": {},
28 | "source": [
29 | "### Import libraries"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "id": "59085129",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import os\n",
40 | "import logging\n",
41 | "\n",
42 | "logging.getLogger().setLevel(logging.INFO)"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "id": "e37fb189",
48 | "metadata": {},
49 | "source": [
50 | "### Setup Google Cloud project"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "id": "e45be804",
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "PROJECT = '[your-project-id]' # Change to your project id.\n",
61 | "REGION = 'us-central1' # Change to your region.\n",
62 | "\n",
63 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n",
64 | " # Get your GCP project id from gcloud\n",
65 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
66 | " PROJECT = shell_output[0]\n",
67 | "\n",
68 | "print(\"Project ID:\", PROJECT)\n",
69 | "print(\"Region:\", REGION)"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "id": "1574964f",
75 | "metadata": {},
76 | "source": [
77 | "### Set configurations"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "id": "4a01278c",
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "VERSION = 'v01'\n",
88 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
89 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n",
90 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n",
91 | "\n",
92 | "CICD_IMAGE_NAME = 'cicd:latest'\n",
93 | "CICD_IMAGE_URI = f\"gcr.io/{PROJECT}/{CICD_IMAGE_NAME}\""
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "id": "87f6f1e0",
99 | "metadata": {},
100 | "source": [
101 | "## 1. Run CI/CD steps locally"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "id": "a223cdf6",
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "os.environ['PROJECT'] = PROJECT\n",
112 | "os.environ['REGION'] = REGION\n",
113 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n",
114 | "os.environ['ENDPOINT_DISPLAY_NAME'] = ENDPOINT_DISPLAY_NAME"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "id": "b6546ac1",
120 | "metadata": {},
121 | "source": [
122 | "### Run the model artifact testing"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "id": "74c0f8a8",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "!py.test src/tests/model_deployment_tests.py::test_model_artifact -s"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "id": "77885b24",
138 | "metadata": {},
139 | "source": [
140 | "### Run create endpoint"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "id": "0efe73b5",
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "!python build/utils.py \\\n",
151 | " --mode=create-endpoint\\\n",
152 | " --project={PROJECT}\\\n",
153 | " --region={REGION}\\\n",
154 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "id": "3eb28c6f",
160 | "metadata": {},
161 | "source": [
162 | "### Run deploy model"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "id": "9cb3f19d",
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "!python build/utils.py \\\n",
173 | " --mode=deploy-model\\\n",
174 | " --project={PROJECT}\\\n",
175 | " --region={REGION}\\\n",
176 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}\\\n",
177 | " --model-display-name={MODEL_DISPLAY_NAME}"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "id": "ee492355",
183 | "metadata": {},
184 | "source": [
185 | "### Test deployed model endpoint"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "3d4bce50",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "!py.test src/tests/model_deployment_tests.py::test_model_endpoint"
196 | ]
197 | },
198 | {
199 | "cell_type": "markdown",
200 | "id": "37b150c9",
201 | "metadata": {},
202 | "source": [
203 | "## 2. Execute the Model Deployment CI/CD routine in Cloud Build\n",
204 | "\n",
205 | "The CI/CD routine is defined in the [model-deployment.yaml](model-deployment.yaml) file, and consists of the following steps:\n",
206 | "1. Load and test the the trained model interface.\n",
207 | "2. Create and endpoint in Vertex AI if it doesn't exists.\n",
208 | "3. Deploy the model to the endpoint.\n",
209 | "4. Test the endpoint."
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "id": "839e540c",
215 | "metadata": {},
216 | "source": [
217 | "### Build CI/CD container Image for Cloud Build\n",
218 | "\n",
219 | "This is the runtime environment where the steps of testing and deploying model will be executed."
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "id": "a7f9bf4e",
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "!echo $CICD_IMAGE_URI"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "id": "3855daae",
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "!gcloud builds submit --tag $CICD_IMAGE_URI build/. --timeout=15m"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "id": "90fbd4b9",
245 | "metadata": {},
246 | "source": [
247 | "### Run CI/CD from model deployment using Cloud Build"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "id": "e1aec70c",
254 | "metadata": {},
255 | "outputs": [],
256 | "source": [
257 | "REPO_URL = \"https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai.git\" # Change to your github repo.\n",
258 | "BRANCH = \"main\" "
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "id": "01995fa5",
265 | "metadata": {},
266 | "outputs": [],
267 | "source": [
268 | "SUBSTITUTIONS=f\"\"\"\\\n",
269 | "_REPO_URL='{REPO_URL}',\\\n",
270 | "_BRANCH={BRANCH},\\\n",
271 | "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n",
272 | "_PROJECT={PROJECT},\\\n",
273 | "_REGION={REGION},\\\n",
274 | "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n",
275 | "_ENDPOINT_DISPLAY_NAME={ENDPOINT_DISPLAY_NAME},\\\n",
276 | "\"\"\"\n",
277 | "\n",
278 | "!echo $SUBSTITUTIONS"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "id": "8849d3e4",
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "!gcloud builds submit --no-source --config build/model-deployment.yaml --substitutions {SUBSTITUTIONS} --timeout=30m"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "id": "01831724",
295 | "metadata": {},
296 | "outputs": [],
297 | "source": []
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "id": "4418b01e",
303 | "metadata": {},
304 | "outputs": [],
305 | "source": []
306 | }
307 | ],
308 | "metadata": {
309 | "environment": {
310 | "name": "common-cpu.m79",
311 | "type": "gcloud",
312 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m79"
313 | },
314 | "kernelspec": {
315 | "display_name": "Python 3",
316 | "language": "python",
317 | "name": "python3"
318 | },
319 | "language_info": {
320 | "codemirror_mode": {
321 | "name": "ipython",
322 | "version": 3
323 | },
324 | "file_extension": ".py",
325 | "mimetype": "text/x-python",
326 | "name": "python",
327 | "nbconvert_exporter": "python",
328 | "pygments_lexer": "ipython3",
329 | "version": "3.7.10"
330 | }
331 | },
332 | "nbformat": 4,
333 | "nbformat_minor": 5
334 | }
335 |
--------------------------------------------------------------------------------
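The pytest targets invoked above (test_model_artifact and test_model_endpoint in src/tests/model_deployment_tests.py) are not included in this excerpt. A minimal sketch of an endpoint smoke test along those lines, assuming the PROJECT, REGION, and ENDPOINT_DISPLAY_NAME environment variables exported by the notebook and a test instance shaped like the one used in 07-prediction-serving.ipynb:

import os

from google.cloud import aiplatform as vertex_ai

# A single instance matching the model's serving inputs.
TEST_INSTANCE = {
    "dropoff_grid": ["POINT(-87.6 41.9)"],
    "euclidean": [2064.2696],
    "loc_cross": [""],
    "payment_type": ["Credit Card"],
    "pickup_grid": ["POINT(-87.6 41.9)"],
    "trip_miles": [1.37],
    "trip_day": [12],
    "trip_hour": [16],
    "trip_month": [2],
    "trip_day_of_week": [4],
    "trip_seconds": [555],
}


def test_model_endpoint():
    # The notebook exports these variables before invoking py.test.
    project = os.environ['PROJECT']
    region = os.environ['REGION']
    endpoint_display_name = os.environ['ENDPOINT_DISPLAY_NAME']

    vertex_ai.init(project=project, location=region)

    # Use the most recently updated endpoint with the expected display name.
    endpoint = vertex_ai.Endpoint.list(
        filter=f'display_name={endpoint_display_name}',
        order_by='update_time')[-1]

    predictions = endpoint.predict([TEST_INSTANCE]).predictions
    assert len(predictions) == 1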
/07-prediction-serving.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4004af63",
6 | "metadata": {},
7 | "source": [
8 | "# 07 - Prediction Serving\n",
9 | "\n",
10 | "The purpose of the notebook is to show how to use the deployed model for online and batch prediction.\n",
11 | "The notebook covers the following tasks:\n",
12 | "1. Test the endpoints for online prediction.\n",
13 | "2. Use the uploaded custom model for batch prediction.\n",
14 | "3. Run a the batch prediction pipeline using `Vertex Pipelines`."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "id": "3dad1f75",
20 | "metadata": {},
21 | "source": [
22 | "## Setup"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "id": "6d02a9d5",
28 | "metadata": {},
29 | "source": [
30 | "### Import libraries"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "id": "c7f3ce81",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import os\n",
41 | "from datetime import datetime\n",
42 | "import tensorflow as tf\n",
43 | "\n",
44 | "from google.cloud import aiplatform as vertex_ai"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "id": "8e73bc25",
50 | "metadata": {},
51 | "source": [
52 | "### Setup Google Cloud project"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "id": "29ea9b0a",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "PROJECT = '[your-project-id]' # Change to your project id.\n",
63 | "REGION = 'us-central1' # Change to your region.\n",
64 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n",
65 | "\n",
66 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n",
67 | " # Get your GCP project id from gcloud\n",
68 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
69 | " PROJECT = shell_output[0]\n",
70 | " \n",
71 | "if BUCKET == \"\" or BUCKET is None or BUCKET == \"[your-bucket-name]\":\n",
72 | " # Get your bucket name to GCP project id\n",
73 | " BUCKET = PROJECT\n",
74 | " # Try to create the bucket if it doesn't exists\n",
75 | " ! gsutil mb -l $REGION gs://$BUCKET\n",
76 | " print(\"\")\n",
77 | " \n",
78 | "print(\"Project ID:\", PROJECT)\n",
79 | "print(\"Region:\", REGION)\n",
80 | "print(\"Bucket name:\", BUCKET)"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "id": "ecba79b0",
86 | "metadata": {},
87 | "source": [
88 | "### Set configurations"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "id": "537732be",
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "VERSION = 'v01'\n",
99 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
100 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n",
101 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n",
102 | "\n",
103 | "SERVE_BQ_DATASET_NAME = 'playground_us' # Change to your serving BigQuery dataset name.\n",
104 | "SERVE_BQ_TABLE_NAME = 'chicago_taxitrips_prep' # Change to your serving BigQuery table name."
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "id": "4e508dd0",
110 | "metadata": {},
111 | "source": [
112 | "## 1. Making Online Predicitons\n"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "38be76f5",
119 | "metadata": {},
120 | "outputs": [],
121 | "source": [
122 | "vertex_ai.init(\n",
123 | " project=PROJECT,\n",
124 | " location=REGION,\n",
125 | " staging_bucket=BUCKET\n",
126 | ")\n",
127 | "\n",
128 | "endpoint_name = vertex_ai.Endpoint.list(\n",
129 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n",
130 | " order_by=\"update_time\")[-1].gca_resource.name\n",
131 | "\n",
132 | "endpoint = vertex_ai.Endpoint(endpoint_name)"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "id": "f6b8053d",
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "test_instances = [ \n",
143 | " {\n",
144 | " \"dropoff_grid\": [\"POINT(-87.6 41.9)\"],\n",
145 | " \"euclidean\": [2064.2696],\n",
146 | " \"loc_cross\": [\"\"],\n",
147 | " \"payment_type\": [\"Credit Card\"],\n",
148 | " \"pickup_grid\": [\"POINT(-87.6 41.9)\"],\n",
149 | " \"trip_miles\": [1.37],\n",
150 | " \"trip_day\": [12],\n",
151 | " \"trip_hour\": [16],\n",
152 | " \"trip_month\": [2],\n",
153 | " \"trip_day_of_week\": [4],\n",
154 | " \"trip_seconds\": [555]\n",
155 | " }\n",
156 | "]"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "id": "f7cb447e",
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "predictions = endpoint.predict(test_instances).predictions\n",
167 | "\n",
168 | "for prediction in predictions:\n",
169 | " print(prediction)"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "id": "330d9dfc",
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "explanations = endpoint.explain(test_instances).explanations\n",
180 | "\n",
181 | "for explanation in explanations:\n",
182 | " print(explanation)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "id": "ddc90ffa",
188 | "metadata": {},
189 | "source": [
190 | "## 2. Batch Prediction"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "id": "046757e2",
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "WORKSPACE = f\"gs://{BUCKET}/{DATASET_DISPLAY_NAME}/\"\n",
201 | "SERVING_DATA_DIR = os.path.join(WORKSPACE, 'serving_data')\n",
202 | "SERVING_INPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'input_data')\n",
203 | "SERVING_OUTPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'output_predictions')"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "id": "0e8fbc4d",
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "if tf.io.gfile.exists(SERVING_DATA_DIR):\n",
214 | " print(\"Removing previous serving data...\")\n",
215 | " tf.io.gfile.rmtree(SERVING_DATA_DIR)\n",
216 | " \n",
217 | "print(\"Creating serving data directory...\")\n",
218 | "tf.io.gfile.mkdir(SERVING_DATA_DIR)\n",
219 | "print(\"Serving data directory is ready.\")"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "id": "6f7b60fa",
225 | "metadata": {},
226 | "source": [
227 | "### Extract serving data to Cloud Storage as JSONL"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "id": "04bb69ff",
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "from src.common import datasource_utils\n",
238 | "from src.preprocessing import etl"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "id": "dfd4cf91",
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "LIMIT = 10000\n",
249 | "\n",
250 | "sql_query = datasource_utils.get_serving_source_query(\n",
251 | " bq_dataset_name=SERVE_BQ_DATASET_NAME, \n",
252 | " bq_table_name=SERVE_BQ_TABLE_NAME,\n",
253 | " limit=LIMIT\n",
254 | ")\n",
255 | "\n",
256 | "print(sql_query)"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "id": "5f5afb73",
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "job_name = f\"extract-{DATASET_DISPLAY_NAME}-serving-{datetime.now().strftime('%Y%m%d%H%M%S')}\"\n",
267 | "\n",
268 | "args = {\n",
269 | " 'job_name': job_name,\n",
270 | " #'runner': 'DataflowRunner',\n",
271 | " 'sql_query': sql_query,\n",
272 | " 'exported_data_prefix': os.path.join(SERVING_INPUT_DATA_DIR, \"data-\"),\n",
273 | " 'temporary_dir': os.path.join(WORKSPACE, 'tmp'),\n",
274 | " 'gcs_location': os.path.join(WORKSPACE, 'bq_tmp'),\n",
275 | " 'project': PROJECT,\n",
276 | " 'region': REGION,\n",
277 | " 'setup_file': './setup.py'\n",
278 | "}"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "id": "588e1949",
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "tf.get_logger().setLevel('ERROR')\n",
289 | "\n",
290 | "print(\"Data extraction started...\")\n",
291 | "etl.run_extract_pipeline(args)\n",
292 | "print(\"Data extraction completed.\")"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "id": "a036944a",
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "!gsutil ls {SERVING_INPUT_DATA_DIR}"
303 | ]
304 | },
305 | {
306 | "cell_type": "markdown",
307 | "id": "5ff0d72b",
308 | "metadata": {},
309 | "source": [
310 | "### Submit the batch prediction job"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "id": "eb72b16e",
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "model_name = vertex_ai.Model.list(\n",
321 | " filter=f'display_name={MODEL_DISPLAY_NAME}',\n",
322 | " order_by=\"update_time\")[-1].gca_resource.name"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "id": "dac58bf2",
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "job_resources = {\n",
333 | " \"machine_type\": 'n1-standard-2',\n",
334 | " #'accelerator_count': 1,\n",
335 | " #'accelerator_type': 'NVIDIA_TESLA_T4'\n",
336 | " \"starting_replica_count\": 1,\n",
337 | " \"max_replica_count\": 10,\n",
338 | "}\n",
339 | "\n",
340 | "job_display_name = f\"{MODEL_DISPLAY_NAME}-prediction-job-{datetime.now().strftime('%Y%m%d%H%M%S')}\"\n",
341 | "\n",
342 | "vertex_ai.BatchPredictionJob.create(\n",
343 | " job_display_name=job_display_name,\n",
344 | " model_name=model_name,\n",
345 | " gcs_source=SERVING_INPUT_DATA_DIR + '/*.jsonl',\n",
346 | " gcs_destination_prefix=SERVING_OUTPUT_DATA_DIR,\n",
347 | " instances_format='jsonl',\n",
348 | " predictions_format='jsonl',\n",
349 | " sync=True,\n",
350 | " **job_resources,\n",
351 | ")"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "id": "a4f281a9",
357 | "metadata": {},
358 | "source": [
359 | "## 3. Run the batch prediction pipeline using Vertex Pipelines"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "id": "809ba028",
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "WORKSPACE = f\"gs://{BUCKET}/{DATASET_DISPLAY_NAME}/\"\n",
370 | "ARTIFACT_STORE = os.path.join(WORKSPACE, 'tfx_artifacts')\n",
371 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-predict-pipeline'"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "id": "769a1d9e",
377 | "metadata": {},
378 | "source": [
379 | "### Set the pipeline configurations for the Vertex AI run"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "id": "c5add19d",
386 | "metadata": {},
387 | "outputs": [],
388 | "source": [
389 | "os.environ[\"PROJECT\"] = PROJECT\n",
390 | "os.environ[\"REGION\"] = REGION\n",
391 | "os.environ[\"GCS_LOCATION\"] = f\"gs://{BUCKET}/{DATASET_DISPLAY_NAME}\"\n",
392 | "os.environ[\"MODEL_DISPLAY_NAME\"] = MODEL_DISPLAY_NAME\n",
393 | "os.environ[\"PIPELINE_NAME\"] = PIPELINE_NAME\n",
394 | "os.environ[\"ARTIFACT_STORE_URI\"] = ARTIFACT_STORE\n",
395 | "os.environ[\"BATCH_PREDICTION_BQ_DATASET_NAME\"] = SERVE_BQ_DATASET_NAME\n",
396 | "os.environ[\"BATCH_PREDICTION_BQ_TABLE_NAME\"] = SERVE_BQ_TABLE_NAME\n",
397 | "os.environ[\"SERVE_LIMIT\"] = \"1000\"\n",
398 | "os.environ[\"BEAM_RUNNER\"] = \"DirectRunner\"\n",
399 | "os.environ[\"TFX_IMAGE_URI\"] = f\"gcr.io/{PROJECT}/{DATASET_DISPLAY_NAME}:{VERSION}\""
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": null,
405 | "id": "f6d0e2ec",
406 | "metadata": {},
407 | "outputs": [],
408 | "source": [
409 | "import importlib\n",
410 | "from src.tfx_pipelines import config\n",
411 | "importlib.reload(config)\n",
412 | "\n",
413 | "for key, value in config.__dict__.items():\n",
414 | " if key.isupper(): print(f'{key}: {value}')"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "id": "f128b46e",
420 | "metadata": {},
421 | "source": [
422 | "### (Optional) Build the ML container image\n",
423 | "\n",
424 | "This is the `TFX` runtime environment for the training pipeline steps."
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": null,
430 | "id": "f24fa5e6",
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "!echo $TFX_IMAGE_URI"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": null,
440 | "id": "3949cc7e",
441 | "metadata": {},
442 | "outputs": [],
443 | "source": [
444 | "!gcloud builds submit --tag $TFX_IMAGE_URI . --timeout=15m --machine-type=e2-highcpu-8"
445 | ]
446 | },
447 | {
448 | "cell_type": "markdown",
449 | "id": "98a9890d",
450 | "metadata": {},
451 | "source": [
452 | "### Compile pipeline"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "id": "09c8a3a0",
459 | "metadata": {},
460 | "outputs": [],
461 | "source": [
462 | "from src.tfx_pipelines import runner\n",
463 | "\n",
464 | "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n",
465 | "pipeline_definition = runner.compile_prediction_pipeline(pipeline_definition_file)"
466 | ]
467 | },
468 | {
469 | "cell_type": "markdown",
470 | "id": "2bc2792a",
471 | "metadata": {},
472 | "source": [
473 | "### Submit run to Vertex Pipelines"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "id": "37dcc92d",
480 | "metadata": {},
481 | "outputs": [],
482 | "source": [
483 | "from kfp.v2.google.client import AIPlatformClient\n",
484 | "\n",
485 | "pipeline_client = AIPlatformClient(\n",
486 | " project_id=PROJECT, region=REGION)\n",
487 | " \n",
488 | "pipeline_client.create_run_from_job_spec(\n",
489 | " job_spec_path=pipeline_definition_file\n",
490 | ")"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": null,
496 | "id": "5e0d5bef",
497 | "metadata": {},
498 | "outputs": [],
499 | "source": []
500 | }
501 | ],
502 | "metadata": {
503 | "environment": {
504 | "name": "common-cpu.m79",
505 | "type": "gcloud",
506 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m79"
507 | },
508 | "kernelspec": {
509 | "display_name": "Python 3",
510 | "language": "python",
511 | "name": "python3"
512 | },
513 | "language_info": {
514 | "codemirror_mode": {
515 | "name": "ipython",
516 | "version": 3
517 | },
518 | "file_extension": ".py",
519 | "mimetype": "text/x-python",
520 | "name": "python",
521 | "nbconvert_exporter": "python",
522 | "pygments_lexer": "ipython3",
523 | "version": "3.7.10"
524 | }
525 | },
526 | "nbformat": 4,
527 | "nbformat_minor": 5
528 | }
529 |
--------------------------------------------------------------------------------
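The extraction step above relies on datasource_utils.get_serving_source_query from src/common/datasource_utils.py, which is not shown in this excerpt. A minimal sketch of what such a helper could look like, assuming the serving query simply drops the target, split, and raw timestamp columns, mirroring the column exclusions used in 01-dataset-management.ipynb:

from typing import Optional


def get_serving_source_query(bq_dataset_name: str,
                             bq_table_name: str,
                             limit: Optional[int] = None) -> str:
    """Build the BigQuery SQL that extracts unlabeled serving data."""
    # Serving requests carry only input features, so exclude the label
    # (tip_bin), the split column (ML_use), and the raw timestamp.
    query = (
        "SELECT * EXCEPT (trip_start_timestamp, ML_use, tip_bin)\n"
        f"FROM `{bq_dataset_name}.{bq_table_name}`\n"
    )
    if limit:
        query += f"LIMIT {limit}\n"
    return query

The notebook passes the returned string to etl.run_extract_pipeline, which writes the selected rows to Cloud Storage as JSONL inputs for the batch prediction job.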
/08-model-monitoring.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "441c8b66",
6 | "metadata": {},
7 | "source": [
8 | "# 08 - Model Monitoring\n",
9 | "\n",
10 | "This notebook covers configuring model monitoring jobs for skew and drift detection:\n",
11 | "1. Set skew and drift threshold.\n",
12 | "2. Create a monitoring job for all the models under and endpoint.\n",
13 | "3. List the monitoring jobs.\n",
14 | "4. Simulate skewed prediction requests.\n",
15 | "5. Pause and delete the monitoring job."
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "id": "b2a0d93e",
21 | "metadata": {},
22 | "source": [
23 | "## Setup"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "c95c73cf",
29 | "metadata": {},
30 | "source": [
31 | "### Import libraries"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "aee62910",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "import copy\n",
42 | "from datetime import datetime\n",
43 | "import time\n",
44 | "\n",
45 | "from google.protobuf.duration_pb2 import Duration\n",
46 | "from google.cloud import aiplatform as vertex_ai\n",
47 | "from google.cloud import aiplatform_v1beta1 as vertex_ai_beta"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "id": "33eb5433",
53 | "metadata": {},
54 | "source": [
55 | "### Setup Google Cloud project"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "id": "c9e34ea5",
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "PROJECT = '[your-project-id]' # Change to your project id.\n",
66 | "REGION = 'us-central1' # Change to your region.\n",
67 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n",
68 | "\n",
69 | "if PROJECT == \"\" or PROJECT is None or PROJECT == \"[your-project-id]\":\n",
70 | " # Get your GCP project id from gcloud\n",
71 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
72 | " PROJECT = shell_output[0]\n",
73 | " \n",
74 | "if BUCKET == \"\" or BUCKET is None or BUCKET == \"[your-bucket-name]\":\n",
75 | " # Get your bucket name to GCP project id\n",
76 | " BUCKET = PROJECT\n",
77 | " # Try to create the bucket if it doesn't exists\n",
78 | " ! gsutil mb -l $REGION gs://$BUCKET\n",
79 | " print(\"\")\n",
80 | "\n",
81 | "PARENT = f\"projects/{PROJECT}/locations/{REGION}\"\n",
82 | "\n",
83 | "print(\"Project ID:\", PROJECT)\n",
84 | "print(\"Region:\", REGION)\n",
85 | "print(\"Bucket name:\", BUCKET)\n",
86 | "print(\"Vertex API Parent URI:\", PARENT)"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "id": "0051e6da",
92 | "metadata": {},
93 | "source": [
94 | "### Set configurations"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "id": "c4ffa4e8",
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
105 | "ENDPOINT_DISPLAY_NAME = 'chicago-taxi-tips-classifier'\n",
106 | "MONITORING_JOB_NAME = f\"monitor-{ENDPOINT_DISPLAY_NAME}\"\n",
107 | "NOTIFY_EMAILS = [\"\"] # Change to your email address.\n",
108 | "\n",
109 | "LOG_SAMPLE_RATE = 0.8\n",
110 | "MONITOR_INTERVAL = 3600\n",
111 | "TARGET_FEATURE_NAME = 'tip_bin'"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "id": "09153445",
117 | "metadata": {},
118 | "source": [
119 | "## Create Job Service Client"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "id": "3c8c8872",
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "job_client_beta = vertex_ai_beta.JobServiceClient(\n",
130 | " client_options={\"api_endpoint\": f\"{REGION}-aiplatform.googleapis.com\"}\n",
131 | ")"
132 | ]
133 | },
134 | {
135 | "cell_type": "markdown",
136 | "id": "250e521b",
137 | "metadata": {},
138 | "source": [
139 | "## 1. Set Skew and Drift Thresholds"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "id": "10bd314f",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "SKEW_THRESHOLDS = {\n",
150 | " 'trip_month': 0.3,\n",
151 | " 'trip_day': 0.3,\n",
152 | " 'trip_day_of_week': 0.3,\n",
153 | " 'trip_hour': 0.3,\n",
154 | " 'trip_seconds': 0.3,\n",
155 | " 'trip_miles': 0.3,\n",
156 | " 'payment_type': 0.3,\n",
157 | " 'pickup_grid': 0.3,\n",
158 | " 'dropoff_grid': 0.3,\n",
159 | " 'euclidean': 0.3,\n",
160 | " 'loc_cross': 0.3, \n",
161 | "}\n",
162 | "\n",
163 | "DRIFT_THRESHOLDS = {\n",
164 | " 'trip_month': 0.3,\n",
165 | " 'trip_day': 0.3,\n",
166 | " 'trip_day_of_week': 0.3,\n",
167 | " 'trip_hour': 0.3,\n",
168 | " 'trip_seconds': 0.3,\n",
169 | " 'trip_miles': 0.3,\n",
170 | " 'payment_type': 0.3,\n",
171 | " 'pickup_grid': 0.3,\n",
172 | " 'dropoff_grid': 0.3,\n",
173 | " 'euclidean': 0.3,\n",
174 | " 'loc_cross': 0.3, \n",
175 | "}"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "id": "1579ae9d",
181 | "metadata": {},
182 | "source": [
183 | "## 2. Create Monitoring Job"
184 | ]
185 | },
186 | {
187 | "cell_type": "markdown",
188 | "id": "32756e32",
189 | "metadata": {},
190 | "source": [
191 | "### Retrieve the Vertex dataset and endpoint models to monitor"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "id": "1f17f0d7",
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "dataset = vertex_ai.TabularDataset.list(\n",
202 | " filter=f\"display_name={DATASET_DISPLAY_NAME}\", \n",
203 | " order_by=\"update_time\")[-1]\n",
204 | "\n",
205 | "bq_source_uri = dataset.gca_resource.metadata[\"inputConfig\"][\"bigquerySource\"][\"uri\"]\n",
206 | " \n",
207 | "endpoint = vertex_ai.Endpoint.list(\n",
208 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n",
209 | " order_by=\"update_time\")[-1]\n",
210 | "\n",
211 | "endpoint_uri = endpoint.gca_resource.name\n",
212 | "\n",
213 | "model_ids = [model.id for model in endpoint.list_models()]"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "id": "f8f3315d",
219 | "metadata": {},
220 | "source": [
221 | "### Configure the monitoring job"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "id": "d2998243",
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "skew_thresholds = {\n",
232 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n",
233 | " for feature, value in SKEW_THRESHOLDS.items()\n",
234 | "}\n",
235 | "\n",
236 | "drift_thresholds = {\n",
237 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n",
238 | " for feature, value in DRIFT_THRESHOLDS.items()\n",
239 | "}\n",
240 | "\n",
241 | "skew_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(\n",
242 | " skew_thresholds=skew_thresholds\n",
243 | ")\n",
244 | "\n",
245 | "drift_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(\n",
246 | " drift_thresholds=drift_thresholds\n",
247 | ")\n",
248 | "\n",
249 | "sampling_config = vertex_ai_beta.SamplingStrategy(\n",
250 | " random_sample_config=vertex_ai_beta.SamplingStrategy.RandomSampleConfig(\n",
251 | " sample_rate=LOG_SAMPLE_RATE\n",
252 | " )\n",
253 | ")\n",
254 | "\n",
255 | "schedule_config = vertex_ai_beta.ModelDeploymentMonitoringScheduleConfig(\n",
256 | " monitor_interval=Duration(seconds=MONITOR_INTERVAL)\n",
257 | ")\n",
258 | "\n",
259 | "training_dataset = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingDataset(\n",
260 | " target_field=TARGET_FEATURE_NAME,\n",
261 | " bigquery_source = vertex_ai_beta.types.io.BigQuerySource(\n",
262 | " input_uri=bq_source_uri\n",
263 | " )\n",
264 | ")\n",
265 | "\n",
266 | "\n",
267 | "objective_template = vertex_ai_beta.ModelDeploymentMonitoringObjectiveConfig(\n",
268 | " objective_config=vertex_ai_beta.ModelMonitoringObjectiveConfig(\n",
269 | " training_dataset=training_dataset,\n",
270 | " training_prediction_skew_detection_config=skew_config,\n",
271 | " prediction_drift_detection_config=drift_config,\n",
272 | " )\n",
273 | ")\n",
274 | "\n",
275 | "deployment_objective_configs = []\n",
276 | "for model_id in model_ids:\n",
277 | " objective_config = copy.deepcopy(objective_template)\n",
278 | " objective_config.deployed_model_id = model_id\n",
279 | " deployment_objective_configs.append(objective_config)\n",
280 | "\n",
281 | "alerting_config = vertex_ai_beta.ModelMonitoringAlertConfig(\n",
282 | " email_alert_config=vertex_ai_beta.ModelMonitoringAlertConfig.EmailAlertConfig(\n",
283 | " user_emails=NOTIFY_EMAILS\n",
284 | " )\n",
285 | ")\n"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "id": "7311422c",
291 | "metadata": {},
292 | "source": [
293 | "### Instantiate a monitoring job"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": null,
299 | "id": "7b414c32",
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "job = vertex_ai_beta.ModelDeploymentMonitoringJob(\n",
304 | " display_name=MONITORING_JOB_NAME,\n",
305 | " endpoint=endpoint_uri,\n",
306 | " model_deployment_monitoring_objective_configs=deployment_objective_configs,\n",
307 | " logging_sampling_strategy=sampling_config,\n",
308 | " model_deployment_monitoring_schedule_config=schedule_config,\n",
309 | " model_monitoring_alert_config=alerting_config,\n",
310 | ")"
311 | ]
312 | },
313 | {
314 | "cell_type": "markdown",
315 | "id": "8e87fd5c",
316 | "metadata": {},
317 | "source": [
318 | "### Submit the job for creation"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "id": "a7d54b6f",
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "response = job_client_beta.create_model_deployment_monitoring_job(\n",
329 | " parent=PARENT, model_deployment_monitoring_job=job\n",
330 | ")\n",
331 | "response"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "id": "7c8d2120",
337 | "metadata": {},
338 | "source": [
339 | "## 3. List Monitoring Jobs"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "id": "ef38d00d",
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "monitoring_jobs = job_client_beta.list_model_deployment_monitoring_jobs(parent=PARENT)\n",
350 | "monitoring_job = [entry for entry in monitoring_jobs if entry.display_name == MONITORING_JOB_NAME][0]\n",
351 | "monitoring_job"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "id": "fb136f64",
357 | "metadata": {},
358 | "source": [
359 | "## 4. Simulate skewed prediction requests"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "id": "07ff9ab8",
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "num_requests = 100\n",
370 | "\n",
371 | "print(\"Simulation started...\")\n",
372 | "for idx in range(num_requests):\n",
373 | " request = [{\n",
374 | " \"dropoff_grid\": [\"POINT(-87.6 41.9)\"],\n",
375 | " \"euclidean\": [2064.2696],\n",
376 | " \"loc_cross\": [\"\"],\n",
377 | " \"payment_type\": [\"Credit Card\"],\n",
378 | " \"pickup_grid\": [\"POINT(-87.6 41.9)\"],\n",
379 | " \"trip_miles\": [1.37],\n",
380 | " \"trip_day\": [int(random.uniform(10, 50))],\n",
381 | " \"trip_hour\": [int(random.uniform(10, 50))],\n",
382 | " \"trip_month\": [int(random.uniform(1, 10))],\n",
383 | " \"trip_day_of_week\": [int(random.uniform(1, 7))],\n",
384 | " \"trip_seconds\": [int(random.uniform(60, 600))]\n",
385 | " }]\n",
386 | " \n",
387 | " endpoint.predict(request)\n",
388 | " time.sleep(0.5)\n",
389 | " \n",
390 | " if idx % 10 == 0:\n",
391 | " print(f'{idx + 1} of {num_requests} prediction requests were invoked.')\n",
392 | "print(\"Simulation finished.\")"
393 | ]
394 | },
395 | {
396 | "cell_type": "markdown",
397 | "id": "06a03835",
398 | "metadata": {},
399 | "source": [
400 | "## 5. Pause Monitoring Job"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "id": "6e4ba104",
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "job_client_beta.pause_model_deployment_monitoring_job(name=monitoring_job.name)"
411 | ]
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "id": "8fb6f259",
416 | "metadata": {},
417 | "source": [
418 | "## Delete Monitoring Job"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": null,
424 | "id": "4668f9dd",
425 | "metadata": {},
426 | "outputs": [],
427 | "source": [
428 | "job_client_beta.delete_model_deployment_monitoring_job(name=monitoring_job.name)"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "id": "ac101746",
435 | "metadata": {},
436 | "outputs": [],
437 | "source": []
438 | }
439 | ],
440 | "metadata": {
441 | "environment": {
442 | "name": "common-cpu.m79",
443 | "type": "gcloud",
444 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m79"
445 | },
446 | "kernelspec": {
447 | "display_name": "Python 3",
448 | "language": "python",
449 | "name": "python3"
450 | },
451 | "language_info": {
452 | "codemirror_mode": {
453 | "name": "ipython",
454 | "version": 3
455 | },
456 | "file_extension": ".py",
457 | "mimetype": "text/x-python",
458 | "name": "python",
459 | "nbconvert_exporter": "python",
460 | "pygments_lexer": "ipython3",
461 | "version": "3.7.10"
462 | }
463 | },
464 | "nbformat": 4,
465 | "nbformat_minor": 5
466 | }
467 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM gcr.io/tfx-oss-public/tfx:1.2.0
2 |
3 | COPY requirements.txt requirements.txt
4 |
5 | RUN pip install -r requirements.txt
6 |
7 | COPY src/ src/
8 |
9 | ENV PYTHONPATH="/pipeline:${PYTHONPATH}"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MLOps with Vertex AI
2 |
3 | This example implements the end-to-end [MLOps process](https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf) using [Vertex AI](https://cloud.google.com/vertex-ai) platform and [Smart Analytics](https://cloud.google.com/solutions/smart-analytics) technology capabilities. The example uses [Keras](https://keras.io/) to implement the ML model, [TFX](https://www.tensorflow.org/tfx) to implement the training pipeline, and [Model Builder SDK](https://github.com/googleapis/python-aiplatform/tree/569d4cd03e888fde0171f7b0060695a14f99b072/google/cloud/aiplatform) to interact with Vertex AI.
4 |
5 |
6 |
7 |
8 |
9 |
10 | ## Getting started
11 |
12 | 1. [Setup your MLOps environment](provision) on Google Cloud.
13 | 2. Start your AI Notebook instance.
14 | 3. Open JupyterLab, then open a new terminal.
15 | 4. Clone the repository to your AI Notebook instance:
16 | ```
17 | git clone https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai.git
18 | cd mlops-with-vertex-ai
19 | ```
20 | 5. Install the required Python packages:
21 | ```
22 | pip install tfx==1.2.0 --user
23 | pip install -r requirements.txt
24 | ```
25 | ---
26 | **NOTE**: You can ignore the pip dependency issues; these will be fixed when upgrading to a subsequent TFX version.
27 |
28 | ---
29 | 6. Upgrade the `gcloud` components:
30 | ```
31 | sudo apt-get install google-cloud-sdk
32 | gcloud components update
33 | ```
34 |
35 | ## Dataset Management
36 |
37 | The [Chicago Taxi Trips](https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips) dataset is one of the [public datasets hosted with BigQuery](https://cloud.google.com/bigquery/public-data/), which includes taxi trips from 2013 to the present, reported to the City of Chicago in its role as a regulatory agency. The task is to predict whether a given trip will result in a tip > 20%.
38 |
39 | The [01-dataset-management](01-dataset-management.ipynb) notebook covers:
40 |
41 | 1. Performing exploratory data analysis on the data in `BigQuery`.
42 | 2. Creating a `Vertex AI` Dataset resource using the Python SDK (see the sketch after this list).
43 | 3. Generating the schema for the raw data using [TensorFlow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv).
44 |
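A minimal sketch of creating the dataset with the Vertex AI SDK, assuming a hypothetical project and BigQuery source table (the notebook derives these from its own configuration):

```
from google.cloud import aiplatform as vertex_ai

# Hypothetical project, region, and BigQuery table -- replace with your own values.
vertex_ai.init(project="your-project-id", location="us-central1")

# Create a Vertex AI tabular dataset backed by a BigQuery table.
dataset = vertex_ai.TabularDataset.create(
    display_name="chicago-taxi-tips",
    bq_source="bq://your-project-id.your_dataset.chicago_taxi_tips",
)
print(dataset.resource_name)
```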
45 |
46 | ## ML Development
47 |
48 | We experiment with creating a [Custom Model](https://cloud.google.com/ai-platform-unified/docs/training/create-model-custom-training) using the [02-experimentation](02-experimentation.ipynb) notebook, which covers:
49 |
50 | 1. Preparing the data using `Dataflow`.
51 | 2. Implementing a `Keras` classification model.
52 | 3. Training the `Keras` model with `Vertex AI` using a [pre-built container](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers) (see the sketch after this list).
53 | 4. Uploading the exported model from `Cloud Storage` to `Vertex AI`.
54 | 5. Extracting and visualizing experiment parameters from [Vertex AI Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction).
55 | 6. Using `Vertex AI` for [hyperparameter tuning](https://cloud.google.com/vertex-ai/docs/training/hyperparameter-tuning-overview).
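
A minimal sketch of submitting the training as a Vertex AI custom job with a pre-built container, assuming a hypothetical project, staging bucket, and training argument (the notebook wires in the repository's `src/model_training/task.py` and its real hyperparameters):

```
from google.cloud import aiplatform as vertex_ai

vertex_ai.init(
    project="your-project-id",          # hypothetical
    location="us-central1",
    staging_bucket="gs://your-bucket",  # hypothetical
)

# Train the Keras model with a pre-built TensorFlow training container.
job = vertex_ai.CustomTrainingJob(
    display_name="chicago-taxi-tips-training",
    script_path="src/model_training/task.py",
    container_uri="us-docker.pkg.dev/vertex-ai/training/tf-cpu.2-5:latest",
    requirements=["tensorflow-transform==1.2.0", "tensorflow-data-validation==1.2.0"],
)

job.run(
    machine_type="n1-standard-4",
    replica_count=1,
    args=["--num-epochs=5"],  # hypothetical flag; see src/model_training/task.py for the actual arguments
)
```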
56 |
57 | We use [Vertex TensorBoard](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview)
58 | and [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction) to track, visualize, and compare ML experiments.
59 |
60 | In addition, the training steps are formalized by implementing a [TFX pipeline](https://www.tensorflow.org/tfx).
61 | The [03-training-formalization](03-training-formalization.ipynb) notebook covers implementing and testing the pipeline components interactively.
62 |
63 | ## Training Operationalization
64 |
65 | The [04-pipeline-deployment](04-pipeline-deployment.ipynb) notebook covers executing the CI/CD steps for the training pipeline deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [pipeline-deployment.yaml](build/pipeline-deployment.yaml) file, and consists of the following steps (an example build submission follows the list):
66 |
67 | 1. Clone the repository to the build environment.
68 | 2. Run unit tests.
69 | 3. Run a local e2e test of the `TFX` pipeline.
70 | 4. Build the ML container image for pipeline steps.
71 | 5. Compile the pipeline.
72 | 6. Upload the pipeline to `Cloud Storage`.
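
The routine can be submitted to Cloud Build from a terminal. A sketch with placeholder values; the YAML defines the full set of `_*` substitutions (e.g. `_CICD_IMAGE_URI`, `_TFX_IMAGE_URI`), all of which need to be supplied:

```
gcloud builds submit --no-source --timeout=2h \
  --config build/pipeline-deployment.yaml \
  --substitutions _REPO_URL='https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai.git',_BRANCH=main,_PROJECT=your-project-id,_REGION=us-central1
```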
73 |
74 | ## Continuous Training
75 |
76 | After testing, compiling, and uploading the pipeline definition to `Cloud Storage`, the pipeline is executed in response to a trigger.
77 | We use [Cloud Functions](https://cloud.google.com/functions) and [Cloud Pub/Sub](https://cloud.google.com/pubsub) as a triggering mechanism.
78 | The `Cloud Function` listens to the `Pub/Sub` topic, and runs the training pipeline given a message sent to the `Pub/Sub` topic.
79 | The `Cloud Function` is implemented in [src/pipeline_triggering](src/pipeline_triggering).
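
A run can therefore be started by publishing a message to the topic. A minimal sketch, assuming a hypothetical topic name and payload keys (the Cloud Function in [src/pipeline_triggering](src/pipeline_triggering) defines the actual message contract):

```
import json
from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
# Hypothetical project id and topic name.
topic_path = publisher.topic_path("your-project-id", "trigger-training-pipeline")

# Hypothetical payload forwarded to the pipeline as runtime parameters.
payload = {"num_epochs": 7, "learning_rate": 0.0015}
future = publisher.publish(topic_path, data=json.dumps(payload).encode("utf-8"))
print(future.result())  # the published message id
```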
80 |
81 | The [05-continuous-training](05-continuous-training.ipynb) notebook covers:
82 |
83 | 1. Creating a Cloud `Pub/Sub` topic.
84 | 2. Deploying a `Cloud Function`.
85 | 3. Triggering the pipeline.
86 |
87 | The end-to-end TFX training pipeline implementation is in the [src/tfx_pipelines](src/tfx_pipelines) directory, which covers the following steps:
88 |
89 | 1. Receive hyperparameters using the `hyperparam_gen` custom Python component.
90 | 2. Extract data from `BigQuery` using the `BigQueryExampleGen` component.
91 | 3. Validate the raw data using the `StatisticsGen` and `ExampleValidator` components.
92 | 4. Process the data on `Dataflow` using the `Transform` component.
93 | 5. Train a custom model with `Vertex AI` using the `Trainer` component.
94 | 6. Evaluate and validate the custom model using the `ModelEvaluator` component.
95 | 7. Save the blessed model to the model registry location in `Cloud Storage` using the `Pusher` component.
96 | 8. Upload the model to `Vertex AI` using the `vertex_model_pusher` custom Python component.
97 |
98 |
99 | ## Model Deployment
100 |
101 | The [06-model-deployment](06-model-deployment.ipynb) notebook covers executing the CI/CD steps for the model deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [build/model-deployment.yaml](build/model-deployment.yaml)
102 | file, and consists of the following steps (an example build submission follows the list):
103 |
104 | 1. Test the model interface.
105 | 2. Create an endpoint in `Vertex AI`.
106 | 3. Deploy the model to the `endpoint`.
107 | 4. Test the `Vertex AI` endpoint.
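
As with the training pipeline, the routine can be submitted from a terminal. A sketch with placeholder values; the remaining `_*` substitutions (e.g. `_CICD_IMAGE_URI`) are defined in the YAML and also need to be supplied:

```
gcloud builds submit --no-source \
  --config build/model-deployment.yaml \
  --substitutions _REPO_URL='https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai.git',_BRANCH=main,_PROJECT=your-project-id,_REGION=us-central1,_MODEL_DISPLAY_NAME=your-model-display-name,_ENDPOINT_DISPLAY_NAME=chicago-taxi-tips-classifier
```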
108 |
109 | ## Prediction Serving
110 |
111 | We serve the deployed model for prediction.
112 | The [07-prediction-serving](07-prediction-serving.ipynb) notebook covers:
113 |
114 | 1. Use the `Vertex AI` endpoint for online prediction (see the sketch after this list).
115 | 2. Use the uploaded `Vertex AI` model for batch prediction.
116 | 3. Run the batch prediction using `Vertex Pipelines`.
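
A minimal online-prediction sketch, assuming a hypothetical project and the endpoint display name used elsewhere in this example, with a single hand-crafted instance (the notebook builds its instances from the dataset):

```
from google.cloud import aiplatform as vertex_ai

vertex_ai.init(project="your-project-id", location="us-central1")  # hypothetical project

# Pick the most recently updated endpoint with the expected display name.
endpoint = vertex_ai.Endpoint.list(
    filter='display_name=chicago-taxi-tips-classifier',
    order_by="update_time",
)[-1]

instance = {
    "dropoff_grid": ["POINT(-87.6 41.9)"],
    "euclidean": [2064.2696],
    "loc_cross": [""],
    "payment_type": ["Credit Card"],
    "pickup_grid": ["POINT(-87.6 41.9)"],
    "trip_miles": [1.37],
    "trip_day": [12],
    "trip_hour": [16],
    "trip_month": [2],
    "trip_day_of_week": [4],
    "trip_seconds": [555],
}

prediction = endpoint.predict([instance])
print(prediction.predictions)
```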
117 |
118 | ## Model Monitoring
119 |
120 | After a model is deployed for prediction serving, continuous monitoring is set up to ensure that the model continues to perform as expected.
121 | The [08-model-monitoring](08-model-monitoring.ipynb) notebook covers configuring [Vertex AI Model Monitoring](https://cloud.google.com/vertex-ai/docs/model-monitoring/overview?hl=nn) for skew and drift detection:
122 |
123 | 1. Set skew and drift thresholds.
124 | 2. Create a monitoring job for all the models under an endpoint.
125 | 3. List the monitoring jobs.
126 | 4. List artifacts produced by the monitoring job.
127 | 5. Pause and delete the monitoring job.
128 |
129 |
130 | ## Metadata Tracking
131 |
132 | You can view the parameters and metrics logged by your experiments, as well as the artifacts and metadata stored by
133 | your `Vertex Pipelines` in [Cloud Console](https://console.cloud.google.com/vertex-ai/metadata).
134 |
135 | ## Disclaimer
136 |
137 | This is not an official Google product, but sample code provided for educational purposes.
138 |
139 | ---
140 |
141 | Copyright 2021 Google LLC.
142 |
143 | Licensed under the Apache License, Version 2.0 (the "License");
144 | you may not use this file except in compliance with the License.
145 | You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0
146 |
147 | Unless required by applicable law or agreed to in writing, software
148 | distributed under the License is distributed on an "AS IS" BASIS,
149 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
150 | See the License for the specific language governing permissions and
151 | limitations under the License.
152 |
153 |
154 |
155 |
156 |
157 |
158 |
--------------------------------------------------------------------------------
/build/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM gcr.io/tfx-oss-public/tfx:1.2.0
2 |
3 | RUN pip install -U pip
4 | RUN pip install google-cloud-aiplatform==1.4.2 google-cloud-aiplatform[tensorboard]
5 | RUN pip install pytest kfp==1.8.1 google-cloud-bigquery==2.26.0 google-cloud-bigquery-storage==2.7.0 google-cloud-aiplatform==1.4.2
--------------------------------------------------------------------------------
/build/model-deployment.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ######################################################################
16 | # CI/CD steps for Cloud Build to test and deploy a model to Vertex AI.
17 | ######################################################################
18 |
19 | steps:
20 |
21 | # Clone the repository.
22 | - name: 'gcr.io/cloud-builders/git'
23 | args: ['clone', '--single-branch', '--branch',
24 | '$_BRANCH', '$_REPO_URL',
25 | '--depth', '1',
26 | '--verbose']
27 | id: 'Clone Repository'
28 |
29 | # Test uploaded model artifact.
30 | - name: '$_CICD_IMAGE_URI'
31 | entrypoint: 'pytest'
32 | args: ['src/tests/model_deployment_tests.py::test_model_artifact']
33 | dir: 'mlops-with-vertex-ai'
34 | env:
35 | - 'PROJECT=$_PROJECT'
36 | - 'REGION=$_REGION'
37 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME'
38 | id: 'Test Model Artifact'
39 | waitFor: ['Clone Repository']
40 |
41 | # Create an endpoint.
42 | - name: '$_CICD_IMAGE_URI'
43 | entrypoint: 'python'
44 | args: ['build/utils.py',
45 | '--mode', 'create-endpoint',
46 | '--project', '$_PROJECT',
47 | '--region', '$_REGION',
48 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME']
49 | dir: 'mlops-with-vertex-ai'
50 | id: 'Create Endpoint'
51 | waitFor: ['Test Model Artifact']
52 |
53 | # Deploy the model.
54 | - name: '$_CICD_IMAGE_URI'
55 | entrypoint: 'python'
56 | args: ['build/utils.py',
57 | '--mode', 'deploy-model',
58 | '--project', '$_PROJECT',
59 | '--region', '$_REGION',
60 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME',
61 | '--model-display-name', '$_MODEL_DISPLAY_NAME'
62 | ]
63 | dir: 'mlops-with-vertex-ai'
64 | id: 'Deploy Model'
65 | waitFor: ['Create Endpoint']
66 |
67 | # Test deployed model endpoint.
68 | - name: '$_CICD_IMAGE_URI'
69 | entrypoint: 'pytest'
70 | args: ['src/tests/model_deployment_tests.py::test_model_endpoint']
71 | dir: 'mlops-with-vertex-ai'
72 | env:
73 | - 'PROJECT=$_PROJECT'
74 | - 'REGION=$_REGION'
75 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME'
76 | - 'ENDPOINT_DISPLAY_NAME=$_ENDPOINT_DISPLAY_NAME'
77 | id: 'Test Model Endpoint'
78 | waitFor: ['Deploy Model']
79 |
--------------------------------------------------------------------------------
/build/pipeline-deployment.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | #############################################################################
16 | # CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI.
17 | #############################################################################
18 |
19 | steps:
20 |
21 | # Clone the repository.
22 | - name: 'gcr.io/cloud-builders/git'
23 | args: ['clone', '--single-branch', '--branch',
24 | '$_BRANCH', '$_REPO_URL',
25 | '--depth', '1',
26 | '--verbose']
27 | id: 'Clone Repository'
28 |
29 |
30 | # Run datasource_utils unit tests.
31 | - name: '$_CICD_IMAGE_URI'
32 | entrypoint: 'pytest'
33 | args: ['src/tests/datasource_utils_tests.py', '-s']
34 | dir: 'mlops-with-vertex-ai'
35 | env:
36 | - 'PROJECT=$_PROJECT'
37 | - 'BQ_LOCATION=$_BQ_LOCATION'
38 | - 'BQ_DATASET_NAME=$_BQ_DATASET_NAME'
39 | - 'BQ_TABLE_NAME=$_BQ_TABLE_NAME'
40 | id: 'Unit Test Datasource Utils'
41 | waitFor: ['Clone Repository']
42 |
43 |
44 | # Run model unit tests.
45 | - name: '$_CICD_IMAGE_URI'
46 | entrypoint: 'pytest'
47 | args: ['src/tests/model_tests.py', '-s']
48 | dir: 'mlops-with-vertex-ai'
49 | id: 'Unit Test Model'
50 | waitFor: ['Clone Repository']
51 | timeout: 1800s
52 |
53 |
54 | # Test e2e pipeline using local runner.
55 | - name: '$_CICD_IMAGE_URI'
56 | entrypoint: 'pytest'
57 | args: ['src/tests/pipeline_deployment_tests.py::test_e2e_pipeline', '-s']
58 | dir: 'mlops-with-vertex-ai'
59 | env:
60 | - 'PROJECT=$_PROJECT'
61 | - 'REGION=$_REGION'
62 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME'
63 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME'
64 | - 'GCS_LOCATION=$_TEST_GCS_LOCATION'
65 | - 'TRAIN_LIMIT=$_CI_TRAIN_LIMIT'
66 | - 'TEST_LIMIT=$_CI_TEST_LIMIT'
67 | - 'UPLOAD_MODEL=$_CI_UPLOAD_MODEL'
68 | - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD'
69 | id: 'Local Test E2E Pipeline'
70 | waitFor: ['Unit Test Datasource Utils', 'Unit Test Model']
71 | timeout: 1800s
72 |
73 |
74 | # Build the image that encapsulates the pipeline.
75 | - name: 'gcr.io/cloud-builders/docker'
76 | args: ['build', '-t', '$_TFX_IMAGE_URI', '.']
77 | dir: 'mlops-with-vertex-ai'
78 | id: 'Build TFX Image'
79 | waitFor: ['Local Test E2E Pipeline']
80 |
81 |
82 | # Compile the pipeline.
83 | - name: '$_CICD_IMAGE_URI'
84 | entrypoint: 'python'
85 | args: ['build/utils.py',
86 | '--mode', 'compile-pipeline',
87 | '--pipeline-name', '$_PIPELINE_NAME'
88 | ]
89 | dir: 'mlops-with-vertex-ai'
90 | env:
91 | - 'PROJECT=$_PROJECT'
92 | - 'REGION=$_REGION'
93 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME'
94 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME'
95 | - 'GCS_LOCATION=$_GCS_LOCATION'
96 | - 'TFX_IMAGE_URI=$_TFX_IMAGE_URI'
97 | - 'BEAM_RUNNER=$_BEAM_RUNNER'
98 | - 'TRAINING_RUNNER=$_TRAINING_RUNNER'
99 | id: 'Compile Pipeline'
100 | waitFor: ['Local Test E2E Pipeline']
101 |
102 |
103 | # Upload compiled pipeline to GCS.
104 | - name: 'gcr.io/cloud-builders/gsutil'
105 | args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE']
106 | dir: 'mlops-with-vertex-ai'
107 | id: 'Upload Pipeline to GCS'
108 | waitFor: ['Compile Pipeline']
109 |
110 |
111 | # Push TFX Image to Container Registry.
112 | images: ['$_TFX_IMAGE_URI']
113 |
--------------------------------------------------------------------------------
/build/serving_resources_spec.json:
--------------------------------------------------------------------------------
1 | {
2 | "traffic_percentage": 100,
3 | "machine_type": "n1-standard-2",
4 | "min_replica_count": 1,
5 | "max_replica_count": 1,
6 | "accelerator_type": null,
7 | "accelerator_count": null
8 | }
--------------------------------------------------------------------------------
/build/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Utilities for deploying pipelines and models to Vertex AI."""
15 |
16 |
17 | import argparse
18 | import os
19 | import sys
20 | import logging
21 | import json
22 |
23 | from google.cloud import aiplatform as vertex_ai
24 |
25 |
26 | SCRIPT_DIR = os.path.dirname(
27 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
28 | )
29 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, "..")))
30 |
31 | SERVING_SPEC_FILEPATH = 'build/serving_resources_spec.json'
32 |
33 | def get_args():
34 | parser = argparse.ArgumentParser()
35 |
36 | parser.add_argument(
37 | '--mode',
38 | type=str,
39 | )
40 |
41 | parser.add_argument(
42 | '--project',
43 | type=str,
44 | )
45 |
46 | parser.add_argument(
47 | '--region',
48 | type=str,
49 | )
50 |
51 | parser.add_argument(
52 | '--endpoint-display-name',
53 | type=str,
54 | )
55 |
56 | parser.add_argument(
57 | '--model-display-name',
58 | type=str,
59 | )
60 |
61 | parser.add_argument(
62 | '--pipeline-name',
63 | type=str,
64 | )
65 |
66 | parser.add_argument(
67 | '--pipelines-store',
68 | type=str,
69 | )
70 |
71 | return parser.parse_args()
72 |
73 |
74 | def create_endpoint(project, region, endpoint_display_name):
75 | logging.info(f"Creating endpoint {endpoint_display_name}")
76 | vertex_ai.init(
77 | project=project,
78 | location=region
79 | )
80 |
81 | endpoints = vertex_ai.Endpoint.list(
82 | filter=f'display_name={endpoint_display_name}',
83 | order_by="update_time")
84 |
85 | if len(endpoints) > 0:
86 | logging.info(f"Endpoint {endpoint_display_name} already exists.")
87 | endpoint = endpoints[-1]
88 | else:
89 | endpoint = vertex_ai.Endpoint.create(endpoint_display_name)
90 | logging.info(f"Endpoint is ready.")
91 | logging.info(endpoint.gca_resource)
92 | return endpoint
93 |
94 |
95 | def deploy_model(project, region, endpoint_display_name, model_display_name, serving_resources_spec):
96 | logging.info(f"Deploying model {model_display_name} to endpoint {endpoint_display_name}")
97 | vertex_ai.init(
98 | project=project,
99 | location=region
100 | )
101 |
102 | model = vertex_ai.Model.list(
103 | filter=f'display_name={model_display_name}',
104 | order_by="update_time"
105 | )[-1]
106 |
107 | endpoint = vertex_ai.Endpoint.list(
108 | filter=f'display_name={endpoint_display_name}',
109 | order_by="update_time"
110 | )[-1]
111 |
112 | deployed_model = endpoint.deploy(model=model, **serving_resources_spec)
113 | logging.info(f"Model is deployed.")
114 | logging.info(deployed_model)
115 | return deployed_model
116 |
117 |
118 | def compile_pipeline(pipeline_name):
119 | from src.tfx_pipelines import runner
120 | pipeline_definition_file = f"{pipeline_name}.json"
121 | pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file)
122 | return pipeline_definition
123 |
124 |
125 |
126 | def main():
127 | args = get_args()
128 |
129 | if args.mode == 'create-endpoint':
130 | if not args.project:
131 | raise ValueError("project must be supplied.")
132 | if not args.region:
133 | raise ValueError("region must be supplied.")
134 | if not args.endpoint_display_name:
135 | raise ValueError("endpoint_display_name must be supplied.")
136 |
137 | result = create_endpoint(
138 | args.project,
139 | args.region,
140 | args.endpoint_display_name
141 | )
142 |
143 | elif args.mode == 'deploy-model':
144 | if not args.project:
145 | raise ValueError("project must be supplied.")
146 | if not args.region:
147 | raise ValueError("region must be supplied.")
148 | if not args.endpoint_display_name:
149 | raise ValueError("endpoint-display-name must be supplied.")
150 | if not args.model_display_name:
151 | raise ValueError("model-display-name must be supplied.")
152 |
153 | with open(SERVING_SPEC_FILEPATH) as json_file:
154 | serving_resources_spec = json.load(json_file)
155 | logging.info(f"serving resources: {serving_resources_spec}")
156 | result = deploy_model(
157 | args.project,
158 | args.region,
159 | args.endpoint_display_name,
160 | args.model_display_name,
161 | serving_resources_spec
162 | )
163 |
164 | elif args.mode == 'compile-pipeline':
165 | if not args.pipeline_name:
166 | raise ValueError("pipeline-name must be supplied.")
167 |
168 | result = compile_pipeline(args.pipeline_name)
169 |
170 | else:
171 | raise ValueError(f"Invalid mode {args.mode}.")
172 |
173 | logging.info(result)
174 |
175 |
176 | if __name__ == "__main__":
177 | main()
178 |
--------------------------------------------------------------------------------
/mlops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/mlops.png
--------------------------------------------------------------------------------
/provision/README.md:
--------------------------------------------------------------------------------
1 | # Creating a Vertex environment
2 |
3 | You can use the [Terraform](https://www.terraform.io/) scripts in the `terraform` folder to automatically provision the environment required by the samples.
4 |
5 | The scripts perform the following actions:
6 |
7 | 1. Enable the required Cloud APIs
8 | * **Essentials**: compute, iam, iamcredentials
9 | * **ML**: notebooks, aiplatform
10 | * **Data**: dataflow, bigquery, bigquerydatatransfer
11 | * **CI/CD**: cloudbuild, container, artifactregistry
12 | * **Operations**: cloudtrace, monitoring, logging, cloudresourcemanager
13 | 2. Create a regional GCS bucket.
14 | 3. Create an instance of Vertex Notebooks.
15 | 4. Create service accounts for Vertex Training and Vertex Pipelines.
16 |
17 | You can customize your configuration using the following variables:
18 |
19 | |Variable|Required|Default|Description|
20 | |--------|--------|-------|-----------|
21 | |name_prefix|Yes||Prefix added to the names of provisioned resources. **The prefix should start with a letter and include letters and digits only**.|
22 | |project_id|Yes||GCP project ID|
23 | |network_name|No|default|Name of the network for the Notebook instance. The network must already exist.|
24 | |subnet_name|No|default|Name of the subnet for the Notebook instance. The subnet must already exist.|
25 | |subnet_region|No|us-central1|Region where the subnet was created.|
26 | |zone|Yes||GCP zone for the Notebook instance. The zone must be in the region defined in the `subnet_region` variable|
27 | |machine_type|No|n1-standard-4|Machine type of the Notebook instance|
28 | |boot_disk_size|No|200GB|Size of the Notebook instance's boot disk|
29 | |image_family|No|base-cpu|Image family for the Notebook instance|
30 | |gpu_type|No|null|GPU type of the Notebook instance. By default, the Notebook instance will be provisioned without a GPU|
31 | |gpu_count|No|null|GPU count of the Notebook instance|
32 | |install_gpu_driver|No|false|Whether to install a GPU driver|
33 | |region|No|Set to subnet_region.|GCP region for the GCS bucket and Artifact Registry. It is recommended that the same region is used for all: the bucket, the registry and the Notebook instance. If not provided the `region` will be set to `subnet_region`.|
34 | |force_destroy|No|false|Whether to force the removal of the bucket on terraform destroy. **Note that by default the bucket will not be destroyed**.|
35 |
36 |
37 | To provision the environment:
38 |
39 | 1. Open [Cloud Shell](https://cloud.google.com/shell/docs/launching-cloud-shell)
40 |
41 | 2. Download the installation scripts
42 | ```
43 | SRC_REPO=https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai.git
44 | LOCAL_DIR=provision
45 | kpt pkg get $SRC_REPO/provision@main $LOCAL_DIR
46 | cd $LOCAL_DIR/terraform
47 | ```
48 |
49 | 3. Update the `terraform.tfvars` file with the values reflecting your environment. Alternatively, you can provide the values using the Terraform CLI `-var` options when you execute `terraform apply` in the next step.
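
For example, when you run `terraform apply` in the next step, placeholder values (substitute your own project id, zone, and name prefix) could be passed as follows:
```
terraform apply \
  -var 'project_id=your-project-id' \
  -var 'zone=us-central1-a' \
  -var 'name_prefix=your-prefix'
```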
50 |
51 | 4. Execute the following commands. :
52 | ```
53 | terraform init
54 | terraform apply
55 | ```
56 |
57 |
58 | To destroy the environment, execute:
59 | ```
60 | terraform destroy
61 | ```
62 |
--------------------------------------------------------------------------------
/provision/terraform/gcs-bucket.tf:
--------------------------------------------------------------------------------
1 |
2 | # Copyright 2021 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | resource "google_storage_bucket" "artifact_repo" {
18 | project = module.project-services.project_id
19 | name = "${var.name_prefix}-bucket"
20 | location = local.region
21 | storage_class = local.bucket_type
22 | force_destroy = var.force_destroy
23 | uniform_bucket_level_access = true
24 | }
--------------------------------------------------------------------------------
/provision/terraform/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | terraform {
16 | required_version = ">= 0.14"
17 | required_providers {
18 | google = "~> 3.6"
19 | }
20 | }
21 |
22 | provider "google" {
23 | project = var.project_id
24 | }
25 |
26 | data "google_project" "project" {
27 | project_id = var.project_id
28 | }
29 |
30 | locals {
31 | bucket_type = "REGIONAL"
32 | region = var.region == null ? var.subnet_region : var.region
33 | }
34 |
35 |
36 |
--------------------------------------------------------------------------------
/provision/terraform/notebook-instance.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | locals {
16 | image_project = "deeplearning-platform-release"
17 | }
18 |
19 | data "google_compute_network" "vm_network" {
20 | project = module.project-services.project_id
21 | name = var.network_name
22 |
23 | depends_on = [
24 | module.project-services
25 | ]
26 | }
27 |
28 | data "google_compute_subnetwork" "vm_subnetwork" {
29 | project = module.project-services.project_id
30 | name = var.subnet_name
31 | region = var.subnet_region
32 |
33 | depends_on = [
34 | module.project-services
35 | ]
36 | }
37 |
38 | resource "google_notebooks_instance" "notebook_instance" {
39 | project = module.project-services.project_id
40 | name = "${var.name_prefix}-notebook"
41 | machine_type = var.machine_type
42 | location = var.zone
43 |
44 | network = data.google_compute_network.vm_network.id
45 | subnet = data.google_compute_subnetwork.vm_subnetwork.id
46 |
47 | vm_image {
48 | project = local.image_project
49 | image_family = var.image_family
50 | }
51 |
52 | dynamic accelerator_config {
53 | for_each = var.gpu_type != null ? [1] : []
54 | content {
55 | type = var.gpu_type
56 | core_count = var.gpu_count
57 | }
58 | }
59 |
60 | install_gpu_driver = var.install_gpu_driver
61 |
62 | boot_disk_size_gb = var.boot_disk_size
63 | }
64 |
--------------------------------------------------------------------------------
/provision/terraform/service-accounts.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Create Vertex Training service account
16 | resource "google_service_account" "training_sa" {
17 | project = module.project-services.project_id
18 | account_id = var.training_sa_name
19 | display_name = "Vertex Training service account"
20 | }
21 |
22 | # Create Vertex Training SA role bindings
23 | resource "google_project_iam_member" "training_sa_role_bindings" {
24 | project = module.project-services.project_id
25 | for_each = toset(var.training_sa_roles)
26 | member = "serviceAccount:${google_service_account.training_sa.email}"
27 | role = "roles/${each.value}"
28 | }
29 |
30 | # Create Vertex Pipelines service account
31 | resource "google_service_account" "pipelines_sa" {
32 | project = module.project-services.project_id
33 | account_id = var.pipelines_sa_name
34 | display_name = "Vertex Pipelines account name"
35 | }
36 |
37 | # Create Vertex Pipelines SA role bindings
38 | resource "google_project_iam_member" "role_bindings" {
39 | project = module.project-services.project_id
40 | for_each = toset(var.pipelines_sa_roles)
41 | member = "serviceAccount:${google_service_account.pipelines_sa.email}"
42 | role = "roles/${each.value}"
43 | }
44 |
--------------------------------------------------------------------------------
/provision/terraform/services.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | module "project-services" {
17 | source = "terraform-google-modules/project-factory/google//modules/project_services"
18 |
19 | project_id = data.google_project.project.project_id
20 |
21 | disable_services_on_destroy = false
22 | activate_apis = [
23 | "compute.googleapis.com",
24 | "iam.googleapis.com",
25 | "container.googleapis.com",
26 | "artifactregistry.googleapis.com",
27 | "cloudresourcemanager.googleapis.com",
28 | "cloudtrace.googleapis.com",
29 | "iamcredentials.googleapis.com",
30 | "monitoring.googleapis.com",
31 | "logging.googleapis.com",
32 | "notebooks.googleapis.com",
33 | "aiplatform.googleapis.com",
34 | "dataflow.googleapis.com",
35 | "bigquery.googleapis.com",
36 | "cloudbuild.googleapis.com",
37 | "bigquerydatatransfer.googleapis.com",
38 | "cloudfunctions.googleapis.com"
39 | ]
40 | }
--------------------------------------------------------------------------------
/provision/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | project_id = "vertex-mlops"
2 | subnet_region = "us-central1"
3 | zone = "us-central1-a"
4 | name_prefix = "vertex-mlops"
5 | machine_type = "n1-standard-8"
6 | #gpu_type = "NVIDIA_TESLA_T4"
7 | #gpu_count = 1
8 | #install_gpu_driver = true
9 | #image_family = "common-gpu"
10 |
11 |
12 |
--------------------------------------------------------------------------------
/provision/terraform/variables.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | variable "project_id" {
17 | description = "The GCP project ID"
18 | type = string
19 | }
20 |
21 | variable "region" {
22 | description = "The region for the GCS bucket and Artifact Registry"
23 | type = string
24 | default = null
25 | }
26 |
27 | variable "zone" {
28 | description = "The zone for a Vertex Notebook instance"
29 | type = string
30 | }
31 |
32 | variable "name_prefix" {
33 | description = "The name prefix to add to the resource names"
34 | type = string
35 | }
36 |
37 | variable "machine_type" {
38 | description = "The Notebook instance's machine type"
39 | type = string
40 | }
41 |
42 | variable "network_name" {
43 | description = "The network name for the Notebook instance"
44 | type = string
45 | default = "default"
46 | }
47 |
48 | variable "subnet_name" {
49 | description = "The subnet name for the Notebook instance"
50 | type = string
51 | default = "default"
52 | }
53 |
54 | variable "subnet_region" {
55 | description = "The region for the Notebook subnet"
56 | type = string
57 | default = "us-central1"
58 | }
59 |
60 | variable "boot_disk_size" {
61 | description = "The size of the boot disk"
62 | default = 200
63 | }
64 |
65 | variable "image_family" {
66 | description = "A Deep Learning image family for the Notebook instance"
67 | type = string
68 | default = "common-cpu"
69 | }
70 |
71 | variable "gpu_type" {
72 | description = "A GPU type for the Notebook instance"
73 | type = string
74 | default = null
75 | }
76 |
77 | variable "gpu_count" {
78 | description = "A GPU count for the Notebook instance"
79 | type = string
80 | default = null
81 | }
82 |
83 | variable "install_gpu_driver" {
84 | description = "Whether to install GPU driver"
85 | type = bool
86 | default = true
87 | }
88 |
89 | variable "force_destroy" {
90 | description = "Whether to remove the bucket on destroy"
91 | type = bool
92 | default = false
93 | }
94 |
95 | variable "training_sa_roles" {
96 | description = "The roles to assign to the Vertex Training service account"
97 | default = [
98 | "storage.admin",
99 | "aiplatform.user",
100 | "bigquery.admin"
101 | ]
102 | }
103 |
104 | variable "pipelines_sa_roles" {
105 | description = "The roles to assign to the Vertex Pipelines service account"
106 | default = [
107 | "storage.admin",
108 | "bigquery.admin",
109 | "aiplatform.user"
110 | ]
111 | }
112 |
113 | variable "training_sa_name" {
114 | description = "Vertex training service account name."
115 | default = "training-sa"
116 | }
117 |
118 | variable "pipelines_sa_name" {
119 | description = "Vertex pipelines service account name."
120 | default = "pipelines-sa"
121 | }
122 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | kfp==1.8.1
2 | google-cloud-bigquery==2.26.0
3 | google-cloud-bigquery-storage==2.7.0
4 | google-cloud-aiplatform==1.4.2
5 | cloudml-hypertune==0.1.0.dev6
6 | pytest
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | REQUIRED_PACKAGES = [
4 | "google-cloud-aiplatform==1.4.2",
5 | "tensorflow-transform==1.2.0",
6 | "tensorflow-data-validation==1.2.0",
7 | "cloudml-hypertune==0.1.0.dev6"
8 | ]
9 |
10 | setuptools.setup(
11 | name="executor",
12 | version="0.0.1",
13 | install_requires=REQUIRED_PACKAGES,
14 | packages=setuptools.find_packages(),
15 | include_package_data=True,
16 | package_data={"src": ["raw_schema/schema.pbtxt"]},
17 | )
18 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/__init__.py
--------------------------------------------------------------------------------
/src/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/common/__init__.py
--------------------------------------------------------------------------------
/src/common/datasource_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Utilities for generating BigQuery data querying scripts."""
15 |
16 |
17 | from google.cloud import aiplatform as vertex_ai
18 |
19 |
20 | def _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit=None):
21 | query = f"""
22 | SELECT
23 | IF(trip_month IS NULL, -1, trip_month) trip_month,
24 | IF(trip_day IS NULL, -1, trip_day) trip_day,
25 | IF(trip_day_of_week IS NULL, -1, trip_day_of_week) trip_day_of_week,
26 | IF(trip_hour IS NULL, -1, trip_hour) trip_hour,
27 | IF(trip_seconds IS NULL, -1, trip_seconds) trip_seconds,
28 | IF(trip_miles IS NULL, -1, trip_miles) trip_miles,
29 | IF(payment_type IS NULL, 'NA', payment_type) payment_type,
30 | IF(pickup_grid IS NULL, 'NA', pickup_grid) pickup_grid,
31 | IF(dropoff_grid IS NULL, 'NA', dropoff_grid) dropoff_grid,
32 | IF(euclidean IS NULL, -1, euclidean) euclidean,
33 | IF(loc_cross IS NULL, 'NA', loc_cross) loc_cross"""
34 | if ml_use:
35 | query += f""",
36 | tip_bin
37 | FROM {bq_dataset_name}.{bq_table_name}
38 | WHERE ML_use = '{ml_use}'
39 | """
40 | else:
41 | query += f"""
42 | FROM {bq_dataset_name}.{bq_table_name}
43 | """
44 | if limit:
45 | query += f"LIMIT {limit}"
46 |
47 | return query
48 |
49 |
50 | def get_training_source_query(
51 | project, region, dataset_display_name, ml_use, limit=None
52 | ):
53 | vertex_ai.init(project=project, location=region)
54 |
55 | dataset = vertex_ai.TabularDataset.list(
56 | filter=f"display_name={dataset_display_name}", order_by="update_time"
57 | )[-1]
58 | bq_source_uri = dataset.gca_resource.metadata["inputConfig"]["bigquerySource"][
59 | "uri"
60 | ]
61 | _, bq_dataset_name, bq_table_name = bq_source_uri.replace("bq://", "").split(".")
62 |
63 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit)
64 |
65 |
66 | def get_serving_source_query(bq_dataset_name, bq_table_name, limit=None):
67 |
68 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use=None, limit=limit)
69 |
--------------------------------------------------------------------------------
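
A minimal usage sketch for the query helpers above. The dataset and table names, ML-use value, and row limit below are placeholders, not values defined by the repository:

```python
# Sketch: build the training and serving queries locally (no BigQuery call is made).
from src.common import datasource_utils

# Placeholder dataset/table names.
training_query = datasource_utils._get_source_query(
    bq_dataset_name="chicago_taxi",
    bq_table_name="taxi_features",
    ml_use="UNASSIGNED",
    limit=100,
)
serving_query = datasource_utils.get_serving_source_query(
    bq_dataset_name="chicago_taxi",
    bq_table_name="taxi_features",
    limit=100,
)

print(training_query)  # selects the feature columns plus tip_bin, filtered by ML_use
print(serving_query)   # selects the feature columns only, with no label
```
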
/src/common/features.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Model features metadata utils."""
15 |
16 |
17 | FEATURE_NAMES = [
18 | "trip_month",
19 | "trip_day",
20 | "trip_day_of_week",
21 | "trip_hour",
22 | "trip_seconds",
23 | "trip_miles",
24 | "payment_type",
25 | "pickup_grid",
26 | "dropoff_grid",
27 | "euclidean",
28 | "loc_cross",
29 | ]
30 |
31 | TARGET_FEATURE_NAME = "tip_bin"
32 |
33 | TARGET_LABELS = ["tip<20%", "tip>=20%"]
34 |
35 | NUMERICAL_FEATURE_NAMES = [
36 | "trip_seconds",
37 | "trip_miles",
38 | "euclidean",
39 | ]
40 |
41 | EMBEDDING_CATEGORICAL_FEATURES = {
42 | "trip_month": 2,
43 | "trip_day": 4,
44 | "trip_hour": 3,
45 | "pickup_grid": 3,
46 | "dropoff_grid": 3,
47 | "loc_cross": 10,
48 | }
49 |
50 | ONEHOT_CATEGORICAL_FEATURE_NAMES = ["payment_type", "trip_day_of_week"]
51 |
52 |
53 | def transformed_name(key: str) -> str:
54 | """Generate the name of the transformed feature from original name."""
55 | return f"{key}_xf"
56 |
57 |
58 | def original_name(key: str) -> str:
59 | """Generate the name of the original feature from transformed name."""
60 | return key.replace("_xf", "")
61 |
62 |
63 | def vocabulary_name(key: str) -> str:
64 | """Generate the name of the vocabulary feature from original name."""
65 | return f"{key}_vocab"
66 |
67 |
68 | def categorical_feature_names() -> list:
69 | return (
70 | list(EMBEDDING_CATEGORICAL_FEATURES.keys()) + ONEHOT_CATEGORICAL_FEATURE_NAMES
71 | )
72 |
73 |
74 | def generate_explanation_config():
75 | explanation_config = {
76 | "inputs": {},
77 | "outputs": {},
78 | "params": {"sampled_shapley_attribution": {"path_count": 10}},
79 | }
80 |
81 | for feature_name in FEATURE_NAMES:
82 | if feature_name in NUMERICAL_FEATURE_NAMES:
83 | explanation_config["inputs"][feature_name] = {
84 | "input_tensor_name": feature_name,
85 | "modality": "numeric",
86 | }
87 | else:
88 | explanation_config["inputs"][feature_name] = {
89 | "input_tensor_name": feature_name,
90 | "encoding": 'IDENTITY',
91 | "modality": "categorical",
92 | }
93 |
94 | explanation_config["outputs"] = {"scores": {"output_tensor_name": "scores"}}
95 |
96 | return explanation_config
97 |
--------------------------------------------------------------------------------
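
A brief sketch of how the feature-metadata helpers above are typically used; the printed values follow directly from the constants defined in this module:

```python
# Sketch: exercise the naming helpers and the explanation config.
from src.common import features

print(features.transformed_name("trip_miles"))   # 'trip_miles_xf'
print(features.original_name("trip_miles_xf"))   # 'trip_miles'
print(features.vocabulary_name("payment_type"))  # 'payment_type_vocab'
print(features.categorical_feature_names())      # embedding + one-hot feature names

explanation_config = features.generate_explanation_config()
print(explanation_config["params"])  # sampled Shapley attribution with path_count=10
```
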
/src/model_training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/model_training/__init__.py
--------------------------------------------------------------------------------
/src/model_training/data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Functions for reading data as tf.data.Dataset."""
15 |
16 | import tensorflow as tf
17 |
18 | from src.common import features
19 |
20 |
21 | def _gzip_reader_fn(filenames):
22 | """Small utility returning a record reader that can read gzip'ed files."""
23 | return tf.data.TFRecordDataset(filenames, compression_type="GZIP")
24 |
25 |
26 | def get_dataset(file_pattern, feature_spec, batch_size=200):
27 | """Generates features and label for tuning/training.
28 | Args:
29 | file_pattern: input tfrecord file pattern.
30 | feature_spec: a dictionary of feature specifications.
31 | batch_size: the number of consecutive elements of the returned
32 | dataset to combine in a single batch.
33 | Returns:
34 | A dataset that contains (features, indices) tuple where features is a
35 | dictionary of Tensors, and indices is a single Tensor of label indices.
36 | """
37 |
38 | dataset = tf.data.experimental.make_batched_features_dataset(
39 | file_pattern=file_pattern,
40 | batch_size=batch_size,
41 | features=feature_spec,
42 | label_key=features.TARGET_FEATURE_NAME,
43 | reader=_gzip_reader_fn,
44 | num_epochs=1,
45 | drop_final_batch=True,
46 | )
47 |
48 | return dataset
49 |
--------------------------------------------------------------------------------
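
A hedged sketch of reading the transformed TFRecords with get_dataset; the GCS paths are placeholders for artifacts produced by the preprocessing pipeline:

```python
# Sketch: load transformed training data as a tf.data.Dataset.
import tensorflow_transform as tft

from src.model_training import data

# Placeholder locations of the transform artifacts and transformed data.
tft_output = tft.TFTransformOutput("gs://example-bucket/transform_artifacts")
feature_spec = tft_output.transformed_feature_spec()

dataset = data.get_dataset(
    file_pattern="gs://example-bucket/transformed_data/train/data-*.gz",
    feature_spec=feature_spec,
    batch_size=512,
)

# Each element is a (features, labels) pair because label_key is set.
for feature_batch, label_batch in dataset.take(1):
    print(sorted(feature_batch.keys()), label_batch.shape)
```
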
/src/model_training/defaults.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Defaults for the model.
15 |
16 | These values can be tweaked to affect model training performance.
17 | """
18 |
19 |
20 | HIDDEN_UNITS = [64, 32]
21 | LEARNING_RATE = 0.0001
22 | BATCH_SIZE = 512
23 | NUM_EPOCHS = 10
24 | NUM_EVAL_STEPS = 100
25 |
26 |
27 | def update_hyperparams(hyperparams: dict) -> dict:
28 | if "hidden_units" not in hyperparams:
29 | hyperparams["hidden_units"] = HIDDEN_UNITS
30 | else:
31 | if not isinstance(hyperparams["hidden_units"], list):
32 | hyperparams["hidden_units"] = [
33 | int(v) for v in hyperparams["hidden_units"].split(",")
34 | ]
35 | if "learning_rate" not in hyperparams:
36 | hyperparams["learning_rate"] = LEARNING_RATE
37 | if "batch_size" not in hyperparams:
38 | hyperparams["batch_size"] = BATCH_SIZE
39 | if "num_epochs" not in hyperparams:
40 | hyperparams["num_epochs"] = NUM_EPOCHS
41 | return hyperparams
42 |
--------------------------------------------------------------------------------
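
A short sketch showing how update_hyperparams fills in defaults and normalizes hidden_units; the input values are illustrative:

```python
# Sketch: missing keys fall back to the module-level defaults.
from src.model_training import defaults

hyperparams = defaults.update_hyperparams({"hidden_units": "128,64", "num_epochs": 3})
print(hyperparams["hidden_units"])   # [128, 64] -- parsed from the comma-separated string
print(hyperparams["learning_rate"])  # 0.0001 -- default applied
print(hyperparams["batch_size"])     # 512 -- default applied
print(hyperparams["num_epochs"])     # 3 -- caller-provided value kept
```
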
/src/model_training/exporter.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Functions for exporting the model for serving."""
15 |
16 | import logging
17 |
18 | import tensorflow as tf
19 | import tensorflow_transform as tft
20 | import tensorflow_data_validation as tfdv
21 | from tensorflow_transform.tf_metadata import schema_utils
22 | import tensorflow.keras as keras
23 |
24 | from src.common import features
25 |
26 |
27 | def _get_serve_tf_examples_fn(classifier, tft_output, raw_feature_spec):
28 | """Returns a function that parses a serialized tf.Example and applies TFT."""
29 |
30 | classifier.tft_layer = tft_output.transform_features_layer()
31 |
32 | @tf.function
33 | def serve_tf_examples_fn(serialized_tf_examples):
34 | """Returns the output to be used in the serving signature."""
35 | for key in list(raw_feature_spec.keys()):
36 | if key not in features.FEATURE_NAMES:
37 | raw_feature_spec.pop(key)
38 |
39 | parsed_features = tf.io.parse_example(serialized_tf_examples, raw_feature_spec)
40 |
41 | transformed_features = classifier.tft_layer(parsed_features)
42 | logits = classifier(transformed_features)
43 | probabilities = keras.activations.sigmoid(logits)
44 | return {"probabilities": probabilities}
45 |
46 | return serve_tf_examples_fn
47 |
48 |
49 | def _get_serve_features_fn(classifier, tft_output):
50 | """Returns a function that accepts a dictionary of features and applies TFT."""
51 |
52 | classifier.tft_layer = tft_output.transform_features_layer()
53 |
54 | @tf.function
55 | def serve_features_fn(raw_features):
56 | """Returns the output to be used in the serving signature."""
57 |
58 | transformed_features = classifier.tft_layer(raw_features)
59 | logits = classifier(transformed_features)
60 | pos_probabilities = keras.activations.sigmoid(logits)
61 | neg_probabilities = 1 - pos_probabilities
62 | probabilities = tf.concat([neg_probabilities, pos_probabilities], -1)
63 | batch_size = tf.shape(probabilities)[0]
64 | classes = tf.repeat([features.TARGET_LABELS], [batch_size], axis=0)
65 | return {"classes": classes, "scores": probabilities}
66 |
67 | return serve_features_fn
68 |
69 |
70 | def export_serving_model(
71 | classifier, serving_model_dir, raw_schema_location, tft_output_dir
72 | ):
73 |
74 | raw_schema = tfdv.load_schema_text(raw_schema_location)
75 | raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec
76 |
77 | tft_output = tft.TFTransformOutput(tft_output_dir)
78 |
79 | features_input_signature = {
80 | feature_name: tf.TensorSpec(
81 | shape=(None, 1), dtype=spec.dtype, name=feature_name
82 | )
83 | for feature_name, spec in raw_feature_spec.items()
84 | if feature_name in features.FEATURE_NAMES
85 | }
86 |
87 | signatures = {
88 | "serving_default": _get_serve_features_fn(
89 | classifier, tft_output
90 | ).get_concrete_function(features_input_signature),
91 | "serving_tf_example": _get_serve_tf_examples_fn(
92 | classifier, tft_output, raw_feature_spec
93 | ).get_concrete_function(
94 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")
95 | ),
96 | }
97 |
98 | logging.info("Model export started...")
99 | classifier.save(serving_model_dir, signatures=signatures)
100 | logging.info("Model export completed.")
101 |
--------------------------------------------------------------------------------
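
Once export_serving_model has written the SavedModel, the two signatures it registers can be inspected as sketched below; the directory name is a placeholder:

```python
# Sketch: load the exported SavedModel and list its serving signatures.
import tensorflow as tf

saved_model = tf.saved_model.load("serving_model_dir")  # placeholder path
print(list(saved_model.signatures.keys()))
# Expected to include:
#   'serving_default'    -- raw feature tensors in, {'classes', 'scores'} out
#   'serving_tf_example' -- serialized tf.Example strings in, {'probabilities'} out
```
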
/src/model_training/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """A DNN keras classification model."""
15 |
16 | import tensorflow as tf
17 | from tensorflow import keras
18 |
19 | from src.common import features
20 |
21 |
22 | def create_model_inputs():
23 | inputs = {}
24 | for feature_name in features.FEATURE_NAMES:
25 | name = features.transformed_name(feature_name)
26 | if feature_name in features.NUMERICAL_FEATURE_NAMES:
27 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.float32)
28 | elif feature_name in features.categorical_feature_names():
29 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.int64)
30 | else:
31 | pass
32 | return inputs
33 |
34 |
35 | def _create_binary_classifier(feature_vocab_sizes, hyperparams):
36 | input_layers = create_model_inputs()
37 |
38 | layers = []
39 | for key in input_layers:
40 | feature_name = features.original_name(key)
41 | if feature_name in features.EMBEDDING_CATEGORICAL_FEATURES:
42 | vocab_size = feature_vocab_sizes[feature_name]
43 | embedding_size = features.EMBEDDING_CATEGORICAL_FEATURES[feature_name]
44 | embedding_output = keras.layers.Embedding(
45 | input_dim=vocab_size + 1,
46 | output_dim=embedding_size,
47 | name=f"{key}_embedding",
48 | )(input_layers[key])
49 | layers.append(embedding_output)
50 | elif feature_name in features.ONEHOT_CATEGORICAL_FEATURE_NAMES:
51 | vocab_size = feature_vocab_sizes[feature_name]
52 | onehot_layer = keras.layers.experimental.preprocessing.CategoryEncoding(
53 | max_tokens=vocab_size,
54 | output_mode="binary",
55 | name=f"{key}_onehot",
56 | )(input_layers[key])
57 | layers.append(onehot_layer)
58 | elif feature_name in features.NUMERICAL_FEATURE_NAMES:
59 | numeric_layer = tf.expand_dims(input_layers[key], -1)
60 | layers.append(numeric_layer)
61 | else:
62 | pass
63 |
64 | joined = keras.layers.Concatenate(name="combines_inputs")(layers)
65 | feedforward_output = keras.Sequential(
66 | [
67 | keras.layers.Dense(units, activation="relu")
68 | for units in hyperparams["hidden_units"]
69 | ],
70 | name="feedforward_network",
71 | )(joined)
72 | logits = keras.layers.Dense(units=1, name="logits")(feedforward_output)
73 |
74 | model = keras.Model(inputs=input_layers, outputs=[logits])
75 | return model
76 |
77 |
78 | def create_binary_classifier(tft_output, hyperparams):
79 | feature_vocab_sizes = dict()
80 | for feature_name in features.categorical_feature_names():
81 | feature_vocab_sizes[feature_name] = tft_output.vocabulary_size_by_name(
82 | feature_name
83 | )
84 |
85 | return _create_binary_classifier(feature_vocab_sizes, hyperparams)
86 |
--------------------------------------------------------------------------------
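
Because _create_binary_classifier only needs a vocabulary-size map and hyperparameters, the architecture can be sketched without a real TFT output. The vocabulary sizes below are illustrative guesses, and the snippet assumes the TensorFlow version implied by the repository's pinned TFX/TFT dependencies (which still exposes the experimental CategoryEncoding layer):

```python
# Sketch: build the classifier from hypothetical vocabulary sizes.
from src.model_training import model

feature_vocab_sizes = {
    "trip_month": 12, "trip_day": 31, "trip_hour": 24,
    "pickup_grid": 13, "dropoff_grid": 13, "loc_cross": 50,
    "payment_type": 8, "trip_day_of_week": 7,
}
hyperparams = {"hidden_units": [64, 32]}

classifier = model._create_binary_classifier(feature_vocab_sizes, hyperparams)
classifier.summary()  # embeddings + one-hot encodings -> concat -> dense stack -> 1 logit
```
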
/src/model_training/runner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """A run_fn method called by the TFX Trainer component."""
15 |
16 | import os
17 | import logging
18 |
19 | from src.model_training import trainer, exporter, defaults
20 |
21 |
22 | # TFX Trainer will call this function.
23 | def run_fn(fn_args):
24 | """Train the model based on given args.
25 | Args:
26 | fn_args: Holds args used to train the model as name/value pairs.
27 | """
28 | logging.info("Runner started...")
29 | logging.info(f"fn_args: {fn_args}")
30 | logging.info("")
31 |
32 | try:
33 | log_dir = fn_args.model_run_dir
34 | except (KeyError, AttributeError):
35 | log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs")
36 |
37 | hyperparams = fn_args.hyperparameters
38 | if not hyperparams:
39 | hyperparams = dict()
40 |
41 | hyperparams = defaults.update_hyperparams(hyperparams)
42 | logging.info("Hyperparameters:")
43 | logging.info(hyperparams)
44 | logging.info("")
45 |
46 | logging.info("Runner executing trainer...")
47 | classifier = trainer.train(
48 | train_data_dir=fn_args.train_files,
49 | eval_data_dir=fn_args.eval_files,
50 | tft_output_dir=fn_args.transform_output,
51 | hyperparams=hyperparams,
52 | log_dir=log_dir,
53 | base_model_dir=fn_args.base_model,
54 | )
55 |
56 | logging.info("Runner executing exporter...")
57 | exporter.export_serving_model(
58 | classifier=classifier,
59 | serving_model_dir=fn_args.serving_model_dir,
60 | raw_schema_location=fn_args.schema_path,
61 | tft_output_dir=fn_args.transform_output,
62 | )
63 | logging.info("Runner completed.")
64 |
--------------------------------------------------------------------------------
/src/model_training/task.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """The entrypoint for the Vertex training job."""
15 |
16 | import os
17 | import sys
18 | from datetime import datetime
19 | import logging
20 | import tensorflow as tf
21 | from tensorflow.python.client import device_lib
22 | import argparse
23 |
24 | from google.cloud import aiplatform as vertex_ai
25 | import hypertune
26 |
27 | from src.model_training import defaults, trainer, exporter
28 |
29 |
30 | dirname = os.path.dirname(__file__)
31 | dirname = dirname.replace("/model_training", "")
32 | RAW_SCHEMA_LOCATION = os.path.join(dirname, "raw_schema/schema.pbtxt")
33 | HYPERTUNE_METRIC_NAME = 'ACCURACY'
34 |
35 |
36 | def get_args():
37 | parser = argparse.ArgumentParser()
38 |
39 | parser.add_argument(
40 | "--model-dir",
41 | default=os.getenv("AIP_MODEL_DIR"),
42 | type=str,
43 | )
44 |
45 | parser.add_argument(
46 | "--log-dir",
47 | default=os.getenv("AIP_TENSORBOARD_LOG_DIR"),
48 | type=str,
49 | )
50 |
51 | parser.add_argument(
52 | "--train-data-dir",
53 | type=str,
54 | )
55 |
56 | parser.add_argument(
57 | "--eval-data-dir",
58 | type=str,
59 | )
60 |
61 | parser.add_argument(
62 | "--tft-output-dir",
63 | type=str,
64 | )
65 |
66 | parser.add_argument("--learning-rate", default=0.001, type=float)
67 | parser.add_argument("--batch-size", default=512, type=int)
68 | parser.add_argument("--hidden-units", default="64,32", type=str)
69 | parser.add_argument("--num-epochs", default=10, type=int)
70 |
71 | parser.add_argument("--project", type=str)
72 | parser.add_argument("--region", type=str)
73 | parser.add_argument("--staging-bucket", type=str)
74 | parser.add_argument("--experiment-name", type=str)
75 | parser.add_argument("--run-name", type=str)
76 |
77 | return parser.parse_args()
78 |
79 |
80 | def main():
81 | args = get_args()
82 |
83 | hyperparams = vars(args)
84 | hyperparams = defaults.update_hyperparams(hyperparams)
85 | logging.info(f"Hyperparameters: {hyperparams}")
86 |
87 | if args.experiment_name:
88 | vertex_ai.init(
89 | project=args.project,
90 | staging_bucket=args.staging_bucket,
91 | experiment=args.experiment_name,
92 | )
93 |
94 | logging.info(f"Using Vertex AI experiment: {args.experiment_name}")
95 |
96 | run_id = args.run_name
97 | if not run_id:
98 | run_id = f"run-gcp-{datetime.now().strftime('%Y%m%d%H%M%S')}"
99 |
100 | vertex_ai.start_run(run_id)
101 | logging.info(f"Run {run_id} started.")
102 |
103 | vertex_ai.log_params(hyperparams)
104 |
105 | classifier = trainer.train(
106 | train_data_dir=args.train_data_dir,
107 | eval_data_dir=args.eval_data_dir,
108 | tft_output_dir=args.tft_output_dir,
109 | hyperparams=hyperparams,
110 | log_dir=args.log_dir,
111 | )
112 |
113 | val_loss, val_accuracy = trainer.evaluate(
114 | model=classifier,
115 | data_dir=args.eval_data_dir,
116 | raw_schema_location=RAW_SCHEMA_LOCATION,
117 | tft_output_dir=args.tft_output_dir,
118 | hyperparams=hyperparams,
119 | )
120 |
121 |
122 | # Report val_accuracy to Vertex hypertuner.
123 | logging.info(f'Reporting metric {HYPERTUNE_METRIC_NAME}={val_accuracy} to Vertex hypertuner...')
124 | hpt = hypertune.HyperTune()
125 | hpt.report_hyperparameter_tuning_metric(
126 | hyperparameter_metric_tag=HYPERTUNE_METRIC_NAME,
127 | metric_value=val_accuracy,
128 | global_step=args.num_epochs * args.batch_size
129 | )
130 |
131 | # Log metrics in Vertex Experiments.
132 | logging.info(f'Logging metrics to Vertex Experiments...')
133 | if args.experiment_name:
134 | vertex_ai.log_metrics({"val_loss": val_loss, "val_accuracy": val_accuracy})
135 |
136 | try:
137 | exporter.export_serving_model(
138 | classifier=classifier,
139 | serving_model_dir=args.model_dir,
140 | raw_schema_location=RAW_SCHEMA_LOCATION,
141 | tft_output_dir=args.tft_output_dir,
142 | )
143 | except Exception:
144 | # Ignore errors raised while exporting the model.
145 | pass
146 |
147 |
148 | if __name__ == "__main__":
149 | logging.getLogger().setLevel(logging.INFO)
150 | logging.info(f"Python Version = {sys.version}")
151 | logging.info(f"TensorFlow Version = {tf.__version__}")
152 | logging.info(f'TF_CONFIG = {os.environ.get("TF_CONFIG", "Not found")}')
153 | logging.info(f"DEVICES = {device_lib.list_local_devices()}")
154 | logging.info(f"Task started...")
155 | main()
156 | logging.info(f"Task completed.")
157 |
--------------------------------------------------------------------------------
/src/model_training/trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Train and evaluate the model."""
15 |
16 | import logging
17 | import tensorflow as tf
18 | import tensorflow_transform as tft
19 | from tensorflow import keras
20 |
21 |
22 | from src.model_training import data, model
23 |
24 |
25 | def train(
26 | train_data_dir,
27 | eval_data_dir,
28 | tft_output_dir,
29 | hyperparams,
30 | log_dir,
31 | base_model_dir=None,
32 | ):
33 |
34 | logging.info(f"Loading tft output from {tft_output_dir}")
35 | tft_output = tft.TFTransformOutput(tft_output_dir)
36 | transformed_feature_spec = tft_output.transformed_feature_spec()
37 |
38 | train_dataset = data.get_dataset(
39 | train_data_dir,
40 | transformed_feature_spec,
41 | hyperparams["batch_size"],
42 | )
43 |
44 | eval_dataset = data.get_dataset(
45 | eval_data_dir,
46 | transformed_feature_spec,
47 | hyperparams["batch_size"],
48 | )
49 |
50 | optimizer = keras.optimizers.Adam(learning_rate=hyperparams["learning_rate"])
51 | loss = keras.losses.BinaryCrossentropy(from_logits=True)
52 | metrics = [keras.metrics.BinaryAccuracy(name="accuracy")]
53 |
54 | early_stopping = tf.keras.callbacks.EarlyStopping(
55 | monitor="val_loss", patience=5, restore_best_weights=True
56 | )
57 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
58 |
59 | classifier = model.create_binary_classifier(tft_output, hyperparams)
60 | if base_model_dir:
61 | try:
62 | classifier = keras.models.load_model(base_model_dir)
63 | except Exception:
64 | pass
65 |
66 | classifier.compile(optimizer=optimizer, loss=loss, metrics=metrics)
67 |
68 | logging.info("Model training started...")
69 | classifier.fit(
70 | train_dataset,
71 | epochs=hyperparams["num_epochs"],
72 | validation_data=eval_dataset,
73 | callbacks=[early_stopping, tensorboard_callback],
74 | )
75 | logging.info("Model training completed.")
76 |
77 | return classifier
78 |
79 |
80 | def evaluate(model, data_dir, raw_schema_location, tft_output_dir, hyperparams):
81 | logging.info(f"Loading raw schema from {raw_schema_location}")
82 |
83 | logging.info(f"Loading tft output from {tft_output_dir}")
84 | tft_output = tft.TFTransformOutput(tft_output_dir)
85 | transformed_feature_spec = tft_output.transformed_feature_spec()
86 |
87 | logging.info("Model evaluation started...")
88 | eval_dataset = data.get_dataset(
89 | data_dir,
90 | transformed_feature_spec,
91 | hyperparams["batch_size"],
92 | )
93 |
94 | evaluation_metrics = model.evaluate(eval_dataset)
95 | logging.info("Model evaluation completed.")
96 |
97 | return evaluation_metrics
98 |
--------------------------------------------------------------------------------
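
A hedged local-run sketch of the train/evaluate pair above; every path is a placeholder for artifacts produced earlier by the preprocessing pipeline:

```python
# Sketch: train for one epoch on local transform artifacts, then evaluate.
from src.model_training import defaults, trainer

hyperparams = defaults.update_hyperparams({"num_epochs": 1})

classifier = trainer.train(
    train_data_dir="transformed_data/train/data-*.gz",  # placeholder
    eval_data_dir="transformed_data/eval/data-*.gz",    # placeholder
    tft_output_dir="transform_artifacts",               # placeholder
    hyperparams=hyperparams,
    log_dir="logs",                                     # placeholder
)

val_loss, val_accuracy = trainer.evaluate(
    model=classifier,
    data_dir="transformed_data/eval/data-*.gz",
    raw_schema_location="src/raw_schema/schema.pbtxt",
    tft_output_dir="transform_artifacts",
    hyperparams=hyperparams,
)
print(val_loss, val_accuracy)
```
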
/src/pipeline_triggering/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/pipeline_triggering/__init__.py
--------------------------------------------------------------------------------
/src/pipeline_triggering/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Cloud Function to be triggered by Pub/Sub."""
15 |
16 | import os
17 | import json
18 | import logging
19 | from kfp.v2.google.client import AIPlatformClient
20 | from google.cloud import storage
21 | import base64
22 |
23 |
24 | def trigger_pipeline(event, context):
25 |
26 | project = os.getenv("PROJECT")
27 | region = os.getenv("REGION")
28 | gcs_pipeline_file_location = os.getenv("GCS_PIPELINE_FILE_LOCATION")
29 |
30 | if not project:
31 | raise ValueError("Environment variable PROJECT is not set.")
32 | if not region:
33 | raise ValueError("Environment variable REGION is not set.")
34 | if not gcs_pipeline_file_location:
35 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.")
36 |
37 | storage_client = storage.Client()
38 |
39 | if not gcs_pipeline_file_location:
40 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.")
41 |
42 | path_parts = gcs_pipeline_file_location.replace("gs://", "").split("/")
43 | bucket_name = path_parts[0]
44 | blob_name = "/".join(path_parts[1:])
45 |
46 | bucket = storage_client.bucket(bucket_name)
47 | blob = storage.Blob(bucket=bucket, name=blob_name)
48 |
49 | if not blob.exists(storage_client):
50 | raise ValueError(f"{gcs_pipeline_file_location} does not exist.")
51 |
52 | data = base64.b64decode(event["data"]).decode("utf-8")
53 | logging.info(f"Event data: {data}")
54 |
55 | parameter_values = json.loads(data)
56 |
57 | api_client = AIPlatformClient(project_id=project, region=region)
58 |
59 | response = api_client.create_run_from_job_spec(
60 | job_spec_path=gcs_pipeline_file_location, parameter_values=parameter_values
61 | )
62 |
63 | logging.info(response)
64 |
--------------------------------------------------------------------------------
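
The Cloud Function above expects the Pub/Sub message body to be a JSON object of pipeline parameter values, delivered base64-encoded in event["data"]. A hedged sketch of publishing such a message, assuming google-cloud-pubsub is installed and using placeholder project, topic, and parameter names:

```python
# Sketch: publish a message that trigger_pipeline can decode into parameter_values.
import json

from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
topic_path = publisher.topic_path("my-project", "pipeline-trigger-topic")  # placeholders

parameter_values = {"num_epochs": 10, "learning_rate": 0.0001}  # illustrative only

# Pub/Sub base64-encodes the bytes into event["data"]; the function decodes
# and json.loads them back into parameter_values.
future = publisher.publish(topic_path, data=json.dumps(parameter_values).encode("utf-8"))
print(future.result())  # message id
```
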
/src/pipeline_triggering/requirements.txt:
--------------------------------------------------------------------------------
1 | kfp==1.6.2
2 | google-cloud-aiplatform
3 | google-cloud-storage
--------------------------------------------------------------------------------
/src/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/preprocessing/__init__.py
--------------------------------------------------------------------------------
/src/preprocessing/etl.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Data preprocessing pipelines."""
15 |
16 | import os
17 |
18 | import tensorflow_transform as tft
19 | import tensorflow_data_validation as tfdv
20 | import apache_beam as beam
21 | from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
22 | import tensorflow_transform.beam as tft_beam
23 | from tensorflow_transform.tf_metadata import dataset_metadata
24 | from tensorflow_transform.tf_metadata import schema_utils
25 |
26 |
27 | from src.preprocessing import transformations
28 |
29 | RAW_SCHEMA_LOCATION = "src/raw_schema/schema.pbtxt"
30 |
31 |
32 | def parse_bq_record(bq_record):
33 | output = {}
34 | for key in bq_record:
35 | output[key] = [bq_record[key]]
36 | return output
37 |
38 |
39 | def split_dataset(bq_row, num_partitions, ratio):
40 | import json
41 |
42 | assert num_partitions == len(ratio)
43 | bucket = sum(map(ord, json.dumps(bq_row))) % sum(ratio)
44 | total = 0
45 | for i, part in enumerate(ratio):
46 | total += part
47 | if bucket < total:
48 | return i
49 | return len(ratio) - 1
50 |
51 |
52 | def run_transform_pipeline(args):
53 |
54 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args)
55 |
56 | raw_data_query = args["raw_data_query"]
57 | write_raw_data = args["write_raw_data"]
58 | exported_data_prefix = args["exported_data_prefix"]
59 | transformed_data_prefix = args["transformed_data_prefix"]
60 | transform_artifact_dir = args["transform_artifact_dir"]
61 | temporary_dir = args["temporary_dir"]
62 | gcs_location = args["gcs_location"]
63 | project = args["project"]
64 |
65 | source_raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION)
66 | raw_feature_spec = schema_utils.schema_as_feature_spec(
67 | source_raw_schema
68 | ).feature_spec
69 |
70 | raw_metadata = dataset_metadata.DatasetMetadata(
71 | schema_utils.schema_from_feature_spec(raw_feature_spec)
72 | )
73 |
74 | with beam.Pipeline(options=pipeline_options) as pipeline:
75 | with tft_beam.Context(temporary_dir):
76 |
77 | # Read raw BigQuery data.
78 | raw_train_data, raw_eval_data = (
79 | pipeline
80 | | "Read Raw Data"
81 | >> beam.io.ReadFromBigQuery(
82 | query=raw_data_query,
83 | project=project,
84 | use_standard_sql=True,
85 | gcs_location=gcs_location,
86 | )
87 | | "Parse Data" >> beam.Map(parse_bq_record)
88 | | "Split" >> beam.Partition(split_dataset, 2, ratio=[8, 2])
89 | )
90 |
91 | # Create a train_dataset from the data and schema.
92 | raw_train_dataset = (raw_train_data, raw_metadata)
93 |
94 | # Analyze and transform raw_train_dataset to produce transformed_train_dataset and transform_fn.
95 | transformed_train_dataset, transform_fn = (
96 | raw_train_dataset
97 | | "Analyze & Transform"
98 | >> tft_beam.AnalyzeAndTransformDataset(transformations.preprocessing_fn)
99 | )
100 |
101 | # Get data and schema separately from the transformed_dataset.
102 | transformed_train_data, transformed_metadata = transformed_train_dataset
103 |
104 | # Write transformed train data.
105 | _ = (
106 | transformed_train_data
107 | | "Write Transformed Train Data"
108 | >> beam.io.tfrecordio.WriteToTFRecord(
109 | file_path_prefix=os.path.join(
110 | transformed_data_prefix, "train/data"
111 | ),
112 | file_name_suffix=".gz",
113 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema),
114 | )
115 | )
116 |
117 | # Create a eval_dataset from the data and schema.
118 | raw_eval_dataset = (raw_eval_data, raw_metadata)
119 |
120 | # Transform raw_eval_dataset to produce transformed_eval_dataset using transform_fn.
121 | transformed_eval_dataset = (
122 | raw_eval_dataset,
123 | transform_fn,
124 | ) | "Transform" >> tft_beam.TransformDataset()
125 |
126 | # Get data from the transformed_eval_dataset.
127 | transformed_eval_data, _ = transformed_eval_dataset
128 |
129 | # Write transformed eval data.
130 | _ = (
131 | transformed_eval_data
132 | | "Write Transformed Eval Data"
133 | >> beam.io.tfrecordio.WriteToTFRecord(
134 | file_path_prefix=os.path.join(transformed_data_prefix, "eval/data"),
135 | file_name_suffix=".gz",
136 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema),
137 | )
138 | )
139 |
140 | # Write transform_fn.
141 | _ = transform_fn | "Write Transform Artifacts" >> tft_beam.WriteTransformFn(
142 | transform_artifact_dir
143 | )
144 |
145 | if write_raw_data:
146 | # Write raw eval data.
147 | _ = (
148 | raw_eval_data
149 | | "Write Raw Eval Data"
150 | >> beam.io.tfrecordio.WriteToTFRecord(
151 | file_path_prefix=os.path.join(exported_data_prefix, "data"),
152 | file_name_suffix=".tfrecord",
153 | coder=tft.coders.ExampleProtoCoder(raw_metadata.schema),
154 | )
155 | )
156 |
157 |
158 | def convert_to_jsonl(bq_record):
159 | import json
160 |
161 | output = {}
162 | for key in bq_record:
163 | output[key] = [bq_record[key]]
164 | return json.dumps(output)
165 |
166 |
167 | def run_extract_pipeline(args):
168 |
169 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args)
170 |
171 | sql_query = args["sql_query"]
172 | exported_data_prefix = args["exported_data_prefix"]
173 | temporary_dir = args["temporary_dir"]
174 | gcs_location = args["gcs_location"]
175 | project = args["project"]
176 |
177 | with beam.Pipeline(options=pipeline_options) as pipeline:
178 | with tft_beam.Context(temporary_dir):
179 |
180 | # Read BigQuery data.
181 | raw_data = (
182 | pipeline
183 | | "Read Data"
184 | >> beam.io.ReadFromBigQuery(
185 | query=sql_query,
186 | project=project,
187 | use_standard_sql=True,
188 | gcs_location=gcs_location,
189 | )
190 | | "Parse Data" >> beam.Map(convert_to_jsonl)
191 | )
192 |
193 | # Write raw data to GCS as JSONL files.
194 | _ = raw_data | "Write Data" >> beam.io.WriteToText(
195 | file_path_prefix=exported_data_prefix, file_name_suffix=".jsonl"
196 | )
197 |
198 |
199 | def parse_prediction_results(jsonl):
200 | import uuid
201 | import json
202 |
203 | prediction_results = json.loads(jsonl)["prediction"]
204 | prediction_id = str(uuid.uuid4())
205 | scores = prediction_results["scores"]
206 | classes = prediction_results["classes"]
207 |
208 | return {"prediction_id": prediction_id, "scores": scores, "classes": classes}
209 |
210 |
211 | def create_datastore_entity(prediction_response, kind):
212 | from apache_beam.io.gcp.datastore.v1new.types import Entity
213 | from apache_beam.io.gcp.datastore.v1new.types import Key
214 |
215 | user_id = prediction_response.pop("prediction_id")
216 | key = Key([kind, user_id])
217 | prediction_entity = Entity(key)
218 | prediction_entity.set_properties(prediction_response)
219 | return prediction_entity
220 |
221 |
222 | def run_store_predictions_pipeline(args):
223 |
224 | project = args["project"]
225 | datastore_kind = args["datastore_kind"]
226 | prediction_results_uri = args["prediction_results_uri"]
227 |
228 | pipeline_options = beam.options.pipeline_options.PipelineOptions(args)
229 | with beam.Pipeline(options=pipeline_options) as pipeline:
230 | _ = (
231 | pipeline
232 | | "ReadFromJSONL" >> beam.io.ReadFromText(prediction_results_uri)
233 | | "ParsePredictionResults" >> beam.Map(parse_prediction_results)
234 | | "ConvertToDatastoreEntity"
235 | >> beam.Map(create_datastore_entity, datastore_kind)
236 | | "WriteToDatastore" >> WriteToDatastore(project=project)
237 | )
238 |
--------------------------------------------------------------------------------
/src/preprocessing/transformations.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TensorFlow Transform preprocessing function."""
15 |
16 | import tensorflow as tf
17 | import tensorflow_transform as tft
18 |
19 | from src.common import features
20 |
21 |
22 | def preprocessing_fn(inputs):
23 | """tf.transform's callback function for preprocessing inputs.
24 | Args:
25 | inputs: map from feature keys to raw not-yet-transformed features.
26 | Returns:
27 | Map from string feature key to transformed feature operations.
28 | """
29 |
30 | outputs = {}
31 |
32 | for key in features.FEATURE_NAMES:
33 | if key in features.NUMERICAL_FEATURE_NAMES:
34 | outputs[features.transformed_name(key)] = tft.scale_to_z_score(inputs[key])
35 |
36 | elif key in features.categorical_feature_names():
37 | outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
38 | inputs[key],
39 | num_oov_buckets=1,
40 | vocab_filename=key,
41 | )
42 |
43 | outputs[features.TARGET_FEATURE_NAME] = inputs[features.TARGET_FEATURE_NAME]
44 |
45 | for key in outputs:
46 | outputs[key] = tf.squeeze(outputs[key], -1)
47 |
48 | return outputs
49 |
--------------------------------------------------------------------------------
/src/raw_schema/schema.pbtxt:
--------------------------------------------------------------------------------
1 | feature {
2 | name: "trip_month"
3 | type: INT
4 | presence {
5 | min_fraction: 1.0
6 | min_count: 1
7 | }
8 | shape {
9 | dim {
10 | size: 1
11 | }
12 | }
13 | }
14 | feature {
15 | name: "trip_day"
16 | type: INT
17 | presence {
18 | min_fraction: 1.0
19 | min_count: 1
20 | }
21 | shape {
22 | dim {
23 | size: 1
24 | }
25 | }
26 | }
27 | feature {
28 | name: "trip_day_of_week"
29 | type: INT
30 | presence {
31 | min_fraction: 1.0
32 | min_count: 1
33 | }
34 | shape {
35 | dim {
36 | size: 1
37 | }
38 | }
39 | }
40 | feature {
41 | name: "trip_hour"
42 | type: INT
43 | presence {
44 | min_fraction: 1.0
45 | min_count: 1
46 | }
47 | shape {
48 | dim {
49 | size: 1
50 | }
51 | }
52 | }
53 | feature {
54 | name: "trip_seconds"
55 | type: INT
56 | presence {
57 | min_fraction: 1.0
58 | min_count: 1
59 | }
60 | shape {
61 | dim {
62 | size: 1
63 | }
64 | }
65 | }
66 | feature {
67 | name: "trip_miles"
68 | type: FLOAT
69 | presence {
70 | min_fraction: 1.0
71 | min_count: 1
72 | }
73 | shape {
74 | dim {
75 | size: 1
76 | }
77 | }
78 | }
79 | feature {
80 | name: "payment_type"
81 | type: BYTES
82 | domain: "payment_type"
83 | presence {
84 | min_fraction: 1.0
85 | min_count: 1
86 | }
87 | shape {
88 | dim {
89 | size: 1
90 | }
91 | }
92 | }
93 | feature {
94 | name: "pickup_grid"
95 | type: BYTES
96 | domain: "pickup_grid"
97 | presence {
98 | min_fraction: 1.0
99 | min_count: 1
100 | }
101 | shape {
102 | dim {
103 | size: 1
104 | }
105 | }
106 | }
107 | feature {
108 | name: "dropoff_grid"
109 | type: BYTES
110 | domain: "dropoff_grid"
111 | presence {
112 | min_fraction: 1.0
113 | min_count: 1
114 | }
115 | shape {
116 | dim {
117 | size: 1
118 | }
119 | }
120 | }
121 | feature {
122 | name: "euclidean"
123 | type: FLOAT
124 | presence {
125 | min_fraction: 1.0
126 | min_count: 1
127 | }
128 | shape {
129 | dim {
130 | size: 1
131 | }
132 | }
133 | }
134 | feature {
135 | name: "loc_cross"
136 | type: BYTES
137 | presence {
138 | min_fraction: 1.0
139 | min_count: 1
140 | }
141 | shape {
142 | dim {
143 | size: 1
144 | }
145 | }
146 | }
147 | feature {
148 | name: "tip_bin"
149 | type: INT
150 | bool_domain {
151 | }
152 | presence {
153 | min_fraction: 1.0
154 | min_count: 1
155 | }
156 | shape {
157 | dim {
158 | size: 1
159 | }
160 | }
161 | }
162 | string_domain {
163 | name: "payment_type"
164 | value: "Cash"
165 | value: "Credit Card"
166 | value: "Dispute"
167 | value: "Mobile"
168 | value: "No Charge"
169 | value: "Prcard"
170 | value: "Prepaid"
171 | value: "Unknown"
172 | }
173 | string_domain {
174 | name: "pickup_grid"
175 | value: "POINT(-87.5 41.7)"
176 | value: "POINT(-87.6 41.7)"
177 | value: "POINT(-87.6 41.8)"
178 | value: "POINT(-87.6 41.9)"
179 | value: "POINT(-87.6 42)"
180 | value: "POINT(-87.7 41.7)"
181 | value: "POINT(-87.7 41.8)"
182 | value: "POINT(-87.7 41.9)"
183 | value: "POINT(-87.7 42)"
184 | value: "POINT(-87.8 41.8)"
185 | value: "POINT(-87.8 41.9)"
186 | value: "POINT(-87.8 42)"
187 | value: "POINT(-87.9 42)"
188 | }
189 | string_domain {
190 | name: "dropoff_grid"
191 | value: "POINT(-87.5 41.7)"
192 | value: "POINT(-87.6 41.7)"
193 | value: "POINT(-87.6 41.8)"
194 | value: "POINT(-87.6 41.9)"
195 | value: "POINT(-87.6 42)"
196 | value: "POINT(-87.7 41.7)"
197 | value: "POINT(-87.7 41.8)"
198 | value: "POINT(-87.7 41.9)"
199 | value: "POINT(-87.7 42)"
200 | value: "POINT(-87.8 41.8)"
201 | value: "POINT(-87.8 41.9)"
202 | value: "POINT(-87.8 42)"
203 | value: "POINT(-87.9 42)"
204 | }
205 |
--------------------------------------------------------------------------------
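
The schema above is consumed by loading it as text and converting it to a feature spec, as both exporter.py and etl.py do; a minimal sketch:

```python
# Sketch: load the raw schema and derive the raw feature spec.
import tensorflow_data_validation as tfdv
from tensorflow_transform.tf_metadata import schema_utils

raw_schema = tfdv.load_schema_text("src/raw_schema/schema.pbtxt")
raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec

print(raw_feature_spec["payment_type"])  # fixed-length string feature of shape [1]
print(raw_feature_spec["tip_bin"])       # fixed-length int64 feature of shape [1]
```
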
/src/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/tests/__init__.py
--------------------------------------------------------------------------------
/src/tests/datasource_utils_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test utilities for generating BigQuery data querying scripts."""
15 |
16 | import sys
17 | import os
18 | import logging
19 | from google.cloud import bigquery
20 |
21 | from src.common import datasource_utils
22 |
23 | root = logging.getLogger()
24 | root.setLevel(logging.INFO)
25 | handler = logging.StreamHandler(sys.stdout)
26 | handler.setLevel(logging.INFO)
27 | root.addHandler(handler)
28 |
29 | LIMIT = 100
30 |
31 | TARGET_COLUMN = "tip_bin"
32 |
33 | EXPECTED_TRAINING_COLUMNS = [
34 | "trip_month",
35 | "trip_day",
36 | "trip_day_of_week",
37 | "trip_hour",
38 | "trip_seconds",
39 | "trip_miles",
40 | "payment_type",
41 | "pickup_grid",
42 | "dropoff_grid",
43 | "euclidean",
44 | "loc_cross",
45 | "tip_bin",
46 | ]
47 |
48 |
49 | def test_training_query():
50 |
51 | project = os.getenv("PROJECT")
52 | location = os.getenv("BQ_LOCATION")
53 | bq_dataset_name = os.getenv("BQ_DATASET_NAME")
54 | bq_table_name = os.getenv("BQ_TABLE_NAME")
55 |
56 | assert project, "Environment variable PROJECT is None!"
57 | assert location, "Environment variable BQ_LOCATION is None!"
58 | assert bq_dataset_name, "Environment variable BQ_DATASET_NAME is None!"
59 | assert bq_table_name, "Environment variable BQ_TABLE_NAME is None!"
60 |
61 | logging.info(f"BigQuery Source: {project}.{bq_dataset_name}.{bq_table_name}")
62 |
63 | query = datasource_utils._get_source_query(
64 | bq_dataset_name=bq_dataset_name,
65 | bq_table_name=bq_table_name,
66 | ml_use="UNASSIGNED",
67 | limit=LIMIT,
68 | )
69 |
70 | bq_client = bigquery.Client(project=project, location=location)
71 | df = bq_client.query(query).to_dataframe()
72 | columns = set(df.columns)
73 | assert columns == set(EXPECTED_TRAINING_COLUMNS)
74 | assert df.shape == (LIMIT, 12)
75 |
76 |
77 | def test_serving_query():
78 |
79 | project = os.getenv("PROJECT")
80 | location = os.getenv("BQ_LOCATION")
81 | bq_dataset_name = os.getenv("BQ_DATASET_NAME")
82 | bq_table_name = os.getenv("BQ_TABLE_NAME")
83 |
84 | assert project, "Environment variable PROJECT is None!"
85 | assert location, "Environment variable BQ_LOCATION is None!"
86 | assert bq_dataset_name, "Environment variable BQ_DATASET_NAME is None!"
87 | assert bq_table_name, "Environment variable BQ_TABLE_NAME is None!"
88 |
89 | logging.info(f"BigQuery Source: {project}.{bq_dataset_name}.{bq_table_name}")
90 |
91 | query = datasource_utils._get_source_query(
92 | bq_dataset_name=bq_dataset_name,
93 | bq_table_name=bq_table_name,
94 | ml_use=None,
95 | limit=LIMIT,
96 | )
97 |
98 | bq_client = bigquery.Client(project=project, location=location)
99 | df = bq_client.query(query).to_dataframe()
100 | columns = set(df.columns)
101 | expected_serving_columns = list(EXPECTED_TRAINING_COLUMNS)
102 | expected_serving_columns.remove(TARGET_COLUMN)
103 | assert columns == set(expected_serving_columns)
104 | assert df.shape == (LIMIT, 11)
105 |
--------------------------------------------------------------------------------
/src/tests/etl_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test data processing."""
15 |
16 | import sys
17 | import os
18 | import logging
19 | import tensorflow_transform as tft
20 | import tensorflow as tf
21 | from tensorflow.io import FixedLenFeature
22 |
23 | from src.preprocessing import etl
24 | from src.common import datasource_utils
25 |
26 | root = logging.getLogger()
27 | root.setLevel(logging.INFO)
28 | handler = logging.StreamHandler(sys.stdout)
29 | handler.setLevel(logging.INFO)
30 | root.addHandler(handler)
31 |
32 | OUTPUT_DIR = "test_etl_output_dir"
33 | ML_USE = "UNASSIGNED"
34 | LIMIT = 100
35 |
36 | EXPECTED_FEATURE_SPEC = {
37 | "dropoff_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
38 | "euclidean_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
39 | "loc_cross_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
40 | "payment_type_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
41 | "pickup_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
42 | "tip_bin": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
43 | "trip_day_of_week_xf": FixedLenFeature(
44 | shape=[], dtype=tf.int64, default_value=None
45 | ),
46 | "trip_day_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
47 | "trip_hour_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
48 | "trip_miles_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
49 | "trip_month_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
50 | "trip_seconds_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
51 | }
52 |
53 |
54 | def test_transform_pipeline():
55 |
56 | project = os.getenv("PROJECT")
57 | region = os.getenv("REGION")
58 | bucket = os.getenv("BUCKET")
59 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME")
60 |
61 | assert project, "Environment variable PROJECT is None!"
62 | assert region, "Environment variable REGION is None!"
63 | assert bucket, "Environment variable BUCKET is None!"
64 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!"
65 |
66 | os.mkdir(OUTPUT_DIR)
67 |
68 | exported_data_dir = os.path.join(OUTPUT_DIR, "exported_data")
69 | transformed_data_dir = os.path.join(OUTPUT_DIR, "transformed_data")
70 | transform_artifacts_dir = os.path.join(OUTPUT_DIR, "transform_artifacts")
71 | temporary_dir = os.path.join(OUTPUT_DIR, "tmp")
72 |
73 | raw_data_query = datasource_utils.get_training_source_query(
74 | project=project,
75 | region=region,
76 | dataset_display_name=dataset_display_name,
77 | ml_use=ML_USE,
78 | limit=LIMIT,
79 | )
80 |
81 | args = {
82 | "runner": "DirectRunner",
83 | "raw_data_query": raw_data_query,
84 | "write_raw_data": False,
85 | "exported_data_prefix": exported_data_dir,
86 | "transformed_data_prefix": transformed_data_dir,
87 | "transform_artifact_dir": transform_artifacts_dir,
88 | "temporary_dir": temporary_dir,
89 | "gcs_location": f"gs://{bucket}/bq_tmp",
90 | "project": project,
91 | }
92 |
93 | logging.info(f"Transform pipeline args: {args}")
94 | etl.run_transform_pipeline(args)
95 | logging.info(f"Transform pipeline finished.")
96 |
97 | tft_output = tft.TFTransformOutput(transform_artifacts_dir)
98 | transform_feature_spec = tft_output.transformed_feature_spec()
99 | assert transform_feature_spec == EXPECTED_FEATURE_SPEC
100 |
--------------------------------------------------------------------------------
/src/tests/model_deployment_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test an uploaded model to Vertex AI."""
15 |
16 | import os
17 | import logging
18 | import tensorflow as tf
19 |
20 | test_instance = {
21 | "dropoff_grid": ["POINT(-87.6 41.9)"],
22 | "euclidean": [2064.2696],
23 | "loc_cross": [""],
24 | "payment_type": ["Credit Card"],
25 | "pickup_grid": ["POINT(-87.6 41.9)"],
26 | "trip_miles": [1.37],
27 | "trip_day": [12],
28 | "trip_hour": [16],
29 | "trip_month": [2],
30 | "trip_day_of_week": [4],
31 | "trip_seconds": [555],
32 | }
33 |
34 | SERVING_DEFAULT_SIGNATURE_NAME = "serving_default"
35 |
36 | from google.cloud import aiplatform as vertex_ai
37 |
38 |
39 | def test_model_artifact():
40 |
41 | feature_types = {
42 | "dropoff_grid": tf.dtypes.string,
43 | "euclidean": tf.dtypes.float32,
44 | "loc_cross": tf.dtypes.string,
45 | "payment_type": tf.dtypes.string,
46 | "pickup_grid": tf.dtypes.string,
47 | "trip_miles": tf.dtypes.float32,
48 | "trip_day": tf.dtypes.int64,
49 | "trip_hour": tf.dtypes.int64,
50 | "trip_month": tf.dtypes.int64,
51 | "trip_day_of_week": tf.dtypes.int64,
52 | "trip_seconds": tf.dtypes.int64,
53 | }
54 |
55 | new_test_instance = dict()
56 | for key in test_instance:
57 | new_test_instance[key] = tf.constant(
58 | [test_instance[key]], dtype=feature_types[key]
59 | )
60 |
61 | print(new_test_instance)
62 |
63 | project = os.getenv("PROJECT")
64 | region = os.getenv("REGION")
65 | model_display_name = os.getenv("MODEL_DISPLAY_NAME")
66 |
67 | assert project, "Environment variable PROJECT is None!"
68 | assert region, "Environment variable REGION is None!"
69 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!"
70 |
71 | vertex_ai.init(project=project, location=region)
72 |
73 | models = vertex_ai.Model.list(
74 | filter=f'display_name={model_display_name}',
75 | order_by="update_time"
76 | )
77 |
78 | assert (
79 | models
80 | ), f"No model with display name {model_display_name} exists!"
81 |
82 | model = models[-1]
83 | artifact_uri = model.gca_resource.artifact_uri
84 | logging.info(f"Model artifact uri:{artifact_uri}")
85 | assert tf.io.gfile.exists(
86 | artifact_uri
87 | ), f"Model artifact uri {artifact_uri} does not exist!"
88 |
89 | saved_model = tf.saved_model.load(artifact_uri)
90 | logging.info("Model loaded successfully.")
91 |
92 | assert (
93 | SERVING_DEFAULT_SIGNATURE_NAME in saved_model.signatures
94 | ), f"{SERVING_DEFAULT_SIGNATURE_NAME} not in model signatures!"
95 |
96 | prediction_fn = saved_model.signatures[SERVING_DEFAULT_SIGNATURE_NAME]
97 | predictions = prediction_fn(**new_test_instance)
98 | logging.info("Model produced predictions.")
99 |
100 | keys = ["classes", "scores"]
101 | for key in keys:
102 | assert key in predictions, f"{key} not in prediction outputs!"
103 |
104 | assert predictions["classes"].shape == (
105 | 1,
106 | 2,
107 | ), f"Invalid output classes shape: {predictions['classes'].shape}!"
108 | assert predictions["scores"].shape == (
109 | 1,
110 | 2,
111 | ), f"Invalid output scores shape: {predictions['scores'].shape}!"
112 | logging.info(f"Prediction output: {predictions}")
113 |
114 |
115 | def test_model_endpoint():
116 |
117 | project = os.getenv("PROJECT")
118 | region = os.getenv("REGION")
119 | model_display_name = os.getenv("MODEL_DISPLAY_NAME")
120 | endpoint_display_name = os.getenv("ENDPOINT_DISPLAY_NAME")
121 |
122 | assert project, "Environment variable PROJECT is None!"
123 | assert region, "Environment variable REGION is None!"
124 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!"
125 | assert endpoint_display_name, "Environment variable ENDPOINT_DISPLAY_NAME is None!"
126 | vertex_ai.init(project=project, location=region)
127 | endpoints = vertex_ai.Endpoint.list(
128 | filter=f'display_name={endpoint_display_name}',
129 | order_by="update_time"
130 | )
131 | assert (
132 | endpoints
133 | ), f"Endpoint with display name {endpoint_display_name} does not exist! in region {region}"
134 |
135 | endpoint = endpoints[-1]
136 | logging.info(f"Calling endpoint: {endpoint}.")
137 |
138 | prediction = endpoint.predict([test_instance]).predictions[0]
139 |
140 | keys = ["classes", "scores"]
141 | for key in keys:
142 | assert key in prediction, f"{key} not in prediction outputs!"
143 |
144 | assert (
145 | len(prediction["classes"]) == 2
146 | ), f"Invalid number of output classes: {len(prediction['classes'])}!"
147 | assert (
148 | len(prediction["scores"]) == 2
149 | ), f"Invalid number output scores: {len(prediction['scores'])}!"
150 |
151 | logging.info(f"Prediction output: {prediction}")
152 |
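Both tests assert that every prediction carries two-element "classes" and "scores" outputs. As an illustration only (this helper is not part of the repository), an endpoint response of that shape can be reduced to a single label:

def top_class(prediction: dict) -> str:
    """Return the class with the highest score from one endpoint prediction."""
    scores = list(prediction["scores"])
    classes = list(prediction["classes"])
    return classes[scores.index(max(scores))]

# Example with the layout asserted above (the values are made up):
print(top_class({"classes": ["0", "1"], "scores": [0.83, 0.17]}))  # -> "0"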
--------------------------------------------------------------------------------
/src/tests/model_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test model functions."""
15 |
16 | import sys
17 | import logging
18 | import tensorflow as tf
19 |
20 | from src.common import features
21 | from src.model_training import model, defaults
22 |
23 | root = logging.getLogger()
24 | root.setLevel(logging.INFO)
25 | handler = logging.StreamHandler(sys.stdout)
26 | handler.setLevel(logging.INFO)
27 | root.addHandler(handler)
28 |
29 | EXPECTED_HYPERPARAMS_KEYS = [
30 | "hidden_units",
31 | "learning_rate",
32 | "batch_size",
33 | "num_epochs",
34 | ]
35 |
36 |
37 | def test_hyperparams_defaults():
38 | hyperparams = {"hidden_units": [64, 32]}
39 |
40 | hyperparams = defaults.update_hyperparams(hyperparams)
41 | assert set(hyperparams.keys()) == set(EXPECTED_HYPERPARAMS_KEYS)
42 |
43 |
44 | def test_create_binary_classifier():
45 |
46 | hyperparams = defaults.update_hyperparams(dict())
47 |
48 | model_inputs = {
49 | "dropoff_grid_xf": tf.convert_to_tensor([0, 0, 0]),
50 | "euclidean_xf": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]),
51 | "loc_cross_xf": tf.convert_to_tensor([0, 0, 0]),
52 | "payment_type_xf": tf.convert_to_tensor([1, 0, 0]),
53 | "pickup_grid_xf": tf.convert_to_tensor([0, 0, 0]),
54 | "trip_day_of_week_xf": tf.convert_to_tensor([5, 4, 4]),
55 | "trip_day_xf": tf.convert_to_tensor([26, 24, 1]),
56 | "trip_hour_xf": tf.convert_to_tensor([0, 4, 2]),
57 | "trip_miles_xf": tf.convert_to_tensor([5.9717827, -0.7121308, -0.7601589]),
58 | "trip_month_xf": tf.convert_to_tensor([4, 3, 4]),
59 | "trip_seconds_xf": tf.convert_to_tensor([4.9029775, -0.34146854, -0.34479955]),
60 | }
61 |
62 | feature_vocab_sizes = {
63 | feature_name: 100 for feature_name in features.categorical_feature_names()
64 | }
65 | classifier = model._create_binary_classifier(feature_vocab_sizes, hyperparams)
66 | model_outputs = classifier(model_inputs) # .numpy()
67 | assert model_outputs.shape == (3, 1)
68 | assert model_outputs.dtype == "float32"
69 |
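For reference, defaults.update_hyperparams merges a partial dictionary with the module defaults, which is why the first test only checks the resulting key set. A hedged usage sketch (the override value below is arbitrary):

from src.model_training import defaults

# Start from a partial override; the returned dict should contain all of
# hidden_units, learning_rate, batch_size and num_epochs.
hyperparams = defaults.update_hyperparams({"learning_rate": 0.01})
print(sorted(hyperparams.keys()))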
--------------------------------------------------------------------------------
/src/tests/pipeline_deployment_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test training pipeline using local runner."""
15 |
16 | import sys
17 | import os
18 | from tfx.orchestration.local.local_dag_runner import LocalDagRunner
19 | import tensorflow as tf
20 | from ml_metadata.proto import metadata_store_pb2
21 | import logging
22 |
23 | from src.tfx_pipelines import config
24 | from src.tfx_pipelines import training_pipeline
25 |
26 | root = logging.getLogger()
27 | root.setLevel(logging.INFO)
28 | handler = logging.StreamHandler(sys.stdout)
29 | handler.setLevel(logging.INFO)
30 | root.addHandler(handler)
31 |
32 | MLMD_SQLLITE = "mlmd.sqllite"
33 | NUM_EPOCHS = 1
34 | BATCH_SIZE = 512
35 | LEARNING_RATE = 0.001
36 | HIDDEN_UNITS = "128,128"
37 |
38 |
39 | def test_e2e_pipeline():
40 |
41 | project = os.getenv("PROJECT")
42 | region = os.getenv("REGION")
43 | model_display_name = os.getenv("MODEL_DISPLAY_NAME")
44 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME")
45 | gcs_location = os.getenv("GCS_LOCATION")
46 | model_registry = os.getenv("MODEL_REGISTRY_URI")
47 | upload_model = os.getenv("UPLOAD_MODEL")
48 |
49 | assert project, "Environment variable PROJECT is None!"
50 | assert region, "Environment variable REGION is None!"
51 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!"
52 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!"
53 | assert gcs_location, "Environment variable GCS_LOCATION is None!"
54 | assert model_registry, "Environment variable MODEL_REGISTRY_URI is None!"
55 |
56 | logging.info(f"upload_model: {upload_model}")
57 | if tf.io.gfile.exists(gcs_location):
58 | tf.io.gfile.rmtree(gcs_location)
59 | logging.info(f"Pipeline e2e test artifacts stored in: {gcs_location}")
60 |
61 | if tf.io.gfile.exists(MLMD_SQLLITE):
62 | tf.io.gfile.remove(MLMD_SQLLITE)
63 |
64 | metadata_connection_config = metadata_store_pb2.ConnectionConfig()
65 | metadata_connection_config.sqlite.filename_uri = MLMD_SQLLITE
66 | metadata_connection_config.sqlite.connection_mode = 3
67 | logging.info("ML metadata store is ready.")
68 |
69 | pipeline_root = os.path.join(
70 | config.ARTIFACT_STORE_URI,
71 | config.PIPELINE_NAME,
72 | )
73 |
74 | runner = LocalDagRunner()
75 |
76 | pipeline = training_pipeline.create_pipeline(
77 | pipeline_root=pipeline_root,
78 | num_epochs=NUM_EPOCHS,
79 | batch_size=BATCH_SIZE,
80 | learning_rate=LEARNING_RATE,
81 | hidden_units=HIDDEN_UNITS,
82 | metadata_connection_config=metadata_connection_config,
83 | )
84 |
85 | runner.run(pipeline)
86 |
87 | logging.info(f"Model output: {os.path.join(model_registry, model_display_name)}")
88 | assert tf.io.gfile.exists(os.path.join(model_registry, model_display_name))
89 |
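After the local run, the SQLite-backed ML Metadata store configured above can be opened to inspect what the pipeline recorded. A small sketch using the standard ml-metadata client (illustrative only, not part of the test):

from ml_metadata import metadata_store
from ml_metadata.proto import metadata_store_pb2

# Point the client at the same SQLite file the test configured.
connection_config = metadata_store_pb2.ConnectionConfig()
connection_config.sqlite.filename_uri = "mlmd.sqllite"
store = metadata_store.MetadataStore(connection_config)

# List the URIs of all artifacts produced by the local pipeline run.
print([artifact.uri for artifact in store.get_artifacts()])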
--------------------------------------------------------------------------------
/src/tfx_pipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/mlops-with-vertex-ai/11a2f341626a036a5ebbc030eadc10dfd2dfa5e0/src/tfx_pipelines/__init__.py
--------------------------------------------------------------------------------
/src/tfx_pipelines/components.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX Custom Python Components."""
15 |
16 |
17 | import sys
18 | import os
19 | import json
20 | import logging
21 | from datetime import datetime
22 | import tensorflow as tf
23 |
24 | from tfx.types import artifact_utils
25 | from tfx.utils import io_utils
26 | from tfx.components.util import model_utils
27 | from tfx.dsl.component.experimental.decorators import component
28 | from tfx.dsl.component.experimental.annotations import (
29 | InputArtifact,
30 | OutputArtifact,
31 | Parameter,
32 | )
33 | from tfx.types.standard_artifacts import HyperParameters, ModelBlessing
34 | from tfx.types.experimental.simple_artifacts import File as UploadedModel
35 | from tfx.types.experimental.simple_artifacts import Dataset
36 |
37 | from google.cloud import aiplatform as vertex_ai
38 |
39 | SCRIPT_DIR = os.path.dirname(
40 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
41 | )
42 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, "..")))
43 |
44 | from src.preprocessing import etl
45 |
46 |
47 | HYPERPARAM_FILENAME = "hyperparameters.json"
48 | SERVING_DATA_PREFIX = "serving-data-"
49 | PREDICTION_RESULTS_PREFIX = "prediction.results-*"
50 |
51 |
52 | @component
53 | def hyperparameters_gen(
54 | num_epochs: Parameter[int],
55 | batch_size: Parameter[int],
56 | learning_rate: Parameter[float],
57 | hidden_units: Parameter[str],
58 | hyperparameters: OutputArtifact[HyperParameters],
59 | ):
60 |
61 | hp_dict = dict()
62 | hp_dict["num_epochs"] = num_epochs
63 | hp_dict["batch_size"] = batch_size
64 | hp_dict["learning_rate"] = learning_rate
65 | hp_dict["hidden_units"] = [int(units) for units in hidden_units.split(",")]
66 | logging.info(f"Hyperparameters: {hp_dict}")
67 |
68 | hyperparams_uri = os.path.join(
69 | artifact_utils.get_single_uri([hyperparameters]), HYPERPARAM_FILENAME
70 | )
71 | io_utils.write_string_file(hyperparams_uri, json.dumps(hp_dict))
72 | logging.info(f"Hyperparameters are written to: {hyperparams_uri}")
73 |
74 |
75 | @component
76 | def vertex_model_uploader(
77 | project: Parameter[str],
78 | region: Parameter[str],
79 | model_display_name: Parameter[str],
80 | pushed_model_location: Parameter[str],
81 | serving_image_uri: Parameter[str],
82 | model_blessing: InputArtifact[ModelBlessing],
83 | uploaded_model: OutputArtifact[UploadedModel],
84 | explanation_config: Parameter[str]="",
85 | labels: Parameter[str]="",
86 | ):
87 |
88 | vertex_ai.init(project=project, location=region)
89 |
90 | blessing = artifact_utils.get_single_instance([model_blessing])
91 | if not model_utils.is_model_blessed(blessing):
92 | logging.info(f"Model is not uploaded to Vertex AI because it was not blessed by the evaluator.")
93 | uploaded_model.set_int_custom_property("uploaded", 0)
94 | return
95 |
96 | pushed_model_dir = os.path.join(
97 | pushed_model_location, tf.io.gfile.listdir(pushed_model_location)[-1]
98 | )
99 |
100 | logging.info(f"Model registry location: {pushed_model_dir}")
101 |
102 | try:
103 | explanation_config = json.loads(explanation_config)
104 | explanation_metadata = vertex_ai.explain.ExplanationMetadata(
105 | inputs=explanation_config["inputs"],
106 | outputs=explanation_config["outputs"],
107 | )
108 | explanation_parameters = vertex_ai.explain.ExplanationParameters(
109 | explanation_config["params"]
110 | )
111 | except Exception:
112 | explanation_metadata = None
113 | explanation_parameters = None
114 |
115 | try:
116 | labels = json.loads(labels)
117 | except Exception:
118 | labels = None
119 |
120 | vertex_model = vertex_ai.Model.upload(
121 | display_name=model_display_name,
122 | artifact_uri=pushed_model_dir,
123 | serving_container_image_uri=serving_image_uri,
124 | parameters_schema_uri=None,
125 | instance_schema_uri=None,
126 | explanation_metadata=explanation_metadata,
127 | explanation_parameters=explanation_parameters,
128 | labels=labels
129 | )
130 |
131 | model_uri = vertex_model.gca_resource.name
132 | logging.info(f"Model uploaded to Vertex AI: {model_uri}")
133 | uploaded_model.set_string_custom_property("model_uri", model_uri)
134 | uploaded_model.set_int_custom_property("uploaded", 1)
135 |
136 |
137 | @component
138 | def bigquery_data_gen(
139 | sql_query: Parameter[str],
140 | output_data_format: Parameter[str],
141 | beam_args: Parameter[str],
142 | serving_dataset: OutputArtifact[Dataset],
143 | ):
144 |
145 | output_dir = os.path.join(
146 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX
147 | )
148 |
149 | pipeline_args = json.loads(beam_args)
150 | pipeline_args["sql_query"] = sql_query
151 | pipeline_args["exported_data_prefix"] = output_dir
152 | pipeline_args["output_data_format"] = output_data_format
153 |
154 | logging.info("Data extraction started. Source query:")
155 | logging.info("{sql_query}")
156 | etl.run_extract_pipeline(pipeline_args)
157 | logging.info("Data extraction completed.")
158 |
159 |
160 | @component
161 | def vertex_batch_prediction(
162 | project: Parameter[str],
163 | region: Parameter[str],
164 | model_display_name: Parameter[str],
165 | instances_format: Parameter[str],
166 | predictions_format: Parameter[str],
167 | job_resources: Parameter[str],
168 | serving_dataset: InputArtifact[Dataset],
169 | prediction_results: OutputArtifact[Dataset],
170 | ):
171 |
172 | job_resources = json.loads(job_resources)
173 | gcs_source_pattern = (
174 | os.path.join(
175 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX
176 | )
177 | + "*.jsonl"
178 | )
179 | gcs_destination_prefix = artifact_utils.get_single_uri([prediction_results])
180 | job_name = f"extract-{model_display_name}-serving-{datetime.now().strftime('%Y%m%d%H%M%S')}"
181 |
182 | vertex_ai.init(project=project, location=region)
183 |
184 | logging.info("Submitting Vertex AI batch prediction job...")
185 | batch_prediction_job = vertex_ai.BatchPredictionJob.create(
186 | job_display_name=job_name,
187 | model_name=model_display_name,
188 | gcs_source=gcs_source_pattern,
189 | gcs_destination_prefix=gcs_destination_prefix,
190 | instances_format=instances_format,
191 | predictions_format=predictions_format,
192 | sync=True,
193 | **job_resources,
194 | )
195 | logging.info("Batch prediction job completed.")
196 |
197 | prediction_results.set_string_custom_property(
198 | "batch_prediction_job", batch_prediction_job.gca_resource.name
199 | )
200 |
201 |
202 | @component
203 | def datastore_prediction_writer(
204 | datastore_kind: Parameter[str],
205 | predictions_format: Parameter[str],
206 | beam_args: Parameter[str],
207 | prediction_results: InputArtifact[Dataset],
208 | ):
209 |
210 | prediction_results_dir = os.path.join(
211 | artifact_utils.get_single_uri([prediction_results])
212 | )
213 | prediction_results_dir = os.path.join(
214 | prediction_results_dir, tf.io.gfile.listdir(prediction_results_dir)[0]
215 | )
216 | prediction_results_uri = os.path.join(
217 | prediction_results_dir, PREDICTION_RESULTS_PREFIX
218 | )
219 |
220 | pipeline_args = json.loads(beam_args)
221 | pipeline_args["prediction_results_uri"] = prediction_results_uri
222 | pipeline_args["datastore_kind"] = datastore_kind
223 | pipeline_args["predictions_format"] = predictions_format
224 |
225 | logging.info(f"Storing predictions to Datastore kind: {datastore_kind}")
226 | etl.run_store_predictions_pipeline(pipeline_args)
227 | logging.info("Predictions are stored.")
228 |
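Note that vertex_model_uploader receives explanation_config and labels as JSON strings and parses them with json.loads, falling back to None when parsing fails (for example, when the default empty string is passed). A hedged sketch of building the labels payload, mirroring the keys used later in training_pipeline.py (the values here are placeholders, not the repository's actual configuration):

import json

# Serialized labels in the form the component expects.
labels = json.dumps(
    {
        "dataset_name": "chicago-taxi-tips",
        "pipeline_name": "chicago-taxi-tips-classifier-train-pipeline",
    }
)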
--------------------------------------------------------------------------------
/src/tfx_pipelines/config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX pipeline configurations."""
15 |
16 | import os
17 | from tfx import v1 as tfx
18 |
19 | PROJECT = os.getenv("PROJECT", "")
20 | REGION = os.getenv("REGION", "")
21 | GCS_LOCATION = os.getenv("GCS_LOCATION", "")
22 |
23 | ARTIFACT_STORE_URI = os.path.join(GCS_LOCATION, "tfx_artifacts")
24 | MODEL_REGISTRY_URI = os.getenv(
25 | "MODEL_REGISTRY_URI",
26 | os.path.join(GCS_LOCATION, "model_registry"),
27 | )
28 |
29 | DATASET_DISPLAY_NAME = os.getenv("DATASET_DISPLAY_NAME", "chicago-taxi-tips")
30 | MODEL_DISPLAY_NAME = os.getenv(
31 | "MODEL_DISPLAY_NAME", f"{DATASET_DISPLAY_NAME}-classifier"
32 | )
33 | PIPELINE_NAME = os.getenv("PIPELINE_NAME", f"{MODEL_DISPLAY_NAME}-train-pipeline")
34 |
35 | ML_USE_COLUMN = "ml_use"
36 | EXCLUDE_COLUMNS = ",".join(["trip_start_timestamp"])
37 | TRAIN_LIMIT = os.getenv("TRAIN_LIMIT", "0")
38 | TEST_LIMIT = os.getenv("TEST_LIMIT", "0")
39 | SERVE_LIMIT = os.getenv("SERVE_LIMIT", "0")
40 |
41 | NUM_TRAIN_SPLITS = os.getenv("NUM_TRAIN_SPLITS", "4")
42 | NUM_EVAL_SPLITS = os.getenv("NUM_EVAL_SPLITS", "1")
43 | ACCURACY_THRESHOLD = os.getenv("ACCURACY_THRESHOLD", "0.8")
44 |
45 | USE_KFP_SA = os.getenv("USE_KFP_SA", "False")
46 |
47 | TFX_IMAGE_URI = os.getenv(
48 | "TFX_IMAGE_URI", f"gcr.io/{PROJECT}/tfx-{DATASET_DISPLAY_NAME}:latest"
49 | )
50 |
51 | BEAM_RUNNER = os.getenv("BEAM_RUNNER", "DirectRunner")
52 | BEAM_DIRECT_PIPELINE_ARGS = [
53 | f"--project={PROJECT}",
54 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}",
55 | ]
56 | BEAM_DATAFLOW_PIPELINE_ARGS = [
57 | f"--project={PROJECT}",
58 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}",
59 | f"--region={REGION}",
60 | f"--runner={BEAM_RUNNER}",
61 | ]
62 |
63 | TRAINING_RUNNER = os.getenv("TRAINING_RUNNER", "local")
64 | VERTEX_TRAINING_ARGS = {
65 | 'project': PROJECT,
66 | 'worker_pool_specs': [{
67 | 'machine_spec': {
68 | 'machine_type': 'n1-standard-4',
69 | # 'accelerator_type': 'NVIDIA_TESLA_K80',
70 | # 'accelerator_count': 1
71 | },
72 | 'replica_count': 1,
73 | 'container_spec': {
74 | 'image_uri': TFX_IMAGE_URI,
75 | },
76 | }],
77 | }
78 | VERTEX_TRAINING_CONFIG = {
79 | tfx.extensions.google_cloud_ai_platform.ENABLE_UCAIP_KEY: True,
80 | tfx.extensions.google_cloud_ai_platform.UCAIP_REGION_KEY: REGION,
81 | tfx.extensions.google_cloud_ai_platform.TRAINING_ARGS_KEY: VERTEX_TRAINING_ARGS,
82 | 'use_gpu': False,
83 | }
84 |
85 | SERVING_RUNTIME = os.getenv("SERVING_RUNTIME", "tf2-cpu.2-5")
86 | SERVING_IMAGE_URI = f"us-docker.pkg.dev/vertex-ai/prediction/{SERVING_RUNTIME}:latest"
87 |
88 | BATCH_PREDICTION_BQ_DATASET_NAME = os.getenv(
89 | "BATCH_PREDICTION_BQ_DATASET_NAME", "playground_us"
90 | )
91 | BATCH_PREDICTION_BQ_TABLE_NAME = os.getenv(
92 | "BATCH_PREDICTION_BQ_TABLE_NAME", "chicago_taxitrips_prep"
93 | )
94 | BATCH_PREDICTION_BEAM_ARGS = {
95 | "runner": f"{BEAM_RUNNER}",
96 | "temporary_dir": os.path.join(GCS_LOCATION, "temp"),
97 | "gcs_location": os.path.join(GCS_LOCATION, "temp"),
98 | "project": PROJECT,
99 | "region": REGION,
100 | "setup_file": "./setup.py",
101 | }
102 | BATCH_PREDICTION_JOB_RESOURCES = {
103 | "machine_type": "n1-standard-2",
104 | #'accelerator_count': 1,
105 | #'accelerator_type': 'NVIDIA_TESLA_T4'
106 | "starting_replica_count": 1,
107 | "max_replica_count": 10,
108 | }
109 | DATASTORE_PREDICTION_KIND = f"{MODEL_DISPLAY_NAME}-predictions"
110 |
111 | ENABLE_CACHE = os.getenv("ENABLE_CACHE", "0")
112 | UPLOAD_MODEL = os.getenv("UPLOAD_MODEL", "1")
113 |
114 | os.environ["PROJECT"] = PROJECT
115 | os.environ["PIPELINE_NAME"] = PIPELINE_NAME
116 | os.environ["TFX_IMAGE_URI"] = TFX_IMAGE_URI
117 | os.environ["MODEL_REGISTRY_URI"] = MODEL_REGISTRY_URI
118 |
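Because every setting is resolved from the environment at import time, overrides must be exported before src.tfx_pipelines.config is first imported. A minimal sketch with placeholder values:

import os

# Placeholder values; set these before the first import of the config module.
os.environ["PROJECT"] = "my-gcp-project"
os.environ["REGION"] = "us-central1"
os.environ["GCS_LOCATION"] = "gs://my-bucket/chicago-taxi"
os.environ["BEAM_RUNNER"] = "DataflowRunner"
os.environ["TRAINING_RUNNER"] = "vertex"

from src.tfx_pipelines import config

# Derived default: chicago-taxi-tips-classifier-train-pipeline
print(config.PIPELINE_NAME)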
--------------------------------------------------------------------------------
/src/tfx_pipelines/prediction_pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX prediction pipeline definition."""
15 |
16 | import os
17 | import sys
18 | import json
19 | import logging
20 |
21 | from tfx.orchestration import pipeline, data_types
22 | from ml_metadata.proto import metadata_store_pb2
23 |
24 | SCRIPT_DIR = os.path.dirname(
25 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
26 | )
27 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, "..")))
28 |
29 | from src.tfx_pipelines import config
30 | from src.tfx_pipelines import components as custom_components
31 | from src.common import datasource_utils
32 |
33 |
34 | def create_pipeline(
35 | pipeline_root: str,
36 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None,
37 | ):
38 |
39 | # Get source query.
40 | sql_query = datasource_utils.get_serving_source_query(
41 | bq_dataset_name=config.BATCH_PREDICTION_BQ_DATASET_NAME,
42 | bq_table_name=config.BATCH_PREDICTION_BQ_TABLE_NAME,
43 | limit=int(config.SERVE_LIMIT),
44 | )
45 |
46 | bigquery_data_gen = custom_components.bigquery_data_gen(
47 | sql_query=sql_query,
48 | output_data_format="jsonl",
49 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS),
50 | )
51 |
52 | vertex_batch_prediction = custom_components.vertex_batch_prediction(
53 | project=config.PROJECT,
54 | region=config.REGION,
55 | model_display_name=config.MODEL_DISPLAY_NAME,
56 | instances_format="jsonl",
57 | predictions_format="jsonl",
58 | job_resources=json.dumps(config.BATCH_PREDICTION_JOB_RESOURCES),
59 | serving_dataset=bigquery_data_gen.outputs["serving_dataset"],
60 | )
61 |
62 | datastore_prediction_writer = custom_components.datastore_prediction_writer(
63 | datastore_kind=config.DATASTORE_PREDICTION_KIND,
64 | predictions_format="jsonl",
65 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS),
66 | prediction_results=vertex_batch_prediction.outputs["prediction_results"],
67 | )
68 |
69 | pipeline_components = [
70 | bigquery_data_gen,
71 | vertex_batch_prediction,
72 | datastore_prediction_writer,
73 | ]
74 |
75 | logging.info(
76 | f"Pipeline components: {[component.id for component in pipeline_components]}"
77 | )
78 |
79 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS
80 | if config.BEAM_RUNNER == "DataflowRunner":
81 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS
82 |
83 | logging.info(f"Beam pipeline args: {beam_pipeline_args}")
84 |
85 | return pipeline.Pipeline(
86 | pipeline_name=config.PIPELINE_NAME,
87 | pipeline_root=pipeline_root,
88 | components=pipeline_components,
89 | beam_pipeline_args=beam_pipeline_args,
90 | metadata_connection_config=metadata_connection_config,
91 | enable_cache=int(config.ENABLE_CACHE),
92 | )
93 |
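For a quick local check, the same pipeline can be executed with TFX's LocalDagRunner, building pipeline_root the same way runner.py does below. This sketch mirrors pipeline_deployment_tests.py and assumes PROJECT, REGION, GCS_LOCATION and MODEL_DISPLAY_NAME are already exported and that the batch-prediction BigQuery source exists:

import os
from tfx.orchestration.local.local_dag_runner import LocalDagRunner
from ml_metadata.proto import metadata_store_pb2

from src.tfx_pipelines import config, prediction_pipeline

# Local SQLite-backed ML Metadata store, as in pipeline_deployment_tests.py.
metadata_connection_config = metadata_store_pb2.ConnectionConfig()
metadata_connection_config.sqlite.filename_uri = "mlmd.sqllite"
metadata_connection_config.sqlite.connection_mode = 3

# Mirror runner.py: artifacts are stored under ARTIFACT_STORE_URI/PIPELINE_NAME.
pipeline_root = os.path.join(config.ARTIFACT_STORE_URI, config.PIPELINE_NAME)

LocalDagRunner().run(
    prediction_pipeline.create_pipeline(
        pipeline_root=pipeline_root,
        metadata_connection_config=metadata_connection_config,
    )
)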
--------------------------------------------------------------------------------
/src/tfx_pipelines/runner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Define KubeflowV2DagRunner to run the training pipeline using Managed Pipelines."""
15 |
16 |
17 | import os
18 | from kfp.v2.google.client import AIPlatformClient
19 | from tfx.orchestration import data_types
20 | from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner
21 |
22 |
23 | from src.tfx_pipelines import config, training_pipeline, prediction_pipeline
24 | from src.model_training import defaults
25 |
26 |
27 | def compile_training_pipeline(pipeline_definition_file):
28 |
29 | pipeline_root = os.path.join(
30 | config.ARTIFACT_STORE_URI,
31 | config.PIPELINE_NAME,
32 | )
33 |
34 | managed_pipeline = training_pipeline.create_pipeline(
35 | pipeline_root=pipeline_root,
36 | num_epochs=data_types.RuntimeParameter(
37 | name="num_epochs",
38 | default=defaults.NUM_EPOCHS,
39 | ptype=int,
40 | ),
41 | batch_size=data_types.RuntimeParameter(
42 | name="batch_size",
43 | default=defaults.BATCH_SIZE,
44 | ptype=int,
45 | ),
46 | learning_rate=data_types.RuntimeParameter(
47 | name="learning_rate",
48 | default=defaults.LEARNING_RATE,
49 | ptype=float,
50 | ),
51 | hidden_units=data_types.RuntimeParameter(
52 | name="hidden_units",
53 | default=",".join(str(u) for u in defaults.HIDDEN_UNITS),
54 | ptype=str,
55 | ),
56 | )
57 |
58 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
59 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
60 | default_image=config.TFX_IMAGE_URI
61 | ),
62 | output_filename=pipeline_definition_file,
63 | )
64 |
65 | return runner.run(managed_pipeline, write_out=True)
66 |
67 |
68 | def compile_prediction_pipeline(pipeline_definition_file):
69 |
70 | pipeline_root = os.path.join(
71 | config.ARTIFACT_STORE_URI,
72 | config.PIPELINE_NAME,
73 | )
74 |
75 | managed_pipeline = prediction_pipeline.create_pipeline(
76 | pipeline_root=pipeline_root,
77 | )
78 |
79 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
80 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
81 | default_image=config.TFX_IMAGE_URI
82 | ),
83 | output_filename=pipeline_definition_file,
84 | )
85 |
86 | return runner.run(managed_pipeline, write_out=True)
87 |
88 |
89 | def submit_pipeline(pipeline_definition_file):
90 |
91 | pipeline_client = AIPlatformClient(project_id=config.PROJECT, region=config.REGION)
92 | pipeline_client.create_run_from_job_spec(pipeline_definition_file)
93 |
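Putting the two entry points together, a typical flow compiles a pipeline to a job-spec file and then submits it to Vertex AI Pipelines; the filename below is hypothetical:

from src.tfx_pipelines import runner

PIPELINE_DEFINITION_FILE = "chicago-taxi-tips-classifier-train-pipeline.json"

# Compile the TFX training pipeline to a Vertex AI Pipelines job spec, then
# submit it with the AIPlatformClient configured from config.PROJECT/REGION.
runner.compile_training_pipeline(PIPELINE_DEFINITION_FILE)
runner.submit_pipeline(PIPELINE_DEFINITION_FILE)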
--------------------------------------------------------------------------------
/src/tfx_pipelines/training_pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX training pipeline definition."""
15 |
16 | import os
17 | import sys
18 | import logging
19 | import json
20 |
21 | import tensorflow_model_analysis as tfma
22 |
23 | from ml_metadata.proto import metadata_store_pb2
24 | from tfx.proto import example_gen_pb2, transform_pb2, pusher_pb2
25 | from tfx.types import Channel, standard_artifacts
26 | from tfx.orchestration import pipeline, data_types
27 | from tfx.dsl.components.common.importer import Importer
28 | from tfx.dsl.components.common.resolver import Resolver
29 | from tfx.dsl.experimental import latest_artifacts_resolver
30 | from tfx.dsl.experimental import latest_blessed_model_resolver
31 | from tfx.v1.extensions.google_cloud_big_query import BigQueryExampleGen
32 | from tfx.v1.extensions.google_cloud_ai_platform import Trainer as VertexTrainer
33 | from tfx.v1.components import (
34 | StatisticsGen,
35 | ExampleValidator,
36 | Transform,
37 | Trainer,
38 | Evaluator,
39 | Pusher,
40 | )
41 |
42 | SCRIPT_DIR = os.path.dirname(
43 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
44 | )
45 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, "..")))
46 |
47 | from src.tfx_pipelines import config
48 | from src.tfx_pipelines import components as custom_components
49 | from src.common import features, datasource_utils
50 |
51 | RAW_SCHEMA_DIR = "src/raw_schema"
52 | TRANSFORM_MODULE_FILE = "src/preprocessing/transformations.py"
53 | TRAIN_MODULE_FILE = "src/model_training/runner.py"
54 |
55 |
56 | def create_pipeline(
57 | pipeline_root: str,
58 | num_epochs: data_types.RuntimeParameter,
59 | batch_size: data_types.RuntimeParameter,
60 | learning_rate: data_types.RuntimeParameter,
61 | hidden_units: data_types.RuntimeParameter,
62 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None,
63 | ):
64 |
65 | # Hyperparameter generation.
66 | hyperparams_gen = custom_components.hyperparameters_gen(
67 | num_epochs=num_epochs,
68 | batch_size=batch_size,
69 | learning_rate=learning_rate,
70 | hidden_units=hidden_units,
71 | ).with_id("HyperparamsGen")
72 |
73 | # Get train source query.
74 | train_sql_query = datasource_utils.get_training_source_query(
75 | config.PROJECT,
76 | config.REGION,
77 | config.DATASET_DISPLAY_NAME,
78 | ml_use="UNASSIGNED",
79 | limit=int(config.TRAIN_LIMIT),
80 | )
81 |
82 | train_output_config = example_gen_pb2.Output(
83 | split_config=example_gen_pb2.SplitConfig(
84 | splits=[
85 | example_gen_pb2.SplitConfig.Split(
86 | name="train", hash_buckets=int(config.NUM_TRAIN_SPLITS)
87 | ),
88 | example_gen_pb2.SplitConfig.Split(
89 | name="eval", hash_buckets=int(config.NUM_EVAL_SPLITS)
90 | ),
91 | ]
92 | )
93 | )
94 |
95 | # Train example generation.
96 | train_example_gen = BigQueryExampleGen(
97 | query=train_sql_query,
98 | output_config=train_output_config,
99 | ).with_id("TrainDataGen")
100 |
101 | # Get test source query.
102 | test_sql_query = datasource_utils.get_training_source_query(
103 | config.PROJECT,
104 | config.REGION,
105 | config.DATASET_DISPLAY_NAME,
106 | ml_use="TEST",
107 | limit=int(config.TEST_LIMIT),
108 | )
109 |
110 | test_output_config = example_gen_pb2.Output(
111 | split_config=example_gen_pb2.SplitConfig(
112 | splits=[
113 | example_gen_pb2.SplitConfig.Split(name="test", hash_buckets=1),
114 | ]
115 | )
116 | )
117 |
118 | # Test example generation.
119 | test_example_gen = BigQueryExampleGen(
120 | query=test_sql_query,
121 | output_config=test_output_config,
122 | ).with_id("TestDataGen")
123 |
124 | # Schema importer.
125 | schema_importer = Importer(
126 | source_uri=RAW_SCHEMA_DIR,
127 | artifact_type=standard_artifacts.Schema,
128 | ).with_id("SchemaImporter")
129 |
130 | # Statistics generation.
131 | statistics_gen = StatisticsGen(examples=train_example_gen.outputs["examples"]).with_id(
132 | "StatisticsGen"
133 | )
134 |
135 | # Example validation.
136 | example_validator = ExampleValidator(
137 | statistics=statistics_gen.outputs["statistics"],
138 | schema=schema_importer.outputs["result"],
139 | ).with_id("ExampleValidator")
140 |
141 | # Data transformation.
142 | transform = Transform(
143 | examples=train_example_gen.outputs["examples"],
144 | schema=schema_importer.outputs["result"],
145 | module_file=TRANSFORM_MODULE_FILE,
146 | # This is a temporary workaround to run on Dataflow.
147 | force_tf_compat_v1=config.BEAM_RUNNER == "DataflowRunner",
148 | splits_config=transform_pb2.SplitsConfig(
149 | analyze=["train"], transform=["train", "eval"]
150 | ),
151 | ).with_id("DataTransformer")
152 |
153 | # Add dependency from example_validator to transform.
154 | transform.add_upstream_node(example_validator)
155 |
156 | # Get the latest model to warmstart
157 | warmstart_model_resolver = Resolver(
158 | strategy_class=latest_artifacts_resolver.LatestArtifactsResolver,
159 | latest_model=Channel(type=standard_artifacts.Model),
160 | ).with_id("WarmstartModelResolver")
161 |
162 | # Model training.
163 | trainer = Trainer(
164 | module_file=TRAIN_MODULE_FILE,
165 | examples=transform.outputs["transformed_examples"],
166 | schema=schema_importer.outputs["result"],
167 | base_model=warmstart_model_resolver.outputs["latest_model"],
168 | transform_graph=transform.outputs["transform_graph"],
169 | hyperparameters=hyperparams_gen.outputs["hyperparameters"],
170 | ).with_id("ModelTrainer")
171 |
172 | if config.TRAINING_RUNNER == "vertex":
173 | trainer = VertexTrainer(
174 | module_file=TRAIN_MODULE_FILE,
175 | examples=transform.outputs["transformed_examples"],
176 | schema=schema_importer.outputs["result"],
177 | base_model=warmstart_model_resolver.outputs["latest_model"],
178 | transform_graph=transform.outputs["transform_graph"],
179 | hyperparameters=hyperparams_gen.outputs["hyperparameters"],
180 | custom_config=config.VERTEX_TRAINING_CONFIG
181 | ).with_id("ModelTrainer")
182 |
183 |
184 | # Get the latest blessed model (baseline) for model validation.
185 | baseline_model_resolver = Resolver(
186 | strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
187 | model=Channel(type=standard_artifacts.Model),
188 | model_blessing=Channel(type=standard_artifacts.ModelBlessing),
189 | ).with_id("BaselineModelResolver")
190 |
191 | # Prepare evaluation config.
192 | eval_config = tfma.EvalConfig(
193 | model_specs=[
194 | tfma.ModelSpec(
195 | signature_name="serving_tf_example",
196 | label_key=features.TARGET_FEATURE_NAME,
197 | prediction_key="probabilities",
198 | )
199 | ],
200 | slicing_specs=[
201 | tfma.SlicingSpec(),
202 | ],
203 | metrics_specs=[
204 | tfma.MetricsSpec(
205 | metrics=[
206 | tfma.MetricConfig(class_name="ExampleCount"),
207 | tfma.MetricConfig(
208 | class_name="BinaryAccuracy",
209 | threshold=tfma.MetricThreshold(
210 | value_threshold=tfma.GenericValueThreshold(
211 | lower_bound={"value": float(config.ACCURACY_THRESHOLD)}
212 | ),
213 | # Change threshold will be ignored if there is no
214 | # baseline model resolved from MLMD (first run).
215 | change_threshold=tfma.GenericChangeThreshold(
216 | direction=tfma.MetricDirection.HIGHER_IS_BETTER,
217 | absolute={"value": -1e-10},
218 | ),
219 | ),
220 | ),
221 | ]
222 | )
223 | ],
224 | )
225 |
226 | # Model evaluation.
227 | evaluator = Evaluator(
228 | examples=test_example_gen.outputs["examples"],
229 | example_splits=["test"],
230 | model=trainer.outputs["model"],
231 | baseline_model=baseline_model_resolver.outputs["model"],
232 | eval_config=eval_config,
233 | schema=schema_importer.outputs["result"],
234 | ).with_id("ModelEvaluator")
235 |
236 | exported_model_location = os.path.join(
237 | config.MODEL_REGISTRY_URI, config.MODEL_DISPLAY_NAME
238 | )
239 | push_destination = pusher_pb2.PushDestination(
240 | filesystem=pusher_pb2.PushDestination.Filesystem(
241 | base_directory=exported_model_location
242 | )
243 | )
244 |
245 | # Push custom model to model registry.
246 | pusher = Pusher(
247 | model=trainer.outputs["model"],
248 | model_blessing=evaluator.outputs["blessing"],
249 | push_destination=push_destination,
250 | ).with_id("ModelPusher")
251 |
252 | # Upload custom trained model to Vertex AI.
253 | labels = {
254 | "dataset_name": config.DATASET_DISPLAY_NAME,
255 | "pipeline_name": config.PIPELINE_NAME,
256 | "pipeline_root": pipeline_root
257 | }
258 | labels = json.dumps(labels)
259 | explanation_config = json.dumps(features.generate_explanation_config())
260 |
261 | vertex_model_uploader = custom_components.vertex_model_uploader(
262 | project=config.PROJECT,
263 | region=config.REGION,
264 | model_display_name=config.MODEL_DISPLAY_NAME,
265 | pushed_model_location=exported_model_location,
266 | serving_image_uri=config.SERVING_IMAGE_URI,
267 | model_blessing=evaluator.outputs["blessing"],
268 | explanation_config=explanation_config,
269 | labels=labels
270 | ).with_id("VertexUploader")
271 |
272 | pipeline_components = [
273 | hyperparams_gen,
274 | train_example_gen,
275 | test_example_gen,
276 | statistics_gen,
277 | schema_importer,
278 | example_validator,
279 | transform,
280 | warmstart_model_resolver,
281 | trainer,
282 | baseline_model_resolver,
283 | evaluator,
284 | pusher,
285 | ]
286 |
287 | if int(config.UPLOAD_MODEL):
288 | pipeline_components.append(vertex_model_uploader)
289 | # Add dependency from pusher to aip_model_uploader.
290 | vertex_model_uploader.add_upstream_node(pusher)
291 |
292 | logging.info(
293 | f"Pipeline components: {[component.id for component in pipeline_components]}"
294 | )
295 |
296 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS
297 | if config.BEAM_RUNNER == "DataflowRunner":
298 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS
299 |
300 | logging.info(f"Beam pipeline args: {beam_pipeline_args}")
301 |
302 | return pipeline.Pipeline(
303 | pipeline_name=config.PIPELINE_NAME,
304 | pipeline_root=pipeline_root,
305 | components=pipeline_components,
306 | beam_pipeline_args=beam_pipeline_args,
307 | metadata_connection_config=metadata_connection_config,
308 | enable_cache=int(config.ENABLE_CACHE),
309 | )
310 |
--------------------------------------------------------------------------------