├── .gitignore
├── 01-dataset-management.ipynb
├── 02-experimentation.ipynb
├── 03-training-formalization.ipynb
├── 04-pipeline-deployment.ipynb
├── 05-continuous-training.ipynb
├── 06-model-deployment.ipynb
├── 07-prediction-serving.ipynb
├── 08-model-monitoring.ipynb
├── Dockerfile
├── LICENSE
├── README.md
├── build
│   ├── Dockerfile
│   ├── model-deployment.yaml
│   ├── pipeline-deployment.yaml
│   ├── serving_resources_spec.json
│   └── utils.py
├── mlops.png
├── provision
│   ├── README.md
│   └── terraform
│       ├── gcs-bucket.tf
│       ├── main.tf
│       ├── notebook-instance.tf
│       ├── service-accounts.tf
│       ├── services.tf
│       ├── terraform.tfvars
│       └── variables.tf
├── requirements.txt
├── setup.py
└── src
    ├── __init__.py
    ├── common
    │   ├── __init__.py
    │   ├── datasource_utils.py
    │   └── features.py
    ├── model_training
    │   ├── __init__.py
    │   ├── data.py
    │   ├── defaults.py
    │   ├── exporter.py
    │   ├── model.py
    │   ├── runner.py
    │   ├── task.py
    │   └── trainer.py
    ├── pipeline_triggering
    │   ├── __init__.py
    │   ├── main.py
    │   └── requirements.txt
    ├── preprocessing
    │   ├── __init__.py
    │   ├── etl.py
    │   └── transformations.py
    ├── raw_schema
    │   └── schema.pbtxt
    ├── tests
    │   ├── __init__.py
    │   ├── datasource_utils_tests.py
    │   ├── etl_tests.py
    │   ├── model_deployment_tests.py
    │   ├── model_tests.py
    │   └── pipeline_deployment_tests.py
    └── tfx_pipelines
        ├── __init__.py
        ├── components.py
        ├── config.py
        ├── prediction_pipeline.py
        ├── runner.py
        └── training_pipeline.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | develop-eggs/
12 | dist/
13 | downloads/
14 | eggs/
15 | .eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | wheels/
22 | pip-wheel-metadata/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 | db.sqlite3-journal
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # IPython
80 | profile_default/
81 | ipython_config.py
82 | .idea/
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | _workspace/
132 | *.tar.gz
133 | .egg-info/
134 | *.whl
135 | mlpipeline-ui-metadata.json
136 | *.csv
137 | *.sqllite
138 | model.png
139 |
--------------------------------------------------------------------------------
/04-pipeline-deployment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "d272910e",
6 | "metadata": {},
7 | "source": [
8 | "# 04 - Test and deploy a TFX training pipeline to `Vertex Pipelines`\n",
9 | "\n",
10 | "The purpose of this notebook is to test, deploy, and run the `TFX` pipeline on `Vertex Pipelines`. The notebook covers the following tasks:\n",
11 | "\n",
12 | "1. Run the tests locally.\n",
13 | "2. Run the `TFX` pipeline using `Vertex Pipelines`\n",
14 | "3. Execute the pipeline deployment `CI/CD` steps using `Cloud Build`."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "id": "beaa2787",
20 | "metadata": {},
21 | "source": [
22 | "## Setup"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "id": "51e05608",
28 | "metadata": {},
29 | "source": [
30 | "### Import libraries"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "id": "9aa72b29",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import os\n",
41 | "import kfp\n",
42 | "import tfx\n",
43 | "\n",
44 | "print('TFX:', tfx.__version__)\n",
45 | "print('KFP:', kfp.__version__)"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "id": "24aceb9a",
51 | "metadata": {},
52 | "source": [
53 | "### Setup Google Cloud project"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "id": "d8d9f81b",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n",
64 | "REGION = 'us-central1' # Change to your region.\n",
65 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n",
66 | "SERVICE_ACCOUNT = '[your-service-account]'\n",
67 | "\n",
68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n",
69 | " # Get your GCP project id from gcloud\n",
70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
71 | " PROJECT_ID = shell_output[0]\n",
72 | " \n",
73 | "if SERVICE_ACCOUNT == '' or SERVICE_ACCOUNT is None or SERVICE_ACCOUNT == '[your-service-account]':\n",
74 | " # Get your GCP project id from gcloud\n",
75 | " shell_output = !gcloud config list --format 'value(core.account)' 2>/dev/null\n",
76 | " SERVICE_ACCOUNT = shell_output[0]\n",
77 | " \n",
78 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n",
79 | " # Set your bucket name using your GCP project id\n",
80 | " BUCKET = PROJECT_ID\n",
81 | " # Try to create the bucket if it doesn'exists\n",
82 | " ! gsutil mb -l $REGION gs://$BUCKET\n",
83 | " print('')\n",
84 | " \n",
85 | "print('Project ID:', PROJECT_ID)\n",
86 | "print('Region:', REGION)\n",
87 | "print('Bucket name:', BUCKET)\n",
88 | "print('Service Account:', SERVICE_ACCOUNT)"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "id": "3d24d18b",
94 | "metadata": {},
95 | "source": [
96 | "### Set configurations"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "id": "a8295cca",
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "BQ_LOCATION = 'US'\n",
107 | "BQ_DATASET_NAME = 'playground_us' # Change to your BQ dataset name.\n",
108 | "BQ_TABLE_NAME = 'chicago_taxitrips_prep'\n",
109 | "\n",
110 | "VERSION = 'v1'\n",
111 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
112 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n",
113 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n",
114 | "\n",
115 | "CICD_IMAGE_NAME = 'cicd:latest'\n",
116 | "CICD_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{CICD_IMAGE_NAME}'"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "id": "81d049f5",
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "! rm -r src/raw_schema/.ipynb_checkpoints/"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "id": "4cbcbbb8",
132 | "metadata": {},
133 | "source": [
134 | "## 1. Run the CI/CD steps locally"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "id": "58845362",
140 | "metadata": {},
141 | "source": [
142 | "### Set pipeline configurations for the local run"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "id": "44c48da6",
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "os.environ['DATASET_DISPLAY_NAME'] = DATASET_DISPLAY_NAME\n",
153 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n",
154 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n",
155 | "os.environ['PROJECT'] = PROJECT_ID\n",
156 | "os.environ['REGION'] = REGION\n",
157 | "os.environ['BQ_LOCATION'] = BQ_LOCATION\n",
158 | "os.environ['BQ_DATASET_NAME'] = BQ_DATASET_NAME\n",
159 | "os.environ['BQ_TABLE_NAME'] = BQ_TABLE_NAME\n",
160 | "os.environ['GCS_LOCATION'] = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/e2e_tests'\n",
161 | "os.environ['TRAIN_LIMIT'] = '1000'\n",
162 | "os.environ['TEST_LIMIT'] = '100'\n",
163 | "os.environ['UPLOAD_MODEL'] = '0'\n",
164 | "os.environ['ACCURACY_THRESHOLD'] = '0.1'\n",
165 | "os.environ['BEAM_RUNNER'] = 'DirectRunner'\n",
166 | "os.environ['TRAINING_RUNNER'] = 'local'"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "id": "fcf65dee",
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "from src.tfx_pipelines import config\n",
177 | "import importlib\n",
178 | "importlib.reload(config)\n",
179 | "\n",
180 | "for key, value in config.__dict__.items():\n",
181 | " if key.isupper(): print(f'{key}: {value}')"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "id": "0e4989b9",
187 | "metadata": {},
188 | "source": [
189 | "### Run the unit tests for the data and model pipeline components"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "id": "37324634",
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "! py.test src/tests/datasource_utils_tests.py -s"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "id": "1a40f106",
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "! py.test src/tests/model_tests.py -s"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "id": "f3b62aea",
215 | "metadata": {},
216 | "source": [
217 | "### Run the e2e pipeline test"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "id": "3acb31cf",
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "! py.test src/tests/pipeline_deployment_tests.py::test_e2e_pipeline -s"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "id": "1c8758df",
233 | "metadata": {},
234 | "source": [
235 | "## 2. Run the training pipeline using `Vertex Pipelines`"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "id": "a02ce062",
241 | "metadata": {},
242 | "source": [
243 | "### Set the pipeline configurations for the `Vertex Pipeline` run"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "id": "01c2b3e1",
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "os.environ['DATASET_DISPLAY_NAME'] = DATASET_DISPLAY_NAME\n",
254 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n",
255 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n",
256 | "os.environ['PROJECT'] = PROJECT_ID\n",
257 | "os.environ['REGION'] = REGION\n",
258 | "os.environ['GCS_LOCATION'] = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}'\n",
259 | "os.environ['TRAIN_LIMIT'] = '85000'\n",
260 | "os.environ['TEST_LIMIT'] = '15000'\n",
261 | "os.environ['BEAM_RUNNER'] = 'DataflowRunner'\n",
262 | "os.environ['TRAINING_RUNNER'] = 'vertex'\n",
263 | "os.environ['TFX_IMAGE_URI'] = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "id": "9e8be723",
270 | "metadata": {},
271 | "outputs": [],
272 | "source": [
273 | "from src.tfx_pipelines import config\n",
274 | "import importlib\n",
275 | "importlib.reload(config)\n",
276 | "\n",
277 | "for key, value in config.__dict__.items():\n",
278 | " if key.isupper(): print(f'{key}: {value}')"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "id": "286ff84e",
284 | "metadata": {},
285 | "source": [
286 | "### Build the training container image\n",
287 | "\n",
288 | "This is the `TFX` runtime environment for the training pipeline steps."
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "id": "d9686014",
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "!echo $TFX_IMAGE_URI"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "id": "7f7986c2",
305 | "metadata": {},
306 | "outputs": [],
307 | "source": [
308 | "!gcloud builds submit --tag $TFX_IMAGE_URI . --timeout=15m --machine-type=e2-highcpu-8"
309 | ]
310 | },
311 | {
312 | "cell_type": "markdown",
313 | "id": "ce2d5c9a",
314 | "metadata": {},
315 | "source": [
316 | "### Compile the `TFX` pipeline"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "id": "df29fc7e",
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "from src.tfx_pipelines import runner\n",
327 | "\n",
328 | "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n",
329 | "pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "id": "75928c08",
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "PIPELINES_STORE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/compiled_pipelines/'\n",
340 | "! gsutil cp {pipeline_definition_file} {PIPELINES_STORE}"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "id": "ef836781",
346 | "metadata": {},
347 | "source": [
348 | "### Submit run to Vertex Pipelines"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "id": "1d1115bd",
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "from kfp.v2.google.client import AIPlatformClient\n",
359 | "\n",
360 | "pipeline_client = AIPlatformClient(\n",
361 | " project_id=PROJECT_ID, region=REGION)\n",
362 | " \n",
363 | "job = pipeline_client.create_run_from_job_spec(\n",
364 | " job_spec_path=pipeline_definition_file,\n",
365 | " parameter_values={\n",
366 | " 'learning_rate': 0.003,\n",
367 | " 'batch_size': 512,\n",
368 | " 'hidden_units': '128,128',\n",
369 | " 'num_epochs': 30,\n",
370 | " }\n",
371 | ")"
372 | ]
373 | },
374 | {
375 | "cell_type": "markdown",
376 | "id": "7cc4477f",
377 | "metadata": {},
378 | "source": [
379 | "### Extracting pipeline runs metadata"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "id": "464ad3a8",
386 | "metadata": {},
387 | "outputs": [],
388 | "source": [
389 | "from google.cloud import aiplatform as vertex_ai\n",
390 | "\n",
391 | "pipeline_df = vertex_ai.get_pipeline_df(PIPELINE_NAME)\n",
392 | "pipeline_df = pipeline_df[pipeline_df.pipeline_name == PIPELINE_NAME]\n",
393 | "pipeline_df.T"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "id": "ad380129",
399 | "metadata": {},
400 | "source": [
401 | "## 3. Execute the pipeline deployment CI/CD steps in Cloud Build\n",
402 | "\n",
403 | "The CI/CD routine is defined in the [pipeline-deployment.yaml](pipeline-deployment.yaml) file, and consists of the following steps:\n",
404 | "1. Clone the repository to the build environment.\n",
405 | "2. Run unit tests.\n",
406 | "3. Run a local e2e test of the pipeline.\n",
407 | "4. Build the ML container image for pipeline steps.\n",
408 | "5. Compile the pipeline.\n",
409 | "6. Upload the pipeline to Cloud Storage."
410 | ]
411 | },
412 | {
413 | "cell_type": "markdown",
414 | "id": "e00b075f",
415 | "metadata": {},
416 | "source": [
417 | "### Build CI/CD container image for Cloud Build\n",
418 | "\n",
419 | "This is the runtime environment where the steps of testing and deploying the pipeline will be executed."
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "id": "867e5ae1",
426 | "metadata": {},
427 | "outputs": [],
428 | "source": [
429 | "! echo $CICD_IMAGE_URI"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": null,
435 | "id": "40f497f6",
436 | "metadata": {},
437 | "outputs": [],
438 | "source": [
439 | "! gcloud builds submit --tag $CICD_IMAGE_URI build/. --timeout=15m --machine-type=e2-highcpu-8"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "id": "4f6e2dd7",
445 | "metadata": {},
446 | "source": [
447 | "### Run CI/CD from pipeline deployment using Cloud Build"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "id": "117895d7",
454 | "metadata": {},
455 | "outputs": [],
456 | "source": [
457 | "REPO_URL = 'https://github.com/ksalama/ucaip-labs.git' # Change to your github repo.\n",
458 | "BRANCH = 'main'\n",
459 | "\n",
460 | "GCS_LOCATION = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/'\n",
461 | "TEST_GCS_LOCATION = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/e2e_tests'\n",
462 | "CI_TRAIN_LIMIT = 1000\n",
463 | "CI_TEST_LIMIT = 100\n",
464 | "CI_UPLOAD_MODEL = 0\n",
465 | "CI_ACCURACY_THRESHOLD = 0.1\n",
466 | "BEAM_RUNNER = 'DataflowRunner'\n",
467 | "TRAINING_RUNNER = 'vertex'\n",
468 | "VERSION = 'tfx-0-30'\n",
469 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n",
470 | "PIPELINES_STORE = os.path.join(GCS_LOCATION, 'compiled_pipelines')\n",
471 | "\n",
472 | "TFX_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'\n",
473 | "\n",
474 | "SUBSTITUTIONS=f'''\\\n",
475 | "_REPO_URL='{REPO_URL}',\\\n",
476 | "_BRANCH={BRANCH},\\\n",
477 | "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n",
478 | "_PROJECT_ID={PROJECT_ID},\\\n",
479 | "_REGION={REGION},\\\n",
480 | "_GCS_LOCATION={GCS_LOCATION},\\\n",
481 | "_TEST_GCS_LOCATION={TEST_GCS_LOCATION},\\\n",
482 | "_BQ_LOCATION={BQ_LOCATION},\\\n",
483 | "_BQ_DATASET_NAME={BQ_DATASET_NAME},\\\n",
484 | "_BQ_TABLE_NAME={BQ_TABLE_NAME},\\\n",
485 | "_DATASET_DISPLAY_NAME={DATASET_DISPLAY_NAME},\\\n",
486 | "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n",
487 | "_CI_TRAIN_LIMIT={CI_TRAIN_LIMIT},\\\n",
488 | "_CI_TEST_LIMIT={CI_TEST_LIMIT},\\\n",
489 | "_CI_UPLOAD_MODEL={CI_UPLOAD_MODEL},\\\n",
490 | "_CI_ACCURACY_THRESHOLD={CI_ACCURACY_THRESHOLD},\\\n",
491 | "_BEAM_RUNNER={BEAM_RUNNER},\\\n",
492 | "_TRAINING_RUNNER={TRAINING_RUNNER},\\\n",
493 | "_TFX_IMAGE_URI={TFX_IMAGE_URI},\\\n",
494 | "_PIPELINE_NAME={PIPELINE_NAME},\\\n",
495 | "_PIPELINES_STORE={PIPELINES_STORE}\\\n",
496 | "'''\n",
497 | "\n",
498 | "!echo $SUBSTITUTIONS"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": null,
504 | "id": "b54081db",
505 | "metadata": {},
506 | "outputs": [],
507 | "source": [
508 | "!gcloud builds submit --no-source --timeout=60m --config build/pipeline-deployment.yaml --substitutions {SUBSTITUTIONS} --machine-type=e2-highcpu-8"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": null,
514 | "id": "72d9baf5",
515 | "metadata": {},
516 | "outputs": [],
517 | "source": []
518 | }
519 | ],
520 | "metadata": {
521 | "environment": {
522 | "name": "common-cpu.m73",
523 | "type": "gcloud",
524 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73"
525 | },
526 | "kernelspec": {
527 | "display_name": "Python 3",
528 | "language": "python",
529 | "name": "python3"
530 | },
531 | "language_info": {
532 | "codemirror_mode": {
533 | "name": "ipython",
534 | "version": 3
535 | },
536 | "file_extension": ".py",
537 | "mimetype": "text/x-python",
538 | "name": "python",
539 | "nbconvert_exporter": "python",
540 | "pygments_lexer": "ipython3",
541 | "version": "3.7.10"
542 | }
543 | },
544 | "nbformat": 4,
545 | "nbformat_minor": 5
546 | }
547 |
--------------------------------------------------------------------------------
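Note on 04-pipeline-deployment.ipynb: the notebook above submits the compiled pipeline JSON with `kfp.v2.google.client.AIPlatformClient`. As a minimal sketch of an alternative submission path — assuming a recent `google-cloud-aiplatform` SDK and reusing the notebook's `PROJECT_ID`, `REGION`, `BUCKET`, and `pipeline_definition_file` variables; the `PIPELINE_ROOT` path below is illustrative, not taken from the repository — the same run can be created with `aiplatform.PipelineJob`:

```python
# Sketch only: submit the compiled training pipeline spec with the
# google-cloud-aiplatform SDK instead of kfp.v2.google.client.AIPlatformClient.
# PROJECT_ID, REGION, BUCKET and pipeline_definition_file are assumed to be set
# as in 04-pipeline-deployment.ipynb; PIPELINE_ROOT is a hypothetical path.
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=f'gs://{BUCKET}')

PIPELINE_ROOT = f'gs://{BUCKET}/chicago-taxi-tips/pipeline_root'  # illustrative

job = aiplatform.PipelineJob(
    display_name='chicago-taxi-tips-classifier-v1-train-pipeline',
    template_path=pipeline_definition_file,  # compiled by runner.compile_training_pipeline
    pipeline_root=PIPELINE_ROOT,
    parameter_values={
        'learning_rate': 0.003,
        'batch_size': 512,
        'hidden_units': '128,128',
        'num_epochs': 30,
    },
    enable_caching=False,
)

job.submit()  # returns after submission; job.run() would block until completion
```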
/05-continuous-training.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "26667428",
6 | "metadata": {},
7 | "source": [
8 | "# 05 - Continuous training\n",
9 | "\n",
10 | "After testing, compiling, and uploading the pipeline definition to Cloud Storage, the pipeline is executed with respect to a trigger. `Cloud Functions` and `Cloud Pub/Sub` are used in this notebook as a triggering mechanism. The triggering can be scheduled using `Cloud Scheduler`. The trigger source sends a message to a Cloud Pub/Sub topic that the Cloud Function listens to, and then it submits the pipeline to `Vertex Pipelines` to be executed.\n",
11 | "\n",
12 | "This notebook covers the following steps:\n",
13 | "1. Create the `Cloud Pub/Sub` topic.\n",
14 | "2. Deploy the `Cloud Function` \n",
15 | "3. Test triggering a pipeline.\n",
16 | "4. Extracting pipeline run metadata.\n",
17 | "\n",
18 | "Learn about [Cloud Functions](https://cloud.google.com/functions).\n",
19 | "Learn about [Cloud Pub/Sub](https://cloud.google.com/pubsub).\n",
20 | "Learn about [Cloud Scheduler](https://cloud.google.com/scheduler)"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "id": "45edf109",
26 | "metadata": {},
27 | "source": [
28 | "## Setup"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "id": "d44dcc22",
34 | "metadata": {},
35 | "source": [
36 | "### Import libraries"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "id": "8fa8c2ff",
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import json\n",
47 | "import os\n",
48 | "import logging\n",
49 | "import tensorflow as tf\n",
50 | "import tfx\n",
51 | "import IPython \n",
52 | "\n",
53 | "logging.getLogger().setLevel(logging.INFO)\n",
54 | "\n",
55 | "print('TFX:', tfx.__version__)"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "id": "a82072dc",
61 | "metadata": {},
62 | "source": [
63 | "### Setup Google Cloud project"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "id": "04c5843a",
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n",
74 | "REGION = 'us-central1' # Change to your region.\n",
75 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n",
76 | "\n",
77 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n",
78 | " # Get your GCP project id from gcloud\n",
79 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
80 | " PROJECT_ID = shell_output[0]\n",
81 | " \n",
82 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n",
83 | " # Set your bucket name using your GCP project id\n",
84 | " BUCKET = PROJECT_ID\n",
85 | " # Try to create the bucket if it doesn'exists\n",
86 | " ! gsutil mb -l $REGION gs://$BUCKET\n",
87 | " print('')\n",
88 | "\n",
89 | "print('Project ID:', PROJECT_ID)\n",
90 | "print('Region:', REGION)\n",
91 | "print('Bucket name:', BUCKET)"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "id": "ae7570c7",
97 | "metadata": {},
98 | "source": [
99 | "### Set configurations"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "id": "99e362bb",
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "VERSION = 'v1'\n",
110 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
111 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n",
112 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n",
113 | "\n",
114 | "PIPELINES_STORE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/compiled_pipelines/'\n",
115 | "GCS_PIPELINE_FILE_LOCATION = os.path.join(PIPELINES_STORE, f'{PIPELINE_NAME}.json')\n",
116 | "PUBSUB_TOPIC = f'trigger-{PIPELINE_NAME}'\n",
117 | "CLOUD_FUNCTION_NAME = f'trigger-{PIPELINE_NAME}-fn'"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "id": "fc916c87",
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "!gsutil ls {GCS_PIPELINE_FILE_LOCATION}"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "id": "ed5321a4",
133 | "metadata": {},
134 | "source": [
135 | "## 1. Create a Pub/Sub topic"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "id": "1f36582f",
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "! gcloud pubsub topics create {PUBSUB_TOPIC}"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "id": "f090676b",
151 | "metadata": {},
152 | "source": [
153 | "## 2. Deploy the Cloud Function"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "id": "48858f15",
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "ENV_VARS=f'''\\\n",
164 | "PROJECT={PROJECT_ID},\\\n",
165 | "REGION={REGION},\\\n",
166 | "GCS_PIPELINE_FILE_LOCATION={GCS_PIPELINE_FILE_LOCATION}\n",
167 | "'''\n",
168 | "\n",
169 | "! echo {ENV_VARS}"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "id": "a78831f0",
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "! rm -r src/pipeline_triggering/.ipynb_checkpoints"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "id": "dfd65f0d",
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "! gcloud functions deploy {CLOUD_FUNCTION_NAME} \\\n",
190 | " --region={REGION} \\\n",
191 | " --trigger-topic={PUBSUB_TOPIC} \\\n",
192 | " --runtime=python37 \\\n",
193 | " --source=src/pipeline_triggering\\\n",
194 | " --entry-point=trigger_pipeline\\\n",
195 | " --stage-bucket={BUCKET}\\\n",
196 | " --update-env-vars={ENV_VARS}"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "id": "4f632321",
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "cloud_fn_url = f'https://console.cloud.google.com/functions/details/{REGION}/{CLOUD_FUNCTION_NAME}'\n",
207 | "html = f'See the Cloud Function details here.'\n",
208 | "IPython.display.display(IPython.display.HTML(html))"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "id": "9c00b9a6",
214 | "metadata": {},
215 | "source": [
216 | "## 3. Trigger the pipeline"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "id": "0cf3abbc",
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "from google.cloud import pubsub\n",
227 | "\n",
228 | "publish_client = pubsub.PublisherClient()\n",
229 | "topic = f'projects/{PROJECT_ID}/topics/{PUBSUB_TOPIC}'\n",
230 | "data = {\n",
231 | " 'num_epochs': 7,\n",
232 | " 'learning_rate': 0.0015,\n",
233 | " 'batch_size': 512,\n",
234 | " 'hidden_units': '256,126'\n",
235 | "}\n",
236 | "message = json.dumps(data)\n",
237 | "\n",
238 | "_ = publish_client.publish(topic, message.encode())"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "id": "c536f29d",
244 | "metadata": {},
245 | "source": [
246 | "Wait for a few seconds for the pipeline run to be submitted, then you can see the run in the Cloud Console"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "id": "887538b6",
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "from kfp.v2.google.client import AIPlatformClient\n",
257 | "\n",
258 | "pipeline_client = AIPlatformClient(\n",
259 | " project_id=PROJECT_ID, region=REGION)\n",
260 | " \n",
261 | "job_display_name = pipeline_client.list_jobs()['pipelineJobs'][0]['displayName']\n",
262 | "job_url = f'https://console.cloud.google.com/vertex-ai/locations/{REGION}/pipelines/runs/{job_display_name}'\n",
263 | "html = f'See the Pipeline job here.'\n",
264 | "IPython.display.display(IPython.display.HTML(html))"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "id": "159a66d2",
270 | "metadata": {},
271 | "source": [
272 | "## 4. Extracting pipeline runs metadata"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "id": "affe56fc",
279 | "metadata": {},
280 | "outputs": [],
281 | "source": [
282 | "from google.cloud import aiplatform as vertex_ai\n",
283 | "\n",
284 | "pipeline_df = vertex_ai.get_pipeline_df(PIPELINE_NAME)\n",
285 | "pipeline_df = pipeline_df[pipeline_df.pipeline_name == PIPELINE_NAME]\n",
286 | "pipeline_df.T"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "id": "04ba49ed",
293 | "metadata": {},
294 | "outputs": [],
295 | "source": []
296 | }
297 | ],
298 | "metadata": {
299 | "environment": {
300 | "name": "common-cpu.m73",
301 | "type": "gcloud",
302 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73"
303 | },
304 | "kernelspec": {
305 | "display_name": "Python 3",
306 | "language": "python",
307 | "name": "python3"
308 | },
309 | "language_info": {
310 | "codemirror_mode": {
311 | "name": "ipython",
312 | "version": 3
313 | },
314 | "file_extension": ".py",
315 | "mimetype": "text/x-python",
316 | "name": "python",
317 | "nbconvert_exporter": "python",
318 | "pygments_lexer": "ipython3",
319 | "version": "3.7.10"
320 | }
321 | },
322 | "nbformat": 4,
323 | "nbformat_minor": 5
324 | }
325 |
--------------------------------------------------------------------------------
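Note on 05-continuous-training.ipynb: the Cloud Function deployed above uses the `trigger_pipeline` entry point from `src/pipeline_triggering/main.py`, which is not included in this listing. Below is only a plausible sketch of such an entry point, assuming the same environment variables the notebook sets (`PROJECT`, `REGION`, `GCS_PIPELINE_FILE_LOCATION`) and a JSON payload of runtime parameter values in the Pub/Sub message; the real implementation may differ.

```python
# Sketch of a Pub/Sub-triggered Cloud Function entry point (python37 runtime).
# The actual code lives in src/pipeline_triggering/main.py and is not shown here.
import base64
import json
import os

from google.cloud import storage
from kfp.v2.google.client import AIPlatformClient


def trigger_pipeline(event, context):
    """Decodes the Pub/Sub message and submits a run of the compiled pipeline."""
    project = os.environ['PROJECT']
    region = os.environ['REGION']
    gcs_pipeline_file_location = os.environ['GCS_PIPELINE_FILE_LOCATION']

    # Download the compiled pipeline spec from Cloud Storage to a local file.
    bucket_name, blob_path = gcs_pipeline_file_location[len('gs://'):].split('/', 1)
    local_spec = '/tmp/pipeline_spec.json'
    storage.Client(project=project).bucket(bucket_name).blob(blob_path) \
        .download_to_filename(local_spec)

    # The notebook publishes a JSON payload of runtime parameter values.
    parameter_values = json.loads(base64.b64decode(event['data']).decode('utf-8'))

    client = AIPlatformClient(project_id=project, region=region)
    return client.create_run_from_job_spec(
        job_spec_path=local_spec,
        parameter_values=parameter_values,
    )
```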
/06-model-deployment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "e80f441f",
6 | "metadata": {},
7 | "source": [
8 | "# 06 - Model deployment\n",
9 | "\n",
10 | "The purpose of this notebook is to execute a CI/CD routine to test and deploy the trained `Vertex Model` resource to a `Vertex Endpoint` resource for online prediction serving. The notebook covers the following steps:\n",
11 | "\n",
12 | "1. Run the test steps locally.\n",
13 | "2. Execute the model deployment CI/CD steps using `Cloud Build`.\n",
14 | "\n"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "id": "0db03d3d",
20 | "metadata": {},
21 | "source": [
22 | "## Setup"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "id": "3cd5f896",
28 | "metadata": {},
29 | "source": [
30 | "### Import libraries"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "id": "c98cf8cb",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "import os\n",
41 | "import logging\n",
42 | "\n",
43 | "logging.getLogger().setLevel(logging.INFO)"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "faf0de35",
49 | "metadata": {},
50 | "source": [
51 | "### Setup Google Cloud project"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "8ab672e9",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n",
62 | "REGION = 'us-central1' # Change to your region.\n",
63 | "\n",
64 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n",
65 | " # Get your GCP project id from gcloud\n",
66 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
67 | " PROJECT_ID = shell_output[0]\n",
68 | "\n",
69 | "print('Project ID:', PROJECT_ID)\n",
70 | "print('Region:', REGION)"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "id": "2d1e359d",
76 | "metadata": {},
77 | "source": [
78 | "### Set configurations"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "id": "25a1e19b",
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "VERSION = 'v1'\n",
89 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
90 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n",
91 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n",
92 | "\n",
93 | "CICD_IMAGE_NAME = 'cicd:latest'\n",
94 | "CICD_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{CICD_IMAGE_NAME}'"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "id": "d27a65fd",
100 | "metadata": {},
101 | "source": [
102 | "## 1. Run CI/CD steps locally"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "id": "daffa85a",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "os.environ['PROJECT'] = PROJECT_ID\n",
113 | "os.environ['REGION'] = REGION\n",
114 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n",
115 | "os.environ['ENDPOINT_DISPLAY_NAME'] = ENDPOINT_DISPLAY_NAME"
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "id": "189dde56",
121 | "metadata": {},
122 | "source": [
123 | "### Run the model artifact testing"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "id": "20c8ce61",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "! py.test src/tests/model_deployment_tests.py::test_model_artifact -s"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "id": "195e7cde",
139 | "metadata": {},
140 | "source": [
141 | "### Run create endpoint"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "id": "bdaf0c28",
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "! python build/utils.py \\\n",
152 | " --mode=create-endpoint\\\n",
153 | " --project={PROJECT_ID}\\\n",
154 | " --region={REGION}\\\n",
155 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "id": "3e8022b5",
161 | "metadata": {},
162 | "source": [
163 | "### Run deploy model"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": null,
169 | "id": "bac7e8b3",
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "! python build/utils.py \\\n",
174 | " --mode=deploy-model\\\n",
175 | " --project={PROJECT_ID}\\\n",
176 | " --region={REGION}\\\n",
177 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}\\\n",
178 | " --model-display-name={MODEL_DISPLAY_NAME}"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "id": "937178d9",
184 | "metadata": {},
185 | "source": [
186 | "### Run model endpoint testing"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "id": "efa45c98",
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "# TODO {for Khalid, you need to update create an Endpoint resource when using a list. This is a known bug:}\n",
197 | "# AttributeError: 'Endpoint' object has no attribute '_prediction_client'\n",
198 | "! py.test src/tests/model_deployment_tests.py::test_model_endpoint"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "id": "fbe44e56",
204 | "metadata": {},
205 | "source": [
206 | "## 2. Execute the model deployment CI/CD routine in `Cloud Build`\n",
207 | "\n",
208 | "The CI/CD routine is defined in the [model-deployment.yaml](model-deployment.yaml) file, and consists of the following steps:\n",
209 | "\n",
210 | "1. Load and test the the trained model interface.\n",
211 | "2. Create a `Vertex Endpoint` resource if it does not exist.\n",
212 | "3. Deploy the `Vertex Model` resource to the `Vertex Endpoint` resource.\n",
213 | "4. Test the `Vertex Endpoint` resource."
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "id": "05ef2d3d",
219 | "metadata": {},
220 | "source": [
221 | "### Build CI/CD container Image for `Cloud Build`\n",
222 | "\n",
223 | "This is the runtime environment where the steps of testing and deploying the model will be executed."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "id": "59f00bcc",
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "! echo $CICD_IMAGE_URI"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "id": "76b7dae5",
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "! gcloud builds submit --tag $CICD_IMAGE_URI build/. --timeout=15m"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "id": "88d91bd7",
249 | "metadata": {},
250 | "source": [
251 | "### Run CI/CD from model deployment using `Cloud Build`"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "id": "6a8d9b05",
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "REPO_URL = 'https://github.com/ksalama/ucaip-labs.git' # Change to your github repo.\n",
262 | "BRANCH = 'main'"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "id": "ee76bd54",
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "SUBSTITUTIONS=f'''\\\n",
273 | "_REPO_URL='{REPO_URL}',\\\n",
274 | "_BRANCH={BRANCH},\\\n",
275 | "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n",
276 | "_PROJECT={PROJECT_ID},\\\n",
277 | "_REGION={REGION},\\\n",
278 | "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n",
279 | "_ENDPOINT_DISPLAY_NAME={ENDPOINT_DISPLAY_NAME},\\\n",
280 | "'''\n",
281 | "\n",
282 | "!echo $SUBSTITUTIONS"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "id": "3f59114c",
289 | "metadata": {},
290 | "outputs": [],
291 | "source": [
292 | "!gcloud builds submit --no-source --config build/model-deployment.yaml --substitutions {SUBSTITUTIONS} --timeout=30m"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": null,
298 | "id": "d62fc304",
299 | "metadata": {},
300 | "outputs": [],
301 | "source": []
302 | }
303 | ],
304 | "metadata": {
305 | "environment": {
306 | "name": "common-cpu.m73",
307 | "type": "gcloud",
308 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73"
309 | },
310 | "kernelspec": {
311 | "display_name": "Python 3",
312 | "language": "python",
313 | "name": "python3"
314 | },
315 | "language_info": {
316 | "codemirror_mode": {
317 | "name": "ipython",
318 | "version": 3
319 | },
320 | "file_extension": ".py",
321 | "mimetype": "text/x-python",
322 | "name": "python",
323 | "nbconvert_exporter": "python",
324 | "pygments_lexer": "ipython3",
325 | "version": "3.7.10"
326 | }
327 | },
328 | "nbformat": 4,
329 | "nbformat_minor": 5
330 | }
331 |
--------------------------------------------------------------------------------
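Note on 06-model-deployment.ipynb: the notebook drives endpoint creation and model deployment through `build/utils.py`, which is not shown in this listing. Below is a rough sketch of the two modes it is invoked with (`create-endpoint` and `deploy-model`), using only `google.cloud.aiplatform` calls that also appear in the other notebooks; the machine type and traffic split are assumptions, not values taken from the repository.

```python
# Sketch of the create-endpoint and deploy-model steps invoked via build/utils.py.
# The real build/utils.py is not shown here; machine_type is an assumption.
from google.cloud import aiplatform as vertex_ai


def create_endpoint(project, region, endpoint_display_name):
    vertex_ai.init(project=project, location=region)
    endpoints = vertex_ai.Endpoint.list(
        filter=f'display_name={endpoint_display_name}', order_by='update_time')
    if endpoints:
        return endpoints[-1]  # Reuse the most recently updated endpoint.
    return vertex_ai.Endpoint.create(display_name=endpoint_display_name)


def deploy_model(project, region, endpoint_display_name, model_display_name):
    vertex_ai.init(project=project, location=region)
    endpoint = vertex_ai.Endpoint.list(
        filter=f'display_name={endpoint_display_name}', order_by='update_time')[-1]
    model = vertex_ai.Model.list(
        filter=f'display_name={model_display_name}', order_by='update_time')[-1]
    endpoint.deploy(
        model=model,
        traffic_percentage=100,        # assumed: send all traffic to this model
        machine_type='n1-standard-2',  # assumed serving machine type
    )
    return endpoint
```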
/07-prediction-serving.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "afa25b6f",
6 | "metadata": {},
7 | "source": [
8 | "# 07 - Serving predictions\n",
9 | "\n",
10 | "The purpose of the notebook is to show how to use the deployed model for online and batch prediction.\n",
11 | "The notebook covers the following tasks:\n",
12 | "\n",
13 | "1. Test the `Endpoint` resource for online prediction.\n",
14 | "2. Use the custom model uploaded as a `Model` resource for batch prediciton.\n",
15 | "3. Run a the batch prediction pipeline using `Vertex Pipelines`."
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "id": "b2ff82c9",
21 | "metadata": {},
22 | "source": [
23 | "## Setup"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "c0d52e77",
29 | "metadata": {},
30 | "source": [
31 | "### Import libraries"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "id": "116a19cf",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "import os\n",
42 | "import time\n",
43 | "from datetime import datetime\n",
44 | "import tensorflow as tf\n",
45 | "\n",
46 | "from google.cloud import aiplatform as vertex_ai"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "id": "c5a33868",
52 | "metadata": {},
53 | "source": [
54 | "### Setup Google Cloud project"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "id": "3c2c4d1c",
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n",
65 | "REGION = 'us-central1' # Change to your region.\n",
66 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n",
67 | "\n",
68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n",
69 | " # Get your GCP project id from gcloud\n",
70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
71 | " PROJECT_ID = shell_output[0]\n",
72 | " \n",
73 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n",
74 | " # Set your bucket name using your GCP project id\n",
75 | " BUCKET = PROJECT_ID\n",
76 | " # Try to create the bucket if it doesn'exists\n",
77 | " ! gsutil mb -l $REGION gs://$BUCKET\n",
78 | " print('')\n",
79 | " \n",
80 | "print('Project ID:', PROJECT_ID)\n",
81 | "print('Region:', REGION)\n",
82 | "print('Bucket name:', BUCKET)"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "id": "29e1a653",
88 | "metadata": {},
89 | "source": [
90 | "### Set configurations"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "id": "0019b2dd",
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "VERSION = 'v1'\n",
101 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
102 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n",
103 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n",
104 | "\n",
105 | "SERVE_BQ_DATASET_NAME = 'playground_us' # Change to your serving BigQuery dataset name.\n",
106 | "SERVE_BQ_TABLE_NAME = 'chicago_taxitrips_prep' # Change to your serving BigQuery table name."
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "id": "385ed4c0",
112 | "metadata": {},
113 | "source": [
114 | "## 1. Making an online prediciton\n"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "id": "ac2520fc",
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "vertex_ai.init(\n",
125 | " project=PROJECT_ID,\n",
126 | " location=REGION,\n",
127 | " staging_bucket=BUCKET\n",
128 | ")\n",
129 | "\n",
130 | "endpoint_name = vertex_ai.Endpoint.list(\n",
131 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n",
132 | " order_by='update_time')[-1].gca_resource.name\n",
133 | "\n",
134 | "endpoint = vertex_ai.Endpoint(endpoint_name)"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "c5f4f8c8",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "test_instances = [ \n",
145 | " {\n",
146 | " 'dropoff_grid': ['POINT(-87.6 41.9)'],\n",
147 | " 'euclidean': [2064.2696],\n",
148 | " 'loc_cross': [''],\n",
149 | " 'payment_type': ['Credit Card'],\n",
150 | " 'pickup_grid': ['POINT(-87.6 41.9)'],\n",
151 | " 'trip_miles': [1.37],\n",
152 | " 'trip_day': [12],\n",
153 | " 'trip_hour': [16],\n",
154 | " 'trip_month': [2],\n",
155 | " 'trip_day_of_week': [4],\n",
156 | " 'trip_seconds': [555]\n",
157 | " }\n",
158 | "]"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "id": "6fe672df",
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "predictions = endpoint.predict(test_instances).predictions\n",
169 | "\n",
170 | "for prediction in predictions:\n",
171 | " print(prediction)"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "id": "077f4225",
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# TODO {for Khalid, get error saying model does not support explanations}\n",
182 | "\n",
183 | "explanations = endpoint.explain(test_instances).explanations\n",
184 | "\n",
185 | "for explanation in explanations:\n",
186 | " print(explanation)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "id": "b6140167",
192 | "metadata": {},
193 | "source": [
194 | "## 2. Make a batch prediction"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "id": "37928e74",
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "WORKSPACE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/'\n",
205 | "SERVING_DATA_DIR = os.path.join(WORKSPACE, 'serving_data')\n",
206 | "SERVING_INPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'input_data')\n",
207 | "SERVING_OUTPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'output_predictions')"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "id": "b83e0d39",
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "if tf.io.gfile.exists(SERVING_DATA_DIR):\n",
218 | " print('Removing previous serving data...')\n",
219 | " tf.io.gfile.rmtree(SERVING_DATA_DIR)\n",
220 | "print('Creating serving data directory...')\n",
221 | "tf.io.gfile.mkdir(SERVING_DATA_DIR)\n",
222 | "print('Serving data directory is ready.')"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "id": "163326ce",
228 | "metadata": {},
229 | "source": [
230 | "### Extract serving data to Cloud Storage as JSONL"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "id": "51bdefd3",
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "\n",
241 | "from src.model_training import features as feature_info\n",
242 | "from src.preprocessing import etl\n",
243 | "from src.common import datasource_utils"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "id": "e15508fb",
250 | "metadata": {},
251 | "outputs": [],
252 | "source": [
253 | "LIMIT = 10000\n",
254 | "\n",
255 | "sql_query = datasource_utils.create_bq_source_query(\n",
256 | " dataset_display_name=DATASET_DISPLAY_NAME, \n",
257 | " missing=feature_info.MISSING_VALUES,\n",
258 | " limit=LIMIT\n",
259 | ")\n",
260 | "\n",
261 | "print(sql_query)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "id": "95ba6d5f",
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "args = {\n",
272 | " #'runner': 'DataflowRunner',\n",
273 | " 'sql_query': sql_query,\n",
274 | " 'exported_data_prefix': os.path.join(SERVING_INPUT_DATA_DIR, 'data-'),\n",
275 | " 'temporary_dir': os.path.join(WORKSPACE, 'tmp'),\n",
276 | " 'gcs_location': os.path.join(WORKSPACE, 'bq_tmp'),\n",
277 | " 'project': PROJECT_ID,\n",
278 | " 'region': REGION,\n",
279 | " 'setup_file': './setup.py'\n",
280 | "}"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "id": "c5414f24",
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "tf.get_logger().setLevel('ERROR')\n",
291 | "\n",
292 | "print('Data extraction started...')\n",
293 | "etl.run_extract_pipeline(args)\n",
294 | "print('Data extraction completed.')"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "id": "7411f2dc",
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "! gsutil ls {SERVING_INPUT_DATA_DIR}"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "id": "1660a44e",
310 | "metadata": {},
311 | "source": [
312 | "### Submit the batch prediction job"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "id": "8878a244",
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "model_name = vertex_ai.Model.list(\n",
323 | " filter=f'display_name={MODEL_DISPLAY_NAME}',\n",
324 | " order_by='update_time')[-1].gca_resource.name"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "id": "4f262efa",
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "job_resources = {\n",
335 | " 'machine_type': 'n1-standard-2',\n",
336 | " #'accelerator_count': 1,\n",
337 | " #'accelerator_type': 'NVIDIA_TESLA_T4'\n",
338 | " 'starting_replica_count': 1,\n",
339 | " 'max_replica_coun': 10,\n",
340 | "}\n",
341 | "\n",
342 | "job_display_name = f'{MODEL_DISPLAY_NAME}-prediction-job-{datetime.now().strftime('%Y%m%d%H%M%S')}'\n",
343 | "\n",
344 | "vertex_ai.BatchPredictionJob.create(\n",
345 | " job_display_name=job_display_name,\n",
346 | " model_name=model_name,\n",
347 | " gcs_source=SERVING_INPUT_DATA_DIR + '/*.jsonl',\n",
348 | " gcs_destination_prefix=SERVING_OUTPUT_DATA_DIR,\n",
349 | " instances_format='jsonl',\n",
350 | " predictions_format='jsonl',\n",
351 | " sync=True,\n",
352 | " **job_resources,\n",
353 | ")"
354 | ]
355 | },
356 | {
357 | "cell_type": "markdown",
358 | "id": "6d638b6f",
359 | "metadata": {},
360 | "source": [
361 | "## 3. Run the batch prediction pipeline using `Vertex Pipelines`"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "id": "ee5be402",
368 | "metadata": {},
369 | "outputs": [],
370 | "source": [
371 | "WORKSPACE = f'{BUCKET}/{DATASET_DISPLAY_NAME}/'\n",
372 | "MLMD_SQLLITE = 'mlmd.sqllite'\n",
373 | "ARTIFACT_STORE = os.path.join(WORKSPACE, 'tfx_artifacts')\n",
374 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-predict-pipeline'"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "id": "f9b84c1e",
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "os.environ['PROJECT'] = PROJECT_ID\n",
385 | "os.environ['REGION'] = REGION\n",
386 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n",
387 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n",
388 | "os.environ['ARTIFACT_STORE_URI'] = ARTIFACT_STORE\n",
389 | "os.environ['BATCH_PREDICTION_BQ_DATASET_NAME'] = SERVE_BQ_DATASET_NAME\n",
390 | "os.environ['BATCH_PREDICTION_BQ_TABLE_NAME'] = SERVE_BQ_TABLE_NAME\n",
391 | "os.environ['SERVE_LIMIT'] = '1000'\n",
392 | "os.environ['BEAM_RUNNER'] = 'DirectRunner'\n",
393 | "os.environ['TFX_IMAGE_URI'] = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "id": "58681dfe",
400 | "metadata": {},
401 | "outputs": [],
402 | "source": [
403 | "import importlib\n",
404 | "from src.tfx_pipelines import config\n",
405 | "importlib.reload(config)\n",
406 | "\n",
407 | "for key, value in config.__dict__.items():\n",
408 | " if key.isupper(): print(f'{key}: {value}')"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": null,
414 | "id": "d06a4091",
415 | "metadata": {},
416 | "outputs": [],
417 | "source": [
418 | "from src.tfx_pipelines import runner\n",
419 | "\n",
420 | "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n",
421 | "pipeline_definition = runner.compile_prediction_pipeline(pipeline_definition_file)"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": null,
427 | "id": "b6ffceca",
428 | "metadata": {},
429 | "outputs": [],
430 | "source": [
431 | "from kfp.v2.google.client import AIPlatformClient\n",
432 | "\n",
433 | "pipeline_client = AIPlatformClient(\n",
434 | " project_id=PROJECT_ID, region=REGION)\n",
435 | " \n",
436 | "pipeline_client.create_run_from_job_spec(\n",
437 | " job_spec_path=pipeline_definition_file\n",
438 | ")"
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "execution_count": null,
444 | "id": "dd2efb1b",
445 | "metadata": {},
446 | "outputs": [],
447 | "source": []
448 | }
449 | ],
450 | "metadata": {
451 | "environment": {
452 | "name": "common-cpu.m73",
453 | "type": "gcloud",
454 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73"
455 | },
456 | "kernelspec": {
457 | "display_name": "Python 3",
458 | "language": "python",
459 | "name": "python3"
460 | },
461 | "language_info": {
462 | "codemirror_mode": {
463 | "name": "ipython",
464 | "version": 3
465 | },
466 | "file_extension": ".py",
467 | "mimetype": "text/x-python",
468 | "name": "python",
469 | "nbconvert_exporter": "python",
470 | "pygments_lexer": "ipython3",
471 | "version": "3.7.10"
472 | }
473 | },
474 | "nbformat": 4,
475 | "nbformat_minor": 5
476 | }
477 |
--------------------------------------------------------------------------------
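Note on 07-prediction-serving.ipynb: once the batch prediction job above completes, its JSONL results land under `SERVING_OUTPUT_DATA_DIR`. The small sketch below shows one way to inspect them; the `prediction.results-*` file naming and the per-job subfolder are assumptions about the Vertex AI output layout, not something shown in the notebook.

```python
# Sketch: peek at the batch prediction output written to SERVING_OUTPUT_DATA_DIR.
# SERVING_OUTPUT_DATA_DIR is assumed to be defined as in the notebook above; the
# prediction-*/prediction.results-* layout is an assumption about the job output.
import json
import tensorflow as tf

output_files = tf.io.gfile.glob(SERVING_OUTPUT_DATA_DIR + '/*/prediction.results-*')

for output_file in output_files[:1]:
    with tf.io.gfile.GFile(output_file, 'r') as f:
        for line in list(f)[:5]:
            record = json.loads(line)
            # Each JSONL record carries the original instance and the model output.
            print(record.get('instance'), '->', record.get('prediction'))
```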
/08-model-monitoring.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "39366395",
6 | "metadata": {},
7 | "source": [
8 | "# 08 - Model monitoring\n",
9 | "\n",
10 | "This notebook covers configuring model monitoring jobs for skew and drift detection:\n",
11 | "\n",
12 | "1. Set skew and drift threshold.\n",
13 | "2. Create a monitoring job for all the models on a `Endpoint` resource.\n",
14 | "3. List the monitoring jobs.\n",
15 | "4. List artifacts produced by monitoring job.\n",
16 | "5. Pause and delete the monitoring job."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "id": "d7e55542",
22 | "metadata": {},
23 | "source": [
24 | "## Setup"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "id": "35292bad",
30 | "metadata": {},
31 | "source": [
32 | "### Import libraries"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "41ba6e75",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import copy\n",
43 | "from datetime import datetime\n",
44 | "\n",
45 | "from google.protobuf.duration_pb2 import Duration\n",
46 | "from google.cloud import aiplatform as vertex_ai\n",
47 | "from google.cloud import aiplatform_v1beta1 as vertex_ai_beta"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "id": "5279e949",
53 | "metadata": {},
54 | "source": [
55 | "### Setup Google Cloud project"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "id": "272491a9",
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n",
66 | "REGION = 'us-central1' # Change to your region.\n",
67 | "\n",
68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n",
69 | " # Get your GCP project id from gcloud\n",
70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n",
71 | " PROJECT_ID = shell_output[0]\n",
72 | "\n",
73 | "PARENT = f'projects/{PROJECT_ID}/locations/{REGION}'\n",
74 | "\n",
75 | "print('Project ID:', PROJECT_ID)\n",
76 | "print('Region:', REGION)\n",
77 | "print('Vertex API Parent URI:', PARENT)"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "id": "513388ee",
83 | "metadata": {},
84 | "source": [
85 | "### Set configurations"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "id": "fb651770",
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n",
96 | "ENDPOINT_DISPLAY_NAME = 'chicago-taxi-tips-classifier'\n",
97 | "MONITORING_JOB_NAME = f'monitor-{ENDPOINT_DISPLAY_NAME}'\n",
98 | "NOTIFY_EMAILS = '[your-email-address]'\n",
99 | "\n",
100 | "LOG_SAMPLE_RATE = 0.8\n",
101 | "MONITOR_INTERVAL = 3600\n",
102 | "TARGET_FEATURE_NAME = 'tip_bin'"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "id": "ac7cb17f",
108 | "metadata": {},
109 | "source": [
110 | "## Create a Job Service client"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "id": "bb896762",
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "job_client_beta = vertex_ai_beta.JobServiceClient(\n",
121 | " client_options={'api_endpoint': f'{REGION}-aiplatform.googleapis.com'}\n",
122 | ")"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "id": "63bcde67",
128 | "metadata": {},
129 | "source": [
130 | "## 1. Set the skew and drift thresholds"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "id": "3252edaa",
137 | "metadata": {},
138 | "outputs": [],
139 | "source": [
140 | "SKEW_THRESHOLDS = {\n",
141 | " 'trip_month': 0.3,\n",
142 | " 'trip_day': 0.3,\n",
143 | " 'trip_day_of_week': 0.3,\n",
144 | " 'trip_hour': 0.3,\n",
145 | " 'trip_seconds': 0.3,\n",
146 | " 'trip_miles': 0.3,\n",
147 | " 'payment_type': 0.3,\n",
148 | " 'pickup_grid': 0.3,\n",
149 | " 'dropoff_grid': 0.3,\n",
150 | " 'euclidean': 0.3,\n",
151 | " 'loc_cross': 0.3, \n",
152 | "}\n",
153 | "\n",
154 | "DIRFT_THRESHOLDS = {\n",
155 | " 'trip_month': 0.3,\n",
156 | " 'trip_day': 0.3,\n",
157 | " 'trip_day_of_week': 0.3,\n",
158 | " 'trip_hour': 0.3,\n",
159 | " 'trip_seconds': 0.3,\n",
160 | " 'trip_miles': 0.3,\n",
161 | " 'payment_type': 0.3,\n",
162 | " 'pickup_grid': 0.3,\n",
163 | " 'dropoff_grid': 0.3,\n",
164 | " 'euclidean': 0.3,\n",
165 | " 'loc_cross': 0.3, \n",
166 | "}"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "id": "adc333a3",
172 | "metadata": {},
173 | "source": [
174 | "## 2. Create a monitoring job"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "id": "a40d14cb",
180 | "metadata": {},
181 | "source": [
182 | "### Retrieve the `Dataset`, `Model` and `Endpoint` resources to monitor"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "id": "ed60fbff",
189 | "metadata": {},
190 | "outputs": [],
191 | "source": [
192 | "dataset = vertex_ai.TabularDataset.list(\n",
193 | " filter=f'display_name={DATASET_DISPLAY_NAME}', \n",
194 | " order_by='update_time')[-1]\n",
195 | "\n",
196 | "bq_source_uri = dataset.gca_resource.metadata['inputConfig']['bigquerySource']['uri']\n",
197 | " \n",
198 | "endpoint = vertex_ai.Endpoint.list(\n",
199 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n",
200 | " order_by='update_time')[-1]\n",
201 | "\n",
202 | "endpoint_uri = endpoint.gca_resource.name\n",
203 | "\n",
204 | "model_ids = [model.id for model in endpoint.list_models()]"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "id": "0b159368",
210 | "metadata": {},
211 | "source": [
212 | "### Configure the monitoring job"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "id": "0370cb58",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "skew_thresholds = {\n",
223 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n",
224 | " for feature, value in SKEW_THRESHOLDS.items()\n",
225 | "}\n",
226 | "\n",
227 | "drift_thresholds = {\n",
228 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n",
229 | " for feature, value in DIRFT_THRESHOLDS.items()\n",
230 | "}\n",
231 | "\n",
232 | "skew_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(\n",
233 | " skew_thresholds=skew_thresholds\n",
234 | ")\n",
235 | "\n",
236 | "drift_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(\n",
237 | " drift_thresholds=drift_thresholds\n",
238 | ")\n",
239 | "\n",
240 | "sampling_config = vertex_ai_beta.SamplingStrategy(\n",
241 | " random_sample_config=vertex_ai_beta.SamplingStrategy.RandomSampleConfig(\n",
242 | " sample_rate=LOG_SAMPLE_RATE\n",
243 | " )\n",
244 | ")\n",
245 | "\n",
246 | "schedule_config = vertex_ai_beta.ModelDeploymentMonitoringScheduleConfig(\n",
247 | " monitor_interval=Duration(seconds=MONITOR_INTERVAL)\n",
248 | ")\n",
249 | "\n",
250 | "training_dataset = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingDataset(\n",
251 | " target_field=TARGET_FEATURE_NAME,\n",
252 | " bigquery_source = vertex_ai_beta.types.io.BigQuerySource(\n",
253 | " input_uri=bq_source_uri\n",
254 | " )\n",
255 | ")\n",
256 | "\n",
257 | "\n",
258 | "objective_template = vertex_ai_beta.ModelDeploymentMonitoringObjectiveConfig(\n",
259 | " objective_config=vertex_ai_beta.ModelMonitoringObjectiveConfig(\n",
260 | " training_dataset=training_dataset,\n",
261 | " training_prediction_skew_detection_config=skew_config,\n",
262 | " prediction_drift_detection_config=drift_config,\n",
263 | " )\n",
264 | ")\n",
265 | "\n",
266 | "deployment_objective_configs = []\n",
267 | "for model_id in model_ids:\n",
268 | " objective_config = copy.deepcopy(objective_template)\n",
269 | " objective_config.deployed_model_id = model_id\n",
270 | " deployment_objective_configs.append(objective_config)\n",
271 | "\n",
272 | "alerting_config = vertex_ai_beta.ModelMonitoringAlertConfig(\n",
273 | " email_alert_config=vertex_ai_beta.ModelMonitoringAlertConfig.EmailAlertConfig(\n",
274 | " user_emails=NOTIFY_EMAILS\n",
275 | " )\n",
276 | ")\n"
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "id": "f4b667db",
282 | "metadata": {},
283 | "source": [
284 | "### Instantiate a monitoring job"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "id": "5e4e0c9d",
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "job = vertex_ai_beta.ModelDeploymentMonitoringJob(\n",
295 | " display_name=MONITORING_JOB_NAME,\n",
296 | " endpoint=endpoint_uri,\n",
297 | " model_deployment_monitoring_objective_configs=deployment_objective_configs,\n",
298 | " logging_sampling_strategy=sampling_config,\n",
299 | " model_deployment_monitoring_schedule_config=schedule_config,\n",
300 | " model_monitoring_alert_config=alerting_config,\n",
301 | ")"
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "id": "4a66d1d5",
307 | "metadata": {},
308 | "source": [
309 | "### Submit the monitoring job for execution"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": null,
315 | "id": "4a0e41b9",
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "response = job_client_beta.create_model_deployment_monitoring_job(\n",
320 | " parent=PARENT, model_deployment_monitoring_job=job\n",
321 | ")\n",
322 | "response"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "id": "39352387",
328 | "metadata": {},
329 | "source": [
330 | "## 3. Get the monitoring job"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "id": "bc47ef29",
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "monitoring_jobs = job_client_beta.list_model_deployment_monitoring_jobs(parent=PARENT)\n",
341 | "monitoring_job = [entry for entry in monitoring_jobs if entry.display_name == MONITORING_JOB_NAME][0]\n",
342 | "monitoring_job"
343 | ]
344 | },
345 | {
346 | "cell_type": "markdown",
347 | "id": "9dbb50ce",
348 | "metadata": {},
349 | "source": [
350 | "## 5. Pause the monitoring job"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "id": "cd6d295e",
357 | "metadata": {},
358 | "outputs": [],
359 | "source": [
360 | "job_client_beta.pause_model_deployment_monitoring_job(name=monitoring_job)"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "id": "37663c7e",
366 | "metadata": {},
367 | "source": [
368 | "## Delete the monitoring job"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "id": "c3be1189",
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "job_client_beta.delete_model_deployment_monitoring_job(name=monitoring_job)"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": null,
384 | "id": "28159818",
385 | "metadata": {},
386 | "outputs": [],
387 | "source": []
388 | }
389 | ],
390 | "metadata": {
391 | "environment": {
392 | "name": "common-cpu.m73",
393 | "type": "gcloud",
394 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73"
395 | },
396 | "kernelspec": {
397 | "display_name": "Python 3",
398 | "language": "python",
399 | "name": "python3"
400 | },
401 | "language_info": {
402 | "codemirror_mode": {
403 | "name": "ipython",
404 | "version": 3
405 | },
406 | "file_extension": ".py",
407 | "mimetype": "text/x-python",
408 | "name": "python",
409 | "nbconvert_exporter": "python",
410 | "pygments_lexer": "ipython3",
411 | "version": "3.7.10"
412 | }
413 | },
414 | "nbformat": 4,
415 | "nbformat_minor": 5
416 | }
417 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM gcr.io/tfx-oss-public/tfx:0.30.0
2 |
3 | COPY requirements.txt requirements.txt
4 |
5 | RUN pip install -r requirements.txt
6 |
7 | COPY src/ src/
8 |
9 | ENV PYTHONPATH="/pipeline:${PYTHONPATH}"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PLEASE USE THIS REPO INSTEAD: https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai
2 |
3 |
4 | # MLOps on Vertex AI
5 |
6 | This example implements the end-to-end [MLOps process](https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf) using the [Vertex AI](https://cloud.google.com/vertex-ai) platform and [Smart Analytics](https://cloud.google.com/solutions/smart-analytics) technology capabilities. The example uses [Keras](https://keras.io/) to implement the ML model, [TFX](https://www.tensorflow.org/tfx) to implement the training pipeline, and the [Model Builder SDK](https://github.com/googleapis/python-aiplatform/tree/569d4cd03e888fde0171f7b0060695a14f99b072/google/cloud/aiplatform) to interact with Vertex AI.
7 |
8 |
9 |
10 |
11 |
12 | ## Getting started
13 |
14 | 1. [Setting up MLOps environment](provision) on Google Cloud.
15 | 2. Start your AI Notebook instance.
16 | 3. Open JupyterLab, then open a new Terminal.
17 | 4. Clone the repository to your AI Notebook instance:
18 | ```
19 | git clone https://github.com/ksalama/ucaip-labs.git
20 | cd ucaip-labs
21 | ```
22 | 5. Install the required Python packages:
23 | ```
24 | pip install tfx==0.30.0 --user
25 | pip install -r requirements.txt --user
26 | ```
27 | 6. Upgrade the `gcloud` components:
28 | ```
29 | sudo apt-get install google-cloud-sdk
30 | gcloud components update
31 | ```
32 |
33 | ## Dataset Management
34 |
35 | The [Chicago Taxi Trips](https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips) dataset is one of the [public datasets hosted with BigQuery](https://cloud.google.com/bigquery/public-data/), which includes taxi trips from 2013 to the present, reported to the City of Chicago in its role as a regulatory agency. The task is to predict whether a given trip will result in a tip > 20%.
36 |
37 | The [01-dataset-management](01-dataset-management.ipynb) notebook covers:
38 |
39 | 1. Performing exploratory data analysis on the data in `BigQuery`.
40 | 2. Creating `Vertex AI` Dataset resource using the Python SDK.
41 | 3. Generating the schema for the raw data using [TensorFlow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv).
42 |
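If you want to create the managed dataset outside the notebook, the following is a minimal sketch using the Vertex AI SDK. It is not the notebook's exact code; the BigQuery source URI below is a placeholder you would replace with your own prepared table.

```python
from google.cloud import aiplatform as vertex_ai

# Placeholders -- replace with your own project, region, and BigQuery table.
PROJECT_ID = '[your-project-id]'
REGION = 'us-central1'
BQ_SOURCE_URI = 'bq://[your-project-id].[your-bq-dataset].[your-prepared-table]'

vertex_ai.init(project=PROJECT_ID, location=REGION)

# Register the BigQuery table as a managed Vertex AI Tabular Dataset.
dataset = vertex_ai.TabularDataset.create(
    display_name='chicago-taxi-tips',
    bq_source=BQ_SOURCE_URI,
)
print(dataset.resource_name)
```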
43 |
44 | ## ML Development
45 |
46 | We experiment with creating a [Custom Model](https://cloud.google.com/ai-platform-unified/docs/training/create-model-custom-training) using the [02-experimentation](02-experimentation.ipynb) notebook, which covers:
47 |
48 | 1. Preparing the data using `Dataflow`.
49 | 2. Implementing a `Keras` classification model.
50 | 3. Training the `Keras` model with `Vertex AI` using a [pre-built container](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers).
51 | 4. Uploading the exported model from `Cloud Storage` to `Vertex AI`.
52 | 5. Extracting and visualizing experiment parameters from [Vertex AI Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction).
53 |
54 | We use [Vertex TensorBoard](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview)
55 | and [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction) to track, visualize, and compare ML experiments.
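
As an illustration of how experiment tracking with the SDK looks, here is a minimal, hypothetical sketch (the experiment name, run name, and all parameter and metric values below are placeholders, not results from the notebook):

```python
from google.cloud import aiplatform as vertex_ai

# Initialize the SDK against a (hypothetical) experiment.
vertex_ai.init(
    project='[your-project-id]',
    location='us-central1',
    experiment='chicago-taxi-tips-experiment',  # placeholder experiment name
)

# Start a run and log the hyperparameters used for this trial (example values).
vertex_ai.start_run('run-001')
vertex_ai.log_params({'learning_rate': 0.001, 'hidden_units': '64,32', 'batch_size': 512})

# ... train and evaluate the Keras model here ...

# Log the resulting evaluation metrics (example values).
vertex_ai.log_metrics({'val_accuracy': 0.87, 'val_loss': 0.31})
```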
56 |
57 | In addition, the training steps are formalized by implementing a [TFX pipeline](https://www.tensorflow.org/tfx).
58 | The [03-training-formalization](03-training-formalization.ipynb) notebook covers implementing and testing the pipeline components interactively.
59 |
60 | ## Training Operationalization
61 |
62 | The [04-pipeline-deployment](04-pipeline-deployment.ipynb) notebook covers executing the CI/CD steps for the training pipeline deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [build/pipeline-deployment.yaml](build/pipeline-deployment.yaml) file, and consists of the following steps:
63 |
64 | 1. Clone the repository to the build environment.
65 | 2. Run unit tests.
66 | 3. Run a local e2e test of the `TFX` pipeline.
67 | 4. Build the ML container image for pipeline steps.
68 | 5. Compile the pipeline.
69 | 6. Upload the pipeline to `Cloud Storage`.
70 |
71 | ## Continuous Training
72 |
73 | After the pipeline definition is tested, compiled, and uploaded to `Cloud Storage`, the pipeline is executed in response to a trigger.
74 | We use [Cloud Functions](https://cloud.google.com/functions) and [Cloud Pub/Sub](https://cloud.google.com/pubsub) as a triggering mechanism.
75 | The `Cloud Function` listens to the `Pub/Sub` topic and runs the training pipeline when a message is published to the topic.
76 | The `Cloud Function` is implemented in [src/pipeline_triggering](src/pipeline_triggering).
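
For orientation, the sketch below shows the general shape of such a Pub/Sub-triggered Cloud Function. It is a simplified illustration rather than the exact code in [src/pipeline_triggering](src/pipeline_triggering); the environment variable names and GCS locations are assumptions, and it uses the `AIPlatformClient` from `kfp` to submit the compiled pipeline job:

```python
import os
import json
import base64

from kfp.v2.google.client import AIPlatformClient

# Assumed environment variables configured on the Cloud Function.
PROJECT_ID = os.environ['PROJECT']
REGION = os.environ['REGION']
GCS_PIPELINE_FILE = os.environ['GCS_PIPELINE_FILE']  # e.g. the compiled <pipeline-name>.json in the pipelines store
PIPELINE_ROOT = os.environ['PIPELINE_ROOT']          # GCS location for pipeline artifacts


def trigger_pipeline(event, context):
    """Background Cloud Function triggered by a Pub/Sub message."""
    # The Pub/Sub payload optionally carries pipeline parameter overrides as JSON.
    data = base64.b64decode(event['data']).decode('utf-8') if 'data' in event else '{}'
    parameter_values = json.loads(data)

    client = AIPlatformClient(project_id=PROJECT_ID, region=REGION)
    return client.create_run_from_job_spec(
        GCS_PIPELINE_FILE,
        pipeline_root=PIPELINE_ROOT,
        parameter_values=parameter_values,
    )
```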
77 |
78 | The [05-continuous-training](05-continuous-training.ipynb) notebook covers:
79 |
80 | 1. Creating a Cloud `Pub/Sub` topic.
81 | 2. Deploying a `Cloud Function`.
82 | 3. Triggering the pipeline.
83 |
84 | The end-to-end TFX training pipeline implementation is in the [src/tfx_pipelines](src/tfx_pipelines) directory, which covers the following steps:
85 |
86 | 1. Receive hyperparameters using the `hyperparam_gen` custom Python component.
87 | 2. Extract data from `BigQuery` using the `BigQueryExampleGen` component.
88 | 3. Validate the raw data using the `StatisticsGen` and `ExampleValidator` components.
89 | 4. Process the data on `Dataflow` using the `Transform` component.
90 | 5. Train a custom model with `Vertex AI` using the `Trainer` component.
91 | 6. Evaluate and validate the custom model using the `ModelEvaluator` component.
92 | 7. Save the blessed model to the model registry location in `Cloud Storage` using the `Pusher` component.
93 | 8. Upload the model to `Vertex AI` using the `vertex_model_pusher` custom Python component.
94 |
95 |
96 | ## Model Deployment
97 |
98 | The [06-model-deployment](06-model-deployment.ipynb) notebook covers executing the CI/CD steps for the model deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [build/model-deployment.yaml](build/model-deployment.yaml)
99 | file, and consists of the following steps:
100 |
101 | 1. Test the model interface.
102 | 2. Create an endpoint in `Vertex AI`.
103 | 3. Deploy the model to the `endpoint`.
104 | 4. Test the `Vertex AI` endpoint.
105 |
106 | ## Prediction Serving
107 |
108 | We serve the deployed model for prediction.
109 | The [07-prediction-serving](07-prediction-serving.ipynb) notebook covers:
110 |
111 | 1. Use the `Vertex AI` endpoint for online prediction (see the sketch below).
112 | 2. Use the `Vertex AI` uploaded model for batch prediction.
113 | 3. Run the batch prediction using `Vertex Pipelines`.
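
The following is a minimal sketch of the online prediction call, assuming the endpoint display name used elsewhere in this example; the instance values are illustrative placeholders, and the exact instance format depends on the model's serving signature:

```python
from google.cloud import aiplatform as vertex_ai

vertex_ai.init(project='[your-project-id]', location='us-central1')

# Look up the most recently updated endpoint with the expected display name.
endpoint = vertex_ai.Endpoint.list(
    filter='display_name=chicago-taxi-tips-classifier',
    order_by='update_time')[-1]

# One illustrative instance; keys mirror the feature names used in the pipeline.
instance = {
    'trip_month': 5, 'trip_day': 10, 'trip_day_of_week': 3, 'trip_hour': 14,
    'trip_seconds': 900, 'trip_miles': 3.2, 'payment_type': 'Credit Card',
    'pickup_grid': 'POINT(-87.6 41.9)', 'dropoff_grid': 'POINT(-87.6 41.9)',
    'euclidean': 1500.0, 'loc_cross': 'NA',
}

prediction = endpoint.predict(instances=[instance])
print(prediction.predictions)
```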
114 |
115 | ## Model Monitoring
116 |
117 | After a model is deployed for prediction serving, continuous monitoring is set up to ensure that the model continues to perform as expected.
118 | The [08-model-monitoring](08-model-monitoring.ipynb) notebook covers configuring [Vertex AI Model Monitoring](https://cloud.google.com/vertex-ai/docs/model-monitoring/overview) for skew and drift detection:
119 |
120 | 1. Set the skew and drift thresholds.
121 | 2. Create a monitoring job for all the models under an endpoint.
122 | 3. List the monitoring jobs.
123 | 4. List the artifacts produced by the monitoring job.
124 | 5. Pause and delete the monitoring job.
125 |
126 |
127 | ## Metadata Tracking
128 |
129 | You can view the parameters and metrics logged by your experiments, as well as the artifacts and metadata stored by
130 | your `Vertex Pipelines` in [Cloud Console](https://console.cloud.google.com/vertex-ai/metadata).
131 |
132 | ## Disclaimer
133 |
134 | This is not an official Google product but sample code provided for educational purposes.
135 |
136 | ---
137 |
138 | Copyright 2021 Google LLC.
139 |
140 | Licensed under the Apache License, Version 2.0 (the "License");
141 | you may not use this file except in compliance with the License.
142 | You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0
143 |
144 | Unless required by applicable law or agreed to in writing, software
145 | distributed under the License is distributed on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
147 | See the License for the specific language governing permissions and
148 | limitations under the License.
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/build/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM gcr.io/tfx-oss-public/tfx:0.30.0
2 |
3 | RUN pip install -U pip
4 | RUN pip install google-cloud-aiplatform==1.1.1 google-cloud-aiplatform[tensorboard]
5 | RUN pip install pytest kfp==1.6.2 google-cloud-bigquery==2.20.0 google-cloud-bigquery-storage==2.4.0
--------------------------------------------------------------------------------
/build/model-deployment.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ######################################################################
16 | # CI/CD steps for Cloud Build to test and deploy a model to Vertex AI.
17 | ######################################################################
18 |
19 | steps:
20 |
21 | # Clone the repository.
22 | - name: 'gcr.io/cloud-builders/git'
23 | args: ['clone', '--single-branch', '--branch',
24 | '$_BRANCH', '$_REPO_URL',
25 | '--depth', '1',
26 | '--verbose']
27 | id: 'Clone Repository'
28 |
29 | # Test uploaded model artifact.
30 | - name: '$_CICD_IMAGE_URI'
31 | entrypoint: 'pytest'
32 | args: ['src/tests/model_deployment_tests.py::test_model_artifact']
33 | dir: 'ucaip-labs'
34 | env:
35 | - 'PROJECT=$_PROJECT'
36 | - 'REGION=$_REGION'
37 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME'
38 | id: 'Test Model Artifact'
39 | waitFor: ['Clone Repository']
40 |
41 | # Create an endpoint.
42 | - name: '$_CICD_IMAGE_URI'
43 | entrypoint: 'python'
44 | args: ['build/utils.py',
45 | '--mode', 'create-endpoint',
46 | '--project', '$_PROJECT',
47 | '--region', '$_REGION',
48 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME']
49 | dir: 'ucaip-labs'
50 | id: 'Create Endpoint'
51 | waitFor: ['Test Model Artifact']
52 |
53 | # Deploy the model.
54 | - name: '$_CICD_IMAGE_URI'
55 | entrypoint: 'python'
56 | args: ['build/utils.py',
57 | '--mode', 'deploy-model',
58 | '--project', '$_PROJECT',
59 | '--region', '$_REGION',
60 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME',
61 | '--model-display-name', '$_MODEL_DISPLAY_NAME'
62 | ]
63 | dir: 'ucaip-labs'
64 | id: 'Deploy Model'
65 | waitFor: ['Create Endpoint']
66 |
67 | # Test deployed model endpoint.
68 | - name: '$_CICD_IMAGE_URI'
69 | entrypoint: 'pytest'
70 | args: ['src/tests/model_deployment_tests.py::test_model_endpoint']
71 | dir: 'ucaip-labs'
72 | env:
73 | - 'PROJECT=$_PROJECT'
74 | - 'REGION=$_REGION'
75 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME'
76 | - 'ENDPOINT_DISPLAY_NAME=$_ENDPOINT_DISPLAY_NAME'
77 | id: 'Test Model Endpoint'
78 | waitFor: ['Deploy Model']
--------------------------------------------------------------------------------
/build/pipeline-deployment.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | #############################################################################
16 | # CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI.
17 | #############################################################################
18 |
19 | steps:
20 |
21 | # Clone the repository.
22 | - name: 'gcr.io/cloud-builders/git'
23 | args: ['clone', '--single-branch', '--branch',
24 | '$_BRANCH', '$_REPO_URL',
25 | '--depth', '1',
26 | '--verbose']
27 | id: 'Clone Repository'
28 |
29 |
30 | # Run datasource_utils unit tests.
31 | - name: '$_CICD_IMAGE_URI'
32 | entrypoint: 'pytest'
33 | args: ['src/tests/datasource_utils_tests.py', '-s']
34 | dir: 'ucaip-labs'
35 | env:
36 | - 'PROJECT_ID=$_PROJECT_ID'
37 | - 'BQ_LOCATION=$_BQ_LOCATION'
38 | - 'BQ_DATASET_NAME=$_BQ_DATASET_NAME'
39 | - 'BQ_TABLE_NAME=$_BQ_TABLE_NAME'
40 | id: 'Unit Test Datasource Utils'
41 | waitFor: ['Clone Repository']
42 |
43 |
44 | # Run model unit tests.
45 | - name: '$_CICD_IMAGE_URI'
46 | entrypoint: 'pytest'
47 | args: ['src/tests/model_tests.py', '-s']
48 | dir: 'ucaip-labs'
49 | id: 'Unit Test Model'
50 | waitFor: ['Clone Repository']
51 | timeout: 1800s
52 |
53 |
54 | # Test e2e pipeline using local runner.
55 | - name: '$_CICD_IMAGE_URI'
56 | entrypoint: 'pytest'
57 | args: ['src/tests/pipeline_deployment_tests.py::test_e2e_pipeline', '-s']
58 | dir: 'ucaip-labs'
59 | env:
60 | - 'PROJECT_ID=$_PROJECT_ID'
61 | - 'REGION=$_REGION'
62 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME'
63 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME'
64 | - 'GCS_LOCATION=$_TEST_GCS_LOCATION'
65 | - 'TRAIN_LIMIT=$_CI_TRAIN_LIMIT'
66 | - 'TEST_LIMIT=$_CI_TEST_LIMIT'
67 | - 'UPLOAD_MODEL=$_CI_UPLOAD_MODEL'
68 | - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD'
69 | id: 'Local Test E2E Pipeline'
70 | waitFor: ['Unit Test Datasource Utils', 'Unit Test Model']
71 | timeout: 1800s
72 |
73 |
74 | # Build the image that encapsulates the pipeline.
75 | - name: 'gcr.io/cloud-builders/docker'
76 | args: ['build', '-t', '$_TFX_IMAGE_URI', '.']
77 | dir: 'ucaip-labs'
78 | id: 'Build TFX Image'
79 | waitFor: ['Local Test E2E Pipeline']
80 |
81 |
82 | # Compile the pipeline.
83 | - name: '$_CICD_IMAGE_URI'
84 | entrypoint: 'python'
85 | args: ['build/utils.py',
86 | '--mode', 'compile-pipeline',
87 | '--pipeline-name', '$_PIPELINE_NAME'
88 | ]
89 | dir: 'ucaip-labs'
90 | env:
91 | - 'PROJECT_ID=$_PROJECT_ID'
92 | - 'REGION=$_REGION'
93 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME'
94 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME'
95 | - 'GCS_LOCATION=$_GCS_LOCATION'
96 | - 'TFX_IMAGE_URI=$_TFX_IMAGE_URI'
97 | - 'BEAM_RUNNER=$_BEAM_RUNNER'
98 | - 'TRAINING_RUNNER=$_TRAINING_RUNNER'
99 | id: 'Compile Pipeline'
100 | waitFor: ['Local Test E2E Pipeline']
101 |
102 |
103 | # Upload compiled pipeline to GCS.
104 | - name: 'gcr.io/cloud-builders/gsutil'
105 | args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE']
106 | dir: 'ucaip-labs'
107 | id: 'Upload Pipeline to GCS'
108 | waitFor: ['Compile Pipeline']
109 |
110 |
111 | # Push TFX Image to Container Registry.
112 | images: ['$_TFX_IMAGE_URI']
113 |
--------------------------------------------------------------------------------
/build/serving_resources_spec.json:
--------------------------------------------------------------------------------
1 | {
2 | "traffic_percentage": 100,
3 | "machine_type": "n1-standard-2",
4 | "min_replica_count": 1,
5 | "max_replica_count": 1,
6 | "accelerator_type": null,
7 | "accelerator_count": null
8 | }
--------------------------------------------------------------------------------
/build/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Utilities for deploying pipelines and models to Vertex AI."""
15 |
16 |
17 | import argparse
18 | import os
19 | import sys
20 | import logging
21 | import json
22 |
23 | from google.cloud import aiplatform as vertex_ai
24 |
25 |
26 | SCRIPT_DIR = os.path.dirname(
27 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
28 | )
29 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, "..")))
30 |
31 | SERVING_SPEC_FILEPATH = 'build/serving_resources_spec.json'
32 |
33 | def get_args():
34 | """Define an parse commandline arguments."""
35 |
36 | parser = argparse.ArgumentParser()
37 |
38 | parser.add_argument(
39 | '--mode',
40 | type=str,
41 | )
42 |
43 | parser.add_argument(
44 | '--project',
45 | type=str,
46 | )
47 |
48 | parser.add_argument(
49 | '--region',
50 | type=str,
51 | )
52 |
53 | parser.add_argument(
54 | '--endpoint-display-name',
55 | type=str,
56 | )
57 |
58 | parser.add_argument(
59 | '--model-display-name',
60 | type=str,
61 | )
62 |
63 | parser.add_argument(
64 | '--pipeline-name',
65 | type=str,
66 | )
67 |
68 | return parser.parse_args()
69 |
70 |
71 | def create_endpoint(project, region, endpoint_display_name):
72 | """Create a Vertex endpoint."""
73 |
74 | logging.info(f"Creating endpoint {endpoint_display_name}")
75 | vertex_ai.init(
76 | project=project,
77 | location=region
78 | )
79 |
80 | endpoints = vertex_ai.Endpoint.list(
81 | filter=f'display_name={endpoint_display_name}',
82 | order_by="update_time")
83 |
84 | if len(endpoints) > 0:
85 | logging.info(f"Endpoint {endpoint_display_name} already exists.")
86 | endpoint = endpoints[-1]
87 | else:
88 | endpoint = vertex_ai.Endpoint.create(endpoint_display_name)
89 | logging.info(f"Endpoint is ready.")
90 | logging.info(endpoint.gca_resource)
91 | return endpoint
92 |
93 |
94 | def deploy_model(project, region, endpoint_display_name, model_display_name, serving_resources_spec):
95 | """Deploy a model to a Vertex endpoint."""
96 |
97 | logging.info(f"Deploying model {model_display_name} to endpoint {endpoint_display_name}")
98 | vertex_ai.init(
99 | project=project,
100 | location=region
101 | )
102 |
103 | model = vertex_ai.Model.list(
104 | filter=f'display_name={model_display_name}',
105 | order_by="update_time"
106 | )[-1]
107 |
108 | endpoint = vertex_ai.Endpoint.list(
109 | filter=f'display_name={endpoint_display_name}',
110 | order_by="update_time"
111 | )[-1]
112 |
113 | deployed_model = endpoint.deploy(model=model, **serving_resources_spec)
114 | logging.info(f"Model is deployed.")
115 | logging.info(deployed_model)
116 | return deployed_model
117 |
118 |
119 | def compile_pipeline(pipeline_name):
120 | """Create a .json file with the pipeline definition."""
121 |
122 | from src.tfx_pipelines import runner
123 | pipeline_definition_file = f"{pipeline_name}.json"
124 | pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file)
125 | return pipeline_definition
126 |
127 |
128 |
129 | def main():
130 | args = get_args()
131 |
132 | if args.mode == 'create-endpoint':
133 | if not args.project:
134 | raise ValueError("project must be supplied.")
135 | if not args.region:
136 | raise ValueError("region must be supplied.")
137 | if not args.endpoint_display_name:
138 | raise ValueError("endpoint_display_name must be supplied.")
139 |
140 | result = create_endpoint(
141 | args.project,
142 | args.region,
143 | args.endpoint_display_name
144 | )
145 |
146 | elif args.mode == 'deploy-model':
147 | if not args.project:
148 | raise ValueError("project must be supplied.")
149 | if not args.region:
150 | raise ValueError("region must be supplied.")
151 | if not args.endpoint_display_name:
152 | raise ValueError("endpoint-display-name must be supplied.")
153 | if not args.model_display_name:
154 | raise ValueError("model-display-name must be supplied.")
155 |
156 | with open(SERVING_SPEC_FILEPATH) as json_file:
157 | serving_resources_spec = json.load(json_file)
158 | logging.info(f"serving resources: {serving_resources_spec}")
159 | result = deploy_model(
160 | args.project,
161 | args.region,
162 | args.endpoint_display_name,
163 | args.model_display_name,
164 | serving_resources_spec
165 | )
166 |
167 | elif args.mode == 'compile-pipeline':
168 | if not args.pipeline_name:
169 | raise ValueError("pipeline-name must be supplied.")
170 |
171 | result = compile_pipeline(args.pipeline_name)
172 |
173 | else:
174 | raise ValueError(f"Invalid mode {args.mode}.")
175 |
176 | logging.info(result)
177 |
178 |
179 | if __name__ == "__main__":
180 | main()
181 |
182 |
--------------------------------------------------------------------------------
/mlops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/mlops.png
--------------------------------------------------------------------------------
/provision/README.md:
--------------------------------------------------------------------------------
1 | # Creating a Vertex environment
2 |
3 | You can use the [Terraform](https://www.terraform.io/) scripts in the `terraform` folder to automatically provision the environment required by the samples.
4 |
5 | The scripts perform the following actions:
6 |
7 | 1. Enable the required Cloud APIs
8 | * **Essentials**: compute, iam, iamcredentials
9 | * **ML**: notebooks, aiplatform
10 | * **Data**: dataflow, bigquery, bigquerydatatransfer
11 | * **CI/CD**: cloudbuild, container, artifactregistry
12 | * **Operations**: cloudtrace, monitoring, logging, cloudresourcemanager
13 | 2. Create a regional GCS bucket.
14 | 3. Create an instance of Vertex Notebooks.
15 | 4. Create service accounts for Vertex Training and Vertex Pipelines.
16 |
17 | You can customize your configuration using the following variables:
18 |
19 | |Variable|Required|Default|Description|
20 | |--------|--------|-------|-----------|
21 | |name_prefix|Yes||Prefix added to the names of provisioned resources. **The prefix should start with a letter and include letters and digits only**.|
22 | |project_id|Yes||GCP project ID|
23 | |network_name|No|default|Name of the network for the Notebook instance. The network must already exist.|
24 | |subnet_name|No|default|Name of the subnet for the Notebook instance. The subnet must already exist.|
25 | |subnet_region|No|us-central1|Region where the subnet was created.|
26 | |zone|Yes||GCP zone for the Notebook instance. The zone must be in the region defined in the `subnet_region` variable|
27 | |machine_type|No|n1-standard-4|Machine type of the Notebook instance|
28 | |boot_disk_size|No|200GB|Size of the Notebook instance's boot disk|
29 | |image_family|No|tf-2-4-cpu|Image family for the Notebook instance|
30 | |gpu_type|No|null|GPU type of the Notebook instance. By default, the Notebook instance will be provisioned without a GPU|
31 | |gpu_count|No|null|GPU count of the Notebook instance|
32 | |install_gpu_driver|No|false|Whether to install a GPU driver|
33 | |region|No|Set to subnet_region.|GCP region for the GCS bucket and Artifact Registry. It is recommended that the same region is used for the bucket, the registry, and the Notebook instance. If not provided, `region` is set to `subnet_region`.|
34 | |force_destroy|No|false|Whether to force the removal of the bucket on terraform destroy. **Note that by default the bucket will not be destroyed**.|
35 |
36 |
37 | To provision the environment:
38 |
39 | 1. Open [Cloud Shell](https://cloud.google.com/shell/docs/launching-cloud-shell)
40 |
41 | 2. Download the installation scripts
42 | ```
43 | SRC_REPO=https://github.com/ksalama/ucaip-labs
44 | LOCAL_DIR=provision
45 | kpt pkg get $SRC_REPO/provision@main $LOCAL_DIR
46 | cd $LOCAL_DIR/terraform
47 | ```
48 |
49 | 3. Update the `terraform.tfvars` file with the values reflecting your environment. Alternatively, you can provide the values using the Terraform CLI `-var` options when you execute `terraform apply` in the next step.
50 |
51 | 4. Execute the following commands:
52 | ```
53 | terraform init
54 | terraform apply
55 | ```
56 |
57 |
58 | To destroy the environment, execute:
59 | ```
60 | terraform destroy
61 | ```
62 |
--------------------------------------------------------------------------------
/provision/terraform/gcs-bucket.tf:
--------------------------------------------------------------------------------
1 |
2 | # Copyright 2021 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | resource "google_storage_bucket" "artifact_repo" {
18 | project = module.project-services.project_id
19 | name = "${var.name_prefix}-bucket"
20 | location = local.region
21 | storage_class = local.bucket_type
22 | force_destroy = var.force_destroy
23 | }
--------------------------------------------------------------------------------
/provision/terraform/main.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | terraform {
16 | required_version = ">= 0.14"
17 | required_providers {
18 | google = "~> 3.6"
19 | }
20 | }
21 |
22 | provider "google" {
23 | project = var.project_id
24 | }
25 |
26 | data "google_project" "project" {
27 | project_id = var.project_id
28 | }
29 |
30 | locals {
31 | bucket_type = "REGIONAL"
32 | region = var.region == null ? var.subnet_region : var.region
33 | }
34 |
35 |
36 |
--------------------------------------------------------------------------------
/provision/terraform/notebook-instance.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | locals {
16 | image_project = "deeplearning-platform-release"
17 | }
18 |
19 | data "google_compute_network" "vm_network" {
20 | project = module.project-services.project_id
21 | name = var.network_name
22 |
23 | depends_on = [
24 | module.project-services
25 | ]
26 | }
27 |
28 | data "google_compute_subnetwork" "vm_subnetwork" {
29 | project = module.project-services.project_id
30 | name = var.subnet_name
31 | region = var.subnet_region
32 |
33 | depends_on = [
34 | module.project-services
35 | ]
36 | }
37 |
38 | resource "google_notebooks_instance" "notebook_instance" {
39 | project = module.project-services.project_id
40 | name = "${var.name_prefix}-notebook"
41 | machine_type = var.machine_type
42 | location = var.zone
43 |
44 | network = data.google_compute_network.vm_network.id
45 | subnet = data.google_compute_subnetwork.vm_subnetwork.id
46 |
47 | vm_image {
48 | project = local.image_project
49 | image_family = var.image_family
50 | }
51 |
52 | dynamic accelerator_config {
53 | for_each = var.gpu_type != null ? [1] : []
54 | content {
55 | type = var.gpu_type
56 | core_count = var.gpu_count
57 | }
58 | }
59 |
60 | install_gpu_driver = var.install_gpu_driver
61 |
62 | boot_disk_size_gb = var.boot_disk_size
63 | }
64 |
--------------------------------------------------------------------------------
/provision/terraform/service-accounts.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Create Vertex Training service account
16 | resource "google_service_account" "training_sa" {
17 | project = module.project-services.project_id
18 | account_id = var.training_sa_name
19 | display_name = "Vertex Training service account"
20 | }
21 |
22 | # Create Vertex Training SA role bindings
23 | resource "google_project_iam_member" "training_sa_role_bindings" {
24 | project = module.project-services.project_id
25 | for_each = toset(var.training_sa_roles)
26 | member = "serviceAccount:${google_service_account.training_sa.email}"
27 | role = "roles/${each.value}"
28 | }
29 |
30 | # Create Vertex Pipelines service account
31 | resource "google_service_account" "pipelines_sa" {
32 | project = module.project-services.project_id
33 | account_id = var.pipelines_sa_name
34 | display_name = "Vertex Pipelines account name"
35 | }
36 |
37 | # Create Vertex Pipelines SA role bindings
38 | resource "google_project_iam_member" "role_bindings" {
39 | project = module.project-services.project_id
40 | for_each = toset(var.pipelines_sa_roles)
41 | member = "serviceAccount:${google_service_account.pipelines_sa.email}"
42 | role = "roles/${each.value}"
43 | }
44 |
--------------------------------------------------------------------------------
/provision/terraform/services.tf:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | module "project-services" {
17 | source = "terraform-google-modules/project-factory/google//modules/project_services"
18 |
19 | project_id = data.google_project.project.project_id
20 |
21 | disable_services_on_destroy = false
22 | activate_apis = [
23 | "compute.googleapis.com",
24 | "iam.googleapis.com",
25 | "container.googleapis.com",
26 | "artifactregistry.googleapis.com",
27 | "cloudresourcemanager.googleapis.com",
28 | "cloudtrace.googleapis.com",
29 | "iamcredentials.googleapis.com",
30 | "monitoring.googleapis.com",
31 | "logging.googleapis.com",
32 | "notebooks.googleapis.com",
33 | "aiplatform.googleapis.com",
34 | "dataflow.googleapis.com",
35 | "bigquery.googleapis.com",
36 | "cloudbuild.googleapis.com",
37 | "bigquerydatatransfer.googleapis.com",
38 | ]
39 | }
--------------------------------------------------------------------------------
/provision/terraform/terraform.tfvars:
--------------------------------------------------------------------------------
1 | project_id = "vertex-mlops"
2 | subnet_region = "us-central1"
3 | zone = "us-central1-a"
4 | name_prefix = "vertex-mlops"
5 | machine_type = "n1-standard-8"
6 | #gpu_type = "NVIDIA_TESLA_T4"
7 | #gpu_count = 1
8 | #install_gpu_driver = true
9 | #image_family = "tf-2-4-gpu"
10 |
11 |
12 |
--------------------------------------------------------------------------------
/provision/terraform/variables.tf:
--------------------------------------------------------------------------------
1 |
2 | # Copyright 2021 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | variable "project_id" {
18 | description = "The GCP project ID"
19 | type = string
20 | }
21 |
22 | variable "region" {
23 | description = "The region for the GCS bucket and Artifact Registry"
24 | type = string
25 | default = null
26 | }
27 |
28 | variable "zone" {
29 | description = "The zone for a Vertex Notebook instance"
30 | type = string
31 | }
32 |
33 | variable "name_prefix" {
34 | description = "The name prefix to add to the resource names"
35 | type = string
36 | }
37 |
38 | variable "machine_type" {
39 | description = "The Notebook instance's machine type"
40 | type = string
41 | }
42 |
43 | variable "network_name" {
44 | description = "The network name for the Notebook instance"
45 | type = string
46 | default = "default"
47 | }
48 |
49 | variable "subnet_name" {
50 | description = "The subnet name for the Notebook instance"
51 | type = string
52 | default = "default"
53 | }
54 |
55 | variable "subnet_region" {
56 | description = "The region for the Notebook subnet"
57 | type = string
58 | default = "us-central1"
59 | }
60 |
61 | variable "boot_disk_size" {
62 | description = "The size of the boot disk"
63 | default = 200
64 | }
65 |
66 | variable "image_family" {
67 | description = "A Deep Learning image family for the Notebook instance"
68 | type = string
69 | default = "tf-2-4-cpu"
70 | }
71 |
72 | variable "gpu_type" {
73 | description = "A GPU type for the Notebook instance"
74 | type = string
75 | default = null
76 | }
77 |
78 | variable "gpu_count" {
79 | description = "A GPU count for the Notebook instance"
80 | type = string
81 | default = null
82 | }
83 |
84 | variable "install_gpu_driver" {
85 | description = "Whether to install GPU driver"
86 | type = bool
87 | default = false
88 | }
89 |
90 | variable "force_destroy" {
91 | description = "Whether to remove the bucket on destroy"
92 | type = bool
93 | default = false
94 | }
95 |
96 | variable "training_sa_roles" {
97 | description = "The roles to assign to the Vertex Training service account"
98 | default = [
99 | "storage.admin",
100 | "aiplatform.user",
101 | "bigquery.admin"
102 | ]
103 | }
104 |
105 | variable "pipelines_sa_roles" {
106 | description = "The roles to assign to the Vertex Pipelines service account"
107 | default = [
108 | "storage.admin",
109 | "bigquery.admin",
110 | "aiplatform.user"
111 | ]
112 | }
113 |
114 | variable "training_sa_name" {
115 | description = "Vertex training service account name."
116 | default = "training-sa"
117 | }
118 |
119 | variable "pipelines_sa_name" {
120 | description = "Vertex pipelines service account name."
121 | default = "pipelines-sa"
122 | }
123 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | kfp==1.6.2
2 | google-cloud-bigquery==2.20.0
3 | google-cloud-bigquery-storage==2.4.0
4 | google-cloud-aiplatform==1.1.1
5 | google-auth==1.30.1
6 | google-auth-oauthlib==0.4.4
7 | google-auth-httplib2==0.1.0
8 | oauth2client==4.1.3
9 | requests==2.25.1
10 | pytest
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | REQUIRED_PACKAGES = [
4 | "google-cloud-aiplatform==1.0.0",
5 | "tensorflow-transform==0.30.0",
6 | "tensorflow-data-validation==0.30.0",
7 | ]
8 |
9 | setuptools.setup(
10 | name="executor",
11 | version="0.0.1",
12 | install_requires=REQUIRED_PACKAGES,
13 | packages=setuptools.find_packages(),
14 | include_package_data=True,
15 | package_data={"src": ["raw_schema/schema.pbtxt"]},
16 | )
17 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/__init__.py
--------------------------------------------------------------------------------
/src/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/common/__init__.py
--------------------------------------------------------------------------------
/src/common/datasource_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Utilities for generating BigQuery data querying scirpts."""
15 |
16 |
17 | from google.cloud import aiplatform as vertex_ai
18 |
19 |
20 | def _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit=None):
21 |
22 | query = f"""
23 | SELECT
24 | IF(trip_month IS NULL, -1, trip_month) trip_month,
25 | IF(trip_day IS NULL, -1, trip_day) trip_day,
26 | IF(trip_day_of_week IS NULL, -1, trip_day_of_week) trip_day_of_week,
27 | IF(trip_hour IS NULL, -1, trip_hour) trip_hour,
28 | IF(trip_seconds IS NULL, -1, trip_seconds) trip_seconds,
29 | IF(trip_miles IS NULL, -1, trip_miles) trip_miles,
30 | IF(payment_type IS NULL, 'NA', payment_type) payment_type,
31 | IF(pickup_grid IS NULL, 'NA', pickup_grid) pickup_grid,
32 | IF(dropoff_grid IS NULL, 'NA', dropoff_grid) dropoff_grid,
33 | IF(euclidean IS NULL, -1, euclidean) euclidean,
34 | IF(loc_cross IS NULL, 'NA', loc_cross) loc_cross"""
35 | if ml_use:
36 | query += f""",
37 | tip_bin
38 | FROM {bq_dataset_name}.{bq_table_name}
39 | WHERE ML_use = '{ml_use}'
40 | """
41 | else:
42 | query += f"""
43 | FROM {bq_dataset_name}.{bq_table_name}
44 | """
45 | if limit:
46 | query += f"LIMIT {limit}"
47 |
48 | return query
49 |
50 |
51 | def get_training_source_query(
52 | project, region, dataset_display_name, ml_use="UNASSIGNED", limit=None
53 | ):
54 | """Generates a BigQuery SELECT statement for the training data."""
55 |
56 | dataset = vertex_ai.TabularDataset.list(
57 | filter=f"display_name={dataset_display_name}", order_by="update_time"
58 | )[-1]
59 | bq_source_uri = dataset.gca_resource.metadata["inputConfig"]["bigquerySource"][
60 | "uri"
61 | ]
62 | _, bq_dataset_name, bq_table_name = bq_source_uri.replace("bq://", "").split(".")
63 |
64 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit)
65 |
66 |
67 | def get_serving_source_query(bq_dataset_name, bq_table_name, limit=None):
68 | """Generates a BigQuery SELECT statement for the training data."""
69 |
70 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use=None, limit=limit)
71 |
--------------------------------------------------------------------------------
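
The helpers above build the SELECT statements that feed the preprocessing and batch-prediction pipelines. A minimal usage sketch, assuming a project, region, and Vertex dataset display name (all placeholder values):

    from google.cloud import aiplatform as vertex_ai
    from google.cloud import bigquery

    from src.common import datasource_utils

    # Placeholder project/region/dataset values -- replace with real ones.
    vertex_ai.init(project="my-project", location="us-central1")

    train_query = datasource_utils.get_training_source_query(
        project="my-project",
        region="us-central1",
        dataset_display_name="chicago-taxi-tips",  # hypothetical Vertex dataset display name
        ml_use="UNASSIGNED",
        limit=1000,
    )

    # Materialize a sample of the training split into a DataFrame.
    df = bigquery.Client(project="my-project").query(train_query).to_dataframe()
    print(df.head())
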
/src/common/features.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Model features metadata utils."""
15 |
16 |
17 | FEATURE_NAMES = [
18 | "trip_month",
19 | "trip_day",
20 | "trip_day_of_week",
21 | "trip_hour",
22 | "trip_seconds",
23 | "trip_miles",
24 | "payment_type",
25 | "pickup_grid",
26 | "dropoff_grid",
27 | "euclidean",
28 | "loc_cross",
29 | ]
30 |
31 | TARGET_FEATURE_NAME = "tip_bin"
32 |
33 | TARGET_LABELS = ["tip<20%", "tip>=20%"]
34 |
35 | NUMERICAL_FEATURE_NAMES = [
36 | "trip_seconds",
37 | "trip_miles",
38 | "euclidean",
39 | ]
40 |
41 | EMBEDDING_CATEGORICAL_FEATURES = {
42 | "trip_month": 2,
43 | "trip_day": 4,
44 | "trip_hour": 3,
45 | "pickup_grid": 3,
46 | "dropoff_grid": 3,
47 | "loc_cross": 10,
48 | }
49 |
50 | ONEHOT_CATEGORICAL_FEATURE_NAMES = ["payment_type", "trip_day_of_week"]
51 |
52 |
53 | def transformed_name(key: str) -> str:
54 | """Generate the name of the transformed feature from original name."""
55 | return f"{key}_xf"
56 |
57 |
58 | def original_name(key: str) -> str:
59 | """Generate the name of the original feature from transformed name."""
60 | return key.replace("_xf", "")
61 |
62 |
63 | def vocabulary_name(key: str) -> str:
64 | """Generate the name of the vocabulary feature from original name."""
65 | return f"{key}_vocab"
66 |
67 |
68 | def categorical_feature_names() -> list:
69 | return (
70 | list(EMBEDDING_CATEGORICAL_FEATURES.keys()) + ONEHOT_CATEGORICAL_FEATURE_NAMES
71 | )
72 |
73 |
74 | def generate_explanation_config():
75 | explanation_config = {
76 | "inputs": {},
77 | "outputs": {},
78 | "params": {"sampled_shapley_attribution": {"path_count": 10}},
79 | }
80 |
81 | for feature_name in FEATURE_NAMES:
82 | if feature_name in NUMERICAL_FEATURE_NAMES:
83 | explanation_config["inputs"][feature_name] = {
84 | "input_tensor_name": feature_name,
85 | "modality": "numeric",
86 | }
87 | else:
88 | explanation_config["inputs"][feature_name] = {
89 | "input_tensor_name": feature_name,
90 | "modality": "categorical",
91 | }
92 |
93 | explanation_config["outputs"] = {"scores": {"output_tensor_name": "scores"}}
94 |
95 | return explanation_config
96 |
--------------------------------------------------------------------------------
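
A quick, locally runnable illustration of the metadata helpers above (no GCP calls involved):

    from src.common import features

    print(features.transformed_name("trip_miles"))     # -> "trip_miles_xf"
    print(features.original_name("trip_miles_xf"))     # -> "trip_miles"
    print(features.categorical_feature_names())        # embedding + one-hot feature names

    config = features.generate_explanation_config()
    print(config["params"])                             # sampled Shapley attribution, 10 paths
    print(config["inputs"]["trip_miles"]["modality"])   # -> "numeric"
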
/src/model_training/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/model_training/__init__.py
--------------------------------------------------------------------------------
/src/model_training/data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Functions for reading data as tf.data.Dataset."""
15 |
16 | import tensorflow as tf
17 |
18 | from src.common import features
19 |
20 |
21 | def _gzip_reader_fn(filenames: list):
22 | """Returns a record reader that can read gzip'ed files."""
23 | return tf.data.TFRecordDataset(filenames, compression_type="GZIP")
24 |
25 |
26 | def get_dataset(
27 | file_pattern: str,
28 | feature_spec: dict,
29 | batch_size: int = 200,
30 | upsampling_factor: float = 2.0,
31 | ):
32 | """Generates features and label for tuning/training.
33 |
34 | Args:
35 | file_pattern: input tfrecord file pattern.
36 | feature_spec: a dictionary of feature specifications.
37 |       batch_size: number of consecutive elements to combine in a single batch.
38 |       upsampling_factor: intended upsampling factor (currently unused).
39 | Returns:
40 | A dataset that contains (features, indices) tuple where features is a
41 | dictionary of Tensors, and indices is a single Tensor of label indices.
42 | """
43 |
44 | dataset = tf.data.experimental.make_batched_features_dataset(
45 | file_pattern=file_pattern,
46 | batch_size=batch_size,
47 | features=feature_spec,
48 | label_key=features.TARGET_FEATURE_NAME,
49 | reader=_gzip_reader_fn,
50 | num_epochs=1,
51 | drop_final_batch=True,
52 | )
53 |
54 | return dataset
55 |
--------------------------------------------------------------------------------
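
A sketch of how get_dataset is typically fed from the TensorFlow Transform artifacts produced by the preprocessing pipeline; the GCS paths below are placeholders:

    import tensorflow_transform as tft

    from src.model_training import data

    # Placeholder locations produced by the preprocessing pipeline.
    tft_output = tft.TFTransformOutput("gs://my-bucket/transform_artifacts")
    feature_spec = tft_output.transformed_feature_spec()

    train_ds = data.get_dataset(
        file_pattern="gs://my-bucket/transformed_data/train/data-*.gz",
        feature_spec=feature_spec,
        batch_size=512,
    )

    # Each element is a (features, labels) tuple keyed by the transformed feature names.
    for features_batch, labels in train_ds.take(1):
        print(sorted(features_batch.keys()), labels.shape)
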
/src/model_training/defaults.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Defaults for the model.
15 |
16 | These values can be tweaked to affect model training performance.
17 | """
18 |
19 |
20 | HIDDEN_UNITS = [64, 32]
21 | LEARNING_RATE = 0.0001
22 | BATCH_SIZE = 512
23 | NUM_EPOCHS = 10
24 | NUM_EVAL_STEPS = 100
25 |
26 |
27 | def update_hyperparams(hyperparams: dict) -> dict:
28 | """Updates the hyperparams dictionary with default values."""
29 |
30 | if "hidden_units" not in hyperparams:
31 | hyperparams["hidden_units"] = HIDDEN_UNITS
32 | else:
33 | if not isinstance(hyperparams["hidden_units"], list):
34 | hyperparams["hidden_units"] = [
35 | int(v) for v in hyperparams["hidden_units"].split(",")
36 | ]
37 | if "learning_rate" not in hyperparams:
38 | hyperparams["learning_rate"] = LEARNING_RATE
39 | if "batch_size" not in hyperparams:
40 | hyperparams["batch_size"] = BATCH_SIZE
41 | if "num_epochs" not in hyperparams:
42 | hyperparams["num_epochs"] = NUM_EPOCHS
43 | return hyperparams
44 |
--------------------------------------------------------------------------------
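
For example, update_hyperparams fills in any missing values and normalizes a comma-separated hidden_units string:

    from src.model_training import defaults

    hyperparams = defaults.update_hyperparams({"hidden_units": "128,64", "num_epochs": 5})
    print(hyperparams["hidden_units"])   # -> [128, 64]
    print(hyperparams["learning_rate"])  # -> 0.0001 (default)
    print(hyperparams["batch_size"])     # -> 512 (default)
    print(hyperparams["num_epochs"])     # -> 5 (value passed in is kept)
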
/src/model_training/exporter.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Functions for exporting the model for serving."""
15 |
16 | import logging
17 |
18 | import tensorflow as tf
19 | import tensorflow_transform as tft
20 | import tensorflow_data_validation as tfdv
21 | from tensorflow_transform.tf_metadata import schema_utils
22 | import tensorflow.keras as keras
23 |
24 | from src.common import features
25 |
26 |
27 | def _get_serve_tf_examples_fn(classifier, tft_output, raw_feature_spec):
28 | """Returns a function that parses a serialized tf.Example and applies TFT."""
29 |
30 | classifier.tft_layer = tft_output.transform_features_layer()
31 |
32 | @tf.function
33 | def serve_tf_examples_fn(serialized_tf_examples):
34 | """Returns the output to be used in the serving signature."""
35 | for key in list(raw_feature_spec.keys()):
36 | if key not in features.FEATURE_NAMES:
37 | raw_feature_spec.pop(key)
38 |
39 | parsed_features = tf.io.parse_example(serialized_tf_examples, raw_feature_spec)
40 |
41 | transformed_features = classifier.tft_layer(parsed_features)
42 | logits = classifier(transformed_features)
43 | probabilities = keras.activations.sigmoid(logits)
44 | return {"probabilities": probabilities}
45 |
46 | return serve_tf_examples_fn
47 |
48 |
49 | def _get_serve_features_fn(classifier, tft_output):
50 | """Returns a function that accept a dictionary of features and applies TFT."""
51 |
52 | classifier.tft_layer = tft_output.transform_features_layer()
53 |
54 | @tf.function
55 | def serve_features_fn(raw_features):
56 | """Returns the output to be used in the serving signature."""
57 |
58 | transformed_features = classifier.tft_layer(raw_features)
59 | logits = classifier(transformed_features)
60 |         pos_probabilities = keras.activations.sigmoid(logits)
61 |         neg_probabilities = 1 - pos_probabilities
62 |         probabilities = tf.concat([neg_probabilities, pos_probabilities], -1)
63 | batch_size = tf.shape(probabilities)[0]
64 | classes = tf.repeat([features.TARGET_LABELS], [batch_size], axis=0)
65 | return {"classes": classes, "scores": probabilities}
66 |
67 | return serve_features_fn
68 |
69 |
70 | def export_serving_model(
71 | classifier, serving_model_dir, raw_schema_location, tft_output_dir
72 | ):
73 | """Exports the classifier as a SavedModel with serving signatures."""
74 |
75 | raw_schema = tfdv.load_schema_text(raw_schema_location)
76 | raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec
77 |
78 | tft_output = tft.TFTransformOutput(tft_output_dir)
79 |
80 | features_input_signature = {
81 | feature_name: tf.TensorSpec(
82 | shape=(None, 1), dtype=spec.dtype, name=feature_name
83 | )
84 | for feature_name, spec in raw_feature_spec.items()
85 | if feature_name in features.FEATURE_NAMES
86 | }
87 |
88 | signatures = {
89 | "serving_default": _get_serve_features_fn(
90 | classifier, tft_output
91 | ).get_concrete_function(features_input_signature),
92 | "serving_tf_example": _get_serve_tf_examples_fn(
93 | classifier, tft_output, raw_feature_spec
94 | ).get_concrete_function(
95 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")
96 | ),
97 | }
98 |
99 | logging.info("Model export started...")
100 | tf.saved_model.save(classifier, serving_model_dir, signatures=signatures)
101 | logging.info("Model export completed.")
102 |
--------------------------------------------------------------------------------
/src/model_training/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """A DNN Keras classification model."""
15 |
16 | import tensorflow as tf
17 | from tensorflow import keras
18 |
19 | from src.common import features
20 |
21 |
22 | def create_model_inputs():
23 | """Creates Keras model input dictionary."""
24 |
25 | inputs = {}
26 | for feature_name in features.FEATURE_NAMES:
27 | name = features.transformed_name(feature_name)
28 | if feature_name in features.NUMERICAL_FEATURE_NAMES:
29 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.float32)
30 | elif feature_name in features.categorical_feature_names():
31 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.int64)
32 | else:
33 | pass
34 | return inputs
35 |
36 |
37 | def _create_binary_classifier(feature_vocab_sizes, hyperparams):
38 | """Return a Keras binary classifier."""
39 |
40 | input_layers = create_model_inputs()
41 |
42 | layers = []
43 | for key in input_layers:
44 | feature_name = features.original_name(key)
45 | if feature_name in features.EMBEDDING_CATEGORICAL_FEATURES:
46 | vocab_size = feature_vocab_sizes[feature_name]
47 | embedding_size = features.EMBEDDING_CATEGORICAL_FEATURES[feature_name]
48 | embedding_output = keras.layers.Embedding(
49 | input_dim=vocab_size + 1,
50 | output_dim=embedding_size,
51 | name=f"{key}_embedding",
52 | )(input_layers[key])
53 | layers.append(embedding_output)
54 | elif feature_name in features.ONEHOT_CATEGORICAL_FEATURE_NAMES:
55 | vocab_size = feature_vocab_sizes[feature_name]
56 | onehot_layer = keras.layers.experimental.preprocessing.CategoryEncoding(
57 | max_tokens=vocab_size,
58 | output_mode="binary",
59 | name=f"{key}_onehot",
60 | )(input_layers[key])
61 | layers.append(onehot_layer)
62 | elif feature_name in features.NUMERICAL_FEATURE_NAMES:
63 | numeric_layer = tf.expand_dims(input_layers[key], -1)
64 | layers.append(numeric_layer)
65 | else:
66 | pass
67 |
68 | joined = keras.layers.Concatenate(name="combines_inputs")(layers)
69 | feedforward_output = keras.Sequential(
70 | [
71 | keras.layers.Dense(units, activation="relu")
72 | for units in hyperparams["hidden_units"]
73 | ],
74 | name="feedforward_network",
75 | )(joined)
76 | logits = keras.layers.Dense(units=1, name="logits")(feedforward_output)
77 |
78 | model = keras.Model(inputs=input_layers, outputs=[logits])
79 | return model
80 |
81 |
82 | def create_binary_classifier(tft_output, hyperparams):
83 | """Returns a Keras binary classifier."""
84 |
85 | feature_vocab_sizes = dict()
86 | for feature_name in features.categorical_feature_names():
87 | feature_vocab_sizes[feature_name] = tft_output.vocabulary_size_by_name(
88 | feature_name
89 | )
90 |
91 | return _create_binary_classifier(feature_vocab_sizes, hyperparams)
92 |
--------------------------------------------------------------------------------
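
The topology can be inspected locally without a real TFTransformOutput by calling the private builder with hypothetical vocabulary sizes; a sketch under the TF 2.4-era Keras preprocessing API pinned elsewhere in this repo:

    from src.model_training import model

    # Hypothetical vocabulary sizes, for illustration only.
    fake_vocab_sizes = {
        "trip_month": 12, "trip_day": 31, "trip_hour": 24,
        "pickup_grid": 13, "dropoff_grid": 13, "loc_cross": 50,
        "payment_type": 8, "trip_day_of_week": 7,
    }
    hyperparams = {"hidden_units": [64, 32]}

    classifier = model._create_binary_classifier(fake_vocab_sizes, hyperparams)
    classifier.summary()
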
/src/model_training/runner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """A run_fn method called by the TFX Trainer component."""
15 |
16 | import os
17 | import logging
18 |
19 | from src.model_training import trainer, exporter, defaults
20 |
21 |
22 | # TFX Trainer will call this function.
23 | def run_fn(fn_args):
24 | """Train the model based on given args.
25 |
26 | Args:
27 | fn_args: Holds args used to train the model as name/value pairs.
28 | """
29 |
30 | logging.info("Runner started...")
31 | logging.info(f"fn_args: {fn_args}")
32 | logging.info("")
33 |
34 | try:
35 | log_dir = fn_args.model_run_dir
36 |     except (KeyError, AttributeError):
37 | log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs")
38 |
39 | hyperparams = fn_args.hyperparameters
40 | if not hyperparams:
41 | hyperparams = dict()
42 |
43 | hyperparams = defaults.update_hyperparams(hyperparams)
44 | logging.info("Hyperparameter:")
45 | logging.info(hyperparams)
46 | logging.info("")
47 |
48 | logging.info("Runner executing trainer...")
49 | classifier = trainer.train(
50 | train_data_dir=fn_args.train_files,
51 | eval_data_dir=fn_args.eval_files,
52 | tft_output_dir=fn_args.transform_output,
53 | hyperparams=hyperparams,
54 | log_dir=log_dir,
55 | base_model_dir=fn_args.base_model,
56 | )
57 |
58 | logging.info("Runner executing exporter...")
59 | exporter.export_serving_model(
60 | classifier=classifier,
61 | serving_model_dir=fn_args.serving_model_dir,
62 | raw_schema_location=fn_args.schema_path,
63 | tft_output_dir=fn_args.transform_output,
64 | )
65 | logging.info("Runner completed.")
66 |
--------------------------------------------------------------------------------
/src/model_training/task.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """The entrypoint for the uCAIP traing job."""
15 |
16 | import os
17 | import sys
18 | from datetime import datetime
19 | import logging
20 | import tensorflow as tf
21 | from tensorflow.python.client import device_lib
22 | import argparse
23 |
24 | from google.cloud import aiplatform as vertex_ai
25 | from google.cloud import aiplatform_v1beta1 as vertex_ai_beta
26 |
27 | from src.model_training import defaults, trainer, exporter
28 |
29 | dirname = os.path.dirname(__file__)
30 | dirname = dirname.replace("/model_training", "")
31 | RAW_SCHEMA_LOCATION = os.path.join(dirname, "raw_schema/schema.pbtxt")
32 |
33 |
34 | def get_args():
35 | """Defines and parse commandline arguments."""
36 |
37 | parser = argparse.ArgumentParser()
38 |
39 | parser.add_argument(
40 | "--model-dir",
41 | default=os.getenv("AIP_MODEL_DIR"),
42 | type=str,
43 | )
44 |
45 | parser.add_argument(
46 | "--log-dir",
47 | default=os.getenv("AIP_TENSORBOARD_LOG_DIR"),
48 | type=str,
49 | )
50 |
51 | parser.add_argument(
52 | "--train-data-dir",
53 | type=str,
54 | )
55 |
56 | parser.add_argument(
57 | "--eval-data-dir",
58 | type=str,
59 | )
60 |
61 | parser.add_argument(
62 | "--tft-output-dir",
63 | type=str,
64 | )
65 |
66 | parser.add_argument("--learning-rate", default=0.001, type=float)
67 |
68 | parser.add_argument("--batch-size", default=512, type=int)
69 |
70 | parser.add_argument("--hidden-units", default="64,32", type=str)
71 |
72 | parser.add_argument("--num-epochs", default=10, type=int)
73 |
74 | parser.add_argument("--project", type=str)
75 | parser.add_argument("--region", type=str)
76 | parser.add_argument("--staging-bucket", type=str)
77 | parser.add_argument("--experiment-name", type=str)
78 | parser.add_argument("--run-name", type=str)
79 |
80 | return parser.parse_args()
81 |
82 |
83 | def main():
84 | args = get_args()
85 |
86 | hyperparams = vars(args)
87 | hyperparams = defaults.update_hyperparams(hyperparams)
88 | logging.info(f"Hyperparameter: {hyperparams}")
89 |
90 | if args.experiment_name:
91 | vertex_ai.init(
92 | project=args.project,
93 | staging_bucket=args.staging_bucket,
94 | experiment=args.experiment_name,
95 | )
96 |
97 | logging.info(f"Using Vertex AI experiment: {args.experiment_name}")
98 |
99 | run_id = args.run_name
100 | if not run_id:
101 | run_id = f"run-gcp-{datetime.now().strftime('%Y%m%d%H%M%S')}"
102 |
103 | vertex_ai.start_run(run_id)
104 | logging.info(f"Run {run_id} started.")
105 |
106 | vertex_ai.log_params(hyperparams)
107 |
108 | classifier = trainer.train(
109 | train_data_dir=args.train_data_dir,
110 | eval_data_dir=args.eval_data_dir,
111 | tft_output_dir=args.tft_output_dir,
112 | hyperparams=hyperparams,
113 | log_dir=args.log_dir,
114 | )
115 |
116 | val_loss, val_accuracy = trainer.evaluate(
117 | model=classifier,
118 | data_dir=args.eval_data_dir,
119 | raw_schema_location=RAW_SCHEMA_LOCATION,
120 | tft_output_dir=args.tft_output_dir,
121 | hyperparams=hyperparams,
122 | )
123 |
124 | if args.experiment_name:
125 | vertex_ai.log_metrics({"val_loss": val_loss, "val_accuracy": val_accuracy})
126 |
127 | try:
128 | exporter.export_serving_model(
129 | classifier=classifier,
130 | serving_model_dir=args.model_dir,
131 | raw_schema_location=RAW_SCHEMA_LOCATION,
132 | tft_output_dir=args.tft_output_dir,
133 | )
134 |     except Exception:
135 |         # Log and ignore export errors so the training run itself is not failed here.
136 |         logging.exception("Model export failed.")
137 |
138 |
139 | if __name__ == "__main__":
140 | logging.getLogger().setLevel(logging.INFO)
141 | logging.info(f"Python Version = {sys.version}")
142 | logging.info(f"TensorFlow Version = {tf.__version__}")
143 | logging.info(f'TF_CONFIG = {os.environ.get("TF_CONFIG", "Not found")}')
144 | logging.info(f"DEVICES = {device_lib.list_local_devices()}")
145 | logging.info(f"Task started...")
146 | main()
147 | logging.info(f"Task completed.")
148 |
--------------------------------------------------------------------------------
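
A hedged sketch of launching this entrypoint locally with the flags defined in get_args(); every path and value below is a placeholder:

    import subprocess

    subprocess.run(
        [
            "python", "-m", "src.model_training.task",
            "--model-dir", "gs://my-bucket/serving_model",
            "--log-dir", "gs://my-bucket/logs",
            "--train-data-dir", "gs://my-bucket/transformed_data/train/data-*.gz",
            "--eval-data-dir", "gs://my-bucket/transformed_data/eval/data-*.gz",
            "--tft-output-dir", "gs://my-bucket/transform_artifacts",
            "--num-epochs", "3",
            "--hidden-units", "64,32",
        ],
        check=True,
    )
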
/src/model_training/trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Train and evaluate the model."""
15 |
16 | import logging
17 | import tensorflow as tf
18 | import tensorflow_transform as tft
19 | from tensorflow import keras
20 |
21 |
22 | from src.model_training import data, model
23 |
24 |
25 | def train(
26 | train_data_dir,
27 | eval_data_dir,
28 | tft_output_dir,
29 | hyperparams,
30 | log_dir,
31 | base_model_dir=None,
32 | ):
33 | """Invokes model.fit method and returns a trained classifier."""
34 |
35 | logging.info(f"Loading tft output from {tft_output_dir}")
36 | tft_output = tft.TFTransformOutput(tft_output_dir)
37 | transformed_feature_spec = tft_output.transformed_feature_spec()
38 |
39 | train_dataset = data.get_dataset(
40 | train_data_dir,
41 | transformed_feature_spec,
42 | hyperparams["batch_size"],
43 | )
44 |
45 | eval_dataset = data.get_dataset(
46 | eval_data_dir,
47 | transformed_feature_spec,
48 | hyperparams["batch_size"],
49 | )
50 |
51 | optimizer = keras.optimizers.Adam(learning_rate=hyperparams["learning_rate"])
52 | loss = keras.losses.BinaryCrossentropy(from_logits=True)
53 | metrics = [keras.metrics.BinaryAccuracy(name="accuracy")]
54 |
55 | early_stopping = tf.keras.callbacks.EarlyStopping(
56 | monitor="val_loss", patience=5, restore_best_weights=True
57 | )
58 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
59 |
60 | classifier = model.create_binary_classifier(tft_output, hyperparams)
61 | if base_model_dir:
62 | try:
63 |             classifier = keras.models.load_model(base_model_dir)
64 |         except Exception:
65 |             logging.warning(f"Could not load base model from {base_model_dir}.")
66 |
67 | classifier.compile(optimizer=optimizer, loss=loss, metrics=metrics)
68 |
69 | logging.info("Model training started...")
70 | classifier.fit(
71 | train_dataset,
72 | epochs=hyperparams["num_epochs"],
73 | validation_data=eval_dataset,
74 | callbacks=[early_stopping, tensorboard_callback],
75 | )
76 | logging.info("Model training completed.")
77 |
78 | return classifier
79 |
80 |
81 | def evaluate(model, data_dir, raw_schema_location, tft_output_dir, hyperparams):
82 | """Invokes model.evaluate method and returns evaluation_metrics."""
83 |
84 | logging.info(f"Loading raw schema from {raw_schema_location}")
85 |
86 | logging.info(f"Loading tft output from {tft_output_dir}")
87 | tft_output = tft.TFTransformOutput(tft_output_dir)
88 | transformed_feature_spec = tft_output.transformed_feature_spec()
89 |
90 | logging.info("Model evaluation started...")
91 | eval_dataset = data.get_dataset(
92 | data_dir,
93 | transformed_feature_spec,
94 | hyperparams["batch_size"],
95 | )
96 |
97 | evaluation_metrics = model.evaluate(eval_dataset)
98 | logging.info("Model evaluation completed.")
99 |
100 | return evaluation_metrics
101 |
--------------------------------------------------------------------------------
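
The same train/evaluate pair is invoked by runner.py and task.py above; a minimal direct-call sketch with placeholder locations:

    from src.model_training import defaults, trainer

    hyperparams = defaults.update_hyperparams({"num_epochs": 3})

    classifier = trainer.train(
        train_data_dir="gs://my-bucket/transformed_data/train/data-*.gz",
        eval_data_dir="gs://my-bucket/transformed_data/eval/data-*.gz",
        tft_output_dir="gs://my-bucket/transform_artifacts",
        hyperparams=hyperparams,
        log_dir="gs://my-bucket/logs",
    )

    val_loss, val_accuracy = trainer.evaluate(
        model=classifier,
        data_dir="gs://my-bucket/transformed_data/eval/data-*.gz",
        raw_schema_location="src/raw_schema/schema.pbtxt",
        tft_output_dir="gs://my-bucket/transform_artifacts",
        hyperparams=hyperparams,
    )
    print(val_loss, val_accuracy)
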
/src/pipeline_triggering/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/pipeline_triggering/__init__.py
--------------------------------------------------------------------------------
/src/pipeline_triggering/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Cloud Function to be triggered by Pub/Sub."""
15 |
16 | import os
17 | import json
18 | import logging
19 | from kfp.v2.google.client import AIPlatformClient
20 | from google.cloud import storage
21 | import base64
22 |
23 |
24 | def trigger_pipeline(event, context):
25 | """A Cloud Function for triggering a Vertex pipeline given a Pub/Sub event."""
26 |
27 | project = os.getenv("PROJECT")
28 | region = os.getenv("REGION")
29 | gcs_pipeline_file_location = os.getenv("GCS_PIPELINE_FILE_LOCATION")
30 |
31 | if not project:
32 | raise ValueError("Environment variable PROJECT is not set.")
33 | if not region:
34 | raise ValueError("Environment variable REGION is not set.")
35 | if not gcs_pipeline_file_location:
36 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.")
37 |
38 | storage_client = storage.Client()
39 |
40 | if not gcs_pipeline_file_location:
41 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.")
42 |
43 | path_parts = gcs_pipeline_file_location.replace("gs://", "").split("/")
44 | bucket_name = path_parts[0]
45 | blob_name = "/".join(path_parts[1:])
46 |
47 | bucket = storage_client.bucket(bucket_name)
48 | blob = storage.Blob(bucket=bucket, name=blob_name)
49 |
50 | if not blob.exists(storage_client):
51 | raise ValueError(f"{gcs_pipeline_file_location} does not exist.")
52 |
53 | data = base64.b64decode(event["data"]).decode("utf-8")
54 | logging.info(f"Event data: {data}")
55 |
56 | parameter_values = json.loads(data)
57 |
58 | api_client = AIPlatformClient(project_id=project, region=region)
59 |
60 | response = api_client.create_run_from_job_spec(
61 | job_spec_path=gcs_pipeline_file_location, parameter_values=parameter_values
62 | )
63 |
64 | logging.info(response)
65 |
--------------------------------------------------------------------------------
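
The function expects the Pub/Sub message body to be a JSON object of pipeline parameter values. A small, locally runnable sketch of how a payload is encoded and what trigger_pipeline decodes (the parameter names are hypothetical):

    import base64
    import json

    parameter_values = {"num_epochs": 7, "learning_rate": 0.0015}  # hypothetical parameters

    # This mirrors the base64-encoded "data" field Pub/Sub delivers to the function.
    event = {
        "data": base64.b64encode(json.dumps(parameter_values).encode("utf-8")).decode("utf-8")
    }

    # trigger_pipeline(event, context) decodes it back into parameter_values
    # before submitting the compiled pipeline spec to Vertex AI.
    decoded = json.loads(base64.b64decode(event["data"]).decode("utf-8"))
    assert decoded == parameter_values
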
/src/pipeline_triggering/requirements.txt:
--------------------------------------------------------------------------------
1 | kfp==1.6.2
2 | google-cloud-aiplatform
3 | google-cloud-storage
--------------------------------------------------------------------------------
/src/preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/preprocessing/__init__.py
--------------------------------------------------------------------------------
/src/preprocessing/etl.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Data preprocessing pipelines."""
15 |
16 | import os
17 |
18 | import tensorflow_transform as tft
19 | import tensorflow_data_validation as tfdv
20 | import apache_beam as beam
21 | from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore
22 | import tensorflow_transform.beam as tft_beam
23 | from tensorflow_transform.tf_metadata import dataset_metadata
24 | from tensorflow_transform.tf_metadata import schema_utils
25 |
26 |
27 | from src.preprocessing import transformations
28 |
29 | RAW_SCHEMA_LOCATION = "src/raw_schema/schema.pbtxt"
30 |
31 |
32 | def parse_bq_record(bq_record):
33 | """Parses a bq_record to a dictionary."""
34 | output = {}
35 | for key in bq_record:
36 | output[key] = [bq_record[key]]
37 | return output
38 |
39 |
40 | def split_dataset(bq_row, num_partitions, ratio):
41 | """Returns a partition number for a given bq_row."""
42 | import json
43 |
44 | assert num_partitions == len(ratio)
45 | bucket = sum(map(ord, json.dumps(bq_row))) % sum(ratio)
46 | total = 0
47 | for i, part in enumerate(ratio):
48 | total += part
49 | if bucket < total:
50 | return i
51 | return len(ratio) - 1
52 |
53 |
54 | def run_transform_pipeline(args):
55 | """Runs a Beam pipeline to preprocess the data using TensorFlow Transform."""
56 |
57 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args)
58 |
59 | raw_data_query = args["raw_data_query"]
60 | write_raw_data = args["write_raw_data"]
61 | exported_data_prefix = args["exported_data_prefix"]
62 | transformed_data_prefix = args["transformed_data_prefix"]
63 | transform_artifact_dir = args["transform_artifact_dir"]
64 | temp_location = args["temp_location"]
65 | project = args["project"]
66 |
67 | source_raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION)
68 | raw_feature_spec = schema_utils.schema_as_feature_spec(
69 | source_raw_schema
70 | ).feature_spec
71 |
72 | raw_metadata = dataset_metadata.DatasetMetadata(
73 | schema_utils.schema_from_feature_spec(raw_feature_spec)
74 | )
75 |
76 | with beam.Pipeline(options=pipeline_options) as pipeline:
77 | with tft_beam.Context(temp_location):
78 |
79 | # Read raw BigQuery data.
80 | raw_train_data, raw_eval_data = (
81 | pipeline
82 | | "Read Raw Data"
83 | >> beam.io.ReadFromBigQuery(
84 | query=raw_data_query,
85 | project=project,
86 | use_standard_sql=True,
87 | )
88 | | "Parse Data" >> beam.Map(parse_bq_record)
89 | | "Split" >> beam.Partition(split_dataset, 2, ratio=[8, 2])
90 | )
91 |
92 | # Create a train_dataset from the data and schema.
93 | raw_train_dataset = (raw_train_data, raw_metadata)
94 |
95 |             # Analyze and transform raw_train_dataset to produce the transformed_train_dataset and the transform_fn.
96 | transformed_train_dataset, transform_fn = (
97 | raw_train_dataset
98 | | "Analyze & Transform"
99 | >> tft_beam.AnalyzeAndTransformDataset(transformations.preprocessing_fn)
100 | )
101 |
102 | # Get data and schema separately from the transformed_dataset.
103 | transformed_train_data, transformed_metadata = transformed_train_dataset
104 |
105 | # write transformed train data.
106 | _ = (
107 | transformed_train_data
108 | | "Write Transformed Train Data"
109 | >> beam.io.tfrecordio.WriteToTFRecord(
110 | file_path_prefix=os.path.join(
111 | transformed_data_prefix, "train/data"
112 | ),
113 | file_name_suffix=".gz",
114 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema),
115 | )
116 | )
117 |
118 | # Create a eval_dataset from the data and schema.
119 | raw_eval_dataset = (raw_eval_data, raw_metadata)
120 |
121 |             # Transform raw_eval_dataset to produce the transformed_eval_dataset using the transform_fn.
122 | transformed_eval_dataset = (
123 | raw_eval_dataset,
124 | transform_fn,
125 | ) | "Transform" >> tft_beam.TransformDataset()
126 |
127 | # Get data from the transformed_eval_dataset.
128 | transformed_eval_data, _ = transformed_eval_dataset
129 |
130 |             # write transformed eval data.
131 | _ = (
132 | transformed_eval_data
133 | | "Write Transformed Eval Data"
134 | >> beam.io.tfrecordio.WriteToTFRecord(
135 | file_path_prefix=os.path.join(transformed_data_prefix, "eval/data"),
136 | file_name_suffix=".gz",
137 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema),
138 | )
139 | )
140 |
141 | # Write transform_fn.
142 | _ = transform_fn | "Write Transform Artifacts" >> tft_beam.WriteTransformFn(
143 | transform_artifact_dir
144 | )
145 |
146 | if write_raw_data:
147 | # write raw eval data.
148 | _ = (
149 | raw_eval_data
150 | | "Write Raw Eval Data"
151 | >> beam.io.tfrecordio.WriteToTFRecord(
152 | file_path_prefix=os.path.join(exported_data_prefix, "data"),
153 | file_name_suffix=".tfrecord",
154 | coder=tft.coders.ExampleProtoCoder(raw_metadata.schema),
155 | )
156 | )
157 |
158 |
159 | def convert_to_jsonl(bq_record):
160 |     """Converts a bq_record to JSONL-formatted text."""
161 | import json
162 |
163 | output = {}
164 | for key in bq_record:
165 | output[key] = [bq_record[key]]
166 | return json.dumps(output)
167 |
168 |
169 | def run_extract_pipeline(args):
170 | """Runs a Beam pipeline to extract data from BigQuery as JSONL files."""
171 |
172 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args)
173 |
174 | sql_query = args["sql_query"]
175 | exported_data_prefix = args["exported_data_prefix"]
176 | temporary_dir = args["temporary_dir"]
177 | gcs_location = args["gcs_location"]
178 | project = args["project"]
179 |
180 | with beam.Pipeline(options=pipeline_options) as pipeline:
181 | with tft_beam.Context(temporary_dir):
182 |
183 | # Read BigQuery data.
184 | raw_data = (
185 | pipeline
186 | | "Read Data"
187 | >> beam.io.ReadFromBigQuery(
188 | query=sql_query,
189 | project=project,
190 | use_standard_sql=True,
191 | gcs_location=gcs_location,
192 | )
193 | | "Parse Data" >> beam.Map(convert_to_jsonl)
194 | )
195 |
196 | # Write raw data to GCS as JSONL files.
197 | _ = raw_data | "Write Data" >> beam.io.WriteToText(
198 | file_path_prefix=exported_data_prefix, file_name_suffix=".jsonl"
199 | )
200 |
201 |
202 | def parse_prediction_results(jsonl):
203 | """Parses JSONL prediction results to a dictionary."""
204 | import uuid
205 | import json
206 |
207 | prediction_results = json.loads(jsonl)["prediction"]
208 | prediction_id = str(uuid.uuid4())
209 | scores = prediction_results["scores"]
210 | classes = prediction_results["classes"]
211 |
212 | return {"prediction_id": prediction_id, "scores": scores, "classes": classes}
213 |
214 |
215 | def create_datastore_entity(prediction_response, kind):
216 | """Creates a Datastore entity."""
217 |
218 | from apache_beam.io.gcp.datastore.v1new.types import Entity
219 | from apache_beam.io.gcp.datastore.v1new.types import Key
220 |
221 | user_id = prediction_response.pop("prediction_id")
222 | key = Key([kind, user_id])
223 | prediction_entity = Entity(key)
224 | prediction_entity.set_properties(prediction_response)
225 | return prediction_entity
226 |
227 |
228 | def run_store_predictions_pipeline(args):
229 | """Runs a Beam pipeline to store JSONL data to Datastore."""
230 |
231 | project = args["project"]
232 | datastore_kind = args["datastore_kind"]
233 | prediction_results_uri = args["prediction_results_uri"]
234 |
235 | pipeline_options = beam.options.pipeline_options.PipelineOptions(args)
236 | with beam.Pipeline(options=pipeline_options) as pipeline:
237 | _ = (
238 | pipeline
239 | | "ReadFromJSONL" >> beam.io.ReadFromText(prediction_results_uri)
240 | | "ParsePredictionResults" >> beam.Map(parse_prediction_results)
241 | | "ConvertToDatastoreEntity"
242 | >> beam.Map(create_datastore_entity, datastore_kind)
243 | | "WriteToDatastore" >> WriteToDatastore(project=project)
244 | )
245 |
--------------------------------------------------------------------------------
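
A sketch of the argument dictionary run_transform_pipeline reads, using the DirectRunner and placeholder locations; the raw query would normally come from datasource_utils.get_training_source_query:

    from src.preprocessing import etl

    args = {
        "runner": "DirectRunner",
        # Normally produced by datasource_utils.get_training_source_query(...).
        "raw_data_query": "SELECT ... FROM my_dataset.my_table",  # placeholder
        "write_raw_data": False,
        "exported_data_prefix": "workdir/exported_data",
        "transformed_data_prefix": "workdir/transformed_data",
        "transform_artifact_dir": "workdir/transform_artifacts",
        "temp_location": "workdir/tmp",
        "project": "my-project",  # placeholder
    }

    etl.run_transform_pipeline(args)
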
/src/preprocessing/transformations.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TensorFlow Transform preprocessing function."""
15 |
16 | import tensorflow as tf
17 | import tensorflow_transform as tft
18 |
19 | from src.common import features
20 |
21 |
22 | def preprocessing_fn(inputs):
23 | """tf.transform's callback function for preprocessing inputs.
24 |
25 | Args:
26 | inputs: map from feature keys to raw not-yet-transformed features.
27 | Returns:
28 | Map from string feature key to transformed feature operations.
29 | """
30 |
31 | outputs = {}
32 |
33 | for key in features.FEATURE_NAMES:
34 | if key in features.NUMERICAL_FEATURE_NAMES:
35 | outputs[features.transformed_name(key)] = tft.scale_to_z_score(inputs[key])
36 |
37 | elif key in features.categorical_feature_names():
38 | outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary(
39 | inputs[key],
40 | num_oov_buckets=1,
41 | vocab_filename=key,
42 | )
43 |
44 | outputs[features.TARGET_FEATURE_NAME] = inputs[features.TARGET_FEATURE_NAME]
45 |
46 | for key in outputs:
47 | outputs[key] = tf.squeeze(outputs[key], -1)
48 |
49 | return outputs
50 |
--------------------------------------------------------------------------------
/src/raw_schema/schema.pbtxt:
--------------------------------------------------------------------------------
1 | feature {
2 | name: "trip_month"
3 | type: INT
4 | presence {
5 | min_fraction: 1.0
6 | min_count: 1
7 | }
8 | shape {
9 | dim {
10 | size: 1
11 | }
12 | }
13 | }
14 | feature {
15 | name: "trip_day"
16 | type: INT
17 | presence {
18 | min_fraction: 1.0
19 | min_count: 1
20 | }
21 | shape {
22 | dim {
23 | size: 1
24 | }
25 | }
26 | }
27 | feature {
28 | name: "trip_day_of_week"
29 | type: INT
30 | presence {
31 | min_fraction: 1.0
32 | min_count: 1
33 | }
34 | shape {
35 | dim {
36 | size: 1
37 | }
38 | }
39 | }
40 | feature {
41 | name: "trip_hour"
42 | type: INT
43 | presence {
44 | min_fraction: 1.0
45 | min_count: 1
46 | }
47 | shape {
48 | dim {
49 | size: 1
50 | }
51 | }
52 | }
53 | feature {
54 | name: "trip_seconds"
55 | type: INT
56 | presence {
57 | min_fraction: 1.0
58 | min_count: 1
59 | }
60 | shape {
61 | dim {
62 | size: 1
63 | }
64 | }
65 | }
66 | feature {
67 | name: "trip_miles"
68 | type: FLOAT
69 | presence {
70 | min_fraction: 1.0
71 | min_count: 1
72 | }
73 | shape {
74 | dim {
75 | size: 1
76 | }
77 | }
78 | }
79 | feature {
80 | name: "payment_type"
81 | type: BYTES
82 | domain: "payment_type"
83 | presence {
84 | min_fraction: 1.0
85 | min_count: 1
86 | }
87 | shape {
88 | dim {
89 | size: 1
90 | }
91 | }
92 | }
93 | feature {
94 | name: "pickup_grid"
95 | type: BYTES
96 | domain: "pickup_grid"
97 | presence {
98 | min_fraction: 1.0
99 | min_count: 1
100 | }
101 | shape {
102 | dim {
103 | size: 1
104 | }
105 | }
106 | }
107 | feature {
108 | name: "dropoff_grid"
109 | type: BYTES
110 | domain: "dropoff_grid"
111 | presence {
112 | min_fraction: 1.0
113 | min_count: 1
114 | }
115 | shape {
116 | dim {
117 | size: 1
118 | }
119 | }
120 | }
121 | feature {
122 | name: "euclidean"
123 | type: FLOAT
124 | presence {
125 | min_fraction: 1.0
126 | min_count: 1
127 | }
128 | shape {
129 | dim {
130 | size: 1
131 | }
132 | }
133 | }
134 | feature {
135 | name: "loc_cross"
136 | type: BYTES
137 | presence {
138 | min_fraction: 1.0
139 | min_count: 1
140 | }
141 | shape {
142 | dim {
143 | size: 1
144 | }
145 | }
146 | }
147 | feature {
148 | name: "tip_bin"
149 | type: INT
150 | bool_domain {
151 | }
152 | presence {
153 | min_fraction: 1.0
154 | min_count: 1
155 | }
156 | shape {
157 | dim {
158 | size: 1
159 | }
160 | }
161 | }
162 | string_domain {
163 | name: "payment_type"
164 | value: "Cash"
165 | value: "Credit Card"
166 | value: "Dispute"
167 | value: "Mobile"
168 | value: "No Charge"
169 | value: "Prcard"
170 | value: "Prepaid"
171 | value: "Unknown"
172 | }
173 | string_domain {
174 | name: "pickup_grid"
175 | value: "POINT(-87.5 41.7)"
176 | value: "POINT(-87.6 41.7)"
177 | value: "POINT(-87.6 41.8)"
178 | value: "POINT(-87.6 41.9)"
179 | value: "POINT(-87.6 42)"
180 | value: "POINT(-87.7 41.7)"
181 | value: "POINT(-87.7 41.8)"
182 | value: "POINT(-87.7 41.9)"
183 | value: "POINT(-87.7 42)"
184 | value: "POINT(-87.8 41.8)"
185 | value: "POINT(-87.8 41.9)"
186 | value: "POINT(-87.8 42)"
187 | value: "POINT(-87.9 42)"
188 | }
189 | string_domain {
190 | name: "dropoff_grid"
191 | value: "POINT(-87.5 41.7)"
192 | value: "POINT(-87.6 41.7)"
193 | value: "POINT(-87.6 41.8)"
194 | value: "POINT(-87.6 41.9)"
195 | value: "POINT(-87.6 42)"
196 | value: "POINT(-87.7 41.7)"
197 | value: "POINT(-87.7 41.8)"
198 | value: "POINT(-87.7 41.9)"
199 | value: "POINT(-87.7 42)"
200 | value: "POINT(-87.8 41.8)"
201 | value: "POINT(-87.8 41.9)"
202 | value: "POINT(-87.8 42)"
203 | value: "POINT(-87.9 42)"
204 | }
205 |
--------------------------------------------------------------------------------
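
This schema is consumed by exporter.py and etl.py through TFDV; a small sketch of loading it and deriving the raw feature spec:

    import tensorflow_data_validation as tfdv
    from tensorflow_transform.tf_metadata import schema_utils

    raw_schema = tfdv.load_schema_text("src/raw_schema/schema.pbtxt")
    feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec

    print(feature_spec["trip_miles"])    # FixedLenFeature(shape=(1,), dtype=tf.float32, ...)
    print(feature_spec["payment_type"])  # FixedLenFeature(shape=(1,), dtype=tf.string, ...)
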
/src/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/tests/__init__.py
--------------------------------------------------------------------------------
/src/tests/datasource_utils_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test utilities for generating BigQuery data querying scirpts."""
15 |
16 | import sys
17 | import os
18 | import logging
19 | from google.cloud import bigquery
20 |
21 | from src.common import datasource_utils
22 |
23 | root = logging.getLogger()
24 | root.setLevel(logging.INFO)
25 | handler = logging.StreamHandler(sys.stdout)
26 | handler.setLevel(logging.INFO)
27 | root.addHandler(handler)
28 |
29 | LIMIT = 100
30 |
31 | TARGET_COLUMN = "tip_bin"
32 |
33 | EXPECTED_TRAINING_COLUMNS = [
34 | "trip_month",
35 | "trip_day",
36 | "trip_day_of_week",
37 | "trip_hour",
38 | "trip_seconds",
39 | "trip_miles",
40 | "payment_type",
41 | "pickup_grid",
42 | "dropoff_grid",
43 | "euclidean",
44 | "loc_cross",
45 | "tip_bin",
46 | ]
47 |
48 |
49 | MISSING = {
50 | "trip_month": -1,
51 | "trip_day": -1,
52 | "trip_day_of_week": -1,
53 | "trip_hour": -1,
54 | "trip_seconds": -1,
55 | "trip_miles": -1,
56 | "payment_type": "NA",
57 | "pickup_grid": "NA",
58 | "dropoff_grid": "NA",
59 | "euclidean": -1,
60 | "loc_cross": "NA",
61 | }
62 |
63 |
64 | def test_training_query():
65 |
66 | project = os.getenv("PROJECT")
67 | location = os.getenv("BQ_LOCATION")
68 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME")
69 |
70 | assert project, "Environment variable PROJECT is None!"
71 | assert location, "Environment variable BQ_LOCATION is None!"
72 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!"
73 |
74 | logging.info(f"Dataset: {dataset_display_name}")
75 |
76 |     query = datasource_utils.get_training_source_query(
77 |         project=project,
78 |         region=location,  # project/region are accepted but not used by the query builder
79 |         dataset_display_name=dataset_display_name,
80 |         ml_use="UNASSIGNED",
81 |         limit=LIMIT,
82 |     )
83 |
84 | bq_client = bigquery.Client(project=project, location=location)
85 | df = bq_client.query(query).to_dataframe()
86 | columns = set(df.columns)
87 | assert columns == set(EXPECTED_TRAINING_COLUMNS)
88 | assert df.shape == (LIMIT, 12)
89 |
90 |
91 | def test_serving_query():
92 |
93 | project = os.getenv("PROJECT")
94 | location = os.getenv("BQ_LOCATION")
95 | bq_dataset_name = os.getenv("BQ_DATASET_NAME")
96 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME")
97 |
98 | assert project, "Environment variable PROJECT is None!"
99 | assert location, "Environment variable BQ_LOCATION is None!"
100 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!"
101 |
102 | logging.info(f"Dataset: {dataset_display_name}")
103 |
104 |     # Assumes the serving table name is provided via a BQ_TABLE_NAME environment variable.
105 |     query = datasource_utils.get_serving_source_query(
106 |         bq_dataset_name=bq_dataset_name,
107 |         bq_table_name=os.getenv("BQ_TABLE_NAME"),
108 |         limit=LIMIT,
109 |     )
110 |
111 | bq_client = bigquery.Client(project=project, location=location)
112 | df = bq_client.query(query).to_dataframe()
113 | columns = set(df.columns)
114 | expected_serving_columns = EXPECTED_TRAINING_COLUMNS
115 | expected_serving_columns.remove(TARGET_COLUMN)
116 | assert columns == set(expected_serving_columns)
117 | assert df.shape == (LIMIT, 11)
118 |
--------------------------------------------------------------------------------
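
These tests run against live GCP resources. A hedged sketch of invoking them with the required environment variables (all values are placeholders for a project where the Vertex dataset and BigQuery tables exist):

    import os

    import pytest

    os.environ["PROJECT"] = "my-project"
    os.environ["BQ_LOCATION"] = "US"
    os.environ["BQ_DATASET_NAME"] = "my_bq_dataset"
    os.environ["BQ_TABLE_NAME"] = "my_bq_table"  # table used by the serving-query test
    os.environ["DATASET_DISPLAY_NAME"] = "chicago-taxi-tips"

    pytest.main(["src/tests/datasource_utils_tests.py", "-s"])
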
/src/tests/etl_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test data processing."""
15 |
16 | import sys
17 | import os
18 | import logging
19 | import tensorflow_transform as tft
20 | import tensorflow as tf
21 | from tensorflow.io import FixedLenFeature
22 |
23 | from src.preprocessing import etl
24 | from src.common import datasource_utils
25 |
26 | root = logging.getLogger()
27 | root.setLevel(logging.INFO)
28 | handler = logging.StreamHandler(sys.stdout)
29 | handler.setLevel(logging.INFO)
30 | root.addHandler(handler)
31 |
32 | OUTPUT_DIR = "test_etl_output_dir"
33 | ML_USE = "UNASSIGNED"
34 | LIMIT = 100
35 |
36 | EXPECTED_FEATURE_SPEC = {
37 | "dropoff_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
38 | "euclidean_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
39 | "loc_cross_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
40 | "payment_type_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
41 | "pickup_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
42 | "tip_bin": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
43 | "trip_day_of_week_xf": FixedLenFeature(
44 | shape=[], dtype=tf.int64, default_value=None
45 | ),
46 | "trip_day_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
47 | "trip_hour_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
48 | "trip_miles_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
49 | "trip_month_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
50 | "trip_seconds_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
51 | }
52 |
53 |
54 | def test_transform_pipeline():
55 |
56 | project = os.getenv("PROJECT")
57 | region = os.getenv("REGION")
58 | bucket = os.getenv("BUCKET")
59 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME")
60 |
61 | assert project, "Environment variable PROJECT is None!"
62 | assert region, "Environment variable REGION is None!"
63 | assert bucket, "Environment variable BUCKET is None!"
64 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!"
65 |
66 | os.mkdir(OUTPUT_DIR)
67 |
68 | exported_data_dir = os.path.join(OUTPUT_DIR, "exported_data")
69 | transformed_data_dir = os.path.join(OUTPUT_DIR, "transformed_data")
70 | transform_artifacts_dir = os.path.join(OUTPUT_DIR, "transform_artifacts")
71 | temporary_dir = os.path.join(OUTPUT_DIR, "tmp")
72 |
73 | raw_data_query = datasource_utils.get_training_source_query(
74 | project=project,
75 | region=region,
76 | dataset_display_name=dataset_display_name,
77 | ml_use=ML_USE,
78 | limit=LIMIT,
79 | )
80 |
81 | args = {
82 | "runner": "DirectRunner",
83 | "raw_data_query": raw_data_query,
84 | "write_raw_data": False,
85 | "exported_data_prefix": exported_data_dir,
86 | "transformed_data_prefix": transformed_data_dir,
87 |         "transform_artifact_dir": transform_artifacts_dir,
88 |         "temp_location": temporary_dir,
89 | "gcs_location": f"gs://{bucket}/bq_tmp",
90 | "project": project,
91 | }
92 |
93 | logging.info(f"Transform pipeline args: {args}")
94 | etl.run_transform_pipeline(args)
95 | logging.info(f"Transform pipeline finished.")
96 |
97 | tft_output = tft.TFTransformOutput(transform_artifacts_dir)
98 | transform_feature_spec = tft_output.transformed_feature_spec()
99 | assert transform_feature_spec == EXPECTED_FEATURE_SPEC
100 |
--------------------------------------------------------------------------------
/src/tests/model_deployment_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test an uploaded model to Vertex AI."""
15 |
16 | import os
17 | import logging
18 | import tensorflow as tf
19 |
20 | test_instance = {
21 | "dropoff_grid": ["POINT(-87.6 41.9)"],
22 | "euclidean": [2064.2696],
23 | "loc_cross": [""],
24 | "payment_type": ["Credit Card"],
25 | "pickup_grid": ["POINT(-87.6 41.9)"],
26 | "trip_miles": [1.37],
27 | "trip_day": [12],
28 | "trip_hour": [16],
29 | "trip_month": [2],
30 | "trip_day_of_week": [4],
31 | "trip_seconds": [555],
32 | }
33 |
34 | SERVING_DEFAULT_SIGNATURE_NAME = "serving_default"
35 |
36 | from google.cloud import aiplatform as vertex_ai
37 |
38 |
39 | def test_model_artifact():
40 |
41 | feature_types = {
42 | "dropoff_grid": tf.dtypes.string,
43 | "euclidean": tf.dtypes.float32,
44 | "loc_cross": tf.dtypes.string,
45 | "payment_type": tf.dtypes.string,
46 | "pickup_grid": tf.dtypes.string,
47 | "trip_miles": tf.dtypes.float32,
48 | "trip_day": tf.dtypes.int64,
49 | "trip_hour": tf.dtypes.int64,
50 | "trip_month": tf.dtypes.int64,
51 | "trip_day_of_week": tf.dtypes.int64,
52 | "trip_seconds": tf.dtypes.int64,
53 | }
54 |
55 | new_test_instance = dict()
56 | for key in test_instance:
57 | new_test_instance[key] = tf.constant(
58 | [test_instance[key]], dtype=feature_types[key]
59 | )
60 |
61 |     logging.info(new_test_instance)
62 |
63 | project = os.getenv("PROJECT")
64 | region = os.getenv("REGION")
65 | model_display_name = os.getenv("MODEL_DISPLAY_NAME")
66 |
67 | assert project, "Environment variable PROJECT is None!"
68 | assert region, "Environment variable REGION is None!"
69 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!"
70 |
71 | vertex_ai.init(
72 | project=project,
73 | location=region,
74 | )
75 |
76 | models = vertex_ai.Model.list(
77 | filter=f"display_name={model_display_name}", order_by="update_time"
78 | )
79 |
80 | assert models, f"No model with display name {model_display_name} exists!"
81 |
82 | model = models[-1]
83 | artifact_uri = model.gca_resource.artifact_uri
84 | logging.info(f"Model artifact uri:{artifact_uri}")
85 | assert tf.io.gfile.exists(
86 | artifact_uri
87 | ), f"Model artifact uri {artifact_uri} does not exist!"
88 |
89 | saved_model = tf.saved_model.load(artifact_uri)
90 | logging.info("Model loaded successfully.")
91 |
92 | assert (
93 | SERVING_DEFAULT_SIGNATURE_NAME in saved_model.signatures
94 | ), f"{SERVING_DEFAULT_SIGNATURE_NAME} not in model signatures!"
95 |
96 |     prediction_fn = saved_model.signatures[SERVING_DEFAULT_SIGNATURE_NAME]
97 |     predictions = prediction_fn(**new_test_instance)
98 |     logging.info("Model produced predictions.")
99 |
100 |     keys = ["classes", "scores"]
101 |     for key in keys:
102 |         assert key in predictions, f"{key} not in prediction outputs!"
103 |
104 | assert predictions["classes"].shape == (
105 | 1,
106 | 2,
107 | ), f"Invalid output classes shape: {predictions['classes'].shape}!"
108 | assert predictions["scores"].shape == (
109 | 1,
110 | 2,
111 | ), f"Invalid output scores shape: {predictions['scores'].shape}!"
112 | logging.info(f"Prediction output: {predictions}")
113 |
114 |
115 | def test_model_endpoint():
116 |
117 | project = os.getenv("PROJECT")
118 | region = os.getenv("REGION")
119 | model_display_name = os.getenv("MODEL_DISPLAY_NAME")
120 | endpoint_display_name = os.getenv("ENDPOINT_DISPLAY_NAME")
121 |
122 | assert project, "Environment variable PROJECT is None!"
123 | assert region, "Environment variable REGION is None!"
124 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!"
125 | assert endpoint_display_name, "Environment variable ENDPOINT_DISPLAY_NAME is None!"
126 |
127 | endpoints = vertex_ai.Endpoint.list(
128 | filter=f"display_name={endpoint_display_name}", order_by="update_time"
129 | )
130 |     assert (
131 |         endpoints
132 |     ), f"Endpoint with display name {endpoint_display_name} does not exist in region {region}!"
133 |
134 | endpoint = endpoints[-1]
135 | logging.info(f"Calling endpoint: {endpoint}.")
136 |
137 | prediction = endpoint.predict([test_instance]).predictions[0]
138 |
139 | keys = ["classes", "scores"]
140 | for key in keys:
141 |         assert key in prediction, f"{key} not in prediction outputs!"
142 |
143 | assert (
144 | len(prediction["classes"]) == 2
145 | ), f"Invalid number of output classes: {len(prediction['classes'])}!"
146 | assert (
147 | len(prediction["scores"]) == 2
148 | ), f"Invalid number output scores: {len(prediction['scores'])}!"
149 |
150 | logging.info(f"Prediction output: {prediction}")
151 |
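A minimal sketch of how these deployment tests might be run locally, assuming pytest is installed and the named Vertex AI resources already exist; the project, region, and display names below are placeholders:

import os
import pytest

# Placeholders: point these at an existing Vertex AI model and endpoint.
os.environ["PROJECT"] = "my-gcp-project"
os.environ["REGION"] = "us-central1"
os.environ["MODEL_DISPLAY_NAME"] = "chicago-taxi-tips-classifier"
os.environ["ENDPOINT_DISPLAY_NAME"] = "chicago-taxi-tips-classifier"

# test_model_endpoint reuses the vertex_ai.init() call made in
# test_model_artifact, so run the whole module rather than a single test.
pytest.main(["-s", "src/tests/model_deployment_tests.py"])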
--------------------------------------------------------------------------------
/src/tests/model_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test model functions."""
15 |
16 | import sys
17 | import os
18 | import logging
19 | import tensorflow_transform as tft
20 | import tensorflow as tf
21 | from tensorflow.io import FixedLenFeature
22 |
23 | from src.common import features
24 | from src.model_training import model, defaults
25 |
26 | root = logging.getLogger()
27 | root.setLevel(logging.INFO)
28 | handler = logging.StreamHandler(sys.stdout)
29 | handler.setLevel(logging.INFO)
30 | root.addHandler(handler)
31 |
32 | EXPECTED_HYPERPARAMS_KEYS = [
33 | "hidden_units",
34 | "learning_rate",
35 | "batch_size",
36 | "num_epochs",
37 | ]
38 |
39 |
40 | def test_hyperparams_defaults():
41 | hyperparams = {"hidden_units": [64, 32]}
42 |
43 | hyperparams = defaults.update_hyperparams(hyperparams)
44 | assert set(hyperparams.keys()) == set(EXPECTED_HYPERPARAMS_KEYS)
45 |
46 |
47 | def test_create_binary_classifier():
48 |
49 |     hyperparams = defaults.update_hyperparams(dict())
50 |
51 | model_inputs = {
52 | "dropoff_grid_xf": tf.convert_to_tensor([0, 0, 0]),
53 | "euclidean_xf": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]),
54 | "loc_cross_xf": tf.convert_to_tensor([0, 0, 0]),
55 | "payment_type_xf": tf.convert_to_tensor([1, 0, 0]),
56 | "pickup_grid_xf": tf.convert_to_tensor([0, 0, 0]),
57 | "trip_day_of_week_xf": tf.convert_to_tensor([5, 4, 4]),
58 | "trip_day_xf": tf.convert_to_tensor([26, 24, 1]),
59 | "trip_hour_xf": tf.convert_to_tensor([0, 4, 2]),
60 | "trip_miles_xf": tf.convert_to_tensor([5.9717827, -0.7121308, -0.7601589]),
61 | "trip_month_xf": tf.convert_to_tensor([4, 3, 4]),
62 | "trip_seconds_xf": tf.convert_to_tensor([4.9029775, -0.34146854, -0.34479955]),
63 | }
64 |
65 | feature_vocab_sizes = {
66 | feature_name: 100 for feature_name in features.categorical_feature_names()
67 | }
68 | classifier = model._create_binary_classifier(feature_vocab_sizes, hyperparams)
69 | model_outputs = classifier(model_inputs) # .numpy()
70 | assert model_outputs.shape == (3, 1)
71 | assert model_outputs.dtype == "float32"
72 |
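As a reading aid, a small sketch of the behavior test_hyperparams_defaults relies on: update_hyperparams is expected to fill any keys missing from the supplied dict with the module defaults (NUM_EPOCHS, BATCH_SIZE, LEARNING_RATE, HIDDEN_UNITS); that explicitly provided values are kept unchanged is an assumption here, not asserted by the test above.

from src.model_training import defaults

# Missing keys are filled from the module-level defaults; the supplied
# hidden_units value is assumed to be kept as-is.
hyperparams = defaults.update_hyperparams({"hidden_units": [64, 32]})
assert hyperparams["hidden_units"] == [64, 32]
assert set(hyperparams) == {"hidden_units", "learning_rate", "batch_size", "num_epochs"}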
--------------------------------------------------------------------------------
/src/tests/pipeline_deployment_tests.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Test training pipeline using local runner."""
15 |
16 | import sys
17 | import os
18 | from tfx.orchestration.local.local_dag_runner import LocalDagRunner
19 | import tensorflow as tf
20 | from ml_metadata.proto import metadata_store_pb2
21 | import logging
22 |
23 | from src.tfx_pipelines import config
24 | from src.tfx_pipelines import training_pipeline
25 |
26 | root = logging.getLogger()
27 | root.setLevel(logging.INFO)
28 | handler = logging.StreamHandler(sys.stdout)
29 | handler.setLevel(logging.INFO)
30 | root.addHandler(handler)
31 |
32 | MLMD_SQLITE = "mlmd.sqlite"
33 | NUM_EPOCHS = 1
34 | BATCH_SIZE = 512
35 | LEARNING_RATE = 0.001
36 | HIDDEN_UNITS = "128,128"
37 |
38 |
39 | def test_e2e_pipeline():
40 |
41 | project = os.getenv("PROJECT")
42 | region = os.getenv("REGION")
43 | model_display_name = os.getenv("MODEL_DISPLAY_NAME")
44 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME")
45 | gcs_location = os.getenv("GCS_LOCATION")
46 | model_registry = os.getenv("MODEL_REGISTRY_URI")
47 | upload_model = os.getenv("UPLOAD_MODEL")
48 |
49 | assert project, "Environment variable PROJECT is None!"
50 | assert region, "Environment variable REGION is None!"
51 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!"
52 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!"
53 | assert gcs_location, "Environment variable GCS_LOCATION is None!"
54 | assert model_registry, "Environment variable MODEL_REGISTRY_URI is None!"
55 |
56 | logging.info(f"upload_model: {upload_model}")
57 | if tf.io.gfile.exists(gcs_location):
58 | tf.io.gfile.rmtree(gcs_location)
59 | logging.info(f"Pipeline e2e test artifacts stored in: {gcs_location}")
60 |
61 |     if tf.io.gfile.exists(MLMD_SQLITE):
62 |         tf.io.gfile.remove(MLMD_SQLITE)
63 |
64 | metadata_connection_config = metadata_store_pb2.ConnectionConfig()
65 |     metadata_connection_config.sqlite.filename_uri = MLMD_SQLITE
66 | metadata_connection_config.sqlite.connection_mode = 3
67 | logging.info("ML metadata store is ready.")
68 |
69 | pipeline_root = os.path.join(
70 | config.ARTIFACT_STORE_URI,
71 | config.PIPELINE_NAME,
72 | )
73 |
74 | runner = LocalDagRunner()
75 |
76 | pipeline = training_pipeline.create_pipeline(
77 | pipeline_root=pipeline_root,
78 | num_epochs=NUM_EPOCHS,
79 | batch_size=BATCH_SIZE,
80 | learning_rate=LEARNING_RATE,
81 | hidden_units=HIDDEN_UNITS,
82 | metadata_connection_config=metadata_connection_config,
83 | )
84 |
85 | runner.run(pipeline)
86 |
87 | logging.info(f"Model output: {os.path.join(model_registry, model_display_name)}")
88 | assert tf.io.gfile.exists(os.path.join(model_registry, model_display_name))
89 |
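A small sketch of the same MLMD connection set up with the named enum instead of the literal connection_mode value; this assumes the standard ml-metadata proto enum and is equivalent to the configuration above:

from ml_metadata.proto import metadata_store_pb2

# READWRITE_OPENCREATE (value 3) opens the local SQLite-backed MLMD store
# for read/write and creates the file if it does not exist yet.
connection_config = metadata_store_pb2.ConnectionConfig()
connection_config.sqlite.filename_uri = "mlmd.sqlite"
connection_config.sqlite.connection_mode = (
    metadata_store_pb2.SqliteMetadataSourceConfig.READWRITE_OPENCREATE
)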
--------------------------------------------------------------------------------
/src/tfx_pipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/tfx_pipelines/__init__.py
--------------------------------------------------------------------------------
/src/tfx_pipelines/components.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX Custom Python Components."""
15 |
16 |
17 | import sys
18 | import os
19 | import json
20 | import logging
21 | import tensorflow as tf
22 |
23 | from tfx.types import artifact_utils
24 | from tfx.utils import io_utils
25 | from tfx.dsl.component.experimental.decorators import component
26 | from tfx.dsl.component.experimental.annotations import (
27 | InputArtifact,
28 | OutputArtifact,
29 | Parameter,
30 | )
31 | from tfx.types.standard_artifacts import HyperParameters
32 | from tfx.types.experimental.simple_artifacts import File as UploadedModel
33 | from tfx.types.experimental.simple_artifacts import Dataset
34 |
35 | from google.cloud import aiplatform as vertex_ai
36 |
37 | SCRIPT_DIR = os.path.dirname(
38 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
39 | )
40 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, "..")))
41 |
42 | from src.preprocessing import etl
43 |
44 |
45 | HYPERPARAM_FILENAME = "hyperparameters.json"
46 | SERVING_DATA_PREFIX = "serving-data-"
47 | PREDICTION_RESULTS_PREFIX = "prediction.results-*"
48 |
49 |
50 | @component
51 | def hyperparameters_gen(
52 | num_epochs: Parameter[int],
53 | batch_size: Parameter[int],
54 | learning_rate: Parameter[float],
55 | hidden_units: Parameter[str],
56 | hyperparameters: OutputArtifact[HyperParameters],
57 | ):
58 | """A TFX custom-Python-function component for receiving hyperparameters."""
59 |
60 | hp_dict = dict()
61 | hp_dict["num_epochs"] = num_epochs
62 | hp_dict["batch_size"] = batch_size
63 | hp_dict["learning_rate"] = learning_rate
64 | hp_dict["hidden_units"] = [int(units) for units in hidden_units.split(",")]
65 | logging.info(f"Hyperparameters: {hp_dict}")
66 |
67 | hyperparams_uri = os.path.join(
68 | artifact_utils.get_single_uri([hyperparameters]), HYPERPARAM_FILENAME
69 | )
70 | io_utils.write_string_file(hyperparams_uri, json.dumps(hp_dict))
71 | logging.info(f"Hyperparameters are written to: {hyperparams_uri}")
72 |
73 |
74 | @component
75 | def vertex_model_uploader(
76 | project: Parameter[str],
77 | region: Parameter[str],
78 | model_display_name: Parameter[str],
79 | pushed_model_location: Parameter[str],
80 | serving_image_uri: Parameter[str],
81 | explanation_config: Parameter[str],
82 | uploaded_model: OutputArtifact[UploadedModel],
83 | ):
84 | """A TFX custom-Python-function component to upload the model to Vertex."""
85 |
86 | vertex_ai.init(project=project, location=region)
87 |
88 | pushed_model_dir = os.path.join(
89 | pushed_model_location, tf.io.gfile.listdir(pushed_model_location)[-1]
90 | )
91 |
92 | logging.info(f"Model registry location: {pushed_model_dir}")
93 |
94 |     try:
95 |         explanation_config = json.loads(explanation_config)
96 |         explanation_metadata = vertex_ai.explain.ExplanationMetadata(
97 |             inputs=explanation_config["inputs"], outputs=explanation_config["outputs"]
98 |         )
99 |         explanation_parameters = vertex_ai.explain.ExplanationParameters(
100 |             explanation_config["params"]
101 |         )
102 |     except (ValueError, KeyError, TypeError):
103 |         explanation_metadata = None
104 |         explanation_parameters = None
105 |
106 | vertex_model = vertex_ai.Model.upload(
107 | display_name=model_display_name,
108 | artifact_uri=pushed_model_dir,
109 | serving_container_image_uri=serving_image_uri,
110 | parameters_schema_uri=None,
111 | instance_schema_uri=None,
112 | explanation_metadata=explanation_metadata,
113 | explanation_parameters=explanation_parameters,
114 | )
115 |
116 | model_uri = vertex_model.gca_resource.name
117 | logging.info(f"Model uploaded to AI Platform: {model_uri}")
118 | uploaded_model.set_string_custom_property("model_uri", model_uri)
119 |
120 |
121 | @component
122 | def bigquery_data_gen(
123 | sql_query: Parameter[str],
124 | output_data_format: Parameter[str],
125 | beam_args: Parameter[str],
126 | serving_dataset: OutputArtifact[Dataset],
127 | ):
128 | """A TFX custom-Python-function component for extracting data from BigQuery."""
129 |
130 | output_dir = os.path.join(
131 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX
132 | )
133 |
134 | pipeline_args = json.loads(beam_args)
135 | pipeline_args["sql_query"] = sql_query
136 | pipeline_args["exported_data_prefix"] = output_dir
137 | pipeline_args["output_data_format"] = output_data_format
138 |
139 | logging.info("Data extraction started. Source query:")
140 | logging.info("{sql_query}")
141 | etl.run_extract_pipeline(pipeline_args)
142 | logging.info("Data extraction completed.")
143 |
144 |
145 | @component
146 | def vertex_batch_prediction(
147 | project: Parameter[str],
148 | region: Parameter[str],
149 | model_display_name: Parameter[str],
150 | instances_format: Parameter[str],
151 | predictions_format: Parameter[str],
152 | job_resources: Parameter[str],
153 | serving_dataset: InputArtifact[Dataset],
154 | prediction_results: OutputArtifact[Dataset],
155 | ):
156 | """A TFX custom-Python-function component to submit a Vertex batch prediction Job."""
157 |
158 | job_resources = json.loads(job_resources)
159 | gcs_source_pattern = (
160 | os.path.join(
161 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX
162 | )
163 | + "*.jsonl"
164 | )
165 |
166 | gcs_destination_prefix = artifact_utils.get_single_uri([prediction_results])
167 |
168 |     vertex_ai.init(project=project, location=region)
169 |     logging.info("Submitting Vertex AI batch prediction job...")
170 |     model = vertex_ai.Model.list(filter=f"display_name={model_display_name}", order_by="update_time")[-1]
171 |     batch_prediction_job = model.batch_predict(
172 |         job_display_name=f"{model_display_name}-batch-prediction",
173 |         gcs_source=gcs_source_pattern,
174 |         gcs_destination_prefix=gcs_destination_prefix,
175 |         instances_format=instances_format, predictions_format=predictions_format,
176 |         **job_resources,
177 |     )
178 | logging.info("Batch prediction job completed.")
179 | prediction_results.set_string_custom_property(
180 | "batch_prediction_job", batch_prediction_job.gca_resource.name
181 | )
182 |
183 |
184 | @component
185 | def datastore_prediction_writer(
186 | datastore_kind: Parameter[str],
187 | predictions_format: Parameter[str],
188 | beam_args: Parameter[str],
189 | prediction_results: InputArtifact[Dataset],
190 | ):
191 | """A TFX custom-Python-function component for writing prediction JSONL files to Datastore."""
192 |
193 | prediction_results_dir = os.path.join(
194 | artifact_utils.get_single_uri([prediction_results])
195 | )
196 | prediction_results_dir = os.path.join(
197 | prediction_results_dir, tf.io.gfile.listdir(prediction_results_dir)[0]
198 | )
199 | prediction_results_uri = os.path.join(
200 | prediction_results_dir, PREDICTION_RESULTS_PREFIX
201 | )
202 |
203 | pipeline_args = json.loads(beam_args)
204 | pipeline_args["prediction_results_uri"] = prediction_results_uri
205 | pipeline_args["datastore_kind"] = datastore_kind
206 | pipeline_args["predictions_format"] = predictions_format
207 |
208 | logging.info(f"Storing predictions to Datastore kind: {datastore_kind}")
209 | etl.run_store_predictions_pipeline(pipeline_args)
210 | logging.info("Predictions are stored.")
211 |
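For reference, a quick sketch of the hyperparameters file that hyperparameters_gen writes, derived directly from the hp_dict construction above; the parameter values are illustrative only:

import json

# Mirrors hp_dict in hyperparameters_gen for example inputs
# num_epochs=10, batch_size=512, learning_rate=0.001, hidden_units="128,128".
hp_dict = {
    "num_epochs": 10,
    "batch_size": 512,
    "learning_rate": 0.001,
    "hidden_units": [int(units) for units in "128,128".split(",")],
}
print(json.dumps(hp_dict))
# {"num_epochs": 10, "batch_size": 512, "learning_rate": 0.001, "hidden_units": [128, 128]}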
--------------------------------------------------------------------------------
/src/tfx_pipelines/config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX pipeline configurations."""
15 |
16 | import os
17 |
18 | PROJECT_ID = os.getenv("PROJECT_ID", "ksalama-cloudml")
19 | REGION = os.getenv("REGION", "us-central1")
20 | GCS_LOCATION = os.getenv("GCS_LOCATION", "gs://ksalama-cloudml-us/chicago-taxi-tips")
21 |
22 | ARTIFACT_STORE_URI = os.path.join(GCS_LOCATION, "tfx_artifacts")
23 | MODEL_REGISTRY_URI = os.getenv(
24 | "MODEL_REGISTRY_URI",
25 | os.path.join(GCS_LOCATION, "model_registry"),
26 | )
27 |
28 | DATASET_DISPLAY_NAME = os.getenv("DATASET_DISPLAY_NAME", "chicago-taxi-tips")
29 | MODEL_DISPLAY_NAME = os.getenv(
30 | "MODEL_DISPLAY_NAME", f"{DATASET_DISPLAY_NAME}-classifier"
31 | )
32 | PIPELINE_NAME = os.getenv("PIPELINE_NAME", f"{MODEL_DISPLAY_NAME}-train-pipeline")
33 |
34 | ML_USE_COLUMN = "ml_use"
35 | EXCLUDE_COLUMNS = ",".join(["trip_start_timestamp"])
36 | TRAIN_LIMIT = os.getenv("TRAIN_LIMIT", "0")
37 | TEST_LIMIT = os.getenv("TEST_LIMIT", "0")
38 | SERVE_LIMIT = os.getenv("SERVE_LIMIT", "0")
39 |
40 | NUM_TRAIN_SPLITS = os.getenv("NUM_TRAIN_SPLITS", "4")
41 | NUM_EVAL_SPLITS = os.getenv("NUM_EVAL_SPLITS", "1")
42 | ACCURACY_THRESHOLD = os.getenv("ACCURACY_THRESHOLD", "0.8")
43 |
44 | USE_KFP_SA = os.getenv("USE_KFP_SA", "False")
45 |
46 | TFX_IMAGE_URI = os.getenv(
47 | "TFX_IMAGE_URI", f"gcr.io/{PROJECT_ID}/tfx-{DATASET_DISPLAY_NAME}:latest"
48 | )
49 |
50 | BEAM_RUNNER = os.getenv("BEAM_RUNNER", "DirectRunner")
51 | BEAM_DIRECT_PIPELINE_ARGS = [
52 | f"--project={PROJECT_ID}",
53 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}",
54 | ]
55 | BEAM_DATAFLOW_PIPELINE_ARGS = [
56 | f"--project={PROJECT_ID}",
57 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}",
58 | f"--region={REGION}",
59 | f"--runner={BEAM_RUNNER}",
60 | ]
61 |
62 |
63 | TRAINING_RUNNER = os.getenv("TRAINING_RUNNER", "local")
64 | AI_PLATFORM_TRAINING_ARGS = {
65 | "project": PROJECT_ID,
66 | "region": REGION,
67 | "masterConfig": {"imageUri": TFX_IMAGE_URI},
68 | }
69 |
70 |
71 | SERVING_RUNTIME = os.getenv("SERVING_RUNTIME", "tf2-cpu.2-4")
72 | SERVING_IMAGE_URI = f"gcr.io/cloud-aiplatform/prediction/{SERVING_RUNTIME}:latest"
73 |
74 | BATCH_PREDICTION_BQ_DATASET_NAME = os.getenv(
75 | "BATCH_PREDICTION_BQ_DATASET_NAME", "playground_us"
76 | )
77 | BATCH_PREDICTION_BQ_TABLE_NAME = os.getenv(
78 | "BATCH_PREDICTION_BQ_TABLE_NAME", "chicago_taxitrips_prep"
79 | )
80 | BATCH_PREDICTION_BEAM_ARGS = {
81 | "runner": f"{BEAM_RUNNER}",
82 | "temporary_dir": os.path.join(GCS_LOCATION, "temp"),
83 | "gcs_location": os.path.join(GCS_LOCATION, "temp"),
84 | "project": PROJECT_ID,
85 | "region": REGION,
86 | "setup_file": "./setup.py",
87 | }
88 | BATCH_PREDICTION_JOB_RESOURCES = {
89 | "machine_type": "n1-standard-2",
90 | #'accelerator_count': 1,
91 | #'accelerator_type': 'NVIDIA_TESLA_T4'
92 | "starting_replica_count": 1,
93 | "max_replica_count": 10,
94 | }
95 | DATASTORE_PREDICTION_KIND = f"{MODEL_DISPLAY_NAME}-predictions"
96 |
97 | ENABLE_CACHE = os.getenv("ENABLE_CACHE", "0")
98 | UPLOAD_MODEL = os.getenv("UPLOAD_MODEL", "1")
99 |
100 | os.environ["PROJECT_ID"] = PROJECT_ID
101 | os.environ["PIPELINE_NAME"] = PIPELINE_NAME
102 | os.environ["TFX_IMAGE_URI"] = TFX_IMAGE_URI
103 | os.environ["MODEL_REGISTRY_URI"] = MODEL_REGISTRY_URI
104 |
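Because every value above is resolved at import time via os.getenv, overrides have to be exported before the module is imported. A minimal sketch, using placeholder project and bucket names:

import os

# Placeholders: set before importing src.tfx_pipelines.config.
os.environ["PROJECT_ID"] = "my-gcp-project"
os.environ["REGION"] = "us-central1"
os.environ["GCS_LOCATION"] = "gs://my-bucket/chicago-taxi-tips"
os.environ["BEAM_RUNNER"] = "DataflowRunner"

from src.tfx_pipelines import config

# Derived values pick up the overrides, e.g.
# gs://my-bucket/chicago-taxi-tips/tfx_artifacts
print(config.ARTIFACT_STORE_URI)
print(config.TFX_IMAGE_URI)  # gcr.io/my-gcp-project/tfx-chicago-taxi-tips:latest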
--------------------------------------------------------------------------------
/src/tfx_pipelines/prediction_pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX prediction pipeline definition."""
15 |
16 | import os
17 | import sys
18 | import json
19 | import logging
20 |
21 | from tfx.orchestration import pipeline, data_types
22 | from ml_metadata.proto import metadata_store_pb2
23 |
24 | SCRIPT_DIR = os.path.dirname(
25 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
26 | )
27 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, "..")))
28 |
29 | from src.tfx_pipelines import config
30 | from src.tfx_pipelines import components as custom_components
31 | from src.common import datasource_utils
32 |
33 |
34 | def create_pipeline(
35 | pipeline_root: str,
36 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None,
37 | ):
38 | """Returns a batch prediction pipeline using TFX."""
39 |
40 | # Get source query.
41 | sql_query = datasource_utils.get_serving_source_query(
42 | bq_dataset_name=config.BATCH_PREDICTION_BQ_DATASET_NAME,
43 | bq_table_name=config.BATCH_PREDICTION_BQ_TABLE_NAME,
44 | limit=int(config.SERVE_LIMIT),
45 | )
46 |
47 | bigquery_data_gen = custom_components.bigquery_data_gen(
48 | sql_query=sql_query,
49 | output_data_format="jsonl",
50 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS),
51 | )
52 |
53 | vertex_batch_prediction = custom_components.vertex_batch_prediction(
54 |         project=config.PROJECT_ID,
55 | region=config.REGION,
56 | model_display_name=config.MODEL_DISPLAY_NAME,
57 | instances_format="jsonl",
58 | predictions_format="jsonl",
59 | job_resources=json.dumps(config.BATCH_PREDICTION_JOB_RESOURCES),
60 | serving_dataset=bigquery_data_gen.outputs.serving_dataset,
61 | )
62 |
63 | datastore_prediction_writer = custom_components.datastore_prediction_writer(
64 | datastore_kind=config.DATASTORE_PREDICTION_KIND,
65 | predictions_format="jsonl",
66 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS),
67 | prediction_results=vertex_batch_prediction.outputs.prediction_results,
68 | )
69 |
70 | pipeline_components = [
71 | bigquery_data_gen,
72 | vertex_batch_prediction,
73 | datastore_prediction_writer,
74 | ]
75 |
76 | logging.info(
77 | f"Pipeline components: {[component.id for component in pipeline_components]}"
78 | )
79 |
80 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS
81 | if config.BEAM_RUNNER == "DataflowRunner":
82 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS
83 |
84 | logging.info(f"Beam pipeline args: {beam_pipeline_args}")
85 |
86 | return pipeline.Pipeline(
87 | pipeline_name=config.PIPELINE_NAME,
88 | pipeline_root=pipeline_root,
89 | components=pipeline_components,
90 | beam_pipeline_args=beam_pipeline_args,
91 | metadata_connection_config=metadata_connection_config,
92 | enable_cache=int(config.ENABLE_CACHE),
93 | )
94 |
--------------------------------------------------------------------------------
/src/tfx_pipelines/runner.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Define KubeflowV2DagRunner to run the training pipeline using Managed Pipelines."""
15 |
16 |
17 | import os
18 | from kfp.v2.google.client import AIPlatformClient
19 | from tfx.orchestration import data_types
20 | from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner
21 |
22 |
23 | from src.tfx_pipelines import config, training_pipeline, prediction_pipeline
24 | from src.model_training import defaults
25 |
26 |
27 | def compile_training_pipeline(pipeline_definition_file):
28 | """Returns the training pipeline definition."""
29 |
30 | pipeline_root = os.path.join(
31 | config.ARTIFACT_STORE_URI,
32 | config.PIPELINE_NAME,
33 | )
34 |
35 | managed_pipeline = training_pipeline.create_pipeline(
36 | pipeline_root=pipeline_root,
37 | num_epochs=data_types.RuntimeParameter(
38 | name="num_epochs",
39 | default=defaults.NUM_EPOCHS,
40 | ptype=int,
41 | ),
42 | batch_size=data_types.RuntimeParameter(
43 | name="batch_size",
44 | default=defaults.BATCH_SIZE,
45 | ptype=int,
46 | ),
47 | learning_rate=data_types.RuntimeParameter(
48 | name="learning_rate",
49 | default=defaults.LEARNING_RATE,
50 | ptype=float,
51 | ),
52 | hidden_units=data_types.RuntimeParameter(
53 | name="hidden_units",
54 | default=",".join(str(u) for u in defaults.HIDDEN_UNITS),
55 | ptype=str,
56 | ),
57 | )
58 |
59 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
60 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
61 | default_image=config.TFX_IMAGE_URI
62 | ),
63 | output_filename=pipeline_definition_file,
64 | )
65 |
66 | return runner.run(managed_pipeline, write_out=True)
67 |
68 |
69 | def compile_prediction_pipeline(pipeline_definition_file):
70 | """Returns the prediction pipeline definition."""
71 |
72 | pipeline_root = os.path.join(
73 | config.ARTIFACT_STORE_URI,
74 | config.PIPELINE_NAME,
75 | )
76 |
77 | managed_pipeline = prediction_pipeline.create_pipeline(
78 | pipeline_root=pipeline_root,
79 | )
80 |
81 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
82 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
83 | default_image=config.TFX_IMAGE_URI
84 | ),
85 | output_filename=pipeline_definition_file,
86 | )
87 |
88 | return runner.run(managed_pipeline, write_out=True)
89 |
90 |
91 | def submit_pipeline(pipeline_definition_file):
92 | """Submits a pipeline definition file to Vertex pipelines."""
93 |
94 |     pipeline_client = AIPlatformClient(project_id=config.PROJECT_ID, region=config.REGION)
95 | pipeline_client.create_run_from_job_spec(pipeline_definition_file)
96 |
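A minimal sketch of compiling the training pipeline to a job spec and submitting it to Vertex AI Pipelines with the helpers above; it assumes the environment variables consumed by config.py (PROJECT_ID, REGION, GCS_LOCATION, TFX_IMAGE_URI, ...) are already set, and the output filename is a placeholder:

from src.tfx_pipelines import runner

# Placeholder output file for the compiled pipeline job spec.
pipeline_definition_file = "chicago-taxi-tips-classifier-train-pipeline.json"

runner.compile_training_pipeline(pipeline_definition_file)
runner.submit_pipeline(pipeline_definition_file)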
--------------------------------------------------------------------------------
/src/tfx_pipelines/training_pipeline.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TFX training pipeline definition."""
15 |
16 | import os
17 | import sys
18 | import logging
19 | import json
20 |
21 | import tensorflow_model_analysis as tfma
22 |
23 | import tfx
24 | from tfx.proto import example_gen_pb2, transform_pb2, trainer_pb2
25 | from tfx.orchestration import pipeline, data_types
26 | from tfx.dsl.components.base import executor_spec
27 | from tfx.components.trainer import executor as trainer_executor
28 | from tfx.extensions.google_cloud_ai_platform.trainer import (
29 | executor as ai_platform_trainer_executor,
30 | )
31 | from tfx.extensions.google_cloud_big_query.example_gen.component import (
32 | BigQueryExampleGen,
33 | )
34 | from tfx.components import (
35 | StatisticsGen,
36 | ExampleValidator,
37 | Transform,
38 | Trainer,
39 | Evaluator,
40 | Pusher,
41 | )
42 | from tfx.dsl.components.common.importer import Importer
43 | from tfx.dsl.components.common.resolver import Resolver
44 | from tfx.dsl.experimental import latest_artifacts_resolver
45 | from tfx.dsl.experimental import latest_blessed_model_resolver
46 |
47 | from ml_metadata.proto import metadata_store_pb2
48 |
49 | SCRIPT_DIR = os.path.dirname(
50 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))
51 | )
52 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, "..")))
53 |
54 | from src.tfx_pipelines import config
55 | from src.tfx_pipelines import components as custom_components
56 | from src.common import features, datasource_utils
57 |
58 | RAW_SCHEMA_DIR = "src/raw_schema"
59 | TRANSFORM_MODULE_FILE = "src/preprocessing/transformations.py"
60 | TRAIN_MODULE_FILE = "src/model_training/runner.py"
61 |
62 | MISSING = {
63 | "trip_month": -1,
64 | "trip_day": -1,
65 | "trip_day_of_week": -1,
66 | "trip_hour": -1,
67 | "trip_seconds": -1,
68 | "trip_miles": -1,
69 | "payment_type": "NA",
70 | "pickup_grid": "NA",
71 | "dropoff_grid": "NA",
72 | "euclidean": -1,
73 | }
74 |
75 |
76 | def create_pipeline(
77 | pipeline_root: str,
78 | num_epochs: data_types.RuntimeParameter,
79 | batch_size: data_types.RuntimeParameter,
80 | learning_rate: data_types.RuntimeParameter,
81 | hidden_units: data_types.RuntimeParameter,
82 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None,
83 | ):
84 | """Returns a TFX training pipeline."""
85 |
86 | local_executor_spec = executor_spec.ExecutorClassSpec(
87 | trainer_executor.GenericExecutor
88 | )
89 |
90 | caip_executor_spec = executor_spec.ExecutorClassSpec(
91 | ai_platform_trainer_executor.GenericExecutor
92 | )
93 |
94 | # Hyperparameter generation.
95 | hyperparams_gen = custom_components.hyperparameters_gen(
96 | num_epochs=num_epochs,
97 | batch_size=batch_size,
98 | learning_rate=learning_rate,
99 | hidden_units=hidden_units,
100 | ).with_id("HyperparamsGen")
101 |
102 | # Get train source query.
103 | train_sql_query = datasource_utils.get_training_source_query(
104 | config.PROJECT_ID,
105 | config.REGION,
106 | config.DATASET_DISPLAY_NAME,
107 | ml_use="UNASSIGNED",
108 | limit=int(config.TRAIN_LIMIT),
109 | )
110 |
111 | train_output_config = example_gen_pb2.Output(
112 | split_config=example_gen_pb2.SplitConfig(
113 | splits=[
114 | example_gen_pb2.SplitConfig.Split(
115 | name="train", hash_buckets=int(config.NUM_TRAIN_SPLITS)
116 | ),
117 | example_gen_pb2.SplitConfig.Split(
118 | name="eval", hash_buckets=int(config.NUM_EVAL_SPLITS)
119 | ),
120 | ]
121 | )
122 | )
123 |
124 | # Train example generation.
125 | train_example_gen = BigQueryExampleGen(
126 | query=train_sql_query,
127 | output_config=train_output_config,
128 | ).with_id("TrainDataGen")
129 |
130 | # Get test source query.
131 | test_sql_query = datasource_utils.get_training_source_query(
132 | config.PROJECT_ID,
133 | config.REGION,
134 | config.DATASET_DISPLAY_NAME,
135 | ml_use="TEST",
136 | limit=int(config.TEST_LIMIT),
137 | )
138 |
139 | test_output_config = example_gen_pb2.Output(
140 | split_config=example_gen_pb2.SplitConfig(
141 | splits=[
142 | example_gen_pb2.SplitConfig.Split(name="test", hash_buckets=1),
143 | ]
144 | )
145 | )
146 |
147 | # Test example generation.
148 | test_example_gen = BigQueryExampleGen(
149 | query=test_sql_query,
150 | output_config=test_output_config,
151 | ).with_id("TestDataGen")
152 |
153 | # Schema importer.
154 | schema_importer = Importer(
155 | source_uri=RAW_SCHEMA_DIR,
156 | artifact_type=tfx.types.standard_artifacts.Schema,
157 | ).with_id("SchemaImporter")
158 |
159 | # Statistics generation.
160 | statistics_gen = StatisticsGen(examples=train_example_gen.outputs.examples).with_id(
161 | "StatisticsGen"
162 | )
163 |
164 | # Example validation.
165 | example_validator = ExampleValidator(
166 | statistics=statistics_gen.outputs.statistics,
167 | schema=schema_importer.outputs.result,
168 | ).with_id("ExampleValidator")
169 |
170 | # Data transformation.
171 | transform = Transform(
172 | examples=train_example_gen.outputs.examples,
173 | schema=schema_importer.outputs.result,
174 | module_file=TRANSFORM_MODULE_FILE,
175 | splits_config=transform_pb2.SplitsConfig(
176 | analyze=["train"], transform=["train", "eval"]
177 | ),
178 | ).with_id("DataTransformer")
179 |
180 | # Add dependency from example_validator to transform.
181 | transform.add_upstream_node(example_validator)
182 |
183 |     # Get the latest model to warm-start from.
184 | warmstart_model_resolver = Resolver(
185 | strategy_class=latest_artifacts_resolver.LatestArtifactsResolver,
186 | latest_model=tfx.types.Channel(type=tfx.types.standard_artifacts.Model),
187 | ).with_id("WarmstartModelResolver")
188 |
189 | # Model training.
190 | trainer = Trainer(
191 | custom_executor_spec=local_executor_spec
192 | if config.TRAINING_RUNNER == "local"
193 | else caip_executor_spec,
194 | module_file=TRAIN_MODULE_FILE,
195 | transformed_examples=transform.outputs.transformed_examples,
196 | schema=schema_importer.outputs.result,
197 | # base_model=warmstart_model_resolver.outputs.latest_model,
198 | transform_graph=transform.outputs.transform_graph,
199 | train_args=trainer_pb2.TrainArgs(num_steps=0),
200 | eval_args=trainer_pb2.EvalArgs(num_steps=None),
201 | hyperparameters=hyperparams_gen.outputs.hyperparameters,
202 | ).with_id("ModelTrainer")
203 |
204 | # Get the latest blessed model (baseline) for model validation.
205 | baseline_model_resolver = Resolver(
206 | strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
207 | model=tfx.types.Channel(type=tfx.types.standard_artifacts.Model),
208 | model_blessing=tfx.types.Channel(
209 | type=tfx.types.standard_artifacts.ModelBlessing
210 | ),
211 | ).with_id("BaselineModelResolver")
212 |
213 | # Prepare evaluation config.
214 | eval_config = tfma.EvalConfig(
215 | model_specs=[
216 | tfma.ModelSpec(
217 | signature_name="serving_tf_example",
218 | label_key=features.TARGET_FEATURE_NAME,
219 | prediction_key="probabilities",
220 | )
221 | ],
222 | slicing_specs=[
223 | tfma.SlicingSpec(),
224 | ],
225 | metrics_specs=[
226 | tfma.MetricsSpec(
227 | metrics=[
228 | tfma.MetricConfig(class_name="ExampleCount"),
229 | tfma.MetricConfig(
230 | class_name="BinaryAccuracy",
231 | threshold=tfma.MetricThreshold(
232 | value_threshold=tfma.GenericValueThreshold(
233 | lower_bound={"value": float(config.ACCURACY_THRESHOLD)}
234 | ),
235 | # Change threshold will be ignored if there is no
236 | # baseline model resolved from MLMD (first run).
237 | change_threshold=tfma.GenericChangeThreshold(
238 | direction=tfma.MetricDirection.HIGHER_IS_BETTER,
239 | absolute={"value": -1e-10},
240 | ),
241 | ),
242 | ),
243 | ]
244 | )
245 | ],
246 | )
247 |
248 | # Model evaluation.
249 | evaluator = Evaluator(
250 | examples=test_example_gen.outputs.examples,
251 | example_splits=["test"],
252 | model=trainer.outputs.model,
253 | # baseline_model=baseline_model_resolver.outputs.model,
254 | eval_config=eval_config,
255 | schema=schema_importer.outputs.result,
256 | ).with_id("ModelEvaluator")
257 |
258 | exported_model_location = os.path.join(
259 | config.MODEL_REGISTRY_URI, config.MODEL_DISPLAY_NAME
260 | )
261 | push_destination = tfx.proto.pusher_pb2.PushDestination(
262 | filesystem=tfx.proto.pusher_pb2.PushDestination.Filesystem(
263 | base_directory=exported_model_location
264 | )
265 | )
266 |
267 | # Push custom model to model registry.
268 | pusher = Pusher(
269 | model=trainer.outputs.model,
270 | model_blessing=evaluator.outputs.blessing,
271 | push_destination=push_destination,
272 | ).with_id("ModelPusher")
273 |
274 | # Upload custom trained model to Vertex AI.
275 | explanation_config = json.dumps(features.generate_explanation_config())
276 | vertex_model_uploader = custom_components.vertex_model_uploader(
277 | project=config.PROJECT_ID,
278 | region=config.REGION,
279 | model_display_name=config.MODEL_DISPLAY_NAME,
280 | pushed_model_location=exported_model_location,
281 | serving_image_uri=config.SERVING_IMAGE_URI,
282 | explanation_config=explanation_config,
283 | ).with_id("VertexUploader")
284 |
285 | pipeline_components = [
286 | hyperparams_gen,
287 | train_example_gen,
288 | test_example_gen,
289 | statistics_gen,
290 | schema_importer,
291 | example_validator,
292 | transform,
293 | # warmstart_model_resolver,
294 | trainer,
295 | # baseline_model_resolver,
296 | evaluator,
297 | pusher,
298 | ]
299 |
300 | if int(config.UPLOAD_MODEL):
301 | pipeline_components.append(vertex_model_uploader)
302 |         # Add dependency from pusher to vertex_model_uploader.
303 | vertex_model_uploader.add_upstream_node(pusher)
304 |
305 | logging.info(
306 | f"Pipeline components: {[component.id for component in pipeline_components]}"
307 | )
308 |
309 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS
310 | if config.BEAM_RUNNER == "DataflowRunner":
311 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS
312 |
313 | logging.info(f"Beam pipeline args: {beam_pipeline_args}")
314 |
315 | return pipeline.Pipeline(
316 | pipeline_name=config.PIPELINE_NAME,
317 | pipeline_root=pipeline_root,
318 | components=pipeline_components,
319 | beam_pipeline_args=beam_pipeline_args,
320 | metadata_connection_config=metadata_connection_config,
321 | enable_cache=int(config.ENABLE_CACHE),
322 | )
323 |
--------------------------------------------------------------------------------