├── .gitignore ├── 01-dataset-management.ipynb ├── 02-experimentation.ipynb ├── 03-training-formalization.ipynb ├── 04-pipeline-deployment.ipynb ├── 05-continuous-training.ipynb ├── 06-model-deployment.ipynb ├── 07-prediction-serving.ipynb ├── 08-model-monitoring.ipynb ├── Dockerfile ├── LICENSE ├── README.md ├── build ├── Dockerfile ├── model-deployment.yaml ├── pipeline-deployment.yaml ├── serving_resources_spec.json └── utils.py ├── mlops.png ├── provision ├── README.md └── terraform │ ├── gcs-bucket.tf │ ├── main.tf │ ├── notebook-instance.tf │ ├── service-accounts.tf │ ├── services.tf │ ├── terraform.tfvars │ └── variables.tf ├── requirements.txt ├── setup.py └── src ├── __init__.py ├── common ├── __init__.py ├── datasource_utils.py └── features.py ├── model_training ├── __init__.py ├── data.py ├── defaults.py ├── exporter.py ├── model.py ├── runner.py ├── task.py └── trainer.py ├── pipeline_triggering ├── __init__.py ├── main.py └── requirements.txt ├── preprocessing ├── __init__.py ├── etl.py └── transformations.py ├── raw_schema └── schema.pbtxt ├── tests ├── __init__.py ├── datasource_utils_tests.py ├── etl_tests.py ├── model_deployment_tests.py ├── model_tests.py └── pipeline_deployment_tests.py └── tfx_pipelines ├── __init__.py ├── components.py ├── config.py ├── prediction_pipeline.py ├── runner.py └── training_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | .idea/ 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | _workspace/ 132 | *.tar.gz 133 | .egg-info/ 134 | *.whl 135 | mlpipeline-ui-metadata.json 136 | *.csv 137 | *.sqllite 138 | model.png 139 | -------------------------------------------------------------------------------- /04-pipeline-deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d272910e", 6 | "metadata": {}, 7 | "source": [ 8 | "# 04 - Test and deploy a TFX training pipeline to `Vertex Pipelines`\n", 9 | "\n", 10 | "The purpose of this notebook is to test, deploy, and run the `TFX` pipeline on `Vertex Pipelines`. The notebook covers the following tasks:\n", 11 | "\n", 12 | "1. Run the tests locally.\n", 13 | "2. Run the `TFX` pipeline using `Vertex Pipelines`\n", 14 | "3. Execute the pipeline deployment `CI/CD` steps using `Cloud Build`." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "beaa2787", 20 | "metadata": {}, 21 | "source": [ 22 | "## Setup" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "51e05608", 28 | "metadata": {}, 29 | "source": [ 30 | "### Import libraries" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "9aa72b29", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "import kfp\n", 42 | "import tfx\n", 43 | "\n", 44 | "print('TFX:', tfx.__version__)\n", 45 | "print('KFP:', kfp.__version__)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "24aceb9a", 51 | "metadata": {}, 52 | "source": [ 53 | "### Setup Google Cloud project" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "d8d9f81b", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 64 | "REGION = 'us-central1' # Change to your region.\n", 65 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 66 | "SERVICE_ACCOUNT = '[your-service-account]'\n", 67 | "\n", 68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 69 | " # Get your GCP project id from gcloud\n", 70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 71 | " PROJECT_ID = shell_output[0]\n", 72 | " \n", 73 | "if SERVICE_ACCOUNT == '' or SERVICE_ACCOUNT is None or SERVICE_ACCOUNT == '[your-service-account]':\n", 74 | " # Get your GCP project id from gcloud\n", 75 | " shell_output = !gcloud config list --format 'value(core.account)' 2>/dev/null\n", 76 | " SERVICE_ACCOUNT = shell_output[0]\n", 77 | " \n", 78 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n", 79 | " # Set your bucket name using your GCP project id\n", 80 | " BUCKET = PROJECT_ID\n", 81 | " # Try to create the bucket if it doesn'exists\n", 82 | " ! 
gsutil mb -l $REGION gs://$BUCKET\n", 83 | " print('')\n", 84 | " \n", 85 | "print('Project ID:', PROJECT_ID)\n", 86 | "print('Region:', REGION)\n", 87 | "print('Bucket name:', BUCKET)\n", 88 | "print('Service Account:', SERVICE_ACCOUNT)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "3d24d18b", 94 | "metadata": {}, 95 | "source": [ 96 | "### Set configurations" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "a8295cca", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "BQ_LOCATION = 'US'\n", 107 | "BQ_DATASET_NAME = 'playground_us' # Change to your BQ dataset name.\n", 108 | "BQ_TABLE_NAME = 'chicago_taxitrips_prep'\n", 109 | "\n", 110 | "VERSION = 'v1'\n", 111 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 112 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 113 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", 114 | "\n", 115 | "CICD_IMAGE_NAME = 'cicd:latest'\n", 116 | "CICD_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{CICD_IMAGE_NAME}'" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "81d049f5", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "! rm -r src/raw_schema/.ipynb_checkpoints/" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "4cbcbbb8", 132 | "metadata": {}, 133 | "source": [ 134 | "## 1. Run the CI/CD steps locally" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "58845362", 140 | "metadata": {}, 141 | "source": [ 142 | "### Set pipeline configurations for the local run" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "44c48da6", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "os.environ['DATASET_DISPLAY_NAME'] = DATASET_DISPLAY_NAME\n", 153 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 154 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n", 155 | "os.environ['PROJECT'] = PROJECT_ID\n", 156 | "os.environ['REGION'] = REGION\n", 157 | "os.environ['BQ_LOCATION'] = BQ_LOCATION\n", 158 | "os.environ['BQ_DATASET_NAME'] = BQ_DATASET_NAME\n", 159 | "os.environ['BQ_TABLE_NAME'] = BQ_TABLE_NAME\n", 160 | "os.environ['GCS_LOCATION'] = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/e2e_tests'\n", 161 | "os.environ['TRAIN_LIMIT'] = '1000'\n", 162 | "os.environ['TEST_LIMIT'] = '100'\n", 163 | "os.environ['UPLOAD_MODEL'] = '0'\n", 164 | "os.environ['ACCURACY_THRESHOLD'] = '0.1'\n", 165 | "os.environ['BEAM_RUNNER'] = 'DirectRunner'\n", 166 | "os.environ['TRAINING_RUNNER'] = 'local'" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "fcf65dee", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from src.tfx_pipelines import config\n", 177 | "import importlib\n", 178 | "importlib.reload(config)\n", 179 | "\n", 180 | "for key, value in config.__dict__.items():\n", 181 | " if key.isupper(): print(f'{key}: {value}')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "0e4989b9", 187 | "metadata": {}, 188 | "source": [ 189 | "### Run the unit tests for the data and model pipeline components" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "37324634", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "! 
py.test src/tests/datasource_utils_tests.py -s" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "1a40f106", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "! py.test src/tests/model_tests.py -s" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "f3b62aea", 215 | "metadata": {}, 216 | "source": [ 217 | "### Run the e2e pipeline test" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "3acb31cf", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "! py.test src/tests/pipeline_deployment_tests.py::test_e2e_pipeline -s" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "id": "1c8758df", 233 | "metadata": {}, 234 | "source": [ 235 | "## 2. Run the training pipeline using `Vertex Pipelines`" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "a02ce062", 241 | "metadata": {}, 242 | "source": [ 243 | "### Set the pipeline configurations for the `Vertex Pipeline` run" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "01c2b3e1", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "os.environ['DATASET_DISPLAY_NAME'] = DATASET_DISPLAY_NAME\n", 254 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 255 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n", 256 | "os.environ['PROJECT'] = PROJECT_ID\n", 257 | "os.environ['REGION'] = REGION\n", 258 | "os.environ['GCS_LOCATION'] = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}'\n", 259 | "os.environ['TRAIN_LIMIT'] = '85000'\n", 260 | "os.environ['TEST_LIMIT'] = '15000'\n", 261 | "os.environ['BEAM_RUNNER'] = 'DataflowRunner'\n", 262 | "os.environ['TRAINING_RUNNER'] = 'vertex'\n", 263 | "os.environ['TFX_IMAGE_URI'] = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "9e8be723", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "from src.tfx_pipelines import config\n", 274 | "import importlib\n", 275 | "importlib.reload(config)\n", 276 | "\n", 277 | "for key, value in config.__dict__.items():\n", 278 | " if key.isupper(): print(f'{key}: {value}')" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "id": "286ff84e", 284 | "metadata": {}, 285 | "source": [ 286 | "### Build the training container image\n", 287 | "\n", 288 | "This is the `TFX` runtime environment for the training pipeline steps." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "d9686014", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "!echo $TFX_IMAGE_URI" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "id": "7f7986c2", 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "!gcloud builds submit --tag $TFX_IMAGE_URI . 
--timeout=15m --machine-type=e2-highcpu-8" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "id": "ce2d5c9a", 314 | "metadata": {}, 315 | "source": [ 316 | "### Compile the `TFX` pipeline" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "df29fc7e", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "from src.tfx_pipelines import runner\n", 327 | "\n", 328 | "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n", 329 | "pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "75928c08", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "PIPELINES_STORE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/compiled_pipelines/'\n", 340 | "! gsutil cp {pipeline_definition_file} {PIPELINES_STORE}" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "id": "ef836781", 346 | "metadata": {}, 347 | "source": [ 348 | "### Submit run to Vertex Pipelines" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "1d1115bd", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "from kfp.v2.google.client import AIPlatformClient\n", 359 | "\n", 360 | "pipeline_client = AIPlatformClient(\n", 361 | " project_id=PROJECT_ID, region=REGION)\n", 362 | " \n", 363 | "job = pipeline_client.create_run_from_job_spec(\n", 364 | " job_spec_path=pipeline_definition_file,\n", 365 | " parameter_values={\n", 366 | " 'learning_rate': 0.003,\n", 367 | " 'batch_size': 512,\n", 368 | " 'hidden_units': '128,128',\n", 369 | " 'num_epochs': 30,\n", 370 | " }\n", 371 | ")" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "7cc4477f", 377 | "metadata": {}, 378 | "source": [ 379 | "### Extracting pipeline runs metadata" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "id": "464ad3a8", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "from google.cloud import aiplatform as vertex_ai\n", 390 | "\n", 391 | "pipeline_df = vertex_ai.get_pipeline_df(PIPELINE_NAME)\n", 392 | "pipeline_df = pipeline_df[pipeline_df.pipeline_name == PIPELINE_NAME]\n", 393 | "pipeline_df.T" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "id": "ad380129", 399 | "metadata": {}, 400 | "source": [ 401 | "## 3. Execute the pipeline deployment CI/CD steps in Cloud Build\n", 402 | "\n", 403 | "The CI/CD routine is defined in the [pipeline-deployment.yaml](pipeline-deployment.yaml) file, and consists of the following steps:\n", 404 | "1. Clone the repository to the build environment.\n", 405 | "2. Run unit tests.\n", 406 | "3. Run a local e2e test of the pipeline.\n", 407 | "4. Build the ML container image for pipeline steps.\n", 408 | "5. Compile the pipeline.\n", 409 | "6. Upload the pipeline to Cloud Storage." 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "id": "e00b075f", 415 | "metadata": {}, 416 | "source": [ 417 | "### Build CI/CD container image for Cloud Build\n", 418 | "\n", 419 | "This is the runtime environment where the steps of testing and deploying the pipeline will be executed." 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "867e5ae1", 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "! 
echo $CICD_IMAGE_URI" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "40f497f6", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "! gcloud builds submit --tag $CICD_IMAGE_URI build/. --timeout=15m --machine-type=e2-highcpu-8" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "id": "4f6e2dd7", 445 | "metadata": {}, 446 | "source": [ 447 | "### Run CI/CD from pipeline deployment using Cloud Build" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "id": "117895d7", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "REPO_URL = 'https://github.com/ksalama/ucaip-labs.git' # Change to your github repo.\n", 458 | "BRANCH = 'main'\n", 459 | "\n", 460 | "GCS_LOCATION = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/'\n", 461 | "TEST_GCS_LOCATION = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/e2e_tests'\n", 462 | "CI_TRAIN_LIMIT = 1000\n", 463 | "CI_TEST_LIMIT = 100\n", 464 | "CI_UPLOAD_MODEL = 0\n", 465 | "CI_ACCURACY_THRESHOLD = 0.1\n", 466 | "BEAM_RUNNER = 'DataflowRunner'\n", 467 | "TRAINING_RUNNER = 'vertex'\n", 468 | "VERSION = 'tfx-0-30'\n", 469 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", 470 | "PIPELINES_STORE = os.path.join(GCS_LOCATION, 'compiled_pipelines')\n", 471 | "\n", 472 | "TFX_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'\n", 473 | "\n", 474 | "SUBSTITUTIONS=f'''\\\n", 475 | "_REPO_URL='{REPO_URL}',\\\n", 476 | "_BRANCH={BRANCH},\\\n", 477 | "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", 478 | "_PROJECT_ID={PROJECT_ID},\\\n", 479 | "_REGION={REGION},\\\n", 480 | "_GCS_LOCATION={GCS_LOCATION},\\\n", 481 | "_TEST_GCS_LOCATION={TEST_GCS_LOCATION},\\\n", 482 | "_BQ_LOCATION={BQ_LOCATION},\\\n", 483 | "_BQ_DATASET_NAME={BQ_DATASET_NAME},\\\n", 484 | "_BQ_TABLE_NAME={BQ_TABLE_NAME},\\\n", 485 | "_DATASET_DISPLAY_NAME={DATASET_DISPLAY_NAME},\\\n", 486 | "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", 487 | "_CI_TRAIN_LIMIT={CI_TRAIN_LIMIT},\\\n", 488 | "_CI_TEST_LIMIT={CI_TEST_LIMIT},\\\n", 489 | "_CI_UPLOAD_MODEL={CI_UPLOAD_MODEL},\\\n", 490 | "_CI_ACCURACY_THRESHOLD={CI_ACCURACY_THRESHOLD},\\\n", 491 | "_BEAM_RUNNER={BEAM_RUNNER},\\\n", 492 | "_TRAINING_RUNNER={TRAINING_RUNNER},\\\n", 493 | "_TFX_IMAGE_URI={TFX_IMAGE_URI},\\\n", 494 | "_PIPELINE_NAME={PIPELINE_NAME},\\\n", 495 | "_PIPELINES_STORE={PIPELINES_STORE}\\\n", 496 | "'''\n", 497 | "\n", 498 | "!echo $SUBSTITUTIONS" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "id": "b54081db", 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "!gcloud builds submit --no-source --timeout=60m --config build/pipeline-deployment.yaml --substitutions {SUBSTITUTIONS} --machine-type=e2-highcpu-8" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "id": "72d9baf5", 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [] 518 | } 519 | ], 520 | "metadata": { 521 | "environment": { 522 | "name": "common-cpu.m73", 523 | "type": "gcloud", 524 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 525 | }, 526 | "kernelspec": { 527 | "display_name": "Python 3", 528 | "language": "python", 529 | "name": "python3" 530 | }, 531 | "language_info": { 532 | "codemirror_mode": { 533 | "name": "ipython", 534 | "version": 3 535 | }, 536 | "file_extension": ".py", 537 | "mimetype": "text/x-python", 538 | "name": "python", 539 | "nbconvert_exporter": "python", 540 | "pygments_lexer": "ipython3", 541 | 
"version": "3.7.10" 542 | } 543 | }, 544 | "nbformat": 4, 545 | "nbformat_minor": 5 546 | } 547 | -------------------------------------------------------------------------------- /05-continuous-training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "26667428", 6 | "metadata": {}, 7 | "source": [ 8 | "# 05 - Continuous training\n", 9 | "\n", 10 | "After testing, compiling, and uploading the pipeline definition to Cloud Storage, the pipeline is executed with respect to a trigger. `Cloud Functions` and `Cloud Pub/Sub` are used in this notebook as a triggering mechanism. The triggering can be scheduled using `Cloud Scheduler`. The trigger source sends a message to a Cloud Pub/Sub topic that the Cloud Function listens to, and then it submits the pipeline to `Vertex Pipelines` to be executed.\n", 11 | "\n", 12 | "This notebook covers the following steps:\n", 13 | "1. Create the `Cloud Pub/Sub` topic.\n", 14 | "2. Deploy the `Cloud Function` \n", 15 | "3. Test triggering a pipeline.\n", 16 | "4. Extracting pipeline run metadata.\n", 17 | "\n", 18 | "Learn about [Cloud Functions](https://cloud.google.com/functions).\n", 19 | "Learn about [Cloud Pub/Sub](https://cloud.google.com/pubsub).\n", 20 | "Learn about [Cloud Scheduler](https://cloud.google.com/scheduler)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "45edf109", 26 | "metadata": {}, 27 | "source": [ 28 | "## Setup" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "d44dcc22", 34 | "metadata": {}, 35 | "source": [ 36 | "### Import libraries" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "8fa8c2ff", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import json\n", 47 | "import os\n", 48 | "import logging\n", 49 | "import tensorflow as tf\n", 50 | "import tfx\n", 51 | "import IPython \n", 52 | "\n", 53 | "logging.getLogger().setLevel(logging.INFO)\n", 54 | "\n", 55 | "print('TFX:', tfx.__version__)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "a82072dc", 61 | "metadata": {}, 62 | "source": [ 63 | "### Setup Google Cloud project" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "04c5843a", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 74 | "REGION = 'us-central1' # Change to your region.\n", 75 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 76 | "\n", 77 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 78 | " # Get your GCP project id from gcloud\n", 79 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 80 | " PROJECT_ID = shell_output[0]\n", 81 | " \n", 82 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n", 83 | " # Set your bucket name using your GCP project id\n", 84 | " BUCKET = PROJECT_ID\n", 85 | " # Try to create the bucket if it doesn'exists\n", 86 | " ! 
gsutil mb -l $REGION gs://$BUCKET\n", 87 | " print('')\n", 88 | "\n", 89 | "print('Project ID:', PROJECT_ID)\n", 90 | "print('Region:', REGION)\n", 91 | "print('Bucket name:', BUCKET)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "ae7570c7", 97 | "metadata": {}, 98 | "source": [ 99 | "### Set configurations" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "99e362bb", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "VERSION = 'v1'\n", 110 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 111 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 112 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", 113 | "\n", 114 | "PIPELINES_STORE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/compiled_pipelines/'\n", 115 | "GCS_PIPELINE_FILE_LOCATION = os.path.join(PIPELINES_STORE, f'{PIPELINE_NAME}.json')\n", 116 | "PUBSUB_TOPIC = f'trigger-{PIPELINE_NAME}'\n", 117 | "CLOUD_FUNCTION_NAME = f'trigger-{PIPELINE_NAME}-fn'" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "fc916c87", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "!gsutil ls {GCS_PIPELINE_FILE_LOCATION}" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "ed5321a4", 133 | "metadata": {}, 134 | "source": [ 135 | "## 1. Create a Pub/Sub topic" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "1f36582f", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "! gcloud pubsub topics create {PUBSUB_TOPIC}" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "f090676b", 151 | "metadata": {}, 152 | "source": [ 153 | "## 2. Deploy the Cloud Function" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "48858f15", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "ENV_VARS=f'''\\\n", 164 | "PROJECT={PROJECT_ID},\\\n", 165 | "REGION={REGION},\\\n", 166 | "GCS_PIPELINE_FILE_LOCATION={GCS_PIPELINE_FILE_LOCATION}\n", 167 | "'''\n", 168 | "\n", 169 | "! echo {ENV_VARS}" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "a78831f0", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "! rm -r src/pipeline_triggering/.ipynb_checkpoints" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "id": "dfd65f0d", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "! gcloud functions deploy {CLOUD_FUNCTION_NAME} \\\n", 190 | " --region={REGION} \\\n", 191 | " --trigger-topic={PUBSUB_TOPIC} \\\n", 192 | " --runtime=python37 \\\n", 193 | " --source=src/pipeline_triggering\\\n", 194 | " --entry-point=trigger_pipeline\\\n", 195 | " --stage-bucket={BUCKET}\\\n", 196 | " --update-env-vars={ENV_VARS}" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "4f632321", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "cloud_fn_url = f'https://console.cloud.google.com/functions/details/{REGION}/{CLOUD_FUNCTION_NAME}'\n", 207 | "html = f'See the Cloud Function details here.'\n", 208 | "IPython.display.display(IPython.display.HTML(html))" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "id": "9c00b9a6", 214 | "metadata": {}, 215 | "source": [ 216 | "## 3. 
Trigger the pipeline" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "0cf3abbc", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "from google.cloud import pubsub\n", 227 | "\n", 228 | "publish_client = pubsub.PublisherClient()\n", 229 | "topic = f'projects/{PROJECT_ID}/topics/{PUBSUB_TOPIC}'\n", 230 | "data = {\n", 231 | " 'num_epochs': 7,\n", 232 | " 'learning_rate': 0.0015,\n", 233 | " 'batch_size': 512,\n", 234 | " 'hidden_units': '256,126'\n", 235 | "}\n", 236 | "message = json.dumps(data)\n", 237 | "\n", 238 | "_ = publish_client.publish(topic, message.encode())" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "c536f29d", 244 | "metadata": {}, 245 | "source": [ 246 | "Wait for a few seconds for the pipeline run to be submitted, then you can see the run in the Cloud Console" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "887538b6", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "from kfp.v2.google.client import AIPlatformClient\n", 257 | "\n", 258 | "pipeline_client = AIPlatformClient(\n", 259 | " project_id=PROJECT_ID, region=REGION)\n", 260 | " \n", 261 | "job_display_name = pipeline_client.list_jobs()['pipelineJobs'][0]['displayName']\n", 262 | "job_url = f'https://console.cloud.google.com/vertex-ai/locations/{REGION}/pipelines/runs/{job_display_name}'\n", 263 | "html = f'See the Pipeline job here.'\n", 264 | "IPython.display.display(IPython.display.HTML(html))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "159a66d2", 270 | "metadata": {}, 271 | "source": [ 272 | "## 4. Extracting pipeline runs metadata" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "id": "affe56fc", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "from google.cloud import aiplatform as vertex_ai\n", 283 | "\n", 284 | "pipeline_df = vertex_ai.get_pipeline_df(PIPELINE_NAME)\n", 285 | "pipeline_df = pipeline_df[pipeline_df.pipeline_name == PIPELINE_NAME]\n", 286 | "pipeline_df.T" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "id": "04ba49ed", 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [] 296 | } 297 | ], 298 | "metadata": { 299 | "environment": { 300 | "name": "common-cpu.m73", 301 | "type": "gcloud", 302 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 303 | }, 304 | "kernelspec": { 305 | "display_name": "Python 3", 306 | "language": "python", 307 | "name": "python3" 308 | }, 309 | "language_info": { 310 | "codemirror_mode": { 311 | "name": "ipython", 312 | "version": 3 313 | }, 314 | "file_extension": ".py", 315 | "mimetype": "text/x-python", 316 | "name": "python", 317 | "nbconvert_exporter": "python", 318 | "pygments_lexer": "ipython3", 319 | "version": "3.7.10" 320 | } 321 | }, 322 | "nbformat": 4, 323 | "nbformat_minor": 5 324 | } 325 | -------------------------------------------------------------------------------- /06-model-deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e80f441f", 6 | "metadata": {}, 7 | "source": [ 8 | "# 06 - Model deployment\n", 9 | "\n", 10 | "The purpose of this notebook is to execute a CI/CD routine to test and deploy the trained `Vertex Model` resource to a `Vertex Endpoint` resource for online prediction serving. 
The notebook covers the following steps:\n", 11 | "\n", 12 | "1. Run the test steps locally.\n", 13 | "2. Execute the model deployment CI/CD steps using `Cloud Build`.\n", 14 | "\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "0db03d3d", 20 | "metadata": {}, 21 | "source": [ 22 | "## Setup" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "3cd5f896", 28 | "metadata": {}, 29 | "source": [ 30 | "### Import libraries" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "c98cf8cb", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "import logging\n", 42 | "\n", 43 | "logging.getLogger().setLevel(logging.INFO)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "faf0de35", 49 | "metadata": {}, 50 | "source": [ 51 | "### Setup Google Cloud project" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "8ab672e9", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 62 | "REGION = 'us-central1' # Change to your region.\n", 63 | "\n", 64 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 65 | " # Get your GCP project id from gcloud\n", 66 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 67 | " PROJECT_ID = shell_output[0]\n", 68 | "\n", 69 | "print('Project ID:', PROJECT_ID)\n", 70 | "print('Region:', REGION)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "2d1e359d", 76 | "metadata": {}, 77 | "source": [ 78 | "### Set configurations" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "25a1e19b", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "VERSION = 'v1'\n", 89 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 90 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 91 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n", 92 | "\n", 93 | "CICD_IMAGE_NAME = 'cicd:latest'\n", 94 | "CICD_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{CICD_IMAGE_NAME}'" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "d27a65fd", 100 | "metadata": {}, 101 | "source": [ 102 | "## 1. Run CI/CD steps locally" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "daffa85a", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "os.environ['PROJECT'] = PROJECT_ID\n", 113 | "os.environ['REGION'] = REGION\n", 114 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 115 | "os.environ['ENDPOINT_DISPLAY_NAME'] = ENDPOINT_DISPLAY_NAME" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "189dde56", 121 | "metadata": {}, 122 | "source": [ 123 | "### Run the model artifact testing" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "20c8ce61", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "! py.test src/tests/model_deployment_tests.py::test_model_artifact -s" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "195e7cde", 139 | "metadata": {}, 140 | "source": [ 141 | "### Run create endpoint" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "bdaf0c28", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "! 
python build/utils.py \\\n", 152 | " --mode=create-endpoint\\\n", 153 | " --project={PROJECT_ID}\\\n", 154 | " --region={REGION}\\\n", 155 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "3e8022b5", 161 | "metadata": {}, 162 | "source": [ 163 | "### Run deploy model" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "bac7e8b3", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "! python build/utils.py \\\n", 174 | " --mode=deploy-model\\\n", 175 | " --project={PROJECT_ID}\\\n", 176 | " --region={REGION}\\\n", 177 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}\\\n", 178 | " --model-display-name={MODEL_DISPLAY_NAME}" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "937178d9", 184 | "metadata": {}, 185 | "source": [ 186 | "### Run model endpoint testing" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "efa45c98", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# TODO {for Khalid, you need to update create an Endpoint resource when using a list. This is a known bug:}\n", 197 | "# AttributeError: 'Endpoint' object has no attribute '_prediction_client'\n", 198 | "! py.test src/tests/model_deployment_tests.py::test_model_endpoint" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "fbe44e56", 204 | "metadata": {}, 205 | "source": [ 206 | "## 2. Execute the model deployment CI/CD routine in `Cloud Build`\n", 207 | "\n", 208 | "The CI/CD routine is defined in the [model-deployment.yaml](model-deployment.yaml) file, and consists of the following steps:\n", 209 | "\n", 210 | "1. Load and test the the trained model interface.\n", 211 | "2. Create a `Vertex Endpoint` resource if it does not exist.\n", 212 | "3. Deploy the `Vertex Model` resource to the `Vertex Endpoint` resource.\n", 213 | "4. Test the `Vertex Endpoint` resource." 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "05ef2d3d", 219 | "metadata": {}, 220 | "source": [ 221 | "### Build CI/CD container Image for `Cloud Build`\n", 222 | "\n", 223 | "This is the runtime environment where the steps of testing and deploying the model will be executed." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "59f00bcc", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "! echo $CICD_IMAGE_URI" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "76b7dae5", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "! gcloud builds submit --tag $CICD_IMAGE_URI build/. 
--timeout=15m" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "id": "88d91bd7", 249 | "metadata": {}, 250 | "source": [ 251 | "### Run CI/CD from model deployment using `Cloud Build`" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "6a8d9b05", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "REPO_URL = 'https://github.com/ksalama/ucaip-labs.git' # Change to your github repo.\n", 262 | "BRANCH = 'main'" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "ee76bd54", 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "SUBSTITUTIONS=f'''\\\n", 273 | "_REPO_URL='{REPO_URL}',\\\n", 274 | "_BRANCH={BRANCH},\\\n", 275 | "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", 276 | "_PROJECT={PROJECT_ID},\\\n", 277 | "_REGION={REGION},\\\n", 278 | "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", 279 | "_ENDPOINT_DISPLAY_NAME={ENDPOINT_DISPLAY_NAME},\\\n", 280 | "'''\n", 281 | "\n", 282 | "!echo $SUBSTITUTIONS" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "3f59114c", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "!gcloud builds submit --no-source --config build/model-deployment.yaml --substitutions {SUBSTITUTIONS} --timeout=30m" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "d62fc304", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [] 302 | } 303 | ], 304 | "metadata": { 305 | "environment": { 306 | "name": "common-cpu.m73", 307 | "type": "gcloud", 308 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 309 | }, 310 | "kernelspec": { 311 | "display_name": "Python 3", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.7.10" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 5 330 | } 331 | -------------------------------------------------------------------------------- /07-prediction-serving.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "afa25b6f", 6 | "metadata": {}, 7 | "source": [ 8 | "# 07 - Serving predictions\n", 9 | "\n", 10 | "The purpose of the notebook is to show how to use the deployed model for online and batch prediction.\n", 11 | "The notebook covers the following tasks:\n", 12 | "\n", 13 | "1. Test the `Endpoint` resource for online prediction.\n", 14 | "2. Use the custom model uploaded as a `Model` resource for batch prediciton.\n", 15 | "3. Run a the batch prediction pipeline using `Vertex Pipelines`." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "b2ff82c9", 21 | "metadata": {}, 22 | "source": [ 23 | "## Setup" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "c0d52e77", 29 | "metadata": {}, 30 | "source": [ 31 | "### Import libraries" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "116a19cf", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import os\n", 42 | "import time\n", 43 | "from datetime import datetime\n", 44 | "import tensorflow as tf\n", 45 | "\n", 46 | "from google.cloud import aiplatform as vertex_ai" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "c5a33868", 52 | "metadata": {}, 53 | "source": [ 54 | "### Setup Google Cloud project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "3c2c4d1c", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 65 | "REGION = 'us-central1' # Change to your region.\n", 66 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 67 | "\n", 68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 69 | " # Get your GCP project id from gcloud\n", 70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 71 | " PROJECT_ID = shell_output[0]\n", 72 | " \n", 73 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n", 74 | " # Set your bucket name using your GCP project id\n", 75 | " BUCKET = PROJECT_ID\n", 76 | " # Try to create the bucket if it doesn'exists\n", 77 | " ! gsutil mb -l $REGION gs://$BUCKET\n", 78 | " print('')\n", 79 | " \n", 80 | "print('Project ID:', PROJECT_ID)\n", 81 | "print('Region:', REGION)\n", 82 | "print('Bucket name:', BUCKET)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "29e1a653", 88 | "metadata": {}, 89 | "source": [ 90 | "### Set configurations" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "0019b2dd", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "VERSION = 'v1'\n", 101 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 102 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 103 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n", 104 | "\n", 105 | "SERVE_BQ_DATASET_NAME = 'playground_us' # Change to your serving BigQuery dataset name.\n", 106 | "SERVE_BQ_TABLE_NAME = 'chicago_taxitrips_prep' # Change to your serving BigQuery table name." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "385ed4c0", 112 | "metadata": {}, 113 | "source": [ 114 | "## 1. 
Making an online prediction\n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "ac2520fc", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "vertex_ai.init(\n", 125 | " project=PROJECT_ID,\n", 126 | " location=REGION,\n", 127 | " staging_bucket=BUCKET\n", 128 | ")\n", 129 | "\n", 130 | "endpoint_name = vertex_ai.Endpoint.list(\n", 131 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n", 132 | " order_by='update_time')[-1].gca_resource.name\n", 133 | "\n", 134 | "endpoint = vertex_ai.Endpoint(endpoint_name)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "c5f4f8c8", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "test_instances = [ \n", 145 | " {\n", 146 | " 'dropoff_grid': ['POINT(-87.6 41.9)'],\n", 147 | " 'euclidean': [2064.2696],\n", 148 | " 'loc_cross': [''],\n", 149 | " 'payment_type': ['Credit Card'],\n", 150 | " 'pickup_grid': ['POINT(-87.6 41.9)'],\n", 151 | " 'trip_miles': [1.37],\n", 152 | " 'trip_day': [12],\n", 153 | " 'trip_hour': [16],\n", 154 | " 'trip_month': [2],\n", 155 | " 'trip_day_of_week': [4],\n", 156 | " 'trip_seconds': [555]\n", 157 | " }\n", 158 | "]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "6fe672df", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "predictions = endpoint.predict(test_instances).predictions\n", 169 | "\n", 170 | "for prediction in predictions:\n", 171 | " print(prediction)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "077f4225", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# TODO {for Khalid, get error saying model does not support explanations}\n", 182 | "\n", 183 | "explanations = endpoint.explain(test_instances).explanations\n", 184 | "\n", 185 | "for explanation in explanations:\n", 186 | " print(explanation)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "b6140167", 192 | "metadata": {}, 193 | "source": [ 194 | "## 2. 
Make a batch prediction" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "37928e74", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "WORKSPACE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/'\n", 205 | "SERVING_DATA_DIR = os.path.join(WORKSPACE, 'serving_data')\n", 206 | "SERVING_INPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'input_data')\n", 207 | "SERVING_OUTPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'output_predictions')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "b83e0d39", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "if tf.io.gfile.exists(SERVING_DATA_DIR):\n", 218 | " print('Removing previous serving data...')\n", 219 | " tf.io.gfile.rmtree(SERVING_DATA_DIR)\n", 220 | "print('Creating serving data directory...')\n", 221 | "tf.io.gfile.mkdir(SERVING_DATA_DIR)\n", 222 | "print('Serving data directory is ready.')" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "id": "163326ce", 228 | "metadata": {}, 229 | "source": [ 230 | "### Extract serving data to Cloud Storage as JSONL" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "51bdefd3", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "\n", 241 | "from src.model_training import features as feature_info\n", 242 | "from src.preprocessing import etl\n", 243 | "from src.common import datasource_utils" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "e15508fb", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "LIMIT = 10000\n", 254 | "\n", 255 | "sql_query = datasource_utils.create_bq_source_query(\n", 256 | " dataset_display_name=DATASET_DISPLAY_NAME, \n", 257 | " missing=feature_info.MISSING_VALUES,\n", 258 | " limit=LIMIT\n", 259 | ")\n", 260 | "\n", 261 | "print(sql_query)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "id": "95ba6d5f", 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "args = {\n", 272 | " #'runner': 'DataflowRunner',\n", 273 | " 'sql_query': sql_query,\n", 274 | " 'exported_data_prefix': os.path.join(SERVING_INPUT_DATA_DIR, 'data-'),\n", 275 | " 'temporary_dir': os.path.join(WORKSPACE, 'tmp'),\n", 276 | " 'gcs_location': os.path.join(WORKSPACE, 'bq_tmp'),\n", 277 | " 'project': PROJECT_ID,\n", 278 | " 'region': REGION,\n", 279 | " 'setup_file': './setup.py'\n", 280 | "}" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "id": "c5414f24", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "tf.get_logger().setLevel('ERROR')\n", 291 | "\n", 292 | "print('Data extraction started...')\n", 293 | "etl.run_extract_pipeline(args)\n", 294 | "print('Data extraction completed.')" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "7411f2dc", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "! 
gsutil ls {SERVING_INPUT_DATA_DIR}" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "1660a44e", 310 | "metadata": {}, 311 | "source": [ 312 | "### Submit the batch prediction job" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "id": "8878a244", 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "model_name = vertex_ai.Model.list(\n", 323 | " filter=f'display_name={MODEL_DISPLAY_NAME}',\n", 324 | " order_by='update_time')[-1].gca_resource.name" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "id": "4f262efa", 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "job_resources = {\n", 335 | " 'machine_type': 'n1-standard-2',\n", 336 | " #'accelerator_count': 1,\n", 337 | " #'accelerator_type': 'NVIDIA_TESLA_T4'\n", 338 | " 'starting_replica_count': 1,\n", 339 | " 'max_replica_count': 10,\n", 340 | "}\n", 341 | "\n", 342 | "job_display_name = f'{MODEL_DISPLAY_NAME}-prediction-job-{datetime.now().strftime(\"%Y%m%d%H%M%S\")}'\n", 343 | "\n", 344 | "vertex_ai.BatchPredictionJob.create(\n", 345 | " job_display_name=job_display_name,\n", 346 | " model_name=model_name,\n", 347 | " gcs_source=SERVING_INPUT_DATA_DIR + '/*.jsonl',\n", 348 | " gcs_destination_prefix=SERVING_OUTPUT_DATA_DIR,\n", 349 | " instances_format='jsonl',\n", 350 | " predictions_format='jsonl',\n", 351 | " sync=True,\n", 352 | " **job_resources,\n", 353 | ")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "id": "6d638b6f", 359 | "metadata": {}, 360 | "source": [ 361 | "## 3. Run the batch prediction pipeline using `Vertex Pipelines`" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "id": "ee5be402", 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "WORKSPACE = f'{BUCKET}/{DATASET_DISPLAY_NAME}/'\n", 372 | "MLMD_SQLLITE = 'mlmd.sqllite'\n", 373 | "ARTIFACT_STORE = os.path.join(WORKSPACE, 'tfx_artifacts')\n", 374 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-predict-pipeline'" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "f9b84c1e", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "os.environ['PROJECT'] = PROJECT_ID\n", 385 | "os.environ['REGION'] = REGION\n", 386 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 387 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n", 388 | "os.environ['ARTIFACT_STORE_URI'] = ARTIFACT_STORE\n", 389 | "os.environ['BATCH_PREDICTION_BQ_DATASET_NAME'] = SERVE_BQ_DATASET_NAME\n", 390 | "os.environ['BATCH_PREDICTION_BQ_TABLE_NAME'] = SERVE_BQ_TABLE_NAME\n", 391 | "os.environ['SERVE_LIMIT'] = '1000'\n", 392 | "os.environ['BEAM_RUNNER'] = 'DirectRunner'\n", 393 | "os.environ['TFX_IMAGE_URI'] = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "id": "58681dfe", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "import importlib\n", 404 | "from src.tfx_pipelines import config\n", 405 | "importlib.reload(config)\n", 406 | "\n", 407 | "for key, value in config.__dict__.items():\n", 408 | " if key.isupper(): print(f'{key}: {value}')" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "id": "d06a4091", 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "from src.tfx_pipelines import runner\n", 419 | "\n", 420 | "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n", 421 
| "pipeline_definition = runner.compile_prediction_pipeline(pipeline_definition_file)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "id": "b6ffceca", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "from kfp.v2.google.client import AIPlatformClient\n", 432 | "\n", 433 | "pipeline_client = AIPlatformClient(\n", 434 | " project_id=PROJECT_ID, region=REGION)\n", 435 | " \n", 436 | "pipeline_client.create_run_from_job_spec(\n", 437 | " job_spec_path=pipeline_definition_file\n", 438 | ")" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "id": "dd2efb1b", 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [] 448 | } 449 | ], 450 | "metadata": { 451 | "environment": { 452 | "name": "common-cpu.m73", 453 | "type": "gcloud", 454 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 455 | }, 456 | "kernelspec": { 457 | "display_name": "Python 3", 458 | "language": "python", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 471 | "version": "3.7.10" 472 | } 473 | }, 474 | "nbformat": 4, 475 | "nbformat_minor": 5 476 | } 477 | -------------------------------------------------------------------------------- /08-model-monitoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "39366395", 6 | "metadata": {}, 7 | "source": [ 8 | "# 08 - Model monitoring\n", 9 | "\n", 10 | "This notebook covers configuring model monitoring jobs for skew and drift detection:\n", 11 | "\n", 12 | "1. Set skew and drift threshold.\n", 13 | "2. Create a monitoring job for all the models on a `Endpoint` resource.\n", 14 | "3. List the monitoring jobs.\n", 15 | "4. List artifacts produced by monitoring job.\n", 16 | "5. Pause and delete the monitoring job." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "d7e55542", 22 | "metadata": {}, 23 | "source": [ 24 | "## Setup" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "35292bad", 30 | "metadata": {}, 31 | "source": [ 32 | "### Import libraries" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "41ba6e75", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import copy\n", 43 | "from datetime import datetime\n", 44 | "\n", 45 | "from google.protobuf.duration_pb2 import Duration\n", 46 | "from google.cloud import aiplatform as vertex_ai\n", 47 | "from google.cloud import aiplatform_v1beta1 as vertex_ai_beta" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "5279e949", 53 | "metadata": {}, 54 | "source": [ 55 | "### Setup Google Cloud project" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "272491a9", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 66 | "REGION = 'us-central1' # Change to your region.\n", 67 | "\n", 68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 69 | " # Get your GCP project id from gcloud\n", 70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 71 | " PROJECT_ID = shell_output[0]\n", 72 | "\n", 73 | "PARENT = f'projects/{PROJECT_ID}/locations/{REGION}'\n", 74 | "\n", 75 | "print('Project ID:', PROJECT_ID)\n", 76 | "print('Region:', REGION)\n", 77 | "print('Vertex API Parent URI:', PARENT)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "513388ee", 83 | "metadata": {}, 84 | "source": [ 85 | "### Set configurations" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "fb651770", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 96 | "ENDPOINT_DISPLAY_NAME = 'chicago-taxi-tips-classifier'\n", 97 | "MONITORING_JOB_NAME = f'monitor-{ENDPOINT_DISPLAY_NAME}'\n", 98 | "NOTIFY_EMAILS = '[your-email-address]'\n", 99 | "\n", 100 | "LOG_SAMPLE_RATE = 0.8\n", 101 | "MONITOR_INTERVAL = 3600\n", 102 | "TARGET_FEATURE_NAME = 'tip_bin'" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "ac7cb17f", 108 | "metadata": {}, 109 | "source": [ 110 | "## Create a Job Service client" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "bb896762", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "job_client_beta = vertex_ai_beta.JobServiceClient(\n", 121 | " client_options={'api_endpoint': f'{REGION}-aiplatform.googleapis.com'}\n", 122 | ")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "63bcde67", 128 | "metadata": {}, 129 | "source": [ 130 | "## 1. 
Set the skew and drift thresholds" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "3252edaa", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "SKEW_THRESHOLDS = {\n", 141 | " 'trip_month': 0.3,\n", 142 | " 'trip_day': 0.3,\n", 143 | " 'trip_day_of_week': 0.3,\n", 144 | " 'trip_hour': 0.3,\n", 145 | " 'trip_seconds': 0.3,\n", 146 | " 'trip_miles': 0.3,\n", 147 | " 'payment_type': 0.3,\n", 148 | " 'pickup_grid': 0.3,\n", 149 | " 'dropoff_grid': 0.3,\n", 150 | " 'euclidean': 0.3,\n", 151 | " 'loc_cross': 0.3, \n", 152 | "}\n", 153 | "\n", 154 | "DIRFT_THRESHOLDS = {\n", 155 | " 'trip_month': 0.3,\n", 156 | " 'trip_day': 0.3,\n", 157 | " 'trip_day_of_week': 0.3,\n", 158 | " 'trip_hour': 0.3,\n", 159 | " 'trip_seconds': 0.3,\n", 160 | " 'trip_miles': 0.3,\n", 161 | " 'payment_type': 0.3,\n", 162 | " 'pickup_grid': 0.3,\n", 163 | " 'dropoff_grid': 0.3,\n", 164 | " 'euclidean': 0.3,\n", 165 | " 'loc_cross': 0.3, \n", 166 | "}" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "adc333a3", 172 | "metadata": {}, 173 | "source": [ 174 | "## 2. Create a monitoring job" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "a40d14cb", 180 | "metadata": {}, 181 | "source": [ 182 | "### Retrieve the `Dataset`, `Model` and `Endpoint` resources to monitor" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "ed60fbff", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "dataset = vertex_ai.TabularDataset.list(\n", 193 | " filter=f'display_name={DATASET_DISPLAY_NAME}', \n", 194 | " order_by='update_time')[-1]\n", 195 | "\n", 196 | "bq_source_uri = dataset.gca_resource.metadata['inputConfig']['bigquerySource']['uri']\n", 197 | " \n", 198 | "endpoint = vertex_ai.Endpoint.list(\n", 199 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n", 200 | " order_by='update_time')[-1]\n", 201 | "\n", 202 | "endpoint_uri = endpoint.gca_resource.name\n", 203 | "\n", 204 | "model_ids = [model.id for model in endpoint.list_models()]" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "0b159368", 210 | "metadata": {}, 211 | "source": [ 212 | "### Configure the monitoring job" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "0370cb58", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "skew_thresholds = {\n", 223 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n", 224 | " for feature, value in SKEW_THRESHOLDS.items()\n", 225 | "}\n", 226 | "\n", 227 | "drift_thresholds = {\n", 228 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n", 229 | " for feature, value in DIRFT_THRESHOLDS.items()\n", 230 | "}\n", 231 | "\n", 232 | "skew_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(\n", 233 | " skew_thresholds=skew_thresholds\n", 234 | ")\n", 235 | "\n", 236 | "drift_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(\n", 237 | " drift_thresholds=drift_thresholds\n", 238 | ")\n", 239 | "\n", 240 | "sampling_config = vertex_ai_beta.SamplingStrategy(\n", 241 | " random_sample_config=vertex_ai_beta.SamplingStrategy.RandomSampleConfig(\n", 242 | " sample_rate=LOG_SAMPLE_RATE\n", 243 | " )\n", 244 | ")\n", 245 | "\n", 246 | "schedule_config = vertex_ai_beta.ModelDeploymentMonitoringScheduleConfig(\n", 247 | " monitor_interval=Duration(seconds=MONITOR_INTERVAL)\n", 248 | ")\n", 249 | "\n", 250 
| "training_dataset = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingDataset(\n", 251 | " target_field=TARGET_FEATURE_NAME,\n", 252 | " bigquery_source = vertex_ai_beta.types.io.BigQuerySource(\n", 253 | " input_uri=bq_source_uri\n", 254 | " )\n", 255 | ")\n", 256 | "\n", 257 | "\n", 258 | "objective_template = vertex_ai_beta.ModelDeploymentMonitoringObjectiveConfig(\n", 259 | " objective_config=vertex_ai_beta.ModelMonitoringObjectiveConfig(\n", 260 | " training_dataset=training_dataset,\n", 261 | " training_prediction_skew_detection_config=skew_config,\n", 262 | " prediction_drift_detection_config=drift_config,\n", 263 | " )\n", 264 | ")\n", 265 | "\n", 266 | "deployment_objective_configs = []\n", 267 | "for model_id in model_ids:\n", 268 | " objective_config = copy.deepcopy(objective_template)\n", 269 | " objective_config.deployed_model_id = model_id\n", 270 | " deployment_objective_configs.append(objective_config)\n", 271 | "\n", 272 | "alerting_config = vertex_ai_beta.ModelMonitoringAlertConfig(\n", 273 | " email_alert_config=vertex_ai_beta.ModelMonitoringAlertConfig.EmailAlertConfig(\n", 274 | " user_emails=NOTIFY_EMAILS\n", 275 | " )\n", 276 | ")\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "id": "f4b667db", 282 | "metadata": {}, 283 | "source": [ 284 | "### Instantiate a monitoring job" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "id": "5e4e0c9d", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "job = vertex_ai_beta.ModelDeploymentMonitoringJob(\n", 295 | " display_name=MONITORING_JOB_NAME,\n", 296 | " endpoint=endpoint_uri,\n", 297 | " model_deployment_monitoring_objective_configs=deployment_objective_configs,\n", 298 | " logging_sampling_strategy=sampling_config,\n", 299 | " model_deployment_monitoring_schedule_config=schedule_config,\n", 300 | " model_monitoring_alert_config=alerting_config,\n", 301 | ")" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "id": "4a66d1d5", 307 | "metadata": {}, 308 | "source": [ 309 | "### Submit the monitoring job for execution" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "id": "4a0e41b9", 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "response = job_client_beta.create_model_deployment_monitoring_job(\n", 320 | " parent=PARENT, model_deployment_monitoring_job=job\n", 321 | ")\n", 322 | "response" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "39352387", 328 | "metadata": {}, 329 | "source": [ 330 | "## 3. Get the monitoring job" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "bc47ef29", 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "monitoring_jobs = job_client_beta.list_model_deployment_monitoring_jobs(parent=PARENT)\n", 341 | "monitoring_job = [entry for entry in monitoring_jobs if entry.display_name == MONITORING_JOB_NAME][0]\n", 342 | "monitoring_job" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "id": "9dbb50ce", 348 | "metadata": {}, 349 | "source": [ 350 | "## 5. 
Pause the monitoring job" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "id": "cd6d295e", 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "job_client_beta.pause_model_deployment_monitoring_job(name=monitoring_job)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "37663c7e", 366 | "metadata": {}, 367 | "source": [ 368 | "## Delete the monitoring job" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "c3be1189", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "job_client_beta.delete_model_deployment_monitoring_job(name=monitoring_job)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "28159818", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [] 388 | } 389 | ], 390 | "metadata": { 391 | "environment": { 392 | "name": "common-cpu.m73", 393 | "type": "gcloud", 394 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 395 | }, 396 | "kernelspec": { 397 | "display_name": "Python 3", 398 | "language": "python", 399 | "name": "python3" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 3 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython3", 411 | "version": "3.7.10" 412 | } 413 | }, 414 | "nbformat": 4, 415 | "nbformat_minor": 5 416 | } 417 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/tfx-oss-public/tfx:0.30.0 2 | 3 | COPY requirements.txt requirements.txt 4 | 5 | RUN pip install -r requirements.txt 6 | 7 | COPY src/ src/ 8 | 9 | ENV PYTHONPATH="/pipeline:${PYTHONPATH}" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PLEASE USE THIS REPO INSTEAD: https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai 2 | 3 | 4 | # MLOps on Vertex AI 5 | 6 | This example implements the end-to-end [MLOps process](https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf) using the [Vertex AI](https://cloud.google.com/vertex-ai) platform and [Smart Analytics](https://cloud.google.com/solutions/smart-analytics) technology capabilities. The example uses [Keras](https://keras.io/) to implement the ML model, [TFX](https://www.tensorflow.org/tfx) to implement the training pipeline, and the [Model Builder SDK](https://github.com/googleapis/python-aiplatform/tree/569d4cd03e888fde0171f7b0060695a14f99b072/google/cloud/aiplatform) to interact with Vertex AI. 7 | 8 | 9 | MLOps lifecycle 10 | 11 | 12 | ## Getting started 13 | 14 | 1. [Set up the MLOps environment](provision) on Google Cloud. 15 | 2. Start your AI Notebook instance. 16 | 3. Open JupyterLab, then open a new Terminal. 17 | 4. Clone the repository to your AI Notebook instance: 18 | ``` 19 | git clone https://github.com/ksalama/ucaip-labs.git 20 | cd ucaip-labs 21 | ``` 22 | 5. Install the required Python packages: 23 | ``` 24 | pip install tfx==0.30.0 --user 25 | pip install -r requirements.txt --user 26 | ``` 27 | 6. Upgrade the `gcloud` components: 28 | ``` 29 | sudo apt-get install google-cloud-sdk 30 | gcloud components update 31 | ``` 32 | 33 | ## Dataset Management 34 | 35 | The [Chicago Taxi Trips](https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips) dataset is one of the [public datasets hosted with BigQuery](https://cloud.google.com/bigquery/public-data/), which includes taxi trips from 2013 to the present, reported to the City of Chicago in its role as a regulatory agency. The task is to predict whether a given trip will result in a tip > 20%. 36 | 37 | The [01-dataset-management](01-dataset-management.ipynb) notebook covers: 38 | 39 | 1. Performing exploratory data analysis on the data in `BigQuery`. 40 | 2. Creating a `Vertex AI` Dataset resource using the Python SDK. 41 | 3. Generating the schema for the raw data using [TensorFlow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv). 42 | 43 | 44 | ## ML Development 45 | 46 | We experiment with creating a [Custom Model](https://cloud.google.com/ai-platform-unified/docs/training/create-model-custom-training) using the [02-experimentation](02-experimentation.ipynb) notebook, which covers: 47 | 48 | 1. Preparing the data using `Dataflow`. 49 | 2. Implementing a `Keras` classification model. 50 | 3. Training the `Keras` model with `Vertex AI` using a [pre-built container](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers). 51 | 4. Uploading the exported model from `Cloud Storage` to `Vertex AI`. 52 | 5. Extracting and visualizing experiment parameters from [Vertex AI Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction). 53 | 54 | We use [Vertex TensorBoard](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview) 55 | and [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction) to track, visualize, and compare ML experiments. 56 | 57 | In addition, the training steps are formalized by implementing a [TFX pipeline](https://www.tensorflow.org/tfx). 
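As a rough illustration of that formalization, a single `TFX` component can be exercised interactively before it is wired into the full pipeline under [src/tfx_pipelines](src/tfx_pipelines). The snippet below is a simplified sketch rather than code taken from the notebooks: the BigQuery query is a placeholder (the real one is built by [src/common/datasource_utils.py](src/common/datasource_utils.py)), and import paths may differ slightly between TFX releases.

```python
# Minimal interactive sketch (assumed wiring, not the project's actual pipeline code).
from tfx.extensions.google_cloud_big_query.example_gen.component import BigQueryExampleGen
from tfx.components import StatisticsGen
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

# Keeps artifacts and ML Metadata in a temporary local workspace.
context = InteractiveContext()

# Ingest examples from BigQuery; running this also needs Beam args for the GCP project and a temp GCS location.
example_gen = BigQueryExampleGen(query="SELECT * FROM `my-project.my_dataset.my_table` LIMIT 1000")
context.run(example_gen)

# Compute descriptive statistics over the ingested examples.
statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])
context.run(statistics_gen)
```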
58 | The [03-training-formalization](03-training-formalization.ipynb) notebook covers implementing and testing the pipeline components interactively. 59 | 60 | ## Training Operationalization 61 | 62 | The [04-pipeline-deployment](04-pipeline-deployment.ipynb) notebook covers executing the CI/CD steps for the training pipeline deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [build/pipeline-deployment.yaml](build/pipeline-deployment.yaml) file, and consists of the following steps: 63 | 64 | 1. Clone the repository to the build environment. 65 | 2. Run unit tests. 66 | 3. Run a local e2e test of the `TFX` pipeline. 67 | 4. Build the ML container image for pipeline steps. 68 | 5. Compile the pipeline. 69 | 6. Upload the pipeline to `Cloud Storage`. 70 | 71 | ## Continuous Training 72 | 73 | After testing, compiling, and uploading the pipeline definition to `Cloud Storage`, the pipeline is executed in response to a trigger. 74 | We use [Cloud Functions](https://cloud.google.com/functions) and [Cloud Pub/Sub](https://cloud.google.com/pubsub) as the triggering mechanism. 75 | The `Cloud Function` listens to the `Pub/Sub` topic and runs the training pipeline when a message is sent to the topic. 76 | The `Cloud Function` is implemented in [src/pipeline_triggering](src/pipeline_triggering). 77 | 78 | The [05-continuous-training](05-continuous-training.ipynb) notebook covers: 79 | 80 | 1. Creating a Cloud `Pub/Sub` topic. 81 | 2. Deploying a `Cloud Function`. 82 | 3. Triggering the pipeline. 83 | 84 | The end-to-end TFX training pipeline implementation is in the [src/tfx_pipelines](src/tfx_pipelines) directory, which covers the following steps: 85 | 86 | 1. Receive hyperparameters using the `hyperparam_gen` custom Python component. 87 | 2. Extract data from `BigQuery` using the `BigQueryExampleGen` component. 88 | 3. Validate the raw data using the `StatisticsGen` and `ExampleValidator` components. 89 | 4. Process the data on `Dataflow` using the `Transform` component. 90 | 5. Train a custom model with `Vertex AI` using the `Trainer` component. 91 | 6. Evaluate and validate the custom model using the `ModelEvaluator` component. 92 | 7. Save the blessed model to the model registry location in `Cloud Storage` using the `Pusher` component. 93 | 8. Upload the model to `Vertex AI` using the `vertex_model_pusher` custom Python component. 94 | 95 | 96 | ## Model Deployment 97 | 98 | The [06-model-deployment](06-model-deployment.ipynb) notebook covers executing the CI/CD steps for the model deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [build/model-deployment.yaml](build/model-deployment.yaml) 99 | file, and consists of the following steps: 100 | 101 | 1. Test the model interface. 102 | 2. Create an endpoint in `Vertex AI`. 103 | 3. Deploy the model to the `endpoint`. 104 | 4. Test the `Vertex AI` endpoint. 105 | 106 | ## Prediction Serving 107 | 108 | We serve the deployed model for prediction. 109 | The [07-prediction-serving](07-prediction-serving.ipynb) notebook covers: 110 | 111 | 1. Use the `Vertex AI` endpoint for online prediction. 112 | 2. Use the `Vertex AI` uploaded model for batch prediction. 113 | 3. Run the batch prediction using `Vertex Pipelines`. 114 | 115 | ## Model Monitoring 116 | 117 | After a model is deployed for prediction serving, continuous monitoring is set up to ensure that the model continues to perform as expected. 
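For orientation, the core of what that monitoring setup does with the Vertex AI SDK looks roughly like the following. This is a condensed sketch of the calls shown in `08-model-monitoring.ipynb` above, not a drop-in script: it assumes `vertex_ai_beta` refers to the `google.cloud.aiplatform_v1beta1` module and that a `JobServiceClient` is created for the project's region, while the endpoint name, deployed model IDs, feature list, and 0.3 threshold are illustrative placeholders.

```python
from google.cloud import aiplatform_v1beta1 as vertex_ai_beta

PROJECT, REGION = "my-project", "us-central1"       # placeholders
PARENT = f"projects/{PROJECT}/locations/{REGION}"
endpoint_uri = f"{PARENT}/endpoints/1234567890"     # endpoint to monitor (placeholder ID)
model_ids = ["9876543210"]                          # deployed model IDs on that endpoint (placeholders)

# One threshold per monitored feature: alert when the skew/drift statistic exceeds 0.3.
thresholds = {f: vertex_ai_beta.ThresholdConfig(value=0.3) for f in ["trip_miles", "trip_seconds"]}

objective_config = vertex_ai_beta.ModelMonitoringObjectiveConfig(
    training_prediction_skew_detection_config=vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(
        skew_thresholds=thresholds
    ),
    prediction_drift_detection_config=vertex_ai_beta.ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(
        drift_thresholds=thresholds
    ),
)

# The notebook also attaches the BigQuery training dataset, a logging sampling rate,
# a monitoring schedule, and e-mail alerting; they are omitted here for brevity.
job = vertex_ai_beta.ModelDeploymentMonitoringJob(
    display_name="chicago-taxi-monitoring",
    endpoint=endpoint_uri,
    model_deployment_monitoring_objective_configs=[
        vertex_ai_beta.ModelDeploymentMonitoringObjectiveConfig(
            deployed_model_id=model_id, objective_config=objective_config
        )
        for model_id in model_ids
    ],
)

job_client = vertex_ai_beta.JobServiceClient(
    client_options={"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
)
job_client.create_model_deployment_monitoring_job(parent=PARENT, model_deployment_monitoring_job=job)
```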
118 | The [08-model-monitoring](08-model-monitoring.ipynb) notebook covers configuring [Vertex AI Model Monitoring](https://cloud.google.com/vertex-ai/docs/model-monitoring/overview) for skew and drift detection: 119 | 120 | 1. Set the skew and drift thresholds. 121 | 2. Create a monitoring job for all the models under an endpoint. 122 | 3. List the monitoring jobs. 123 | 4. List the artifacts produced by the monitoring job. 124 | 5. Pause and delete the monitoring job. 125 | 126 | 127 | ## Metadata Tracking 128 | 129 | You can view the parameters and metrics logged by your experiments, as well as the artifacts and metadata stored by 130 | your `Vertex Pipelines` in [Cloud Console](https://console.cloud.google.com/vertex-ai/metadata). 131 | 132 | ## Disclaimer 133 | 134 | This is not an official Google product but sample code provided for educational purposes. 135 | 136 | --- 137 | 138 | Copyright 2021 Google LLC. 139 | 140 | Licensed under the Apache License, Version 2.0 (the "License"); 141 | you may not use this file except in compliance with the License. 142 | You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0 143 | 144 | Unless required by applicable law or agreed to in writing, software 145 | distributed under the License is distributed on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 147 | See the License for the specific language governing permissions and 148 | limitations under the License. 149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/tfx-oss-public/tfx:0.30.0 2 | 3 | RUN pip install -U pip 4 | RUN pip install google-cloud-aiplatform==1.1.1 google-cloud-aiplatform[tensorboard] 5 | RUN pip install pytest kfp==1.6.2 google-cloud-bigquery==2.20.0 google-cloud-bigquery-storage==2.4.0 -------------------------------------------------------------------------------- /build/model-deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ###################################################################### 16 | # CI/CD steps for Cloud Build to test and deploy a model to Vertex AI. 17 | ###################################################################### 18 | 19 | steps: 20 | 21 | # Clone the repository. 22 | - name: 'gcr.io/cloud-builders/git' 23 | args: ['clone', '--single-branch', '--branch', 24 | '$_BRANCH', '$_REPO_URL', 25 | '--depth', '1', 26 | '--verbose'] 27 | id: 'Clone Repository' 28 | 29 | # Test uploaded model artifact. 
30 | - name: '$_CICD_IMAGE_URI' 31 | entrypoint: 'pytest' 32 | args: ['src/tests/model_deployment_tests.py::test_model_artifact'] 33 | dir: 'ucaip-labs' 34 | env: 35 | - 'PROJECT=$_PROJECT' 36 | - 'REGION=$_REGION' 37 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 38 | id: 'Test Model Artifact' 39 | waitFor: ['Clone Repository'] 40 | 41 | # Create an endpoint. 42 | - name: '$_CICD_IMAGE_URI' 43 | entrypoint: 'python' 44 | args: ['build/utils.py', 45 | '--mode', 'create-endpoint', 46 | '--project', '$_PROJECT', 47 | '--region', '$_REGION', 48 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME'] 49 | dir: 'ucaip-labs' 50 | id: 'Create Endpoint' 51 | waitFor: ['Test Model Artifact'] 52 | 53 | # Deploy the model. 54 | - name: '$_CICD_IMAGE_URI' 55 | entrypoint: 'python' 56 | args: ['build/utils.py', 57 | '--mode', 'deploy-model', 58 | '--project', '$_PROJECT', 59 | '--region', '$_REGION', 60 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME', 61 | '--model-display-name', '$_MODEL_DISPLAY_NAME' 62 | ] 63 | dir: 'ucaip-labs' 64 | id: 'Deploy Model' 65 | waitFor: ['Create Endpoint'] 66 | 67 | # Test deployed model endpoint. 68 | - name: '$_CICD_IMAGE_URI' 69 | entrypoint: 'pytest' 70 | args: ['src/tests/model_deployment_tests.py::test_model_endpoint'] 71 | dir: 'ucaip-labs' 72 | env: 73 | - 'PROJECT=$_PROJECT' 74 | - 'REGION=$_REGION' 75 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 76 | - 'ENDPOINT_DISPLAY_NAME=$_ENDPOINT_DISPLAY_NAME' 77 | id: 'Test Model Endpoint' 78 | waitFor: ['Deploy Model'] -------------------------------------------------------------------------------- /build/pipeline-deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ############################################################################# 16 | # CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI. 17 | ############################################################################# 18 | 19 | steps: 20 | 21 | # Clone the repository. 22 | - name: 'gcr.io/cloud-builders/git' 23 | args: ['clone', '--single-branch', '--branch', 24 | '$_BRANCH', '$_REPO_URL', 25 | '--depth', '1', 26 | '--verbose'] 27 | id: 'Clone Repository' 28 | 29 | 30 | # Run datasource_utils unit tests. 31 | - name: '$_CICD_IMAGE_URI' 32 | entrypoint: 'pytest' 33 | args: ['src/tests/datasource_utils_tests.py', '-s'] 34 | dir: 'ucaip-labs' 35 | env: 36 | - 'PROJECT_ID=$_PROJECT_ID' 37 | - 'BQ_LOCATION=$_BQ_LOCATION' 38 | - 'BQ_DATASET_NAME=$_BQ_DATASET_NAME' 39 | - 'BQ_TABLE_NAME=$_BQ_TABLE_NAME' 40 | id: 'Unit Test Datasource Utils' 41 | waitFor: ['Clone Repository'] 42 | 43 | 44 | # Run model unit tests. 
45 | - name: '$_CICD_IMAGE_URI' 46 | entrypoint: 'pytest' 47 | args: ['src/tests/model_tests.py', '-s'] 48 | dir: 'ucaip-labs' 49 | id: 'Unit Test Model' 50 | waitFor: ['Clone Repository'] 51 | timeout: 1800s 52 | 53 | 54 | # Test e2e pipeline using local runner. 55 | - name: '$_CICD_IMAGE_URI' 56 | entrypoint: 'pytest' 57 | args: ['src/tests/pipeline_deployment_tests.py::test_e2e_pipeline', '-s'] 58 | dir: 'ucaip-labs' 59 | env: 60 | - 'PROJECT_ID=$_PROJECT_ID' 61 | - 'REGION=$_REGION' 62 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 63 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME' 64 | - 'GCS_LOCATION=$_TEST_GCS_LOCATION' 65 | - 'TRAIN_LIMIT=$_CI_TRAIN_LIMIT' 66 | - 'TEST_LIMIT=$_CI_TEST_LIMIT' 67 | - 'UPLOAD_MODEL=$_CI_UPLOAD_MODEL' 68 | - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD' 69 | id: 'Local Test E2E Pipeline' 70 | waitFor: ['Unit Test Datasource Utils', 'Unit Test Model'] 71 | timeout: 1800s 72 | 73 | 74 | # Build the image that encapsulates the pipeline. 75 | - name: 'gcr.io/cloud-builders/docker' 76 | args: ['build', '-t', '$_TFX_IMAGE_URI', '.'] 77 | dir: 'ucaip-labs' 78 | id: 'Build TFX Image' 79 | waitFor: ['Local Test E2E Pipeline'] 80 | 81 | 82 | # Compile the pipeline. 83 | - name: '$_CICD_IMAGE_URI' 84 | entrypoint: 'python' 85 | args: ['build/utils.py', 86 | '--mode', 'compile-pipeline', 87 | '--pipeline-name', '$_PIPELINE_NAME' 88 | ] 89 | dir: 'ucaip-labs' 90 | env: 91 | - 'PROJECT_ID=$_PROJECT_ID' 92 | - 'REGION=$_REGION' 93 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 94 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME' 95 | - 'GCS_LOCATION=$_GCS_LOCATION' 96 | - 'TFX_IMAGE_URI=$_TFX_IMAGE_URI' 97 | - 'BEAM_RUNNER=$_BEAM_RUNNER' 98 | - 'TRAINING_RUNNER=$_TRAINING_RUNNER' 99 | id: 'Compile Pipeline' 100 | waitFor: ['Local Test E2E Pipeline'] 101 | 102 | 103 | # Upload compiled pipeline to GCS. 104 | - name: 'gcr.io/cloud-builders/gsutil' 105 | args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE'] 106 | dir: 'ucaip-labs' 107 | id: 'Upload Pipeline to GCS' 108 | waitFor: ['Compile Pipeline'] 109 | 110 | 111 | # Push TFX Image to Container Registy. 112 | images: ['$_TFX_IMAGE_URI'] 113 | -------------------------------------------------------------------------------- /build/serving_resources_spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "traffic_percentage": 100, 3 | "machine_type": "n1-standard-2", 4 | "min_replica_count": 1, 5 | "max_replica_count": 1, 6 | "accelerator_type": null, 7 | "accelerator_count": null 8 | } -------------------------------------------------------------------------------- /build/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for deploying pipelines and models to Vertex AI.""" 15 | 16 | 17 | import argparse 18 | import os 19 | import sys 20 | import logging 21 | import json 22 | 23 | from google.cloud import aiplatform as vertex_ai 24 | 25 | 26 | SCRIPT_DIR = os.path.dirname( 27 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 28 | ) 29 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 30 | 31 | SERVING_SPEC_FILEPATH = 'build/serving_resources_spec.json' 32 | 33 | def get_args(): 34 | """Define an parse commandline arguments.""" 35 | 36 | parser = argparse.ArgumentParser() 37 | 38 | parser.add_argument( 39 | '--mode', 40 | type=str, 41 | ) 42 | 43 | parser.add_argument( 44 | '--project', 45 | type=str, 46 | ) 47 | 48 | parser.add_argument( 49 | '--region', 50 | type=str, 51 | ) 52 | 53 | parser.add_argument( 54 | '--endpoint-display-name', 55 | type=str, 56 | ) 57 | 58 | parser.add_argument( 59 | '--model-display-name', 60 | type=str, 61 | ) 62 | 63 | parser.add_argument( 64 | '--pipeline-name', 65 | type=str, 66 | ) 67 | 68 | return parser.parse_args() 69 | 70 | 71 | def create_endpoint(project, region, endpoint_display_name): 72 | """Create a Vertex endpoint.""" 73 | 74 | logging.info(f"Creating endpoint {endpoint_display_name}") 75 | vertex_ai.init( 76 | project=project, 77 | location=region 78 | ) 79 | 80 | endpoints = vertex_ai.Endpoint.list( 81 | filter=f'display_name={endpoint_display_name}', 82 | order_by="update_time") 83 | 84 | if len(endpoints) > 0: 85 | logging.info(f"Endpoint {endpoint_display_name} already exists.") 86 | endpoint = endpoints[-1] 87 | else: 88 | endpoint = vertex_ai.Endpoint.create(endpoint_display_name) 89 | logging.info(f"Endpoint is ready.") 90 | logging.info(endpoint.gca_resource) 91 | return endpoint 92 | 93 | 94 | def deploy_model(project, region, endpoint_display_name, model_display_name, serving_resources_spec): 95 | """Deploy a model to a Vertex endpoint.""" 96 | 97 | logging.info(f"Deploying model {model_display_name} to endpoint {endpoint_display_name}") 98 | vertex_ai.init( 99 | project=project, 100 | location=region 101 | ) 102 | 103 | model = vertex_ai.Model.list( 104 | filter=f'display_name={model_display_name}', 105 | order_by="update_time" 106 | )[-1] 107 | 108 | endpoint = vertex_ai.Endpoint.list( 109 | filter=f'display_name={endpoint_display_name}', 110 | order_by="update_time" 111 | )[-1] 112 | 113 | deployed_model = endpoint.deploy(model=model, **serving_resources_spec) 114 | logging.info(f"Model is deployed.") 115 | logging.info(deployed_model) 116 | return deployed_model 117 | 118 | 119 | def compile_pipeline(pipeline_name): 120 | """Create a .json file with the pipeline definition.""" 121 | 122 | from src.tfx_pipelines import runner 123 | pipeline_definition_file = f"{pipeline_name}.json" 124 | pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file) 125 | return pipeline_definition 126 | 127 | 128 | 129 | def main(): 130 | args = get_args() 131 | 132 | if args.mode == 'create-endpoint': 133 | if not args.project: 134 | raise ValueError("project must be supplied.") 135 | if not args.region: 136 | raise ValueError("region must be supplied.") 137 | if not args.endpoint_display_name: 138 | raise ValueError("endpoint_display_name must be supplied.") 139 | 140 | result = create_endpoint( 141 | args.project, 142 | args.region, 143 | args.endpoint_display_name 144 | ) 145 | 146 | elif args.mode == 'deploy-model': 147 | if not args.project: 148 | raise ValueError("project must be 
supplied.") 149 | if not args.region: 150 | raise ValueError("region must be supplied.") 151 | if not args.endpoint_display_name: 152 | raise ValueError("endpoint-display-name must be supplied.") 153 | if not args.model_display_name: 154 | raise ValueError("model-display-name must be supplied.") 155 | 156 | with open(SERVING_SPEC_FILEPATH) as json_file: 157 | serving_resources_spec = json.load(json_file) 158 | logging.info(f"serving resources: {serving_resources_spec}") 159 | result = deploy_model( 160 | args.project, 161 | args.region, 162 | args.endpoint_display_name, 163 | args.model_display_name, 164 | serving_resources_spec 165 | ) 166 | 167 | elif args.mode == 'compile-pipeline': 168 | if not args.pipeline_name: 169 | raise ValueError("pipeline-name must be supplied.") 170 | 171 | result = compile_pipeline(args.pipeline_name) 172 | 173 | else: 174 | raise ValueError(f"Invalid mode {args.mode}.") 175 | 176 | logging.info(result) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | 182 | -------------------------------------------------------------------------------- /mlops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/mlops.png -------------------------------------------------------------------------------- /provision/README.md: -------------------------------------------------------------------------------- 1 | # Creating a Vertex environment 2 | 3 | You can use the [Terraform](https://www.terraform.io/) scripts in the `terraform` folder to automatically provision the environment required by the samples. 4 | 5 | The scripts perform the following actions: 6 | 7 | 1. Enable the required Cloud APIs 8 | * **Essentials**: compute, iam, iamcredentials 9 | * **ML**: notebooks, aiplatform 10 | * **Data**: dataflow, bigquery, bigquerydatatransfer 11 | * **CI/CD**: cloudbuild, container, artifactregistry 12 | * **Operations**: cloudtrace, monitoring, logging, cloudresourcemanager 13 | 2. Create a regional GCS bucket. 14 | 3. Create an instance of Vertex Notebooks. 15 | 4. Create service accounts for Vertex Training and Vertex Pipelines. 16 | 17 | You can customize your configuration using the following variables: 18 | 19 | |Variable|Required|Default|Description| 20 | |--------|--------|-------|-----------| 21 | |name_prefix|Yes||Prefix added to the names of provisioned resources. **The prefix should start with a letter and include letters and digits only**.| 22 | |project_id|Yes||GCP project ID| 23 | |network_name|No|default|Name of the network for the Notebook instance. The network must already exist.| 24 | |subnet_name|No|default|Name of the subnet for the Notebook instance. The subnet must already exist.| 25 | |subnet_region|No|us-central1|Region where the subnet was created.| 26 | |zone|Yes||GCP zone for the Notebook instance. The zone must be in the region defined in the `subnet_region` variable| 27 | |machine_type|No|n1-standard-4|Machine type of the Notebook instance| 28 | |boot_disk_size|No|200GB|Size of the Notebook instance's boot disk| 29 | |image_family|No|tf-2-4-cpu|Image family for the Notebook instance| 30 | |gpu_type|No|null|GPU type of the Notebook instance. 
By default, the Notebook instance will be provisioned without a GPU| 31 | |gpu_count|No|null|GPU count of the Notebook instance| 32 | |install_gpu_driver|No|false|Whether to install a GPU driver| 33 | |region|No|Set to subnet_region.|GCP region for the GCS bucket and Artifact Registry. It is recommended that the same region is used for all: the bucket, the registry and the Notebook instance. If not provided the `egion` will be set to `subnet_region`.| 34 | |force_destroy|No|false|Whether to force the removal of the bucket on terraform destroy. **Note that by default the bucket will not be destroyed**.| 35 | 36 | 37 | To provision the environment: 38 | 39 | 1. Open [Cloud Shell](https://cloud.google.com/shell/docs/launching-cloud-shell) 40 | 41 | 2. Download the installation scripts 42 | ``` 43 | SRC_REPO=https://github.com/ksalama/ucaip-labs 44 | LOCAL_DIR=provision 45 | kpt pkg get $SRC_REPO/provision@main $LOCAL_DIR 46 | cd $LOCAL_DIR/terraform 47 | ``` 48 | 49 | 3. Update the `terraform.tfvars` file with the values reflecting your environment. Alternatively, you can provide the values using the Terraform CLI `-var` options when you execute `terraform apply` in the next step 50 | 51 | 4. Execute the following commands. : 52 | ``` 53 | terraform init 54 | terraform apply 55 | ``` 56 | 57 | 58 | To destroy the environment, execute: 59 | ``` 60 | terraform destroy 61 | ``` 62 | -------------------------------------------------------------------------------- /provision/terraform/gcs-bucket.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | resource "google_storage_bucket" "artifact_repo" { 18 | project = module.project-services.project_id 19 | name = "${var.name_prefix}-bucket" 20 | location = local.region 21 | storage_class = local.bucket_type 22 | force_destroy = var.force_destroy 23 | } -------------------------------------------------------------------------------- /provision/terraform/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_version = ">= 0.14" 17 | required_providers { 18 | google = "~> 3.6" 19 | } 20 | } 21 | 22 | provider "google" { 23 | project = var.project_id 24 | } 25 | 26 | data "google_project" "project" { 27 | project_id = var.project_id 28 | } 29 | 30 | locals { 31 | bucket_type = "REGIONAL" 32 | region = var.region == null ? var.subnet_region : var.region 33 | } 34 | 35 | 36 | -------------------------------------------------------------------------------- /provision/terraform/notebook-instance.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | image_project = "deeplearning-platform-release" 17 | } 18 | 19 | data "google_compute_network" "vm_network" { 20 | project = module.project-services.project_id 21 | name = var.network_name 22 | 23 | depends_on = [ 24 | module.project-services 25 | ] 26 | } 27 | 28 | data "google_compute_subnetwork" "vm_subnetwork" { 29 | project = module.project-services.project_id 30 | name = var.subnet_name 31 | region = var.subnet_region 32 | 33 | depends_on = [ 34 | module.project-services 35 | ] 36 | } 37 | 38 | resource "google_notebooks_instance" "notebook_instance" { 39 | project = module.project-services.project_id 40 | name = "${var.name_prefix}-notebook" 41 | machine_type = var.machine_type 42 | location = var.zone 43 | 44 | network = data.google_compute_network.vm_network.id 45 | subnet = data.google_compute_subnetwork.vm_subnetwork.id 46 | 47 | vm_image { 48 | project = local.image_project 49 | image_family = var.image_family 50 | } 51 | 52 | dynamic accelerator_config { 53 | for_each = var.gpu_type != null ? [1] : [] 54 | content { 55 | type = var.gpu_type 56 | core_count = var.gpu_count 57 | } 58 | } 59 | 60 | install_gpu_driver = var.install_gpu_driver 61 | 62 | boot_disk_size_gb = var.boot_disk_size 63 | } 64 | -------------------------------------------------------------------------------- /provision/terraform/service-accounts.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # Create Vertex Training service account 16 | resource "google_service_account" "training_sa" { 17 | project = module.project-services.project_id 18 | account_id = var.training_sa_name 19 | display_name = "Vertex Training service account" 20 | } 21 | 22 | # Create Vertex Training SA role bindings 23 | resource "google_project_iam_member" "training_sa_role_bindings" { 24 | project = module.project-services.project_id 25 | for_each = toset(var.training_sa_roles) 26 | member = "serviceAccount:${google_service_account.training_sa.email}" 27 | role = "roles/${each.value}" 28 | } 29 | 30 | # Create Vertex Pipelines service account 31 | resource "google_service_account" "pipelines_sa" { 32 | project = module.project-services.project_id 33 | account_id = var.pipelines_sa_name 34 | display_name = "Vertex Pipelines account name" 35 | } 36 | 37 | # Create Vertex Pipelines SA role bindings 38 | resource "google_project_iam_member" "role_bindings" { 39 | project = module.project-services.project_id 40 | for_each = toset(var.pipelines_sa_roles) 41 | member = "serviceAccount:${google_service_account.pipelines_sa.email}" 42 | role = "roles/${each.value}" 43 | } 44 | -------------------------------------------------------------------------------- /provision/terraform/services.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | module "project-services" { 17 | source = "terraform-google-modules/project-factory/google//modules/project_services" 18 | 19 | project_id = data.google_project.project.project_id 20 | 21 | disable_services_on_destroy = false 22 | activate_apis = [ 23 | "compute.googleapis.com", 24 | "iam.googleapis.com", 25 | "container.googleapis.com", 26 | "artifactregistry.googleapis.com", 27 | "cloudresourcemanager.googleapis.com", 28 | "cloudtrace.googleapis.com", 29 | "iamcredentials.googleapis.com", 30 | "monitoring.googleapis.com", 31 | "logging.googleapis.com", 32 | "notebooks.googleapis.com", 33 | "aiplatform.googleapis.com", 34 | "dataflow.googleapis.com", 35 | "bigquery.googleapis.com", 36 | "cloudbuild.googleapis.com", 37 | "bigquerydatatransfer.googleapis.com", 38 | ] 39 | } -------------------------------------------------------------------------------- /provision/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "vertex-mlops" 2 | subnet_region = "us-central1" 3 | zone = "us-central1-a" 4 | name_prefix = "vertex-mlops" 5 | machine_type = "n1-standard-8" 6 | #gpu_type = "NVIDIA_TESLA_T4" 7 | #gpu_count = 1 8 | #install_gpu_driver = true 9 | #image_family = "tf-2-4-gpu" 10 | 11 | 12 | -------------------------------------------------------------------------------- /provision/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | variable "project_id" { 18 | description = "The GCP project ID" 19 | type = string 20 | } 21 | 22 | variable "region" { 23 | description = "The region for the GCS bucket and Artifact Registry" 24 | type = string 25 | default = null 26 | } 27 | 28 | variable "zone" { 29 | description = "The zone for a Vertex Notebook instance" 30 | type = string 31 | } 32 | 33 | variable "name_prefix" { 34 | description = "The name prefix to add to the resource names" 35 | type = string 36 | } 37 | 38 | variable "machine_type" { 39 | description = "The Notebook instance's machine type" 40 | type = string 41 | } 42 | 43 | variable "network_name" { 44 | description = "The network name for the Notebook instance" 45 | type = string 46 | default = "default" 47 | } 48 | 49 | variable "subnet_name" { 50 | description = "The subnet name for the Notebook instance" 51 | type = string 52 | default = "default" 53 | } 54 | 55 | variable "subnet_region" { 56 | description = "The region for the Notebook subnet" 57 | type = string 58 | default = "us-central1" 59 | } 60 | 61 | variable "boot_disk_size" { 62 | description = "The size of the boot disk" 63 | default = 200 64 | } 65 | 66 | variable "image_family" { 67 | description = "A Deep Learning image family for the Notebook instance" 68 | type = string 69 | default = "tf-2-4-cpu" 70 | } 71 | 72 | variable "gpu_type" { 73 | description = "A GPU type for the Notebook instance" 74 | type = string 75 | default = null 76 | } 77 | 78 | variable "gpu_count" { 79 | description = "A GPU count for the Notebook instance" 80 | type = string 81 | default = null 82 | } 83 | 84 | variable "install_gpu_driver" { 85 | description = "Whether to install GPU driver" 86 | type = bool 87 | default = false 88 | } 89 | 90 | variable "force_destroy" { 91 | description = "Whether to remove the bucket on destroy" 92 | type = bool 93 | default = false 94 | } 95 | 96 | variable "training_sa_roles" { 97 | description = "The roles to assign to the Vertex Training service account" 98 | default = [ 99 | "storage.admin", 100 | "aiplatform.user", 101 | "bigquery.admin" 102 | ] 103 | } 104 | 105 | variable "pipelines_sa_roles" { 106 | description = "The roles to assign to the Vertex Pipelines service account" 107 | default = [ 108 | "storage.admin", 109 | "bigquery.admin", 110 | "aiplatform.user" 111 | ] 112 | } 113 | 114 | variable "training_sa_name" { 115 | description = "Vertex training service account name." 116 | default = "training-sa" 117 | } 118 | 119 | variable "pipelines_sa_name" { 120 | description = "Vertex pipelines service account name." 
121 | default = "pipelines-sa" 122 | } 123 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | kfp==1.6.2 2 | google-cloud-bigquery==2.20.0 3 | google-cloud-bigquery-storage==2.4.0 4 | google-cloud-aiplatform==1.1.1 5 | google-auth==1.30.1 6 | google-auth-oauthlib==0.4.4 7 | google-auth-httplib2==0.1.0 8 | oauth2client==4.1.3 9 | requests==2.25.1 10 | pytest 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | REQUIRED_PACKAGES = [ 4 | "google-cloud-aiplatform==1.0.0", 5 | "tensorflow-transform==0.30.0", 6 | "tensorflow-data-validation==0.30.0", 7 | ] 8 | 9 | setuptools.setup( 10 | name="executor", 11 | version="0.0.1", 12 | install_requires=REQUIRED_PACKAGES, 13 | packages=setuptools.find_packages(), 14 | include_package_data=True, 15 | package_data={"src": ["raw_schema/schema.pbtxt"]}, 16 | ) 17 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/__init__.py -------------------------------------------------------------------------------- /src/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/common/__init__.py -------------------------------------------------------------------------------- /src/common/datasource_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for generating BigQuery data querying scirpts.""" 15 | 16 | 17 | from google.cloud import aiplatform as vertex_ai 18 | 19 | 20 | def _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit=None): 21 | 22 | query = f""" 23 | SELECT 24 | IF(trip_month IS NULL, -1, trip_month) trip_month, 25 | IF(trip_day IS NULL, -1, trip_day) trip_day, 26 | IF(trip_day_of_week IS NULL, -1, trip_day_of_week) trip_day_of_week, 27 | IF(trip_hour IS NULL, -1, trip_hour) trip_hour, 28 | IF(trip_seconds IS NULL, -1, trip_seconds) trip_seconds, 29 | IF(trip_miles IS NULL, -1, trip_miles) trip_miles, 30 | IF(payment_type IS NULL, 'NA', payment_type) payment_type, 31 | IF(pickup_grid IS NULL, 'NA', pickup_grid) pickup_grid, 32 | IF(dropoff_grid IS NULL, 'NA', dropoff_grid) dropoff_grid, 33 | IF(euclidean IS NULL, -1, euclidean) euclidean, 34 | IF(loc_cross IS NULL, 'NA', loc_cross) loc_cross""" 35 | if ml_use: 36 | query += f""", 37 | tip_bin 38 | FROM {bq_dataset_name}.{bq_table_name} 39 | WHERE ML_use = '{ml_use}' 40 | """ 41 | else: 42 | query += f""" 43 | FROM {bq_dataset_name}.{bq_table_name} 44 | """ 45 | if limit: 46 | query += f"LIMIT {limit}" 47 | 48 | return query 49 | 50 | 51 | def get_training_source_query( 52 | project, region, dataset_display_name, ml_use="UNASSIGNED", limit=None 53 | ): 54 | """Generates a BigQuery SELECT statement for the training data.""" 55 | 56 | dataset = vertex_ai.TabularDataset.list( 57 | filter=f"display_name={dataset_display_name}", order_by="update_time" 58 | )[-1] 59 | bq_source_uri = dataset.gca_resource.metadata["inputConfig"]["bigquerySource"][ 60 | "uri" 61 | ] 62 | _, bq_dataset_name, bq_table_name = bq_source_uri.replace("bq://", "").split(".") 63 | 64 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit) 65 | 66 | 67 | def get_serving_source_query(bq_dataset_name, bq_table_name, limit=None): 68 | """Generates a BigQuery SELECT statement for the training data.""" 69 | 70 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use=None, limit=limit) 71 | -------------------------------------------------------------------------------- /src/common/features.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Model features metadata utils.""" 15 | 16 | 17 | FEATURE_NAMES = [ 18 | "trip_month", 19 | "trip_day", 20 | "trip_day_of_week", 21 | "trip_hour", 22 | "trip_seconds", 23 | "trip_miles", 24 | "payment_type", 25 | "pickup_grid", 26 | "dropoff_grid", 27 | "euclidean", 28 | "loc_cross", 29 | ] 30 | 31 | TARGET_FEATURE_NAME = "tip_bin" 32 | 33 | TARGET_LABELS = ["tip<20%", "tip>=20%"] 34 | 35 | NUMERICAL_FEATURE_NAMES = [ 36 | "trip_seconds", 37 | "trip_miles", 38 | "euclidean", 39 | ] 40 | 41 | EMBEDDING_CATEGORICAL_FEATURES = { 42 | "trip_month": 2, 43 | "trip_day": 4, 44 | "trip_hour": 3, 45 | "pickup_grid": 3, 46 | "dropoff_grid": 3, 47 | "loc_cross": 10, 48 | } 49 | 50 | ONEHOT_CATEGORICAL_FEATURE_NAMES = ["payment_type", "trip_day_of_week"] 51 | 52 | 53 | def transformed_name(key: str) -> str: 54 | """Generate the name of the transformed feature from original name.""" 55 | return f"{key}_xf" 56 | 57 | 58 | def original_name(key: str) -> str: 59 | """Generate the name of the original feature from transformed name.""" 60 | return key.replace("_xf", "") 61 | 62 | 63 | def vocabulary_name(key: str) -> str: 64 | """Generate the name of the vocabulary feature from original name.""" 65 | return f"{key}_vocab" 66 | 67 | 68 | def categorical_feature_names() -> list: 69 | return ( 70 | list(EMBEDDING_CATEGORICAL_FEATURES.keys()) + ONEHOT_CATEGORICAL_FEATURE_NAMES 71 | ) 72 | 73 | 74 | def generate_explanation_config(): 75 | explanation_config = { 76 | "inputs": {}, 77 | "outputs": {}, 78 | "params": {"sampled_shapley_attribution": {"path_count": 10}}, 79 | } 80 | 81 | for feature_name in FEATURE_NAMES: 82 | if feature_name in NUMERICAL_FEATURE_NAMES: 83 | explanation_config["inputs"][feature_name] = { 84 | "input_tensor_name": feature_name, 85 | "modality": "numeric", 86 | } 87 | else: 88 | explanation_config["inputs"][feature_name] = { 89 | "input_tensor_name": feature_name, 90 | "modality": "categorical", 91 | } 92 | 93 | explanation_config["outputs"] = {"scores": {"output_tensor_name": "scores"}} 94 | 95 | return explanation_config 96 | -------------------------------------------------------------------------------- /src/model_training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/model_training/__init__.py -------------------------------------------------------------------------------- /src/model_training/data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Functions for reading data as tf.data.Dataset.""" 15 | 16 | import tensorflow as tf 17 | 18 | from src.common import features 19 | 20 | 21 | def _gzip_reader_fn(filenames: list): 22 | """Returns a record reader that can read gzip'ed files.""" 23 | return tf.data.TFRecordDataset(filenames, compression_type="GZIP") 24 | 25 | 26 | def get_dataset( 27 | file_pattern: str, 28 | feature_spec: dict, 29 | batch_size: int = 200, 30 | upsampling_factor: float = 2.0, 31 | ): 32 | """Generates features and label for tuning/training. 33 | 34 | Args: 35 | file_pattern: input tfrecord file pattern. 36 | feature_spec: a dictionary of feature specifications. 37 | batch_size: representing the number of consecutive elements of returned 38 | dataset to combine in a single batch 39 | Returns: 40 | A dataset that contains (features, indices) tuple where features is a 41 | dictionary of Tensors, and indices is a single Tensor of label indices. 42 | """ 43 | 44 | dataset = tf.data.experimental.make_batched_features_dataset( 45 | file_pattern=file_pattern, 46 | batch_size=batch_size, 47 | features=feature_spec, 48 | label_key=features.TARGET_FEATURE_NAME, 49 | reader=_gzip_reader_fn, 50 | num_epochs=1, 51 | drop_final_batch=True, 52 | ) 53 | 54 | return dataset 55 | -------------------------------------------------------------------------------- /src/model_training/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Defaults for the model. 15 | 16 | These values can be tweaked to affect model training performance. 17 | """ 18 | 19 | 20 | HIDDEN_UNITS = [64, 32] 21 | LEARNING_RATE = 0.0001 22 | BATCH_SIZE = 512 23 | NUM_EPOCHS = 10 24 | NUM_EVAL_STEPS = 100 25 | 26 | 27 | def update_hyperparams(hyperparams: dict) -> dict: 28 | """Updates the hyperparams dictionary with default values.""" 29 | 30 | if "hidden_units" not in hyperparams: 31 | hyperparams["hidden_units"] = HIDDEN_UNITS 32 | else: 33 | if not isinstance(hyperparams["hidden_units"], list): 34 | hyperparams["hidden_units"] = [ 35 | int(v) for v in hyperparams["hidden_units"].split(",") 36 | ] 37 | if "learning_rate" not in hyperparams: 38 | hyperparams["learning_rate"] = LEARNING_RATE 39 | if "batch_size" not in hyperparams: 40 | hyperparams["batch_size"] = BATCH_SIZE 41 | if "num_epochs" not in hyperparams: 42 | hyperparams["num_epochs"] = NUM_EPOCHS 43 | return hyperparams 44 | -------------------------------------------------------------------------------- /src/model_training/exporter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Functions for exporting the model for serving.""" 15 | 16 | import logging 17 | 18 | import tensorflow as tf 19 | import tensorflow_transform as tft 20 | import tensorflow_data_validation as tfdv 21 | from tensorflow_transform.tf_metadata import schema_utils 22 | import tensorflow.keras as keras 23 | 24 | from src.common import features 25 | 26 | 27 | def _get_serve_tf_examples_fn(classifier, tft_output, raw_feature_spec): 28 | """Returns a function that parses a serialized tf.Example and applies TFT.""" 29 | 30 | classifier.tft_layer = tft_output.transform_features_layer() 31 | 32 | @tf.function 33 | def serve_tf_examples_fn(serialized_tf_examples): 34 | """Returns the output to be used in the serving signature.""" 35 | for key in list(raw_feature_spec.keys()): 36 | if key not in features.FEATURE_NAMES: 37 | raw_feature_spec.pop(key) 38 | 39 | parsed_features = tf.io.parse_example(serialized_tf_examples, raw_feature_spec) 40 | 41 | transformed_features = classifier.tft_layer(parsed_features) 42 | logits = classifier(transformed_features) 43 | probabilities = keras.activations.sigmoid(logits) 44 | return {"probabilities": probabilities} 45 | 46 | return serve_tf_examples_fn 47 | 48 | 49 | def _get_serve_features_fn(classifier, tft_output): 50 | """Returns a function that accept a dictionary of features and applies TFT.""" 51 | 52 | classifier.tft_layer = tft_output.transform_features_layer() 53 | 54 | @tf.function 55 | def serve_features_fn(raw_features): 56 | """Returns the output to be used in the serving signature.""" 57 | 58 | transformed_features = classifier.tft_layer(raw_features) 59 | logits = classifier(transformed_features) 60 | neg_probabilities = keras.activations.sigmoid(logits) 61 | pos_probabilities = 1 - neg_probabilities 62 | probabilities = tf.concat([neg_probabilities, pos_probabilities], -1) 63 | batch_size = tf.shape(probabilities)[0] 64 | classes = tf.repeat([features.TARGET_LABELS], [batch_size], axis=0) 65 | return {"classes": classes, "scores": probabilities} 66 | 67 | return serve_features_fn 68 | 69 | 70 | def export_serving_model( 71 | classifier, serving_model_dir, raw_schema_location, tft_output_dir 72 | ): 73 | """Exports the classifier as a SavedModel with serving signatures.""" 74 | 75 | raw_schema = tfdv.load_schema_text(raw_schema_location) 76 | raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec 77 | 78 | tft_output = tft.TFTransformOutput(tft_output_dir) 79 | 80 | features_input_signature = { 81 | feature_name: tf.TensorSpec( 82 | shape=(None, 1), dtype=spec.dtype, name=feature_name 83 | ) 84 | for feature_name, spec in raw_feature_spec.items() 85 | if feature_name in features.FEATURE_NAMES 86 | } 87 | 88 | signatures = { 89 | "serving_default": _get_serve_features_fn( 90 | classifier, tft_output 91 | ).get_concrete_function(features_input_signature), 92 | "serving_tf_example": _get_serve_tf_examples_fn( 93 | classifier, tft_output, raw_feature_spec 94 | ).get_concrete_function( 95 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples") 96 | ), 97 | } 98 | 99 | 
logging.info("Model export started...") 100 | tf.saved_model.save(classifier, serving_model_dir, signatures=signatures) 101 | logging.info("Model export completed.") 102 | -------------------------------------------------------------------------------- /src/model_training/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """A DNN Keras classification model.""" 15 | 16 | import tensorflow as tf 17 | from tensorflow import keras 18 | 19 | from src.common import features 20 | 21 | 22 | def create_model_inputs(): 23 | """Creates Keras model input dictionary.""" 24 | 25 | inputs = {} 26 | for feature_name in features.FEATURE_NAMES: 27 | name = features.transformed_name(feature_name) 28 | if feature_name in features.NUMERICAL_FEATURE_NAMES: 29 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.float32) 30 | elif feature_name in features.categorical_feature_names(): 31 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.int64) 32 | else: 33 | pass 34 | return inputs 35 | 36 | 37 | def _create_binary_classifier(feature_vocab_sizes, hyperparams): 38 | """Return a Keras binary classifier.""" 39 | 40 | input_layers = create_model_inputs() 41 | 42 | layers = [] 43 | for key in input_layers: 44 | feature_name = features.original_name(key) 45 | if feature_name in features.EMBEDDING_CATEGORICAL_FEATURES: 46 | vocab_size = feature_vocab_sizes[feature_name] 47 | embedding_size = features.EMBEDDING_CATEGORICAL_FEATURES[feature_name] 48 | embedding_output = keras.layers.Embedding( 49 | input_dim=vocab_size + 1, 50 | output_dim=embedding_size, 51 | name=f"{key}_embedding", 52 | )(input_layers[key]) 53 | layers.append(embedding_output) 54 | elif feature_name in features.ONEHOT_CATEGORICAL_FEATURE_NAMES: 55 | vocab_size = feature_vocab_sizes[feature_name] 56 | onehot_layer = keras.layers.experimental.preprocessing.CategoryEncoding( 57 | max_tokens=vocab_size, 58 | output_mode="binary", 59 | name=f"{key}_onehot", 60 | )(input_layers[key]) 61 | layers.append(onehot_layer) 62 | elif feature_name in features.NUMERICAL_FEATURE_NAMES: 63 | numeric_layer = tf.expand_dims(input_layers[key], -1) 64 | layers.append(numeric_layer) 65 | else: 66 | pass 67 | 68 | joined = keras.layers.Concatenate(name="combines_inputs")(layers) 69 | feedforward_output = keras.Sequential( 70 | [ 71 | keras.layers.Dense(units, activation="relu") 72 | for units in hyperparams["hidden_units"] 73 | ], 74 | name="feedforward_network", 75 | )(joined) 76 | logits = keras.layers.Dense(units=1, name="logits")(feedforward_output) 77 | 78 | model = keras.Model(inputs=input_layers, outputs=[logits]) 79 | return model 80 | 81 | 82 | def create_binary_classifier(tft_output, hyperparams): 83 | """Returns a Keras binary classifier.""" 84 | 85 | feature_vocab_sizes = dict() 86 | for feature_name in features.categorical_feature_names(): 87 | 
feature_vocab_sizes[feature_name] = tft_output.vocabulary_size_by_name( 88 | feature_name 89 | ) 90 | 91 | return _create_binary_classifier(feature_vocab_sizes, hyperparams) 92 | -------------------------------------------------------------------------------- /src/model_training/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """A run_fn method called by the TFX Trainer component.""" 15 | 16 | import os 17 | import logging 18 | 19 | from src.model_training import trainer, exporter, defaults 20 | 21 | 22 | # TFX Trainer will call this function. 23 | def run_fn(fn_args): 24 | """Train the model based on given args. 25 | 26 | Args: 27 | fn_args: Holds args used to train the model as name/value pairs. 28 | """ 29 | 30 | logging.info("Runner started...") 31 | logging.info(f"fn_args: {fn_args}") 32 | logging.info("") 33 | 34 | try: 35 | log_dir = fn_args.model_run_dir 36 | except KeyError: 37 | log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs") 38 | 39 | hyperparams = fn_args.hyperparameters 40 | if not hyperparams: 41 | hyperparams = dict() 42 | 43 | hyperparams = defaults.update_hyperparams(hyperparams) 44 | logging.info("Hyperparameter:") 45 | logging.info(hyperparams) 46 | logging.info("") 47 | 48 | logging.info("Runner executing trainer...") 49 | classifier = trainer.train( 50 | train_data_dir=fn_args.train_files, 51 | eval_data_dir=fn_args.eval_files, 52 | tft_output_dir=fn_args.transform_output, 53 | hyperparams=hyperparams, 54 | log_dir=log_dir, 55 | base_model_dir=fn_args.base_model, 56 | ) 57 | 58 | logging.info("Runner executing exporter...") 59 | exporter.export_serving_model( 60 | classifier=classifier, 61 | serving_model_dir=fn_args.serving_model_dir, 62 | raw_schema_location=fn_args.schema_path, 63 | tft_output_dir=fn_args.transform_output, 64 | ) 65 | logging.info("Runner completed.") 66 | -------------------------------------------------------------------------------- /src/model_training/task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """The entrypoint for the uCAIP traing job.""" 15 | 16 | import os 17 | import sys 18 | from datetime import datetime 19 | import logging 20 | import tensorflow as tf 21 | from tensorflow.python.client import device_lib 22 | import argparse 23 | 24 | from google.cloud import aiplatform as vertex_ai 25 | from google.cloud import aiplatform_v1beta1 as vertex_ai_beta 26 | 27 | from src.model_training import defaults, trainer, exporter 28 | 29 | dirname = os.path.dirname(__file__) 30 | dirname = dirname.replace("/model_training", "") 31 | RAW_SCHEMA_LOCATION = os.path.join(dirname, "raw_schema/schema.pbtxt") 32 | 33 | 34 | def get_args(): 35 | """Defines and parse commandline arguments.""" 36 | 37 | parser = argparse.ArgumentParser() 38 | 39 | parser.add_argument( 40 | "--model-dir", 41 | default=os.getenv("AIP_MODEL_DIR"), 42 | type=str, 43 | ) 44 | 45 | parser.add_argument( 46 | "--log-dir", 47 | default=os.getenv("AIP_TENSORBOARD_LOG_DIR"), 48 | type=str, 49 | ) 50 | 51 | parser.add_argument( 52 | "--train-data-dir", 53 | type=str, 54 | ) 55 | 56 | parser.add_argument( 57 | "--eval-data-dir", 58 | type=str, 59 | ) 60 | 61 | parser.add_argument( 62 | "--tft-output-dir", 63 | type=str, 64 | ) 65 | 66 | parser.add_argument("--learning-rate", default=0.001, type=float) 67 | 68 | parser.add_argument("--batch-size", default=512, type=float) 69 | 70 | parser.add_argument("--hidden-units", default="64,32", type=str) 71 | 72 | parser.add_argument("--num-epochs", default=10, type=int) 73 | 74 | parser.add_argument("--project", type=str) 75 | parser.add_argument("--region", type=str) 76 | parser.add_argument("--staging-bucket", type=str) 77 | parser.add_argument("--experiment-name", type=str) 78 | parser.add_argument("--run-name", type=str) 79 | 80 | return parser.parse_args() 81 | 82 | 83 | def main(): 84 | args = get_args() 85 | 86 | hyperparams = vars(args) 87 | hyperparams = defaults.update_hyperparams(hyperparams) 88 | logging.info(f"Hyperparameter: {hyperparams}") 89 | 90 | if args.experiment_name: 91 | vertex_ai.init( 92 | project=args.project, 93 | staging_bucket=args.staging_bucket, 94 | experiment=args.experiment_name, 95 | ) 96 | 97 | logging.info(f"Using Vertex AI experiment: {args.experiment_name}") 98 | 99 | run_id = args.run_name 100 | if not run_id: 101 | run_id = f"run-gcp-{datetime.now().strftime('%Y%m%d%H%M%S')}" 102 | 103 | vertex_ai.start_run(run_id) 104 | logging.info(f"Run {run_id} started.") 105 | 106 | vertex_ai.log_params(hyperparams) 107 | 108 | classifier = trainer.train( 109 | train_data_dir=args.train_data_dir, 110 | eval_data_dir=args.eval_data_dir, 111 | tft_output_dir=args.tft_output_dir, 112 | hyperparams=hyperparams, 113 | log_dir=args.log_dir, 114 | ) 115 | 116 | val_loss, val_accuracy = trainer.evaluate( 117 | model=classifier, 118 | data_dir=args.eval_data_dir, 119 | raw_schema_location=RAW_SCHEMA_LOCATION, 120 | tft_output_dir=args.tft_output_dir, 121 | hyperparams=hyperparams, 122 | ) 123 | 124 | if args.experiment_name: 125 | vertex_ai.log_metrics({"val_loss": val_loss, "val_accuracy": val_accuracy}) 126 | 127 | try: 128 | exporter.export_serving_model( 129 | classifier=classifier, 130 | serving_model_dir=args.model_dir, 131 | raw_schema_location=RAW_SCHEMA_LOCATION, 132 | tft_output_dir=args.tft_output_dir, 133 | ) 134 | except: 135 | # Swallow Ignored Errors while exporting the model. 
136 | pass 137 | 138 | 139 | if __name__ == "__main__": 140 | logging.getLogger().setLevel(logging.INFO) 141 | logging.info(f"Python Version = {sys.version}") 142 | logging.info(f"TensorFlow Version = {tf.__version__}") 143 | logging.info(f'TF_CONFIG = {os.environ.get("TF_CONFIG", "Not found")}') 144 | logging.info(f"DEVICES = {device_lib.list_local_devices()}") 145 | logging.info("Task started...") 146 | main() 147 | logging.info("Task completed.") 148 | -------------------------------------------------------------------------------- /src/model_training/trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Train and evaluate the model.""" 15 | 16 | import logging 17 | import tensorflow as tf 18 | import tensorflow_transform as tft 19 | from tensorflow import keras 20 | 21 | 22 | from src.model_training import data, model 23 | 24 | 25 | def train( 26 | train_data_dir, 27 | eval_data_dir, 28 | tft_output_dir, 29 | hyperparams, 30 | log_dir, 31 | base_model_dir=None, 32 | ): 33 | """Invokes model.fit method and returns a trained classifier.""" 34 | 35 | logging.info(f"Loading tft output from {tft_output_dir}") 36 | tft_output = tft.TFTransformOutput(tft_output_dir) 37 | transformed_feature_spec = tft_output.transformed_feature_spec() 38 | 39 | train_dataset = data.get_dataset( 40 | train_data_dir, 41 | transformed_feature_spec, 42 | hyperparams["batch_size"], 43 | ) 44 | 45 | eval_dataset = data.get_dataset( 46 | eval_data_dir, 47 | transformed_feature_spec, 48 | hyperparams["batch_size"], 49 | ) 50 | 51 | optimizer = keras.optimizers.Adam(learning_rate=hyperparams["learning_rate"]) 52 | loss = keras.losses.BinaryCrossentropy(from_logits=True) 53 | metrics = [keras.metrics.BinaryAccuracy(name="accuracy")] 54 | 55 | early_stopping = tf.keras.callbacks.EarlyStopping( 56 | monitor="val_loss", patience=5, restore_best_weights=True 57 | ) 58 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir) 59 | 60 | classifier = model.create_binary_classifier(tft_output, hyperparams) 61 | if base_model_dir: 62 | try: 63 | classifier = keras.models.load_model(base_model_dir) 64 | except Exception: 65 | pass  # Fall back to the newly created classifier if the base model cannot be loaded. 66 | 67 | classifier.compile(optimizer=optimizer, loss=loss, metrics=metrics) 68 | 69 | logging.info("Model training started...") 70 | classifier.fit( 71 | train_dataset, 72 | epochs=hyperparams["num_epochs"], 73 | validation_data=eval_dataset, 74 | callbacks=[early_stopping, tensorboard_callback], 75 | ) 76 | logging.info("Model training completed.") 77 | 78 | return classifier 79 | 80 | 81 | def evaluate(model, data_dir, raw_schema_location, tft_output_dir, hyperparams): 82 | """Invokes model.evaluate method and returns evaluation_metrics.""" 83 | 84 | logging.info(f"Loading raw schema from {raw_schema_location}") 85 | 86 | logging.info(f"Loading tft output from {tft_output_dir}") 87 | tft_output = tft.TFTransformOutput(tft_output_dir) 88 |
transformed_feature_spec = tft_output.transformed_feature_spec() 89 | 90 | logging.info("Model evaluation started...") 91 | eval_dataset = data.get_dataset( 92 | data_dir, 93 | transformed_feature_spec, 94 | hyperparams["batch_size"], 95 | ) 96 | 97 | evaluation_metrics = model.evaluate(eval_dataset) 98 | logging.info("Model evaluation completed.") 99 | 100 | return evaluation_metrics 101 | -------------------------------------------------------------------------------- /src/pipeline_triggering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/pipeline_triggering/__init__.py -------------------------------------------------------------------------------- /src/pipeline_triggering/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Cloud Function to be triggered by Pub/Sub.""" 15 | 16 | import os 17 | import json 18 | import logging 19 | from kfp.v2.google.client import AIPlatformClient 20 | from google.cloud import storage 21 | import base64 22 | 23 | 24 | def trigger_pipeline(event, context): 25 | """A Cloud Function for triggering a Vertex pipeline given a Pub/Sub event.""" 26 | 27 | project = os.getenv("PROJECT") 28 | region = os.getenv("REGION") 29 | gcs_pipeline_file_location = os.getenv("GCS_PIPELINE_FILE_LOCATION") 30 | 31 | if not project: 32 | raise ValueError("Environment variable PROJECT is not set.") 33 | if not region: 34 | raise ValueError("Environment variable REGION is not set.") 35 | if not gcs_pipeline_file_location: 36 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.") 37 | 38 | storage_client = storage.Client() 39 | 40 | if not gcs_pipeline_file_location: 41 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.") 42 | 43 | path_parts = gcs_pipeline_file_location.replace("gs://", "").split("/") 44 | bucket_name = path_parts[0] 45 | blob_name = "/".join(path_parts[1:]) 46 | 47 | bucket = storage_client.bucket(bucket_name) 48 | blob = storage.Blob(bucket=bucket, name=blob_name) 49 | 50 | if not blob.exists(storage_client): 51 | raise ValueError(f"{gcs_pipeline_file_location} does not exist.") 52 | 53 | data = base64.b64decode(event["data"]).decode("utf-8") 54 | logging.info(f"Event data: {data}") 55 | 56 | parameter_values = json.loads(data) 57 | 58 | api_client = AIPlatformClient(project_id=project, region=region) 59 | 60 | response = api_client.create_run_from_job_spec( 61 | job_spec_path=gcs_pipeline_file_location, parameter_values=parameter_values 62 | ) 63 | 64 | logging.info(response) 65 | -------------------------------------------------------------------------------- /src/pipeline_triggering/requirements.txt: -------------------------------------------------------------------------------- 1 | kfp==1.6.2 2 | 
google-cloud-aiplatform 3 | google-cloud-storage -------------------------------------------------------------------------------- /src/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/preprocessing/etl.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Data preprocessing pipelines.""" 15 | 16 | import os 17 | 18 | import tensorflow_transform as tft 19 | import tensorflow_data_validation as tfdv 20 | import apache_beam as beam 21 | from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore 22 | import tensorflow_transform.beam as tft_beam 23 | from tensorflow_transform.tf_metadata import dataset_metadata 24 | from tensorflow_transform.tf_metadata import schema_utils 25 | 26 | 27 | from src.preprocessing import transformations 28 | 29 | RAW_SCHEMA_LOCATION = "src/raw_schema/schema.pbtxt" 30 | 31 | 32 | def parse_bq_record(bq_record): 33 | """Parses a bq_record to a dictionary.""" 34 | output = {} 35 | for key in bq_record: 36 | output[key] = [bq_record[key]] 37 | return output 38 | 39 | 40 | def split_dataset(bq_row, num_partitions, ratio): 41 | """Returns a partition number for a given bq_row.""" 42 | import json 43 | 44 | assert num_partitions == len(ratio) 45 | bucket = sum(map(ord, json.dumps(bq_row))) % sum(ratio) 46 | total = 0 47 | for i, part in enumerate(ratio): 48 | total += part 49 | if bucket < total: 50 | return i 51 | return len(ratio) - 1 52 | 53 | 54 | def run_transform_pipeline(args): 55 | """Runs a Beam pipeline to preprocess the data using TensorFlow Transform.""" 56 | 57 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) 58 | 59 | raw_data_query = args["raw_data_query"] 60 | write_raw_data = args["write_raw_data"] 61 | exported_data_prefix = args["exported_data_prefix"] 62 | transformed_data_prefix = args["transformed_data_prefix"] 63 | transform_artifact_dir = args["transform_artifact_dir"] 64 | temp_location = args["temp_location"] 65 | project = args["project"] 66 | 67 | source_raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION) 68 | raw_feature_spec = schema_utils.schema_as_feature_spec( 69 | source_raw_schema 70 | ).feature_spec 71 | 72 | raw_metadata = dataset_metadata.DatasetMetadata( 73 | schema_utils.schema_from_feature_spec(raw_feature_spec) 74 | ) 75 | 76 | with beam.Pipeline(options=pipeline_options) as pipeline: 77 | with tft_beam.Context(temp_location): 78 | 79 | # Read raw BigQuery data. 
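            # Note: each BigQuery row is parsed into a TFT-friendly dict (every value
            # wrapped in a single-element list) and then deterministically partitioned
            # into train/eval splits with an 8:2 ratio by hashing the serialized row
            # in split_dataset, so a given row always lands in the same split across runs.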
80 | raw_train_data, raw_eval_data = ( 81 | pipeline 82 | | "Read Raw Data" 83 | >> beam.io.ReadFromBigQuery( 84 | query=raw_data_query, 85 | project=project, 86 | use_standard_sql=True, 87 | ) 88 | | "Parse Data" >> beam.Map(parse_bq_record) 89 | | "Split" >> beam.Partition(split_dataset, 2, ratio=[8, 2]) 90 | ) 91 | 92 | # Create a train_dataset from the data and schema. 93 | raw_train_dataset = (raw_train_data, raw_metadata) 94 | 95 | # Analyze and transform raw_train_dataset to produced transformed_train_dataset and transform_fn. 96 | transformed_train_dataset, transform_fn = ( 97 | raw_train_dataset 98 | | "Analyze & Transform" 99 | >> tft_beam.AnalyzeAndTransformDataset(transformations.preprocessing_fn) 100 | ) 101 | 102 | # Get data and schema separately from the transformed_dataset. 103 | transformed_train_data, transformed_metadata = transformed_train_dataset 104 | 105 | # write transformed train data. 106 | _ = ( 107 | transformed_train_data 108 | | "Write Transformed Train Data" 109 | >> beam.io.tfrecordio.WriteToTFRecord( 110 | file_path_prefix=os.path.join( 111 | transformed_data_prefix, "train/data" 112 | ), 113 | file_name_suffix=".gz", 114 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema), 115 | ) 116 | ) 117 | 118 | # Create a eval_dataset from the data and schema. 119 | raw_eval_dataset = (raw_eval_data, raw_metadata) 120 | 121 | # Transform raw_eval_dataset to produced transformed_eval_dataset using transform_fn. 122 | transformed_eval_dataset = ( 123 | raw_eval_dataset, 124 | transform_fn, 125 | ) | "Transform" >> tft_beam.TransformDataset() 126 | 127 | # Get data from the transformed_eval_dataset. 128 | transformed_eval_data, _ = transformed_eval_dataset 129 | 130 | # write transformed train data. 131 | _ = ( 132 | transformed_eval_data 133 | | "Write Transformed Eval Data" 134 | >> beam.io.tfrecordio.WriteToTFRecord( 135 | file_path_prefix=os.path.join(transformed_data_prefix, "eval/data"), 136 | file_name_suffix=".gz", 137 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema), 138 | ) 139 | ) 140 | 141 | # Write transform_fn. 142 | _ = transform_fn | "Write Transform Artifacts" >> tft_beam.WriteTransformFn( 143 | transform_artifact_dir 144 | ) 145 | 146 | if write_raw_data: 147 | # write raw eval data. 148 | _ = ( 149 | raw_eval_data 150 | | "Write Raw Eval Data" 151 | >> beam.io.tfrecordio.WriteToTFRecord( 152 | file_path_prefix=os.path.join(exported_data_prefix, "data"), 153 | file_name_suffix=".tfrecord", 154 | coder=tft.coders.ExampleProtoCoder(raw_metadata.schema), 155 | ) 156 | ) 157 | 158 | 159 | def convert_to_jsonl(bq_record): 160 | """Converts bq_record to a jsonl formatted text.""" 161 | import json 162 | 163 | output = {} 164 | for key in bq_record: 165 | output[key] = [bq_record[key]] 166 | return json.dumps(output) 167 | 168 | 169 | def run_extract_pipeline(args): 170 | """Runs a Beam pipeline to extract data from BigQuery as JSONL files.""" 171 | 172 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) 173 | 174 | sql_query = args["sql_query"] 175 | exported_data_prefix = args["exported_data_prefix"] 176 | temporary_dir = args["temporary_dir"] 177 | gcs_location = args["gcs_location"] 178 | project = args["project"] 179 | 180 | with beam.Pipeline(options=pipeline_options) as pipeline: 181 | with tft_beam.Context(temporary_dir): 182 | 183 | # Read BigQuery data. 
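            # Note: each row is converted to one JSON object per line (values wrapped
            # in lists) and written out as .jsonl files, presumably as input for
            # downstream batch prediction.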
184 | raw_data = ( 185 | pipeline 186 | | "Read Data" 187 | >> beam.io.ReadFromBigQuery( 188 | query=sql_query, 189 | project=project, 190 | use_standard_sql=True, 191 | gcs_location=gcs_location, 192 | ) 193 | | "Parse Data" >> beam.Map(convert_to_jsonl) 194 | ) 195 | 196 | # Write raw data to GCS as JSONL files. 197 | _ = raw_data | "Write Data" >> beam.io.WriteToText( 198 | file_path_prefix=exported_data_prefix, file_name_suffix=".jsonl" 199 | ) 200 | 201 | 202 | def parse_prediction_results(jsonl): 203 | """Parses JSONL prediction results to a dictionary.""" 204 | import uuid 205 | import json 206 | 207 | prediction_results = json.loads(jsonl)["prediction"] 208 | prediction_id = str(uuid.uuid4()) 209 | scores = prediction_results["scores"] 210 | classes = prediction_results["classes"] 211 | 212 | return {"prediction_id": prediction_id, "scores": scores, "classes": classes} 213 | 214 | 215 | def create_datastore_entity(prediction_response, kind): 216 | """Creates a Datastore entity.""" 217 | 218 | from apache_beam.io.gcp.datastore.v1new.types import Entity 219 | from apache_beam.io.gcp.datastore.v1new.types import Key 220 | 221 | user_id = prediction_response.pop("prediction_id") 222 | key = Key([kind, user_id]) 223 | prediction_entity = Entity(key) 224 | prediction_entity.set_properties(prediction_response) 225 | return prediction_entity 226 | 227 | 228 | def run_store_predictions_pipeline(args): 229 | """Runs a Beam pipeline to store JSONL data to Datastore.""" 230 | 231 | project = args["project"] 232 | datastore_kind = args["datastore_kind"] 233 | prediction_results_uri = args["prediction_results_uri"] 234 | 235 | pipeline_options = beam.options.pipeline_options.PipelineOptions(args) 236 | with beam.Pipeline(options=pipeline_options) as pipeline: 237 | _ = ( 238 | pipeline 239 | | "ReadFromJSONL" >> beam.io.ReadFromText(prediction_results_uri) 240 | | "ParsePredictionResults" >> beam.Map(parse_prediction_results) 241 | | "ConvertToDatastoreEntity" 242 | >> beam.Map(create_datastore_entity, datastore_kind) 243 | | "WriteToDatastore" >> WriteToDatastore(project=project) 244 | ) 245 | -------------------------------------------------------------------------------- /src/preprocessing/transformations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TensorFlow Transform preprocessing function.""" 15 | 16 | import tensorflow as tf 17 | import tensorflow_transform as tft 18 | 19 | from src.common import features 20 | 21 | 22 | def preprocessing_fn(inputs): 23 | """tf.transform's callback function for preprocessing inputs. 24 | 25 | Args: 26 | inputs: map from feature keys to raw not-yet-transformed features. 27 | Returns: 28 | Map from string feature key to transformed feature operations. 
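      For example, a numerical input such as `trip_miles` is emitted as
      `trip_miles_xf` (z-score scaled), a categorical input such as `payment_type`
      is emitted as `payment_type_xf` (an integer vocabulary index with one
      out-of-vocabulary bucket), and the target `tip_bin` is passed through unchanged.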
29 | """ 30 | 31 | outputs = {} 32 | 33 | for key in features.FEATURE_NAMES: 34 | if key in features.NUMERICAL_FEATURE_NAMES: 35 | outputs[features.transformed_name(key)] = tft.scale_to_z_score(inputs[key]) 36 | 37 | elif key in features.categorical_feature_names(): 38 | outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary( 39 | inputs[key], 40 | num_oov_buckets=1, 41 | vocab_filename=key, 42 | ) 43 | 44 | outputs[features.TARGET_FEATURE_NAME] = inputs[features.TARGET_FEATURE_NAME] 45 | 46 | for key in outputs: 47 | outputs[key] = tf.squeeze(outputs[key], -1) 48 | 49 | return outputs 50 | -------------------------------------------------------------------------------- /src/raw_schema/schema.pbtxt: -------------------------------------------------------------------------------- 1 | feature { 2 | name: "trip_month" 3 | type: INT 4 | presence { 5 | min_fraction: 1.0 6 | min_count: 1 7 | } 8 | shape { 9 | dim { 10 | size: 1 11 | } 12 | } 13 | } 14 | feature { 15 | name: "trip_day" 16 | type: INT 17 | presence { 18 | min_fraction: 1.0 19 | min_count: 1 20 | } 21 | shape { 22 | dim { 23 | size: 1 24 | } 25 | } 26 | } 27 | feature { 28 | name: "trip_day_of_week" 29 | type: INT 30 | presence { 31 | min_fraction: 1.0 32 | min_count: 1 33 | } 34 | shape { 35 | dim { 36 | size: 1 37 | } 38 | } 39 | } 40 | feature { 41 | name: "trip_hour" 42 | type: INT 43 | presence { 44 | min_fraction: 1.0 45 | min_count: 1 46 | } 47 | shape { 48 | dim { 49 | size: 1 50 | } 51 | } 52 | } 53 | feature { 54 | name: "trip_seconds" 55 | type: INT 56 | presence { 57 | min_fraction: 1.0 58 | min_count: 1 59 | } 60 | shape { 61 | dim { 62 | size: 1 63 | } 64 | } 65 | } 66 | feature { 67 | name: "trip_miles" 68 | type: FLOAT 69 | presence { 70 | min_fraction: 1.0 71 | min_count: 1 72 | } 73 | shape { 74 | dim { 75 | size: 1 76 | } 77 | } 78 | } 79 | feature { 80 | name: "payment_type" 81 | type: BYTES 82 | domain: "payment_type" 83 | presence { 84 | min_fraction: 1.0 85 | min_count: 1 86 | } 87 | shape { 88 | dim { 89 | size: 1 90 | } 91 | } 92 | } 93 | feature { 94 | name: "pickup_grid" 95 | type: BYTES 96 | domain: "pickup_grid" 97 | presence { 98 | min_fraction: 1.0 99 | min_count: 1 100 | } 101 | shape { 102 | dim { 103 | size: 1 104 | } 105 | } 106 | } 107 | feature { 108 | name: "dropoff_grid" 109 | type: BYTES 110 | domain: "dropoff_grid" 111 | presence { 112 | min_fraction: 1.0 113 | min_count: 1 114 | } 115 | shape { 116 | dim { 117 | size: 1 118 | } 119 | } 120 | } 121 | feature { 122 | name: "euclidean" 123 | type: FLOAT 124 | presence { 125 | min_fraction: 1.0 126 | min_count: 1 127 | } 128 | shape { 129 | dim { 130 | size: 1 131 | } 132 | } 133 | } 134 | feature { 135 | name: "loc_cross" 136 | type: BYTES 137 | presence { 138 | min_fraction: 1.0 139 | min_count: 1 140 | } 141 | shape { 142 | dim { 143 | size: 1 144 | } 145 | } 146 | } 147 | feature { 148 | name: "tip_bin" 149 | type: INT 150 | bool_domain { 151 | } 152 | presence { 153 | min_fraction: 1.0 154 | min_count: 1 155 | } 156 | shape { 157 | dim { 158 | size: 1 159 | } 160 | } 161 | } 162 | string_domain { 163 | name: "payment_type" 164 | value: "Cash" 165 | value: "Credit Card" 166 | value: "Dispute" 167 | value: "Mobile" 168 | value: "No Charge" 169 | value: "Prcard" 170 | value: "Prepaid" 171 | value: "Unknown" 172 | } 173 | string_domain { 174 | name: "pickup_grid" 175 | value: "POINT(-87.5 41.7)" 176 | value: "POINT(-87.6 41.7)" 177 | value: "POINT(-87.6 41.8)" 178 | value: "POINT(-87.6 41.9)" 179 | value: "POINT(-87.6 42)" 
180 | value: "POINT(-87.7 41.7)" 181 | value: "POINT(-87.7 41.8)" 182 | value: "POINT(-87.7 41.9)" 183 | value: "POINT(-87.7 42)" 184 | value: "POINT(-87.8 41.8)" 185 | value: "POINT(-87.8 41.9)" 186 | value: "POINT(-87.8 42)" 187 | value: "POINT(-87.9 42)" 188 | } 189 | string_domain { 190 | name: "dropoff_grid" 191 | value: "POINT(-87.5 41.7)" 192 | value: "POINT(-87.6 41.7)" 193 | value: "POINT(-87.6 41.8)" 194 | value: "POINT(-87.6 41.9)" 195 | value: "POINT(-87.6 42)" 196 | value: "POINT(-87.7 41.7)" 197 | value: "POINT(-87.7 41.8)" 198 | value: "POINT(-87.7 41.9)" 199 | value: "POINT(-87.7 42)" 200 | value: "POINT(-87.8 41.8)" 201 | value: "POINT(-87.8 41.9)" 202 | value: "POINT(-87.8 42)" 203 | value: "POINT(-87.9 42)" 204 | } 205 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/tests/__init__.py -------------------------------------------------------------------------------- /src/tests/datasource_utils_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Test utilities for generating BigQuery data querying scirpts.""" 15 | 16 | import sys 17 | import os 18 | import logging 19 | from google.cloud import bigquery 20 | 21 | from src.common import datasource_utils 22 | 23 | root = logging.getLogger() 24 | root.setLevel(logging.INFO) 25 | handler = logging.StreamHandler(sys.stdout) 26 | handler.setLevel(logging.INFO) 27 | root.addHandler(handler) 28 | 29 | LIMIT = 100 30 | 31 | TARGET_COLUMN = "tip_bin" 32 | 33 | EXPECTED_TRAINING_COLUMNS = [ 34 | "trip_month", 35 | "trip_day", 36 | "trip_day_of_week", 37 | "trip_hour", 38 | "trip_seconds", 39 | "trip_miles", 40 | "payment_type", 41 | "pickup_grid", 42 | "dropoff_grid", 43 | "euclidean", 44 | "loc_cross", 45 | "tip_bin", 46 | ] 47 | 48 | 49 | MISSING = { 50 | "trip_month": -1, 51 | "trip_day": -1, 52 | "trip_day_of_week": -1, 53 | "trip_hour": -1, 54 | "trip_seconds": -1, 55 | "trip_miles": -1, 56 | "payment_type": "NA", 57 | "pickup_grid": "NA", 58 | "dropoff_grid": "NA", 59 | "euclidean": -1, 60 | "loc_cross": "NA", 61 | } 62 | 63 | 64 | def test_training_query(): 65 | 66 | project = os.getenv("PROJECT") 67 | location = os.getenv("BQ_LOCATION") 68 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 69 | 70 | assert project, "Environment variable PROJECT is None!" 71 | assert location, "Environment variable BQ_LOCATION is None!" 72 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 
73 | 74 | logging.info(f"Dataset: {dataset_display_name}") 75 | 76 | query = datasource_utils.create_bq_source_query( 77 | dataset_display_name=dataset_display_name, 78 | missing=MISSING, 79 | label_column=TARGET_COLUMN, 80 | ML_use="UNASSIGNED", 81 | limit=LIMIT, 82 | ) 83 | 84 | bq_client = bigquery.Client(project=project, location=location) 85 | df = bq_client.query(query).to_dataframe() 86 | columns = set(df.columns) 87 | assert columns == set(EXPECTED_TRAINING_COLUMNS) 88 | assert df.shape == (LIMIT, 12) 89 | 90 | 91 | def test_serving_query(): 92 | 93 | project = os.getenv("PROJECT") 94 | location = os.getenv("BQ_LOCATION") 95 | bq_dataset_name = os.getenv("BQ_DATASET_NAME") 96 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 97 | 98 | assert project, "Environment variable PROJECT is None!" 99 | assert location, "Environment variable BQ_LOCATION is None!" 100 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 101 | 102 | logging.info(f"Dataset: {dataset_display_name}") 103 | 104 | query = datasource_utils.create_bq_source_query( 105 | dataset_display_name=dataset_display_name, 106 | missing=MISSING, 107 | ML_use=None, 108 | limit=LIMIT, 109 | ) 110 | 111 | bq_client = bigquery.Client(project=project, location=location) 112 | df = bq_client.query(query).to_dataframe() 113 | columns = set(df.columns) 114 | expected_serving_columns = EXPECTED_TRAINING_COLUMNS 115 | expected_serving_columns.remove(TARGET_COLUMN) 116 | assert columns == set(expected_serving_columns) 117 | assert df.shape == (LIMIT, 11) 118 | -------------------------------------------------------------------------------- /src/tests/etl_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Test data processing.""" 15 | 16 | import sys 17 | import os 18 | import logging 19 | import tensorflow_transform as tft 20 | import tensorflow as tf 21 | from tensorflow.io import FixedLenFeature 22 | 23 | from src.preprocessing import etl 24 | from src.comm import datasource_utils 25 | 26 | root = logging.getLogger() 27 | root.setLevel(logging.INFO) 28 | handler = logging.StreamHandler(sys.stdout) 29 | handler.setLevel(logging.INFO) 30 | root.addHandler(handler) 31 | 32 | OUTPUT_DIR = "test_etl_output_dir" 33 | ML_USE = "UNASSIGNED" 34 | LIMIT = 100 35 | 36 | EXPECTED_FEATURE_SPEC = { 37 | "dropoff_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 38 | "euclidean_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 39 | "loc_cross_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 40 | "payment_type_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 41 | "pickup_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 42 | "tip_bin": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 43 | "trip_day_of_week_xf": FixedLenFeature( 44 | shape=[], dtype=tf.int64, default_value=None 45 | ), 46 | "trip_day_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 47 | "trip_hour_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 48 | "trip_miles_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 49 | "trip_month_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 50 | "trip_seconds_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 51 | } 52 | 53 | 54 | def test_transform_pipeline(): 55 | 56 | project = os.getenv("PROJECT") 57 | region = os.getenv("REGION") 58 | bucket = os.getenv("BUCKET") 59 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 60 | 61 | assert project, "Environment variable PROJECT is None!" 62 | assert region, "Environment variable REGION is None!" 63 | assert bucket, "Environment variable BUCKET is None!" 64 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 
65 | 66 | os.mkdir(OUTPUT_DIR) 67 | 68 | exported_data_dir = os.path.join(OUTPUT_DIR, "exported_data") 69 | transformed_data_dir = os.path.join(OUTPUT_DIR, "transformed_data") 70 | transform_artifacts_dir = os.path.join(OUTPUT_DIR, "transform_artifacts") 71 | temporary_dir = os.path.join(OUTPUT_DIR, "tmp") 72 | 73 | raw_data_query = datasource_utils.get_training_source_query( 74 | project=project, 75 | region=region, 76 | dataset_display_name=dataset_display_name, 77 | ml_use=ML_USE, 78 | limit=LIMIT, 79 | ) 80 | 81 | args = { 82 | "runner": "DirectRunner", 83 | "raw_data_query": raw_data_query, 84 | "write_raw_data": False, 85 | "exported_data_prefix": exported_data_dir, 86 | "transformed_data_prefix": transformed_data_dir, 87 | "transform_artifact_dir": transform_artifacts_dir, 88 | "temp_location": temporary_dir, 89 | "gcs_location": f"gs://{bucket}/bq_tmp", 90 | "project": project, 91 | } 92 | 93 | logging.info(f"Transform pipeline args: {args}") 94 | etl.run_transform_pipeline(args) 95 | logging.info("Transform pipeline finished.") 96 | 97 | tft_output = tft.TFTransformOutput(transform_artifacts_dir) 98 | transform_feature_spec = tft_output.transformed_feature_spec() 99 | assert transform_feature_spec == EXPECTED_FEATURE_SPEC 100 | -------------------------------------------------------------------------------- /src/tests/model_deployment_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | """Test an uploaded model to Vertex AI.""" 15 | 16 | import os 17 | import logging 18 | import tensorflow as tf 19 | 20 | test_instance = { 21 | "dropoff_grid": ["POINT(-87.6 41.9)"], 22 | "euclidean": [2064.2696], 23 | "loc_cross": [""], 24 | "payment_type": ["Credit Card"], 25 | "pickup_grid": ["POINT(-87.6 41.9)"], 26 | "trip_miles": [1.37], 27 | "trip_day": [12], 28 | "trip_hour": [16], 29 | "trip_month": [2], 30 | "trip_day_of_week": [4], 31 | "trip_seconds": [555], 32 | } 33 | 34 | SERVING_DEFAULT_SIGNATURE_NAME = "serving_default" 35 | 36 | from google.cloud import aiplatform as vertex_ai 37 | 38 | 39 | def test_model_artifact(): 40 | 41 | feature_types = { 42 | "dropoff_grid": tf.dtypes.string, 43 | "euclidean": tf.dtypes.float32, 44 | "loc_cross": tf.dtypes.string, 45 | "payment_type": tf.dtypes.string, 46 | "pickup_grid": tf.dtypes.string, 47 | "trip_miles": tf.dtypes.float32, 48 | "trip_day": tf.dtypes.int64, 49 | "trip_hour": tf.dtypes.int64, 50 | "trip_month": tf.dtypes.int64, 51 | "trip_day_of_week": tf.dtypes.int64, 52 | "trip_seconds": tf.dtypes.int64, 53 | } 54 | 55 | new_test_instance = dict() 56 | for key in test_instance: 57 | new_test_instance[key] = tf.constant( 58 | [test_instance[key]], dtype=feature_types[key] 59 | ) 60 | 61 | print(new_test_instance) 62 | 63 | project = os.getenv("PROJECT") 64 | region = os.getenv("REGION") 65 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 66 | 67 | assert project, "Environment variable PROJECT is None!" 68 | assert region, "Environment variable REGION is None!" 69 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 70 | 71 | vertex_ai.init( 72 | project=project, 73 | location=region, 74 | ) 75 | 76 | models = vertex_ai.Model.list( 77 | filter=f"display_name={model_display_name}", order_by="update_time" 78 | ) 79 | 80 | assert models, f"No model with display name {model_display_name} exists!" 81 | 82 | model = models[-1] 83 | artifact_uri = model.gca_resource.artifact_uri 84 | logging.info(f"Model artifact uri:{artifact_uri}") 85 | assert tf.io.gfile.exists( 86 | artifact_uri 87 | ), f"Model artifact uri {artifact_uri} does not exist!" 88 | 89 | saved_model = tf.saved_model.load(artifact_uri) 90 | logging.info("Model loaded successfully.") 91 | 92 | assert ( 93 | SERVING_DEFAULT_SIGNATURE_NAME in saved_model.signatures 94 | ), f"{SERVING_DEFAULT_SIGNATURE_NAME} not in model signatures!" 95 | 96 | prediction_fn = saved_model.signatures["serving_default"] 97 | predictions = prediction_fn(**new_test_instance) 98 | logging.info("Model produced predictions.") 99 | 100 | keys = ["classes", "scores"] 101 | for key in keys: 102 | assert key in predictions, f"{key} in prediction outputs!" 103 | 104 | assert predictions["classes"].shape == ( 105 | 1, 106 | 2, 107 | ), f"Invalid output classes shape: {predictions['classes'].shape}!" 108 | assert predictions["scores"].shape == ( 109 | 1, 110 | 2, 111 | ), f"Invalid output scores shape: {predictions['scores'].shape}!" 112 | logging.info(f"Prediction output: {predictions}") 113 | 114 | 115 | def test_model_endpoint(): 116 | 117 | project = os.getenv("PROJECT") 118 | region = os.getenv("REGION") 119 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 120 | endpoint_display_name = os.getenv("ENDPOINT_DISPLAY_NAME") 121 | 122 | assert project, "Environment variable PROJECT is None!" 123 | assert region, "Environment variable REGION is None!" 124 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 
125 | assert endpoint_display_name, "Environment variable ENDPOINT_DISPLAY_NAME is None!" 126 | 127 | endpoints = vertex_ai.Endpoint.list( 128 | filter=f"display_name={endpoint_display_name}", order_by="update_time" 129 | ) 130 | assert ( 131 | endpoints 132 | ), f"Endpoint with display name {endpoint_display_name} does not exist! in region {region}" 133 | 134 | endpoint = endpoints[-1] 135 | logging.info(f"Calling endpoint: {endpoint}.") 136 | 137 | prediction = endpoint.predict([test_instance]).predictions[0] 138 | 139 | keys = ["classes", "scores"] 140 | for key in keys: 141 | assert key in prediction, f"{key} in prediction outputs!" 142 | 143 | assert ( 144 | len(prediction["classes"]) == 2 145 | ), f"Invalid number of output classes: {len(prediction['classes'])}!" 146 | assert ( 147 | len(prediction["scores"]) == 2 148 | ), f"Invalid number output scores: {len(prediction['scores'])}!" 149 | 150 | logging.info(f"Prediction output: {prediction}") 151 | -------------------------------------------------------------------------------- /src/tests/model_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Test model functions.""" 15 | 16 | import sys 17 | import os 18 | import logging 19 | import tensorflow_transform as tft 20 | import tensorflow as tf 21 | from tensorflow.io import FixedLenFeature 22 | 23 | from src.common import features 24 | from src.model_training import model, defaults 25 | 26 | root = logging.getLogger() 27 | root.setLevel(logging.INFO) 28 | handler = logging.StreamHandler(sys.stdout) 29 | handler.setLevel(logging.INFO) 30 | root.addHandler(handler) 31 | 32 | EXPECTED_HYPERPARAMS_KEYS = [ 33 | "hidden_units", 34 | "learning_rate", 35 | "batch_size", 36 | "num_epochs", 37 | ] 38 | 39 | 40 | def test_hyperparams_defaults(): 41 | hyperparams = {"hidden_units": [64, 32]} 42 | 43 | hyperparams = defaults.update_hyperparams(hyperparams) 44 | assert set(hyperparams.keys()) == set(EXPECTED_HYPERPARAMS_KEYS) 45 | 46 | 47 | def test_create_binary_classifier(): 48 | 49 | hyperparams = hyperparams = defaults.update_hyperparams(dict()) 50 | 51 | model_inputs = { 52 | "dropoff_grid_xf": tf.convert_to_tensor([0, 0, 0]), 53 | "euclidean_xf": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), 54 | "loc_cross_xf": tf.convert_to_tensor([0, 0, 0]), 55 | "payment_type_xf": tf.convert_to_tensor([1, 0, 0]), 56 | "pickup_grid_xf": tf.convert_to_tensor([0, 0, 0]), 57 | "trip_day_of_week_xf": tf.convert_to_tensor([5, 4, 4]), 58 | "trip_day_xf": tf.convert_to_tensor([26, 24, 1]), 59 | "trip_hour_xf": tf.convert_to_tensor([0, 4, 2]), 60 | "trip_miles_xf": tf.convert_to_tensor([5.9717827, -0.7121308, -0.7601589]), 61 | "trip_month_xf": tf.convert_to_tensor([4, 3, 4]), 62 | "trip_seconds_xf": tf.convert_to_tensor([4.9029775, -0.34146854, -0.34479955]), 63 | } 64 | 65 | feature_vocab_sizes = { 66 | feature_name: 100 for feature_name in features.categorical_feature_names() 67 | } 68 | classifier = model._create_binary_classifier(feature_vocab_sizes, hyperparams) 69 | model_outputs = classifier(model_inputs) # .numpy() 70 | assert model_outputs.shape == (3, 1) 71 | assert model_outputs.dtype == "float32" 72 | -------------------------------------------------------------------------------- /src/tests/pipeline_deployment_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Test training pipeline using local runner.""" 15 | 16 | import sys 17 | import os 18 | from tfx.orchestration.local.local_dag_runner import LocalDagRunner 19 | import tensorflow as tf 20 | from ml_metadata.proto import metadata_store_pb2 21 | import logging 22 | 23 | from src.tfx_pipelines import config 24 | from src.tfx_pipelines import training_pipeline 25 | 26 | root = logging.getLogger() 27 | root.setLevel(logging.INFO) 28 | handler = logging.StreamHandler(sys.stdout) 29 | handler.setLevel(logging.INFO) 30 | root.addHandler(handler) 31 | 32 | MLMD_SQLLITE = "mlmd.sqllite" 33 | NUM_EPOCHS = 1 34 | BATCH_SIZE = 512 35 | LEARNING_RATE = 0.001 36 | HIDDEN_UNITS = "128,128" 37 | 38 | 39 | def test_e2e_pipeline(): 40 | 41 | project = os.getenv("PROJECT") 42 | region = os.getenv("REGION") 43 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 44 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 45 | gcs_location = os.getenv("GCS_LOCATION") 46 | model_registry = os.getenv("MODEL_REGISTRY_URI") 47 | upload_model = os.getenv("UPLOAD_MODEL") 48 | 49 | assert project, "Environment variable PROJECT is None!" 50 | assert region, "Environment variable REGION is None!" 51 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 52 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 53 | assert gcs_location, "Environment variable GCS_LOCATION is None!" 54 | assert model_registry, "Environment variable MODEL_REGISTRY_URI is None!" 55 | 56 | logging.info(f"upload_model: {upload_model}") 57 | if tf.io.gfile.exists(gcs_location): 58 | tf.io.gfile.rmtree(gcs_location) 59 | logging.info(f"Pipeline e2e test artifacts stored in: {gcs_location}") 60 | 61 | if tf.io.gfile.exists(MLMD_SQLLITE): 62 | tf.io.gfile.remove(MLMD_SQLLITE) 63 | 64 | metadata_connection_config = metadata_store_pb2.ConnectionConfig() 65 | metadata_connection_config.sqlite.filename_uri = MLMD_SQLLITE 66 | metadata_connection_config.sqlite.connection_mode = 3 67 | logging.info("ML metadata store is ready.") 68 | 69 | pipeline_root = os.path.join( 70 | config.ARTIFACT_STORE_URI, 71 | config.PIPELINE_NAME, 72 | ) 73 | 74 | runner = LocalDagRunner() 75 | 76 | pipeline = training_pipeline.create_pipeline( 77 | pipeline_root=pipeline_root, 78 | num_epochs=NUM_EPOCHS, 79 | batch_size=BATCH_SIZE, 80 | learning_rate=LEARNING_RATE, 81 | hidden_units=HIDDEN_UNITS, 82 | metadata_connection_config=metadata_connection_config, 83 | ) 84 | 85 | runner.run(pipeline) 86 | 87 | logging.info(f"Model output: {os.path.join(model_registry, model_display_name)}") 88 | assert tf.io.gfile.exists(os.path.join(model_registry, model_display_name)) 89 | -------------------------------------------------------------------------------- /src/tfx_pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/tfx_pipelines/__init__.py -------------------------------------------------------------------------------- /src/tfx_pipelines/components.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX Custom Python Components.""" 15 | 16 | 17 | import sys 18 | import os 19 | import json 20 | import logging 21 | import tensorflow as tf 22 | 23 | from tfx.types import artifact_utils 24 | from tfx.utils import io_utils 25 | from tfx.dsl.component.experimental.decorators import component 26 | from tfx.dsl.component.experimental.annotations import ( 27 | InputArtifact, 28 | OutputArtifact, 29 | Parameter, 30 | ) 31 | from tfx.types.standard_artifacts import HyperParameters 32 | from tfx.types.experimental.simple_artifacts import File as UploadedModel 33 | from tfx.types.experimental.simple_artifacts import Dataset 34 | 35 | from google.cloud import aiplatform as vertex_ai 36 | 37 | SCRIPT_DIR = os.path.dirname( 38 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 39 | ) 40 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 41 | 42 | from src.preprocessing import etl 43 | 44 | 45 | HYPERPARAM_FILENAME = "hyperparameters.json" 46 | SERVING_DATA_PREFIX = "serving-data-" 47 | PREDICTION_RESULTS_PREFIX = "prediction.results-*" 48 | 49 | 50 | @component 51 | def hyperparameters_gen( 52 | num_epochs: Parameter[int], 53 | batch_size: Parameter[int], 54 | learning_rate: Parameter[float], 55 | hidden_units: Parameter[str], 56 | hyperparameters: OutputArtifact[HyperParameters], 57 | ): 58 | """A TFX custom-Python-function component for receiving hyperparameters.""" 59 | 60 | hp_dict = dict() 61 | hp_dict["num_epochs"] = num_epochs 62 | hp_dict["batch_size"] = batch_size 63 | hp_dict["learning_rate"] = learning_rate 64 | hp_dict["hidden_units"] = [int(units) for units in hidden_units.split(",")] 65 | logging.info(f"Hyperparameters: {hp_dict}") 66 | 67 | hyperparams_uri = os.path.join( 68 | artifact_utils.get_single_uri([hyperparameters]), HYPERPARAM_FILENAME 69 | ) 70 | io_utils.write_string_file(hyperparams_uri, json.dumps(hp_dict)) 71 | logging.info(f"Hyperparameters are written to: {hyperparams_uri}") 72 | 73 | 74 | @component 75 | def vertex_model_uploader( 76 | project: Parameter[str], 77 | region: Parameter[str], 78 | model_display_name: Parameter[str], 79 | pushed_model_location: Parameter[str], 80 | serving_image_uri: Parameter[str], 81 | explanation_config: Parameter[str], 82 | uploaded_model: OutputArtifact[UploadedModel], 83 | ): 84 | """A TFX custom-Python-function component to upload the model to Vertex.""" 85 | 86 | vertex_ai.init(project=project, location=region) 87 | 88 | pushed_model_dir = os.path.join( 89 | pushed_model_location, tf.io.gfile.listdir(pushed_model_location)[-1] 90 | ) 91 | 92 | logging.info(f"Model registry location: {pushed_model_dir}") 93 | 94 | try: 95 | explanation_config = json.loads(explanation_config)  # The config is passed in as a JSON string. 96 | explanation_metadata = vertex_ai.explain.ExplanationMetadata( 97 | inputs=explanation_config["inputs"], outputs=explanation_config["outputs"] 98 | ) 99 | explanation_parameters = vertex_ai.explain.ExplanationParameters( 100 | explanation_config["params"] 101 | ) 102 | except Exception: 103 | explanation_metadata = None 104 | explanation_parameters = None 105 | 106 | vertex_model = vertex_ai.Model.upload( 107 | 
display_name=model_display_name, 108 | artifact_uri=pushed_model_dir, 109 | serving_container_image_uri=serving_image_uri, 110 | parameters_schema_uri=None, 111 | instance_schema_uri=None, 112 | explanation_metadata=explanation_metadata, 113 | explanation_parameters=explanation_parameters, 114 | ) 115 | 116 | model_uri = vertex_model.gca_resource.name 117 | logging.info(f"Model uploaded to Vertex AI: {model_uri}") 118 | uploaded_model.set_string_custom_property("model_uri", model_uri) 119 | 120 | 121 | @component 122 | def bigquery_data_gen( 123 | sql_query: Parameter[str], 124 | output_data_format: Parameter[str], 125 | beam_args: Parameter[str], 126 | serving_dataset: OutputArtifact[Dataset], 127 | ): 128 | """A TFX custom-Python-function component for extracting data from BigQuery.""" 129 | 130 | output_dir = os.path.join( 131 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX 132 | ) 133 | 134 | pipeline_args = json.loads(beam_args) 135 | pipeline_args["sql_query"] = sql_query 136 | pipeline_args["exported_data_prefix"] = output_dir 137 | pipeline_args["output_data_format"] = output_data_format 138 | 139 | logging.info("Data extraction started. Source query:") 140 | logging.info(f"{sql_query}") 141 | etl.run_extract_pipeline(pipeline_args) 142 | logging.info("Data extraction completed.") 143 | 144 | 145 | @component 146 | def vertex_batch_prediction( 147 | project: Parameter[str], 148 | region: Parameter[str], 149 | model_display_name: Parameter[str], 150 | instances_format: Parameter[str], 151 | predictions_format: Parameter[str], 152 | job_resources: Parameter[str], 153 | serving_dataset: InputArtifact[Dataset], 154 | prediction_results: OutputArtifact[Dataset], 155 | ): 156 | """A TFX custom-Python-function component to submit a Vertex batch prediction job.""" 157 | 158 | job_resources = json.loads(job_resources) 159 | gcs_source_pattern = ( 160 | os.path.join( 161 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX 162 | ) 163 | + "*.jsonl" 164 | ) 165 | 166 | gcs_destination_prefix = artifact_utils.get_single_uri([prediction_results]) 167 | 168 | vertex_client = VertexClient(project, region) 169 | logging.info("Submitting Vertex AI batch prediction job...") 170 | batch_prediction_job = vertex_client.submit_batch_prediction_job( 171 | model_display_name=model_display_name, 172 | gcs_source_pattern=gcs_source_pattern, 173 | gcs_destination_prefix=gcs_destination_prefix, 174 | instances_format=instances_format, 175 | predictions_format=predictions_format, 176 | other_configurations=job_resources, 177 | ) 178 | logging.info("Batch prediction job completed.") 179 | prediction_results.set_string_custom_property( 180 | "batch_prediction_job", batch_prediction_job.gca_resource.name 181 | ) 182 | 183 | 184 | @component 185 | def datastore_prediction_writer( 186 | datastore_kind: Parameter[str], 187 | predictions_format: Parameter[str], 188 | beam_args: Parameter[str], 189 | prediction_results: InputArtifact[Dataset], 190 | ): 191 | """A TFX custom-Python-function component for writing prediction JSONL files to Datastore.""" 192 | 193 | prediction_results_dir = os.path.join( 194 | artifact_utils.get_single_uri([prediction_results]) 195 | ) 196 | prediction_results_dir = os.path.join( 197 | prediction_results_dir, tf.io.gfile.listdir(prediction_results_dir)[0] 198 | ) 199 | prediction_results_uri = os.path.join( 200 | prediction_results_dir, PREDICTION_RESULTS_PREFIX 201 | ) 202 | 203 | pipeline_args = json.loads(beam_args) 204 | 
pipeline_args["prediction_results_uri"] = prediction_results_uri 205 | pipeline_args["datastore_kind"] = datastore_kind 206 | pipeline_args["predictions_format"] = predictions_format 207 | 208 | logging.info(f"Storing predictions to Datastore kind: {datastore_kind}") 209 | etl.run_store_predictions_pipeline(pipeline_args) 210 | logging.info("Predictions are stored.") 211 | -------------------------------------------------------------------------------- /src/tfx_pipelines/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX pipeline configurations.""" 15 | 16 | import os 17 | 18 | PROJECT_ID = os.getenv("PROJECT_ID", "ksalama-cloudml") 19 | REGION = os.getenv("REGION", "us-central1") 20 | GCS_LOCATION = os.getenv("GCS_LOCATION", "gs://ksalama-cloudml-us/chicago-taxi-tips") 21 | 22 | ARTIFACT_STORE_URI = os.path.join(GCS_LOCATION, "tfx_artifacts") 23 | MODEL_REGISTRY_URI = os.getenv( 24 | "MODEL_REGISTRY_URI", 25 | os.path.join(GCS_LOCATION, "model_registry"), 26 | ) 27 | 28 | DATASET_DISPLAY_NAME = os.getenv("DATASET_DISPLAY_NAME", "chicago-taxi-tips") 29 | MODEL_DISPLAY_NAME = os.getenv( 30 | "MODEL_DISPLAY_NAME", f"{DATASET_DISPLAY_NAME}-classifier" 31 | ) 32 | PIPELINE_NAME = os.getenv("PIPELINE_NAME", f"{MODEL_DISPLAY_NAME}-train-pipeline") 33 | 34 | ML_USE_COLUMN = "ml_use" 35 | EXCLUDE_COLUMNS = ",".join(["trip_start_timestamp"]) 36 | TRAIN_LIMIT = os.getenv("TRAIN_LIMIT", "0") 37 | TEST_LIMIT = os.getenv("TEST_LIMIT", "0") 38 | SERVE_LIMIT = os.getenv("SERVE_LIMIT", "0") 39 | 40 | NUM_TRAIN_SPLITS = os.getenv("NUM_TRAIN_SPLITS", "4") 41 | NUM_EVAL_SPLITS = os.getenv("NUM_EVAL_SPLITS", "1") 42 | ACCURACY_THRESHOLD = os.getenv("ACCURACY_THRESHOLD", "0.8") 43 | 44 | USE_KFP_SA = os.getenv("USE_KFP_SA", "False") 45 | 46 | TFX_IMAGE_URI = os.getenv( 47 | "TFX_IMAGE_URI", f"gcr.io/{PROJECT_ID}/tfx-{DATASET_DISPLAY_NAME}:latest" 48 | ) 49 | 50 | BEAM_RUNNER = os.getenv("BEAM_RUNNER", "DirectRunner") 51 | BEAM_DIRECT_PIPELINE_ARGS = [ 52 | f"--project={PROJECT_ID}", 53 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}", 54 | ] 55 | BEAM_DATAFLOW_PIPELINE_ARGS = [ 56 | f"--project={PROJECT_ID}", 57 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}", 58 | f"--region={REGION}", 59 | f"--runner={BEAM_RUNNER}", 60 | ] 61 | 62 | 63 | TRAINING_RUNNER = os.getenv("TRAINING_RUNNER", "local") 64 | AI_PLATFORM_TRAINING_ARGS = { 65 | "project": PROJECT_ID, 66 | "region": REGION, 67 | "masterConfig": {"imageUri": TFX_IMAGE_URI}, 68 | } 69 | 70 | 71 | SERVING_RUNTIME = os.getenv("SERVING_RUNTIME", "tf2-cpu.2-4") 72 | SERVING_IMAGE_URI = f"gcr.io/cloud-aiplatform/prediction/{SERVING_RUNTIME}:latest" 73 | 74 | BATCH_PREDICTION_BQ_DATASET_NAME = os.getenv( 75 | "BATCH_PREDICTION_BQ_DATASET_NAME", "playground_us" 76 | ) 77 | BATCH_PREDICTION_BQ_TABLE_NAME = os.getenv( 78 | "BATCH_PREDICTION_BQ_TABLE_NAME", "chicago_taxitrips_prep" 
79 | ) 80 | BATCH_PREDICTION_BEAM_ARGS = { 81 | "runner": f"{BEAM_RUNNER}", 82 | "temporary_dir": os.path.join(GCS_LOCATION, "temp"), 83 | "gcs_location": os.path.join(GCS_LOCATION, "temp"), 84 | "project": PROJECT_ID, 85 | "region": REGION, 86 | "setup_file": "./setup.py", 87 | } 88 | BATCH_PREDICTION_JOB_RESOURCES = { 89 | "machine_type": "n1-standard-2", 90 | #'accelerator_count': 1, 91 | #'accelerator_type': 'NVIDIA_TESLA_T4' 92 | "starting_replica_count": 1, 93 | "max_replica_count": 10, 94 | } 95 | DATASTORE_PREDICTION_KIND = f"{MODEL_DISPLAY_NAME}-predictions" 96 | 97 | ENABLE_CACHE = os.getenv("ENABLE_CACHE", "0") 98 | UPLOAD_MODEL = os.getenv("UPLOAD_MODEL", "1") 99 | 100 | os.environ["PROJECT_ID"] = PROJECT_ID 101 | os.environ["PIPELINE_NAME"] = PIPELINE_NAME 102 | os.environ["TFX_IMAGE_URI"] = TFX_IMAGE_URI 103 | os.environ["MODEL_REGISTRY_URI"] = MODEL_REGISTRY_URI 104 | -------------------------------------------------------------------------------- /src/tfx_pipelines/prediction_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX prediction pipeline definition.""" 15 | 16 | import os 17 | import sys 18 | import json 19 | import logging 20 | 21 | from tfx.orchestration import pipeline, data_types 22 | from ml_metadata.proto import metadata_store_pb2 23 | 24 | SCRIPT_DIR = os.path.dirname( 25 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 26 | ) 27 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 28 | 29 | from src.tfx_pipelines import config 30 | from src.tfx_pipelines import components as custom_components 31 | from src.common import datasource_utils 32 | 33 | 34 | def create_pipeline( 35 | pipeline_root: str, 36 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None, 37 | ): 38 | """Returns a batch prediction pipeline using TFX.""" 39 | 40 | # Get source query. 
41 | sql_query = datasource_utils.get_serving_source_query( 42 | bq_dataset_name=config.BATCH_PREDICTION_BQ_DATASET_NAME, 43 | bq_table_name=config.BATCH_PREDICTION_BQ_TABLE_NAME, 44 | limit=int(config.SERVE_LIMIT), 45 | ) 46 | 47 | bigquery_data_gen = custom_components.bigquery_data_gen( 48 | sql_query=sql_query, 49 | output_data_format="jsonl", 50 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS), 51 | ) 52 | 53 | vertex_batch_prediction = custom_components.vertex_batch_prediction( 54 | project=config.PROJECT_ID, 55 | region=config.REGION, 56 | model_display_name=config.MODEL_DISPLAY_NAME, 57 | instances_format="jsonl", 58 | predictions_format="jsonl", 59 | job_resources=json.dumps(config.BATCH_PREDICTION_JOB_RESOURCES), 60 | serving_dataset=bigquery_data_gen.outputs.serving_dataset, 61 | ) 62 | 63 | datastore_prediction_writer = custom_components.datastore_prediction_writer( 64 | datastore_kind=config.DATASTORE_PREDICTION_KIND, 65 | predictions_format="jsonl", 66 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS), 67 | prediction_results=vertex_batch_prediction.outputs.prediction_results, 68 | ) 69 | 70 | pipeline_components = [ 71 | bigquery_data_gen, 72 | vertex_batch_prediction, 73 | datastore_prediction_writer, 74 | ] 75 | 76 | logging.info( 77 | f"Pipeline components: {[component.id for component in pipeline_components]}" 78 | ) 79 | 80 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS 81 | if config.BEAM_RUNNER == "DataflowRunner": 82 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS 83 | 84 | logging.info(f"Beam pipeline args: {beam_pipeline_args}") 85 | 86 | return pipeline.Pipeline( 87 | pipeline_name=config.PIPELINE_NAME, 88 | pipeline_root=pipeline_root, 89 | components=pipeline_components, 90 | beam_pipeline_args=beam_pipeline_args, 91 | metadata_connection_config=metadata_connection_config, 92 | enable_cache=int(config.ENABLE_CACHE), 93 | ) 94 | -------------------------------------------------------------------------------- /src/tfx_pipelines/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Define KubeflowV2DagRunner to run the training pipeline using Managed Pipelines.""" 15 | 16 | 17 | import os 18 | from kfp.v2.google.client import AIPlatformClient 19 | from tfx.orchestration import data_types 20 | from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner 21 | 22 | 23 | from src.tfx_pipelines import config, training_pipeline, prediction_pipeline 24 | from src.model_training import defaults 25 | 26 | 27 | def compile_training_pipeline(pipeline_definition_file): 28 | """Returns the training pipeline definition.""" 29 | 30 | pipeline_root = os.path.join( 31 | config.ARTIFACT_STORE_URI, 32 | config.PIPELINE_NAME, 33 | ) 34 | 35 | managed_pipeline = training_pipeline.create_pipeline( 36 | pipeline_root=pipeline_root, 37 | num_epochs=data_types.RuntimeParameter( 38 | name="num_epochs", 39 | default=defaults.NUM_EPOCHS, 40 | ptype=int, 41 | ), 42 | batch_size=data_types.RuntimeParameter( 43 | name="batch_size", 44 | default=defaults.BATCH_SIZE, 45 | ptype=int, 46 | ), 47 | learning_rate=data_types.RuntimeParameter( 48 | name="learning_rate", 49 | default=defaults.LEARNING_RATE, 50 | ptype=float, 51 | ), 52 | hidden_units=data_types.RuntimeParameter( 53 | name="hidden_units", 54 | default=",".join(str(u) for u in defaults.HIDDEN_UNITS), 55 | ptype=str, 56 | ), 57 | ) 58 | 59 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner( 60 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( 61 | default_image=config.TFX_IMAGE_URI 62 | ), 63 | output_filename=pipeline_definition_file, 64 | ) 65 | 66 | return runner.run(managed_pipeline, write_out=True) 67 | 68 | 69 | def compile_prediction_pipeline(pipeline_definition_file): 70 | """Returns the prediction pipeline definition.""" 71 | 72 | pipeline_root = os.path.join( 73 | config.ARTIFACT_STORE_URI, 74 | config.PIPELINE_NAME, 75 | ) 76 | 77 | managed_pipeline = prediction_pipeline.create_pipeline( 78 | pipeline_root=pipeline_root, 79 | ) 80 | 81 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner( 82 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( 83 | default_image=config.TFX_IMAGE_URI 84 | ), 85 | output_filename=pipeline_definition_file, 86 | ) 87 | 88 | return runner.run(managed_pipeline, write_out=True) 89 | 90 | 91 | def submit_pipeline(pipeline_definition_file): 92 | """Submits a pipeline definition file to Vertex pipelines.""" 93 | 94 | pipeline_client = AIPlatformClient(project_id=config.PROJECT, region=config.REGION) 95 | pipeline_client.create_run_from_job_spec(pipeline_definition_file) 96 | -------------------------------------------------------------------------------- /src/tfx_pipelines/training_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """TFX training pipeline definition.""" 15 | 16 | import os 17 | import sys 18 | import logging 19 | import json 20 | 21 | import tensorflow_model_analysis as tfma 22 | 23 | import tfx 24 | from tfx.proto import example_gen_pb2, transform_pb2, trainer_pb2 25 | from tfx.orchestration import pipeline, data_types 26 | from tfx.dsl.components.base import executor_spec 27 | from tfx.components.trainer import executor as trainer_executor 28 | from tfx.extensions.google_cloud_ai_platform.trainer import ( 29 | executor as ai_platform_trainer_executor, 30 | ) 31 | from tfx.extensions.google_cloud_big_query.example_gen.component import ( 32 | BigQueryExampleGen, 33 | ) 34 | from tfx.components import ( 35 | StatisticsGen, 36 | ExampleValidator, 37 | Transform, 38 | Trainer, 39 | Evaluator, 40 | Pusher, 41 | ) 42 | from tfx.dsl.components.common.importer import Importer 43 | from tfx.dsl.components.common.resolver import Resolver 44 | from tfx.dsl.experimental import latest_artifacts_resolver 45 | from tfx.dsl.experimental import latest_blessed_model_resolver 46 | 47 | from ml_metadata.proto import metadata_store_pb2 48 | 49 | SCRIPT_DIR = os.path.dirname( 50 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 51 | ) 52 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 53 | 54 | from src.tfx_pipelines import config 55 | from src.tfx_pipelines import components as custom_components 56 | from src.common import features, datasource_utils 57 | 58 | RAW_SCHEMA_DIR = "src/raw_schema" 59 | TRANSFORM_MODULE_FILE = "src/preprocessing/transformations.py" 60 | TRAIN_MODULE_FILE = "src/model_training/runner.py" 61 | 62 | MISSING = { 63 | "trip_month": -1, 64 | "trip_day": -1, 65 | "trip_day_of_week": -1, 66 | "trip_hour": -1, 67 | "trip_seconds": -1, 68 | "trip_miles": -1, 69 | "payment_type": "NA", 70 | "pickup_grid": "NA", 71 | "dropoff_grid": "NA", 72 | "euclidean": -1, 73 | } 74 | 75 | 76 | def create_pipeline( 77 | pipeline_root: str, 78 | num_epochs: data_types.RuntimeParameter, 79 | batch_size: data_types.RuntimeParameter, 80 | learning_rate: data_types.RuntimeParameter, 81 | hidden_units: data_types.RuntimeParameter, 82 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None, 83 | ): 84 | """Returns a TFX training pipeline.""" 85 | 86 | local_executor_spec = executor_spec.ExecutorClassSpec( 87 | trainer_executor.GenericExecutor 88 | ) 89 | 90 | caip_executor_spec = executor_spec.ExecutorClassSpec( 91 | ai_platform_trainer_executor.GenericExecutor 92 | ) 93 | 94 | # Hyperparameter generation. 95 | hyperparams_gen = custom_components.hyperparameters_gen( 96 | num_epochs=num_epochs, 97 | batch_size=batch_size, 98 | learning_rate=learning_rate, 99 | hidden_units=hidden_units, 100 | ).with_id("HyperparamsGen") 101 | 102 | # Get train source query. 103 | train_sql_query = datasource_utils.get_training_source_query( 104 | config.PROJECT_ID, 105 | config.REGION, 106 | config.DATASET_DISPLAY_NAME, 107 | ml_use="UNASSIGNED", 108 | limit=int(config.TRAIN_LIMIT), 109 | ) 110 | 111 | train_output_config = example_gen_pb2.Output( 112 | split_config=example_gen_pb2.SplitConfig( 113 | splits=[ 114 | example_gen_pb2.SplitConfig.Split( 115 | name="train", hash_buckets=int(config.NUM_TRAIN_SPLITS) 116 | ), 117 | example_gen_pb2.SplitConfig.Split( 118 | name="eval", hash_buckets=int(config.NUM_EVAL_SPLITS) 119 | ), 120 | ] 121 | ) 122 | ) 123 | 124 | # Train example generation. 
125 | train_example_gen = BigQueryExampleGen( 126 | query=train_sql_query, 127 | output_config=train_output_config, 128 | ).with_id("TrainDataGen") 129 | 130 | # Get test source query. 131 | test_sql_query = datasource_utils.get_training_source_query( 132 | config.PROJECT_ID, 133 | config.REGION, 134 | config.DATASET_DISPLAY_NAME, 135 | ml_use="TEST", 136 | limit=int(config.TEST_LIMIT), 137 | ) 138 | 139 | test_output_config = example_gen_pb2.Output( 140 | split_config=example_gen_pb2.SplitConfig( 141 | splits=[ 142 | example_gen_pb2.SplitConfig.Split(name="test", hash_buckets=1), 143 | ] 144 | ) 145 | ) 146 | 147 | # Test example generation. 148 | test_example_gen = BigQueryExampleGen( 149 | query=test_sql_query, 150 | output_config=test_output_config, 151 | ).with_id("TestDataGen") 152 | 153 | # Schema importer. 154 | schema_importer = Importer( 155 | source_uri=RAW_SCHEMA_DIR, 156 | artifact_type=tfx.types.standard_artifacts.Schema, 157 | ).with_id("SchemaImporter") 158 | 159 | # Statistics generation. 160 | statistics_gen = StatisticsGen(examples=train_example_gen.outputs.examples).with_id( 161 | "StatisticsGen" 162 | ) 163 | 164 | # Example validation. 165 | example_validator = ExampleValidator( 166 | statistics=statistics_gen.outputs.statistics, 167 | schema=schema_importer.outputs.result, 168 | ).with_id("ExampleValidator") 169 | 170 | # Data transformation. 171 | transform = Transform( 172 | examples=train_example_gen.outputs.examples, 173 | schema=schema_importer.outputs.result, 174 | module_file=TRANSFORM_MODULE_FILE, 175 | splits_config=transform_pb2.SplitsConfig( 176 | analyze=["train"], transform=["train", "eval"] 177 | ), 178 | ).with_id("DataTransformer") 179 | 180 | # Add dependency from example_validator to transform. 181 | transform.add_upstream_node(example_validator) 182 | 183 | # Get the latest model to warmstart 184 | warmstart_model_resolver = Resolver( 185 | strategy_class=latest_artifacts_resolver.LatestArtifactsResolver, 186 | latest_model=tfx.types.Channel(type=tfx.types.standard_artifacts.Model), 187 | ).with_id("WarmstartModelResolver") 188 | 189 | # Model training. 190 | trainer = Trainer( 191 | custom_executor_spec=local_executor_spec 192 | if config.TRAINING_RUNNER == "local" 193 | else caip_executor_spec, 194 | module_file=TRAIN_MODULE_FILE, 195 | transformed_examples=transform.outputs.transformed_examples, 196 | schema=schema_importer.outputs.result, 197 | # base_model=warmstart_model_resolver.outputs.latest_model, 198 | transform_graph=transform.outputs.transform_graph, 199 | train_args=trainer_pb2.TrainArgs(num_steps=0), 200 | eval_args=trainer_pb2.EvalArgs(num_steps=None), 201 | hyperparameters=hyperparams_gen.outputs.hyperparameters, 202 | ).with_id("ModelTrainer") 203 | 204 | # Get the latest blessed model (baseline) for model validation. 205 | baseline_model_resolver = Resolver( 206 | strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver, 207 | model=tfx.types.Channel(type=tfx.types.standard_artifacts.Model), 208 | model_blessing=tfx.types.Channel( 209 | type=tfx.types.standard_artifacts.ModelBlessing 210 | ), 211 | ).with_id("BaselineModelResolver") 212 | 213 | # Prepare evaluation config. 
214 | eval_config = tfma.EvalConfig( 215 | model_specs=[ 216 | tfma.ModelSpec( 217 | signature_name="serving_tf_example", 218 | label_key=features.TARGET_FEATURE_NAME, 219 | prediction_key="probabilities", 220 | ) 221 | ], 222 | slicing_specs=[ 223 | tfma.SlicingSpec(), 224 | ], 225 | metrics_specs=[ 226 | tfma.MetricsSpec( 227 | metrics=[ 228 | tfma.MetricConfig(class_name="ExampleCount"), 229 | tfma.MetricConfig( 230 | class_name="BinaryAccuracy", 231 | threshold=tfma.MetricThreshold( 232 | value_threshold=tfma.GenericValueThreshold( 233 | lower_bound={"value": float(config.ACCURACY_THRESHOLD)} 234 | ), 235 | # Change threshold will be ignored if there is no 236 | # baseline model resolved from MLMD (first run). 237 | change_threshold=tfma.GenericChangeThreshold( 238 | direction=tfma.MetricDirection.HIGHER_IS_BETTER, 239 | absolute={"value": -1e-10}, 240 | ), 241 | ), 242 | ), 243 | ] 244 | ) 245 | ], 246 | ) 247 | 248 | # Model evaluation. 249 | evaluator = Evaluator( 250 | examples=test_example_gen.outputs.examples, 251 | example_splits=["test"], 252 | model=trainer.outputs.model, 253 | # baseline_model=baseline_model_resolver.outputs.model, 254 | eval_config=eval_config, 255 | schema=schema_importer.outputs.result, 256 | ).with_id("ModelEvaluator") 257 | 258 | exported_model_location = os.path.join( 259 | config.MODEL_REGISTRY_URI, config.MODEL_DISPLAY_NAME 260 | ) 261 | push_destination = tfx.proto.pusher_pb2.PushDestination( 262 | filesystem=tfx.proto.pusher_pb2.PushDestination.Filesystem( 263 | base_directory=exported_model_location 264 | ) 265 | ) 266 | 267 | # Push custom model to model registry. 268 | pusher = Pusher( 269 | model=trainer.outputs.model, 270 | model_blessing=evaluator.outputs.blessing, 271 | push_destination=push_destination, 272 | ).with_id("ModelPusher") 273 | 274 | # Upload custom trained model to Vertex AI. 275 | explanation_config = json.dumps(features.generate_explanation_config()) 276 | vertex_model_uploader = custom_components.vertex_model_uploader( 277 | project=config.PROJECT_ID, 278 | region=config.REGION, 279 | model_display_name=config.MODEL_DISPLAY_NAME, 280 | pushed_model_location=exported_model_location, 281 | serving_image_uri=config.SERVING_IMAGE_URI, 282 | explanation_config=explanation_config, 283 | ).with_id("VertexUploader") 284 | 285 | pipeline_components = [ 286 | hyperparams_gen, 287 | train_example_gen, 288 | test_example_gen, 289 | statistics_gen, 290 | schema_importer, 291 | example_validator, 292 | transform, 293 | # warmstart_model_resolver, 294 | trainer, 295 | # baseline_model_resolver, 296 | evaluator, 297 | pusher, 298 | ] 299 | 300 | if int(config.UPLOAD_MODEL): 301 | pipeline_components.append(vertex_model_uploader) 302 | # Add dependency from pusher to aip_model_uploader. 
303 | vertex_model_uploader.add_upstream_node(pusher) 304 | 305 | logging.info( 306 | f"Pipeline components: {[component.id for component in pipeline_components]}" 307 | ) 308 | 309 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS 310 | if config.BEAM_RUNNER == "DataflowRunner": 311 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS 312 | 313 | logging.info(f"Beam pipeline args: {beam_pipeline_args}") 314 | 315 | return pipeline.Pipeline( 316 | pipeline_name=config.PIPELINE_NAME, 317 | pipeline_root=pipeline_root, 318 | components=pipeline_components, 319 | beam_pipeline_args=beam_pipeline_args, 320 | metadata_connection_config=metadata_connection_config, 321 | enable_cache=int(config.ENABLE_CACHE), 322 | ) 323 | --------------------------------------------------------------------------------