├── .gitignore ├── 01-dataset-management.ipynb ├── 02-experimentation.ipynb ├── 03-training-formalization.ipynb ├── 04-pipeline-deployment.ipynb ├── 05-continuous-training.ipynb ├── 06-model-deployment.ipynb ├── 07-prediction-serving.ipynb ├── 08-model-monitoring.ipynb ├── Dockerfile ├── LICENSE ├── README.md ├── build ├── Dockerfile ├── model-deployment.yaml ├── pipeline-deployment.yaml ├── serving_resources_spec.json └── utils.py ├── mlops.png ├── provision ├── README.md └── terraform │ ├── gcs-bucket.tf │ ├── main.tf │ ├── notebook-instance.tf │ ├── service-accounts.tf │ ├── services.tf │ ├── terraform.tfvars │ └── variables.tf ├── requirements.txt ├── setup.py └── src ├── __init__.py ├── common ├── __init__.py ├── datasource_utils.py └── features.py ├── model_training ├── __init__.py ├── data.py ├── defaults.py ├── exporter.py ├── model.py ├── runner.py ├── task.py └── trainer.py ├── pipeline_triggering ├── __init__.py ├── main.py └── requirements.txt ├── preprocessing ├── __init__.py ├── etl.py └── transformations.py ├── raw_schema └── schema.pbtxt ├── tests ├── __init__.py ├── datasource_utils_tests.py ├── etl_tests.py ├── model_deployment_tests.py ├── model_tests.py └── pipeline_deployment_tests.py └── tfx_pipelines ├── __init__.py ├── components.py ├── config.py ├── prediction_pipeline.py ├── runner.py └── training_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | .idea/ 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | _workspace/ 132 | *.tar.gz 133 | .egg-info/ 134 | *.whl 135 | mlpipeline-ui-metadata.json 136 | *.csv 137 | *.sqllite 138 | model.png 139 | -------------------------------------------------------------------------------- /04-pipeline-deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d272910e", 6 | "metadata": {}, 7 | "source": [ 8 | "# 04 - Test and deploy a TFX training pipeline to `Vertex Pipelines`\n", 9 | "\n", 10 | "The purpose of this notebook is to test, deploy, and run the `TFX` pipeline on `Vertex Pipelines`. The notebook covers the following tasks:\n", 11 | "\n", 12 | "1. Run the tests locally.\n", 13 | "2. Run the `TFX` pipeline using `Vertex Pipelines`\n", 14 | "3. Execute the pipeline deployment `CI/CD` steps using `Cloud Build`." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "beaa2787", 20 | "metadata": {}, 21 | "source": [ 22 | "## Setup" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "51e05608", 28 | "metadata": {}, 29 | "source": [ 30 | "### Import libraries" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "9aa72b29", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "import kfp\n", 42 | "import tfx\n", 43 | "\n", 44 | "print('TFX:', tfx.__version__)\n", 45 | "print('KFP:', kfp.__version__)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "24aceb9a", 51 | "metadata": {}, 52 | "source": [ 53 | "### Setup Google Cloud project" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "d8d9f81b", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 64 | "REGION = 'us-central1' # Change to your region.\n", 65 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 66 | "SERVICE_ACCOUNT = '[your-service-account]'\n", 67 | "\n", 68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 69 | " # Get your GCP project id from gcloud\n", 70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 71 | " PROJECT_ID = shell_output[0]\n", 72 | " \n", 73 | "if SERVICE_ACCOUNT == '' or SERVICE_ACCOUNT is None or SERVICE_ACCOUNT == '[your-service-account]':\n", 74 | " # Get your GCP project id from gcloud\n", 75 | " shell_output = !gcloud config list --format 'value(core.account)' 2>/dev/null\n", 76 | " SERVICE_ACCOUNT = shell_output[0]\n", 77 | " \n", 78 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n", 79 | " # Set your bucket name using your GCP project id\n", 80 | " BUCKET = PROJECT_ID\n", 81 | " # Try to create the bucket if it doesn'exists\n", 82 | " ! 
gsutil mb -l $REGION gs://$BUCKET\n", 83 | " print('')\n", 84 | " \n", 85 | "print('Project ID:', PROJECT_ID)\n", 86 | "print('Region:', REGION)\n", 87 | "print('Bucket name:', BUCKET)\n", 88 | "print('Service Account:', SERVICE_ACCOUNT)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "3d24d18b", 94 | "metadata": {}, 95 | "source": [ 96 | "### Set configurations" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "a8295cca", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "BQ_LOCATION = 'US'\n", 107 | "BQ_DATASET_NAME = 'playground_us' # Change to your BQ dataset name.\n", 108 | "BQ_TABLE_NAME = 'chicago_taxitrips_prep'\n", 109 | "\n", 110 | "VERSION = 'v1'\n", 111 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 112 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 113 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", 114 | "\n", 115 | "CICD_IMAGE_NAME = 'cicd:latest'\n", 116 | "CICD_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{CICD_IMAGE_NAME}'" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "81d049f5", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "! rm -r src/raw_schema/.ipynb_checkpoints/" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "4cbcbbb8", 132 | "metadata": {}, 133 | "source": [ 134 | "## 1. Run the CI/CD steps locally" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "58845362", 140 | "metadata": {}, 141 | "source": [ 142 | "### Set pipeline configurations for the local run" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "44c48da6", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "os.environ['DATASET_DISPLAY_NAME'] = DATASET_DISPLAY_NAME\n", 153 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 154 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n", 155 | "os.environ['PROJECT'] = PROJECT_ID\n", 156 | "os.environ['REGION'] = REGION\n", 157 | "os.environ['BQ_LOCATION'] = BQ_LOCATION\n", 158 | "os.environ['BQ_DATASET_NAME'] = BQ_DATASET_NAME\n", 159 | "os.environ['BQ_TABLE_NAME'] = BQ_TABLE_NAME\n", 160 | "os.environ['GCS_LOCATION'] = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/e2e_tests'\n", 161 | "os.environ['TRAIN_LIMIT'] = '1000'\n", 162 | "os.environ['TEST_LIMIT'] = '100'\n", 163 | "os.environ['UPLOAD_MODEL'] = '0'\n", 164 | "os.environ['ACCURACY_THRESHOLD'] = '0.1'\n", 165 | "os.environ['BEAM_RUNNER'] = 'DirectRunner'\n", 166 | "os.environ['TRAINING_RUNNER'] = 'local'" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "fcf65dee", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from src.tfx_pipelines import config\n", 177 | "import importlib\n", 178 | "importlib.reload(config)\n", 179 | "\n", 180 | "for key, value in config.__dict__.items():\n", 181 | " if key.isupper(): print(f'{key}: {value}')" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "0e4989b9", 187 | "metadata": {}, 188 | "source": [ 189 | "### Run the unit tests for the data and model pipeline components" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "37324634", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "! 
py.test src/tests/datasource_utils_tests.py -s" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "1a40f106", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "! py.test src/tests/model_tests.py -s" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "f3b62aea", 215 | "metadata": {}, 216 | "source": [ 217 | "### Run the e2e pipeline test" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "id": "3acb31cf", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "! py.test src/tests/pipeline_deployment_tests.py::test_e2e_pipeline -s" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "id": "1c8758df", 233 | "metadata": {}, 234 | "source": [ 235 | "## 2. Run the training pipeline using `Vertex Pipelines`" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "a02ce062", 241 | "metadata": {}, 242 | "source": [ 243 | "### Set the pipeline configurations for the `Vertex Pipeline` run" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "01c2b3e1", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "os.environ['DATASET_DISPLAY_NAME'] = DATASET_DISPLAY_NAME\n", 254 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 255 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n", 256 | "os.environ['PROJECT'] = PROJECT_ID\n", 257 | "os.environ['REGION'] = REGION\n", 258 | "os.environ['GCS_LOCATION'] = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}'\n", 259 | "os.environ['TRAIN_LIMIT'] = '85000'\n", 260 | "os.environ['TEST_LIMIT'] = '15000'\n", 261 | "os.environ['BEAM_RUNNER'] = 'DataflowRunner'\n", 262 | "os.environ['TRAINING_RUNNER'] = 'vertex'\n", 263 | "os.environ['TFX_IMAGE_URI'] = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "9e8be723", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "from src.tfx_pipelines import config\n", 274 | "import importlib\n", 275 | "importlib.reload(config)\n", 276 | "\n", 277 | "for key, value in config.__dict__.items():\n", 278 | " if key.isupper(): print(f'{key}: {value}')" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "id": "286ff84e", 284 | "metadata": {}, 285 | "source": [ 286 | "### Build the training container image\n", 287 | "\n", 288 | "This is the `TFX` runtime environment for the training pipeline steps." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "d9686014", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "!echo $TFX_IMAGE_URI" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "id": "7f7986c2", 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "!gcloud builds submit --tag $TFX_IMAGE_URI . 
--timeout=15m --machine-type=e2-highcpu-8" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "id": "ce2d5c9a", 314 | "metadata": {}, 315 | "source": [ 316 | "### Compile the `TFX` pipeline" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "id": "df29fc7e", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "from src.tfx_pipelines import runner\n", 327 | "\n", 328 | "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n", 329 | "pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "75928c08", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "PIPELINES_STORE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/compiled_pipelines/'\n", 340 | "! gsutil cp {pipeline_definition_file} {PIPELINES_STORE}" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "id": "ef836781", 346 | "metadata": {}, 347 | "source": [ 348 | "### Submit run to Vertex Pipelines" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "1d1115bd", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "from kfp.v2.google.client import AIPlatformClient\n", 359 | "\n", 360 | "pipeline_client = AIPlatformClient(\n", 361 | " project_id=PROJECT_ID, region=REGION)\n", 362 | " \n", 363 | "job = pipeline_client.create_run_from_job_spec(\n", 364 | " job_spec_path=pipeline_definition_file,\n", 365 | " parameter_values={\n", 366 | " 'learning_rate': 0.003,\n", 367 | " 'batch_size': 512,\n", 368 | " 'hidden_units': '128,128',\n", 369 | " 'num_epochs': 30,\n", 370 | " }\n", 371 | ")" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "7cc4477f", 377 | "metadata": {}, 378 | "source": [ 379 | "### Extracting pipeline runs metadata" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "id": "464ad3a8", 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "from google.cloud import aiplatform as vertex_ai\n", 390 | "\n", 391 | "pipeline_df = vertex_ai.get_pipeline_df(PIPELINE_NAME)\n", 392 | "pipeline_df = pipeline_df[pipeline_df.pipeline_name == PIPELINE_NAME]\n", 393 | "pipeline_df.T" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "id": "ad380129", 399 | "metadata": {}, 400 | "source": [ 401 | "## 3. Execute the pipeline deployment CI/CD steps in Cloud Build\n", 402 | "\n", 403 | "The CI/CD routine is defined in the [pipeline-deployment.yaml](pipeline-deployment.yaml) file, and consists of the following steps:\n", 404 | "1. Clone the repository to the build environment.\n", 405 | "2. Run unit tests.\n", 406 | "3. Run a local e2e test of the pipeline.\n", 407 | "4. Build the ML container image for pipeline steps.\n", 408 | "5. Compile the pipeline.\n", 409 | "6. Upload the pipeline to Cloud Storage." 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "id": "e00b075f", 415 | "metadata": {}, 416 | "source": [ 417 | "### Build CI/CD container image for Cloud Build\n", 418 | "\n", 419 | "This is the runtime environment where the steps of testing and deploying the pipeline will be executed." 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "867e5ae1", 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "! 
echo $CICD_IMAGE_URI" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "40f497f6", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "! gcloud builds submit --tag $CICD_IMAGE_URI build/. --timeout=15m --machine-type=e2-highcpu-8" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "id": "4f6e2dd7", 445 | "metadata": {}, 446 | "source": [ 447 | "### Run CI/CD from pipeline deployment using Cloud Build" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "id": "117895d7", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "REPO_URL = 'https://github.com/ksalama/ucaip-labs.git' # Change to your github repo.\n", 458 | "BRANCH = 'main'\n", 459 | "\n", 460 | "GCS_LOCATION = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/'\n", 461 | "TEST_GCS_LOCATION = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/e2e_tests'\n", 462 | "CI_TRAIN_LIMIT = 1000\n", 463 | "CI_TEST_LIMIT = 100\n", 464 | "CI_UPLOAD_MODEL = 0\n", 465 | "CI_ACCURACY_THRESHOLD = 0.1\n", 466 | "BEAM_RUNNER = 'DataflowRunner'\n", 467 | "TRAINING_RUNNER = 'vertex'\n", 468 | "VERSION = 'tfx-0-30'\n", 469 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", 470 | "PIPELINES_STORE = os.path.join(GCS_LOCATION, 'compiled_pipelines')\n", 471 | "\n", 472 | "TFX_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'\n", 473 | "\n", 474 | "SUBSTITUTIONS=f'''\\\n", 475 | "_REPO_URL='{REPO_URL}',\\\n", 476 | "_BRANCH={BRANCH},\\\n", 477 | "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", 478 | "_PROJECT_ID={PROJECT_ID},\\\n", 479 | "_REGION={REGION},\\\n", 480 | "_GCS_LOCATION={GCS_LOCATION},\\\n", 481 | "_TEST_GCS_LOCATION={TEST_GCS_LOCATION},\\\n", 482 | "_BQ_LOCATION={BQ_LOCATION},\\\n", 483 | "_BQ_DATASET_NAME={BQ_DATASET_NAME},\\\n", 484 | "_BQ_TABLE_NAME={BQ_TABLE_NAME},\\\n", 485 | "_DATASET_DISPLAY_NAME={DATASET_DISPLAY_NAME},\\\n", 486 | "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", 487 | "_CI_TRAIN_LIMIT={CI_TRAIN_LIMIT},\\\n", 488 | "_CI_TEST_LIMIT={CI_TEST_LIMIT},\\\n", 489 | "_CI_UPLOAD_MODEL={CI_UPLOAD_MODEL},\\\n", 490 | "_CI_ACCURACY_THRESHOLD={CI_ACCURACY_THRESHOLD},\\\n", 491 | "_BEAM_RUNNER={BEAM_RUNNER},\\\n", 492 | "_TRAINING_RUNNER={TRAINING_RUNNER},\\\n", 493 | "_TFX_IMAGE_URI={TFX_IMAGE_URI},\\\n", 494 | "_PIPELINE_NAME={PIPELINE_NAME},\\\n", 495 | "_PIPELINES_STORE={PIPELINES_STORE}\\\n", 496 | "'''\n", 497 | "\n", 498 | "!echo $SUBSTITUTIONS" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "id": "b54081db", 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "!gcloud builds submit --no-source --timeout=60m --config build/pipeline-deployment.yaml --substitutions {SUBSTITUTIONS} --machine-type=e2-highcpu-8" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "id": "72d9baf5", 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [] 518 | } 519 | ], 520 | "metadata": { 521 | "environment": { 522 | "name": "common-cpu.m73", 523 | "type": "gcloud", 524 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 525 | }, 526 | "kernelspec": { 527 | "display_name": "Python 3", 528 | "language": "python", 529 | "name": "python3" 530 | }, 531 | "language_info": { 532 | "codemirror_mode": { 533 | "name": "ipython", 534 | "version": 3 535 | }, 536 | "file_extension": ".py", 537 | "mimetype": "text/x-python", 538 | "name": "python", 539 | "nbconvert_exporter": "python", 540 | "pygments_lexer": "ipython3", 541 | 
"version": "3.7.10" 542 | } 543 | }, 544 | "nbformat": 4, 545 | "nbformat_minor": 5 546 | } 547 | -------------------------------------------------------------------------------- /05-continuous-training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "26667428", 6 | "metadata": {}, 7 | "source": [ 8 | "# 05 - Continuous training\n", 9 | "\n", 10 | "After testing, compiling, and uploading the pipeline definition to Cloud Storage, the pipeline is executed with respect to a trigger. `Cloud Functions` and `Cloud Pub/Sub` are used in this notebook as a triggering mechanism. The triggering can be scheduled using `Cloud Scheduler`. The trigger source sends a message to a Cloud Pub/Sub topic that the Cloud Function listens to, and then it submits the pipeline to `Vertex Pipelines` to be executed.\n", 11 | "\n", 12 | "This notebook covers the following steps:\n", 13 | "1. Create the `Cloud Pub/Sub` topic.\n", 14 | "2. Deploy the `Cloud Function` \n", 15 | "3. Test triggering a pipeline.\n", 16 | "4. Extracting pipeline run metadata.\n", 17 | "\n", 18 | "Learn about [Cloud Functions](https://cloud.google.com/functions).\n", 19 | "Learn about [Cloud Pub/Sub](https://cloud.google.com/pubsub).\n", 20 | "Learn about [Cloud Scheduler](https://cloud.google.com/scheduler)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "45edf109", 26 | "metadata": {}, 27 | "source": [ 28 | "## Setup" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "d44dcc22", 34 | "metadata": {}, 35 | "source": [ 36 | "### Import libraries" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "8fa8c2ff", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import json\n", 47 | "import os\n", 48 | "import logging\n", 49 | "import tensorflow as tf\n", 50 | "import tfx\n", 51 | "import IPython \n", 52 | "\n", 53 | "logging.getLogger().setLevel(logging.INFO)\n", 54 | "\n", 55 | "print('TFX:', tfx.__version__)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "a82072dc", 61 | "metadata": {}, 62 | "source": [ 63 | "### Setup Google Cloud project" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "04c5843a", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 74 | "REGION = 'us-central1' # Change to your region.\n", 75 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 76 | "\n", 77 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 78 | " # Get your GCP project id from gcloud\n", 79 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 80 | " PROJECT_ID = shell_output[0]\n", 81 | " \n", 82 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n", 83 | " # Set your bucket name using your GCP project id\n", 84 | " BUCKET = PROJECT_ID\n", 85 | " # Try to create the bucket if it doesn'exists\n", 86 | " ! 
gsutil mb -l $REGION gs://$BUCKET\n", 87 | " print('')\n", 88 | "\n", 89 | "print('Project ID:', PROJECT_ID)\n", 90 | "print('Region:', REGION)\n", 91 | "print('Bucket name:', BUCKET)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "ae7570c7", 97 | "metadata": {}, 98 | "source": [ 99 | "### Set configurations" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "99e362bb", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "VERSION = 'v1'\n", 110 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 111 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 112 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-train-pipeline'\n", 113 | "\n", 114 | "PIPELINES_STORE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/compiled_pipelines/'\n", 115 | "GCS_PIPELINE_FILE_LOCATION = os.path.join(PIPELINES_STORE, f'{PIPELINE_NAME}.json')\n", 116 | "PUBSUB_TOPIC = f'trigger-{PIPELINE_NAME}'\n", 117 | "CLOUD_FUNCTION_NAME = f'trigger-{PIPELINE_NAME}-fn'" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "fc916c87", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "!gsutil ls {GCS_PIPELINE_FILE_LOCATION}" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "ed5321a4", 133 | "metadata": {}, 134 | "source": [ 135 | "## 1. Create a Pub/Sub topic" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "1f36582f", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "! gcloud pubsub topics create {PUBSUB_TOPIC}" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "id": "f090676b", 151 | "metadata": {}, 152 | "source": [ 153 | "## 2. Deploy the Cloud Function" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "48858f15", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "ENV_VARS=f'''\\\n", 164 | "PROJECT={PROJECT_ID},\\\n", 165 | "REGION={REGION},\\\n", 166 | "GCS_PIPELINE_FILE_LOCATION={GCS_PIPELINE_FILE_LOCATION}\n", 167 | "'''\n", 168 | "\n", 169 | "! echo {ENV_VARS}" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "a78831f0", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "! rm -r src/pipeline_triggering/.ipynb_checkpoints" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "id": "dfd65f0d", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "! gcloud functions deploy {CLOUD_FUNCTION_NAME} \\\n", 190 | " --region={REGION} \\\n", 191 | " --trigger-topic={PUBSUB_TOPIC} \\\n", 192 | " --runtime=python37 \\\n", 193 | " --source=src/pipeline_triggering\\\n", 194 | " --entry-point=trigger_pipeline\\\n", 195 | " --stage-bucket={BUCKET}\\\n", 196 | " --update-env-vars={ENV_VARS}" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "id": "4f632321", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "cloud_fn_url = f'https://console.cloud.google.com/functions/details/{REGION}/{CLOUD_FUNCTION_NAME}'\n", 207 | "html = f'See the Cloud Function details here.'\n", 208 | "IPython.display.display(IPython.display.HTML(html))" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "id": "9c00b9a6", 214 | "metadata": {}, 215 | "source": [ 216 | "## 3. 
Trigger the pipeline" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "0cf3abbc", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "from google.cloud import pubsub\n", 227 | "\n", 228 | "publish_client = pubsub.PublisherClient()\n", 229 | "topic = f'projects/{PROJECT_ID}/topics/{PUBSUB_TOPIC}'\n", 230 | "data = {\n", 231 | " 'num_epochs': 7,\n", 232 | " 'learning_rate': 0.0015,\n", 233 | " 'batch_size': 512,\n", 234 | " 'hidden_units': '256,126'\n", 235 | "}\n", 236 | "message = json.dumps(data)\n", 237 | "\n", 238 | "_ = publish_client.publish(topic, message.encode())" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "id": "c536f29d", 244 | "metadata": {}, 245 | "source": [ 246 | "Wait for a few seconds for the pipeline run to be submitted, then you can see the run in the Cloud Console" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "id": "887538b6", 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "from kfp.v2.google.client import AIPlatformClient\n", 257 | "\n", 258 | "pipeline_client = AIPlatformClient(\n", 259 | " project_id=PROJECT_ID, region=REGION)\n", 260 | " \n", 261 | "job_display_name = pipeline_client.list_jobs()['pipelineJobs'][0]['displayName']\n", 262 | "job_url = f'https://console.cloud.google.com/vertex-ai/locations/{REGION}/pipelines/runs/{job_display_name}'\n", 263 | "html = f'See the Pipeline job here.'\n", 264 | "IPython.display.display(IPython.display.HTML(html))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "159a66d2", 270 | "metadata": {}, 271 | "source": [ 272 | "## 4. Extracting pipeline runs metadata" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "id": "affe56fc", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "from google.cloud import aiplatform as vertex_ai\n", 283 | "\n", 284 | "pipeline_df = vertex_ai.get_pipeline_df(PIPELINE_NAME)\n", 285 | "pipeline_df = pipeline_df[pipeline_df.pipeline_name == PIPELINE_NAME]\n", 286 | "pipeline_df.T" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "id": "04ba49ed", 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [] 296 | } 297 | ], 298 | "metadata": { 299 | "environment": { 300 | "name": "common-cpu.m73", 301 | "type": "gcloud", 302 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 303 | }, 304 | "kernelspec": { 305 | "display_name": "Python 3", 306 | "language": "python", 307 | "name": "python3" 308 | }, 309 | "language_info": { 310 | "codemirror_mode": { 311 | "name": "ipython", 312 | "version": 3 313 | }, 314 | "file_extension": ".py", 315 | "mimetype": "text/x-python", 316 | "name": "python", 317 | "nbconvert_exporter": "python", 318 | "pygments_lexer": "ipython3", 319 | "version": "3.7.10" 320 | } 321 | }, 322 | "nbformat": 4, 323 | "nbformat_minor": 5 324 | } 325 | -------------------------------------------------------------------------------- /06-model-deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "e80f441f", 6 | "metadata": {}, 7 | "source": [ 8 | "# 06 - Model deployment\n", 9 | "\n", 10 | "The purpose of this notebook is to execute a CI/CD routine to test and deploy the trained `Vertex Model` resource to a `Vertex Endpoint` resource for online prediction serving. 
The notebook covers the following steps:\n", 11 | "\n", 12 | "1. Run the test steps locally.\n", 13 | "2. Execute the model deployment CI/CD steps using `Cloud Build`.\n", 14 | "\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "0db03d3d", 20 | "metadata": {}, 21 | "source": [ 22 | "## Setup" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "3cd5f896", 28 | "metadata": {}, 29 | "source": [ 30 | "### Import libraries" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "c98cf8cb", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "import logging\n", 42 | "\n", 43 | "logging.getLogger().setLevel(logging.INFO)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "faf0de35", 49 | "metadata": {}, 50 | "source": [ 51 | "### Setup Google Cloud project" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "8ab672e9", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 62 | "REGION = 'us-central1' # Change to your region.\n", 63 | "\n", 64 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 65 | " # Get your GCP project id from gcloud\n", 66 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 67 | " PROJECT_ID = shell_output[0]\n", 68 | "\n", 69 | "print('Project ID:', PROJECT_ID)\n", 70 | "print('Region:', REGION)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "2d1e359d", 76 | "metadata": {}, 77 | "source": [ 78 | "### Set configurations" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "25a1e19b", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "VERSION = 'v1'\n", 89 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 90 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 91 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n", 92 | "\n", 93 | "CICD_IMAGE_NAME = 'cicd:latest'\n", 94 | "CICD_IMAGE_URI = f'gcr.io/{PROJECT_ID}/{CICD_IMAGE_NAME}'" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "d27a65fd", 100 | "metadata": {}, 101 | "source": [ 102 | "## 1. Run CI/CD steps locally" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "daffa85a", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "os.environ['PROJECT'] = PROJECT_ID\n", 113 | "os.environ['REGION'] = REGION\n", 114 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 115 | "os.environ['ENDPOINT_DISPLAY_NAME'] = ENDPOINT_DISPLAY_NAME" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "189dde56", 121 | "metadata": {}, 122 | "source": [ 123 | "### Run the model artifact testing" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "20c8ce61", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "! py.test src/tests/model_deployment_tests.py::test_model_artifact -s" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "195e7cde", 139 | "metadata": {}, 140 | "source": [ 141 | "### Run create endpoint" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "bdaf0c28", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "! 
python build/utils.py \\\n", 152 | " --mode=create-endpoint\\\n", 153 | " --project={PROJECT_ID}\\\n", 154 | " --region={REGION}\\\n", 155 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "3e8022b5", 161 | "metadata": {}, 162 | "source": [ 163 | "### Run deploy model" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "bac7e8b3", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "! python build/utils.py \\\n", 174 | " --mode=deploy-model\\\n", 175 | " --project={PROJECT_ID}\\\n", 176 | " --region={REGION}\\\n", 177 | " --endpoint-display-name={ENDPOINT_DISPLAY_NAME}\\\n", 178 | " --model-display-name={MODEL_DISPLAY_NAME}" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "937178d9", 184 | "metadata": {}, 185 | "source": [ 186 | "### Run model endpoint testing" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "efa45c98", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# TODO {for Khalid, you need to update create an Endpoint resource when using a list. This is a known bug:}\n", 197 | "# AttributeError: 'Endpoint' object has no attribute '_prediction_client'\n", 198 | "! py.test src/tests/model_deployment_tests.py::test_model_endpoint" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "fbe44e56", 204 | "metadata": {}, 205 | "source": [ 206 | "## 2. Execute the model deployment CI/CD routine in `Cloud Build`\n", 207 | "\n", 208 | "The CI/CD routine is defined in the [model-deployment.yaml](model-deployment.yaml) file, and consists of the following steps:\n", 209 | "\n", 210 | "1. Load and test the the trained model interface.\n", 211 | "2. Create a `Vertex Endpoint` resource if it does not exist.\n", 212 | "3. Deploy the `Vertex Model` resource to the `Vertex Endpoint` resource.\n", 213 | "4. Test the `Vertex Endpoint` resource." 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "id": "05ef2d3d", 219 | "metadata": {}, 220 | "source": [ 221 | "### Build CI/CD container Image for `Cloud Build`\n", 222 | "\n", 223 | "This is the runtime environment where the steps of testing and deploying the model will be executed." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "59f00bcc", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "! echo $CICD_IMAGE_URI" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "76b7dae5", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "! gcloud builds submit --tag $CICD_IMAGE_URI build/. 
--timeout=15m" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "id": "88d91bd7", 249 | "metadata": {}, 250 | "source": [ 251 | "### Run CI/CD from model deployment using `Cloud Build`" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "6a8d9b05", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "REPO_URL = 'https://github.com/ksalama/ucaip-labs.git' # Change to your github repo.\n", 262 | "BRANCH = 'main'" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "ee76bd54", 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "SUBSTITUTIONS=f'''\\\n", 273 | "_REPO_URL='{REPO_URL}',\\\n", 274 | "_BRANCH={BRANCH},\\\n", 275 | "_CICD_IMAGE_URI={CICD_IMAGE_URI},\\\n", 276 | "_PROJECT={PROJECT_ID},\\\n", 277 | "_REGION={REGION},\\\n", 278 | "_MODEL_DISPLAY_NAME={MODEL_DISPLAY_NAME},\\\n", 279 | "_ENDPOINT_DISPLAY_NAME={ENDPOINT_DISPLAY_NAME},\\\n", 280 | "'''\n", 281 | "\n", 282 | "!echo $SUBSTITUTIONS" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "3f59114c", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "!gcloud builds submit --no-source --config build/model-deployment.yaml --substitutions {SUBSTITUTIONS} --timeout=30m" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "d62fc304", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [] 302 | } 303 | ], 304 | "metadata": { 305 | "environment": { 306 | "name": "common-cpu.m73", 307 | "type": "gcloud", 308 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 309 | }, 310 | "kernelspec": { 311 | "display_name": "Python 3", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.7.10" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 5 330 | } 331 | -------------------------------------------------------------------------------- /07-prediction-serving.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "afa25b6f", 6 | "metadata": {}, 7 | "source": [ 8 | "# 07 - Serving predictions\n", 9 | "\n", 10 | "The purpose of the notebook is to show how to use the deployed model for online and batch prediction.\n", 11 | "The notebook covers the following tasks:\n", 12 | "\n", 13 | "1. Test the `Endpoint` resource for online prediction.\n", 14 | "2. Use the custom model uploaded as a `Model` resource for batch prediciton.\n", 15 | "3. Run a the batch prediction pipeline using `Vertex Pipelines`." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "b2ff82c9", 21 | "metadata": {}, 22 | "source": [ 23 | "## Setup" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "c0d52e77", 29 | "metadata": {}, 30 | "source": [ 31 | "### Import libraries" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "116a19cf", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import os\n", 42 | "import time\n", 43 | "from datetime import datetime\n", 44 | "import tensorflow as tf\n", 45 | "\n", 46 | "from google.cloud import aiplatform as vertex_ai" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "c5a33868", 52 | "metadata": {}, 53 | "source": [ 54 | "### Setup Google Cloud project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "3c2c4d1c", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 65 | "REGION = 'us-central1' # Change to your region.\n", 66 | "BUCKET = '[your-bucket-name]' # Change to your bucket name.\n", 67 | "\n", 68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 69 | " # Get your GCP project id from gcloud\n", 70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 71 | " PROJECT_ID = shell_output[0]\n", 72 | " \n", 73 | "if BUCKET == '' or BUCKET is None or BUCKET == '[your-bucket-name]':\n", 74 | " # Set your bucket name using your GCP project id\n", 75 | " BUCKET = PROJECT_ID\n", 76 | " # Try to create the bucket if it doesn'exists\n", 77 | " ! gsutil mb -l $REGION gs://$BUCKET\n", 78 | " print('')\n", 79 | " \n", 80 | "print('Project ID:', PROJECT_ID)\n", 81 | "print('Region:', REGION)\n", 82 | "print('Bucket name:', BUCKET)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "29e1a653", 88 | "metadata": {}, 89 | "source": [ 90 | "### Set configurations" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "0019b2dd", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "VERSION = 'v1'\n", 101 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 102 | "MODEL_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier-{VERSION}'\n", 103 | "ENDPOINT_DISPLAY_NAME = f'{DATASET_DISPLAY_NAME}-classifier'\n", 104 | "\n", 105 | "SERVE_BQ_DATASET_NAME = 'playground_us' # Change to your serving BigQuery dataset name.\n", 106 | "SERVE_BQ_TABLE_NAME = 'chicago_taxitrips_prep' # Change to your serving BigQuery table name." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "385ed4c0", 112 | "metadata": {}, 113 | "source": [ 114 | "## 1. 
Making an online prediction\n" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "ac2520fc", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "vertex_ai.init(\n", 125 | " project=PROJECT_ID,\n", 126 | " location=REGION,\n", 127 | " staging_bucket=BUCKET\n", 128 | ")\n", 129 | "\n", 130 | "endpoint_name = vertex_ai.Endpoint.list(\n", 131 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n", 132 | " order_by='update_time')[-1].gca_resource.name\n", 133 | "\n", 134 | "endpoint = vertex_ai.Endpoint(endpoint_name)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "c5f4f8c8", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "test_instances = [ \n", 145 | " {\n", 146 | " 'dropoff_grid': ['POINT(-87.6 41.9)'],\n", 147 | " 'euclidean': [2064.2696],\n", 148 | " 'loc_cross': [''],\n", 149 | " 'payment_type': ['Credit Card'],\n", 150 | " 'pickup_grid': ['POINT(-87.6 41.9)'],\n", 151 | " 'trip_miles': [1.37],\n", 152 | " 'trip_day': [12],\n", 153 | " 'trip_hour': [16],\n", 154 | " 'trip_month': [2],\n", 155 | " 'trip_day_of_week': [4],\n", 156 | " 'trip_seconds': [555]\n", 157 | " }\n", 158 | "]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "6fe672df", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "predictions = endpoint.predict(test_instances).predictions\n", 169 | "\n", 170 | "for prediction in predictions:\n", 171 | " print(prediction)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "077f4225", 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# TODO {for Khalid, get error saying model does not support explanations}\n", 182 | "\n", 183 | "explanations = endpoint.explain(test_instances).explanations\n", 184 | "\n", 185 | "for explanation in explanations:\n", 186 | " print(explanation)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "b6140167", 192 | "metadata": {}, 193 | "source": [ 194 | "## 2. 
Make a batch prediction" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "37928e74", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "WORKSPACE = f'gs://{BUCKET}/{DATASET_DISPLAY_NAME}/'\n", 205 | "SERVING_DATA_DIR = os.path.join(WORKSPACE, 'serving_data')\n", 206 | "SERVING_INPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'input_data')\n", 207 | "SERVING_OUTPUT_DATA_DIR = os.path.join(SERVING_DATA_DIR, 'output_predictions')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "b83e0d39", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "if tf.io.gfile.exists(SERVING_DATA_DIR):\n", 218 | " print('Removing previous serving data...')\n", 219 | " tf.io.gfile.rmtree(SERVING_DATA_DIR)\n", 220 | "print('Creating serving data directory...')\n", 221 | "tf.io.gfile.mkdir(SERVING_DATA_DIR)\n", 222 | "print('Serving data directory is ready.')" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "id": "163326ce", 228 | "metadata": {}, 229 | "source": [ 230 | "### Extract serving data to Cloud Storage as JSONL" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "51bdefd3", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "\n", 241 | "from src.model_training import features as feature_info\n", 242 | "from src.preprocessing import etl\n", 243 | "from src.common import datasource_utils" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "e15508fb", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "LIMIT = 10000\n", 254 | "\n", 255 | "sql_query = datasource_utils.create_bq_source_query(\n", 256 | " dataset_display_name=DATASET_DISPLAY_NAME, \n", 257 | " missing=feature_info.MISSING_VALUES,\n", 258 | " limit=LIMIT\n", 259 | ")\n", 260 | "\n", 261 | "print(sql_query)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "id": "95ba6d5f", 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "args = {\n", 272 | " #'runner': 'DataflowRunner',\n", 273 | " 'sql_query': sql_query,\n", 274 | " 'exported_data_prefix': os.path.join(SERVING_INPUT_DATA_DIR, 'data-'),\n", 275 | " 'temporary_dir': os.path.join(WORKSPACE, 'tmp'),\n", 276 | " 'gcs_location': os.path.join(WORKSPACE, 'bq_tmp'),\n", 277 | " 'project': PROJECT_ID,\n", 278 | " 'region': REGION,\n", 279 | " 'setup_file': './setup.py'\n", 280 | "}" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "id": "c5414f24", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "tf.get_logger().setLevel('ERROR')\n", 291 | "\n", 292 | "print('Data extraction started...')\n", 293 | "etl.run_extract_pipeline(args)\n", 294 | "print('Data extraction completed.')" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "7411f2dc", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "! 
gsutil ls {SERVING_INPUT_DATA_DIR}" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "1660a44e", 310 | "metadata": {}, 311 | "source": [ 312 | "### Submit the batch prediction job" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "id": "8878a244", 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "model_name = vertex_ai.Model.list(\n", 323 | " filter=f'display_name={MODEL_DISPLAY_NAME}',\n", 324 | " order_by='update_time')[-1].gca_resource.name" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "id": "4f262efa", 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "job_resources = {\n", 335 | " 'machine_type': 'n1-standard-2',\n", 336 | " #'accelerator_count': 1,\n", 337 | " #'accelerator_type': 'NVIDIA_TESLA_T4'\n", 338 | " 'starting_replica_count': 1,\n", 339 | " 'max_replica_count': 10,\n", 340 | "}\n", 341 | "\n", 342 | "job_display_name = f'{MODEL_DISPLAY_NAME}-prediction-job-{datetime.now().strftime(\"%Y%m%d%H%M%S\")}'\n", 343 | "\n", 344 | "vertex_ai.BatchPredictionJob.create(\n", 345 | " job_display_name=job_display_name,\n", 346 | " model_name=model_name,\n", 347 | " gcs_source=SERVING_INPUT_DATA_DIR + '/*.jsonl',\n", 348 | " gcs_destination_prefix=SERVING_OUTPUT_DATA_DIR,\n", 349 | " instances_format='jsonl',\n", 350 | " predictions_format='jsonl',\n", 351 | " sync=True,\n", 352 | " **job_resources,\n", 353 | ")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "id": "6d638b6f", 359 | "metadata": {}, 360 | "source": [ 361 | "## 3. Run the batch prediction pipeline using `Vertex Pipelines`" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "id": "ee5be402", 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "WORKSPACE = f'{BUCKET}/{DATASET_DISPLAY_NAME}/'\n", 372 | "MLMD_SQLLITE = 'mlmd.sqllite'\n", 373 | "ARTIFACT_STORE = os.path.join(WORKSPACE, 'tfx_artifacts')\n", 374 | "PIPELINE_NAME = f'{MODEL_DISPLAY_NAME}-predict-pipeline'" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "f9b84c1e", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "os.environ['PROJECT'] = PROJECT_ID\n", 385 | "os.environ['REGION'] = REGION\n", 386 | "os.environ['MODEL_DISPLAY_NAME'] = MODEL_DISPLAY_NAME\n", 387 | "os.environ['PIPELINE_NAME'] = PIPELINE_NAME\n", 388 | "os.environ['ARTIFACT_STORE_URI'] = ARTIFACT_STORE\n", 389 | "os.environ['BATCH_PREDICTION_BQ_DATASET_NAME'] = SERVE_BQ_DATASET_NAME\n", 390 | "os.environ['BATCH_PREDICTION_BQ_TABLE_NAME'] = SERVE_BQ_TABLE_NAME\n", 391 | "os.environ['SERVE_LIMIT'] = '1000'\n", 392 | "os.environ['BEAM_RUNNER'] = 'DirectRunner'\n", 393 | "os.environ['TFX_IMAGE_URI'] = f'gcr.io/{PROJECT_ID}/{DATASET_DISPLAY_NAME}:{VERSION}'" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "id": "58681dfe", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "import importlib\n", 404 | "from src.tfx_pipelines import config\n", 405 | "importlib.reload(config)\n", 406 | "\n", 407 | "for key, value in config.__dict__.items():\n", 408 | " if key.isupper(): print(f'{key}: {value}')" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "id": "d06a4091", 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "from src.tfx_pipelines import runner\n", 419 | "\n", 420 | "pipeline_definition_file = f'{config.PIPELINE_NAME}.json'\n", 421 
| "pipeline_definition = runner.compile_prediction_pipeline(pipeline_definition_file)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "id": "b6ffceca", 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "from kfp.v2.google.client import AIPlatformClient\n", 432 | "\n", 433 | "pipeline_client = AIPlatformClient(\n", 434 | " project_id=PROJECT_ID, region=REGION)\n", 435 | " \n", 436 | "pipeline_client.create_run_from_job_spec(\n", 437 | " job_spec_path=pipeline_definition_file\n", 438 | ")" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "id": "dd2efb1b", 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [] 448 | } 449 | ], 450 | "metadata": { 451 | "environment": { 452 | "name": "common-cpu.m73", 453 | "type": "gcloud", 454 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 455 | }, 456 | "kernelspec": { 457 | "display_name": "Python 3", 458 | "language": "python", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 471 | "version": "3.7.10" 472 | } 473 | }, 474 | "nbformat": 4, 475 | "nbformat_minor": 5 476 | } 477 | -------------------------------------------------------------------------------- /08-model-monitoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "39366395", 6 | "metadata": {}, 7 | "source": [ 8 | "# 08 - Model monitoring\n", 9 | "\n", 10 | "This notebook covers configuring model monitoring jobs for skew and drift detection:\n", 11 | "\n", 12 | "1. Set skew and drift threshold.\n", 13 | "2. Create a monitoring job for all the models on a `Endpoint` resource.\n", 14 | "3. List the monitoring jobs.\n", 15 | "4. List artifacts produced by monitoring job.\n", 16 | "5. Pause and delete the monitoring job." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "d7e55542", 22 | "metadata": {}, 23 | "source": [ 24 | "## Setup" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "35292bad", 30 | "metadata": {}, 31 | "source": [ 32 | "### Import libraries" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "41ba6e75", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import copy\n", 43 | "from datetime import datetime\n", 44 | "\n", 45 | "from google.protobuf.duration_pb2 import Duration\n", 46 | "from google.cloud import aiplatform as vertex_ai\n", 47 | "from google.cloud import aiplatform_v1beta1 as vertex_ai_beta" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "5279e949", 53 | "metadata": {}, 54 | "source": [ 55 | "### Setup Google Cloud project" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "272491a9", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "PROJECT_ID = '[your-project-id]' # Change to your project id.\n", 66 | "REGION = 'us-central1' # Change to your region.\n", 67 | "\n", 68 | "if PROJECT_ID == '' or PROJECT_ID is None or PROJECT_ID == '[your-project-id]':\n", 69 | " # Get your GCP project id from gcloud\n", 70 | " shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null\n", 71 | " PROJECT_ID = shell_output[0]\n", 72 | "\n", 73 | "PARENT = f'projects/{PROJECT_ID}/locations/{REGION}'\n", 74 | "\n", 75 | "print('Project ID:', PROJECT_ID)\n", 76 | "print('Region:', REGION)\n", 77 | "print('Vertex API Parent URI:', PARENT)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "513388ee", 83 | "metadata": {}, 84 | "source": [ 85 | "### Set configurations" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "fb651770", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "DATASET_DISPLAY_NAME = 'chicago-taxi-tips'\n", 96 | "ENDPOINT_DISPLAY_NAME = 'chicago-taxi-tips-classifier'\n", 97 | "MONITORING_JOB_NAME = f'monitor-{ENDPOINT_DISPLAY_NAME}'\n", 98 | "NOTIFY_EMAILS = '[your-email-address]'\n", 99 | "\n", 100 | "LOG_SAMPLE_RATE = 0.8\n", 101 | "MONITOR_INTERVAL = 3600\n", 102 | "TARGET_FEATURE_NAME = 'tip_bin'" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "ac7cb17f", 108 | "metadata": {}, 109 | "source": [ 110 | "## Create a Job Service client" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "bb896762", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "job_client_beta = vertex_ai_beta.JobServiceClient(\n", 121 | " client_options={'api_endpoint': f'{REGION}-aiplatform.googleapis.com'}\n", 122 | ")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "63bcde67", 128 | "metadata": {}, 129 | "source": [ 130 | "## 1. 
Set the skew and drift thresholds" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "3252edaa", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "SKEW_THRESHOLDS = {\n", 141 | " 'trip_month': 0.3,\n", 142 | " 'trip_day': 0.3,\n", 143 | " 'trip_day_of_week': 0.3,\n", 144 | " 'trip_hour': 0.3,\n", 145 | " 'trip_seconds': 0.3,\n", 146 | " 'trip_miles': 0.3,\n", 147 | " 'payment_type': 0.3,\n", 148 | " 'pickup_grid': 0.3,\n", 149 | " 'dropoff_grid': 0.3,\n", 150 | " 'euclidean': 0.3,\n", 151 | " 'loc_cross': 0.3, \n", 152 | "}\n", 153 | "\n", 154 | "DIRFT_THRESHOLDS = {\n", 155 | " 'trip_month': 0.3,\n", 156 | " 'trip_day': 0.3,\n", 157 | " 'trip_day_of_week': 0.3,\n", 158 | " 'trip_hour': 0.3,\n", 159 | " 'trip_seconds': 0.3,\n", 160 | " 'trip_miles': 0.3,\n", 161 | " 'payment_type': 0.3,\n", 162 | " 'pickup_grid': 0.3,\n", 163 | " 'dropoff_grid': 0.3,\n", 164 | " 'euclidean': 0.3,\n", 165 | " 'loc_cross': 0.3, \n", 166 | "}" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "adc333a3", 172 | "metadata": {}, 173 | "source": [ 174 | "## 2. Create a monitoring job" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "a40d14cb", 180 | "metadata": {}, 181 | "source": [ 182 | "### Retrieve the `Dataset`, `Model` and `Endpoint` resources to monitor" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "ed60fbff", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "dataset = vertex_ai.TabularDataset.list(\n", 193 | " filter=f'display_name={DATASET_DISPLAY_NAME}', \n", 194 | " order_by='update_time')[-1]\n", 195 | "\n", 196 | "bq_source_uri = dataset.gca_resource.metadata['inputConfig']['bigquerySource']['uri']\n", 197 | " \n", 198 | "endpoint = vertex_ai.Endpoint.list(\n", 199 | " filter=f'display_name={ENDPOINT_DISPLAY_NAME}', \n", 200 | " order_by='update_time')[-1]\n", 201 | "\n", 202 | "endpoint_uri = endpoint.gca_resource.name\n", 203 | "\n", 204 | "model_ids = [model.id for model in endpoint.list_models()]" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "0b159368", 210 | "metadata": {}, 211 | "source": [ 212 | "### Configure the monitoring job" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "0370cb58", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "skew_thresholds = {\n", 223 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n", 224 | " for feature, value in SKEW_THRESHOLDS.items()\n", 225 | "}\n", 226 | "\n", 227 | "drift_thresholds = {\n", 228 | " feature: vertex_ai_beta.ThresholdConfig(value=float(value))\n", 229 | " for feature, value in DIRFT_THRESHOLDS.items()\n", 230 | "}\n", 231 | "\n", 232 | "skew_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(\n", 233 | " skew_thresholds=skew_thresholds\n", 234 | ")\n", 235 | "\n", 236 | "drift_config = vertex_ai_beta.ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(\n", 237 | " drift_thresholds=drift_thresholds\n", 238 | ")\n", 239 | "\n", 240 | "sampling_config = vertex_ai_beta.SamplingStrategy(\n", 241 | " random_sample_config=vertex_ai_beta.SamplingStrategy.RandomSampleConfig(\n", 242 | " sample_rate=LOG_SAMPLE_RATE\n", 243 | " )\n", 244 | ")\n", 245 | "\n", 246 | "schedule_config = vertex_ai_beta.ModelDeploymentMonitoringScheduleConfig(\n", 247 | " monitor_interval=Duration(seconds=MONITOR_INTERVAL)\n", 248 | ")\n", 249 | "\n", 250 
| "training_dataset = vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingDataset(\n", 251 | " target_field=TARGET_FEATURE_NAME,\n", 252 | " bigquery_source = vertex_ai_beta.types.io.BigQuerySource(\n", 253 | " input_uri=bq_source_uri\n", 254 | " )\n", 255 | ")\n", 256 | "\n", 257 | "\n", 258 | "objective_template = vertex_ai_beta.ModelDeploymentMonitoringObjectiveConfig(\n", 259 | " objective_config=vertex_ai_beta.ModelMonitoringObjectiveConfig(\n", 260 | " training_dataset=training_dataset,\n", 261 | " training_prediction_skew_detection_config=skew_config,\n", 262 | " prediction_drift_detection_config=drift_config,\n", 263 | " )\n", 264 | ")\n", 265 | "\n", 266 | "deployment_objective_configs = []\n", 267 | "for model_id in model_ids:\n", 268 | " objective_config = copy.deepcopy(objective_template)\n", 269 | " objective_config.deployed_model_id = model_id\n", 270 | " deployment_objective_configs.append(objective_config)\n", 271 | "\n", 272 | "alerting_config = vertex_ai_beta.ModelMonitoringAlertConfig(\n", 273 | " email_alert_config=vertex_ai_beta.ModelMonitoringAlertConfig.EmailAlertConfig(\n", 274 | " user_emails=NOTIFY_EMAILS\n", 275 | " )\n", 276 | ")\n" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "id": "f4b667db", 282 | "metadata": {}, 283 | "source": [ 284 | "### Instantiate a monitoring job" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "id": "5e4e0c9d", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "job = vertex_ai_beta.ModelDeploymentMonitoringJob(\n", 295 | " display_name=MONITORING_JOB_NAME,\n", 296 | " endpoint=endpoint_uri,\n", 297 | " model_deployment_monitoring_objective_configs=deployment_objective_configs,\n", 298 | " logging_sampling_strategy=sampling_config,\n", 299 | " model_deployment_monitoring_schedule_config=schedule_config,\n", 300 | " model_monitoring_alert_config=alerting_config,\n", 301 | ")" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "id": "4a66d1d5", 307 | "metadata": {}, 308 | "source": [ 309 | "### Submit the monitoring job for execution" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "id": "4a0e41b9", 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "response = job_client_beta.create_model_deployment_monitoring_job(\n", 320 | " parent=PARENT, model_deployment_monitoring_job=job\n", 321 | ")\n", 322 | "response" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "39352387", 328 | "metadata": {}, 329 | "source": [ 330 | "## 3. Get the monitoring job" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "bc47ef29", 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "monitoring_jobs = job_client_beta.list_model_deployment_monitoring_jobs(parent=PARENT)\n", 341 | "monitoring_job = [entry for entry in monitoring_jobs if entry.display_name == MONITORING_JOB_NAME][0]\n", 342 | "monitoring_job" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "id": "9dbb50ce", 348 | "metadata": {}, 349 | "source": [ 350 | "## 5. 
Pause the monitoring job" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "id": "cd6d295e", 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "job_client_beta.pause_model_deployment_monitoring_job(name=monitoring_job)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": "37663c7e", 366 | "metadata": {}, 367 | "source": [ 368 | "## Delete the monitoring job" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "c3be1189", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "job_client_beta.delete_model_deployment_monitoring_job(name=monitoring_job)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "28159818", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [] 388 | } 389 | ], 390 | "metadata": { 391 | "environment": { 392 | "name": "common-cpu.m73", 393 | "type": "gcloud", 394 | "uri": "gcr.io/deeplearning-platform-release/base-cpu:m73" 395 | }, 396 | "kernelspec": { 397 | "display_name": "Python 3", 398 | "language": "python", 399 | "name": "python3" 400 | }, 401 | "language_info": { 402 | "codemirror_mode": { 403 | "name": "ipython", 404 | "version": 3 405 | }, 406 | "file_extension": ".py", 407 | "mimetype": "text/x-python", 408 | "name": "python", 409 | "nbconvert_exporter": "python", 410 | "pygments_lexer": "ipython3", 411 | "version": "3.7.10" 412 | } 413 | }, 414 | "nbformat": 4, 415 | "nbformat_minor": 5 416 | } 417 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/tfx-oss-public/tfx:0.30.0 2 | 3 | COPY requirements.txt requirements.txt 4 | 5 | RUN pip install -r requirements.txt 6 | 7 | COPY src/ src/ 8 | 9 | ENV PYTHONPATH="/pipeline:${PYTHONPATH}" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PLEASE USE THIS REPO INSTEAD: https://github.com/GoogleCloudPlatform/mlops-with-vertex-ai 2 | 3 | 4 | # MLOps on Vertex AI 5 | 6 | This example implements the end-to-end [MLOps process](https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf) using the [Vertex AI](https://cloud.google.com/vertex-ai) platform and [Smart Analytics](https://cloud.google.com/solutions/smart-analytics) technology capabilities. The example uses [Keras](https://keras.io/) to implement the ML model, [TFX](https://www.tensorflow.org/tfx) to implement the training pipeline, and the [Model Builder SDK](https://github.com/googleapis/python-aiplatform/tree/569d4cd03e888fde0171f7b0060695a14f99b072/google/cloud/aiplatform) to interact with Vertex AI. 7 | 8 | 9 | MLOps lifecycle 10 | 11 | 12 | ## Getting started 13 | 14 | 1. [Set up the MLOps environment](provision) on Google Cloud. 15 | 2. Start your AI Notebook instance. 16 | 3. Open JupyterLab, then open a new Terminal. 17 | 4. Clone the repository to your AI Notebook instance: 18 | ``` 19 | git clone https://github.com/ksalama/ucaip-labs.git 20 | cd ucaip-labs 21 | ``` 22 | 5. Install the required Python packages: 23 | ``` 24 | pip install tfx==0.30.0 --user 25 | pip install -r requirements.txt --user 26 | ``` 27 | 6. Upgrade the `gcloud` components: 28 | ``` 29 | sudo apt-get install google-cloud-sdk 30 | gcloud components update 31 | ``` 32 | 33 | ## Dataset Management 34 | 35 | The [Chicago Taxi Trips](https://console.cloud.google.com/marketplace/details/city-of-chicago-public-data/chicago-taxi-trips) dataset is one of the [public datasets hosted with BigQuery](https://cloud.google.com/bigquery/public-data/), which includes taxi trips from 2013 to the present, reported to the City of Chicago in its role as a regulatory agency. The task is to predict whether a given trip will result in a tip > 20%. 36 | 37 | The [01-dataset-management](01-dataset-management.ipynb) notebook covers: 38 | 39 | 1. Performing exploratory data analysis on the data in `BigQuery`. 40 | 2. Creating a `Vertex AI` Dataset resource using the Python SDK. 41 | 3. Generating the schema for the raw data using [TensorFlow Data Validation](https://www.tensorflow.org/tfx/guide/tfdv). 42 | 43 | 44 | ## ML Development 45 | 46 | We experiment with creating a [Custom Model](https://cloud.google.com/ai-platform-unified/docs/training/create-model-custom-training) using the [02-experimentation](02-experimentation.ipynb) notebook, which covers: 47 | 48 | 1. Preparing the data using `Dataflow`. 49 | 2. Implementing a `Keras` classification model. 50 | 3. Training the `Keras` model with `Vertex AI` using a [pre-built container](https://cloud.google.com/ai-platform-unified/docs/training/pre-built-containers). 51 | 4. Uploading the exported model from `Cloud Storage` to `Vertex AI`. 52 | 5. Extracting and visualizing experiment parameters from [Vertex AI Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction). 53 | 54 | We use [Vertex TensorBoard](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview) 55 | and [Vertex ML Metadata](https://cloud.google.com/vertex-ai/docs/ml-metadata/introduction) to track, visualize, and compare ML experiments. 56 | 57 | In addition, the training steps are formalized by implementing a [TFX pipeline](https://www.tensorflow.org/tfx). 
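As a rough illustration of that formalization, a single `TFX` component can be exercised interactively before it is wired into the full pipeline under [src/tfx_pipelines](src/tfx_pipelines). The snippet below is a simplified sketch rather than code taken from the notebooks: the BigQuery query is a placeholder (the real one is built by [src/common/datasource_utils.py](src/common/datasource_utils.py)), and import paths may differ slightly between TFX releases.

```python
# Minimal interactive sketch (assumed wiring, not the project's actual pipeline code).
from tfx.extensions.google_cloud_big_query.example_gen.component import BigQueryExampleGen
from tfx.components import StatisticsGen
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

# Keeps artifacts and ML Metadata in a temporary local workspace.
context = InteractiveContext()

# Ingest examples from BigQuery; running this also needs Beam args for the GCP project and a temp GCS location.
example_gen = BigQueryExampleGen(query="SELECT * FROM `my-project.my_dataset.my_table` LIMIT 1000")
context.run(example_gen)

# Compute descriptive statistics over the ingested examples.
statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"])
context.run(statistics_gen)
```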
58 | The [03-training-formalization](03-training-formalization.ipynb) notebook covers implementing and testing the pipeline components interactively. 59 | 60 | ## Training Operationalization 61 | 62 | The [04-pipeline-deployment](04-pipeline-deployment.ipynb) notebook covers executing the CI/CD steps for the training pipeline deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [build/pipeline-deployment.yaml](build/pipeline-deployment.yaml) file, and consists of the following steps: 63 | 64 | 1. Clone the repository to the build environment. 65 | 2. Run unit tests. 66 | 3. Run a local e2e test of the `TFX` pipeline. 67 | 4. Build the ML container image for pipeline steps. 68 | 5. Compile the pipeline. 69 | 6. Upload the pipeline to `Cloud Storage`. 70 | 71 | ## Continuous Training 72 | 73 | After testing, compiling, and uploading the pipeline definition to `Cloud Storage`, the pipeline is executed in response to a trigger. 74 | We use [Cloud Functions](https://cloud.google.com/functions) and [Cloud Pub/Sub](https://cloud.google.com/pubsub) as the triggering mechanism. 75 | The `Cloud Function` listens to the `Pub/Sub` topic and runs the training pipeline when a message is sent to the topic. 76 | The `Cloud Function` is implemented in [src/pipeline_triggering](src/pipeline_triggering). 77 | 78 | The [05-continuous-training](05-continuous-training.ipynb) notebook covers: 79 | 80 | 1. Creating a Cloud `Pub/Sub` topic. 81 | 2. Deploying a `Cloud Function`. 82 | 3. Triggering the pipeline. 83 | 84 | The end-to-end TFX training pipeline implementation is in the [src/tfx_pipelines](src/tfx_pipelines) directory, which covers the following steps: 85 | 86 | 1. Receive hyperparameters using the `hyperparam_gen` custom Python component. 87 | 2. Extract data from `BigQuery` using the `BigQueryExampleGen` component. 88 | 3. Validate the raw data using the `StatisticsGen` and `ExampleValidator` components. 89 | 4. Process the data on `Dataflow` using the `Transform` component. 90 | 5. Train a custom model with `Vertex AI` using the `Trainer` component. 91 | 6. Evaluate and validate the custom model using the `ModelEvaluator` component. 92 | 7. Save the blessed model to the model registry location in `Cloud Storage` using the `Pusher` component. 93 | 8. Upload the model to `Vertex AI` using the `vertex_model_pusher` custom Python component. 94 | 95 | 96 | ## Model Deployment 97 | 98 | The [06-model-deployment](06-model-deployment.ipynb) notebook covers executing the CI/CD steps for the model deployment using [Cloud Build](https://cloud.google.com/build/docs/overview). The CI/CD routine is defined in the [build/model-deployment.yaml](build/model-deployment.yaml) 99 | file, and consists of the following steps: 100 | 101 | 1. Test the model interface. 102 | 2. Create an endpoint in `Vertex AI`. 103 | 3. Deploy the model to the `endpoint`. 104 | 4. Test the `Vertex AI` endpoint. 105 | 106 | ## Prediction Serving 107 | 108 | We serve the deployed model for prediction. 109 | The [07-prediction-serving](07-prediction-serving.ipynb) notebook covers: 110 | 111 | 1. Use the `Vertex AI` endpoint for online prediction. 112 | 2. Use the `Vertex AI` uploaded model for batch prediction. 113 | 3. Run the batch prediction using `Vertex Pipelines`. 114 | 115 | ## Model Monitoring 116 | 117 | After a model is deployed for prediction serving, continuous monitoring is set up to ensure that the model continues to perform as expected. 
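For orientation, the core of what that monitoring setup does with the Vertex AI SDK looks roughly like the following. This is a condensed sketch of the calls shown in `08-model-monitoring.ipynb` above, not a drop-in script: it assumes `vertex_ai_beta` refers to the `google.cloud.aiplatform_v1beta1` module and that a `JobServiceClient` is created for the project's region, while the endpoint name, deployed model IDs, feature list, and 0.3 threshold are illustrative placeholders.

```python
from google.cloud import aiplatform_v1beta1 as vertex_ai_beta

PROJECT, REGION = "my-project", "us-central1"       # placeholders
PARENT = f"projects/{PROJECT}/locations/{REGION}"
endpoint_uri = f"{PARENT}/endpoints/1234567890"     # endpoint to monitor (placeholder ID)
model_ids = ["9876543210"]                          # deployed model IDs on that endpoint (placeholders)

# One threshold per monitored feature: alert when the skew/drift statistic exceeds 0.3.
thresholds = {f: vertex_ai_beta.ThresholdConfig(value=0.3) for f in ["trip_miles", "trip_seconds"]}

objective_config = vertex_ai_beta.ModelMonitoringObjectiveConfig(
    training_prediction_skew_detection_config=vertex_ai_beta.ModelMonitoringObjectiveConfig.TrainingPredictionSkewDetectionConfig(
        skew_thresholds=thresholds
    ),
    prediction_drift_detection_config=vertex_ai_beta.ModelMonitoringObjectiveConfig.PredictionDriftDetectionConfig(
        drift_thresholds=thresholds
    ),
)

# The notebook also attaches the BigQuery training dataset, a logging sampling rate,
# a monitoring schedule, and e-mail alerting; they are omitted here for brevity.
job = vertex_ai_beta.ModelDeploymentMonitoringJob(
    display_name="chicago-taxi-monitoring",
    endpoint=endpoint_uri,
    model_deployment_monitoring_objective_configs=[
        vertex_ai_beta.ModelDeploymentMonitoringObjectiveConfig(
            deployed_model_id=model_id, objective_config=objective_config
        )
        for model_id in model_ids
    ],
)

job_client = vertex_ai_beta.JobServiceClient(
    client_options={"api_endpoint": f"{REGION}-aiplatform.googleapis.com"}
)
job_client.create_model_deployment_monitoring_job(parent=PARENT, model_deployment_monitoring_job=job)
```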
118 | The [08-model-monitoring](08-model-monitoring.ipynb) notebook covers configuring [Vertex AI Model Monitoring](https://cloud.google.com/vertex-ai/docs/model-monitoring/overview) for skew and drift detection: 119 | 120 | 1. Set the skew and drift thresholds. 121 | 2. Create a monitoring job for all the models under an endpoint. 122 | 3. List the monitoring jobs. 123 | 4. List the artifacts produced by the monitoring job. 124 | 5. Pause and delete the monitoring job. 125 | 126 | 127 | ## Metadata Tracking 128 | 129 | You can view the parameters and metrics logged by your experiments, as well as the artifacts and metadata stored by 130 | your `Vertex Pipelines` in [Cloud Console](https://console.cloud.google.com/vertex-ai/metadata). 131 | 132 | ## Disclaimer 133 | 134 | This is not an official Google product but sample code provided for educational purposes. 135 | 136 | --- 137 | 138 | Copyright 2021 Google LLC. 139 | 140 | Licensed under the Apache License, Version 2.0 (the "License"); 141 | you may not use this file except in compliance with the License. 142 | You may obtain a copy of the License at: http://www.apache.org/licenses/LICENSE-2.0 143 | 144 | Unless required by applicable law or agreed to in writing, software 145 | distributed under the License is distributed on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 147 | See the License for the specific language governing permissions and 148 | limitations under the License. 149 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/tfx-oss-public/tfx:0.30.0 2 | 3 | RUN pip install -U pip 4 | RUN pip install google-cloud-aiplatform==1.1.1 google-cloud-aiplatform[tensorboard] 5 | RUN pip install pytest kfp==1.6.2 google-cloud-bigquery==2.20.0 google-cloud-bigquery-storage==2.4.0 -------------------------------------------------------------------------------- /build/model-deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ###################################################################### 16 | # CI/CD steps for Cloud Build to test and deploy a model to Vertex AI. 17 | ###################################################################### 18 | 19 | steps: 20 | 21 | # Clone the repository. 22 | - name: 'gcr.io/cloud-builders/git' 23 | args: ['clone', '--single-branch', '--branch', 24 | '$_BRANCH', '$_REPO_URL', 25 | '--depth', '1', 26 | '--verbose'] 27 | id: 'Clone Repository' 28 | 29 | # Test uploaded model artifact. 
30 | - name: '$_CICD_IMAGE_URI' 31 | entrypoint: 'pytest' 32 | args: ['src/tests/model_deployment_tests.py::test_model_artifact'] 33 | dir: 'ucaip-labs' 34 | env: 35 | - 'PROJECT=$_PROJECT' 36 | - 'REGION=$_REGION' 37 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 38 | id: 'Test Model Artifact' 39 | waitFor: ['Clone Repository'] 40 | 41 | # Create an endpoint. 42 | - name: '$_CICD_IMAGE_URI' 43 | entrypoint: 'python' 44 | args: ['build/utils.py', 45 | '--mode', 'create-endpoint', 46 | '--project', '$_PROJECT', 47 | '--region', '$_REGION', 48 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME'] 49 | dir: 'ucaip-labs' 50 | id: 'Create Endpoint' 51 | waitFor: ['Test Model Artifact'] 52 | 53 | # Deploy the model. 54 | - name: '$_CICD_IMAGE_URI' 55 | entrypoint: 'python' 56 | args: ['build/utils.py', 57 | '--mode', 'deploy-model', 58 | '--project', '$_PROJECT', 59 | '--region', '$_REGION', 60 | '--endpoint-display-name', '$_ENDPOINT_DISPLAY_NAME', 61 | '--model-display-name', '$_MODEL_DISPLAY_NAME' 62 | ] 63 | dir: 'ucaip-labs' 64 | id: 'Deploy Model' 65 | waitFor: ['Create Endpoint'] 66 | 67 | # Test deployed model endpoint. 68 | - name: '$_CICD_IMAGE_URI' 69 | entrypoint: 'pytest' 70 | args: ['src/tests/model_deployment_tests.py::test_model_endpoint'] 71 | dir: 'ucaip-labs' 72 | env: 73 | - 'PROJECT=$_PROJECT' 74 | - 'REGION=$_REGION' 75 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 76 | - 'ENDPOINT_DISPLAY_NAME=$_ENDPOINT_DISPLAY_NAME' 77 | id: 'Test Model Endpoint' 78 | waitFor: ['Deploy Model'] -------------------------------------------------------------------------------- /build/pipeline-deployment.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ############################################################################# 16 | # CI/CD steps for Cloud Build to test and deploy a TFX pipeline to Vertex AI. 17 | ############################################################################# 18 | 19 | steps: 20 | 21 | # Clone the repository. 22 | - name: 'gcr.io/cloud-builders/git' 23 | args: ['clone', '--single-branch', '--branch', 24 | '$_BRANCH', '$_REPO_URL', 25 | '--depth', '1', 26 | '--verbose'] 27 | id: 'Clone Repository' 28 | 29 | 30 | # Run datasource_utils unit tests. 31 | - name: '$_CICD_IMAGE_URI' 32 | entrypoint: 'pytest' 33 | args: ['src/tests/datasource_utils_tests.py', '-s'] 34 | dir: 'ucaip-labs' 35 | env: 36 | - 'PROJECT_ID=$_PROJECT_ID' 37 | - 'BQ_LOCATION=$_BQ_LOCATION' 38 | - 'BQ_DATASET_NAME=$_BQ_DATASET_NAME' 39 | - 'BQ_TABLE_NAME=$_BQ_TABLE_NAME' 40 | id: 'Unit Test Datasource Utils' 41 | waitFor: ['Clone Repository'] 42 | 43 | 44 | # Run model unit tests. 
45 | - name: '$_CICD_IMAGE_URI' 46 | entrypoint: 'pytest' 47 | args: ['src/tests/model_tests.py', '-s'] 48 | dir: 'ucaip-labs' 49 | id: 'Unit Test Model' 50 | waitFor: ['Clone Repository'] 51 | timeout: 1800s 52 | 53 | 54 | # Test e2e pipeline using local runner. 55 | - name: '$_CICD_IMAGE_URI' 56 | entrypoint: 'pytest' 57 | args: ['src/tests/pipeline_deployment_tests.py::test_e2e_pipeline', '-s'] 58 | dir: 'ucaip-labs' 59 | env: 60 | - 'PROJECT_ID=$_PROJECT_ID' 61 | - 'REGION=$_REGION' 62 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 63 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME' 64 | - 'GCS_LOCATION=$_TEST_GCS_LOCATION' 65 | - 'TRAIN_LIMIT=$_CI_TRAIN_LIMIT' 66 | - 'TEST_LIMIT=$_CI_TEST_LIMIT' 67 | - 'UPLOAD_MODEL=$_CI_UPLOAD_MODEL' 68 | - 'ACCURACY_THRESHOLD=$_CI_ACCURACY_THRESHOLD' 69 | id: 'Local Test E2E Pipeline' 70 | waitFor: ['Unit Test Datasource Utils', 'Unit Test Model'] 71 | timeout: 1800s 72 | 73 | 74 | # Build the image that encapsulates the pipeline. 75 | - name: 'gcr.io/cloud-builders/docker' 76 | args: ['build', '-t', '$_TFX_IMAGE_URI', '.'] 77 | dir: 'ucaip-labs' 78 | id: 'Build TFX Image' 79 | waitFor: ['Local Test E2E Pipeline'] 80 | 81 | 82 | # Compile the pipeline. 83 | - name: '$_CICD_IMAGE_URI' 84 | entrypoint: 'python' 85 | args: ['build/utils.py', 86 | '--mode', 'compile-pipeline', 87 | '--pipeline-name', '$_PIPELINE_NAME' 88 | ] 89 | dir: 'ucaip-labs' 90 | env: 91 | - 'PROJECT_ID=$_PROJECT_ID' 92 | - 'REGION=$_REGION' 93 | - 'MODEL_DISPLAY_NAME=$_MODEL_DISPLAY_NAME' 94 | - 'DATASET_DISPLAY_NAME=$_DATASET_DISPLAY_NAME' 95 | - 'GCS_LOCATION=$_GCS_LOCATION' 96 | - 'TFX_IMAGE_URI=$_TFX_IMAGE_URI' 97 | - 'BEAM_RUNNER=$_BEAM_RUNNER' 98 | - 'TRAINING_RUNNER=$_TRAINING_RUNNER' 99 | id: 'Compile Pipeline' 100 | waitFor: ['Local Test E2E Pipeline'] 101 | 102 | 103 | # Upload compiled pipeline to GCS. 104 | - name: 'gcr.io/cloud-builders/gsutil' 105 | args: ['cp', '$_PIPELINE_NAME.json', '$_PIPELINES_STORE'] 106 | dir: 'ucaip-labs' 107 | id: 'Upload Pipeline to GCS' 108 | waitFor: ['Compile Pipeline'] 109 | 110 | 111 | # Push TFX Image to Container Registy. 112 | images: ['$_TFX_IMAGE_URI'] 113 | -------------------------------------------------------------------------------- /build/serving_resources_spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "traffic_percentage": 100, 3 | "machine_type": "n1-standard-2", 4 | "min_replica_count": 1, 5 | "max_replica_count": 1, 6 | "accelerator_type": null, 7 | "accelerator_count": null 8 | } -------------------------------------------------------------------------------- /build/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for deploying pipelines and models to Vertex AI.""" 15 | 16 | 17 | import argparse 18 | import os 19 | import sys 20 | import logging 21 | import json 22 | 23 | from google.cloud import aiplatform as vertex_ai 24 | 25 | 26 | SCRIPT_DIR = os.path.dirname( 27 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 28 | ) 29 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 30 | 31 | SERVING_SPEC_FILEPATH = 'build/serving_resources_spec.json' 32 | 33 | def get_args(): 34 | """Define an parse commandline arguments.""" 35 | 36 | parser = argparse.ArgumentParser() 37 | 38 | parser.add_argument( 39 | '--mode', 40 | type=str, 41 | ) 42 | 43 | parser.add_argument( 44 | '--project', 45 | type=str, 46 | ) 47 | 48 | parser.add_argument( 49 | '--region', 50 | type=str, 51 | ) 52 | 53 | parser.add_argument( 54 | '--endpoint-display-name', 55 | type=str, 56 | ) 57 | 58 | parser.add_argument( 59 | '--model-display-name', 60 | type=str, 61 | ) 62 | 63 | parser.add_argument( 64 | '--pipeline-name', 65 | type=str, 66 | ) 67 | 68 | return parser.parse_args() 69 | 70 | 71 | def create_endpoint(project, region, endpoint_display_name): 72 | """Create a Vertex endpoint.""" 73 | 74 | logging.info(f"Creating endpoint {endpoint_display_name}") 75 | vertex_ai.init( 76 | project=project, 77 | location=region 78 | ) 79 | 80 | endpoints = vertex_ai.Endpoint.list( 81 | filter=f'display_name={endpoint_display_name}', 82 | order_by="update_time") 83 | 84 | if len(endpoints) > 0: 85 | logging.info(f"Endpoint {endpoint_display_name} already exists.") 86 | endpoint = endpoints[-1] 87 | else: 88 | endpoint = vertex_ai.Endpoint.create(endpoint_display_name) 89 | logging.info(f"Endpoint is ready.") 90 | logging.info(endpoint.gca_resource) 91 | return endpoint 92 | 93 | 94 | def deploy_model(project, region, endpoint_display_name, model_display_name, serving_resources_spec): 95 | """Deploy a model to a Vertex endpoint.""" 96 | 97 | logging.info(f"Deploying model {model_display_name} to endpoint {endpoint_display_name}") 98 | vertex_ai.init( 99 | project=project, 100 | location=region 101 | ) 102 | 103 | model = vertex_ai.Model.list( 104 | filter=f'display_name={model_display_name}', 105 | order_by="update_time" 106 | )[-1] 107 | 108 | endpoint = vertex_ai.Endpoint.list( 109 | filter=f'display_name={endpoint_display_name}', 110 | order_by="update_time" 111 | )[-1] 112 | 113 | deployed_model = endpoint.deploy(model=model, **serving_resources_spec) 114 | logging.info(f"Model is deployed.") 115 | logging.info(deployed_model) 116 | return deployed_model 117 | 118 | 119 | def compile_pipeline(pipeline_name): 120 | """Create a .json file with the pipeline definition.""" 121 | 122 | from src.tfx_pipelines import runner 123 | pipeline_definition_file = f"{pipeline_name}.json" 124 | pipeline_definition = runner.compile_training_pipeline(pipeline_definition_file) 125 | return pipeline_definition 126 | 127 | 128 | 129 | def main(): 130 | args = get_args() 131 | 132 | if args.mode == 'create-endpoint': 133 | if not args.project: 134 | raise ValueError("project must be supplied.") 135 | if not args.region: 136 | raise ValueError("region must be supplied.") 137 | if not args.endpoint_display_name: 138 | raise ValueError("endpoint_display_name must be supplied.") 139 | 140 | result = create_endpoint( 141 | args.project, 142 | args.region, 143 | args.endpoint_display_name 144 | ) 145 | 146 | elif args.mode == 'deploy-model': 147 | if not args.project: 148 | raise ValueError("project must be 
supplied.") 149 | if not args.region: 150 | raise ValueError("region must be supplied.") 151 | if not args.endpoint_display_name: 152 | raise ValueError("endpoint-display-name must be supplied.") 153 | if not args.model_display_name: 154 | raise ValueError("model-display-name must be supplied.") 155 | 156 | with open(SERVING_SPEC_FILEPATH) as json_file: 157 | serving_resources_spec = json.load(json_file) 158 | logging.info(f"serving resources: {serving_resources_spec}") 159 | result = deploy_model( 160 | args.project, 161 | args.region, 162 | args.endpoint_display_name, 163 | args.model_display_name, 164 | serving_resources_spec 165 | ) 166 | 167 | elif args.mode == 'compile-pipeline': 168 | if not args.pipeline_name: 169 | raise ValueError("pipeline-name must be supplied.") 170 | 171 | result = compile_pipeline(args.pipeline_name) 172 | 173 | else: 174 | raise ValueError(f"Invalid mode {args.mode}.") 175 | 176 | logging.info(result) 177 | 178 | 179 | if __name__ == "__main__": 180 | main() 181 | 182 | -------------------------------------------------------------------------------- /mlops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/mlops.png -------------------------------------------------------------------------------- /provision/README.md: -------------------------------------------------------------------------------- 1 | # Creating a Vertex environment 2 | 3 | You can use the [Terraform](https://www.terraform.io/) scripts in the `terraform` folder to automatically provision the environment required by the samples. 4 | 5 | The scripts perform the following actions: 6 | 7 | 1. Enable the required Cloud APIs 8 | * **Essentials**: compute, iam, iamcredentials 9 | * **ML**: notebooks, aiplatform 10 | * **Data**: dataflow, bigquery, bigquerydatatransfer 11 | * **CI/CD**: cloudbuild, container, artifactregistry 12 | * **Operations**: cloudtrace, monitoring, logging, cloudresourcemanager 13 | 2. Create a regional GCS bucket. 14 | 3. Create an instance of Vertex Notebooks. 15 | 4. Create service accounts for Vertex Training and Vertex Pipelines. 16 | 17 | You can customize your configuration using the following variables: 18 | 19 | |Variable|Required|Default|Description| 20 | |--------|--------|-------|-----------| 21 | |name_prefix|Yes||Prefix added to the names of provisioned resources. **The prefix should start with a letter and include letters and digits only**.| 22 | |project_id|Yes||GCP project ID| 23 | |network_name|No|default|Name of the network for the Notebook instance. The network must already exist.| 24 | |subnet_name|No|default|Name of the subnet for the Notebook instance. The subnet must already exist.| 25 | |subnet_region|No|us-central1|Region where the subnet was created.| 26 | |zone|Yes||GCP zone for the Notebook instance. The zone must be in the region defined in the `subnet_region` variable| 27 | |machine_type|No|n1-standard-4|Machine type of the Notebook instance| 28 | |boot_disk_size|No|200GB|Size of the Notebook instance's boot disk| 29 | |image_family|No|tf-2-4-cpu|Image family for the Notebook instance| 30 | |gpu_type|No|null|GPU type of the Notebook instance. 
By default, the Notebook instance will be provisioned without a GPU| 31 | |gpu_count|No|null|GPU count of the Notebook instance| 32 | |install_gpu_driver|No|false|Whether to install a GPU driver| 33 | |region|No|Set to subnet_region.|GCP region for the GCS bucket and Artifact Registry. It is recommended that the same region is used for all: the bucket, the registry and the Notebook instance. If not provided the `egion` will be set to `subnet_region`.| 34 | |force_destroy|No|false|Whether to force the removal of the bucket on terraform destroy. **Note that by default the bucket will not be destroyed**.| 35 | 36 | 37 | To provision the environment: 38 | 39 | 1. Open [Cloud Shell](https://cloud.google.com/shell/docs/launching-cloud-shell) 40 | 41 | 2. Download the installation scripts 42 | ``` 43 | SRC_REPO=https://github.com/ksalama/ucaip-labs 44 | LOCAL_DIR=provision 45 | kpt pkg get $SRC_REPO/provision@main $LOCAL_DIR 46 | cd $LOCAL_DIR/terraform 47 | ``` 48 | 49 | 3. Update the `terraform.tfvars` file with the values reflecting your environment. Alternatively, you can provide the values using the Terraform CLI `-var` options when you execute `terraform apply` in the next step 50 | 51 | 4. Execute the following commands. : 52 | ``` 53 | terraform init 54 | terraform apply 55 | ``` 56 | 57 | 58 | To destroy the environment, execute: 59 | ``` 60 | terraform destroy 61 | ``` 62 | -------------------------------------------------------------------------------- /provision/terraform/gcs-bucket.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | resource "google_storage_bucket" "artifact_repo" { 18 | project = module.project-services.project_id 19 | name = "${var.name_prefix}-bucket" 20 | location = local.region 21 | storage_class = local.bucket_type 22 | force_destroy = var.force_destroy 23 | } -------------------------------------------------------------------------------- /provision/terraform/main.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | terraform { 16 | required_version = ">= 0.14" 17 | required_providers { 18 | google = "~> 3.6" 19 | } 20 | } 21 | 22 | provider "google" { 23 | project = var.project_id 24 | } 25 | 26 | data "google_project" "project" { 27 | project_id = var.project_id 28 | } 29 | 30 | locals { 31 | bucket_type = "REGIONAL" 32 | region = var.region == null ? var.subnet_region : var.region 33 | } 34 | 35 | 36 | -------------------------------------------------------------------------------- /provision/terraform/notebook-instance.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | locals { 16 | image_project = "deeplearning-platform-release" 17 | } 18 | 19 | data "google_compute_network" "vm_network" { 20 | project = module.project-services.project_id 21 | name = var.network_name 22 | 23 | depends_on = [ 24 | module.project-services 25 | ] 26 | } 27 | 28 | data "google_compute_subnetwork" "vm_subnetwork" { 29 | project = module.project-services.project_id 30 | name = var.subnet_name 31 | region = var.subnet_region 32 | 33 | depends_on = [ 34 | module.project-services 35 | ] 36 | } 37 | 38 | resource "google_notebooks_instance" "notebook_instance" { 39 | project = module.project-services.project_id 40 | name = "${var.name_prefix}-notebook" 41 | machine_type = var.machine_type 42 | location = var.zone 43 | 44 | network = data.google_compute_network.vm_network.id 45 | subnet = data.google_compute_subnetwork.vm_subnetwork.id 46 | 47 | vm_image { 48 | project = local.image_project 49 | image_family = var.image_family 50 | } 51 | 52 | dynamic accelerator_config { 53 | for_each = var.gpu_type != null ? [1] : [] 54 | content { 55 | type = var.gpu_type 56 | core_count = var.gpu_count 57 | } 58 | } 59 | 60 | install_gpu_driver = var.install_gpu_driver 61 | 62 | boot_disk_size_gb = var.boot_disk_size 63 | } 64 | -------------------------------------------------------------------------------- /provision/terraform/service-accounts.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # Create Vertex Training service account 16 | resource "google_service_account" "training_sa" { 17 | project = module.project-services.project_id 18 | account_id = var.training_sa_name 19 | display_name = "Vertex Training service account" 20 | } 21 | 22 | # Create Vertex Training SA role bindings 23 | resource "google_project_iam_member" "training_sa_role_bindings" { 24 | project = module.project-services.project_id 25 | for_each = toset(var.training_sa_roles) 26 | member = "serviceAccount:${google_service_account.training_sa.email}" 27 | role = "roles/${each.value}" 28 | } 29 | 30 | # Create Vertex Pipelines service account 31 | resource "google_service_account" "pipelines_sa" { 32 | project = module.project-services.project_id 33 | account_id = var.pipelines_sa_name 34 | display_name = "Vertex Pipelines account name" 35 | } 36 | 37 | # Create Vertex Pipelines SA role bindings 38 | resource "google_project_iam_member" "role_bindings" { 39 | project = module.project-services.project_id 40 | for_each = toset(var.pipelines_sa_roles) 41 | member = "serviceAccount:${google_service_account.pipelines_sa.email}" 42 | role = "roles/${each.value}" 43 | } 44 | -------------------------------------------------------------------------------- /provision/terraform/services.tf: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | module "project-services" { 17 | source = "terraform-google-modules/project-factory/google//modules/project_services" 18 | 19 | project_id = data.google_project.project.project_id 20 | 21 | disable_services_on_destroy = false 22 | activate_apis = [ 23 | "compute.googleapis.com", 24 | "iam.googleapis.com", 25 | "container.googleapis.com", 26 | "artifactregistry.googleapis.com", 27 | "cloudresourcemanager.googleapis.com", 28 | "cloudtrace.googleapis.com", 29 | "iamcredentials.googleapis.com", 30 | "monitoring.googleapis.com", 31 | "logging.googleapis.com", 32 | "notebooks.googleapis.com", 33 | "aiplatform.googleapis.com", 34 | "dataflow.googleapis.com", 35 | "bigquery.googleapis.com", 36 | "cloudbuild.googleapis.com", 37 | "bigquerydatatransfer.googleapis.com", 38 | ] 39 | } -------------------------------------------------------------------------------- /provision/terraform/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project_id = "vertex-mlops" 2 | subnet_region = "us-central1" 3 | zone = "us-central1-a" 4 | name_prefix = "vertex-mlops" 5 | machine_type = "n1-standard-8" 6 | #gpu_type = "NVIDIA_TESLA_T4" 7 | #gpu_count = 1 8 | #install_gpu_driver = true 9 | #image_family = "tf-2-4-gpu" 10 | 11 | 12 | -------------------------------------------------------------------------------- /provision/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | variable "project_id" { 18 | description = "The GCP project ID" 19 | type = string 20 | } 21 | 22 | variable "region" { 23 | description = "The region for the GCS bucket and Artifact Registry" 24 | type = string 25 | default = null 26 | } 27 | 28 | variable "zone" { 29 | description = "The zone for a Vertex Notebook instance" 30 | type = string 31 | } 32 | 33 | variable "name_prefix" { 34 | description = "The name prefix to add to the resource names" 35 | type = string 36 | } 37 | 38 | variable "machine_type" { 39 | description = "The Notebook instance's machine type" 40 | type = string 41 | } 42 | 43 | variable "network_name" { 44 | description = "The network name for the Notebook instance" 45 | type = string 46 | default = "default" 47 | } 48 | 49 | variable "subnet_name" { 50 | description = "The subnet name for the Notebook instance" 51 | type = string 52 | default = "default" 53 | } 54 | 55 | variable "subnet_region" { 56 | description = "The region for the Notebook subnet" 57 | type = string 58 | default = "us-central1" 59 | } 60 | 61 | variable "boot_disk_size" { 62 | description = "The size of the boot disk" 63 | default = 200 64 | } 65 | 66 | variable "image_family" { 67 | description = "A Deep Learning image family for the Notebook instance" 68 | type = string 69 | default = "tf-2-4-cpu" 70 | } 71 | 72 | variable "gpu_type" { 73 | description = "A GPU type for the Notebook instance" 74 | type = string 75 | default = null 76 | } 77 | 78 | variable "gpu_count" { 79 | description = "A GPU count for the Notebook instance" 80 | type = string 81 | default = null 82 | } 83 | 84 | variable "install_gpu_driver" { 85 | description = "Whether to install GPU driver" 86 | type = bool 87 | default = false 88 | } 89 | 90 | variable "force_destroy" { 91 | description = "Whether to remove the bucket on destroy" 92 | type = bool 93 | default = false 94 | } 95 | 96 | variable "training_sa_roles" { 97 | description = "The roles to assign to the Vertex Training service account" 98 | default = [ 99 | "storage.admin", 100 | "aiplatform.user", 101 | "bigquery.admin" 102 | ] 103 | } 104 | 105 | variable "pipelines_sa_roles" { 106 | description = "The roles to assign to the Vertex Pipelines service account" 107 | default = [ 108 | "storage.admin", 109 | "bigquery.admin", 110 | "aiplatform.user" 111 | ] 112 | } 113 | 114 | variable "training_sa_name" { 115 | description = "Vertex training service account name." 116 | default = "training-sa" 117 | } 118 | 119 | variable "pipelines_sa_name" { 120 | description = "Vertex pipelines service account name." 
121 | default = "pipelines-sa" 122 | } 123 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | kfp==1.6.2 2 | google-cloud-bigquery==2.20.0 3 | google-cloud-bigquery-storage==2.4.0 4 | google-cloud-aiplatform==1.1.1 5 | google-auth==1.30.1 6 | google-auth-oauthlib==0.4.4 7 | google-auth-httplib2==0.1.0 8 | oauth2client==4.1.3 9 | requests==2.25.1 10 | pytest 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | REQUIRED_PACKAGES = [ 4 | "google-cloud-aiplatform==1.0.0", 5 | "tensorflow-transform==0.30.0", 6 | "tensorflow-data-validation==0.30.0", 7 | ] 8 | 9 | setuptools.setup( 10 | name="executor", 11 | version="0.0.1", 12 | install_requires=REQUIRED_PACKAGES, 13 | packages=setuptools.find_packages(), 14 | include_package_data=True, 15 | package_data={"src": ["raw_schema/schema.pbtxt"]}, 16 | ) 17 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/__init__.py -------------------------------------------------------------------------------- /src/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/common/__init__.py -------------------------------------------------------------------------------- /src/common/datasource_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Utilities for generating BigQuery data querying scirpts.""" 15 | 16 | 17 | from google.cloud import aiplatform as vertex_ai 18 | 19 | 20 | def _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit=None): 21 | 22 | query = f""" 23 | SELECT 24 | IF(trip_month IS NULL, -1, trip_month) trip_month, 25 | IF(trip_day IS NULL, -1, trip_day) trip_day, 26 | IF(trip_day_of_week IS NULL, -1, trip_day_of_week) trip_day_of_week, 27 | IF(trip_hour IS NULL, -1, trip_hour) trip_hour, 28 | IF(trip_seconds IS NULL, -1, trip_seconds) trip_seconds, 29 | IF(trip_miles IS NULL, -1, trip_miles) trip_miles, 30 | IF(payment_type IS NULL, 'NA', payment_type) payment_type, 31 | IF(pickup_grid IS NULL, 'NA', pickup_grid) pickup_grid, 32 | IF(dropoff_grid IS NULL, 'NA', dropoff_grid) dropoff_grid, 33 | IF(euclidean IS NULL, -1, euclidean) euclidean, 34 | IF(loc_cross IS NULL, 'NA', loc_cross) loc_cross""" 35 | if ml_use: 36 | query += f""", 37 | tip_bin 38 | FROM {bq_dataset_name}.{bq_table_name} 39 | WHERE ML_use = '{ml_use}' 40 | """ 41 | else: 42 | query += f""" 43 | FROM {bq_dataset_name}.{bq_table_name} 44 | """ 45 | if limit: 46 | query += f"LIMIT {limit}" 47 | 48 | return query 49 | 50 | 51 | def get_training_source_query( 52 | project, region, dataset_display_name, ml_use="UNASSIGNED", limit=None 53 | ): 54 | """Generates a BigQuery SELECT statement for the training data.""" 55 | 56 | dataset = vertex_ai.TabularDataset.list( 57 | filter=f"display_name={dataset_display_name}", order_by="update_time" 58 | )[-1] 59 | bq_source_uri = dataset.gca_resource.metadata["inputConfig"]["bigquerySource"][ 60 | "uri" 61 | ] 62 | _, bq_dataset_name, bq_table_name = bq_source_uri.replace("bq://", "").split(".") 63 | 64 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use, limit) 65 | 66 | 67 | def get_serving_source_query(bq_dataset_name, bq_table_name, limit=None): 68 | """Generates a BigQuery SELECT statement for the training data.""" 69 | 70 | return _get_source_query(bq_dataset_name, bq_table_name, ml_use=None, limit=limit) 71 | -------------------------------------------------------------------------------- /src/common/features.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Model features metadata utils.""" 15 | 16 | 17 | FEATURE_NAMES = [ 18 | "trip_month", 19 | "trip_day", 20 | "trip_day_of_week", 21 | "trip_hour", 22 | "trip_seconds", 23 | "trip_miles", 24 | "payment_type", 25 | "pickup_grid", 26 | "dropoff_grid", 27 | "euclidean", 28 | "loc_cross", 29 | ] 30 | 31 | TARGET_FEATURE_NAME = "tip_bin" 32 | 33 | TARGET_LABELS = ["tip<20%", "tip>=20%"] 34 | 35 | NUMERICAL_FEATURE_NAMES = [ 36 | "trip_seconds", 37 | "trip_miles", 38 | "euclidean", 39 | ] 40 | 41 | EMBEDDING_CATEGORICAL_FEATURES = { 42 | "trip_month": 2, 43 | "trip_day": 4, 44 | "trip_hour": 3, 45 | "pickup_grid": 3, 46 | "dropoff_grid": 3, 47 | "loc_cross": 10, 48 | } 49 | 50 | ONEHOT_CATEGORICAL_FEATURE_NAMES = ["payment_type", "trip_day_of_week"] 51 | 52 | 53 | def transformed_name(key: str) -> str: 54 | """Generate the name of the transformed feature from original name.""" 55 | return f"{key}_xf" 56 | 57 | 58 | def original_name(key: str) -> str: 59 | """Generate the name of the original feature from transformed name.""" 60 | return key.replace("_xf", "") 61 | 62 | 63 | def vocabulary_name(key: str) -> str: 64 | """Generate the name of the vocabulary feature from original name.""" 65 | return f"{key}_vocab" 66 | 67 | 68 | def categorical_feature_names() -> list: 69 | return ( 70 | list(EMBEDDING_CATEGORICAL_FEATURES.keys()) + ONEHOT_CATEGORICAL_FEATURE_NAMES 71 | ) 72 | 73 | 74 | def generate_explanation_config(): 75 | explanation_config = { 76 | "inputs": {}, 77 | "outputs": {}, 78 | "params": {"sampled_shapley_attribution": {"path_count": 10}}, 79 | } 80 | 81 | for feature_name in FEATURE_NAMES: 82 | if feature_name in NUMERICAL_FEATURE_NAMES: 83 | explanation_config["inputs"][feature_name] = { 84 | "input_tensor_name": feature_name, 85 | "modality": "numeric", 86 | } 87 | else: 88 | explanation_config["inputs"][feature_name] = { 89 | "input_tensor_name": feature_name, 90 | "modality": "categorical", 91 | } 92 | 93 | explanation_config["outputs"] = {"scores": {"output_tensor_name": "scores"}} 94 | 95 | return explanation_config 96 | -------------------------------------------------------------------------------- /src/model_training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/model_training/__init__.py -------------------------------------------------------------------------------- /src/model_training/data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Functions for reading data as tf.data.Dataset.""" 15 | 16 | import tensorflow as tf 17 | 18 | from src.common import features 19 | 20 | 21 | def _gzip_reader_fn(filenames: list): 22 | """Returns a record reader that can read gzip'ed files.""" 23 | return tf.data.TFRecordDataset(filenames, compression_type="GZIP") 24 | 25 | 26 | def get_dataset( 27 | file_pattern: str, 28 | feature_spec: dict, 29 | batch_size: int = 200, 30 | upsampling_factor: float = 2.0, 31 | ): 32 | """Generates features and label for tuning/training. 33 | 34 | Args: 35 | file_pattern: input tfrecord file pattern. 36 | feature_spec: a dictionary of feature specifications. 37 | batch_size: representing the number of consecutive elements of returned 38 | dataset to combine in a single batch 39 | Returns: 40 | A dataset that contains (features, indices) tuple where features is a 41 | dictionary of Tensors, and indices is a single Tensor of label indices. 42 | """ 43 | 44 | dataset = tf.data.experimental.make_batched_features_dataset( 45 | file_pattern=file_pattern, 46 | batch_size=batch_size, 47 | features=feature_spec, 48 | label_key=features.TARGET_FEATURE_NAME, 49 | reader=_gzip_reader_fn, 50 | num_epochs=1, 51 | drop_final_batch=True, 52 | ) 53 | 54 | return dataset 55 | -------------------------------------------------------------------------------- /src/model_training/defaults.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Defaults for the model. 15 | 16 | These values can be tweaked to affect model training performance. 17 | """ 18 | 19 | 20 | HIDDEN_UNITS = [64, 32] 21 | LEARNING_RATE = 0.0001 22 | BATCH_SIZE = 512 23 | NUM_EPOCHS = 10 24 | NUM_EVAL_STEPS = 100 25 | 26 | 27 | def update_hyperparams(hyperparams: dict) -> dict: 28 | """Updates the hyperparams dictionary with default values.""" 29 | 30 | if "hidden_units" not in hyperparams: 31 | hyperparams["hidden_units"] = HIDDEN_UNITS 32 | else: 33 | if not isinstance(hyperparams["hidden_units"], list): 34 | hyperparams["hidden_units"] = [ 35 | int(v) for v in hyperparams["hidden_units"].split(",") 36 | ] 37 | if "learning_rate" not in hyperparams: 38 | hyperparams["learning_rate"] = LEARNING_RATE 39 | if "batch_size" not in hyperparams: 40 | hyperparams["batch_size"] = BATCH_SIZE 41 | if "num_epochs" not in hyperparams: 42 | hyperparams["num_epochs"] = NUM_EPOCHS 43 | return hyperparams 44 | -------------------------------------------------------------------------------- /src/model_training/exporter.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Functions for exporting the model for serving.""" 15 | 16 | import logging 17 | 18 | import tensorflow as tf 19 | import tensorflow_transform as tft 20 | import tensorflow_data_validation as tfdv 21 | from tensorflow_transform.tf_metadata import schema_utils 22 | import tensorflow.keras as keras 23 | 24 | from src.common import features 25 | 26 | 27 | def _get_serve_tf_examples_fn(classifier, tft_output, raw_feature_spec): 28 | """Returns a function that parses a serialized tf.Example and applies TFT.""" 29 | 30 | classifier.tft_layer = tft_output.transform_features_layer() 31 | 32 | @tf.function 33 | def serve_tf_examples_fn(serialized_tf_examples): 34 | """Returns the output to be used in the serving signature.""" 35 | for key in list(raw_feature_spec.keys()): 36 | if key not in features.FEATURE_NAMES: 37 | raw_feature_spec.pop(key) 38 | 39 | parsed_features = tf.io.parse_example(serialized_tf_examples, raw_feature_spec) 40 | 41 | transformed_features = classifier.tft_layer(parsed_features) 42 | logits = classifier(transformed_features) 43 | probabilities = keras.activations.sigmoid(logits) 44 | return {"probabilities": probabilities} 45 | 46 | return serve_tf_examples_fn 47 | 48 | 49 | def _get_serve_features_fn(classifier, tft_output): 50 | """Returns a function that accept a dictionary of features and applies TFT.""" 51 | 52 | classifier.tft_layer = tft_output.transform_features_layer() 53 | 54 | @tf.function 55 | def serve_features_fn(raw_features): 56 | """Returns the output to be used in the serving signature.""" 57 | 58 | transformed_features = classifier.tft_layer(raw_features) 59 | logits = classifier(transformed_features) 60 | neg_probabilities = keras.activations.sigmoid(logits) 61 | pos_probabilities = 1 - neg_probabilities 62 | probabilities = tf.concat([neg_probabilities, pos_probabilities], -1) 63 | batch_size = tf.shape(probabilities)[0] 64 | classes = tf.repeat([features.TARGET_LABELS], [batch_size], axis=0) 65 | return {"classes": classes, "scores": probabilities} 66 | 67 | return serve_features_fn 68 | 69 | 70 | def export_serving_model( 71 | classifier, serving_model_dir, raw_schema_location, tft_output_dir 72 | ): 73 | """Exports the classifier as a SavedModel with serving signatures.""" 74 | 75 | raw_schema = tfdv.load_schema_text(raw_schema_location) 76 | raw_feature_spec = schema_utils.schema_as_feature_spec(raw_schema).feature_spec 77 | 78 | tft_output = tft.TFTransformOutput(tft_output_dir) 79 | 80 | features_input_signature = { 81 | feature_name: tf.TensorSpec( 82 | shape=(None, 1), dtype=spec.dtype, name=feature_name 83 | ) 84 | for feature_name, spec in raw_feature_spec.items() 85 | if feature_name in features.FEATURE_NAMES 86 | } 87 | 88 | signatures = { 89 | "serving_default": _get_serve_features_fn( 90 | classifier, tft_output 91 | ).get_concrete_function(features_input_signature), 92 | "serving_tf_example": _get_serve_tf_examples_fn( 93 | classifier, tft_output, raw_feature_spec 94 | ).get_concrete_function( 95 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples") 96 | ), 97 | } 98 | 99 | 
logging.info("Model export started...") 100 | tf.saved_model.save(classifier, serving_model_dir, signatures=signatures) 101 | logging.info("Model export completed.") 102 | -------------------------------------------------------------------------------- /src/model_training/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """A DNN Keras classification model.""" 15 | 16 | import tensorflow as tf 17 | from tensorflow import keras 18 | 19 | from src.common import features 20 | 21 | 22 | def create_model_inputs(): 23 | """Creates Keras model input dictionary.""" 24 | 25 | inputs = {} 26 | for feature_name in features.FEATURE_NAMES: 27 | name = features.transformed_name(feature_name) 28 | if feature_name in features.NUMERICAL_FEATURE_NAMES: 29 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.float32) 30 | elif feature_name in features.categorical_feature_names(): 31 | inputs[name] = keras.layers.Input(name=name, shape=[], dtype=tf.int64) 32 | else: 33 | pass 34 | return inputs 35 | 36 | 37 | def _create_binary_classifier(feature_vocab_sizes, hyperparams): 38 | """Return a Keras binary classifier.""" 39 | 40 | input_layers = create_model_inputs() 41 | 42 | layers = [] 43 | for key in input_layers: 44 | feature_name = features.original_name(key) 45 | if feature_name in features.EMBEDDING_CATEGORICAL_FEATURES: 46 | vocab_size = feature_vocab_sizes[feature_name] 47 | embedding_size = features.EMBEDDING_CATEGORICAL_FEATURES[feature_name] 48 | embedding_output = keras.layers.Embedding( 49 | input_dim=vocab_size + 1, 50 | output_dim=embedding_size, 51 | name=f"{key}_embedding", 52 | )(input_layers[key]) 53 | layers.append(embedding_output) 54 | elif feature_name in features.ONEHOT_CATEGORICAL_FEATURE_NAMES: 55 | vocab_size = feature_vocab_sizes[feature_name] 56 | onehot_layer = keras.layers.experimental.preprocessing.CategoryEncoding( 57 | max_tokens=vocab_size, 58 | output_mode="binary", 59 | name=f"{key}_onehot", 60 | )(input_layers[key]) 61 | layers.append(onehot_layer) 62 | elif feature_name in features.NUMERICAL_FEATURE_NAMES: 63 | numeric_layer = tf.expand_dims(input_layers[key], -1) 64 | layers.append(numeric_layer) 65 | else: 66 | pass 67 | 68 | joined = keras.layers.Concatenate(name="combines_inputs")(layers) 69 | feedforward_output = keras.Sequential( 70 | [ 71 | keras.layers.Dense(units, activation="relu") 72 | for units in hyperparams["hidden_units"] 73 | ], 74 | name="feedforward_network", 75 | )(joined) 76 | logits = keras.layers.Dense(units=1, name="logits")(feedforward_output) 77 | 78 | model = keras.Model(inputs=input_layers, outputs=[logits]) 79 | return model 80 | 81 | 82 | def create_binary_classifier(tft_output, hyperparams): 83 | """Returns a Keras binary classifier.""" 84 | 85 | feature_vocab_sizes = dict() 86 | for feature_name in features.categorical_feature_names(): 87 | 
feature_vocab_sizes[feature_name] = tft_output.vocabulary_size_by_name( 88 | feature_name 89 | ) 90 | 91 | return _create_binary_classifier(feature_vocab_sizes, hyperparams) 92 | -------------------------------------------------------------------------------- /src/model_training/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """A run_fn method called by the TFX Trainer component.""" 15 | 16 | import os 17 | import logging 18 | 19 | from src.model_training import trainer, exporter, defaults 20 | 21 | 22 | # TFX Trainer will call this function. 23 | def run_fn(fn_args): 24 | """Train the model based on given args. 25 | 26 | Args: 27 | fn_args: Holds args used to train the model as name/value pairs. 28 | """ 29 | 30 | logging.info("Runner started...") 31 | logging.info(f"fn_args: {fn_args}") 32 | logging.info("") 33 | 34 | try: 35 | log_dir = fn_args.model_run_dir 36 | except KeyError: 37 | log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs") 38 | 39 | hyperparams = fn_args.hyperparameters 40 | if not hyperparams: 41 | hyperparams = dict() 42 | 43 | hyperparams = defaults.update_hyperparams(hyperparams) 44 | logging.info("Hyperparameter:") 45 | logging.info(hyperparams) 46 | logging.info("") 47 | 48 | logging.info("Runner executing trainer...") 49 | classifier = trainer.train( 50 | train_data_dir=fn_args.train_files, 51 | eval_data_dir=fn_args.eval_files, 52 | tft_output_dir=fn_args.transform_output, 53 | hyperparams=hyperparams, 54 | log_dir=log_dir, 55 | base_model_dir=fn_args.base_model, 56 | ) 57 | 58 | logging.info("Runner executing exporter...") 59 | exporter.export_serving_model( 60 | classifier=classifier, 61 | serving_model_dir=fn_args.serving_model_dir, 62 | raw_schema_location=fn_args.schema_path, 63 | tft_output_dir=fn_args.transform_output, 64 | ) 65 | logging.info("Runner completed.") 66 | -------------------------------------------------------------------------------- /src/model_training/task.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """The entrypoint for the uCAIP traing job.""" 15 | 16 | import os 17 | import sys 18 | from datetime import datetime 19 | import logging 20 | import tensorflow as tf 21 | from tensorflow.python.client import device_lib 22 | import argparse 23 | 24 | from google.cloud import aiplatform as vertex_ai 25 | from google.cloud import aiplatform_v1beta1 as vertex_ai_beta 26 | 27 | from src.model_training import defaults, trainer, exporter 28 | 29 | dirname = os.path.dirname(__file__) 30 | dirname = dirname.replace("/model_training", "") 31 | RAW_SCHEMA_LOCATION = os.path.join(dirname, "raw_schema/schema.pbtxt") 32 | 33 | 34 | def get_args(): 35 | """Defines and parse commandline arguments.""" 36 | 37 | parser = argparse.ArgumentParser() 38 | 39 | parser.add_argument( 40 | "--model-dir", 41 | default=os.getenv("AIP_MODEL_DIR"), 42 | type=str, 43 | ) 44 | 45 | parser.add_argument( 46 | "--log-dir", 47 | default=os.getenv("AIP_TENSORBOARD_LOG_DIR"), 48 | type=str, 49 | ) 50 | 51 | parser.add_argument( 52 | "--train-data-dir", 53 | type=str, 54 | ) 55 | 56 | parser.add_argument( 57 | "--eval-data-dir", 58 | type=str, 59 | ) 60 | 61 | parser.add_argument( 62 | "--tft-output-dir", 63 | type=str, 64 | ) 65 | 66 | parser.add_argument("--learning-rate", default=0.001, type=float) 67 | 68 | parser.add_argument("--batch-size", default=512, type=float) 69 | 70 | parser.add_argument("--hidden-units", default="64,32", type=str) 71 | 72 | parser.add_argument("--num-epochs", default=10, type=int) 73 | 74 | parser.add_argument("--project", type=str) 75 | parser.add_argument("--region", type=str) 76 | parser.add_argument("--staging-bucket", type=str) 77 | parser.add_argument("--experiment-name", type=str) 78 | parser.add_argument("--run-name", type=str) 79 | 80 | return parser.parse_args() 81 | 82 | 83 | def main(): 84 | args = get_args() 85 | 86 | hyperparams = vars(args) 87 | hyperparams = defaults.update_hyperparams(hyperparams) 88 | logging.info(f"Hyperparameter: {hyperparams}") 89 | 90 | if args.experiment_name: 91 | vertex_ai.init( 92 | project=args.project, 93 | staging_bucket=args.staging_bucket, 94 | experiment=args.experiment_name, 95 | ) 96 | 97 | logging.info(f"Using Vertex AI experiment: {args.experiment_name}") 98 | 99 | run_id = args.run_name 100 | if not run_id: 101 | run_id = f"run-gcp-{datetime.now().strftime('%Y%m%d%H%M%S')}" 102 | 103 | vertex_ai.start_run(run_id) 104 | logging.info(f"Run {run_id} started.") 105 | 106 | vertex_ai.log_params(hyperparams) 107 | 108 | classifier = trainer.train( 109 | train_data_dir=args.train_data_dir, 110 | eval_data_dir=args.eval_data_dir, 111 | tft_output_dir=args.tft_output_dir, 112 | hyperparams=hyperparams, 113 | log_dir=args.log_dir, 114 | ) 115 | 116 | val_loss, val_accuracy = trainer.evaluate( 117 | model=classifier, 118 | data_dir=args.eval_data_dir, 119 | raw_schema_location=RAW_SCHEMA_LOCATION, 120 | tft_output_dir=args.tft_output_dir, 121 | hyperparams=hyperparams, 122 | ) 123 | 124 | if args.experiment_name: 125 | vertex_ai.log_metrics({"val_loss": val_loss, "val_accuracy": val_accuracy}) 126 | 127 | try: 128 | exporter.export_serving_model( 129 | classifier=classifier, 130 | serving_model_dir=args.model_dir, 131 | raw_schema_location=RAW_SCHEMA_LOCATION, 132 | tft_output_dir=args.tft_output_dir, 133 | ) 134 | except: 135 | # Swallow Ignored Errors while exporting the model. 
136 | pass 137 | 138 | 139 | if __name__ == "__main__": 140 | logging.getLogger().setLevel(logging.INFO) 141 | logging.info(f"Python Version = {sys.version}") 142 | logging.info(f"TensorFlow Version = {tf.__version__}") 143 | logging.info(f'TF_CONFIG = {os.environ.get("TF_CONFIG", "Not found")}') 144 | logging.info(f"DEVICES = {device_lib.list_local_devices()}") 145 | logging.info("Task started...") 146 | main() 147 | logging.info("Task completed.") 148 | -------------------------------------------------------------------------------- /src/model_training/trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Train and evaluate the model.""" 15 | 16 | import logging 17 | import tensorflow as tf 18 | import tensorflow_transform as tft 19 | from tensorflow import keras 20 | 21 | 22 | from src.model_training import data, model 23 | 24 | 25 | def train( 26 | train_data_dir, 27 | eval_data_dir, 28 | tft_output_dir, 29 | hyperparams, 30 | log_dir, 31 | base_model_dir=None, 32 | ): 33 | """Invokes model.fit method and returns a trained classifier.""" 34 | 35 | logging.info(f"Loading tft output from {tft_output_dir}") 36 | tft_output = tft.TFTransformOutput(tft_output_dir) 37 | transformed_feature_spec = tft_output.transformed_feature_spec() 38 | 39 | train_dataset = data.get_dataset( 40 | train_data_dir, 41 | transformed_feature_spec, 42 | hyperparams["batch_size"], 43 | ) 44 | 45 | eval_dataset = data.get_dataset( 46 | eval_data_dir, 47 | transformed_feature_spec, 48 | hyperparams["batch_size"], 49 | ) 50 | 51 | optimizer = keras.optimizers.Adam(learning_rate=hyperparams["learning_rate"]) 52 | loss = keras.losses.BinaryCrossentropy(from_logits=True) 53 | metrics = [keras.metrics.BinaryAccuracy(name="accuracy")] 54 | 55 | early_stopping = tf.keras.callbacks.EarlyStopping( 56 | monitor="val_loss", patience=5, restore_best_weights=True 57 | ) 58 | tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir) 59 | 60 | classifier = model.create_binary_classifier(tft_output, hyperparams) 61 | if base_model_dir: 62 | try: 63 | classifier = keras.models.load_model(base_model_dir) 64 | except Exception: 65 | pass  # Fall back to the newly created classifier if the base model cannot be loaded. 66 | 67 | classifier.compile(optimizer=optimizer, loss=loss, metrics=metrics) 68 | 69 | logging.info("Model training started...") 70 | classifier.fit( 71 | train_dataset, 72 | epochs=hyperparams["num_epochs"], 73 | validation_data=eval_dataset, 74 | callbacks=[early_stopping, tensorboard_callback], 75 | ) 76 | logging.info("Model training completed.") 77 | 78 | return classifier 79 | 80 | 81 | def evaluate(model, data_dir, raw_schema_location, tft_output_dir, hyperparams): 82 | """Invokes model.evaluate method and returns evaluation_metrics.""" 83 | 84 | logging.info(f"Loading raw schema from {raw_schema_location}") 85 | 86 | logging.info(f"Loading tft output from {tft_output_dir}") 87 | tft_output = tft.TFTransformOutput(tft_output_dir) 88 |
transformed_feature_spec = tft_output.transformed_feature_spec() 89 | 90 | logging.info("Model evaluation started...") 91 | eval_dataset = data.get_dataset( 92 | data_dir, 93 | transformed_feature_spec, 94 | hyperparams["batch_size"], 95 | ) 96 | 97 | evaluation_metrics = model.evaluate(eval_dataset) 98 | logging.info("Model evaluation completed.") 99 | 100 | return evaluation_metrics 101 | -------------------------------------------------------------------------------- /src/pipeline_triggering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/pipeline_triggering/__init__.py -------------------------------------------------------------------------------- /src/pipeline_triggering/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Cloud Function to be triggered by Pub/Sub.""" 15 | 16 | import os 17 | import json 18 | import logging 19 | from kfp.v2.google.client import AIPlatformClient 20 | from google.cloud import storage 21 | import base64 22 | 23 | 24 | def trigger_pipeline(event, context): 25 | """A Cloud Function for triggering a Vertex pipeline given a Pub/Sub event.""" 26 | 27 | project = os.getenv("PROJECT") 28 | region = os.getenv("REGION") 29 | gcs_pipeline_file_location = os.getenv("GCS_PIPELINE_FILE_LOCATION") 30 | 31 | if not project: 32 | raise ValueError("Environment variable PROJECT is not set.") 33 | if not region: 34 | raise ValueError("Environment variable REGION is not set.") 35 | if not gcs_pipeline_file_location: 36 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.") 37 | 38 | storage_client = storage.Client() 39 | 40 | if not gcs_pipeline_file_location: 41 | raise ValueError("Environment variable GCS_PIPELINE_FILE_LOCATION is not set.") 42 | 43 | path_parts = gcs_pipeline_file_location.replace("gs://", "").split("/") 44 | bucket_name = path_parts[0] 45 | blob_name = "/".join(path_parts[1:]) 46 | 47 | bucket = storage_client.bucket(bucket_name) 48 | blob = storage.Blob(bucket=bucket, name=blob_name) 49 | 50 | if not blob.exists(storage_client): 51 | raise ValueError(f"{gcs_pipeline_file_location} does not exist.") 52 | 53 | data = base64.b64decode(event["data"]).decode("utf-8") 54 | logging.info(f"Event data: {data}") 55 | 56 | parameter_values = json.loads(data) 57 | 58 | api_client = AIPlatformClient(project_id=project, region=region) 59 | 60 | response = api_client.create_run_from_job_spec( 61 | job_spec_path=gcs_pipeline_file_location, parameter_values=parameter_values 62 | ) 63 | 64 | logging.info(response) 65 | -------------------------------------------------------------------------------- /src/pipeline_triggering/requirements.txt: -------------------------------------------------------------------------------- 1 | kfp==1.6.2 2 | 
google-cloud-aiplatform 3 | google-cloud-storage -------------------------------------------------------------------------------- /src/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/preprocessing/__init__.py -------------------------------------------------------------------------------- /src/preprocessing/etl.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Data preprocessing pipelines.""" 15 | 16 | import os 17 | 18 | import tensorflow_transform as tft 19 | import tensorflow_data_validation as tfdv 20 | import apache_beam as beam 21 | from apache_beam.io.gcp.datastore.v1new.datastoreio import WriteToDatastore 22 | import tensorflow_transform.beam as tft_beam 23 | from tensorflow_transform.tf_metadata import dataset_metadata 24 | from tensorflow_transform.tf_metadata import schema_utils 25 | 26 | 27 | from src.preprocessing import transformations 28 | 29 | RAW_SCHEMA_LOCATION = "src/raw_schema/schema.pbtxt" 30 | 31 | 32 | def parse_bq_record(bq_record): 33 | """Parses a bq_record to a dictionary.""" 34 | output = {} 35 | for key in bq_record: 36 | output[key] = [bq_record[key]] 37 | return output 38 | 39 | 40 | def split_dataset(bq_row, num_partitions, ratio): 41 | """Returns a partition number for a given bq_row.""" 42 | import json 43 | 44 | assert num_partitions == len(ratio) 45 | bucket = sum(map(ord, json.dumps(bq_row))) % sum(ratio) 46 | total = 0 47 | for i, part in enumerate(ratio): 48 | total += part 49 | if bucket < total: 50 | return i 51 | return len(ratio) - 1 52 | 53 | 54 | def run_transform_pipeline(args): 55 | """Runs a Beam pipeline to preprocess the data using TensorFlow Transform.""" 56 | 57 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) 58 | 59 | raw_data_query = args["raw_data_query"] 60 | write_raw_data = args["write_raw_data"] 61 | exported_data_prefix = args["exported_data_prefix"] 62 | transformed_data_prefix = args["transformed_data_prefix"] 63 | transform_artifact_dir = args["transform_artifact_dir"] 64 | temp_location = args["temp_location"] 65 | project = args["project"] 66 | 67 | source_raw_schema = tfdv.load_schema_text(RAW_SCHEMA_LOCATION) 68 | raw_feature_spec = schema_utils.schema_as_feature_spec( 69 | source_raw_schema 70 | ).feature_spec 71 | 72 | raw_metadata = dataset_metadata.DatasetMetadata( 73 | schema_utils.schema_from_feature_spec(raw_feature_spec) 74 | ) 75 | 76 | with beam.Pipeline(options=pipeline_options) as pipeline: 77 | with tft_beam.Context(temp_location): 78 | 79 | # Read raw BigQuery data. 
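            # Note: each BigQuery row is parsed into a TFT-friendly dict (every value
            # wrapped in a single-element list) and then deterministically partitioned
            # into train/eval splits with an 8:2 ratio by hashing the serialized row
            # in split_dataset, so a given row always lands in the same split across runs.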
80 | raw_train_data, raw_eval_data = ( 81 | pipeline 82 | | "Read Raw Data" 83 | >> beam.io.ReadFromBigQuery( 84 | query=raw_data_query, 85 | project=project, 86 | use_standard_sql=True, 87 | ) 88 | | "Parse Data" >> beam.Map(parse_bq_record) 89 | | "Split" >> beam.Partition(split_dataset, 2, ratio=[8, 2]) 90 | ) 91 | 92 | # Create a train_dataset from the data and schema. 93 | raw_train_dataset = (raw_train_data, raw_metadata) 94 | 95 | # Analyze and transform raw_train_dataset to produced transformed_train_dataset and transform_fn. 96 | transformed_train_dataset, transform_fn = ( 97 | raw_train_dataset 98 | | "Analyze & Transform" 99 | >> tft_beam.AnalyzeAndTransformDataset(transformations.preprocessing_fn) 100 | ) 101 | 102 | # Get data and schema separately from the transformed_dataset. 103 | transformed_train_data, transformed_metadata = transformed_train_dataset 104 | 105 | # write transformed train data. 106 | _ = ( 107 | transformed_train_data 108 | | "Write Transformed Train Data" 109 | >> beam.io.tfrecordio.WriteToTFRecord( 110 | file_path_prefix=os.path.join( 111 | transformed_data_prefix, "train/data" 112 | ), 113 | file_name_suffix=".gz", 114 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema), 115 | ) 116 | ) 117 | 118 | # Create a eval_dataset from the data and schema. 119 | raw_eval_dataset = (raw_eval_data, raw_metadata) 120 | 121 | # Transform raw_eval_dataset to produced transformed_eval_dataset using transform_fn. 122 | transformed_eval_dataset = ( 123 | raw_eval_dataset, 124 | transform_fn, 125 | ) | "Transform" >> tft_beam.TransformDataset() 126 | 127 | # Get data from the transformed_eval_dataset. 128 | transformed_eval_data, _ = transformed_eval_dataset 129 | 130 | # write transformed train data. 131 | _ = ( 132 | transformed_eval_data 133 | | "Write Transformed Eval Data" 134 | >> beam.io.tfrecordio.WriteToTFRecord( 135 | file_path_prefix=os.path.join(transformed_data_prefix, "eval/data"), 136 | file_name_suffix=".gz", 137 | coder=tft.coders.ExampleProtoCoder(transformed_metadata.schema), 138 | ) 139 | ) 140 | 141 | # Write transform_fn. 142 | _ = transform_fn | "Write Transform Artifacts" >> tft_beam.WriteTransformFn( 143 | transform_artifact_dir 144 | ) 145 | 146 | if write_raw_data: 147 | # write raw eval data. 148 | _ = ( 149 | raw_eval_data 150 | | "Write Raw Eval Data" 151 | >> beam.io.tfrecordio.WriteToTFRecord( 152 | file_path_prefix=os.path.join(exported_data_prefix, "data"), 153 | file_name_suffix=".tfrecord", 154 | coder=tft.coders.ExampleProtoCoder(raw_metadata.schema), 155 | ) 156 | ) 157 | 158 | 159 | def convert_to_jsonl(bq_record): 160 | """Converts bq_record to a jsonl formatted text.""" 161 | import json 162 | 163 | output = {} 164 | for key in bq_record: 165 | output[key] = [bq_record[key]] 166 | return json.dumps(output) 167 | 168 | 169 | def run_extract_pipeline(args): 170 | """Runs a Beam pipeline to extract data from BigQuery as JSONL files.""" 171 | 172 | pipeline_options = beam.pipeline.PipelineOptions(flags=[], **args) 173 | 174 | sql_query = args["sql_query"] 175 | exported_data_prefix = args["exported_data_prefix"] 176 | temporary_dir = args["temporary_dir"] 177 | gcs_location = args["gcs_location"] 178 | project = args["project"] 179 | 180 | with beam.Pipeline(options=pipeline_options) as pipeline: 181 | with tft_beam.Context(temporary_dir): 182 | 183 | # Read BigQuery data. 
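            # Note: each row is converted to one JSON object per line (values wrapped
            # in lists) and written out as .jsonl files, presumably as input for
            # downstream batch prediction.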
184 | raw_data = ( 185 | pipeline 186 | | "Read Data" 187 | >> beam.io.ReadFromBigQuery( 188 | query=sql_query, 189 | project=project, 190 | use_standard_sql=True, 191 | gcs_location=gcs_location, 192 | ) 193 | | "Parse Data" >> beam.Map(convert_to_jsonl) 194 | ) 195 | 196 | # Write raw data to GCS as JSONL files. 197 | _ = raw_data | "Write Data" >> beam.io.WriteToText( 198 | file_path_prefix=exported_data_prefix, file_name_suffix=".jsonl" 199 | ) 200 | 201 | 202 | def parse_prediction_results(jsonl): 203 | """Parses JSONL prediction results to a dictionary.""" 204 | import uuid 205 | import json 206 | 207 | prediction_results = json.loads(jsonl)["prediction"] 208 | prediction_id = str(uuid.uuid4()) 209 | scores = prediction_results["scores"] 210 | classes = prediction_results["classes"] 211 | 212 | return {"prediction_id": prediction_id, "scores": scores, "classes": classes} 213 | 214 | 215 | def create_datastore_entity(prediction_response, kind): 216 | """Creates a Datastore entity.""" 217 | 218 | from apache_beam.io.gcp.datastore.v1new.types import Entity 219 | from apache_beam.io.gcp.datastore.v1new.types import Key 220 | 221 | user_id = prediction_response.pop("prediction_id") 222 | key = Key([kind, user_id]) 223 | prediction_entity = Entity(key) 224 | prediction_entity.set_properties(prediction_response) 225 | return prediction_entity 226 | 227 | 228 | def run_store_predictions_pipeline(args): 229 | """Runs a Beam pipeline to store JSONL data to Datastore.""" 230 | 231 | project = args["project"] 232 | datastore_kind = args["datastore_kind"] 233 | prediction_results_uri = args["prediction_results_uri"] 234 | 235 | pipeline_options = beam.options.pipeline_options.PipelineOptions(args) 236 | with beam.Pipeline(options=pipeline_options) as pipeline: 237 | _ = ( 238 | pipeline 239 | | "ReadFromJSONL" >> beam.io.ReadFromText(prediction_results_uri) 240 | | "ParsePredictionResults" >> beam.Map(parse_prediction_results) 241 | | "ConvertToDatastoreEntity" 242 | >> beam.Map(create_datastore_entity, datastore_kind) 243 | | "WriteToDatastore" >> WriteToDatastore(project=project) 244 | ) 245 | -------------------------------------------------------------------------------- /src/preprocessing/transformations.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TensorFlow Transform preprocessing function.""" 15 | 16 | import tensorflow as tf 17 | import tensorflow_transform as tft 18 | 19 | from src.common import features 20 | 21 | 22 | def preprocessing_fn(inputs): 23 | """tf.transform's callback function for preprocessing inputs. 24 | 25 | Args: 26 | inputs: map from feature keys to raw not-yet-transformed features. 27 | Returns: 28 | Map from string feature key to transformed feature operations. 
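      For example, a numerical input such as `trip_miles` is emitted as
      `trip_miles_xf` (z-score scaled), a categorical input such as `payment_type`
      is emitted as `payment_type_xf` (an integer vocabulary index with one
      out-of-vocabulary bucket), and the target `tip_bin` is passed through unchanged.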
29 | """ 30 | 31 | outputs = {} 32 | 33 | for key in features.FEATURE_NAMES: 34 | if key in features.NUMERICAL_FEATURE_NAMES: 35 | outputs[features.transformed_name(key)] = tft.scale_to_z_score(inputs[key]) 36 | 37 | elif key in features.categorical_feature_names(): 38 | outputs[features.transformed_name(key)] = tft.compute_and_apply_vocabulary( 39 | inputs[key], 40 | num_oov_buckets=1, 41 | vocab_filename=key, 42 | ) 43 | 44 | outputs[features.TARGET_FEATURE_NAME] = inputs[features.TARGET_FEATURE_NAME] 45 | 46 | for key in outputs: 47 | outputs[key] = tf.squeeze(outputs[key], -1) 48 | 49 | return outputs 50 | -------------------------------------------------------------------------------- /src/raw_schema/schema.pbtxt: -------------------------------------------------------------------------------- 1 | feature { 2 | name: "trip_month" 3 | type: INT 4 | presence { 5 | min_fraction: 1.0 6 | min_count: 1 7 | } 8 | shape { 9 | dim { 10 | size: 1 11 | } 12 | } 13 | } 14 | feature { 15 | name: "trip_day" 16 | type: INT 17 | presence { 18 | min_fraction: 1.0 19 | min_count: 1 20 | } 21 | shape { 22 | dim { 23 | size: 1 24 | } 25 | } 26 | } 27 | feature { 28 | name: "trip_day_of_week" 29 | type: INT 30 | presence { 31 | min_fraction: 1.0 32 | min_count: 1 33 | } 34 | shape { 35 | dim { 36 | size: 1 37 | } 38 | } 39 | } 40 | feature { 41 | name: "trip_hour" 42 | type: INT 43 | presence { 44 | min_fraction: 1.0 45 | min_count: 1 46 | } 47 | shape { 48 | dim { 49 | size: 1 50 | } 51 | } 52 | } 53 | feature { 54 | name: "trip_seconds" 55 | type: INT 56 | presence { 57 | min_fraction: 1.0 58 | min_count: 1 59 | } 60 | shape { 61 | dim { 62 | size: 1 63 | } 64 | } 65 | } 66 | feature { 67 | name: "trip_miles" 68 | type: FLOAT 69 | presence { 70 | min_fraction: 1.0 71 | min_count: 1 72 | } 73 | shape { 74 | dim { 75 | size: 1 76 | } 77 | } 78 | } 79 | feature { 80 | name: "payment_type" 81 | type: BYTES 82 | domain: "payment_type" 83 | presence { 84 | min_fraction: 1.0 85 | min_count: 1 86 | } 87 | shape { 88 | dim { 89 | size: 1 90 | } 91 | } 92 | } 93 | feature { 94 | name: "pickup_grid" 95 | type: BYTES 96 | domain: "pickup_grid" 97 | presence { 98 | min_fraction: 1.0 99 | min_count: 1 100 | } 101 | shape { 102 | dim { 103 | size: 1 104 | } 105 | } 106 | } 107 | feature { 108 | name: "dropoff_grid" 109 | type: BYTES 110 | domain: "dropoff_grid" 111 | presence { 112 | min_fraction: 1.0 113 | min_count: 1 114 | } 115 | shape { 116 | dim { 117 | size: 1 118 | } 119 | } 120 | } 121 | feature { 122 | name: "euclidean" 123 | type: FLOAT 124 | presence { 125 | min_fraction: 1.0 126 | min_count: 1 127 | } 128 | shape { 129 | dim { 130 | size: 1 131 | } 132 | } 133 | } 134 | feature { 135 | name: "loc_cross" 136 | type: BYTES 137 | presence { 138 | min_fraction: 1.0 139 | min_count: 1 140 | } 141 | shape { 142 | dim { 143 | size: 1 144 | } 145 | } 146 | } 147 | feature { 148 | name: "tip_bin" 149 | type: INT 150 | bool_domain { 151 | } 152 | presence { 153 | min_fraction: 1.0 154 | min_count: 1 155 | } 156 | shape { 157 | dim { 158 | size: 1 159 | } 160 | } 161 | } 162 | string_domain { 163 | name: "payment_type" 164 | value: "Cash" 165 | value: "Credit Card" 166 | value: "Dispute" 167 | value: "Mobile" 168 | value: "No Charge" 169 | value: "Prcard" 170 | value: "Prepaid" 171 | value: "Unknown" 172 | } 173 | string_domain { 174 | name: "pickup_grid" 175 | value: "POINT(-87.5 41.7)" 176 | value: "POINT(-87.6 41.7)" 177 | value: "POINT(-87.6 41.8)" 178 | value: "POINT(-87.6 41.9)" 179 | value: "POINT(-87.6 42)" 
180 | value: "POINT(-87.7 41.7)" 181 | value: "POINT(-87.7 41.8)" 182 | value: "POINT(-87.7 41.9)" 183 | value: "POINT(-87.7 42)" 184 | value: "POINT(-87.8 41.8)" 185 | value: "POINT(-87.8 41.9)" 186 | value: "POINT(-87.8 42)" 187 | value: "POINT(-87.9 42)" 188 | } 189 | string_domain { 190 | name: "dropoff_grid" 191 | value: "POINT(-87.5 41.7)" 192 | value: "POINT(-87.6 41.7)" 193 | value: "POINT(-87.6 41.8)" 194 | value: "POINT(-87.6 41.9)" 195 | value: "POINT(-87.6 42)" 196 | value: "POINT(-87.7 41.7)" 197 | value: "POINT(-87.7 41.8)" 198 | value: "POINT(-87.7 41.9)" 199 | value: "POINT(-87.7 42)" 200 | value: "POINT(-87.8 41.8)" 201 | value: "POINT(-87.8 41.9)" 202 | value: "POINT(-87.8 42)" 203 | value: "POINT(-87.9 42)" 204 | } 205 | -------------------------------------------------------------------------------- /src/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/tests/__init__.py -------------------------------------------------------------------------------- /src/tests/datasource_utils_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Test utilities for generating BigQuery data querying scirpts.""" 15 | 16 | import sys 17 | import os 18 | import logging 19 | from google.cloud import bigquery 20 | 21 | from src.common import datasource_utils 22 | 23 | root = logging.getLogger() 24 | root.setLevel(logging.INFO) 25 | handler = logging.StreamHandler(sys.stdout) 26 | handler.setLevel(logging.INFO) 27 | root.addHandler(handler) 28 | 29 | LIMIT = 100 30 | 31 | TARGET_COLUMN = "tip_bin" 32 | 33 | EXPECTED_TRAINING_COLUMNS = [ 34 | "trip_month", 35 | "trip_day", 36 | "trip_day_of_week", 37 | "trip_hour", 38 | "trip_seconds", 39 | "trip_miles", 40 | "payment_type", 41 | "pickup_grid", 42 | "dropoff_grid", 43 | "euclidean", 44 | "loc_cross", 45 | "tip_bin", 46 | ] 47 | 48 | 49 | MISSING = { 50 | "trip_month": -1, 51 | "trip_day": -1, 52 | "trip_day_of_week": -1, 53 | "trip_hour": -1, 54 | "trip_seconds": -1, 55 | "trip_miles": -1, 56 | "payment_type": "NA", 57 | "pickup_grid": "NA", 58 | "dropoff_grid": "NA", 59 | "euclidean": -1, 60 | "loc_cross": "NA", 61 | } 62 | 63 | 64 | def test_training_query(): 65 | 66 | project = os.getenv("PROJECT") 67 | location = os.getenv("BQ_LOCATION") 68 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 69 | 70 | assert project, "Environment variable PROJECT is None!" 71 | assert location, "Environment variable BQ_LOCATION is None!" 72 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 
73 | 74 | logging.info(f"Dataset: {dataset_display_name}") 75 | 76 | query = datasource_utils.create_bq_source_query( 77 | dataset_display_name=dataset_display_name, 78 | missing=MISSING, 79 | label_column=TARGET_COLUMN, 80 | ML_use="UNASSIGNED", 81 | limit=LIMIT, 82 | ) 83 | 84 | bq_client = bigquery.Client(project=project, location=location) 85 | df = bq_client.query(query).to_dataframe() 86 | columns = set(df.columns) 87 | assert columns == set(EXPECTED_TRAINING_COLUMNS) 88 | assert df.shape == (LIMIT, 12) 89 | 90 | 91 | def test_serving_query(): 92 | 93 | project = os.getenv("PROJECT") 94 | location = os.getenv("BQ_LOCATION") 95 | bq_dataset_name = os.getenv("BQ_DATASET_NAME") 96 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 97 | 98 | assert project, "Environment variable PROJECT is None!" 99 | assert location, "Environment variable BQ_LOCATION is None!" 100 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 101 | 102 | logging.info(f"Dataset: {dataset_display_name}") 103 | 104 | query = datasource_utils.create_bq_source_query( 105 | dataset_display_name=dataset_display_name, 106 | missing=MISSING, 107 | ML_use=None, 108 | limit=LIMIT, 109 | ) 110 | 111 | bq_client = bigquery.Client(project=project, location=location) 112 | df = bq_client.query(query).to_dataframe() 113 | columns = set(df.columns) 114 | expected_serving_columns = EXPECTED_TRAINING_COLUMNS 115 | expected_serving_columns.remove(TARGET_COLUMN) 116 | assert columns == set(expected_serving_columns) 117 | assert df.shape == (LIMIT, 11) 118 | -------------------------------------------------------------------------------- /src/tests/etl_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Test data processing.""" 15 | 16 | import sys 17 | import os 18 | import logging 19 | import tensorflow_transform as tft 20 | import tensorflow as tf 21 | from tensorflow.io import FixedLenFeature 22 | 23 | from src.preprocessing import etl 24 | from src.comm import datasource_utils 25 | 26 | root = logging.getLogger() 27 | root.setLevel(logging.INFO) 28 | handler = logging.StreamHandler(sys.stdout) 29 | handler.setLevel(logging.INFO) 30 | root.addHandler(handler) 31 | 32 | OUTPUT_DIR = "test_etl_output_dir" 33 | ML_USE = "UNASSIGNED" 34 | LIMIT = 100 35 | 36 | EXPECTED_FEATURE_SPEC = { 37 | "dropoff_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 38 | "euclidean_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 39 | "loc_cross_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 40 | "payment_type_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 41 | "pickup_grid_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 42 | "tip_bin": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 43 | "trip_day_of_week_xf": FixedLenFeature( 44 | shape=[], dtype=tf.int64, default_value=None 45 | ), 46 | "trip_day_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 47 | "trip_hour_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 48 | "trip_miles_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 49 | "trip_month_xf": FixedLenFeature(shape=[], dtype=tf.int64, default_value=None), 50 | "trip_seconds_xf": FixedLenFeature(shape=[], dtype=tf.float32, default_value=None), 51 | } 52 | 53 | 54 | def test_transform_pipeline(): 55 | 56 | project = os.getenv("PROJECT") 57 | region = os.getenv("REGION") 58 | bucket = os.getenv("BUCKET") 59 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 60 | 61 | assert project, "Environment variable PROJECT is None!" 62 | assert region, "Environment variable REGION is None!" 63 | assert bucket, "Environment variable BUCKET is None!" 64 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 
65 | 66 | os.mkdir(OUTPUT_DIR) 67 | 68 | exported_data_dir = os.path.join(OUTPUT_DIR, "exported_data") 69 | transformed_data_dir = os.path.join(OUTPUT_DIR, "transformed_data") 70 | transform_artifacts_dir = os.path.join(OUTPUT_DIR, "transform_artifacts") 71 | temporary_dir = os.path.join(OUTPUT_DIR, "tmp") 72 | 73 | raw_data_query = datasource_utils.get_training_source_query( 74 | project=project, 75 | region=region, 76 | dataset_display_name=dataset_display_name, 77 | ml_use=ML_USE, 78 | limit=LIMIT, 79 | ) 80 | 81 | args = { 82 | "runner": "DirectRunner", 83 | "raw_data_query": raw_data_query, 84 | "write_raw_data": False, 85 | "exported_data_prefix": exported_data_dir, 86 | "transformed_data_prefix": transformed_data_dir, 87 | "transform_artifact_dir": transform_artifacts_dir, 88 | "temp_location": temporary_dir, 89 | "gcs_location": f"gs://{bucket}/bq_tmp", 90 | "project": project, 91 | } 92 | 93 | logging.info(f"Transform pipeline args: {args}") 94 | etl.run_transform_pipeline(args) 95 | logging.info("Transform pipeline finished.") 96 | 97 | tft_output = tft.TFTransformOutput(transform_artifacts_dir) 98 | transform_feature_spec = tft_output.transformed_feature_spec() 99 | assert transform_feature_spec == EXPECTED_FEATURE_SPEC 100 | -------------------------------------------------------------------------------- /src/tests/model_deployment_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | """Test an uploaded model to Vertex AI.""" 15 | 16 | import os 17 | import logging 18 | import tensorflow as tf 19 | 20 | test_instance = { 21 | "dropoff_grid": ["POINT(-87.6 41.9)"], 22 | "euclidean": [2064.2696], 23 | "loc_cross": [""], 24 | "payment_type": ["Credit Card"], 25 | "pickup_grid": ["POINT(-87.6 41.9)"], 26 | "trip_miles": [1.37], 27 | "trip_day": [12], 28 | "trip_hour": [16], 29 | "trip_month": [2], 30 | "trip_day_of_week": [4], 31 | "trip_seconds": [555], 32 | } 33 | 34 | SERVING_DEFAULT_SIGNATURE_NAME = "serving_default" 35 | 36 | from google.cloud import aiplatform as vertex_ai 37 | 38 | 39 | def test_model_artifact(): 40 | 41 | feature_types = { 42 | "dropoff_grid": tf.dtypes.string, 43 | "euclidean": tf.dtypes.float32, 44 | "loc_cross": tf.dtypes.string, 45 | "payment_type": tf.dtypes.string, 46 | "pickup_grid": tf.dtypes.string, 47 | "trip_miles": tf.dtypes.float32, 48 | "trip_day": tf.dtypes.int64, 49 | "trip_hour": tf.dtypes.int64, 50 | "trip_month": tf.dtypes.int64, 51 | "trip_day_of_week": tf.dtypes.int64, 52 | "trip_seconds": tf.dtypes.int64, 53 | } 54 | 55 | new_test_instance = dict() 56 | for key in test_instance: 57 | new_test_instance[key] = tf.constant( 58 | [test_instance[key]], dtype=feature_types[key] 59 | ) 60 | 61 | print(new_test_instance) 62 | 63 | project = os.getenv("PROJECT") 64 | region = os.getenv("REGION") 65 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 66 | 67 | assert project, "Environment variable PROJECT is None!" 68 | assert region, "Environment variable REGION is None!" 69 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 70 | 71 | vertex_ai.init( 72 | project=project, 73 | location=region, 74 | ) 75 | 76 | models = vertex_ai.Model.list( 77 | filter=f"display_name={model_display_name}", order_by="update_time" 78 | ) 79 | 80 | assert models, f"No model with display name {model_display_name} exists!" 81 | 82 | model = models[-1] 83 | artifact_uri = model.gca_resource.artifact_uri 84 | logging.info(f"Model artifact uri:{artifact_uri}") 85 | assert tf.io.gfile.exists( 86 | artifact_uri 87 | ), f"Model artifact uri {artifact_uri} does not exist!" 88 | 89 | saved_model = tf.saved_model.load(artifact_uri) 90 | logging.info("Model loaded successfully.") 91 | 92 | assert ( 93 | SERVING_DEFAULT_SIGNATURE_NAME in saved_model.signatures 94 | ), f"{SERVING_DEFAULT_SIGNATURE_NAME} not in model signatures!" 95 | 96 | prediction_fn = saved_model.signatures["serving_default"] 97 | predictions = prediction_fn(**new_test_instance) 98 | logging.info("Model produced predictions.") 99 | 100 | keys = ["classes", "scores"] 101 | for key in keys: 102 | assert key in predictions, f"{key} in prediction outputs!" 103 | 104 | assert predictions["classes"].shape == ( 105 | 1, 106 | 2, 107 | ), f"Invalid output classes shape: {predictions['classes'].shape}!" 108 | assert predictions["scores"].shape == ( 109 | 1, 110 | 2, 111 | ), f"Invalid output scores shape: {predictions['scores'].shape}!" 112 | logging.info(f"Prediction output: {predictions}") 113 | 114 | 115 | def test_model_endpoint(): 116 | 117 | project = os.getenv("PROJECT") 118 | region = os.getenv("REGION") 119 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 120 | endpoint_display_name = os.getenv("ENDPOINT_DISPLAY_NAME") 121 | 122 | assert project, "Environment variable PROJECT is None!" 123 | assert region, "Environment variable REGION is None!" 124 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 
125 | assert endpoint_display_name, "Environment variable ENDPOINT_DISPLAY_NAME is None!" 126 | 127 | endpoints = vertex_ai.Endpoint.list( 128 | filter=f"display_name={endpoint_display_name}", order_by="update_time" 129 | ) 130 | assert ( 131 | endpoints 132 | ), f"Endpoint with display name {endpoint_display_name} does not exist! in region {region}" 133 | 134 | endpoint = endpoints[-1] 135 | logging.info(f"Calling endpoint: {endpoint}.") 136 | 137 | prediction = endpoint.predict([test_instance]).predictions[0] 138 | 139 | keys = ["classes", "scores"] 140 | for key in keys: 141 | assert key in prediction, f"{key} in prediction outputs!" 142 | 143 | assert ( 144 | len(prediction["classes"]) == 2 145 | ), f"Invalid number of output classes: {len(prediction['classes'])}!" 146 | assert ( 147 | len(prediction["scores"]) == 2 148 | ), f"Invalid number output scores: {len(prediction['scores'])}!" 149 | 150 | logging.info(f"Prediction output: {prediction}") 151 | -------------------------------------------------------------------------------- /src/tests/model_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Test model functions.""" 15 | 16 | import sys 17 | import os 18 | import logging 19 | import tensorflow_transform as tft 20 | import tensorflow as tf 21 | from tensorflow.io import FixedLenFeature 22 | 23 | from src.common import features 24 | from src.model_training import model, defaults 25 | 26 | root = logging.getLogger() 27 | root.setLevel(logging.INFO) 28 | handler = logging.StreamHandler(sys.stdout) 29 | handler.setLevel(logging.INFO) 30 | root.addHandler(handler) 31 | 32 | EXPECTED_HYPERPARAMS_KEYS = [ 33 | "hidden_units", 34 | "learning_rate", 35 | "batch_size", 36 | "num_epochs", 37 | ] 38 | 39 | 40 | def test_hyperparams_defaults(): 41 | hyperparams = {"hidden_units": [64, 32]} 42 | 43 | hyperparams = defaults.update_hyperparams(hyperparams) 44 | assert set(hyperparams.keys()) == set(EXPECTED_HYPERPARAMS_KEYS) 45 | 46 | 47 | def test_create_binary_classifier(): 48 | 49 | hyperparams = hyperparams = defaults.update_hyperparams(dict()) 50 | 51 | model_inputs = { 52 | "dropoff_grid_xf": tf.convert_to_tensor([0, 0, 0]), 53 | "euclidean_xf": tf.convert_to_tensor([-0.9066112, -0.9066112, -0.9066112]), 54 | "loc_cross_xf": tf.convert_to_tensor([0, 0, 0]), 55 | "payment_type_xf": tf.convert_to_tensor([1, 0, 0]), 56 | "pickup_grid_xf": tf.convert_to_tensor([0, 0, 0]), 57 | "trip_day_of_week_xf": tf.convert_to_tensor([5, 4, 4]), 58 | "trip_day_xf": tf.convert_to_tensor([26, 24, 1]), 59 | "trip_hour_xf": tf.convert_to_tensor([0, 4, 2]), 60 | "trip_miles_xf": tf.convert_to_tensor([5.9717827, -0.7121308, -0.7601589]), 61 | "trip_month_xf": tf.convert_to_tensor([4, 3, 4]), 62 | "trip_seconds_xf": tf.convert_to_tensor([4.9029775, -0.34146854, -0.34479955]), 63 | } 64 | 65 | feature_vocab_sizes = { 66 | feature_name: 100 for feature_name in features.categorical_feature_names() 67 | } 68 | classifier = model._create_binary_classifier(feature_vocab_sizes, hyperparams) 69 | model_outputs = classifier(model_inputs) # .numpy() 70 | assert model_outputs.shape == (3, 1) 71 | assert model_outputs.dtype == "float32" 72 | -------------------------------------------------------------------------------- /src/tests/pipeline_deployment_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Test training pipeline using local runner.""" 15 | 16 | import sys 17 | import os 18 | from tfx.orchestration.local.local_dag_runner import LocalDagRunner 19 | import tensorflow as tf 20 | from ml_metadata.proto import metadata_store_pb2 21 | import logging 22 | 23 | from src.tfx_pipelines import config 24 | from src.tfx_pipelines import training_pipeline 25 | 26 | root = logging.getLogger() 27 | root.setLevel(logging.INFO) 28 | handler = logging.StreamHandler(sys.stdout) 29 | handler.setLevel(logging.INFO) 30 | root.addHandler(handler) 31 | 32 | MLMD_SQLLITE = "mlmd.sqllite" 33 | NUM_EPOCHS = 1 34 | BATCH_SIZE = 512 35 | LEARNING_RATE = 0.001 36 | HIDDEN_UNITS = "128,128" 37 | 38 | 39 | def test_e2e_pipeline(): 40 | 41 | project = os.getenv("PROJECT") 42 | region = os.getenv("REGION") 43 | model_display_name = os.getenv("MODEL_DISPLAY_NAME") 44 | dataset_display_name = os.getenv("DATASET_DISPLAY_NAME") 45 | gcs_location = os.getenv("GCS_LOCATION") 46 | model_registry = os.getenv("MODEL_REGISTRY_URI") 47 | upload_model = os.getenv("UPLOAD_MODEL") 48 | 49 | assert project, "Environment variable PROJECT is None!" 50 | assert region, "Environment variable REGION is None!" 51 | assert dataset_display_name, "Environment variable DATASET_DISPLAY_NAME is None!" 52 | assert model_display_name, "Environment variable MODEL_DISPLAY_NAME is None!" 53 | assert gcs_location, "Environment variable GCS_LOCATION is None!" 54 | assert model_registry, "Environment variable MODEL_REGISTRY_URI is None!" 55 | 56 | logging.info(f"upload_model: {upload_model}") 57 | if tf.io.gfile.exists(gcs_location): 58 | tf.io.gfile.rmtree(gcs_location) 59 | logging.info(f"Pipeline e2e test artifacts stored in: {gcs_location}") 60 | 61 | if tf.io.gfile.exists(MLMD_SQLLITE): 62 | tf.io.gfile.remove(MLMD_SQLLITE) 63 | 64 | metadata_connection_config = metadata_store_pb2.ConnectionConfig() 65 | metadata_connection_config.sqlite.filename_uri = MLMD_SQLLITE 66 | metadata_connection_config.sqlite.connection_mode = 3 67 | logging.info("ML metadata store is ready.") 68 | 69 | pipeline_root = os.path.join( 70 | config.ARTIFACT_STORE_URI, 71 | config.PIPELINE_NAME, 72 | ) 73 | 74 | runner = LocalDagRunner() 75 | 76 | pipeline = training_pipeline.create_pipeline( 77 | pipeline_root=pipeline_root, 78 | num_epochs=NUM_EPOCHS, 79 | batch_size=BATCH_SIZE, 80 | learning_rate=LEARNING_RATE, 81 | hidden_units=HIDDEN_UNITS, 82 | metadata_connection_config=metadata_connection_config, 83 | ) 84 | 85 | runner.run(pipeline) 86 | 87 | logging.info(f"Model output: {os.path.join(model_registry, model_display_name)}") 88 | assert tf.io.gfile.exists(os.path.join(model_registry, model_display_name)) 89 | -------------------------------------------------------------------------------- /src/tfx_pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ksalama/ucaip-labs/5a5a5703274b407b9492a888ce9657639b3a3dbf/src/tfx_pipelines/__init__.py -------------------------------------------------------------------------------- /src/tfx_pipelines/components.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX Custom Python Components.""" 15 | 16 | 17 | import sys 18 | import os 19 | import json 20 | import logging 21 | import tensorflow as tf 22 | 23 | from tfx.types import artifact_utils 24 | from tfx.utils import io_utils 25 | from tfx.dsl.component.experimental.decorators import component 26 | from tfx.dsl.component.experimental.annotations import ( 27 | InputArtifact, 28 | OutputArtifact, 29 | Parameter, 30 | ) 31 | from tfx.types.standard_artifacts import HyperParameters 32 | from tfx.types.experimental.simple_artifacts import File as UploadedModel 33 | from tfx.types.experimental.simple_artifacts import Dataset 34 | 35 | from google.cloud import aiplatform as vertex_ai 36 | 37 | SCRIPT_DIR = os.path.dirname( 38 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 39 | ) 40 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 41 | 42 | from src.preprocessing import etl 43 | 44 | 45 | HYPERPARAM_FILENAME = "hyperparameters.json" 46 | SERVING_DATA_PREFIX = "serving-data-" 47 | PREDICTION_RESULTS_PREFIX = "prediction.results-*" 48 | 49 | 50 | @component 51 | def hyperparameters_gen( 52 | num_epochs: Parameter[int], 53 | batch_size: Parameter[int], 54 | learning_rate: Parameter[float], 55 | hidden_units: Parameter[str], 56 | hyperparameters: OutputArtifact[HyperParameters], 57 | ): 58 | """A TFX custom-Python-function component for receiving hyperparameters.""" 59 | 60 | hp_dict = dict() 61 | hp_dict["num_epochs"] = num_epochs 62 | hp_dict["batch_size"] = batch_size 63 | hp_dict["learning_rate"] = learning_rate 64 | hp_dict["hidden_units"] = [int(units) for units in hidden_units.split(",")] 65 | logging.info(f"Hyperparameters: {hp_dict}") 66 | 67 | hyperparams_uri = os.path.join( 68 | artifact_utils.get_single_uri([hyperparameters]), HYPERPARAM_FILENAME 69 | ) 70 | io_utils.write_string_file(hyperparams_uri, json.dumps(hp_dict)) 71 | logging.info(f"Hyperparameters are written to: {hyperparams_uri}") 72 | 73 | 74 | @component 75 | def vertex_model_uploader( 76 | project: Parameter[str], 77 | region: Parameter[str], 78 | model_display_name: Parameter[str], 79 | pushed_model_location: Parameter[str], 80 | serving_image_uri: Parameter[str], 81 | explanation_config: Parameter[str], 82 | uploaded_model: OutputArtifact[UploadedModel], 83 | ): 84 | """A TFX custom-Python-function component to upload the model to Vertex.""" 85 | 86 | vertex_ai.init(project=project, location=region) 87 | 88 | pushed_model_dir = os.path.join( 89 | pushed_model_location, tf.io.gfile.listdir(pushed_model_location)[-1] 90 | ) 91 | 92 | logging.info(f"Model registry location: {pushed_model_dir}") 93 | 94 | try: 95 | explanation_config = json.loads(explanation_config)  # The config is passed in as a JSON string. 96 | explanation_metadata = vertex_ai.explain.ExplanationMetadata( 97 | inputs=explanation_config["inputs"], outputs=explanation_config["outputs"] 98 | ) 99 | explanation_parameters = vertex_ai.explain.ExplanationParameters( 100 | explanation_config["params"] 101 | ) 102 | except Exception: 103 | explanation_metadata = None 104 | explanation_parameters = None 105 | 106 | vertex_model = vertex_ai.Model.upload( 107 | 
display_name=model_display_name, 108 | artifact_uri=pushed_model_dir, 109 | serving_container_image_uri=serving_image_uri, 110 | parameters_schema_uri=None, 111 | instance_schema_uri=None, 112 | explanation_metadata=explanation_metadata, 113 | explanation_parameters=explanation_parameters, 114 | ) 115 | 116 | model_uri = vertex_model.gca_resource.name 117 | logging.info(f"Model uploaded to Vertex AI: {model_uri}") 118 | uploaded_model.set_string_custom_property("model_uri", model_uri) 119 | 120 | 121 | @component 122 | def bigquery_data_gen( 123 | sql_query: Parameter[str], 124 | output_data_format: Parameter[str], 125 | beam_args: Parameter[str], 126 | serving_dataset: OutputArtifact[Dataset], 127 | ): 128 | """A TFX custom-Python-function component for extracting data from BigQuery.""" 129 | 130 | output_dir = os.path.join( 131 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX 132 | ) 133 | 134 | pipeline_args = json.loads(beam_args) 135 | pipeline_args["sql_query"] = sql_query 136 | pipeline_args["exported_data_prefix"] = output_dir 137 | pipeline_args["output_data_format"] = output_data_format 138 | 139 | logging.info("Data extraction started. Source query:") 140 | logging.info(f"{sql_query}") 141 | etl.run_extract_pipeline(pipeline_args) 142 | logging.info("Data extraction completed.") 143 | 144 | 145 | @component 146 | def vertex_batch_prediction( 147 | project: Parameter[str], 148 | region: Parameter[str], 149 | model_display_name: Parameter[str], 150 | instances_format: Parameter[str], 151 | predictions_format: Parameter[str], 152 | job_resources: Parameter[str], 153 | serving_dataset: InputArtifact[Dataset], 154 | prediction_results: OutputArtifact[Dataset], 155 | ): 156 | """A TFX custom-Python-function component to submit a Vertex batch prediction job.""" 157 | 158 | job_resources = json.loads(job_resources) 159 | gcs_source_pattern = ( 160 | os.path.join( 161 | artifact_utils.get_single_uri([serving_dataset]), SERVING_DATA_PREFIX 162 | ) 163 | + "*.jsonl" 164 | ) 165 | 166 | gcs_destination_prefix = artifact_utils.get_single_uri([prediction_results]) 167 | 168 | vertex_client = VertexClient(project, region) 169 | logging.info("Submitting Vertex AI batch prediction job...") 170 | batch_prediction_job = vertex_client.submit_batch_prediction_job( 171 | model_display_name=model_display_name, 172 | gcs_source_pattern=gcs_source_pattern, 173 | gcs_destination_prefix=gcs_destination_prefix, 174 | instances_format=instances_format, 175 | predictions_format=predictions_format, 176 | other_configurations=job_resources, 177 | ) 178 | logging.info("Batch prediction job completed.") 179 | prediction_results.set_string_custom_property( 180 | "batch_prediction_job", batch_prediction_job.gca_resource.name 181 | ) 182 | 183 | 184 | @component 185 | def datastore_prediction_writer( 186 | datastore_kind: Parameter[str], 187 | predictions_format: Parameter[str], 188 | beam_args: Parameter[str], 189 | prediction_results: InputArtifact[Dataset], 190 | ): 191 | """A TFX custom-Python-function component for writing prediction JSONL files to Datastore.""" 192 | 193 | prediction_results_dir = os.path.join( 194 | artifact_utils.get_single_uri([prediction_results]) 195 | ) 196 | prediction_results_dir = os.path.join( 197 | prediction_results_dir, tf.io.gfile.listdir(prediction_results_dir)[0] 198 | ) 199 | prediction_results_uri = os.path.join( 200 | prediction_results_dir, PREDICTION_RESULTS_PREFIX 201 | ) 202 | 203 | pipeline_args = json.loads(beam_args) 204 | 
pipeline_args["prediction_results_uri"] = prediction_results_uri 205 | pipeline_args["datastore_kind"] = datastore_kind 206 | pipeline_args["predictions_format"] = predictions_format 207 | 208 | logging.info(f"Storing predictions to Datastore kind: {datastore_kind}") 209 | etl.run_store_predictions_pipeline(pipeline_args) 210 | logging.info("Predictions are stored.") 211 | -------------------------------------------------------------------------------- /src/tfx_pipelines/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX pipeline configurations.""" 15 | 16 | import os 17 | 18 | PROJECT_ID = os.getenv("PROJECT_ID", "ksalama-cloudml") 19 | REGION = os.getenv("REGION", "us-central1") 20 | GCS_LOCATION = os.getenv("GCS_LOCATION", "gs://ksalama-cloudml-us/chicago-taxi-tips") 21 | 22 | ARTIFACT_STORE_URI = os.path.join(GCS_LOCATION, "tfx_artifacts") 23 | MODEL_REGISTRY_URI = os.getenv( 24 | "MODEL_REGISTRY_URI", 25 | os.path.join(GCS_LOCATION, "model_registry"), 26 | ) 27 | 28 | DATASET_DISPLAY_NAME = os.getenv("DATASET_DISPLAY_NAME", "chicago-taxi-tips") 29 | MODEL_DISPLAY_NAME = os.getenv( 30 | "MODEL_DISPLAY_NAME", f"{DATASET_DISPLAY_NAME}-classifier" 31 | ) 32 | PIPELINE_NAME = os.getenv("PIPELINE_NAME", f"{MODEL_DISPLAY_NAME}-train-pipeline") 33 | 34 | ML_USE_COLUMN = "ml_use" 35 | EXCLUDE_COLUMNS = ",".join(["trip_start_timestamp"]) 36 | TRAIN_LIMIT = os.getenv("TRAIN_LIMIT", "0") 37 | TEST_LIMIT = os.getenv("TEST_LIMIT", "0") 38 | SERVE_LIMIT = os.getenv("SERVE_LIMIT", "0") 39 | 40 | NUM_TRAIN_SPLITS = os.getenv("NUM_TRAIN_SPLITS", "4") 41 | NUM_EVAL_SPLITS = os.getenv("NUM_EVAL_SPLITS", "1") 42 | ACCURACY_THRESHOLD = os.getenv("ACCURACY_THRESHOLD", "0.8") 43 | 44 | USE_KFP_SA = os.getenv("USE_KFP_SA", "False") 45 | 46 | TFX_IMAGE_URI = os.getenv( 47 | "TFX_IMAGE_URI", f"gcr.io/{PROJECT_ID}/tfx-{DATASET_DISPLAY_NAME}:latest" 48 | ) 49 | 50 | BEAM_RUNNER = os.getenv("BEAM_RUNNER", "DirectRunner") 51 | BEAM_DIRECT_PIPELINE_ARGS = [ 52 | f"--project={PROJECT_ID}", 53 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}", 54 | ] 55 | BEAM_DATAFLOW_PIPELINE_ARGS = [ 56 | f"--project={PROJECT_ID}", 57 | f"--temp_location={os.path.join(GCS_LOCATION, 'temp')}", 58 | f"--region={REGION}", 59 | f"--runner={BEAM_RUNNER}", 60 | ] 61 | 62 | 63 | TRAINING_RUNNER = os.getenv("TRAINING_RUNNER", "local") 64 | AI_PLATFORM_TRAINING_ARGS = { 65 | "project": PROJECT_ID, 66 | "region": REGION, 67 | "masterConfig": {"imageUri": TFX_IMAGE_URI}, 68 | } 69 | 70 | 71 | SERVING_RUNTIME = os.getenv("SERVING_RUNTIME", "tf2-cpu.2-4") 72 | SERVING_IMAGE_URI = f"gcr.io/cloud-aiplatform/prediction/{SERVING_RUNTIME}:latest" 73 | 74 | BATCH_PREDICTION_BQ_DATASET_NAME = os.getenv( 75 | "BATCH_PREDICTION_BQ_DATASET_NAME", "playground_us" 76 | ) 77 | BATCH_PREDICTION_BQ_TABLE_NAME = os.getenv( 78 | "BATCH_PREDICTION_BQ_TABLE_NAME", "chicago_taxitrips_prep" 
79 | ) 80 | BATCH_PREDICTION_BEAM_ARGS = { 81 | "runner": f"{BEAM_RUNNER}", 82 | "temporary_dir": os.path.join(GCS_LOCATION, "temp"), 83 | "gcs_location": os.path.join(GCS_LOCATION, "temp"), 84 | "project": PROJECT_ID, 85 | "region": REGION, 86 | "setup_file": "./setup.py", 87 | } 88 | BATCH_PREDICTION_JOB_RESOURCES = { 89 | "machine_type": "n1-standard-2", 90 | #'accelerator_count': 1, 91 | #'accelerator_type': 'NVIDIA_TESLA_T4' 92 | "starting_replica_count": 1, 93 | "max_replica_count": 10, 94 | } 95 | DATASTORE_PREDICTION_KIND = f"{MODEL_DISPLAY_NAME}-predictions" 96 | 97 | ENABLE_CACHE = os.getenv("ENABLE_CACHE", "0") 98 | UPLOAD_MODEL = os.getenv("UPLOAD_MODEL", "1") 99 | 100 | os.environ["PROJECT_ID"] = PROJECT_ID 101 | os.environ["PIPELINE_NAME"] = PIPELINE_NAME 102 | os.environ["TFX_IMAGE_URI"] = TFX_IMAGE_URI 103 | os.environ["MODEL_REGISTRY_URI"] = MODEL_REGISTRY_URI 104 | -------------------------------------------------------------------------------- /src/tfx_pipelines/prediction_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TFX prediction pipeline definition.""" 15 | 16 | import os 17 | import sys 18 | import json 19 | import logging 20 | 21 | from tfx.orchestration import pipeline, data_types 22 | from ml_metadata.proto import metadata_store_pb2 23 | 24 | SCRIPT_DIR = os.path.dirname( 25 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 26 | ) 27 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 28 | 29 | from src.tfx_pipelines import config 30 | from src.tfx_pipelines import components as custom_components 31 | from src.common import datasource_utils 32 | 33 | 34 | def create_pipeline( 35 | pipeline_root: str, 36 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None, 37 | ): 38 | """Returns a batch prediction pipeline using TFX.""" 39 | 40 | # Get source query. 
41 | sql_query = datasource_utils.get_serving_source_query( 42 | bq_dataset_name=config.BATCH_PREDICTION_BQ_DATASET_NAME, 43 | bq_table_name=config.BATCH_PREDICTION_BQ_TABLE_NAME, 44 | limit=int(config.SERVE_LIMIT), 45 | ) 46 | 47 | bigquery_data_gen = custom_components.bigquery_data_gen( 48 | sql_query=sql_query, 49 | output_data_format="jsonl", 50 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS), 51 | ) 52 | 53 | vertex_batch_prediction = custom_components.vertex_batch_prediction( 54 | project=config.PROJECT_ID, 55 | region=config.REGION, 56 | model_display_name=config.MODEL_DISPLAY_NAME, 57 | instances_format="jsonl", 58 | predictions_format="jsonl", 59 | job_resources=json.dumps(config.BATCH_PREDICTION_JOB_RESOURCES), 60 | serving_dataset=bigquery_data_gen.outputs.serving_dataset, 61 | ) 62 | 63 | datastore_prediction_writer = custom_components.datastore_prediction_writer( 64 | datastore_kind=config.DATASTORE_PREDICTION_KIND, 65 | predictions_format="jsonl", 66 | beam_args=json.dumps(config.BATCH_PREDICTION_BEAM_ARGS), 67 | prediction_results=vertex_batch_prediction.outputs.prediction_results, 68 | ) 69 | 70 | pipeline_components = [ 71 | bigquery_data_gen, 72 | vertex_batch_prediction, 73 | datastore_prediction_writer, 74 | ] 75 | 76 | logging.info( 77 | f"Pipeline components: {[component.id for component in pipeline_components]}" 78 | ) 79 | 80 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS 81 | if config.BEAM_RUNNER == "DataflowRunner": 82 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS 83 | 84 | logging.info(f"Beam pipeline args: {beam_pipeline_args}") 85 | 86 | return pipeline.Pipeline( 87 | pipeline_name=config.PIPELINE_NAME, 88 | pipeline_root=pipeline_root, 89 | components=pipeline_components, 90 | beam_pipeline_args=beam_pipeline_args, 91 | metadata_connection_config=metadata_connection_config, 92 | enable_cache=int(config.ENABLE_CACHE), 93 | ) 94 | -------------------------------------------------------------------------------- /src/tfx_pipelines/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Define KubeflowV2DagRunner to run the training pipeline using Managed Pipelines.""" 15 | 16 | 17 | import os 18 | from kfp.v2.google.client import AIPlatformClient 19 | from tfx.orchestration import data_types 20 | from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner 21 | 22 | 23 | from src.tfx_pipelines import config, training_pipeline, prediction_pipeline 24 | from src.model_training import defaults 25 | 26 | 27 | def compile_training_pipeline(pipeline_definition_file): 28 | """Returns the training pipeline definition.""" 29 | 30 | pipeline_root = os.path.join( 31 | config.ARTIFACT_STORE_URI, 32 | config.PIPELINE_NAME, 33 | ) 34 | 35 | managed_pipeline = training_pipeline.create_pipeline( 36 | pipeline_root=pipeline_root, 37 | num_epochs=data_types.RuntimeParameter( 38 | name="num_epochs", 39 | default=defaults.NUM_EPOCHS, 40 | ptype=int, 41 | ), 42 | batch_size=data_types.RuntimeParameter( 43 | name="batch_size", 44 | default=defaults.BATCH_SIZE, 45 | ptype=int, 46 | ), 47 | learning_rate=data_types.RuntimeParameter( 48 | name="learning_rate", 49 | default=defaults.LEARNING_RATE, 50 | ptype=float, 51 | ), 52 | hidden_units=data_types.RuntimeParameter( 53 | name="hidden_units", 54 | default=",".join(str(u) for u in defaults.HIDDEN_UNITS), 55 | ptype=str, 56 | ), 57 | ) 58 | 59 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner( 60 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( 61 | default_image=config.TFX_IMAGE_URI 62 | ), 63 | output_filename=pipeline_definition_file, 64 | ) 65 | 66 | return runner.run(managed_pipeline, write_out=True) 67 | 68 | 69 | def compile_prediction_pipeline(pipeline_definition_file): 70 | """Returns the prediction pipeline definition.""" 71 | 72 | pipeline_root = os.path.join( 73 | config.ARTIFACT_STORE_URI, 74 | config.PIPELINE_NAME, 75 | ) 76 | 77 | managed_pipeline = prediction_pipeline.create_pipeline( 78 | pipeline_root=pipeline_root, 79 | ) 80 | 81 | runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner( 82 | config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( 83 | default_image=config.TFX_IMAGE_URI 84 | ), 85 | output_filename=pipeline_definition_file, 86 | ) 87 | 88 | return runner.run(managed_pipeline, write_out=True) 89 | 90 | 91 | def submit_pipeline(pipeline_definition_file): 92 | """Submits a pipeline definition file to Vertex pipelines.""" 93 | 94 | pipeline_client = AIPlatformClient(project_id=config.PROJECT, region=config.REGION) 95 | pipeline_client.create_run_from_job_spec(pipeline_definition_file) 96 | -------------------------------------------------------------------------------- /src/tfx_pipelines/training_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """TFX training pipeline definition.""" 15 | 16 | import os 17 | import sys 18 | import logging 19 | import json 20 | 21 | import tensorflow_model_analysis as tfma 22 | 23 | import tfx 24 | from tfx.proto import example_gen_pb2, transform_pb2, trainer_pb2 25 | from tfx.orchestration import pipeline, data_types 26 | from tfx.dsl.components.base import executor_spec 27 | from tfx.components.trainer import executor as trainer_executor 28 | from tfx.extensions.google_cloud_ai_platform.trainer import ( 29 | executor as ai_platform_trainer_executor, 30 | ) 31 | from tfx.extensions.google_cloud_big_query.example_gen.component import ( 32 | BigQueryExampleGen, 33 | ) 34 | from tfx.components import ( 35 | StatisticsGen, 36 | ExampleValidator, 37 | Transform, 38 | Trainer, 39 | Evaluator, 40 | Pusher, 41 | ) 42 | from tfx.dsl.components.common.importer import Importer 43 | from tfx.dsl.components.common.resolver import Resolver 44 | from tfx.dsl.experimental import latest_artifacts_resolver 45 | from tfx.dsl.experimental import latest_blessed_model_resolver 46 | 47 | from ml_metadata.proto import metadata_store_pb2 48 | 49 | SCRIPT_DIR = os.path.dirname( 50 | os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__))) 51 | ) 52 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, ".."))) 53 | 54 | from src.tfx_pipelines import config 55 | from src.tfx_pipelines import components as custom_components 56 | from src.common import features, datasource_utils 57 | 58 | RAW_SCHEMA_DIR = "src/raw_schema" 59 | TRANSFORM_MODULE_FILE = "src/preprocessing/transformations.py" 60 | TRAIN_MODULE_FILE = "src/model_training/runner.py" 61 | 62 | MISSING = { 63 | "trip_month": -1, 64 | "trip_day": -1, 65 | "trip_day_of_week": -1, 66 | "trip_hour": -1, 67 | "trip_seconds": -1, 68 | "trip_miles": -1, 69 | "payment_type": "NA", 70 | "pickup_grid": "NA", 71 | "dropoff_grid": "NA", 72 | "euclidean": -1, 73 | } 74 | 75 | 76 | def create_pipeline( 77 | pipeline_root: str, 78 | num_epochs: data_types.RuntimeParameter, 79 | batch_size: data_types.RuntimeParameter, 80 | learning_rate: data_types.RuntimeParameter, 81 | hidden_units: data_types.RuntimeParameter, 82 | metadata_connection_config: metadata_store_pb2.ConnectionConfig = None, 83 | ): 84 | """Returns a TFX training pipeline.""" 85 | 86 | local_executor_spec = executor_spec.ExecutorClassSpec( 87 | trainer_executor.GenericExecutor 88 | ) 89 | 90 | caip_executor_spec = executor_spec.ExecutorClassSpec( 91 | ai_platform_trainer_executor.GenericExecutor 92 | ) 93 | 94 | # Hyperparameter generation. 95 | hyperparams_gen = custom_components.hyperparameters_gen( 96 | num_epochs=num_epochs, 97 | batch_size=batch_size, 98 | learning_rate=learning_rate, 99 | hidden_units=hidden_units, 100 | ).with_id("HyperparamsGen") 101 | 102 | # Get train source query. 103 | train_sql_query = datasource_utils.get_training_source_query( 104 | config.PROJECT_ID, 105 | config.REGION, 106 | config.DATASET_DISPLAY_NAME, 107 | ml_use="UNASSIGNED", 108 | limit=int(config.TRAIN_LIMIT), 109 | ) 110 | 111 | train_output_config = example_gen_pb2.Output( 112 | split_config=example_gen_pb2.SplitConfig( 113 | splits=[ 114 | example_gen_pb2.SplitConfig.Split( 115 | name="train", hash_buckets=int(config.NUM_TRAIN_SPLITS) 116 | ), 117 | example_gen_pb2.SplitConfig.Split( 118 | name="eval", hash_buckets=int(config.NUM_EVAL_SPLITS) 119 | ), 120 | ] 121 | ) 122 | ) 123 | 124 | # Train example generation. 
125 | train_example_gen = BigQueryExampleGen( 126 | query=train_sql_query, 127 | output_config=train_output_config, 128 | ).with_id("TrainDataGen") 129 | 130 | # Get test source query. 131 | test_sql_query = datasource_utils.get_training_source_query( 132 | config.PROJECT_ID, 133 | config.REGION, 134 | config.DATASET_DISPLAY_NAME, 135 | ml_use="TEST", 136 | limit=int(config.TEST_LIMIT), 137 | ) 138 | 139 | test_output_config = example_gen_pb2.Output( 140 | split_config=example_gen_pb2.SplitConfig( 141 | splits=[ 142 | example_gen_pb2.SplitConfig.Split(name="test", hash_buckets=1), 143 | ] 144 | ) 145 | ) 146 | 147 | # Test example generation. 148 | test_example_gen = BigQueryExampleGen( 149 | query=test_sql_query, 150 | output_config=test_output_config, 151 | ).with_id("TestDataGen") 152 | 153 | # Schema importer. 154 | schema_importer = Importer( 155 | source_uri=RAW_SCHEMA_DIR, 156 | artifact_type=tfx.types.standard_artifacts.Schema, 157 | ).with_id("SchemaImporter") 158 | 159 | # Statistics generation. 160 | statistics_gen = StatisticsGen(examples=train_example_gen.outputs.examples).with_id( 161 | "StatisticsGen" 162 | ) 163 | 164 | # Example validation. 165 | example_validator = ExampleValidator( 166 | statistics=statistics_gen.outputs.statistics, 167 | schema=schema_importer.outputs.result, 168 | ).with_id("ExampleValidator") 169 | 170 | # Data transformation. 171 | transform = Transform( 172 | examples=train_example_gen.outputs.examples, 173 | schema=schema_importer.outputs.result, 174 | module_file=TRANSFORM_MODULE_FILE, 175 | splits_config=transform_pb2.SplitsConfig( 176 | analyze=["train"], transform=["train", "eval"] 177 | ), 178 | ).with_id("DataTransformer") 179 | 180 | # Add dependency from example_validator to transform. 181 | transform.add_upstream_node(example_validator) 182 | 183 | # Get the latest model to warmstart 184 | warmstart_model_resolver = Resolver( 185 | strategy_class=latest_artifacts_resolver.LatestArtifactsResolver, 186 | latest_model=tfx.types.Channel(type=tfx.types.standard_artifacts.Model), 187 | ).with_id("WarmstartModelResolver") 188 | 189 | # Model training. 190 | trainer = Trainer( 191 | custom_executor_spec=local_executor_spec 192 | if config.TRAINING_RUNNER == "local" 193 | else caip_executor_spec, 194 | module_file=TRAIN_MODULE_FILE, 195 | transformed_examples=transform.outputs.transformed_examples, 196 | schema=schema_importer.outputs.result, 197 | # base_model=warmstart_model_resolver.outputs.latest_model, 198 | transform_graph=transform.outputs.transform_graph, 199 | train_args=trainer_pb2.TrainArgs(num_steps=0), 200 | eval_args=trainer_pb2.EvalArgs(num_steps=None), 201 | hyperparameters=hyperparams_gen.outputs.hyperparameters, 202 | ).with_id("ModelTrainer") 203 | 204 | # Get the latest blessed model (baseline) for model validation. 205 | baseline_model_resolver = Resolver( 206 | strategy_class=latest_blessed_model_resolver.LatestBlessedModelResolver, 207 | model=tfx.types.Channel(type=tfx.types.standard_artifacts.Model), 208 | model_blessing=tfx.types.Channel( 209 | type=tfx.types.standard_artifacts.ModelBlessing 210 | ), 211 | ).with_id("BaselineModelResolver") 212 | 213 | # Prepare evaluation config. 
214 | eval_config = tfma.EvalConfig( 215 | model_specs=[ 216 | tfma.ModelSpec( 217 | signature_name="serving_tf_example", 218 | label_key=features.TARGET_FEATURE_NAME, 219 | prediction_key="probabilities", 220 | ) 221 | ], 222 | slicing_specs=[ 223 | tfma.SlicingSpec(), 224 | ], 225 | metrics_specs=[ 226 | tfma.MetricsSpec( 227 | metrics=[ 228 | tfma.MetricConfig(class_name="ExampleCount"), 229 | tfma.MetricConfig( 230 | class_name="BinaryAccuracy", 231 | threshold=tfma.MetricThreshold( 232 | value_threshold=tfma.GenericValueThreshold( 233 | lower_bound={"value": float(config.ACCURACY_THRESHOLD)} 234 | ), 235 | # Change threshold will be ignored if there is no 236 | # baseline model resolved from MLMD (first run). 237 | change_threshold=tfma.GenericChangeThreshold( 238 | direction=tfma.MetricDirection.HIGHER_IS_BETTER, 239 | absolute={"value": -1e-10}, 240 | ), 241 | ), 242 | ), 243 | ] 244 | ) 245 | ], 246 | ) 247 | 248 | # Model evaluation. 249 | evaluator = Evaluator( 250 | examples=test_example_gen.outputs.examples, 251 | example_splits=["test"], 252 | model=trainer.outputs.model, 253 | # baseline_model=baseline_model_resolver.outputs.model, 254 | eval_config=eval_config, 255 | schema=schema_importer.outputs.result, 256 | ).with_id("ModelEvaluator") 257 | 258 | exported_model_location = os.path.join( 259 | config.MODEL_REGISTRY_URI, config.MODEL_DISPLAY_NAME 260 | ) 261 | push_destination = tfx.proto.pusher_pb2.PushDestination( 262 | filesystem=tfx.proto.pusher_pb2.PushDestination.Filesystem( 263 | base_directory=exported_model_location 264 | ) 265 | ) 266 | 267 | # Push custom model to model registry. 268 | pusher = Pusher( 269 | model=trainer.outputs.model, 270 | model_blessing=evaluator.outputs.blessing, 271 | push_destination=push_destination, 272 | ).with_id("ModelPusher") 273 | 274 | # Upload custom trained model to Vertex AI. 275 | explanation_config = json.dumps(features.generate_explanation_config()) 276 | vertex_model_uploader = custom_components.vertex_model_uploader( 277 | project=config.PROJECT_ID, 278 | region=config.REGION, 279 | model_display_name=config.MODEL_DISPLAY_NAME, 280 | pushed_model_location=exported_model_location, 281 | serving_image_uri=config.SERVING_IMAGE_URI, 282 | explanation_config=explanation_config, 283 | ).with_id("VertexUploader") 284 | 285 | pipeline_components = [ 286 | hyperparams_gen, 287 | train_example_gen, 288 | test_example_gen, 289 | statistics_gen, 290 | schema_importer, 291 | example_validator, 292 | transform, 293 | # warmstart_model_resolver, 294 | trainer, 295 | # baseline_model_resolver, 296 | evaluator, 297 | pusher, 298 | ] 299 | 300 | if int(config.UPLOAD_MODEL): 301 | pipeline_components.append(vertex_model_uploader) 302 | # Add dependency from pusher to aip_model_uploader. 
303 | vertex_model_uploader.add_upstream_node(pusher) 304 | 305 | logging.info( 306 | f"Pipeline components: {[component.id for component in pipeline_components]}" 307 | ) 308 | 309 | beam_pipeline_args = config.BEAM_DIRECT_PIPELINE_ARGS 310 | if config.BEAM_RUNNER == "DataflowRunner": 311 | beam_pipeline_args = config.BEAM_DATAFLOW_PIPELINE_ARGS 312 | 313 | logging.info(f"Beam pipeline args: {beam_pipeline_args}") 314 | 315 | return pipeline.Pipeline( 316 | pipeline_name=config.PIPELINE_NAME, 317 | pipeline_root=pipeline_root, 318 | components=pipeline_components, 319 | beam_pipeline_args=beam_pipeline_args, 320 | metadata_connection_config=metadata_connection_config, 321 | enable_cache=int(config.ENABLE_CACHE), 322 | ) 323 | --------------------------------------------------------------------------------