├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── create-pr.md ├── .gitignore ├── .pre-commit-config.yaml ├── .travis.yml ├── AUTHORS.rst ├── CHANGELOG.rst ├── CONTRIBUTING.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── __init__.py ├── chapters ├── adv_tfx │ └── Custom_TFX_Components.ipynb ├── appendix_c │ ├── .gitignore │ ├── README.md │ └── tfx_template_example │ │ ├── __init__.py │ │ ├── beam_dag_runner.py │ │ ├── data_validation.ipynb │ │ ├── kubeflow_dag_runner.py │ │ ├── model_analysis.ipynb │ │ ├── models │ │ ├── __init__.py │ │ ├── features.py │ │ ├── features_test.py │ │ ├── keras │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ ├── model.py │ │ │ └── model_test.py │ │ ├── preprocessing.py │ │ └── preprocessing_test.py │ │ └── pipeline │ │ ├── __init__.py │ │ ├── configs.py │ │ └── pipeline.py ├── data_ingestion │ └── convert_data_to_tfrecords.py ├── data_privacy │ └── differential_privacy.ipynb ├── data_validation │ └── data_validation.ipynb ├── intro_tfx │ └── Apache_beam_example_notebook.ipynb └── model_analysis │ └── model_analysis.ipynb ├── components ├── __init__.py ├── keras_trainer.py ├── module.py ├── module_test.py └── transform.py ├── interactive-pipeline ├── README.md └── interactive_pipeline.ipynb ├── pipelines ├── __init__.py ├── apache_airflow │ ├── Dockerfile │ ├── launch_airflow.sh │ ├── pipeline_airflow.py │ ├── setup_airflow.sh │ └── setup_env.sh ├── apache_beam │ ├── __init__.py │ └── pipeline_beam.py ├── base_pipeline.py ├── kubeflow_pipelines │ ├── __init__.py │ ├── argo_pipeline_files │ │ └── .gitkeep │ ├── kubeflow-config │ │ ├── storage-access-pod.yaml │ │ ├── storage-claim.yaml │ │ └── storage.yaml │ ├── pipeline_kubeflow.py │ ├── pipeline_kubeflow_gcp_buckets.py │ └── tfx-docker-image │ │ └── Dockerfile └── vertex │ └── pipeline_vertex.py ├── pre-experiment-pipeline ├── experiment_6Mar.ipynb └── make_final_dataset.ipynb ├── requirements ├── requirements.txt └── test_requirements.txt ├── setup.cfg └── utils └── download_dataset.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: hanneshapke, drcat101 7 | 8 | --- 9 | 10 | # Thank you for reporting an issue! 11 | 12 | If you want to report an issue with the code in this repository, 13 | please provide the following information: 14 | 15 | * Your operating system name and version, as well as version numbers of the following packages: tensorflow, tfx. 16 | * Any details about your local setup that might be helpful in troubleshooting. 17 | * Detailed steps to reproduce the bug. 18 | 19 | If you found an error in the book, please report it at 20 | https://www.oreilly.com/catalog/errata.csp?isbn=0636920260912. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/create-pr.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Create PR 3 | about: Create pull requests 4 | title: Fix for ... [BUG] 5 | labels: '' 6 | assignees: hanneshapke, drcat101 7 | 8 | --- 9 | 10 | # Thank you for contributing! 11 | 12 | Bug fixes to our code are always welcome. All bug fixes should be connected to an issue. 13 | If you plan to fix a bug, please use the appropriate issue to briefly tell us what you have 14 | in mind. If you have found a bug and there is no issue yet, please start by filing an issue. 
15 | 16 | Check that your pull request meets these guidelines before you submit it: 17 | 18 | 1. If the pull request adds or changes functionality, it should include updated tests. 19 | 2. Make sure that all tests run by ``make test`` pass. 20 | 3. Add your name and/or Github username to the `AUTHORS.rst` file under "Contributors", so that your contribution is listed upon merging your PR. Thank you! 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | tfx/ 132 | .DS_Store 133 | model.png 134 | .vscode/ 135 | pipelines/kubeflow_pipelines/argo_pipeline_files/*.yaml 136 | pipelines/kubeflow_pipelines/argo_pipeline_files/*.yml 137 | pipelines/kubeflow_pipelines/argo_pipeline_files/*.tar.gz 138 | pipelines/kubeflow_pipelines/argo_pipeline_files/*.tgz 139 | pipelines/kubeflow_pipelines/argo_pipeline_files/*.zip 140 | 141 | pipelines/gcp_cloud_ai/argo_pipeline_files/*.yaml 142 | pipelines/gcp_cloud_ai/argo_pipeline_files/*.yml 143 | pipelines/gcp_cloud_ai/argo_pipeline_files/*.tar.gz 144 | pipelines/gcp_cloud_ai/argo_pipeline_files/*.tgz 145 | pipelines/gcp_cloud_ai/argo_pipeline_files/*.zip 146 | data/ 147 | .artifacts/ 148 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 19.10b0 4 | hooks: 5 | - id: black 6 | args: [--line-length=80, --safe] 7 | python_version: python3 8 | - repo: git://github.com/pre-commit/pre-commit-hooks 9 | rev: v2.4.0 10 | hooks: 11 | - id: check-case-conflict 12 | - id: check-merge-conflict 13 | - id: check-symlinks 14 | - id: end-of-file-fixer 15 | - id: trailing-whitespace 16 | - id: debug-statements 17 | - id: flake8 18 | log_file: '.artifacts/flake8.log' 19 | - id: requirements-txt-fixer 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | language: python 3 | sudo: false 4 | python: 5 | - "3.6" 6 | # - "3.7" 7 | # - "3.8" 8 | 9 | install: 10 | - pip install -U -r requirements/requirements.txt 11 | - pip install -U -r requirements/test_requirements.txt 12 | 13 | # command to run tests, e.g. python setup.py test 14 | script: 15 | - export PYTHONPATH=$PYTHONPATH:`pwd`/chapters/appendix_c/tfx_template_example 16 | - pytest . chapters/appendix_c/tfx_template_example/ 17 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Maintainers 6 | ------------ 7 | * Hannes Hapke 8 | * Catherine Nelson 9 | 10 | Contributors 11 | ------------ 12 | * Timo Cornelius Metzger 13 | * Snehan Kekre (@snehankekre) 14 | 15 | 16 | 17 | Thanks to all contributors! 18 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | .. 
:changelog: 2 | 3 | Changelog 4 | --------------- 5 | 6 | 7 | 0.1.0 (in progress) 8 | ++++++++++++++++++ 9 | 10 | * Initial repo for the O'Reilly publication "Building Machine Learning Pipelines" 11 | * Added TFX template example 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | Contributing 3 | ************ 4 | 5 | We greatly appreciate your contributions! 6 | 7 | The following is a collection of guidelines for contributing to the code repository for the 8 | O'Reilly publication `"Building Machine Learning Pipelines" `_ 9 | by Hannes Hapke & Catherine Nelson. 10 | 11 | If you found an error in the book, please report it at 12 | https://www.oreilly.com/catalog/errata.csp?isbn=0636920260912. 13 | 14 | Types of contributions: 15 | ======================= 16 | 17 | .. _issue: 18 | 19 | 1. Reporting bugs 20 | ----------------- 21 | 22 | Please report bugs at 23 | https://github.com/Building-ML-Pipelines/building-machine-learning-pipelines/issues. 24 | 25 | If you are reporting a bug, we ask you to include: 26 | 27 | * Your operating system name and version. 28 | * Any details about your local setup that might be helpful in troubleshooting. 29 | * Detailed steps to reproduce the bug. 30 | 31 | 2. Fixing bugs 32 | -------------- 33 | 34 | Bug fixes to our code are always welcome. All bug fixes should be connected to an issue. 35 | If you plan to fix a bug, please use the appropriate issue to briefly tell us what you have in mind. 36 | If you have found a bug and there is no issue yet, please start by filing an issue_. 37 | 38 | You can also look through the `GitHub issues `_ 39 | for bugs. Anything tagged with "bug" is open to whoever wants to fix it. 40 | 41 | See the section `Contributing to this repository`_ below for instructions on how to 42 | set up your environment and create a pull request. 43 | 44 | Providing general feedback 45 | -------------------------- 46 | 47 | The best way to send general feedback is to file an issue_. 48 | You can submit errata for the publication here: https://www.oreilly.com/catalog/errata.csp?isbn=0636920260912 49 | 50 | Contributing to this repository 51 | =============================== 52 | 53 | Setting up your development environment 54 | --------------------------------------- 55 | 56 | This is how you set up a local development environment to work on the code: 57 | 58 | Create a fork of the 59 | `Github repository `_. 60 | 61 | Clone your fork locally:: 62 | 63 | $ git clone git@github.com:[YOUR USERNAME]/building-machine-learning-pipelines.git 64 | 65 | Create a branch for your contribution:: 66 | 67 | $ git checkout -b name-of-your-contribution 68 | 69 | Create a virtualenv to separate your Python dependencies:: 70 | 71 | $ virtualenv .env && source .env/bin/activate 72 | 73 | Download and install all dependencies required for development:: 74 | 75 | $ make develop 76 | 77 | This will automatically download and configure all required dependencies. 78 | Your local environment is now ready for you to begin making your changes. 79 | 80 | Testing and linting 81 | ------------------- 82 | 83 | When you're done making changes, please make sure that your code passes style and unit tests. 
84 | 85 | You can run linting and all testing with this command:: 86 | 87 | $ make test 88 | 89 | In case you want to run only specific tests instead of all available tests, you can call 90 | Pytest directly and combine it with a substring. Pytest will only run tests with names 91 | matching the substring:: 92 | 93 | $ pytest -k -v 94 | 95 | You are welcome to add your name to AUTHORS.rst before committing your code! 96 | 97 | Submitting a pull request 98 | ------------------------- 99 | 100 | Commit your changes and push your branch to GitHub:: 101 | 102 | $ git add . 103 | $ git commit -m "Description of your changes." 104 | $ git push origin name-of-your-contribution 105 | 106 | Check that your pull request meets these guidelines before you submit it: 107 | 108 | 1. If the pull request adds or changes functionality, it should include updated tests. 109 | 2. The pull request should work with Python 3.6, 3.7 and 3.8. Make sure that 110 | all tests run by ``make test`` pass. 111 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Hannes Hapke 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include CHANGELOG.rst 4 | include LICENSE 5 | include README.rst 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help clean clean-pyc clean-build list test test-all coverage docs release sdist 2 | 3 | help: 4 | @echo "clean - remove build artifacts" 5 | @echo "develop - set up dev environment" 6 | @echo "install-deps" 7 | @echo "install-pre-commit" 8 | @echo "setup-git" 9 | @echo "lint - check style with flake8" 10 | @echo "test - run tests quickly with the default Python" 11 | 12 | 13 | clean: 14 | rm -fr build/ 15 | rm -fr dist/ 16 | rm -fr *.egg-info 17 | find . 
-type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete 18 | 19 | develop: setup-git install-deps install-pre-commit 20 | 21 | install-deps: 22 | pip install -r requirements/requirements.txt 23 | pip install -r requirements/test_requirements.txt 24 | 25 | install-pre-commit: 26 | pip install "pre-commit>=1.10.1,<1.11.0" 27 | 28 | setup-git: install-pre-commit 29 | pre-commit install 30 | git config branch.autosetuprebase always 31 | 32 | lint: install-pre-commit 33 | @echo "Linting Python files" 34 | pre-commit run -a 35 | @echo "" 36 | 37 | test: develop lint 38 | @echo "Running Python tests" 39 | py.test . 40 | @echo "" 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building Machine Learning Pipelines 2 | 3 | Code repository for the O'Reilly publication ["Building Machine Learning Pipelines"](http://www.buildingmlpipelines.com) by Hannes Hapke & Catherine Nelson 4 | 5 | ## Update 6 | 7 | * The example code has been updated to work with TFX 1.4.0, TensorFlow 2.6.1, and Apache Beam 2.33.0. A GCP Vertex example (training and serving) was added. 8 | 9 | ## Set up the demo project 10 | 11 | Download the initial dataset. From the root of this repository, execute 12 | 13 | ``` 14 | python3 utils/download_dataset.py 15 | ``` 16 | 17 | After this script runs, you should have a `data` folder containing the file `consumer_complaints_with_narrative.csv`. 18 | 19 | ## The dataset 20 | 21 | The data we use in this example project can be downloaded using the script above. It is a public dataset of customer complaints collected by the US Consumer Financial Protection Bureau. If you would like to reproduce our edited dataset, carry out the following steps: 22 | 23 | - Download the dataset from https://www.consumerfinance.gov/data-research/consumer-complaints/#download-the-data 24 | - Rename the columns to `[ 25 | "product", 26 | "sub_product", 27 | "issue", 28 | "sub_issue", 29 | "consumer_complaint_narrative", 30 | "company", 31 | "state", 32 | "zip_code", 33 | "company", 34 | "company_response", 35 | "timely_response", 36 | "consumer_disputed"]` 37 | - Filter the dataset to remove rows with missing data in the `consumer_complaint_narrative` column 38 | - In the `consumer_disputed` column, map `Yes` to `1` and `No` to `0` (an illustrative pandas sketch of these steps appears further down in this README) 39 | 40 | 41 | ## Pre-pipeline experiment 42 | 43 | Before building our TFX pipeline, we experimented with different feature engineering approaches and model architectures. The notebooks in the `pre-experiment-pipeline` folder preserve our experiments; we then refactored the code into the interactive pipeline below. 44 | 45 | ## Interactive pipeline 46 | 47 | The `interactive-pipeline` folder contains a full interactive TFX pipeline for the consumer complaint data. 48 | 49 | ## Full pipelines with Apache Beam, Apache Airflow, Kubeflow Pipelines, GCP 50 | 51 | The `pipelines` folder contains complete pipelines for the various orchestrators. See Chapters 11 and 12 for full details. 52 | 53 | ## Chapters 54 | 55 | The following subfolders contain stand-alone code for individual chapters. 56 | 57 | ### Model analysis 58 | Chapter 7. Stand-alone code for TFMA, Fairness Indicators, What-If Tool. Note that these notebooks will not work in JupyterLab. 59 | 60 | ### Advanced TFX 61 | Chapter 10. Notebook outlining the implementation of custom TFX components from scratch and by inheriting existing functionality. Presented at the Apache Beam Summit 2020.
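The dataset edits described in "The dataset" section above can be sketched in a few lines of pandas. This is only an illustration of those steps, not the exact script we used for the book; the raw CFPB column names and the `complaints.csv` filename are assumptions that should be checked against the current export.

```python
import pandas as pd

# Assumed raw CFPB column names -> column names used in this project.
raw_to_clean = {
    "Product": "product",
    "Sub-product": "sub_product",
    "Issue": "issue",
    "Sub-issue": "sub_issue",
    "Consumer complaint narrative": "consumer_complaint_narrative",
    "Company": "company",
    "State": "state",
    "ZIP code": "zip_code",
    "Company response to consumer": "company_response",
    "Timely response?": "timely_response",
    "Consumer disputed?": "consumer_disputed",
}

df = pd.read_csv("complaints.csv", usecols=list(raw_to_clean))
df = df.rename(columns=raw_to_clean)

# Keep only rows that contain a complaint narrative and binarize the label.
df = df.dropna(subset=["consumer_complaint_narrative"])
df["consumer_disputed"] = df["consumer_disputed"].map({"Yes": 1, "No": 0})

# Assumes a local data/ directory already exists.
df.to_csv("data/consumer_complaints_with_narrative.csv", index=False)
```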
62 | 63 | ### Data privacy 64 | Chapter 14. Code for training a differentially private version of the demo project. Note that the TF-Privacy module only supports TF 1.x as of June 2020. 65 | 66 | ## Version notes 67 | 68 | The code was originally written and tested for TFX version 0.22. 69 | 70 | - As of 11/23/21, the examples have been updated to support TFX 1.4.0, TensorFlow 2.6.1, and Apache Beam 2.33.0. A GCP Vertex example (training and serving) was added. 71 | 72 | - As of 9/22/20, the interactive pipeline runs on TFX version 0.24.0rc1. 73 | Due to tiny TFX bugs, the pipelines currently don't work on the releases 0.23 and 0.24-rc0. GitHub issues have been filed with the TFX team specifically for the book pipelines ([Issue 2500](https://github.com/tensorflow/tfx/issues/2500#issuecomment-695363847)). We will update the repository once the issue is resolved. 74 | 75 | - As of 9/14/20, TFX supports Python 3.8 only with versions >0.24.0rc0. 76 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Building-ML-Pipelines/building-machine-learning-pipelines/8862436d291a330d772dc59e104c4ba0a6d64b5a/__init__.py -------------------------------------------------------------------------------- /chapters/appendix_c/.gitignore: -------------------------------------------------------------------------------- 1 | complaint_prediction_pipeline.tar.gz # Pipeline definition file in argo format for Kubeflow. 2 | tfx_pipeline_output # Default pipeline output directory 3 | tfx_metadata # Default tfx metadata SQLite DB directory 4 | -------------------------------------------------------------------------------- /chapters/appendix_c/README.md: -------------------------------------------------------------------------------- 1 | ## How to use TFX templates 2 | 3 | ### Download the base template 4 | 5 | You can download the taxi example template (at the moment, the only available template) and update the individual files. We have done this for our _complaint prediction_ example pipeline. 6 | 7 | We ran: 8 | ``` 9 | $ tfx template copy --pipeline_name complaint_prediction_pipeline --destination_path . --model=taxi 10 | ``` 11 | 12 | ### Test your pipeline 13 | 14 | Add this path to your `PYTHONPATH` with: 15 | 16 | ``` 17 | $ cd building-machine-learning-pipelines/chapters/appendix_c/tfx_template_example 18 | $ export PYTHONPATH=$PYTHONPATH:`pwd` 19 | ``` 20 | 21 | Afterward, you can run the tests with: 22 | ``` 23 | $ pytest . -s 24 | ``` 25 | 26 | Note: The first test might take some time, since the TF Hub model will be downloaded. 27 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/__init__.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/beam_dag_runner.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Define BeamDagRunner to run the pipeline using Apache Beam.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | from absl import logging 23 | 24 | from pipeline import configs 25 | from pipeline import pipeline 26 | from tfx.orchestration import metadata 27 | from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner 28 | from tfx.proto import trainer_pb2 29 | 30 | 31 | # TFX pipeline produces many output files and metadata. All output data will be 32 | # stored under this OUTPUT_DIR. 33 | # NOTE: It is recommended to have a separated OUTPUT_DIR which is *outside* of 34 | # the source code structure. Please change OUTPUT_DIR to other location 35 | # where we can store outputs of the pipeline. 36 | OUTPUT_DIR = "." 37 | 38 | # TFX produces two types of outputs, files and metadata. 39 | # - Files will be created under PIPELINE_ROOT directory. 40 | # - Metadata will be written to SQLite database in METADATA_PATH. 41 | PIPELINE_ROOT = os.path.join( 42 | OUTPUT_DIR, "tfx_pipeline_output", configs.PIPELINE_NAME 43 | ) 44 | METADATA_PATH = os.path.join( 45 | OUTPUT_DIR, "tfx_metadata", configs.PIPELINE_NAME, "metadata.db" 46 | ) 47 | 48 | # The last component of the pipeline, "Pusher" will produce serving model under 49 | # SERVING_MODEL_DIR. 50 | SERVING_MODEL_DIR = os.path.join(PIPELINE_ROOT, "serving_model") 51 | 52 | # Specifies data file directory. DATA_PATH should be a directory containing CSV 53 | # files for CsvExampleGen in this example. By default, data files are in the 54 | # `data` directory. 55 | # NOTE: If you upload data files to GCS(which is recommended if you use 56 | # Kubeflow), you can use a path starting "gs://YOUR_BUCKET_NAME/path" for 57 | # DATA_PATH. For example, 58 | # DATA_PATH = 'gs://bucket/chicago_taxi_trips/csv/'. 
59 | DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 60 | 61 | 62 | def run(): 63 | """Define a beam pipeline.""" 64 | 65 | BeamDagRunner().run( 66 | pipeline.create_pipeline( 67 | pipeline_name=configs.PIPELINE_NAME, 68 | pipeline_root=PIPELINE_ROOT, 69 | data_path=DATA_PATH, 70 | preprocessing_fn=configs.PREPROCESSING_FN, 71 | run_fn=configs.RUN_FN, 72 | train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS), 73 | eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), 74 | eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, 75 | serving_model_dir=SERVING_MODEL_DIR, 76 | metadata_connection_config=metadata.sqlite_metadata_connection_config( # noqa 77 | METADATA_PATH 78 | ), 79 | ) 80 | ) 81 | 82 | 83 | if __name__ == "__main__": 84 | logging.set_verbosity(logging.INFO) 85 | run() 86 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/data_validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# import required libs\n", 10 | "import glob\n", 11 | "import os\n", 12 | "\n", 13 | "import tensorflow as tf\n", 14 | "import tensorflow_data_validation as tfdv\n", 15 | "print('TF version: {}'.format(tf.version.VERSION))\n", 16 | "print('TFDV version: {}'.format(tfdv.version.__version__))" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Read artifact information from metadata store.\n", 26 | "import beam_dag_runner\n", 27 | "\n", 28 | "from tfx.orchestration import metadata\n", 29 | "from tfx.types import standard_artifacts\n", 30 | "\n", 31 | "metadata_connection_config = metadata.sqlite_metadata_connection_config(\n", 32 | " beam_dag_runner.METADATA_PATH)\n", 33 | "with metadata.Metadata(metadata_connection_config) as store:\n", 34 | " stats_artifacts = store.get_artifacts_by_type(standard_artifacts.ExampleStatistics.TYPE_NAME)\n", 35 | " schema_artifacts = store.get_artifacts_by_type(standard_artifacts.Schema.TYPE_NAME)\n", 36 | " anomalies_artifacts = store.get_artifacts_by_type(standard_artifacts.ExampleAnomalies.TYPE_NAME)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# configure output paths\n", 46 | "# Exact paths to output artifacts can also be found on KFP Web UI if you are using kubeflow.\n", 47 | "stats_path = stats_artifacts[-1].uri\n", 48 | "train_stats_file = os.path.join(stats_path, 'train', 'stats_tfrecord')\n", 49 | "eval_stats_file = os.path.join(stats_path, 'eval', 'stats_tfrecord')\n", 50 | "print(\"Train stats file:{}, Eval stats file:{}\".format(\n", 51 | " train_stats_file, eval_stats_file))\n", 52 | "\n", 53 | "schema_file = os.path.join(schema_artifacts[-1].uri, 'schema.pbtxt')\n", 54 | "print(\"Generated schame file:{}\".format(schema_file))\n", 55 | "anomalies_file = os.path.join(anomalies_artifacts[-1].uri, 'anomalies.pbtxt')\n", 56 | "print(\"Generated anomalies file:{}\".format(anomalies_file))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# load generated statistics from StatisticsGen\n", 66 | "train_stats = tfdv.load_statistics(train_stats_file)\n", 67 | "eval_stats = 
tfdv.load_statistics(eval_stats_file)\n", 68 | "tfdv.visualize_statistics(lhs_statistics=eval_stats, rhs_statistics=train_stats,\n", 69 | " lhs_name='EVAL_DATASET', rhs_name='TRAIN_DATASET')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# load generated schema from SchemaGen\n", 79 | "schema = tfdv.load_schema_text(schema_file)\n", 80 | "tfdv.display_schema(schema=schema)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# load data vaildation result from ExampleValidator\n", 90 | "anomalies = tfdv.load_anomalies_text(anomalies_file)\n", 91 | "tfdv.display_anomalies(anomalies)" 92 | ] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.7.5rc1" 112 | }, 113 | "pycharm": { 114 | "stem_cell": { 115 | "cell_type": "raw", 116 | "source": [], 117 | "metadata": { 118 | "collapsed": false 119 | } 120 | } 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/kubeflow_dag_runner.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Define KubeflowDagRunner to run the pipeline using Kubeflow.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import os 21 | from absl import logging 22 | 23 | from pipeline import configs 24 | from pipeline import pipeline 25 | from tfx.orchestration.kubeflow import kubeflow_dag_runner 26 | from tfx.proto import trainer_pb2 27 | from tfx.utils import telemetry_utils 28 | 29 | # TFX pipeline produces many output files and metadata. All output data will be 30 | # stored under this OUTPUT_DIR. 31 | OUTPUT_DIR = os.path.join("gs://", configs.GCS_BUCKET_NAME) 32 | 33 | # TFX produces two types of outputs, files and metadata. 34 | # - Files will be created under PIPELINE_ROOT directory. 35 | PIPELINE_ROOT = os.path.join( 36 | OUTPUT_DIR, "tfx_pipeline_output", configs.PIPELINE_NAME 37 | ) 38 | 39 | # The last component of the pipeline, "Pusher" will produce serving model under 40 | # SERVING_MODEL_DIR. 41 | SERVING_MODEL_DIR = os.path.join(PIPELINE_ROOT, "serving_model") 42 | 43 | # Specifies data file directory. 
DATA_PATH should be a directory containing CSV 44 | # files for CsvExampleGen in this example. By default, data files are in the 45 | # `data` directory. 46 | # NOTE: If you upload data files to GCS(which is recommended if you use 47 | # Kubeflow), you can use a path starting "gs://YOUR_BUCKET_NAME/path" for 48 | # DATA_PATH. For example, 49 | # DATA_PATH = 'gs://bucket/chicago_taxi_trips/csv/' 50 | DATA_PATH = "data" 51 | 52 | 53 | def run(): 54 | """Define a kubeflow pipeline.""" 55 | 56 | # Metadata config. The defaults works work with the installation of 57 | # KF Pipelines using Kubeflow. If installing KF Pipelines using the 58 | # lightweight deployment option, you may need to override the defaults. 59 | # If you use Kubeflow, metadata will be written to MySQL database inside 60 | # Kubeflow cluster. 61 | metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config() 62 | 63 | # This pipeline automatically injects the Kubeflow TFX image if the 64 | # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx 65 | # cli tool exports the environment variable to pass to the pipelines. 66 | # TODO(b/157598477) Find a better way to pass parameters from CLI handler to 67 | # pipeline DSL file, instead of using environment vars. 68 | tfx_image = os.environ.get("KUBEFLOW_TFX_IMAGE", None) 69 | 70 | runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( 71 | kubeflow_metadata_config=metadata_config, tfx_image=tfx_image 72 | ) 73 | pod_labels = kubeflow_dag_runner.get_default_pod_labels().update( 74 | {telemetry_utils.LABEL_KFP_SDK_ENV: "tfx-template"} 75 | ) 76 | kubeflow_dag_runner.KubeflowDagRunner( 77 | config=runner_config, pod_labels_to_attach=pod_labels 78 | ).run( 79 | pipeline.create_pipeline( 80 | pipeline_name=configs.PIPELINE_NAME, 81 | pipeline_root=PIPELINE_ROOT, 82 | data_path=DATA_PATH, 83 | preprocessing_fn=configs.PREPROCESSING_FN, 84 | run_fn=configs.RUN_FN, 85 | train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS), 86 | eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS), 87 | eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD, 88 | serving_model_dir=SERVING_MODEL_DIR, 89 | beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS, 90 | ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS, 91 | # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS, 92 | ) 93 | ) 94 | 95 | 96 | if __name__ == "__main__": 97 | logging.set_verbosity(logging.INFO) 98 | run() 99 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/model_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# import required libs\n", 10 | "import glob\n", 11 | "import os\n", 12 | "\n", 13 | "import tensorflow as tf\n", 14 | "import tensorflow_model_analysis as tfma\n", 15 | "print('TF version: {}'.format(tf.version.VERSION))\n", 16 | "print('TFMA version: {}'.format(tfma.version.VERSION_STRING))" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Read artifact information from metadata store.\n", 26 | "import beam_dag_runner\n", 27 | "\n", 28 | "from tfx.orchestration import metadata\n", 29 | "from tfx.types import standard_artifacts\n", 30 | "\n", 31 | "metadata_connection_config = 
metadata.sqlite_metadata_connection_config(\n", 32 | " beam_dag_runner.METADATA_PATH)\n", 33 | "with metadata.Metadata(metadata_connection_config) as store:\n", 34 | " model_eval_artifacts = store.get_artifacts_by_type(standard_artifacts.ModelEvaluation.TYPE_NAME)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# configure output paths\n", 44 | "# Exact paths to output artifacts can be found in the execution logs\n", 45 | "# or KFP Web UI if you are using kubeflow.\n", 46 | "model_eval_path = model_eval_artifacts[-1].uri\n", 47 | "print(\"Generated model evaluation result:{}\".format(model_eval_path))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Install Jupyter Extensions\n", 55 | "Note: If running in a local Jupyter notebook, then these Jupyter extensions must be installed in the environment before running Jupyter.\n", 56 | "\n", 57 | "```bash\n", 58 | "jupyter nbextension enable --py widgetsnbextension\n", 59 | "jupyter nbextension install --py --symlink tensorflow_model_analysis\n", 60 | "jupyter nbextension enable --py tensorflow_model_analysis\n", 61 | "```" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "eval_result = tfma.load_eval_result(model_eval_path)\n", 71 | "tfma.view.render_slicing_metrics(eval_result, slicing_spec = tfma.slicer.SingleSliceSpec(columns=['product']))" 72 | ] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.7.5rc1" 92 | }, 93 | "pycharm": { 94 | "stem_cell": { 95 | "cell_type": "raw", 96 | "source": [], 97 | "metadata": { 98 | "collapsed": false 99 | } 100 | } 101 | } 102 | }, 103 | "nbformat": 4, 104 | "nbformat_minor": 2 105 | } 106 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/features.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TFX complaint prediction model features. 16 | 17 | Define constants here that are common across all models 18 | including feature names, the label, and vocabulary sizes. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | from typing import Text, List 26 | 27 | # At least one feature is needed. 28 | 29 | # feature name, feature dimensionality 30 | ONE_HOT_FEATURES = { 31 | "product": 11, 32 | "sub_product": 45, 33 | "company_response": 5, 34 | "state": 60, 35 | "issue": 90, 36 | } 37 | 38 | # feature name, bucket count 39 | BUCKET_FEATURES = {"zip_code": 10} 40 | 41 | # feature name, value is unused 42 | TEXT_FEATURES = {"consumer_complaint_narrative": None} 43 | 44 | # Keys 45 | LABEL_KEY = "consumer_disputed" 46 | 47 | 48 | def transformed_name(key: Text) -> Text: 49 | """Generate the name of the transformed feature from original name.""" 50 | return key + "_xf" 51 | 52 | 53 | def vocabulary_name(key: Text) -> Text: 54 | """Generate the name of the vocabulary feature from original name.""" 55 | return key + "_vocab" 56 | 57 | 58 | def transformed_names(keys: List[Text]) -> List[Text]: 59 | """Transform multiple feature names at once.""" 60 | return [transformed_name(key) for key in keys] 61 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/features_test.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
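"""Tests for the feature definitions and feature-name helpers in models/features.py."""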
15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import tensorflow as tf 21 | 22 | from models import features 23 | 24 | 25 | class FeaturesTest(tf.test.TestCase): 26 | def testOneHotFeatures(self): 27 | 28 | for name, num_options in features.ONE_HOT_FEATURES.items(): 29 | self.assertTrue(isinstance(name, str)) 30 | self.assertTrue(isinstance(num_options, int)) 31 | 32 | def testTransformedNames(self): 33 | names = ["f1", "cf"] 34 | self.assertEqual(["f1_xf", "cf_xf"], features.transformed_names(names)) 35 | 36 | 37 | if __name__ == "__main__": 38 | tf.test.main() 39 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/keras/__init__.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/keras/constants.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Constants for the complaint prediction model. 16 | 17 | These values can be tweaked to affect model training performance. 18 | """ 19 | 20 | LEARNING_RATE = 0.001 21 | 22 | TRAIN_BATCH_SIZE = 64 23 | EVAL_BATCH_SIZE = 64 24 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/keras/model.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TFX template complaint prediction model. 16 | 17 | A DNN keras model which uses features defined in features.py and network 18 | parameters defined in constants.py. 19 | """ 20 | 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | import os 25 | import tensorflow as tf 26 | import tensorflow_hub as hub 27 | import tensorflow_transform as tft 28 | 29 | from models import features 30 | from models.keras import constants 31 | 32 | 33 | def _gzip_reader_fn(filenames): 34 | """Small utility returning a record reader that can read gzip'ed files.""" 35 | return tf.data.TFRecordDataset(filenames, compression_type="GZIP") 36 | 37 | 38 | def _get_serve_tf_examples_fn(model, tf_transform_output): 39 | """Returns a function that parses a serialized tf.Example and applies TFT. 40 | """ 41 | 42 | model.tft_layer = tf_transform_output.transform_features_layer() 43 | 44 | @tf.function 45 | def serve_tf_examples_fn(serialized_tf_examples): 46 | """Returns the output to be used in the serving signature.""" 47 | feature_spec = tf_transform_output.raw_feature_spec() 48 | feature_spec.pop(features.LABEL_KEY) 49 | parsed_features = tf.io.parse_example( 50 | serialized_tf_examples, feature_spec 51 | ) 52 | 53 | transformed_features = model.tft_layer(parsed_features) 54 | 55 | return model(transformed_features) 56 | 57 | return serve_tf_examples_fn 58 | 59 | 60 | def _input_fn(file_pattern, tf_transform_output, batch_size=200): 61 | """Generates features and label for tuning/training. 62 | 63 | Args: 64 | file_pattern: input tfrecord file pattern. 65 | tf_transform_output: A TFTransformOutput. 66 | batch_size: representing the number of consecutive elements of returned 67 | dataset to combine in a single batch 68 | 69 | Returns: 70 | A dataset that contains (features, indices) tuple where features is a 71 | dictionary of Tensors, and indices is a single Tensor of label indices. 72 | """ 73 | transformed_feature_spec = ( 74 | tf_transform_output.transformed_feature_spec().copy() 75 | ) 76 | 77 | dataset = tf.data.experimental.make_batched_features_dataset( 78 | file_pattern=file_pattern, 79 | batch_size=batch_size, 80 | features=transformed_feature_spec, 81 | reader=_gzip_reader_fn, 82 | label_key=features.transformed_name(features.LABEL_KEY), 83 | ) 84 | 85 | return dataset 86 | 87 | 88 | def get_model(show_summary: bool = True) -> tf.keras.models.Model: 89 | """ 90 | This function defines a Keras model and returns the model as a Keras object. 
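The architecture defined below follows a wide-and-deep pattern: the consumer
complaint narrative is embedded with the Universal Sentence Encoder loaded
from TF Hub and passed through a stack of dense layers (the "deep" branch),
while the one-hot encoded and bucketized features are concatenated and passed
through a single dense layer (the "wide" branch). Both branches are
concatenated and fed into a sigmoid output for the binary `consumer_disputed`
label.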
91 | """ 92 | 93 | # one-hot categorical features 94 | input_features = [] 95 | for key, dim in features.ONE_HOT_FEATURES.items(): 96 | input_features.append( 97 | tf.keras.Input( 98 | shape=(dim + 1,), name=features.transformed_name(key) 99 | ) 100 | ) 101 | 102 | # adding bucketized features 103 | for key, dim in features.BUCKET_FEATURES.items(): 104 | input_features.append( 105 | tf.keras.Input( 106 | shape=(dim + 1,), name=features.transformed_name(key) 107 | ) 108 | ) 109 | 110 | # adding text input features 111 | input_texts = [] 112 | for key in features.TEXT_FEATURES.keys(): 113 | input_texts.append( 114 | tf.keras.Input( 115 | shape=(1,), name=features.transformed_name(key), dtype=tf.string 116 | ) 117 | ) 118 | 119 | # embed text features 120 | MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4" 121 | embed = hub.KerasLayer(MODULE_URL) 122 | reshaped_narrative = tf.reshape(input_texts[0], [-1]) 123 | embed_narrative = embed(reshaped_narrative) 124 | deep_ff = tf.keras.layers.Reshape((512,), input_shape=(1, 512))( 125 | embed_narrative 126 | ) 127 | 128 | deep = tf.keras.layers.Dense(256, activation="relu")(deep_ff) 129 | deep = tf.keras.layers.Dense(64, activation="relu")(deep) 130 | deep = tf.keras.layers.Dense(16, activation="relu")(deep) 131 | 132 | wide_ff = tf.keras.layers.concatenate(input_features) 133 | wide = tf.keras.layers.Dense(16, activation="relu")(wide_ff) 134 | 135 | both = tf.keras.layers.concatenate([deep, wide]) 136 | 137 | output = tf.keras.layers.Dense(1, activation="sigmoid")(both) 138 | 139 | inputs = input_features + input_texts 140 | 141 | keras_model = tf.keras.models.Model(inputs, output) 142 | keras_model.compile( 143 | optimizer=tf.keras.optimizers.Adam( 144 | learning_rate=constants.LEARNING_RATE 145 | ), 146 | loss="binary_crossentropy", 147 | metrics=[ 148 | tf.keras.metrics.BinaryAccuracy(), 149 | tf.keras.metrics.TruePositives(), 150 | ], 151 | ) 152 | if show_summary: 153 | keras_model.summary() 154 | 155 | return keras_model 156 | 157 | 158 | # TFX Trainer will call this function. 159 | def run_fn(fn_args): 160 | """Train the model based on given args. 161 | 162 | Args: 163 | fn_args: Holds args used to train the model as name/value pairs. 164 | """ 165 | 166 | tf_transform_output = tft.TFTransformOutput(fn_args.transform_output) 167 | 168 | train_dataset = _input_fn( 169 | fn_args.train_files, tf_transform_output, constants.TRAIN_BATCH_SIZE 170 | ) 171 | eval_dataset = _input_fn( 172 | fn_args.eval_files, tf_transform_output, constants.EVAL_BATCH_SIZE 173 | ) 174 | 175 | mirrored_strategy = tf.distribute.MirroredStrategy() 176 | with mirrored_strategy.scope(): 177 | model = get_model() 178 | # This log path might change in the future. 
179 | log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs") 180 | tensorboard_callback = tf.keras.callbacks.TensorBoard( 181 | log_dir=log_dir, update_freq="batch" 182 | ) 183 | 184 | model.fit( 185 | train_dataset, 186 | steps_per_epoch=fn_args.train_steps, 187 | validation_data=eval_dataset, 188 | validation_steps=fn_args.eval_steps, 189 | callbacks=[tensorboard_callback], 190 | ) 191 | 192 | signatures = { 193 | "serving_default": _get_serve_tf_examples_fn( 194 | model, tf_transform_output 195 | ).get_concrete_function( 196 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples") 197 | ), 198 | } 199 | model.save( 200 | fn_args.serving_model_dir, save_format="tf", signatures=signatures 201 | ) 202 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/keras/model_test.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import pytest 21 | 22 | import tensorflow as tf 23 | 24 | from models.keras import model 25 | 26 | 27 | @pytest.mark.skip(reason="skip until way found to test with tf.hub") 28 | class ModelTest(tf.test.TestCase): 29 | def testBuildGetModel(self): 30 | built_model = model.get_model() 31 | self.assertEqual(len(built_model.layers), 17) 32 | 33 | 34 | if __name__ == "__main__": 35 | tf.test.main() 36 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/preprocessing.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TFX complaint prediction preprocessing. 16 | 17 | This file defines a template for TFX Transform component. 
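The `preprocessing_fn` below fills in missing values, one-hot encodes the
categorical features using vocabularies computed with
`tft.compute_and_apply_vocabulary`, bucketizes and one-hot encodes the
(partially anonymized) zip codes, and passes the complaint narrative text
through unchanged so that the model can embed it.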
18 | """ 19 | 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from typing import Union 24 | 25 | import tensorflow as tf 26 | import tensorflow_transform as tft 27 | 28 | from models import features 29 | 30 | 31 | def fill_in_missing(x: Union[tf.Tensor, tf.SparseTensor]) -> tf.Tensor: 32 | """Replace missing values in a SparseTensor. 33 | 34 | Fills in missing values of `x` with '' or 0, and converts to a 35 | dense tensor. 36 | 37 | Args: 38 | x: A `SparseTensor` of rank 2. Its dense shape should have 39 | size at most 1 in the second dimension. 40 | 41 | Returns: 42 | A rank 1 tensor where missing values of `x` have been filled in. 43 | """ 44 | if isinstance(x, tf.sparse.SparseTensor): 45 | default_value = "" if x.dtype == tf.string else 0 46 | x = tf.sparse.to_dense( 47 | tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]), 48 | default_value, 49 | ) 50 | return tf.squeeze(x, axis=1) 51 | 52 | 53 | def convert_num_to_one_hot( 54 | label_tensor: tf.Tensor, num_labels: int = 2 55 | ) -> tf.Tensor: 56 | """ 57 | Convert a label (0 or 1) into a one-hot vector 58 | Args: 59 | int: label_tensor (0 or 1) 60 | Returns 61 | label tensor 62 | """ 63 | one_hot_tensor = tf.one_hot(label_tensor, num_labels) 64 | return tf.reshape(one_hot_tensor, [-1, num_labels]) 65 | 66 | 67 | def convert_zip_code(zipcode: str) -> tf.float32: 68 | """ 69 | Convert a zipcode string to int64 representation. In the dataset the 70 | zipcodes are anonymized by repacing the last 3 digits to XXX. We are 71 | replacing those characters to 000 to simplify the bucketing later on. 72 | 73 | Args: 74 | str: zipcode 75 | Returns: 76 | zipcode: int64 77 | """ 78 | if zipcode == "": 79 | zipcode = "00000" 80 | zipcode = tf.strings.regex_replace(zipcode, r"X{0,5}", "0") 81 | zipcode = tf.strings.to_number(zipcode, out_type=tf.float32) 82 | return zipcode 83 | 84 | 85 | def preprocessing_fn(inputs: tf.Tensor) -> tf.Tensor: 86 | """tf.transform's callback function for preprocessing inputs. 87 | 88 | Args: 89 | inputs: map from feature keys to raw not-yet-transformed features. 90 | 91 | Returns: 92 | Map from string feature key to transformed feature operations. 93 | """ 94 | outputs = {} 95 | 96 | for key in features.ONE_HOT_FEATURES.keys(): 97 | dim = features.ONE_HOT_FEATURES[key] 98 | int_value = tft.compute_and_apply_vocabulary( 99 | fill_in_missing(inputs[key]), top_k=dim + 1 100 | ) 101 | outputs[features.transformed_name(key)] = convert_num_to_one_hot( 102 | int_value, num_labels=dim + 1 103 | ) 104 | 105 | for key, bucket_count in features.BUCKET_FEATURES.items(): 106 | temp_feature = tft.bucketize( 107 | convert_zip_code(fill_in_missing(inputs[key])), 108 | bucket_count, 109 | always_return_num_quantiles=False, 110 | ) 111 | outputs[features.transformed_name(key)] = convert_num_to_one_hot( 112 | temp_feature, num_labels=bucket_count + 1 113 | ) 114 | 115 | for key in features.TEXT_FEATURES.keys(): 116 | outputs[features.transformed_name(key)] = fill_in_missing(inputs[key]) 117 | 118 | outputs[features.transformed_name(features.LABEL_KEY)] = fill_in_missing( 119 | inputs[features.LABEL_KEY] 120 | ) 121 | 122 | return outputs 123 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/models/preprocessing_test.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import tensorflow as tf 21 | 22 | from models import preprocessing 23 | 24 | 25 | class PreprocessingTest(tf.test.TestCase): 26 | def testPreprocessingFn(self): 27 | self.assertTrue(callable(preprocessing.preprocessing_fn)) 28 | 29 | 30 | if __name__ == "__main__": 31 | tf.test.main() 32 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/pipeline/configs.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TFX complaint prediction template configurations. 16 | 17 | This file defines environments for a TFX complaint prediction pipeline. 18 | """ 19 | 20 | # Pipeline name will be used to identify this pipeline. 21 | PIPELINE_NAME = "complaint_prediction_pipeline" 22 | 23 | # GCP related configs. 24 | 25 | # Following code will retrieve your GCP project. You can choose which project 26 | # to use by setting GOOGLE_CLOUD_PROJECT environment variable. 
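# As the note above says, the automatic lookup below can be overridden by
# exporting the environment variable before this module is imported (a sketch;
# the project id is a placeholder):
#
#   import os
#   os.environ["GOOGLE_CLOUD_PROJECT"] = "my-gcp-project"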
27 | try: 28 | import google.auth # pylint: disable=g-import-not-at-top 29 | 30 | try: 31 | _, GOOGLE_CLOUD_PROJECT = google.auth.default() 32 | except google.auth.exceptions.DefaultCredentialsError: 33 | GOOGLE_CLOUD_PROJECT = "" 34 | except ImportError: 35 | GOOGLE_CLOUD_PROJECT = "" 36 | 37 | # Specify your GCS bucket name here. You have to use GCS to store output files 38 | # when running a pipeline with Kubeflow Pipeline on GCP or when running a job 39 | # using Dataflow. Default is '-kubeflowpipelines-default'. 40 | # This bucket is created automatically when you deploy KFP from marketplace. 41 | GCS_BUCKET_NAME = GOOGLE_CLOUD_PROJECT + "-kubeflowpipelines-default" 42 | 43 | # TODO(step 8,step 9): (Optional) Set your region to use GCP services including 44 | # BigQuery, Dataflow and Cloud AI Platform. 45 | # GOOGLE_CLOUD_REGION = '' # ex) 'us-central1' 46 | 47 | PREPROCESSING_FN = "models.preprocessing.preprocessing_fn" 48 | RUN_FN = "models.keras.model.run_fn" 49 | # NOTE: Uncomment below to use an estimator based model. 50 | # RUN_FN = 'models.estimator.model.run_fn' 51 | 52 | TRAIN_NUM_STEPS = 100 53 | EVAL_NUM_STEPS = 100 54 | 55 | # Change this value according to your use cases. 56 | EVAL_ACCURACY_THRESHOLD = 0.6 57 | 58 | # Beam args to run data processing on DataflowRunner. 59 | # TODO(step 8): (Optional) Uncomment below to use Dataflow. 60 | # DATAFLOW_BEAM_PIPELINE_ARGS = [ 61 | # '--project=' + GOOGLE_CLOUD_PROJECT, 62 | # '--runner=DataflowRunner', 63 | # '--temp_location=' + os.path.join('gs://', GCS_BUCKET_NAME, 'tmp'), 64 | # '--region=' + GOOGLE_CLOUD_REGION, 65 | # # TODO(tensorflow/tfx#1461) Remove `shuffle_mode` after default is changed. # noqa pylint: disable=g-bad-todo 66 | # '--experiments=shuffle_mode=auto', 67 | # # TODO(tensorflow/tfx#1459) Remove `disk_size_gb` after default is 68 | # # increased. # pylint: disable=g-bad-todo 69 | # '--disk_size_gb=50', 70 | # # If you are blocked by IP Address quota, using a bigger machine_type will 71 | # # reduce the number of needed IPs. 72 | # # '--machine_type=n1-standard-8', 73 | # ] 74 | 75 | # A dict which contains the training job parameters to be passed to Google 76 | # Cloud AI Platform. For the full set of parameters supported by Google 77 | # Cloud AI Platform, refer to 78 | # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#Job 79 | # TODO(step 9): (Optional) Uncomment below to use AI Platform training. 80 | # GCP_AI_PLATFORM_TRAINING_ARGS = { 81 | # 'project': GOOGLE_CLOUD_PROJECT, 82 | # 'region': GOOGLE_CLOUD_REGION, 83 | # # Starting from TFX 0.14, training on AI Platform uses custom containers: 84 | # # https://cloud.google.com/ml-engine/docs/containers-overview 85 | # # You can specify a custom container here. If not specified, TFX will use 86 | # # a public container image matching the installed version of TFX. 87 | # # TODO(step 9): (Optional) Set your container name below. 88 | # 'masterConfig': { 89 | # 'imageUri': 'gcr.io/' + GOOGLE_CLOUD_PROJECT + '/tfx-pipeline' 90 | # }, 91 | # # Note that if you do specify a custom container, ensure the entrypoint 92 | # # calls into TFX's run_executor script (tfx/scripts/run_executor.py) 93 | # } 94 | 95 | # A dict which contains the serving job parameters to be passed to Google 96 | # Cloud AI Platform. For the full set of parameters supported by Google Cloud AI 97 | # Platform, refer to 98 | # https://cloud.google.com/ml-engine/reference/rest/v1/projects.models 99 | # TODO(step 9): (Optional) Uncomment below to use AI Platform serving. 
100 | # GCP_AI_PLATFORM_SERVING_ARGS = { 101 | # 'model_name': PIPELINE_NAME, 102 | # 'project_id': GOOGLE_CLOUD_PROJECT, 103 | # # The region to use when serving the model. See available regions here: 104 | # # https://cloud.google.com/ml-engine/docs/regions 105 | # # Note that serving currently only supports a single region: 106 | # # https://cloud.google.com/ml-engine/reference/rest/v1/projects.models#Model # noqa pylint: disable=line-too-long 107 | # 'regions': [GOOGLE_CLOUD_REGION], 108 | # } 109 | -------------------------------------------------------------------------------- /chapters/appendix_c/tfx_template_example/pipeline/pipeline.py: -------------------------------------------------------------------------------- 1 | # Lint as: python2, python3 2 | # Copyright 2020 Google LLC. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """TFX complaint prediction template configurations. 16 | 17 | This file defines TFX pipeline and various components in the pipeline. 18 | """ 19 | 20 | from __future__ import absolute_import 21 | from __future__ import division 22 | from __future__ import print_function 23 | 24 | from typing import Optional, Text, List, Dict, Any 25 | import tensorflow_model_analysis as tfma 26 | 27 | from ml_metadata.proto import metadata_store_pb2 28 | from tfx.components import CsvExampleGen 29 | from tfx.components import Evaluator 30 | from tfx.components import ExampleValidator 31 | from tfx.components import Pusher 32 | from tfx.components import ResolverNode 33 | from tfx.components import SchemaGen 34 | from tfx.components import StatisticsGen 35 | from tfx.components import Trainer 36 | from tfx.components import Transform 37 | from tfx.components.base import executor_spec 38 | from tfx.components.trainer import executor as trainer_executor 39 | from tfx.dsl.experimental import latest_blessed_model_resolver 40 | from tfx.extensions.google_cloud_ai_platform.pusher import ( 41 | executor as ai_platform_pusher_executor, 42 | ) 43 | from tfx.extensions.google_cloud_ai_platform.trainer import ( 44 | executor as ai_platform_trainer_executor, 45 | ) 46 | from tfx.orchestration import pipeline 47 | from tfx.proto import pusher_pb2 48 | from tfx.proto import trainer_pb2 49 | from tfx.types import Channel 50 | from tfx.types.standard_artifacts import Model 51 | from tfx.types.standard_artifacts import ModelBlessing 52 | from tfx.utils.dsl_utils import external_input 53 | 54 | 55 | def create_pipeline( 56 | pipeline_name: Text, 57 | pipeline_root: Text, 58 | data_path: Text, 59 | preprocessing_fn: Text, 60 | run_fn: Text, 61 | train_args: trainer_pb2.TrainArgs, 62 | eval_args: trainer_pb2.EvalArgs, 63 | eval_accuracy_threshold: float, 64 | serving_model_dir: Text, 65 | metadata_connection_config: Optional[ 66 | metadata_store_pb2.ConnectionConfig 67 | ] = None, 68 | beam_pipeline_args: Optional[List[Text]] = None, 69 | ai_platform_training_args: Optional[Dict[Text, Text]] = None, 70 | ai_platform_serving_args: 
Optional[Dict[Text, Any]] = None, 71 | ) -> pipeline.Pipeline: 72 | """Implements the complaint prediction pipeline with TFX.""" 73 | 74 | components = [] 75 | 76 | # Brings data into the pipeline or otherwise joins/converts training data. 77 | example_gen = CsvExampleGen(input=external_input(data_path)) 78 | components.append(example_gen) 79 | 80 | # Computes statistics over data for visualization and example validation. 81 | statistics_gen = StatisticsGen(examples=example_gen.outputs["examples"]) 82 | components.append(statistics_gen) 83 | 84 | # Generates schema based on statistics files. 85 | schema_gen = SchemaGen( 86 | statistics=statistics_gen.outputs["statistics"], 87 | infer_feature_shape=True, 88 | ) 89 | components.append(schema_gen) 90 | 91 | # Performs anomaly detection based on statistics and data schema. 92 | example_validator = ExampleValidator( # pylint: disable=unused-variable 93 | statistics=statistics_gen.outputs["statistics"], 94 | schema=schema_gen.outputs["schema"], 95 | ) 96 | components.append(example_validator) 97 | 98 | # Performs transformations and feature engineering in training and serving. 99 | transform = Transform( 100 | examples=example_gen.outputs["examples"], 101 | schema=schema_gen.outputs["schema"], 102 | preprocessing_fn=preprocessing_fn, 103 | ) 104 | components.append(transform) 105 | 106 | # Uses user-provided Python function that implements a model using TF-Learn. 107 | trainer_args = { 108 | "run_fn": run_fn, 109 | "transformed_examples": transform.outputs["transformed_examples"], 110 | "schema": schema_gen.outputs["schema"], 111 | "transform_graph": transform.outputs["transform_graph"], 112 | "train_args": train_args, 113 | "eval_args": eval_args, 114 | "custom_executor_spec": executor_spec.ExecutorClassSpec( 115 | trainer_executor.GenericExecutor 116 | ), 117 | } 118 | if ai_platform_training_args is not None: 119 | trainer_args.update( 120 | { 121 | "custom_executor_spec": executor_spec.ExecutorClassSpec( 122 | ai_platform_trainer_executor.GenericExecutor 123 | ), 124 | "custom_config": { 125 | ai_platform_trainer_executor.TRAINING_ARGS_KEY: ai_platform_training_args, # noqa 126 | }, 127 | } 128 | ) 129 | trainer = Trainer(**trainer_args) 130 | components.append(trainer) 131 | 132 | # Get the latest blessed model for model validation. 133 | model_resolver = ResolverNode( 134 | instance_name="latest_blessed_model_resolver", 135 | resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver, 136 | model=Channel(type=Model), 137 | model_blessing=Channel(type=ModelBlessing), 138 | ) 139 | components.append(model_resolver) 140 | 141 | # Uses TFMA to compute a evaluation statistics over features of a model and 142 | # perform quality validation of a candidate model (compared to a baseline). 
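    # For intuition: with the template default EVAL_ACCURACY_THRESHOLD = 0.6 from
    # pipeline/configs.py, the value threshold below only blesses candidates whose
    # BinaryAccuracy is at least 0.6, and the change threshold (HIGHER_IS_BETTER,
    # absolute -1e-10) additionally requires the candidate not to score below the
    # blessed baseline. On the first run there is no baseline, so only the value
    # threshold applies.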
143 | eval_config = tfma.EvalConfig( 144 | model_specs=[tfma.ModelSpec(label_key="big_tipper")], 145 | slicing_specs=[tfma.SlicingSpec()], 146 | metrics_specs=[ 147 | tfma.MetricsSpec( 148 | metrics=[ 149 | tfma.MetricConfig( 150 | class_name="BinaryAccuracy", 151 | threshold=tfma.MetricThreshold( 152 | value_threshold=tfma.GenericValueThreshold( 153 | lower_bound={"value": eval_accuracy_threshold} 154 | ), 155 | change_threshold=tfma.GenericChangeThreshold( 156 | direction=tfma.MetricDirection.HIGHER_IS_BETTER, 157 | absolute={"value": -1e-10}, 158 | ), 159 | ), 160 | ) 161 | ] 162 | ) 163 | ], 164 | ) 165 | evaluator = Evaluator( 166 | examples=example_gen.outputs["examples"], 167 | model=trainer.outputs["model"], 168 | baseline_model=model_resolver.outputs["model"], 169 | # Change threshold will be ignored if there is no baseline (first run). 170 | eval_config=eval_config, 171 | ) 172 | components.append(evaluator) 173 | 174 | # Checks whether the model passed the validation steps and pushes the model 175 | # to a file destination if check passed. 176 | pusher_args = { 177 | "model": trainer.outputs["model"], 178 | "model_blessing": evaluator.outputs["blessing"], 179 | "push_destination": pusher_pb2.PushDestination( 180 | filesystem=pusher_pb2.PushDestination.Filesystem( 181 | base_directory=serving_model_dir 182 | ) 183 | ), 184 | } 185 | if ai_platform_serving_args is not None: 186 | pusher_args.update( 187 | { 188 | "custom_executor_spec": executor_spec.ExecutorClassSpec( 189 | ai_platform_pusher_executor.Executor 190 | ), 191 | "custom_config": { 192 | ai_platform_pusher_executor.SERVING_ARGS_KEY: ai_platform_serving_args # noqa 193 | }, 194 | } 195 | ) 196 | pusher = Pusher(**pusher_args) # pylint: disable=unused-variable 197 | components.append(pusher) 198 | 199 | return pipeline.Pipeline( 200 | pipeline_name=pipeline_name, 201 | pipeline_root=pipeline_root, 202 | components=components, 203 | enable_cache=True, 204 | metadata_connection_config=metadata_connection_config, 205 | beam_pipeline_args=beam_pipeline_args, 206 | ) 207 | -------------------------------------------------------------------------------- /chapters/data_ingestion/convert_data_to_tfrecords.py: -------------------------------------------------------------------------------- 1 | """ Example module to convert csv data to TFRecords 2 | """ 3 | 4 | import csv 5 | 6 | import tensorflow as tf 7 | from tqdm import tqdm 8 | 9 | 10 | def _bytes_feature(value): 11 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode()])) 12 | 13 | 14 | def _int64_feature(value): 15 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 16 | 17 | 18 | def clean_rows(row): 19 | if not row["zip_code"]: 20 | row["zip_code"] = "99999" 21 | return row 22 | 23 | 24 | def convert_zipcode_to_int(zipcode): 25 | if isinstance(zipcode, str) and "XX" in zipcode: 26 | zipcode = zipcode.replace("XX", "00") 27 | int_zipcode = int(zipcode) 28 | return int_zipcode 29 | 30 | 31 | original_data_file = "../../data/consumer_complaints_with_narrative.csv" 32 | tfrecords_filename = "consumer-complaints.tfrecords" 33 | tf_record_writer = tf.io.TFRecordWriter(tfrecords_filename) 34 | 35 | with open(original_data_file) as csv_file: 36 | reader = csv.DictReader(csv_file, delimiter=",", quotechar='"') 37 | for row in tqdm(reader): 38 | row = clean_rows(row) 39 | example = tf.train.Example( 40 | features=tf.train.Features( 41 | feature={ 42 | "product": _bytes_feature(row["product"]), 43 | "sub_product": 
_bytes_feature(row["sub_product"]), 44 | "issue": _bytes_feature(row["issue"]), 45 | "sub_issue": _bytes_feature(row["sub_issue"]), 46 | "state": _bytes_feature(row["state"]), 47 | "zip_code": _int64_feature(convert_zipcode_to_int(row["zip_code"])), 48 | "company": _bytes_feature(row["company"]), 49 | "company_response": _bytes_feature(row["company_response"]), 50 | "timely_response": _bytes_feature(row["timely_response"]), 51 | "consumer_disputed": _bytes_feature(row["consumer_disputed"]), 52 | } 53 | ) 54 | ) 55 | tf_record_writer.write(example.SerializeToString()) 56 | tf_record_writer.close() 57 | -------------------------------------------------------------------------------- /chapters/data_privacy/differential_privacy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Copy of complaints_model_dp_20Apr.ipynb", 7 | "provenance": [], 8 | "toc_visible": true 9 | }, 10 | "kernelspec": { 11 | "display_name": "Python 3", 12 | "language": "python", 13 | "name": "python3" 14 | }, 15 | "language_info": { 16 | "codemirror_mode": { 17 | "name": "ipython", 18 | "version": 3 19 | }, 20 | "file_extension": ".py", 21 | "mimetype": "text/x-python", 22 | "name": "python", 23 | "nbconvert_exporter": "python", 24 | "pygments_lexer": "ipython3", 25 | "version": "3.6.4" 26 | } 27 | }, 28 | "cells": [ 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "ke3Z3T-DPLMu", 33 | "colab_type": "text" 34 | }, 35 | "source": [ 36 | "# Chapter 14: Data Privacy" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "94oXR---PLMv", 43 | "colab_type": "text" 44 | }, 45 | "source": [ 46 | "### NOTE as of September 2020, tf-privacy relies on the updated Keras optimizer which will be part of the TensorFlow 2.4 release\n", 47 | "\n", 48 | "Until the release of a stable 2.4 version, this notebook requires the TensorFlow's nightly builds. Due to the unstable nature of the nightly builds, this notebook might fail intermittently." 
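A quick way to sanity-check the consumer-complaints.tfrecords file written by the conversion script above is to read a record back and parse it (a minimal sketch):

    import tensorflow as tf

    raw_dataset = tf.data.TFRecordDataset("consumer-complaints.tfrecords")
    for raw_record in raw_dataset.take(1):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())
        print(example.features.feature["product"])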
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "metadata": { 54 | "id": "V5GVnk0RPLMv", 55 | "colab_type": "code", 56 | "colab": { 57 | "base_uri": "https://localhost:8080/", 58 | "height": 943 59 | }, 60 | "outputId": "bf7cb24f-c6fd-41ee-e598-b314c3c3ffdd" 61 | }, 62 | "source": [ 63 | "!pip install tensorflow_privacy\n", 64 | "!pip install tf-nightly" 65 | ], 66 | "execution_count": null, 67 | "outputs": [ 68 | { 69 | "output_type": "stream", 70 | "text": [ 71 | "Requirement already satisfied: tensorflow_privacy in /usr/local/lib/python3.6/dist-packages (0.2.2)\n", 72 | "Requirement already satisfied: mpmath in /usr/local/lib/python3.6/dist-packages (from tensorflow_privacy) (1.1.0)\n", 73 | "Requirement already satisfied: scipy>=0.17 in /usr/local/lib/python3.6/dist-packages (from tensorflow_privacy) (1.4.1)\n", 74 | "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from scipy>=0.17->tensorflow_privacy) (1.18.5)\n", 75 | "Collecting tf-nightly\n", 76 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/51/2f/410f5153862dc461c8c1d1bafc0be6e5942eafaffc1764e71ce284b4034e/tf_nightly-2.4.0.dev20200909-cp36-cp36m-manylinux2010_x86_64.whl (389.9MB)\n", 77 | "\u001b[K |████████████████████████████████| 389.9MB 46kB/s \n", 78 | "\u001b[?25hRequirement already satisfied: keras-preprocessing<1.2,>=1.1.1 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (1.1.2)\n", 79 | "Requirement already satisfied: wheel>=0.26 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (0.35.1)\n", 80 | "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (1.15.0)\n", 81 | "Requirement already satisfied: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (0.8.1)\n", 82 | "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (3.12.4)\n", 83 | "Requirement already satisfied: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (1.12.1)\n", 84 | "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (3.3.0)\n", 85 | "Requirement already satisfied: typing-extensions>=3.7.4.2 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (3.7.4.3)\n", 86 | "Requirement already satisfied: google-pasta>=0.1.8 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (0.2.0)\n", 87 | "Requirement already satisfied: gast==0.3.3 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (0.3.3)\n", 88 | "Requirement already satisfied: astunparse==1.6.3 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (1.6.3)\n", 89 | "Requirement already satisfied: numpy<1.19.0,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (1.18.5)\n", 90 | "Collecting tb-nightly<3.0.0a0,>=2.4.0a0\n", 91 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/30/8f/8195d11bc8e6e1945fab68f85ced31f8ff60f88d856867dd310c31b34c22/tb_nightly-2.4.0a20200909-py3-none-any.whl (9.2MB)\n", 92 | "\u001b[K |████████████████████████████████| 9.2MB 51.0MB/s \n", 93 | "\u001b[?25hCollecting tf-estimator-nightly\n", 94 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a2/4c/b8c1af2d1a2a8e0ca7b07287e2be948addf7e3884d022e62e37e72232dea/tf_estimator_nightly-2.4.0.dev2020090901-py2.py3-none-any.whl (460kB)\n", 95 | "\u001b[K |████████████████████████████████| 460kB 57.1MB/s \n", 96 | "\u001b[?25hCollecting flatbuffers>=1.12\n", 97 | " Downloading 
https://files.pythonhosted.org/packages/eb/26/712e578c5f14e26ae3314c39a1bdc4eb2ec2f4ddc89b708cf8e0a0d20423/flatbuffers-1.12-py2.py3-none-any.whl\n", 98 | "Requirement already satisfied: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (1.31.0)\n", 99 | "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (1.1.0)\n", 100 | "Requirement already satisfied: h5py<2.11.0,>=2.10.0 in /usr/local/lib/python3.6/dist-packages (from tf-nightly) (2.10.0)\n", 101 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.9.2->tf-nightly) (49.6.0)\n", 102 | "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (3.2.2)\n", 103 | "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (2.23.0)\n", 104 | "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (0.4.1)\n", 105 | "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (1.0.1)\n", 106 | "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (1.17.2)\n", 107 | "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (1.7.0)\n", 108 | "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from markdown>=2.6.8->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (1.7.0)\n", 109 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (2020.6.20)\n", 110 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (1.24.3)\n", 111 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (3.0.4)\n", 112 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (2.10)\n", 113 | "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (1.3.0)\n", 114 | "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (4.1.1)\n", 115 | "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (0.2.8)\n", 116 | "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (4.6)\n", 117 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < 
\"3.8\"->markdown>=2.6.8->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (3.1.0)\n", 118 | "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (3.1.0)\n", 119 | "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tb-nightly<3.0.0a0,>=2.4.0a0->tf-nightly) (0.4.8)\n", 120 | "Installing collected packages: tb-nightly, tf-estimator-nightly, flatbuffers, tf-nightly\n", 121 | "Successfully installed flatbuffers-1.12 tb-nightly-2.4.0a20200909 tf-estimator-nightly-2.4.0.dev2020090901 tf-nightly-2.4.0.dev20200909\n" 122 | ], 123 | "name": "stdout" 124 | } 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "metadata": { 130 | "colab_type": "code", 131 | "id": "4N10jj3R47Sj", 132 | "colab": {} 133 | }, 134 | "source": [ 135 | "import tensorflow as tf\n", 136 | "import tensorflow_hub as hub\n", 137 | "import pandas as pd\n", 138 | "import numpy as np\n", 139 | "import os" 140 | ], 141 | "execution_count": null, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "metadata": { 147 | "id": "JvieVVCNPLM0", 148 | "colab_type": "code", 149 | "colab": { 150 | "base_uri": "https://localhost:8080/", 151 | "height": 34 152 | }, 153 | "outputId": "6682e680-6222-4bb2-93bf-f3fe19546f76" 154 | }, 155 | "source": [ 156 | "\n", 157 | "from pathlib import Path\n", 158 | "\n", 159 | "repo_dir = Path.cwd().parents[1]\n", 160 | "data_file_path = os.path.join(repo_dir, 'data/consumer_complaints_with_narrative.csv')\n", 161 | "print(data_file_path)" 162 | ], 163 | "execution_count": null, 164 | "outputs": [ 165 | { 166 | "output_type": "stream", 167 | "text": [ 168 | "/content/data/reduced_consumer_complaints_with_narrative.csv\n" 169 | ], 170 | "name": "stdout" 171 | } 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "metadata": { 177 | "id": "pakq9qGqQXdn", 178 | "colab_type": "code", 179 | "colab": { 180 | "base_uri": "https://localhost:8080/", 181 | "height": 34 182 | }, 183 | "outputId": "0e6c2168-683d-4884-bc05-89db9a4a863e" 184 | }, 185 | "source": [ 186 | "!ls /content/" 187 | ], 188 | "execution_count": null, 189 | "outputs": [ 190 | { 191 | "output_type": "stream", 192 | "text": [ 193 | "data sample_data\n" 194 | ], 195 | "name": "stdout" 196 | } 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "id": "Ep69FHWVPLM3", 203 | "colab_type": "text" 204 | }, 205 | "source": [ 206 | "## Feature engineering" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "colab_type": "code", 213 | "id": "5lDj1OJCWfTy", 214 | "colab": {} 215 | }, 216 | "source": [ 217 | "ONE_HOT_FEATURES = {\n", 218 | " \"product\": None,\n", 219 | " \"sub_product\": None,\n", 220 | " \"company_response\": None, \n", 221 | " \"state\": None,\n", 222 | " \"issue\": None\n", 223 | "}\n", 224 | "\n", 225 | "# feature name, bucket count\n", 226 | "BUCKET_FEATURES = {\n", 227 | " \"zip_code\": 10\n", 228 | "}\n", 229 | "\n", 230 | "# feature name, value is unused\n", 231 | "TEXT_FEATURES = {\n", 232 | " \"consumer_complaint_narrative\": None\n", 233 | "}" 234 | ], 235 | "execution_count": null, 236 | "outputs": [] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "metadata": { 241 | "colab_type": "code", 242 | "id": "3o-EurrDk7Fi", 243 | "colab": {} 244 | }, 245 | "source": [ 246 | "feature_names = [\"product\", \"sub_product\", 
\"issue\", \"sub_issue\", \"consumer_complaint_narrative\", \"company\", \"state\", \"zip_code\", \"company_response\", \"timely_response\", \"consumer_disputed\"]\n", 247 | "df = pd.read_csv(data_file_path, usecols=feature_names)" 248 | ], 249 | "execution_count": null, 250 | "outputs": [] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "metadata": { 255 | "colab_type": "code", 256 | "id": "nH5SAgmOOL6K", 257 | "colab": {} 258 | }, 259 | "source": [ 260 | "def make_one_hot(df):\n", 261 | " one_hot_array = []\n", 262 | " for feature_name in ONE_HOT_FEATURES.keys():\n", 263 | " temp_array = pd.np.asarray(tf.keras.utils.to_categorical(df[feature_name].values))\n", 264 | " ONE_HOT_FEATURES[feature_name] = temp_array.shape[1]\n", 265 | " one_hot_array.append(temp_array)\n", 266 | "\n", 267 | " return one_hot_array" 268 | ], 269 | "execution_count": null, 270 | "outputs": [] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "metadata": { 275 | "colab_type": "code", 276 | "id": "jxbSJIw3lDOj", 277 | "colab": { 278 | "base_uri": "https://localhost:8080/", 279 | "height": 107 280 | }, 281 | "outputId": "a889dfe6-98f5-4d6e-823f-08aef87b3ec2" 282 | }, 283 | "source": [ 284 | "for feature in ONE_HOT_FEATURES.keys():\n", 285 | " df[feature] = df[feature].astype(\"category\").cat.codes\n", 286 | "\n", 287 | "one_hot_x = make_one_hot(df)\n", 288 | "\n", 289 | "embedding_x = [pd.np.asarray(df[feature_name].values).reshape(-1) for feature_name in TEXT_FEATURES.keys()]\n", 290 | "\n", 291 | "df['zip_code'] = df['zip_code'].str.replace('X', '0', regex=True)\n", 292 | "df['zip_code'] = df['zip_code'].str.replace(r'\\[|\\*|\\+|\\-|`|\\.|\\ |\\$|\\/|!|\\(', '0', regex=True)\n", 293 | "df['zip_code'] = df['zip_code'].fillna(0)\n", 294 | "df['zip_code'] = df['zip_code'].astype('int32')\n", 295 | "# one bucket per 10k\n", 296 | "df['zip_code'] = df['zip_code'].apply(lambda x: x//10000)\n", 297 | "numeric_x = [df['zip_code'].values]\n", 298 | "\n", 299 | "X = one_hot_x + numeric_x + embedding_x\n", 300 | "y = np.asarray(df[\"consumer_disputed\"], dtype=np.uint8).reshape(-1)" 301 | ], 302 | "execution_count": null, 303 | "outputs": [ 304 | { 305 | "output_type": "stream", 306 | "text": [ 307 | "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:4: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead\n", 308 | " after removing the cwd from sys.path.\n", 309 | "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:6: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. 
Import numpy directly instead\n", 310 | " \n" 311 | ], 312 | "name": "stderr" 313 | } 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": { 319 | "colab_type": "text", 320 | "id": "t9Eo3zrCVRPm" 321 | }, 322 | "source": [ 323 | "## Adding DP" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "metadata": { 329 | "colab_type": "code", 330 | "id": "Yto8Cmn7VErQ", 331 | "colab": {} 332 | }, 333 | "source": [ 334 | "# DP parameters\n", 335 | "NOISE_MULTIPLIER = 1.1\n", 336 | "NUM_MICROBATCHES = 32\n", 337 | "LEARNING_RATE = 0.1\n", 338 | "POPULATION_SIZE = 1000\n", 339 | "L2_NORM_CLIP = 1.0\n", 340 | "BATCH_SIZE = 32 \n", 341 | "EPOCHS = 1" 342 | ], 343 | "execution_count": null, 344 | "outputs": [] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "colab_type": "code", 350 | "id": "u0JJ_EnmVTk6", 351 | "colab": {} 352 | }, 353 | "source": [ 354 | "from tensorflow_privacy.privacy.optimizers.dp_optimizer import DPGradientDescentGaussianOptimizer\n", 355 | "\n", 356 | "optimizer = DPGradientDescentGaussianOptimizer(\n", 357 | " l2_norm_clip=L2_NORM_CLIP,\n", 358 | " noise_multiplier=NOISE_MULTIPLIER,\n", 359 | " num_microbatches=NUM_MICROBATCHES,\n", 360 | " learning_rate=LEARNING_RATE)\n", 361 | " \n", 362 | "loss = tf.keras.losses.BinaryCrossentropy(\n", 363 | " from_logits=True, reduction=tf.losses.Reduction.NONE)" 364 | ], 365 | "execution_count": null, 366 | "outputs": [] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": { 371 | "colab_type": "text", 372 | "id": "LoQHOGsh5Anr" 373 | }, 374 | "source": [ 375 | "The model is unchanged, we just pass in the differentially private optimizer and loss." 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "metadata": { 381 | "colab_type": "code", 382 | "id": "JZ7Z1LHd4-kb", 383 | "colab": {} 384 | }, 385 | "source": [ 386 | "def transformed_name(key):\n", 387 | " return key + '_xf'\n", 388 | "\n", 389 | "def get_model(dp_optimizer, dp_loss, show_summary=True):\n", 390 | " \"\"\"\n", 391 | " This function defines a Keras model and returns the model as a Keras object.\n", 392 | " \"\"\"\n", 393 | " \n", 394 | " # one-hot categorical features\n", 395 | " input_features = []\n", 396 | " for key, dim in ONE_HOT_FEATURES.items():\n", 397 | " input_features.append(tf.keras.Input(shape=(dim), name=transformed_name(key)))\n", 398 | "\n", 399 | " # adding bucketized features \n", 400 | " for key, dim in BUCKET_FEATURES.items():\n", 401 | " input_features.append(tf.keras.Input(1, name=transformed_name(key)))\n", 402 | "\n", 403 | " # adding text input features\n", 404 | " input_texts = []\n", 405 | " for key in TEXT_FEATURES.keys():\n", 406 | " input_texts.append(tf.keras.Input(shape=(1,), name=transformed_name(key), dtype=tf.string))\n", 407 | "\n", 408 | " # embed text features\n", 409 | " MODULE_URL = \"https://tfhub.dev/google/universal-sentence-encoder/4\"\n", 410 | " embed = hub.KerasLayer(MODULE_URL)\n", 411 | " reshaped_narrative = tf.reshape(input_texts[0], [-1])\n", 412 | " embed_narrative = embed(reshaped_narrative) \n", 413 | " deep_ff = tf.keras.layers.Reshape((512, ), input_shape=(1, 512))(embed_narrative)\n", 414 | " \n", 415 | " deep = tf.keras.layers.Dense(256, activation='relu')(deep_ff)\n", 416 | " deep = tf.keras.layers.Dense(64, activation='relu')(deep)\n", 417 | " deep = tf.keras.layers.Dense(16, activation='relu')(deep)\n", 418 | "\n", 419 | " wide_ff = tf.keras.layers.concatenate(input_features)\n", 420 | " wide = tf.keras.layers.Dense(16, 
activation='relu')(wide_ff)\n", 421 | "\n", 422 | " both = tf.keras.layers.concatenate([deep, wide])\n", 423 | "\n", 424 | " output = tf.keras.layers.Dense(1, activation='sigmoid')(both) \n", 425 | "\n", 426 | " inputs = input_features + input_texts\n", 427 | "\n", 428 | " keras_model = tf.keras.models.Model(inputs, output)\n", 429 | " keras_model.compile(optimizer=dp_optimizer,\n", 430 | " loss=dp_loss, \n", 431 | " metrics=[\n", 432 | " tf.keras.metrics.BinaryAccuracy(),\n", 433 | " tf.keras.metrics.TruePositives()\n", 434 | " ])\n", 435 | " if show_summary:\n", 436 | " keras_model.summary()\n", 437 | "\n", 438 | " return keras_model" 439 | ], 440 | "execution_count": null, 441 | "outputs": [] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "metadata": { 446 | "colab_type": "code", 447 | "id": "Y4TTGI9glD_M", 448 | "colab": {} 449 | }, 450 | "source": [ 451 | "model = get_model(show_summary=False, dp_optimizer=optimizer, dp_loss=loss)" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "metadata": { 459 | "colab_type": "code", 460 | "id": "yAmaGolZl4cX", 461 | "colab": { 462 | "base_uri": "https://localhost:8080/", 463 | "height": 72 464 | }, 465 | "outputId": "f5161960-ada8-4717-bf8d-cfa0551ad358" 466 | }, 467 | "source": [ 468 | "model.fit(x=X, y=y, batch_size=32, validation_split=0.1, epochs=EPOCHS)" 469 | ], 470 | "execution_count": null, 471 | "outputs": [ 472 | { 473 | "output_type": "stream", 474 | "text": [ 475 | "29/29 [==============================] - 4s 67ms/step - loss: 0.7254 - binary_accuracy: 0.7596 - true_positives: 0.0000e+00 - val_loss: 0.6942 - val_binary_accuracy: 0.7400 - val_true_positives: 0.0000e+00\n" 476 | ], 477 | "name": "stdout" 478 | }, 479 | { 480 | "output_type": "execute_result", 481 | "data": { 482 | "text/plain": [ 483 | "" 484 | ] 485 | }, 486 | "metadata": { 487 | "tags": [] 488 | }, 489 | "execution_count": 24 490 | } 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": { 496 | "colab_type": "text", 497 | "id": "P1gtS5tFfZau" 498 | }, 499 | "source": [ 500 | "### Calculate Epsilon" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "metadata": { 506 | "colab_type": "code", 507 | "id": "q6u5MIUkMrpS", 508 | "colab": { 509 | "base_uri": "https://localhost:8080/", 510 | "height": 70 511 | }, 512 | "outputId": "68bf00b6-1bbc-4529-82a9-50479772522c" 513 | }, 514 | "source": [ 515 | "from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy\n", 516 | "\n", 517 | "\n", 518 | "compute_dp_sgd_privacy.compute_dp_sgd_privacy(n=POPULATION_SIZE, \n", 519 | " batch_size=BATCH_SIZE, \n", 520 | " noise_multiplier=NOISE_MULTIPLIER, \n", 521 | " epochs=EPOCHS, \n", 522 | " delta=1e-3)" 523 | ], 524 | "execution_count": null, 525 | "outputs": [ 526 | { 527 | "output_type": "stream", 528 | "text": [ 529 | "DP-SGD with sampling rate = 3.2% and noise_multiplier = 1.1 iterated over 32 steps satisfies differential privacy with eps = 1.38 and delta = 0.001.\n", 530 | "The optimal RDP order is 7.0.\n" 531 | ], 532 | "name": "stdout" 533 | }, 534 | { 535 | "output_type": "execute_result", 536 | "data": { 537 | "text/plain": [ 538 | "(1.3845887532963042, 7.0)" 539 | ] 540 | }, 541 | "metadata": { 542 | "tags": [] 543 | }, 544 | "execution_count": 25 545 | } 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "metadata": { 551 | "colab_type": "code", 552 | "id": "gBzK9bK1gBab", 553 | "colab": {} 554 | }, 555 | "source": [ 556 | "" 557 | ], 558 | "execution_count": null, 
559 | "outputs": [] 560 | } 561 | ] 562 | } -------------------------------------------------------------------------------- /chapters/intro_tfx/Apache_beam_example_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "oKvOFVJLu3tf" 8 | }, 9 | "source": [ 10 | "# Apache Beam Word Count Example\n", 11 | "\n", 12 | "The example is adopted from https://beam.apache.org/get-started/wordcount-example/ for Google Colab\n", 13 | "\n", 14 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/Building-ML-Pipelines/building-machine-learning-pipelines/blob/master/chapters/intro_tfx/Apache_beam_example_notebook.ipynb)\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "colab": {}, 22 | "colab_type": "code", 23 | "id": "rAh3av_ovF2y" 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# %pip install -q apache_beam[gcp]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "colab": {}, 35 | "colab_type": "code", 36 | "id": "m9BExktyu2yM" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import re\n", 41 | "\n", 42 | "import apache_beam as beam\n", 43 | "from apache_beam.options.pipeline_options import PipelineOptions\n", 44 | "\n", 45 | "\n", 46 | "input_file = \"gs://dataflow-samples/shakespeare/kinglear.txt\"\n", 47 | "output_file = \"output.txt\"\n", 48 | "\n", 49 | "pipeline_options = PipelineOptions()\n", 50 | "\n", 51 | "\n", 52 | "with beam.Pipeline(options=pipeline_options) as p: \n", 53 | "\n", 54 | " # Read the text file[pattern] into a PCollection.\n", 55 | " lines = p | beam.io.ReadFromText(input_file)\n", 56 | "\n", 57 | " # Count the occurrences of each word.\n", 58 | " counts = (\n", 59 | " lines\n", 60 | " | 'Split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\\']+', x)))\n", 61 | " # .with_output_types(unicode))\n", 62 | " | 'PairWithOne' >> beam.Map(lambda x: (x, 1))\n", 63 | " | 'GroupAndSum' >> beam.CombinePerKey(sum))\n", 64 | "\n", 65 | " # Format the counts into a PCollection of strings.\n", 66 | " def format_result(word_count):\n", 67 | " (word, count) = word_count\n", 68 | " return f\"{word}: {count}\"\n", 69 | "\n", 70 | " output = counts | 'Format' >> beam.Map(format_result)\n", 71 | "\n", 72 | " # Write the output using a \"Write\" transform that has side effects.\n", 73 | " output | beam.io.WriteToText(output_file)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "colab": { 81 | "base_uri": "https://localhost:8080/", 82 | "height": 187 83 | }, 84 | "colab_type": "code", 85 | "id": "-xbljzhfvJVH", 86 | "outputId": "fd19137c-52e4-4442-b597-b0a23a2bf3bb" 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "!head output.txt*" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "colab": {}, 98 | "colab_type": "code", 99 | "id": "U0skCBIkwT9m" 100 | }, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "colab": { 107 | "collapsed_sections": [], 108 | "name": "Apache_beam_example_notebook.ipynb", 109 | "provenance": [], 110 | "toc_visible": true 111 | }, 112 | "kernelspec": { 113 | "display_name": "Python 3", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | 
"file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.8.12" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 0 131 | } 132 | -------------------------------------------------------------------------------- /chapters/model_analysis/model_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Analyzing your model with TensorFlow Model Analysis and the What-If Tool" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### NB This only works in a Jupyter Notebook, NOT Jupyter Lab.\n", 15 | "Lab extensions have not been released for TFMA and the What-If Tool." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import tensorflow_model_analysis as tfma\n", 25 | "import tensorflow as tf\n", 26 | "\n", 27 | "import sys\n", 28 | "import os\n", 29 | "\n", 30 | "# stop tf warnings \n", 31 | "import logging\n", 32 | "logger = tf.get_logger()\n", 33 | "logger.setLevel(logging.ERROR)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "You will need a trained model and an evaluation dataset (TFRecords) as produced by the earlier steps in the pipeline." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "_EVAL_DATA_FILE = 'data_tfrecord-00000-of-00001'\n", 50 | "_MODEL_DIR = 'serving_model_dir_2000_steps/'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## TFMA" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "eval_shared_model = tfma.default_eval_shared_model(\n", 67 | " eval_saved_model_path=_MODEL_DIR, tags=[tf.saved_model.SERVING])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "slices = [tfma.slicer.SingleSliceSpec(),\n", 77 | " tfma.slicer.SingleSliceSpec(columns=['product'])]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "eval_config=tfma.EvalConfig(\n", 87 | " model_specs=[tfma.ModelSpec(label_key='consumer_disputed')],\n", 88 | " slicing_specs=[tfma.SlicingSpec(), tfma.SlicingSpec(feature_keys=['product'])],\n", 89 | " metrics_specs=[\n", 90 | " tfma.MetricsSpec(metrics=[\n", 91 | " tfma.MetricConfig(class_name='BinaryAccuracy'),\n", 92 | " tfma.MetricConfig(class_name='ExampleCount'),\n", 93 | " tfma.MetricConfig(class_name='FalsePositives'),\n", 94 | " tfma.MetricConfig(class_name='TruePositives'),\n", 95 | " tfma.MetricConfig(class_name='FalseNegatives'),\n", 96 | " tfma.MetricConfig(class_name='TrueNegatives')\n", 97 | " ])])" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "eval_result = tfma.run_model_analysis(\n", 107 | " eval_shared_model=eval_shared_model,\n", 108 | " eval_config=eval_config,\n", 109 | " data_location=_EVAL_DATA_FILE,\n", 110 | " output_path=\"./eval_result_2000_steps\",\n", 111 | " file_format='tfrecords',\n", 112 | " 
slice_spec = slices)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "# may take 2 goes\n", 122 | "tfma.view.render_slicing_metrics(eval_result)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "scrolled": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "tfma.view.render_slicing_metrics(eval_result, slicing_spec=slices[1])" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Compare 2 models" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "scrolled": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "eval_shared_model_2 = tfma.default_eval_shared_model(\n", 152 | " eval_saved_model_path='serving_model_dir_150_steps/', tags=[tf.saved_model.SERVING])\n", 153 | "\n", 154 | "eval_result_2 = tfma.run_model_analysis(\n", 155 | " eval_shared_model=eval_shared_model_2,\n", 156 | " eval_config=eval_config,\n", 157 | " data_location=_EVAL_DATA_FILE,\n", 158 | " output_path=\"./eval_result_150_steps\",\n", 159 | " file_format='tfrecords',\n", 160 | " slice_spec = slices)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "tfma.view.render_slicing_metrics(eval_result_2)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "eval_results_from_disk = tfma.load_eval_results(\n", 179 | " ['./eval_result_2000_steps','./eval_result_150_steps'], tfma.constants.MODEL_CENTRIC_MODE)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# bug - only works reliably in Colab\n", 189 | "tfma.view.render_time_series(eval_results_from_disk, slices[0])\n" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Validating against thresholds" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "eval_config_threshold=tfma.EvalConfig(\n", 206 | " model_specs=[tfma.ModelSpec(label_key='consumer_disputed')],\n", 207 | " slicing_specs=[tfma.SlicingSpec(), tfma.SlicingSpec(feature_keys=['product'])],\n", 208 | " metrics_specs=[\n", 209 | " tfma.MetricsSpec(metrics=[\n", 210 | " tfma.MetricConfig(class_name='BinaryAccuracy'),\n", 211 | " tfma.MetricConfig(class_name='ExampleCount'),\n", 212 | " tfma.MetricConfig(class_name='AUC')\n", 213 | " ],\n", 214 | " thresholds={\n", 215 | " 'AUC':\n", 216 | " tfma.config.MetricThreshold(\n", 217 | " value_threshold=tfma.GenericValueThreshold(\n", 218 | " lower_bound={'value': 0.5}))}\n", 219 | " )])" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "scrolled": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "eval_shared_models = [\n", 231 | " tfma.default_eval_shared_model(\n", 232 | " model_name='candidate', # must have this exact name\n", 233 | " eval_saved_model_path='serving_model_dir_150_steps/', tags=[tf.saved_model.SERVING]),\n", 234 | " tfma.default_eval_shared_model(\n", 235 | " model_name='baseline', # must have this exact name\n", 236 | " 
eval_saved_model_path='serving_model_dir_2000_steps/', tags=[tf.saved_model.SERVING]),\n", 237 | "]\n", 238 | "\n", 239 | "eval_result = tfma.run_model_analysis(\n", 240 | " eval_shared_models,\n", 241 | " eval_config=eval_config_threshold,\n", 242 | " data_location=_EVAL_DATA_FILE,\n", 243 | " output_path=\"./eval_threshold\",slice_spec = slices)\n", 244 | "\n" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "tfma.load_validation_result('./eval_threshold')" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "tfma.view.render_slicing_metrics(eval_result)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Fairness indicators" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "# https://github.com/tensorflow/tensorboard/blob/master/docs/fairness-indicators.md\n", 279 | "# needs environment without WIT,but with TF2.x, TFX\n", 280 | "!pip install tensorboard_plugin_fairness_indicators" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "eval_config_fairness=tfma.EvalConfig(\n", 290 | " model_specs=[tfma.ModelSpec(label_key='consumer_disputed')],\n", 291 | " slicing_specs=[tfma.SlicingSpec(), tfma.SlicingSpec(feature_keys=['product'])],\n", 292 | " metrics_specs=[\n", 293 | " tfma.MetricsSpec(metrics=[\n", 294 | " tfma.MetricConfig(class_name='BinaryAccuracy'),\n", 295 | " tfma.MetricConfig(class_name='ExampleCount'),\n", 296 | " tfma.MetricConfig(class_name='FalsePositives'),\n", 297 | " tfma.MetricConfig(class_name='TruePositives'),\n", 298 | " tfma.MetricConfig(class_name='FalseNegatives'),\n", 299 | " tfma.MetricConfig(class_name='TrueNegatives'),\n", 300 | " tfma.MetricConfig(class_name='FairnessIndicators', config='{\"thresholds\":[0.25, 0.5, 0.75]}')\n", 301 | " ])])" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "eval_result = tfma.run_model_analysis(\n", 311 | " eval_shared_model=eval_shared_model,\n", 312 | " eval_config=eval_config_fairness,\n", 313 | " data_location=_EVAL_DATA_FILE,\n", 314 | " output_path=\"./eval_result_fairness\",\n", 315 | " file_format='tfrecords',\n", 316 | " slice_spec = slices)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "from tensorboard_plugin_fairness_indicators import summary_v2" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "writer = tf.summary.create_file_writer('./fairness_indicator_logs')\n", 335 | "with writer.as_default():\n", 336 | " summary_v2.FairnessIndicators('./eval_result_fairness', step=1)\n", 337 | "writer.close()" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "%load_ext tensorboard\n", 347 | "%tensorboard --logdir=./fairness_indicator_logs" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## The What-If Tool" 355 | ] 356 | }, 
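The threshold run earlier in this notebook can also be checked programmatically: tfma.load_validation_result returns a ValidationResult proto whose validation_ok field reports whether every configured threshold (here, AUC >= 0.5) was met, which is the same signal the pipeline's Evaluator uses when blessing a model. A short sketch:

    validation = tfma.load_validation_result('./eval_threshold')
    print(validation.validation_ok)  # True only if all thresholds passed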
357 | { 358 | "cell_type": "code", 359 | "execution_count": 1, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "from witwidget.notebook.visualization import WitConfigBuilder\n", 364 | "from witwidget.notebook.visualization import WitWidget" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 5, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "eval_data = tf.data.TFRecordDataset(_EVAL_DATA_FILE)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 6, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "eval_examples = [tf.train.Example.FromString(d.numpy()) for d in eval_data.take(1000)]" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 7, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "model = tf.saved_model.load(export_dir=_MODEL_DIR)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 8, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "def predict(examples):\n", 401 | " preds = model.signatures['serving_default'](examples=tf.constant([example.SerializeToString() for example in examples]))\n", 402 | " return preds['outputs'].numpy()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 9, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "config_builder = WitConfigBuilder(eval_examples).set_custom_predict_fn(predict)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 10, 417 | "metadata": { 418 | "scrolled": false 419 | }, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/html": [ 424 | "" 425 | ], 426 | "text/plain": [ 427 | "" 428 | ] 429 | }, 430 | "metadata": {}, 431 | "output_type": "display_data" 432 | }, 433 | { 434 | "data": { 435 | "application/vnd.jupyter.widget-view+json": { 436 | "model_id": "6bd82c30c5fb4e49b77e955430ee9a40", 437 | "version_major": 2, 438 | "version_minor": 0 439 | }, 440 | "text/plain": [ 441 | "WitWidget(config={'model_type': 'classification', 'label_vocab': [], 'are_sequence_examples': False, 'inferenc…" 442 | ] 443 | }, 444 | "metadata": {}, 445 | "output_type": "display_data" 446 | } 447 | ], 448 | "source": [ 449 | "WitWidget(config_builder)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "### Debugging" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "!pip install witwidget" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "# works with >2.1\n", 475 | "!pip show tensorflow" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "# works with >0.21.3\n", 485 | "!pip show tensorflow_model_analysis" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "# works with >1.6.0\n", 495 | "!pip show witwidget" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 3, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "Installing /Users/i854694/.virtualenvs/bmlp2/lib/python3.7/site-packages/witwidget/static -> wit-widget\n", 508 | "- Validating: 
\u001b[32mOK\u001b[0m\n", 509 | "\n", 510 | " To initialize this nbextension in the browser every time the notebook (or other app) loads:\n", 511 | " \n", 512 | " jupyter nbextension enable witwidget --py --sys-prefix\n", 513 | " \n", 514 | "Enabling notebook extension wit-widget/extension...\n", 515 | " - Validating: \u001b[32mOK\u001b[0m\n" 516 | ] 517 | } 518 | ], 519 | "source": [ 520 | "# may need to run this every time\n", 521 | "!jupyter nbextension install --py --symlink --sys-prefix witwidget\n", 522 | "\n", 523 | "!jupyter nbextension enable witwidget --py --sys-prefix \n", 524 | "\n", 525 | "# then refresh browser page" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [ 534 | "# may need to run this every time\n", 535 | "\n", 536 | "!jupyter nbextension enable --py widgetsnbextension --sys-prefix\n", 537 | " \n", 538 | "!jupyter nbextension install --py --symlink tensorflow_model_analysis --sys-prefix\n", 539 | " \n", 540 | "!jupyter nbextension enable --py tensorflow_model_analysis --sys-prefix\n", 541 | "\n", 542 | "# then refresh browser page" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "!pip install widgetsnbextension" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "!pip install -U ipywidgets" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "!pip install jupyter_nbextensions_configurator" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "!jupyter nbextension list" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "!jupyter serverextension list" 588 | ] 589 | } 590 | ], 591 | "metadata": { 592 | "kernelspec": { 593 | "display_name": "Python 3", 594 | "language": "python", 595 | "name": "python3" 596 | } 597 | }, 598 | "nbformat": 4, 599 | "nbformat_minor": 4 600 | } 601 | -------------------------------------------------------------------------------- /components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Building-ML-Pipelines/building-machine-learning-pipelines/8862436d291a330d772dc59e104c4ba0a6d64b5a/components/__init__.py -------------------------------------------------------------------------------- /components/keras_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tensorflow as tf 4 | import tensorflow_hub as hub 5 | import tensorflow_transform as tft 6 | 7 | from transform import ( 8 | BUCKET_FEATURES, 9 | LABEL_KEY, 10 | ONE_HOT_FEATURES, 11 | TEXT_FEATURES, 12 | transformed_name, 13 | ) 14 | 15 | 16 | def get_model(show_summary=True): 17 | """ 18 | This function defines a Keras model and returns the model as a 19 | Keras object. 
20 | """ 21 | 22 | # one-hot categorical features 23 | input_features = [] 24 | for key, dim in ONE_HOT_FEATURES.items(): 25 | input_features.append( 26 | tf.keras.Input(shape=(dim + 1,), name=transformed_name(key)) 27 | ) 28 | 29 | # adding bucketized features 30 | for key, dim in BUCKET_FEATURES.items(): 31 | input_features.append( 32 | tf.keras.Input(shape=(dim + 1,), name=transformed_name(key)) 33 | ) 34 | 35 | # adding text input features 36 | input_texts = [] 37 | for key in TEXT_FEATURES.keys(): 38 | input_texts.append( 39 | tf.keras.Input( 40 | shape=(1,), name=transformed_name(key), dtype=tf.string 41 | ) 42 | ) 43 | 44 | # embed text features 45 | MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4" 46 | embed = hub.KerasLayer(MODULE_URL) 47 | reshaped_narrative = tf.reshape(input_texts[0], [-1]) 48 | embed_narrative = embed(reshaped_narrative) 49 | deep_ff = tf.keras.layers.Reshape((512,), input_shape=(1, 512))( 50 | embed_narrative 51 | ) 52 | 53 | deep = tf.keras.layers.Dense(256, activation="relu")(deep_ff) 54 | deep = tf.keras.layers.Dense(64, activation="relu")(deep) 55 | deep = tf.keras.layers.Dense(16, activation="relu")(deep) 56 | 57 | wide_ff = tf.keras.layers.concatenate(input_features) 58 | wide = tf.keras.layers.Dense(16, activation="relu")(wide_ff) 59 | 60 | both = tf.keras.layers.concatenate([deep, wide]) 61 | 62 | output = tf.keras.layers.Dense(1, activation="sigmoid")(both) 63 | 64 | inputs = input_features + input_texts 65 | 66 | keras_model = tf.keras.models.Model(inputs, output) 67 | keras_model.compile( 68 | optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 69 | loss="binary_crossentropy", 70 | metrics=[ 71 | tf.keras.metrics.BinaryAccuracy(), 72 | tf.keras.metrics.TruePositives(), 73 | ], 74 | ) 75 | if show_summary: 76 | keras_model.summary() 77 | 78 | return keras_model 79 | 80 | 81 | def _gzip_reader_fn(filenames): 82 | """Small utility returning a record reader that can read gzip'ed files.""" 83 | return tf.data.TFRecordDataset(filenames, compression_type="GZIP") 84 | 85 | 86 | def _get_serve_tf_examples_fn(model, tf_transform_output): 87 | """Returns a function that parses a serialized tf.Example.""" 88 | 89 | model.tft_layer = tf_transform_output.transform_features_layer() 90 | 91 | @tf.function 92 | def serve_tf_examples_fn(serialized_tf_examples): 93 | """Returns the output to be used in the serving signature.""" 94 | feature_spec = tf_transform_output.raw_feature_spec() 95 | feature_spec.pop(LABEL_KEY) 96 | parsed_features = tf.io.parse_example( 97 | serialized_tf_examples, feature_spec 98 | ) 99 | 100 | transformed_features = model.tft_layer(parsed_features) 101 | 102 | outputs = model(transformed_features) 103 | return {"outputs": outputs} 104 | 105 | return serve_tf_examples_fn 106 | 107 | 108 | def _input_fn(file_pattern, tf_transform_output, batch_size=64): 109 | """Generates features and label for tuning/training. 110 | 111 | Args: 112 | file_pattern: input tfrecord file pattern. 113 | tf_transform_output: A TFTransformOutput. 114 | batch_size: representing the number of consecutive elements of 115 | returned dataset to combine in a single batch 116 | 117 | Returns: 118 | A dataset that contains (features, indices) tuple where features 119 | is a dictionary of Tensors, and indices is a single Tensor of 120 | label indices. 
121 | """ 122 | transformed_feature_spec = ( 123 | tf_transform_output.transformed_feature_spec().copy() 124 | ) 125 | 126 | dataset = tf.data.experimental.make_batched_features_dataset( 127 | file_pattern=file_pattern, 128 | batch_size=batch_size, 129 | features=transformed_feature_spec, 130 | reader=_gzip_reader_fn, 131 | label_key=transformed_name(LABEL_KEY), 132 | ) 133 | 134 | return dataset 135 | 136 | 137 | # TFX Trainer will call this function. 138 | def run_fn(fn_args): 139 | """Train the model based on given args. 140 | 141 | Args: 142 | fn_args: Holds args used to train the model as name/value pairs. 143 | """ 144 | tf_transform_output = tft.TFTransformOutput(fn_args.transform_output) 145 | 146 | train_dataset = _input_fn(fn_args.train_files, tf_transform_output, 64) 147 | eval_dataset = _input_fn(fn_args.eval_files, tf_transform_output, 64) 148 | 149 | model = get_model() 150 | 151 | log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs") 152 | tensorboard_callback = tf.keras.callbacks.TensorBoard( 153 | log_dir=log_dir, update_freq="batch" 154 | ) 155 | 156 | model.fit( 157 | train_dataset, 158 | steps_per_epoch=fn_args.train_steps, 159 | validation_data=eval_dataset, 160 | validation_steps=fn_args.eval_steps, 161 | callbacks=[tensorboard_callback], 162 | ) 163 | 164 | signatures = { 165 | "serving_default": _get_serve_tf_examples_fn( 166 | model, tf_transform_output 167 | ).get_concrete_function( 168 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples") 169 | ), 170 | } 171 | model.save( 172 | fn_args.serving_model_dir, save_format="tf", signatures=signatures 173 | ) 174 | -------------------------------------------------------------------------------- /components/module.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from typing import Union 4 | 5 | import tensorflow as tf 6 | import tensorflow_hub as hub 7 | import tensorflow_transform as tft 8 | 9 | 10 | LABEL_KEY = "consumer_disputed" 11 | 12 | ################ 13 | # Transform code 14 | ################ 15 | 16 | # feature name, feature dimensionality 17 | ONE_HOT_FEATURES = { 18 | "product": 11, 19 | "sub_product": 45, 20 | "company_response": 5, 21 | "state": 60, 22 | "issue": 90, 23 | } 24 | 25 | # feature name, bucket count 26 | BUCKET_FEATURES = {"zip_code": 10} 27 | 28 | # feature name, value is unused 29 | TEXT_FEATURES = {"consumer_complaint_narrative": None} 30 | 31 | 32 | os.environ["TFHUB_CACHE_DIR"] = "tmp/tfhub" 33 | 34 | 35 | def transformed_name(key: str) -> str: 36 | return key + "_xf" 37 | 38 | 39 | def fill_in_missing(x: Union[tf.Tensor, tf.SparseTensor]) -> tf.Tensor: 40 | """Replace missing values in a SparseTensor. 41 | 42 | Fills in missing values of `x` with '' or 0, and converts to a 43 | dense tensor. 44 | 45 | Args: 46 | x: A `SparseTensor` of rank 2. Its dense shape should have 47 | size at most 1 in the second dimension. 48 | 49 | Returns: 50 | A rank 1 tensor where missing values of `x` have been filled in. 
51 | """ 52 | if isinstance(x, tf.sparse.SparseTensor): 53 | default_value = "" if x.dtype == tf.string else 0 54 | x = tf.sparse.to_dense( 55 | tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]), 56 | default_value, 57 | ) 58 | return tf.squeeze(x, axis=1) 59 | 60 | 61 | def convert_num_to_one_hot(label_tensor: tf.Tensor, num_labels: int = 2) -> tf.Tensor: 62 | """ 63 | Convert a label (0 or 1) into a one-hot vector 64 | Args: 65 | int: label_tensor (0 or 1) 66 | Returns 67 | label tensor 68 | """ 69 | one_hot_tensor = tf.one_hot(label_tensor, num_labels) 70 | return tf.reshape(one_hot_tensor, [-1, num_labels]) 71 | 72 | 73 | def convert_zip_code(zipcode: str) -> tf.float32: 74 | """ 75 | Convert a zipcode string to int64 representation. In the dataset the 76 | zipcodes are anonymized by repacing the last 3 digits to XXX. We are 77 | replacing those characters to 000 to simplify the bucketing later on. 78 | 79 | Args: 80 | str: zipcode 81 | Returns: 82 | zipcode: int64 83 | """ 84 | zipcode = tf.strings.regex_replace(zipcode, r"X{0,5}", "0") 85 | zipcode = tf.strings.to_number(zipcode, out_type=tf.float32) 86 | return zipcode 87 | 88 | 89 | def preprocessing_fn(inputs: tf.Tensor) -> tf.Tensor: 90 | """tf.transform's callback function for preprocessing inputs. 91 | 92 | Args: 93 | inputs: map from feature keys to raw not-yet-transformed features. 94 | 95 | Returns: 96 | Map from string feature key to transformed feature operations. 97 | """ 98 | outputs = {} 99 | 100 | for key in ONE_HOT_FEATURES.keys(): 101 | dim = ONE_HOT_FEATURES[key] 102 | int_value = tft.compute_and_apply_vocabulary( 103 | fill_in_missing(inputs[key]), top_k=dim + 1 104 | ) 105 | outputs[transformed_name(key)] = convert_num_to_one_hot( 106 | int_value, num_labels=dim + 1 107 | ) 108 | 109 | for key, bucket_count in BUCKET_FEATURES.items(): 110 | temp_feature = tft.bucketize( 111 | convert_zip_code(fill_in_missing(inputs[key])), 112 | bucket_count, 113 | ) 114 | outputs[transformed_name(key)] = convert_num_to_one_hot( 115 | temp_feature, num_labels=bucket_count + 1 116 | ) 117 | 118 | for key in TEXT_FEATURES.keys(): 119 | outputs[transformed_name(key)] = fill_in_missing(inputs[key]) 120 | 121 | outputs[transformed_name(LABEL_KEY)] = fill_in_missing(inputs[LABEL_KEY]) 122 | 123 | return outputs 124 | 125 | 126 | ################ 127 | # Model code 128 | ################ 129 | 130 | 131 | def get_model(show_summary: bool = True) -> tf.keras.models.Model: 132 | """ 133 | This function defines a Keras model and returns the model as a Keras object. 
134 | """ 135 | 136 | # one-hot categorical features 137 | input_features = [] 138 | for key, dim in ONE_HOT_FEATURES.items(): 139 | input_features.append( 140 | tf.keras.Input(shape=(dim + 1,), name=transformed_name(key)) 141 | ) 142 | 143 | # adding bucketized features 144 | for key, dim in BUCKET_FEATURES.items(): 145 | input_features.append( 146 | tf.keras.Input(shape=(dim + 1,), name=transformed_name(key)) 147 | ) 148 | 149 | # adding text input features 150 | input_texts = [] 151 | for key in TEXT_FEATURES.keys(): 152 | input_texts.append( 153 | tf.keras.Input(shape=(1,), name=transformed_name(key), dtype=tf.string) 154 | ) 155 | 156 | # embed text features 157 | MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4" 158 | embed = hub.KerasLayer(MODULE_URL) 159 | reshaped_narrative = tf.reshape(input_texts[0], [-1]) 160 | embed_narrative = embed(reshaped_narrative) 161 | deep_ff = tf.keras.layers.Reshape((512,), input_shape=(1, 512))(embed_narrative) 162 | 163 | deep = tf.keras.layers.Dense(256, activation="relu")(deep_ff) 164 | deep = tf.keras.layers.Dense(64, activation="relu")(deep) 165 | deep = tf.keras.layers.Dense(16, activation="relu")(deep) 166 | 167 | wide_ff = tf.keras.layers.concatenate(input_features) 168 | wide = tf.keras.layers.Dense(16, activation="relu")(wide_ff) 169 | 170 | both = tf.keras.layers.concatenate([deep, wide]) 171 | 172 | output = tf.keras.layers.Dense(1, activation="sigmoid")(both) 173 | 174 | inputs = input_features + input_texts 175 | 176 | keras_model = tf.keras.models.Model(inputs, output) 177 | keras_model.compile( 178 | optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 179 | loss="binary_crossentropy", 180 | metrics=[ 181 | tf.keras.metrics.BinaryAccuracy(), 182 | tf.keras.metrics.TruePositives(), 183 | ], 184 | ) 185 | if show_summary: 186 | keras_model.summary() 187 | 188 | return keras_model 189 | 190 | 191 | def _gzip_reader_fn(filenames): 192 | """Small utility returning a record reader that can read gzip'ed files.""" 193 | return tf.data.TFRecordDataset(filenames, compression_type="GZIP") 194 | 195 | 196 | def _get_serve_tf_examples_fn(model, tf_transform_output): 197 | """Returns a function that parses a serialized tf.Example.""" 198 | 199 | model.tft_layer = tf_transform_output.transform_features_layer() 200 | 201 | @tf.function 202 | def serve_tf_examples_fn(serialized_tf_examples): 203 | """Returns the output to be used in the serving signature.""" 204 | feature_spec = tf_transform_output.raw_feature_spec() 205 | feature_spec.pop(LABEL_KEY) 206 | parsed_features = tf.io.parse_example(serialized_tf_examples, feature_spec) 207 | 208 | transformed_features = model.tft_layer(parsed_features) 209 | 210 | outputs = model(transformed_features) 211 | return {"outputs": outputs} 212 | 213 | return serve_tf_examples_fn 214 | 215 | 216 | def _input_fn(file_pattern, tf_transform_output, batch_size=64): 217 | """Generates features and label for tuning/training. 218 | 219 | Args: 220 | file_pattern: input tfrecord file pattern. 221 | tf_transform_output: A TFTransformOutput. 222 | batch_size: representing the number of consecutive elements of returned 223 | dataset to combine in a single batch 224 | 225 | Returns: 226 | A dataset that contains (features, indices) tuple where features is a 227 | dictionary of Tensors, and indices is a single Tensor of 228 | label indices. 
229 | """ 230 | transformed_feature_spec = tf_transform_output.transformed_feature_spec().copy() 231 | 232 | dataset = tf.data.experimental.make_batched_features_dataset( 233 | file_pattern=file_pattern, 234 | batch_size=batch_size, 235 | features=transformed_feature_spec, 236 | reader=_gzip_reader_fn, 237 | label_key=transformed_name(LABEL_KEY), 238 | ) 239 | 240 | return dataset 241 | 242 | 243 | # TFX Trainer will call this function. 244 | def run_fn(fn_args): 245 | """Train the model based on given args. 246 | 247 | Args: 248 | fn_args: Holds args used to train the model as name/value pairs. 249 | """ 250 | tf_transform_output = tft.TFTransformOutput(fn_args.transform_output) 251 | 252 | train_dataset = _input_fn(fn_args.train_files, tf_transform_output, 64) 253 | eval_dataset = _input_fn(fn_args.eval_files, tf_transform_output, 64) 254 | 255 | model = get_model() 256 | 257 | log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), "logs") 258 | tensorboard_callback = tf.keras.callbacks.TensorBoard( 259 | log_dir=log_dir, update_freq="batch" 260 | ) 261 | callbacks = [tensorboard_callback] 262 | 263 | model.fit( 264 | train_dataset, 265 | epochs=1, 266 | steps_per_epoch=fn_args.train_steps, 267 | validation_data=eval_dataset, 268 | validation_steps=fn_args.eval_steps, 269 | callbacks=callbacks, 270 | ) 271 | 272 | signatures = { 273 | "serving_default": _get_serve_tf_examples_fn( 274 | model, tf_transform_output 275 | ).get_concrete_function( 276 | tf.TensorSpec(shape=[None], dtype=tf.string, name="examples") 277 | ), 278 | } 279 | model.save(fn_args.serving_model_dir, save_format="tf", signatures=signatures) 280 | -------------------------------------------------------------------------------- /components/module_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from components.module import fill_in_missing 4 | 5 | 6 | class ExecutorTest(tf.test.TestCase): 7 | def setUp(self): 8 | super(ExecutorTest, self).setUp() 9 | 10 | def test_fill_in_missing_dense_tensor(self): 11 | 12 | dense_tensor = tf.constant([[""], ["wow"], ["test"]]) 13 | expected_tensor = tf.constant(["", "wow", "test"]) 14 | rs = fill_in_missing(dense_tensor) 15 | comparison = tf.reduce_all(tf.equal(rs, expected_tensor)) 16 | self.assertTrue(comparison) 17 | 18 | def test_fill_in_missing_sparse_tensor(self): 19 | 20 | sparse_tensor = tf.SparseTensor( 21 | indices=[[1, 0], [2, 0]], values=["wow", "test"], dense_shape=[3, 1] 22 | ) 23 | expected_tensor = tf.constant(["", "wow", "test"]) 24 | rs = fill_in_missing(sparse_tensor) 25 | comparison = tf.reduce_all(tf.equal(rs, expected_tensor)) 26 | self.assertTrue(comparison) 27 | -------------------------------------------------------------------------------- /components/transform.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import tensorflow as tf 4 | import tensorflow_transform as tft 5 | 6 | 7 | ONE_HOT_FEATURES = { 8 | "product": 11, 9 | "sub_product": 45, 10 | "company_response": 5, 11 | "state": 60, 12 | "issue": 90, 13 | } 14 | 15 | # feature name, bucket count 16 | BUCKET_FEATURES = {"zip_code": 10} 17 | 18 | # feature name, value is unused 19 | TEXT_FEATURES = {"consumer_complaint_narrative": None} 20 | 21 | LABEL_KEY = "consumer_disputed" 22 | 23 | 24 | def transformed_name(key): 25 | return key + "_xf" 26 | 27 | 28 | def fill_in_missing(x: Union[tf.Tensor, tf.SparseTensor]) -> tf.Tensor: 29 | """Replace 
missing values in a SparseTensor. 30 | 31 | Fills in missing values of `x` with '' or 0, and converts to a 32 | dense tensor. 33 | 34 | Args: 35 | x: A `SparseTensor` of rank 2. Its dense shape should have 36 | size at most 1 in the second dimension. 37 | 38 | Returns: 39 | A rank 1 tensor where missing values of `x` have been filled in. 40 | """ 41 | if isinstance(x, tf.sparse.SparseTensor): 42 | default_value = "" if x.dtype == tf.string else 0 43 | x = tf.sparse.to_dense( 44 | tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]), 45 | default_value, 46 | ) 47 | return tf.squeeze(x, axis=1) 48 | 49 | 50 | def convert_num_to_one_hot(label_tensor, num_labels=2): 51 | """ 52 | Convert an integer label (0 or 1) into a one-hot encoded vector. 53 | Args: 54 | label_tensor: integer tensor holding the label (0 or 1) 55 | Returns: 56 | A one-hot encoded label tensor of shape (-1, num_labels) 57 | """ 58 | one_hot_tensor = tf.one_hot(label_tensor, num_labels) 59 | return tf.reshape(one_hot_tensor, [-1, num_labels]) 60 | 61 | 62 | def convert_zip_code(zipcode): 63 | """ 64 | Convert a zip code string to a numeric (float32) representation. In the dataset the 65 | zip codes are anonymized by replacing the last 3 digits with XXX. We are 66 | replacing those characters with 000 to simplify the bucketing later on. 67 | 68 | Args: 69 | zipcode: zip code string 70 | Returns: 71 | zipcode: zip code as a float32 tensor 72 | """ 73 | zipcode = tf.strings.regex_replace(zipcode, r"X{0,5}", "0") 74 | zipcode = tf.strings.to_number(zipcode, out_type=tf.float32) 75 | return zipcode 76 | 77 | 78 | def preprocessing_fn(inputs): 79 | """tf.transform's callback function for preprocessing inputs. 80 | 81 | Args: 82 | inputs: map from feature keys to raw not-yet-transformed features. 83 | 84 | Returns: 85 | Map from string feature key to transformed feature operations. 86 | """ 87 | outputs = {} 88 | 89 | for key in ONE_HOT_FEATURES.keys(): 90 | dim = ONE_HOT_FEATURES[key] 91 | int_value = tft.compute_and_apply_vocabulary( 92 | fill_in_missing(inputs[key]), top_k=dim + 1 93 | ) 94 | outputs[transformed_name(key)] = convert_num_to_one_hot( 95 | int_value, num_labels=dim + 1 96 | ) 97 | 98 | for key, bucket_count in BUCKET_FEATURES.items(): 99 | temp_feature = tft.bucketize( 100 | convert_zip_code(fill_in_missing(inputs[key])), 101 | bucket_count, 102 | ) 103 | outputs[transformed_name(key)] = convert_num_to_one_hot( 104 | temp_feature, num_labels=bucket_count + 1 105 | ) 106 | 107 | for key in TEXT_FEATURES.keys(): 108 | outputs[transformed_name(key)] = fill_in_missing(inputs[key]) 109 | 110 | outputs[transformed_name(LABEL_KEY)] = fill_in_missing(inputs[LABEL_KEY]) 111 | 112 | return outputs 113 | -------------------------------------------------------------------------------- /interactive-pipeline/README.md: -------------------------------------------------------------------------------- 1 | ## Example pipeline 2 | -------------------------------------------------------------------------------- /pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Building-ML-Pipelines/building-machine-learning-pipelines/8862436d291a330d772dc59e104c4ba0a6d64b5a/pipelines/__init__.py -------------------------------------------------------------------------------- /pipelines/apache_airflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/tfx-oss-public/tfx:1.4.0 2 | 3 | LABEL maintainer="hannes.hapke@gmail.com" 4 | 5 | RUN apt-get update 6 | RUN apt-get install -y \ 7 | curl \ 8 | gnupg \ 9 | vim \ 10 | git \ 11 | build-essential \ 
12 | tmux \ 13 | htop 14 | 15 | ENV LANG C.UTF-8 16 | ENV DEBIAN_FRONTEND=noninteractive 17 | ENV SLUGIFY_USES_UNIDECODE=yes 18 | 19 | WORKDIR /root 20 | RUN mkdir -p tfx 21 | RUN mkdir -p airflow/dags 22 | 23 | COPY setup_airflow.sh setup_airflow.sh 24 | COPY setup_env.sh setup_env.sh 25 | COPY launch_airflow.sh launch_airflow.sh 26 | 27 | RUN sh setup_env.sh 28 | RUN sh setup_airflow.sh 29 | RUN chmod +x launch_airflow.sh 30 | 31 | EXPOSE 8081 32 | EXPOSE 7070 33 | 34 | ENTRYPOINT [ "sh", "-c", "./launch_airflow.sh" ] 35 | -------------------------------------------------------------------------------- /pipelines/apache_airflow/launch_airflow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | tmux new-session -d -s 'airflow_launch' -n 'htop' htop 3 | tmux new-window 'jupyter notebook --ip=0.0.0.0 --port=8081 --allow-root' 4 | tmux split-window -v 'airflow webserver -p 7070' 5 | tmux split-window -h 'airflow scheduler' 6 | tmux -2 attach-session -d 7 | -------------------------------------------------------------------------------- /pipelines/apache_airflow/pipeline_airflow.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | 4 | from typing import Text 5 | 6 | from tfx.orchestration import metadata, pipeline 7 | from tfx.orchestration.airflow.airflow_dag_runner import AirflowDagRunner 8 | from tfx.orchestration.airflow.airflow_dag_runner import AirflowPipelineConfig 9 | from base_pipeline import init_components 10 | 11 | 12 | pipeline_name = "consumer_complaint_pipeline_airflow" 13 | airflow_dir = os.path.join(os.environ["HOME"], "airflow") 14 | data_dir = os.path.join(airflow_dir, "data/consumer_complaints") 15 | module_file = os.path.join(airflow_dir, "dags/module.py") 16 | 17 | pipeline_root = os.path.join(airflow_dir, "tfx", pipeline_name) 18 | metadata_path = os.path.join(pipeline_root, "metadata.sqlite") 19 | serving_model_dir = os.path.join(pipeline_root, "serving_model", pipeline_name) 20 | 21 | airflow_config = { 22 | "schedule_interval": None, 23 | "start_date": datetime.datetime(2020, 4, 17), 24 | } 25 | 26 | 27 | def init_pipeline( 28 | components, pipeline_root: Text, direct_num_workers: int 29 | ) -> pipeline.Pipeline: 30 | 31 | beam_arg = ( 32 | f"--direct_num_workers={direct_num_workers}", 33 | "--direct_running_mode=multi_processing", 34 | ) 35 | 36 | p = pipeline.Pipeline( 37 | pipeline_name=pipeline_name, 38 | pipeline_root=pipeline_root, 39 | components=components, 40 | enable_cache=True, 41 | metadata_connection_config=metadata.sqlite_metadata_connection_config( 42 | metadata_path 43 | ), 44 | beam_pipeline_args=beam_arg, 45 | ) 46 | return p 47 | 48 | 49 | components = init_components( 50 | data_dir, 51 | module_file, 52 | serving_model_dir, 53 | training_steps=50000, 54 | eval_steps=10000, 55 | ) 56 | pipeline = init_pipeline(components, pipeline_root, 0) 57 | DAG = AirflowDagRunner(AirflowPipelineConfig(airflow_config)).run(pipeline) 58 | -------------------------------------------------------------------------------- /pipelines/apache_airflow/setup_airflow.sh: -------------------------------------------------------------------------------- 1 | # Adjust configuration 2 | printf "${GREEN}Adjusting Airflow config${NORMAL}\n" 3 | sed -i'.orig' 's/dag_dir_list_interval = 300/dag_dir_list_interval = 1/g' ~/airflow/airflow.cfg 4 | sed -i'.orig' 's/job_heartbeat_sec = 5/job_heartbeat_sec = 1/g' ~/airflow/airflow.cfg 5 | sed -i'.orig' 
's/scheduler_heartbeat_sec = 5/scheduler_heartbeat_sec = 1/g' ~/airflow/airflow.cfg 6 | sed -i'.orig' 's/dag_default_view = tree/dag_default_view = graph/g' ~/airflow/airflow.cfg 7 | sed -i'.orig' 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 8 | sed -i'.orig' 's/max_threads = 2/max_threads = 1/g' ~/airflow/airflow.cfg 9 | 10 | 11 | printf "${GREEN}Refreshing Airflow to pick up new config${NORMAL}\n" 12 | airflow db reset --yes 13 | airflow db init 14 | 15 | # Copy Dag to ~/airflow/dags 16 | mkdir -p ~/airflow/dags 17 | 18 | jupyter nbextension enable --py widgetsnbextension 19 | jupyter nbextension install --py --symlink tensorflow_model_analysis 20 | jupyter nbextension enable --py tensorflow_model_analysis 21 | -------------------------------------------------------------------------------- /pipelines/apache_airflow/setup_env.sh: -------------------------------------------------------------------------------- 1 | # Set up the environment for the TFX tutorial 2 | # Adopted from the TFX setup 3 | 4 | GREEN=$(tput setaf 2) 5 | NORMAL=$(tput sgr0) 6 | 7 | printf "${GREEN}Installing Google API Client${NORMAL}\n" 8 | pip install google-api-python-client 9 | 10 | # # Docker images 11 | printf "${GREEN}Installing docker${NORMAL}\n" 12 | pip install docker 13 | 14 | # Airflow 15 | # Set this to avoid the GPL version; no functionality difference either way 16 | printf "${GREEN}Preparing environment for Airflow${NORMAL}\n" 17 | export SLUGIFY_USES_TEXT_UNIDECODE=yes 18 | printf "${GREEN}Installing Airflow${NORMAL}\n" 19 | pip install apache-airflow 20 | printf "${GREEN}Initializing Airflow database${NORMAL}\n" 21 | airflow db init 22 | airflow users create --username "tfx" \ 23 | --firstname "TensorFlow" \ 24 | --lastname "Extended" \ 25 | --role "Admin" \ 26 | --email "admin@example.org" \ 27 | --password "tfx" 28 | -------------------------------------------------------------------------------- /pipelines/apache_beam/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Building-ML-Pipelines/building-machine-learning-pipelines/8862436d291a330d772dc59e104c4ba0a6d64b5a/pipelines/apache_beam/__init__.py -------------------------------------------------------------------------------- /pipelines/apache_beam/pipeline_beam.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import Text 4 | 5 | from absl import logging 6 | from tfx.orchestration import metadata, pipeline 7 | from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner 8 | 9 | 10 | pipeline_name = "consumer_complaint_pipeline_beam" 11 | 12 | # pipeline inputs 13 | pipeline_dir = os.getcwd() 14 | data_dir = os.path.join(pipeline_dir, "data") 15 | module_file = os.path.join(pipeline_dir, "components", "module.py") 16 | requirement_file = os.path.join(pipeline_dir, "requirements.txt") 17 | 18 | # pipeline outputs 19 | output_base = os.path.join(pipeline_dir, "output", pipeline_name) 20 | serving_model_dir = os.path.join(output_base, pipeline_name) 21 | pipeline_root = os.path.join(output_base, "pipeline_root") 22 | metadata_path = os.path.join(pipeline_root, "metadata.sqlite") 23 | 24 | 25 | def init_beam_pipeline( 26 | components, pipeline_root: Text, direct_num_workers: int 27 | ) -> pipeline.Pipeline: 28 | 29 | logging.info(f"Pipeline root set to: {pipeline_root}") 30 | beam_arg = ( 31 | f"--direct_num_workers={direct_num_workers}", 32 | 
f"--requirements_file={requirement_file}", # optional 33 | "--direct_running_mode=multi_processing", 34 | ) 35 | 36 | p = pipeline.Pipeline( 37 | pipeline_name=pipeline_name, 38 | pipeline_root=pipeline_root, 39 | components=components, 40 | enable_cache=False, 41 | metadata_connection_config=metadata.sqlite_metadata_connection_config( 42 | metadata_path 43 | ), 44 | beam_pipeline_args=beam_arg, 45 | ) 46 | return p 47 | 48 | 49 | if __name__ == "__main__": 50 | 51 | logging.set_verbosity(logging.INFO) 52 | 53 | module_path = os.getcwd() 54 | if module_path not in sys.path: 55 | print(module_path) 56 | sys.path.append(module_path) 57 | 58 | from pipelines.base_pipeline import init_components 59 | 60 | components = init_components( 61 | data_dir, 62 | module_file, 63 | training_steps=5000, 64 | eval_steps=100, 65 | serving_model_dir=serving_model_dir, 66 | ) 67 | direct_num_workers = int(os.cpu_count() / 2) 68 | direct_num_workers = 1 if direct_num_workers < 1 else direct_num_workers 69 | pipeline = init_beam_pipeline(components, pipeline_root, direct_num_workers) 70 | BeamDagRunner().run(pipeline) 71 | -------------------------------------------------------------------------------- /pipelines/base_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import tensorflow_model_analysis as tfma 4 | from tfx import v1 as tfx 5 | from tfx.proto import example_gen_pb2, pusher_pb2, trainer_pb2 6 | 7 | 8 | TRAIN_STEPS = 50000 9 | EVAL_STEPS = 10000 10 | 11 | 12 | def init_components( 13 | data_dir, 14 | module_file, 15 | training_steps=TRAIN_STEPS, 16 | eval_steps=EVAL_STEPS, 17 | serving_model_dir=None, 18 | vertex_training_custom_config=None, 19 | vertex_serving_args=None, 20 | ): 21 | 22 | if serving_model_dir and vertex_serving_args: 23 | raise NotImplementedError( 24 | "Can't set vertex_serving_args and serving_model_dir at " 25 | "the same time. Choose one deployment option." 
26 | ) 27 | 28 | output = example_gen_pb2.Output( 29 | split_config=example_gen_pb2.SplitConfig( 30 | splits=[ 31 | example_gen_pb2.SplitConfig.Split(name="train", hash_buckets=9), 32 | example_gen_pb2.SplitConfig.Split(name="eval", hash_buckets=1), 33 | ] 34 | ) 35 | ) 36 | 37 | example_gen = tfx.components.CsvExampleGen( 38 | input_base=os.path.join(os.getcwd(), data_dir), output_config=output 39 | ) 40 | 41 | statistics_gen = tfx.components.StatisticsGen( 42 | examples=example_gen.outputs["examples"] 43 | ) 44 | 45 | schema_gen = tfx.components.SchemaGen( 46 | statistics=statistics_gen.outputs["statistics"], 47 | infer_feature_shape=False, 48 | ) 49 | 50 | example_validator = tfx.components.ExampleValidator( 51 | statistics=statistics_gen.outputs["statistics"], 52 | schema=schema_gen.outputs["schema"], 53 | ) 54 | 55 | transform = tfx.components.Transform( 56 | examples=example_gen.outputs["examples"], 57 | schema=schema_gen.outputs["schema"], 58 | module_file=module_file, 59 | ) 60 | 61 | training_kwargs = { 62 | "module_file": module_file, 63 | "examples": transform.outputs["transformed_examples"], 64 | "schema": schema_gen.outputs["schema"], 65 | "transform_graph": transform.outputs["transform_graph"], 66 | "train_args": trainer_pb2.TrainArgs(num_steps=training_steps), 67 | "eval_args": trainer_pb2.EvalArgs(num_steps=eval_steps), 68 | } 69 | 70 | if vertex_training_custom_config: 71 | training_kwargs.update({"custom_config": vertex_training_custom_config}) 72 | trainer = tfx.extensions.google_cloud_ai_platform.Trainer(**training_kwargs) 73 | else: 74 | trainer = tfx.components.Trainer(**training_kwargs) 75 | 76 | model_resolver = tfx.dsl.Resolver( 77 | strategy_class=tfx.dsl.experimental.LatestBlessedModelStrategy, 78 | model=tfx.dsl.Channel(type=tfx.types.standard_artifacts.Model), 79 | model_blessing=tfx.dsl.Channel(type=tfx.types.standard_artifacts.ModelBlessing), 80 | ) 81 | 82 | eval_config = tfma.EvalConfig( 83 | model_specs=[ 84 | tfma.ModelSpec( 85 | signature_name="serving_default", 86 | label_key="consumer_disputed", 87 | # preprocessing_function_names=["transform_features"], 88 | ) 89 | ], 90 | slicing_specs=[tfma.SlicingSpec(), tfma.SlicingSpec(feature_keys=["product"])], 91 | metrics_specs=[ 92 | tfma.MetricsSpec( 93 | metrics=[ 94 | tfma.MetricConfig( 95 | class_name="BinaryAccuracy", 96 | threshold=tfma.MetricThreshold( 97 | value_threshold=tfma.GenericValueThreshold( 98 | lower_bound={"value": 0.65} 99 | ), 100 | change_threshold=tfma.GenericChangeThreshold( 101 | direction=tfma.MetricDirection.HIGHER_IS_BETTER, 102 | absolute={"value": -1e-10}, 103 | ), 104 | ), 105 | ), 106 | tfma.MetricConfig(class_name="Precision"), 107 | tfma.MetricConfig(class_name="Recall"), 108 | tfma.MetricConfig(class_name="ExampleCount"), 109 | tfma.MetricConfig(class_name="AUC"), 110 | ], 111 | ) 112 | ], 113 | ) 114 | 115 | evaluator = tfx.components.Evaluator( 116 | examples=example_gen.outputs["examples"], 117 | model=trainer.outputs["model"], 118 | baseline_model=model_resolver.outputs["model"], 119 | eval_config=eval_config, 120 | ) 121 | 122 | if vertex_serving_args: 123 | pusher = tfx.extensions.google_cloud_ai_platform.Pusher( 124 | model=trainer.outputs["model"], 125 | model_blessing=evaluator.outputs["blessing"], 126 | custom_config=vertex_serving_args, 127 | ) 128 | 129 | elif serving_model_dir: 130 | pusher = tfx.components.Pusher( 131 | model=trainer.outputs["model"], 132 | model_blessing=evaluator.outputs["blessing"], 133 | push_destination=pusher_pb2.PushDestination( 134 | 
filesystem=pusher_pb2.PushDestination.Filesystem( 135 | base_directory=serving_model_dir 136 | ) 137 | ), 138 | ) 139 | else: 140 | raise NotImplementedError( 141 | "Provide ai_platform_serving_args or serving_model_dir." 142 | ) 143 | 144 | components = [ 145 | example_gen, 146 | statistics_gen, 147 | schema_gen, 148 | example_validator, 149 | transform, 150 | trainer, 151 | model_resolver, 152 | evaluator, 153 | pusher, 154 | ] 155 | return components 156 | -------------------------------------------------------------------------------- /pipelines/kubeflow_pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Building-ML-Pipelines/building-machine-learning-pipelines/8862436d291a330d772dc59e104c4ba0a6d64b5a/pipelines/kubeflow_pipelines/__init__.py -------------------------------------------------------------------------------- /pipelines/kubeflow_pipelines/argo_pipeline_files/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Building-ML-Pipelines/building-machine-learning-pipelines/8862436d291a330d772dc59e104c4ba0a6d64b5a/pipelines/kubeflow_pipelines/argo_pipeline_files/.gitkeep -------------------------------------------------------------------------------- /pipelines/kubeflow_pipelines/kubeflow-config/storage-access-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: data-access 5 | namespace: kubeflow 6 | labels: 7 | app: data-access 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | app: data-access 13 | template: 14 | metadata: 15 | labels: 16 | app: data-access 17 | spec: 18 | containers: 19 | - name: alpine 20 | image: alpine:latest 21 | command: ["/bin/sh", "-ec", "while :; do echo '.'; sleep 5 ; done"] 22 | volumeMounts: 23 | - name: tfx-persistent-storage 24 | mountPath: /tfx-data 25 | volumes: 26 | - name: tfx-persistent-storage 27 | persistentVolumeClaim: 28 | claimName: tfx-pvc 29 | -------------------------------------------------------------------------------- /pipelines/kubeflow_pipelines/kubeflow-config/storage-claim.yaml: -------------------------------------------------------------------------------- 1 | kind: PersistentVolumeClaim 2 | apiVersion: v1 3 | metadata: 4 | name: tfx-pvc 5 | namespace: kubeflow 6 | spec: 7 | accessModes: 8 | - ReadWriteOnce 9 | resources: 10 | requests: 11 | storage: 20Gi 12 | -------------------------------------------------------------------------------- /pipelines/kubeflow_pipelines/kubeflow-config/storage.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: tfx-pv 5 | namespace: kubeflow 6 | annotations: 7 | kubernetes.io/createdby: gce-pd-dynamic-provisioner 8 | pv.kubernetes.io/bound-by-controller: "yes" 9 | pv.kubernetes.io/provisioned-by: kubernetes.io/gce-pd 10 | spec: 11 | accessModes: 12 | - ReadWriteOnce 13 | capacity: 14 | storage: 20Gi 15 | claimRef: 16 | apiVersion: v1 17 | kind: PersistentVolumeClaim 18 | name: tfx-pvc 19 | namespace: kubeflow 20 | gcePersistentDisk: 21 | fsType: ext4 22 | pdName: tfx-pv-disk 23 | nodeAffinity: 24 | required: 25 | nodeSelectorTerms: 26 | - matchExpressions: 27 | - key: failure-domain.beta.kubernetes.io/zone 28 | operator: In 29 | values: 30 | - us-central1-c 31 | - key: failure-domain.beta.kubernetes.io/region 32 | 
operator: In 33 | values: 34 | - us-central1 35 | persistentVolumeReclaimPolicy: Delete 36 | storageClassName: standard 37 | volumeMode: Filesystem 38 | status: 39 | phase: Bound 40 | -------------------------------------------------------------------------------- /pipelines/kubeflow_pipelines/pipeline_kubeflow.py: -------------------------------------------------------------------------------- 1 | """Kubeflow example using TFX DSL for local deployments (not GCP Cloud AI).""" 2 | 3 | import os 4 | import sys 5 | from typing import Text 6 | 7 | from absl import logging 8 | from kfp import onprem 9 | from tfx.orchestration import pipeline 10 | from tfx.orchestration.kubeflow import kubeflow_dag_runner 11 | 12 | 13 | pipeline_name = "consumer_complaint_pipeline_kubeflow" 14 | 15 | persistent_volume_claim = "tfx-pvc" 16 | persistent_volume = "tfx-pv" 17 | # this folder needs to match the folder in the persistent volume which contains the module file 18 | persistent_volume_mount = "/tmp" 19 | 20 | # temp yaml file for Kubeflow Pipelines 21 | output_filename = f"{pipeline_name}.yaml" 22 | output_dir = os.path.join( 23 | os.getcwd(), "pipelines", "kubeflow_pipelines", "argo_pipeline_files" 24 | ) 25 | 26 | # pipeline inputs 27 | data_dir = os.path.join(persistent_volume_mount, "data") 28 | module_file = os.path.join("components", "module.py") 29 | 30 | # pipeline outputs 31 | output_base = os.path.join(persistent_volume_mount, "output") 32 | serving_model_dir = os.path.join(output_base, pipeline_name) 33 | 34 | 35 | def init_kubeflow_pipeline( 36 | components, pipeline_root: Text, direct_num_workers: int 37 | ) -> pipeline.Pipeline: 38 | 39 | logging.info(f"Pipeline root set to: {pipeline_root}") 40 | beam_arg = ( 41 | f"--direct_num_workers={direct_num_workers}", 42 | "--direct_running_mode=multi_processing", 43 | ) 44 | p = pipeline.Pipeline( 45 | pipeline_name=pipeline_name, 46 | pipeline_root=pipeline_root, 47 | components=components, 48 | beam_pipeline_args=beam_arg, 49 | ) 50 | return p 51 | 52 | 53 | if __name__ == "__main__": 54 | 55 | logging.set_verbosity(logging.INFO) 56 | 57 | module_path = os.getcwd() 58 | if module_path not in sys.path: 59 | sys.path.append(module_path) 60 | 61 | metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config() 62 | tfx_image = os.environ.get( 63 | "KUBEFLOW_TFX_IMAGE", 64 | "gcr.io/oreilly-book/ml-pipelines-tfx-custom:latest", 65 | ) 66 | 67 | from pipelines.base_pipeline import init_components 68 | 69 | components = init_components( 70 | data_dir, 71 | module_file, 72 | serving_model_dir=serving_model_dir, 73 | training_steps=100, 74 | eval_steps=100, 75 | ) 76 | 77 | runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( 78 | kubeflow_metadata_config=metadata_config, 79 | # Specify custom docker image to use. 80 | tfx_image=tfx_image, 81 | pipeline_operator_funcs=( 82 | # If running on K8s Engine (GKE) on Google Cloud Platform (GCP), 83 | # kubeflow_dag_runner.get_default_pipeline_operator_funcs() 84 | # provides default configurations specifically for GKE on GCP, 85 | # such as secrets. 
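# (Illustrative addition, not used in this local setup: on GKE you could also
# append an operator such as kfp.gcp.use_gcp_secret("user-gcp-sa") to the list
# below to mount GCP service-account credentials into the pipeline pods.)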
86 | kubeflow_dag_runner.get_default_pipeline_operator_funcs() 87 | + [ 88 | onprem.mount_pvc( 89 | persistent_volume_claim, 90 | persistent_volume, 91 | persistent_volume_mount, 92 | ) 93 | ] 94 | ), 95 | ) 96 | 97 | p = init_kubeflow_pipeline(components, output_base, direct_num_workers=0) 98 | output_filename = f"{pipeline_name}.yaml" 99 | kubeflow_dag_runner.KubeflowDagRunner( 100 | config=runner_config, 101 | output_dir=output_dir, 102 | output_filename=output_filename, 103 | ).run(p) 104 | -------------------------------------------------------------------------------- /pipelines/kubeflow_pipelines/pipeline_kubeflow_gcp_buckets.py: -------------------------------------------------------------------------------- 1 | """Kubeflow example using TFX DSL for local deployments (not GCP Cloud AI).""" 2 | 3 | import os 4 | import sys 5 | from typing import Text 6 | 7 | from absl import logging 8 | from tfx.orchestration import pipeline 9 | from tfx.orchestration.kubeflow import kubeflow_dag_runner 10 | 11 | 12 | pipeline_name = "consumer_complaint_pipeline_kubeflow" 13 | 14 | # temp yaml file for Kubeflow Pipelines 15 | output_filename = f"{pipeline_name}.yaml" 16 | output_dir = os.path.join( 17 | os.getcwd(), "pipelines", "kubeflow_pipelines", "argo_pipeline_files" 18 | ) 19 | 20 | # Directory and data locations (uses Google Cloud Storage). 21 | input_bucket = "gs://consumer_complaint_gcp_cloud_ai" 22 | output_bucket = "gs://consumer_complaint_gcp_cloud_ai" 23 | data_dir = os.path.join(input_bucket, "data") 24 | module_file = os.path.join(input_bucket, "components", "module.py") 25 | output_base = os.path.join(output_bucket, "output") 26 | 27 | tfx_root = os.path.join(output_bucket, "tfx_pipeline") 28 | pipeline_root = os.path.join(tfx_root, pipeline_name) 29 | serving_model_dir = os.path.join(output_bucket, "serving_model_dir") 30 | 31 | 32 | def init_kubeflow_pipeline( 33 | components, pipeline_root: Text, direct_num_workers: int 34 | ) -> pipeline.Pipeline: 35 | 36 | logging.info(f"Pipeline root set to: {pipeline_root}") 37 | beam_arg = ( 38 | f"--direct_num_workers={direct_num_workers}", 39 | "--direct_running_mode=multi_processing", 40 | ) 41 | p = pipeline.Pipeline( 42 | pipeline_name=pipeline_name, 43 | pipeline_root=pipeline_root, 44 | components=components, 45 | beam_pipeline_args=beam_arg, 46 | ) 47 | return p 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | logging.set_verbosity(logging.INFO) 53 | 54 | module_path = os.getcwd() 55 | if module_path not in sys.path: 56 | sys.path.append(module_path) 57 | 58 | metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config() 59 | tfx_image = os.environ.get( 60 | "KUBEFLOW_TFX_IMAGE", 61 | "gcr.io/oreilly-book/ml-pipelines-tfx-custom:latest", 62 | ) 63 | 64 | from pipelines.base_pipeline import init_components 65 | 66 | components = init_components( 67 | data_dir, module_file, 5000, 100, serving_model_dir=serving_model_dir 68 | ) 69 | 70 | runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( 71 | kubeflow_metadata_config=metadata_config, 72 | # Specify custom docker image to use. 73 | tfx_image=tfx_image, 74 | pipeline_operator_funcs=( 75 | # If running on K8s Engine (GKE) on Google Cloud Platform (GCP), 76 | # kubeflow_dag_runner.get_default_pipeline_operator_funcs() 77 | # provides default configurations specifically for GKE on GCP, 78 | # such as secrets. 
79 | kubeflow_dag_runner.get_default_pipeline_operator_funcs() 80 | ), 81 | ) 82 | 83 | p = init_kubeflow_pipeline(components, output_base, direct_num_workers=0) 84 | output_filename = f"{pipeline_name}.yaml" 85 | kubeflow_dag_runner.KubeflowDagRunner( 86 | config=runner_config, 87 | output_dir=output_dir, 88 | output_filename=output_filename, 89 | ).run(p) 90 | -------------------------------------------------------------------------------- /pipelines/kubeflow_pipelines/tfx-docker-image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/tfx-oss-public/tfx:1.4.0 2 | 3 | LABEL maintainer="hannes.hapke@gmail.com" 4 | 5 | RUN pip install tensorflow-hub 6 | 7 | WORKDIR /pipeline 8 | -------------------------------------------------------------------------------- /pipelines/vertex/pipeline_vertex.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from absl import logging 5 | from tfx import v1 as tfx 6 | from tfx.orchestration import pipeline 7 | from tfx.orchestration.kubeflow import kubeflow_dag_runner 8 | 9 | 10 | pipeline_name = "consumer_complaint_pipeline_cloud_ai_to_cloud_bucket" 11 | 12 | # temp yaml file for Kubeflow Pipelines 13 | output_filename = f"{pipeline_name}.yaml" 14 | output_dir = os.path.join( 15 | os.getcwd(), "pipelines", "gcp_cloud_ai", "argo_pipeline_files" 16 | ) 17 | 18 | # Directory and data locations (uses Google Cloud Storage). 19 | input_bucket = "gs://consumer_complaint_gcp_cloud_ai" 20 | output_bucket = "gs://consumer_complaint_gcp_cloud_ai" 21 | data_dir = os.path.join(input_bucket, "data") 22 | 23 | tfx_root = os.path.join(output_bucket, "tfx_pipeline") 24 | pipeline_root = os.path.join(tfx_root, pipeline_name) 25 | ai_platform_distributed_training = False 26 | serving_model_dir = os.path.join(output_bucket, "serving_model_dir") 27 | 28 | # Google Cloud Platform project id to use when deploying this pipeline. 
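# For example (placeholder value only, substitute your own GCP project id):
# project_id = "my-gcp-project"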
29 | project_id = "~~oreilly-book~~" # <--- needs update by the user 30 | 31 | module_file = os.path.join(input_bucket, "components", "module.py") 32 | 33 | gcp_region = "us-central1" 34 | 35 | use_gpu = True 36 | 37 | vertex_training_args = { 38 | "project": project_id, 39 | "worker_pool_specs": [ 40 | { 41 | "machine_spec": { 42 | "machine_type": "n1-highmem-8", 43 | }, 44 | "replica_count": 1, 45 | "container_spec": { 46 | "image_uri": "gcr.io/tfx-oss-public/tfx:{}".format(tfx.__version__), 47 | }, 48 | } 49 | ], 50 | } 51 | 52 | if use_gpu: 53 | vertex_training_args["worker_pool_specs"][0]["machine_spec"].update( 54 | {"accelerator_type": "NVIDIA_TESLA_K80", "accelerator_count": 1} 55 | ) 56 | 57 | vertex_training_custom_config = { 58 | tfx.extensions.google_cloud_ai_platform.ENABLE_UCAIP_KEY: True, 59 | tfx.extensions.google_cloud_ai_platform.UCAIP_REGION_KEY: gcp_region, 60 | tfx.extensions.google_cloud_ai_platform.TRAINING_ARGS_KEY: vertex_training_args, 61 | "use_gpu": use_gpu, 62 | } 63 | 64 | 65 | vertex_serving_spec = { 66 | "project_id": project_id, 67 | "endpoint_name": "consumer_complaint", 68 | "deployed_model_display_name": "consumer_complaint", 69 | "machine_type": "n1-standard-2", 70 | "min_replica_count": 1, 71 | "max_replica_count": 2, 72 | "metadata": (("model_name", "consumer_complaint"),), 73 | } 74 | 75 | vertex_container_image_uri = "us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-6:latest" 76 | 77 | vertex_serving_args = { 78 | tfx.extensions.google_cloud_ai_platform.ENABLE_VERTEX_KEY: True, 79 | tfx.extensions.google_cloud_ai_platform.VERTEX_REGION_KEY: gcp_region, 80 | tfx.extensions.google_cloud_ai_platform.VERTEX_CONTAINER_IMAGE_URI_KEY: vertex_container_image_uri, 81 | tfx.extensions.google_cloud_ai_platform.SERVING_ARGS_KEY: vertex_serving_spec, 82 | } 83 | 84 | beam_pipeline_args = [ 85 | "--runner=DataflowRunner", 86 | "--experiments=shuffle_mode=auto", 87 | "--project=" + project_id, 88 | "--temp_location=" + os.path.join(output_bucket, "tmp"), 89 | "--region=" + gcp_region, 90 | "--disk_size_gb=50", 91 | "--machine_type=e2-standard-8", 92 | "--experiments=use_runner_v2", 93 | ] 94 | 95 | 96 | if __name__ == "__main__": 97 | 98 | logging.set_verbosity(logging.INFO) 99 | 100 | module_path = os.getcwd() 101 | if module_path not in sys.path: 102 | sys.path.append(module_path) 103 | 104 | from pipelines.base_pipeline import init_components 105 | 106 | components = init_components( 107 | data_dir, 108 | module_file, 109 | vertex_training_custom_config=vertex_training_custom_config, 110 | serving_model_dir=serving_model_dir, 111 | # ai_platform_serving_args=ai_platform_serving_args 112 | ) 113 | 114 | p = pipeline.Pipeline( 115 | pipeline_name=pipeline_name, 116 | pipeline_root=pipeline_root, 117 | components=components, 118 | beam_pipeline_args=beam_pipeline_args, 119 | ) 120 | 121 | # Metadata config. The defaults works work with the installation of 122 | # KF Pipelines using Kubeflow. If installing KF Pipelines using the 123 | # lightweight deployment option, you may need to override the defaults. 124 | metadata_config = kubeflow_dag_runner.get_default_kubeflow_metadata_config() 125 | 126 | # This pipeline automatically injects the Kubeflow TFX image if the 127 | # environment variable 'KUBEFLOW_TFX_IMAGE' is defined. Currently, the tfx 128 | # cli tool exports the environment variable to pass to the pipelines. 
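# For example (illustrative, the image path is a placeholder), the variable could
# be set in the environment before running this script:
#   KUBEFLOW_TFX_IMAGE=gcr.io/<your-gcp-project>/ml-pipelines-tfx-custom:latest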
129 | tfx_image = os.environ.get( 130 | "KUBEFLOW_TFX_IMAGE", 131 | "gcr.io/oreilly-book/ml-pipelines-tfx-custom:latest", 132 | ) 133 | 134 | runner_config = kubeflow_dag_runner.KubeflowDagRunnerConfig( 135 | kubeflow_metadata_config=metadata_config, 136 | # Specify custom docker image to use. 137 | tfx_image=tfx_image, 138 | ) 139 | 140 | kubeflow_dag_runner.KubeflowDagRunner( 141 | config=runner_config, 142 | output_dir=output_dir, 143 | output_filename=output_filename, 144 | ).run(p) 145 | -------------------------------------------------------------------------------- /pre-experiment-pipeline/experiment_6Mar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import os\n", 11 | "import sys\n", 12 | "import tensorflow as tf\n", 13 | "import numpy as np\n", 14 | "import tensorflow_hub as hub" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import wandb\n", 24 | "from wandb.keras import WandbCallback" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "nb_dir = os.path.split(os.getcwd())[0]\n", 34 | "if nb_dir not in sys.path:\n", 35 | " sys.path.append(nb_dir)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "feature_names = [\"product\", \"sub_product\", \"issue\", \"sub_issue\", \"state\", \"zip_code\", \"company\", \"company_response\", \"timely_response\", \"consumer_disputed\", \"consumer_complaint_narrative\"]\n", 45 | "one_hot_features = ['product', 'sub_product', 'company_response', 'state', 'issue']\n", 46 | "numeric_features = ['zip_code']\n", 47 | "text_features = ['consumer_complaint_narrative']" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "df = pd.read_csv('../data/consumer_complaints_with_narrative.csv', usecols=feature_names)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "
\n", 68 | "\n", 81 | "\n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | "
productsub_productissuesub_issueconsumer_complaint_narrativecompanystatezip_codecompany_responsetimely_responseconsumer_disputed
0Debt collectionI do not knowDisclosure verification of debtRight to dispute notice not receivedI was denied employment because of a judgment ...Encore Capital GroupNY113XXClosed with explanationYes0
1Credit reportingNaNImproper use of my credit reportReport improperly shared by CRCI have a credit card through XXXX XXXX and XXX...ExperianIL606XXClosed with non-monetary reliefYes0
2Debt collectionI do not knowCont'd attempts collect debt not owedDebt is not mineAlmost daily phone calls from Stellar Recovery...Stellar Recovery Inc.MI480XXClosed with explanationYes1
3MortgageConventional fixed mortgageLoan servicing, payments, escrow accountNaNI submitted my monthly mortgage payment to Pri...Primary Residential MortgageCT066XXClosed with monetary reliefYes0
4Student loanNon-federal student loanDealing with my lender or servicerReceived bad information about my loanI contacted America Education Services in XX/X...AES/PHEAAFL321XXClosed with explanationYes1
\n", 171 | "
" 172 | ], 173 | "text/plain": [ 174 | " product sub_product \\\n", 175 | "0 Debt collection I do not know \n", 176 | "1 Credit reporting NaN \n", 177 | "2 Debt collection I do not know \n", 178 | "3 Mortgage Conventional fixed mortgage \n", 179 | "4 Student loan Non-federal student loan \n", 180 | "\n", 181 | " issue \\\n", 182 | "0 Disclosure verification of debt \n", 183 | "1 Improper use of my credit report \n", 184 | "2 Cont'd attempts collect debt not owed \n", 185 | "3 Loan servicing, payments, escrow account \n", 186 | "4 Dealing with my lender or servicer \n", 187 | "\n", 188 | " sub_issue \\\n", 189 | "0 Right to dispute notice not received \n", 190 | "1 Report improperly shared by CRC \n", 191 | "2 Debt is not mine \n", 192 | "3 NaN \n", 193 | "4 Received bad information about my loan \n", 194 | "\n", 195 | " consumer_complaint_narrative \\\n", 196 | "0 I was denied employment because of a judgment ... \n", 197 | "1 I have a credit card through XXXX XXXX and XXX... \n", 198 | "2 Almost daily phone calls from Stellar Recovery... \n", 199 | "3 I submitted my monthly mortgage payment to Pri... \n", 200 | "4 I contacted America Education Services in XX/X... \n", 201 | "\n", 202 | " company state zip_code \\\n", 203 | "0 Encore Capital Group NY 113XX \n", 204 | "1 Experian IL 606XX \n", 205 | "2 Stellar Recovery Inc. MI 480XX \n", 206 | "3 Primary Residential Mortgage CT 066XX \n", 207 | "4 AES/PHEAA FL 321XX \n", 208 | "\n", 209 | " company_response timely_response consumer_disputed \n", 210 | "0 Closed with explanation Yes 0 \n", 211 | "1 Closed with non-monetary relief Yes 0 \n", 212 | "2 Closed with explanation Yes 1 \n", 213 | "3 Closed with monetary relief Yes 0 \n", 214 | "4 Closed with explanation Yes 1 " 215 | ] 216 | }, 217 | "execution_count": 6, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "df.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 7, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "product\n", 236 | "11\n", 237 | "sub_product\n", 238 | "45\n", 239 | "company_response\n", 240 | "5\n", 241 | "state\n", 242 | "60\n", 243 | "issue\n", 244 | "90\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "for col in one_hot_features:\n", 250 | " print(col)\n", 251 | " print(df[col].nunique())" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 8, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "df['consumer_disputed'] = df['consumer_disputed'].map({'Yes':1, 'No':0})" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 9, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "for feature in one_hot_features:\n", 270 | " df[feature] = df[feature].astype(\"category\").cat.codes" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 10, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "name": "stderr", 280 | "output_type": "stream", 281 | "text": [ 282 | "/var/folders/7h/_rbt3v8d3vd1h3c8f1zdrsmm0000gn/T/ipykernel_43478/1472151705.py:1: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. 
Import numpy directly instead\n", 283 | " one_hot_x = [pd.np.asarray(tf.keras.utils.to_categorical(df[feature_name].values)) for feature_name in one_hot_features]\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "one_hot_x = [pd.np.asarray(tf.keras.utils.to_categorical(df[feature_name].values)) for feature_name in one_hot_features]" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 11, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stderr", 298 | "output_type": "stream", 299 | "text": [ 300 | "/var/folders/7h/_rbt3v8d3vd1h3c8f1zdrsmm0000gn/T/ipykernel_43478/1459859265.py:1: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead\n", 301 | " embedding_x = [pd.np.asarray(df[feature_name].values).reshape(-1) for feature_name in text_features]\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "embedding_x = [pd.np.asarray(df[feature_name].values).reshape(-1) for feature_name in text_features]" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 12, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "df['zip_code'] = df['zip_code'].str.replace('X', '0', regex=True)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 13, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "df['zip_code'] = df['zip_code'].str.replace(r'\\[|\\*|\\+|\\-|`|\\.|\\ |\\$|\\/|!|\\(', '0', regex=True)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 14, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "df['zip_code'] = df['zip_code'].fillna(0)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 15, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "df['zip_code'] = df['zip_code'].astype('int32')" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 16, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "df['zip_code'] = df['zip_code'].apply(lambda x: x//10000)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 17, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "numeric_x = [df['zip_code'].values]" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 18, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "X = one_hot_x + numeric_x + embedding_x" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 19, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "y = np.asarray(df[\"consumer_disputed\"], dtype=np.uint8).reshape(-1)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 22, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "def get_model(show_summary=True):\n", 388 | " \"\"\"\n", 389 | " Function defines a Keras model and returns the model as Keras object\n", 390 | " \"\"\"\n", 391 | " wandb.init(project=\"consumer-complaints\")\n", 392 | " config = wandb.config\n", 393 | " config.name='final_features_wide'\n", 394 | " config.hidden_layer_size = 256\n", 395 | " config.optimizer = 'adam'\n", 396 | " config.learning_rate = 0.001\n", 397 | " config.data_version = 'cc_imbalanced_narrative'\n", 398 | " config.one_hot_features = one_hot_features\n", 399 | " config.numeric_features = numeric_features\n", 400 | " config.text_features = text_features\n", 401 | " \n", 402 | " # one-hot categorical features\n", 403 | " num_products 
= 11\n", 404 | " num_sub_products = 45\n", 405 | " num_company_responses = 5\n", 406 | " num_states = 60\n", 407 | " num_issues = 90\n", 408 | "\n", 409 | " input_product = tf.keras.Input(shape=(num_products,), name=\"product_xf\")\n", 410 | " input_sub_product = tf.keras.Input(shape=(num_sub_products,), name=\"sub_product_xf\")\n", 411 | " input_company_response = tf.keras.Input(shape=(num_company_responses,), name=\"company_response_xf\")\n", 412 | " input_state = tf.keras.Input(shape=(num_states,), name=\"state_xf\")\n", 413 | " input_issue = tf.keras.Input(shape=(num_issues,), name=\"issue_xf\")\n", 414 | " \n", 415 | " # numeric features\n", 416 | " input_zip_code = tf.keras.Input(shape=(1,), name=\"zip_code_xf\")\n", 417 | "\n", 418 | " # text features\n", 419 | " input_narrative = tf.keras.Input(shape=(1,), name=\"narrative_xf\", dtype=tf.string)\n", 420 | "\n", 421 | " # embed text features\n", 422 | " module_url = \"https://tfhub.dev/google/universal-sentence-encoder/4\"\n", 423 | " embed = hub.KerasLayer(module_url)\n", 424 | " reshaped_narrative = tf.reshape(input_narrative, [-1])\n", 425 | " embed_narrative = embed(reshaped_narrative) \n", 426 | " deep_ff = tf.keras.layers.Reshape((512, ), input_shape=(1, 512))(embed_narrative)\n", 427 | " \n", 428 | " deep = tf.keras.layers.Dense(256, activation='relu')(deep_ff)\n", 429 | " deep = tf.keras.layers.Dense(64, activation='relu')(deep)\n", 430 | " deep = tf.keras.layers.Dense(16, activation='relu')(deep)\n", 431 | "\n", 432 | " wide_ff = tf.keras.layers.concatenate(\n", 433 | " [input_product, input_sub_product, input_company_response, \n", 434 | " input_state, input_issue, input_zip_code])\n", 435 | " wide = tf.keras.layers.Dense(16, activation='relu')(wide_ff)\n", 436 | "\n", 437 | "\n", 438 | " both = tf.keras.layers.concatenate([deep, wide])\n", 439 | "\n", 440 | " output = tf.keras.layers.Dense(1, activation='sigmoid')(both) \n", 441 | "\n", 442 | " _inputs = [input_product, input_sub_product, input_company_response, \n", 443 | " input_state, input_issue, input_zip_code, input_narrative]\n", 444 | "\n", 445 | " keras_model = tf.keras.models.Model(_inputs, output)\n", 446 | " keras_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),\n", 447 | " loss='binary_crossentropy', \n", 448 | " metrics=[\n", 449 | " tf.keras.metrics.BinaryAccuracy(),\n", 450 | " tf.keras.metrics.TruePositives()\n", 451 | " ])\n", 452 | " if show_summary:\n", 453 | " keras_model.summary()\n", 454 | "\n", 455 | " return keras_model" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 23, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stderr", 465 | "output_type": "stream", 466 | "text": [ 467 | "2021-11-23 12:45:04.267354: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", 468 | "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 469 | "2021-11-23 12:45:08.180375: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "model = get_model(show_summary=False)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 24, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 
| "Epoch 1/5\n", 487 | "1670/1670 [==============================] - 56s 32ms/step - loss: 0.0062 - binary_accuracy: 0.9973 - true_positives: 0.0000e+00 - val_loss: 1.2860e-06 - val_binary_accuracy: 1.0000 - val_true_positives: 0.0000e+00\n", 488 | "Epoch 2/5\n", 489 | "1670/1670 [==============================] - 52s 31ms/step - loss: 4.0018e-07 - binary_accuracy: 1.0000 - true_positives: 0.0000e+00 - val_loss: 3.3683e-07 - val_binary_accuracy: 1.0000 - val_true_positives: 0.0000e+00\n", 490 | "Epoch 3/5\n", 491 | "1670/1670 [==============================] - 52s 31ms/step - loss: 1.0129e-07 - binary_accuracy: 1.0000 - true_positives: 0.0000e+00 - val_loss: 1.2363e-07 - val_binary_accuracy: 1.0000 - val_true_positives: 0.0000e+00\n", 492 | "Epoch 4/5\n", 493 | "1670/1670 [==============================] - 51s 31ms/step - loss: 3.3623e-08 - binary_accuracy: 1.0000 - true_positives: 0.0000e+00 - val_loss: 5.1761e-08 - val_binary_accuracy: 1.0000 - val_true_positives: 0.0000e+00\n", 494 | "Epoch 5/5\n", 495 | "1670/1670 [==============================] - 52s 31ms/step - loss: 1.2559e-08 - binary_accuracy: 1.0000 - true_positives: 0.0000e+00 - val_loss: 2.3961e-08 - val_binary_accuracy: 1.0000 - val_true_positives: 0.0000e+00\n" 496 | ] 497 | }, 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "" 502 | ] 503 | }, 504 | "execution_count": 24, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "model.fit(x=X, \n", 511 | " y=y, \n", 512 | " batch_size=32, \n", 513 | " validation_split=0.2, \n", 514 | " epochs=5, \n", 515 | " callbacks=[WandbCallback()]\n", 516 | ")" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 7, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "name": "stdout", 526 | "output_type": "stream", 527 | "text": [ 528 | "Failed to import pydot. 
You must install pydot and graphviz for `pydotprint` to work.\n" 529 | ] 530 | } 531 | ], 532 | "source": [ 533 | "#from IPython.display import Image\n", 534 | "\n", 535 | "file_name = 'model.png'\n", 536 | "tf.keras.utils.plot_model(model, to_file=file_name)\n", 537 | "#Image(filename=file_name)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "def get_model(show_summary=True):\n", 547 | " \"\"\"\n", 548 | " Function defines a Keras model and returns the model as Keras object\n", 549 | " \"\"\"\n", 550 | " \n", 551 | " # one-hot categorical features\n", 552 | " num_products = 11\n", 553 | " num_sub_products = 45\n", 554 | " num_company_responses = 5\n", 555 | " num_states = 60\n", 556 | " num_issues = 90\n", 557 | "\n", 558 | " input_product = tf.keras.Input(shape=(num_products,), name=\"product_xf\")\n", 559 | " input_sub_product = tf.keras.Input(shape=(num_sub_products,), name=\"sub_product_xf\")\n", 560 | " input_company_response = tf.keras.Input(shape=(num_company_responses,), name=\"company_response_xf\")\n", 561 | " input_state = tf.keras.Input(shape=(num_states,), name=\"state_xf\")\n", 562 | " input_issue = tf.keras.Input(shape=(num_issues,), name=\"issue_xf\")\n", 563 | " \n", 564 | " # numeric features\n", 565 | " input_zip_code = tf.keras.Input(shape=(1,), name=\"zip_code_xf\")\n", 566 | "\n", 567 | " # text features\n", 568 | " input_narrative = tf.keras.Input(shape=(1,), name=\"narrative_xf\", dtype=tf.string)\n", 569 | "\n", 570 | " # embed text features\n", 571 | " module_url = \"https://tfhub.dev/google/universal-sentence-encoder/4\"\n", 572 | " embed = hub.KerasLayer(module_url)\n", 573 | " reshaped_narrative = tf.reshape(input_narrative, [-1])\n", 574 | " embed_narrative = embed(reshaped_narrative) \n", 575 | " deep_ff = tf.keras.layers.Reshape((512, ), input_shape=(1, 512))(embed_narrative)\n", 576 | " \n", 577 | " deep = tf.keras.layers.Dense(256, activation='relu')(deep_ff)\n", 578 | " deep = tf.keras.layers.Dense(64, activation='relu')(deep)\n", 579 | " deep = tf.keras.layers.Dense(16, activation='relu')(deep)\n", 580 | "\n", 581 | " wide_ff = tf.keras.layers.concatenate(\n", 582 | " [input_product, input_sub_product, input_company_response, \n", 583 | " input_state, input_issue, input_zip_code])\n", 584 | " wide = tf.keras.layers.Dense(16, activation='relu')(wide_ff)\n", 585 | "\n", 586 | "\n", 587 | " both = tf.keras.layers.concatenate([deep, wide])\n", 588 | "\n", 589 | " output = tf.keras.layers.Dense(1, activation='sigmoid')(both) \n", 590 | "\n", 591 | " _inputs = [input_product, input_sub_product, input_company_response, \n", 592 | " input_state, input_issue, input_zip_code, input_narrative]\n", 593 | "\n", 594 | " keras_model = tf.keras.models.Model(_inputs, output)\n", 595 | " keras_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),\n", 596 | " loss='binary_crossentropy', \n", 597 | " metrics=[\n", 598 | " tf.keras.metrics.BinaryAccuracy(),\n", 599 | " tf.keras.metrics.TruePositives()\n", 600 | " ])\n", 601 | " if show_summary:\n", 602 | " keras_model.summary()\n", 603 | "\n", 604 | " return keras_model" 605 | ] 606 | } 607 | ], 608 | "metadata": { 609 | "kernelspec": { 610 | "display_name": "Python 3", 611 | "language": "python", 612 | "name": "python3" 613 | }, 614 | "language_info": { 615 | "codemirror_mode": { 616 | "name": "ipython", 617 | "version": 3 618 | }, 619 | "file_extension": ".py", 620 | "mimetype": "text/x-python", 621 | 
"name": "python", 622 | "nbconvert_exporter": "python", 623 | "pygments_lexer": "ipython3", 624 | "version": "3.8.12" 625 | } 626 | }, 627 | "nbformat": 4, 628 | "nbformat_minor": 4 629 | } 630 | -------------------------------------------------------------------------------- /pre-experiment-pipeline/make_final_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "source: https://www.kaggle.com/cfpb/us-consumer-finance-complaints" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import os\n", 18 | "import sys" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "nb_dir = os.path.split(os.getcwd())[0]\n", 28 | "if nb_dir not in sys.path:\n", 29 | " sys.path.append(nb_dir)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "feature_cols=[\"product\", \"sub_product\", \"issue\", \"sub_issue\", \"state\", \"zipcode\", \"company\", \"company_response_to_consumer\", \"timely_response\", \"consumer_disputed?\", \"consumer_complaint_narrative\"]\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stderr", 48 | "output_type": "stream", 49 | "text": [ 50 | "/Users/i854694/.virtualenvs/pipelines/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n", 51 | " interactivity=interactivity, compiler=compiler, result=result)\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "df = pd.read_csv(_FILE_LOCATION, usecols=feature_cols)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "
\n", 68 | "\n", 81 | "\n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | "
productsub_productissuesub_issueconsumer_complaint_narrativecompanystatezipcodecompany_response_to_consumertimely_responseconsumer_disputed?
0MortgageOther mortgageLoan modification,collection,foreclosureNaNNaNU.S. BancorpCA95993Closed with explanationYesYes
1MortgageOther mortgageLoan servicing, payments, escrow accountNaNNaNWells Fargo & CompanyCA91104Closed with explanationYesYes
2Credit reportingNaNIncorrect information on credit reportAccount statusNaNWells Fargo & CompanyNY11764Closed with explanationYesNo
3Student loanNon-federal student loanRepaying your loanRepaying your loanNaNNavient Solutions, Inc.MD21402Closed with explanationYesYes
4Debt collectionCredit cardFalse statements or representationAttempted to collect wrong amountNaNResurgent Capital Services L.P.GA30106Closed with explanationYesYes
\n", 171 | "
" 172 | ], 173 | "text/plain": [ 174 | " product sub_product \\\n", 175 | "0 Mortgage Other mortgage \n", 176 | "1 Mortgage Other mortgage \n", 177 | "2 Credit reporting NaN \n", 178 | "3 Student loan Non-federal student loan \n", 179 | "4 Debt collection Credit card \n", 180 | "\n", 181 | " issue \\\n", 182 | "0 Loan modification,collection,foreclosure \n", 183 | "1 Loan servicing, payments, escrow account \n", 184 | "2 Incorrect information on credit report \n", 185 | "3 Repaying your loan \n", 186 | "4 False statements or representation \n", 187 | "\n", 188 | " sub_issue consumer_complaint_narrative \\\n", 189 | "0 NaN NaN \n", 190 | "1 NaN NaN \n", 191 | "2 Account status NaN \n", 192 | "3 Repaying your loan NaN \n", 193 | "4 Attempted to collect wrong amount NaN \n", 194 | "\n", 195 | " company state zipcode company_response_to_consumer \\\n", 196 | "0 U.S. Bancorp CA 95993 Closed with explanation \n", 197 | "1 Wells Fargo & Company CA 91104 Closed with explanation \n", 198 | "2 Wells Fargo & Company NY 11764 Closed with explanation \n", 199 | "3 Navient Solutions, Inc. MD 21402 Closed with explanation \n", 200 | "4 Resurgent Capital Services L.P. GA 30106 Closed with explanation \n", 201 | "\n", 202 | " timely_response consumer_disputed? \n", 203 | "0 Yes Yes \n", 204 | "1 Yes Yes \n", 205 | "2 Yes No \n", 206 | "3 Yes Yes \n", 207 | "4 Yes Yes " 208 | ] 209 | }, 210 | "execution_count": 5, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "df.head()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 6, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "df.columns = df.columns.str.replace(' ','_').str.replace('?', '')" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 7, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "df = df.rename({'zipcode': 'zip_code', 'company_response_to_consumer': 'company_response'}, axis=1)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 8, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/html": [ 245 | "
\n", 246 | "\n", 259 | "\n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | "
productsub_productissuesub_issueconsumer_complaint_narrativecompanystatezip_codecompany_responsetimely_responseconsumer_disputed
0MortgageOther mortgageLoan modification,collection,foreclosureNaNNaNU.S. BancorpCA95993Closed with explanationYesYes
1MortgageOther mortgageLoan servicing, payments, escrow accountNaNNaNWells Fargo & CompanyCA91104Closed with explanationYesYes
2Credit reportingNaNIncorrect information on credit reportAccount statusNaNWells Fargo & CompanyNY11764Closed with explanationYesNo
3Student loanNon-federal student loanRepaying your loanRepaying your loanNaNNavient Solutions, Inc.MD21402Closed with explanationYesYes
4Debt collectionCredit cardFalse statements or representationAttempted to collect wrong amountNaNResurgent Capital Services L.P.GA30106Closed with explanationYesYes
\n", 349 | "
" 350 | ], 351 | "text/plain": [ 352 | " product sub_product \\\n", 353 | "0 Mortgage Other mortgage \n", 354 | "1 Mortgage Other mortgage \n", 355 | "2 Credit reporting NaN \n", 356 | "3 Student loan Non-federal student loan \n", 357 | "4 Debt collection Credit card \n", 358 | "\n", 359 | " issue \\\n", 360 | "0 Loan modification,collection,foreclosure \n", 361 | "1 Loan servicing, payments, escrow account \n", 362 | "2 Incorrect information on credit report \n", 363 | "3 Repaying your loan \n", 364 | "4 False statements or representation \n", 365 | "\n", 366 | " sub_issue consumer_complaint_narrative \\\n", 367 | "0 NaN NaN \n", 368 | "1 NaN NaN \n", 369 | "2 Account status NaN \n", 370 | "3 Repaying your loan NaN \n", 371 | "4 Attempted to collect wrong amount NaN \n", 372 | "\n", 373 | " company state zip_code company_response \\\n", 374 | "0 U.S. Bancorp CA 95993 Closed with explanation \n", 375 | "1 Wells Fargo & Company CA 91104 Closed with explanation \n", 376 | "2 Wells Fargo & Company NY 11764 Closed with explanation \n", 377 | "3 Navient Solutions, Inc. MD 21402 Closed with explanation \n", 378 | "4 Resurgent Capital Services L.P. GA 30106 Closed with explanation \n", 379 | "\n", 380 | " timely_response consumer_disputed \n", 381 | "0 Yes Yes \n", 382 | "1 Yes Yes \n", 383 | "2 Yes No \n", 384 | "3 Yes Yes \n", 385 | "4 Yes Yes " 386 | ] 387 | }, 388 | "execution_count": 8, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "df.head()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 9, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "Index(['product', 'sub_product', 'issue', 'sub_issue',\n", 406 | " 'consumer_complaint_narrative', 'company', 'state', 'zip_code',\n", 407 | " 'company_response', 'timely_response', 'consumer_disputed'],\n", 408 | " dtype='object')" 409 | ] 410 | }, 411 | "execution_count": 9, 412 | "metadata": {}, 413 | "output_type": "execute_result" 414 | } 415 | ], 416 | "source": [ 417 | "df.columns" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 10, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/plain": [ 428 | "555957" 429 | ] 430 | }, 431 | "execution_count": 10, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "len(df)" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 11, 443 | "metadata": {}, 444 | "outputs": [ 445 | { 446 | "data": { 447 | "text/plain": [ 448 | "66806" 449 | ] 450 | }, 451 | "execution_count": 11, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | } 455 | ], 456 | "source": [ 457 | "df['consumer_complaint_narrative'].notnull().sum()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 12, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "df = df[df['consumer_complaint_narrative'].notnull()]" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 13, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "df['c'] = df['consumer_disputed'].map({'Yes': 1, 'No': 0})\n", 476 | "df = df.drop('consumer_disputed', axis=1)\n", 477 | "df = df.rename(columns={\"c\": \"consumer_disputed\"})" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 14, 483 | "metadata": {}, 484 | "outputs": [ 485 | { 486 | "data": { 487 | "text/plain": [ 488 | "0 51229\n", 489 | "1 15577\n", 490 | 
"Name: consumer_disputed, dtype: int64" 491 | ] 492 | }, 493 | "execution_count": 14, 494 | "metadata": {}, 495 | "output_type": "execute_result" 496 | } 497 | ], 498 | "source": [ 499 | "df['consumer_disputed'].value_counts()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 15, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "df = df.sample(frac=1, replace=False).reset_index(drop=True)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 16, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "df = df.replace(r'\\s', ' ', regex=True)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 18, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "df.to_csv('../data/consumer_complaints_with_narrative.csv', index=False)" 527 | ] 528 | } 529 | ], 530 | "metadata": { 531 | "kernelspec": { 532 | "display_name": "Python 3", 533 | "language": "python", 534 | "name": "python3" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 3 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython3", 546 | "version": "3.7.2" 547 | } 548 | }, 549 | "nbformat": 4, 550 | "nbformat_minor": 4 551 | } 552 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboard_plugin_fairness_indicators==0.35.0 2 | tensorflow_hub==0.12.0 3 | tensorflow_privacy==0.7.3 4 | tensorflow==2.6.1 5 | tfx==1.4.0 6 | witwidget==1.8.1 7 | -------------------------------------------------------------------------------- /requirements/test_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==5.4.3 2 | pytest-flakes==4.0.0 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | addopts=--tb=short 3 | 4 | [flake8] 5 | max-line-length = 80 6 | 7 | [yapf] 8 | based_on_style = pep8 9 | -------------------------------------------------------------------------------- /utils/download_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Downloads the csv data 5 | """ 6 | 7 | import logging 8 | import os 9 | import shutil 10 | 11 | import pandas as pd 12 | import urllib3 13 | 14 | # Initial dataset source 15 | DATASET_URL = "http://bit.ly/building-ml-pipelines-dataset" 16 | 17 | # Initial local dataset location 18 | LOCAL_FILE_NAME = "data/consumer_complaints_with_narrative.csv" 19 | 20 | 21 | def download_dataset(url=DATASET_URL): 22 | """download_dataset downloads the remote dataset to a local path 23 | 24 | Keyword Arguments: 25 | url {string} -- 26 | complete url path to the csv data source (default: {DATASET_URL}) 27 | local_path {string} -- 28 | initial local file location (default: {LOCAL_FILE_NAME}) 29 | Returns: 30 | None 31 | """ 32 | # disable insecure https warning 33 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 34 | 35 | c = urllib3.PoolManager() 36 | with c.request("GET", url, preload_content=False) as res, open( 37 | LOCAL_FILE_NAME, "wb" 38 | ) as out_file: 39 | shutil.copyfileobj(res, out_file) 40 | 
logging.info("Download completed.") 41 | 42 | 43 | def create_folder(): 44 | """Creates a data folder if it doesn't exist. 45 | 46 | Returns: 47 | None 48 | """ 49 | directory = "data/" 50 | if not os.path.exists(directory): 51 | os.makedirs(directory) 52 | logging.info("Data folder created.") 53 | else: 54 | logging.info("Data folder already existed.") 55 | 56 | 57 | def check_execution_path(): 58 | """Check if the function and therefore all subsequent functions 59 | are executed from the root of the project 60 | 61 | Returns: 62 | boolean -- returns False if execution path isn't the root, 63 | otherwise True 64 | """ 65 | file_name = "LICENSE" 66 | if not os.path.exists(file_name): 67 | logging.error( 68 | "Don't execute the script from a sub-directory. " 69 | "Switch to the root of the project folder" 70 | ) 71 | return False 72 | return True 73 | 74 | 75 | if __name__ == "__main__": 76 | 77 | logging.basicConfig(level=logging.INFO) 78 | logging.info("Started download script") 79 | 80 | if check_execution_path(): 81 | create_folder() 82 | download_dataset() 83 | 84 | logging.info("Finished download script") 85 | --------------------------------------------------------------------------------