├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md ├── ci │ ├── .pre-commit-config.yaml │ ├── .pylintrc │ └── .style.yapf └── workflows │ ├── automerge.yml │ ├── cherrypick.yml │ ├── ci.yml │ ├── ci_examples.yml │ ├── filter_examples.py │ ├── filter_projects.py │ ├── lint.yml │ ├── minor_release.yml │ ├── prepare_minor_release.py │ ├── release.yml │ └── update_main.py ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .style.yapf ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── RELEASE.md ├── examples ├── README.md ├── example_filter │ ├── data │ │ └── test_data.csv │ ├── filter_example_colab.ipynb │ └── filter_function.py ├── fraud_feast │ ├── README.md │ ├── feast_pipeline_local.py │ ├── repo │ │ ├── .gitignore │ │ ├── driver_repo.py │ │ └── feature_store.yaml │ └── requirements.txt ├── model_card_generator │ ├── .gitignore │ └── MLMD_Model_Card_Toolkit_Demo.ipynb ├── pandas_transform │ ├── README.md │ ├── pandas_transform_example.ipynb │ └── requirements.txt ├── sklearn_penguins │ ├── .gitignore │ └── README.md └── xgboost_penguins │ ├── README.md │ ├── __init__.py │ ├── data │ └── penguins_processed.csv │ ├── penguin_pipeline_local.py │ ├── penguin_pipeline_local_e2e_test.py │ ├── requirements.txt │ └── utils.py ├── proposals ├── 20210404-sklearn_example.md ├── 20210507-mlmd_client_lib.md ├── 20210525-examplefilter.md ├── 20210525-feast_example_gen.md ├── 20210526-model_load_test_component.md ├── 20210605-schema_curation_custom_component.md ├── 20210707-xgboost_evaluator.md ├── 20210721-sampling_component.md ├── 20210723-feature_selection_custom_component.md ├── 20210817-firebase_ml_publisher_component.md ├── 20211124-model_card_component.md ├── 20220117-exit-handler-slack.md ├── 20220118-upload_predictions_to_bigquery.md ├── 20220513-pandas_transform.md ├── 20220802-project_pytorch_example.md ├── 20220823-huggingface_model_pusher.md ├── 20230209-copy_example_gen.md ├── 20230328-airflow_orchestration.md ├── README.md └── yyyymmdd-project_template.md ├── pyproject.toml ├── setup.py └── tfx_addons ├── __init__.py ├── apache_airflow └── README.md ├── copy_example_gen ├── README.md ├── __init__.py ├── component.py └── component_test.py ├── example_filter ├── README.md ├── RELEASE.md ├── component.py ├── component_test.py ├── data │ └── test_data.csv └── filter_function.py ├── feast_examplegen ├── README.md ├── __init__.py ├── component.py ├── component_test.py ├── converters.py ├── executor.py └── executor_test.py ├── feature_selection ├── CONTRIBUTING.md ├── README.md ├── RELEASE.md ├── __init__.py ├── component.py ├── component_test.py ├── data │ └── data.csv ├── example │ ├── Iris_example_colab.ipynb │ ├── Palmer_Penguins_example_colab.ipynb │ ├── Pima_Indians_Diabetes_example_colab.ipynb │ └── modules │ │ ├── iris_module_file.py │ │ ├── penguins_module.py │ │ └── pima_indians_module_file.py ├── nb │ └── Example.ipynb ├── requirements.txt └── test │ └── iris.csv ├── firebase_publisher ├── README.md ├── RELEASE.md ├── __init__.py ├── component.py ├── component_test.py ├── executor.py ├── runner.py └── runner_test.py ├── huggingface_pusher ├── README.md ├── __init__.py ├── component.py ├── component_test.py ├── executor.py ├── executor_test.py ├── runner.py └── runner_test.py ├── message_exit_handler ├── README.md ├── RELEASE.md ├── __init__.py ├── component.py ├── component_test.py ├── constants.py └── message_providers │ ├── __init__.py │ ├── base_provider.py │ ├── base_provider_test.py │ ├── logging_provider.py │ ├── 
logging_provider_test.py │ ├── slack_provider.py │ └── slack_provider_test.py ├── mlmd_client ├── README.md ├── RELEASE.md ├── __init__.py ├── client.py └── client_test.py ├── model_card_generator ├── README.md ├── RELEASE.md ├── __init__.py ├── artifact.py ├── artifact_test.py ├── component.py ├── component_test.py ├── executor.py ├── executor_test.py └── tfxtest.py ├── pandas_transform ├── README.md ├── RELEASE.md ├── __init__.py ├── component.py ├── component_test.py └── null_preprocessing.py ├── predictions_to_bigquery ├── __init__.py ├── component.py ├── executor.py ├── executor_test.py ├── test_component.py └── utils.py ├── sampling ├── README.md ├── __init__.py ├── component.py ├── component_test.py ├── data │ ├── example_gen │ │ ├── Split-eval │ │ │ └── data_tfrecord-00000-of-00001.gz │ │ └── Split-train │ │ │ └── data_tfrecord-00000-of-00001.gz │ └── test_data.csv ├── example │ ├── __init__.py │ ├── data │ │ └── credit_fraud.csv │ ├── local_notebook.ipynb │ ├── sampler_pipeline_local.py │ └── sampler_utils.py ├── executor.py ├── executor_test.py └── spec.py ├── schema_curation ├── CONTRIBUTING.md ├── README.md ├── RELEASE.md ├── __init__.py ├── component │ ├── __init__.py │ ├── component.py │ ├── component_test.py │ ├── executor.py │ └── executor_test.py ├── example │ ├── __init__.py │ ├── module_file.py │ ├── taxi_example_colab.ipynb │ └── taxi_example_local.py └── test_data │ ├── module_file │ └── module_file.py │ └── schema_gen │ ├── __init__.py │ └── schema.pbtxt ├── utils ├── __init__.py ├── test_utils.py └── test_utils_tests.py ├── version.py └── xgboost_evaluator ├── README.md ├── RELEASE.md ├── __init__.py ├── component.py ├── data └── penguins_processed.csv ├── xgboost_predict_extractor.py └── xgboost_predict_extractor_test.py /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Expected Behavior 2 | 3 | 4 | ## Actual Behavior 5 | 6 | 7 | ## Steps to Reproduce the Problem 8 | 9 | 1. 10 | 1. 11 | 1. 12 | 13 | ## Specifications 14 | 15 | - Version: 16 | - Platform: -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | > It's a good idea to open an issue first for discussion. 
4 | 5 | - [ ] Tests pass 6 | - [ ] Appropriate changes to README are included in PR -------------------------------------------------------------------------------- /.github/ci/.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | default_stages: [push,commit] 4 | repos: 5 | - repo: https://github.com/pre-commit/mirrors-yapf 6 | rev: v0.31.0 7 | hooks: 8 | - id: yapf 9 | - repo: https://github.com/pycqa/isort 10 | rev: 5.11.5 11 | hooks: 12 | - id: isort 13 | name: isort (python) 14 | - repo: https://github.com/PyCQA/pylint 15 | rev: v2.8.3 16 | hooks: 17 | - id: pylint -------------------------------------------------------------------------------- /.github/ci/.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style=pep8 3 | indent_width=2 4 | ALLOW_MULTILINE_DICTIONARY_KEYS=True 5 | -------------------------------------------------------------------------------- /.github/workflows/automerge.yml: -------------------------------------------------------------------------------- 1 | name: Automatic merging 2 | on: 3 | pull_request_target: { types: [opened, synchronize] } 4 | issue_comment: { types: [created] } 5 | 6 | jobs: 7 | automerge: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v1 11 | - name: Run Codeowners merge check 12 | uses: casassg/auto-merge-bot@v0.3 13 | env: 14 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 15 | with: 16 | merge_method: 'squash' 17 | assign_reviewer: 'false' -------------------------------------------------------------------------------- /.github/workflows/cherrypick.yml: -------------------------------------------------------------------------------- 1 | name: Cherry pick 2 | on: 3 | issue_comment: 4 | types: [created] 5 | jobs: 6 | cherry-pick: 7 | name: Cherry Pick 8 | # Only cherry pick if user is a release manager 9 | # NB(gcasassaez): We unfortunately have to use fromJSON as GitHub doesn't have a way to specify constant arrays 10 | # See: https://github.community/t/passing-an-array-literal-to-contains-function-causes-syntax-error/17213/3 11 | if: github.event.issue.pull_request != '' && contains(github.event.comment.body, '/cherry-pick') && contains(fromJson('["casassg", "hanneshapke"]'), github.event.sender.login) 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout the latest code 15 | uses: actions/checkout@v2 16 | with: 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | fetch-depth: 0 # otherwise, you will fail to push refs to dest repo 19 | - name: Automatic Cherry Pick 20 | uses: vendoo/gha-cherry-pick@v1 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'tfx_addons/**' 7 | - '.github/workflows/ci.yml' 8 | - '.github/workflows/filter_projects.py' 9 | - 'setup.py' 10 | - 'pyproject.toml' 11 | - '.github/ci/deps/**' 12 | branches: 13 | - main 14 | - r* 15 | pull_request: 16 | paths: 17 | - 'tfx_addons/**' 18 | - '.github/workflows/ci.yml' 19 | - '.github/workflows/filter_projects.py' 20 | - 'setup.py' 21 | - 'pyproject.toml' 22 | - '.github/ci/deps/**' 23 | branches: 24 | - main 25 | - r* 26 | 27 | concurrency: 28 | group: ${{ github.workflow 
}}-${{ github.ref }} 29 | cancel-in-progress: true 30 | 31 | jobs: 32 | filter_projects: 33 | # Dynamic matrix trick inspired by https://www.cynkra.com/blog/2020-12-23-dynamic-gha/ 34 | runs-on: ubuntu-latest 35 | timeout-minutes: 60 36 | outputs: 37 | projects: ${{ steps.set-matrix.outputs.projects }} 38 | steps: 39 | - uses: actions/checkout@v2 40 | - name: Set up Python 3.7 41 | uses: actions/setup-python@v2 42 | with: 43 | python-version: 3.7 44 | - name: Get Changed Files 45 | id: changed_files 46 | uses: trilom/file-changes-action@v1.2.4 47 | with: 48 | output: json 49 | - name: Filter projects 50 | id: set-matrix 51 | run: | 52 | echo "projects=$(python ./.github/workflows/filter_projects.py $HOME/files.json)" >> $GITHUB_OUTPUT 53 | 54 | ci: 55 | runs-on: ubuntu-latest 56 | needs: filter_projects 57 | timeout-minutes: 60 58 | if: needs.filter_projects.outputs.projects != '[]' 59 | strategy: 60 | # Test for each project in parallel using ci_max and ci_min to ensure 61 | # tested in range of tfx/tensorflow supported versions 62 | matrix: 63 | project: ${{fromJson(needs.filter_projects.outputs.projects)}} 64 | depconstraint: 65 | - ci_max 66 | - ci_min 67 | steps: 68 | - uses: actions/checkout@v2 69 | - name: Set up Python 3.7 70 | uses: actions/setup-python@v2 71 | with: 72 | python-version: 3.7 73 | - name: Cache python environment 74 | uses: actions/cache@v2 75 | with: 76 | # Cache pip 77 | path: ~/.cache/pip 78 | # Look to see if there is a cache hit for the corresponding setup.py + TFX version 79 | key: ${{ runner.os }}-pip-${{ matrix.depconstraint }}-${{ hashFiles('tfx_addons/version.py') }} 80 | restore-keys: | 81 | ${{ runner.os }}-pip-${{ matrix.depconstraint }} 82 | - name: Install dependencies 83 | run: | 84 | python -m pip install --upgrade pip wheel 85 | python -m pip install -e ".[${{ matrix.project }}, ${{ matrix.depconstraint }}, test]" 86 | - name: Run tests 87 | run: pytest tfx_addons/${{ matrix.project }} 88 | -------------------------------------------------------------------------------- /.github/workflows/ci_examples.yml: -------------------------------------------------------------------------------- 1 | name: Examples CI 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'tfx_addons/**' 7 | - 'examples/**' 8 | - '.github/workflows/ci_examples.yml' 9 | - '.github/workflows/filter_examples.py' 10 | - 'setup.py' 11 | - 'pyproject.toml' 12 | branches: 13 | - main 14 | - r* 15 | pull_request: 16 | paths: 17 | - 'tfx_addons/**' 18 | - 'examples/**' 19 | - '.github/workflows/ci_examples.yml' 20 | - '.github/workflows/filter_examples.py' 21 | - 'setup.py' 22 | - 'pyproject.toml' 23 | branches: 24 | - main 25 | - r* 26 | 27 | concurrency: 28 | group: ${{ github.workflow }}-${{ github.ref }} 29 | cancel-in-progress: true 30 | 31 | jobs: 32 | filter_examples: 33 | # Dynamic matrix trick inspired by https://www.cynkra.com/blog/2020-12-23-dynamic-gha/ 34 | runs-on: ubuntu-latest 35 | timeout-minutes: 60 36 | outputs: 37 | projects: ${{ steps.set-matrix.outputs.projects }} 38 | steps: 39 | - uses: actions/checkout@v2 40 | - name: Set up Python 3.7 41 | uses: actions/setup-python@v2 42 | with: 43 | python-version: 3.7 44 | - name: Get Changed Files 45 | id: changed_files 46 | uses: trilom/file-changes-action@v1.2.4 47 | with: 48 | output: json 49 | - name: Filter example projects 50 | id: set-matrix 51 | run: | 52 | echo "projects=$(python ./.github/workflows/filter_examples.py $HOME/files.json)" >> $GITHUB_OUTPUT 53 | ci-examples: 54 | runs-on: ubuntu-latest 55 | needs: filter_examples
56 | timeout-minutes: 60 57 | if: needs.filter_examples.outputs.projects != '[]' 58 | strategy: 59 | # Test for each project in parallel using ci_max and ci_min to ensure 60 | # tested in range of tfx/tensorflow supported versions 61 | matrix: 62 | project: ${{fromJson(needs.filter_examples.outputs.projects)}} 63 | steps: 64 | - uses: actions/checkout@v2 65 | - name: Set up Python 3.7 66 | uses: actions/setup-python@v2 67 | with: 68 | python-version: 3.7 69 | - name: Cache python environment 70 | uses: actions/cache@v2 71 | with: 72 | # Cache installed dependencies 73 | path: ~/.cache/pip 74 | # Look to see if there is a cache hit for the corresponding requirement.txt + project name 75 | key: ${{ runner.os }}-pip-ciexamples-${{ matrix.project }}-${{ hashFiles(format('examples/{0}/requirements.txt', matrix.project)) }} 76 | restore-keys: | 77 | ${{ runner.os }}-pip-ciexamples-${{ matrix.project }} 78 | - name: Install dependencies 79 | run: | 80 | python -m pip install --upgrade pip wheel pytest 81 | cd examples/${{ matrix.project }} 82 | pip install -r requirements.txt 83 | - name: Run tests 84 | run: | 85 | cd examples/${{ matrix.project }} 86 | python -m pytest . 87 | 88 | -------------------------------------------------------------------------------- /.github/workflows/filter_projects.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Internal script to parse changed files and potential pkgs and returns the overlap""" 16 | 17 | import argparse 18 | import json 19 | import logging 20 | import os 21 | from typing import List 22 | 23 | logging.getLogger().setLevel(logging.INFO) 24 | 25 | # NB(casassg): Files that if changed should trigger running CI for all projects. 
26 | # These are files which are core and we want to avoid causing outages 27 | # because of changes to them 28 | RUN_ALL_FILES = [ 29 | "tfx_addons/version.py", "setup.py", ".github/workflows/ci.yml", 30 | "pyproject.toml" 31 | ] 32 | 33 | # Get event that triggered workflow 34 | # See: https://docs.github.com/en/actions/learn-github-actions/environment-variables#default-environment-variables 35 | GH_EVENT_NAME = os.environ.get("GITHUB_EVENT_NAME", "unknown") 36 | 37 | 38 | def _get_testable_projects() -> List[str]: 39 | """Get _PKG_METADATA from version.py which contains what projects are active 40 | """ 41 | context = {} 42 | base_dir = os.path.dirname( 43 | os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 44 | with open(os.path.join(base_dir, "tfx_addons", "version.py")) as fp: 45 | exec(fp.read(), context) # pylint: disable=exec-used 46 | 47 | return list(context["_PKG_METADATA"].keys()) 48 | 49 | 50 | def get_affected_projects(affected_files: List[str]) -> List[str]: 51 | """Given a list of affected files, and projects that can be tested, 52 | find which projects CI should run""" 53 | 54 | logging.info("Found affected files: %s", affected_files) 55 | testable_projects = _get_testable_projects() 56 | if GH_EVENT_NAME == "push": 57 | logging.info("GitHub Action trigger is %s, running all projects", 58 | GH_EVENT_NAME) 59 | return testable_projects 60 | else: 61 | logging.info("GitHub Action trigger is %s, filtering projects", 62 | GH_EVENT_NAME) 63 | for run_all_file in RUN_ALL_FILES: 64 | if run_all_file in affected_files: 65 | logging.warning("Found change in %s, running all projects", run_all_file) 66 | return testable_projects 67 | projects_to_test = set() 68 | for file in affected_files: 69 | if file.startswith("tfx_addons"): 70 | file_component = file.replace("tfx_addons/", "").split("/", 71 | maxsplit=1)[0] 72 | if file_component in testable_projects: 73 | logging.info("Package %s is marked for testing", file_component) 74 | projects_to_test.add(file_component) 75 | else: 76 | logging.warning( 77 | "Package %s is not in the _PKG_METADATA variable in version.py", 78 | file_component) 79 | return list(projects_to_test) 80 | 81 | 82 | if __name__ == "__main__": 83 | parser = argparse.ArgumentParser() 84 | parser.add_argument("file_manifest") 85 | 86 | args = parser.parse_args() 87 | 88 | with open(args.file_manifest, "r") as f: 89 | affected_components = get_affected_projects(json.load(f)) 90 | print(json.dumps(affected_components)) 91 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | paths: 6 | - '**.py' 7 | - '.github/ci/**' 8 | - '.github/workflows/lint.yml' 9 | branches: 10 | - main 11 | - r* 12 | pull_request: 13 | paths: 14 | - '**.py' 15 | - '.github/ci/**' 16 | - '.github/workflows/lint.yml' 17 | branches: 18 | - main 19 | - r* 20 | 21 | jobs: 22 | pre-commit-checks: 23 | runs-on: ubuntu-latest 24 | timeout-minutes: 60 25 | steps: 26 | - uses: actions/checkout@v2 27 | - uses: pre-commit/action@v2.0.3 28 | name: Run pre-commit checks (pylint/yapf/isort) 29 | env: 30 | SKIP: insert-license 31 | with: 32 | extra_args: --hook-stage push --all-files 33 | -------------------------------------------------------------------------------- /.github/workflows/minor_release.yml: -------------------------------------------------------------------------------- 1 | name: Create Minor Release 2 | on: 3 |
workflow_dispatch: 4 | 5 | jobs: 6 | createrelease: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Check out code 11 | uses: actions/checkout@v2 12 | - name: Set minor version 13 | id: set-version 14 | run: | 15 | echo "::set-output name=version::$(python ./.github/workflows/prepare_minor_release.py)" 16 | - name: Create release branch 17 | run: git checkout -b r${{ steps.set-version.outputs.version }} 18 | - name: Initialize mandatory git config 19 | run: | 20 | git config user.name "GitHub Actions" 21 | git config user.email noreply@github.com 22 | - name: Commit changelog and manifest files 23 | id: make-commit 24 | run: | 25 | git add tfx_addons/version.py 26 | git commit --message "Prepare release ${{ steps.set-version.outputs.version }}" 27 | echo "::set-output name=commit::$(git rev-parse HEAD)" 28 | - name: Push new branch 29 | run: git push origin r${{ steps.set-version.outputs.version }} 30 | - uses: ncipollo/release-action@v1 31 | with: 32 | name: v${{ steps.set-version.outputs.version }}.0rc0 33 | commit: ${{ steps.make-commit.outputs.commit }} 34 | prerelease: true 35 | draft: true 36 | generateReleaseNotes: true 37 | skipIfReleaseExists: true 38 | tag: v${{ steps.set-version.outputs.version }}.0rc0 39 | - name: Update main 40 | id: update-main 41 | run: | 42 | git checkout main 43 | echo "::set-output name=new_version::$(python ./.github/workflows/update_main.py)" 44 | - name: Commit main change 45 | run: | 46 | git checkout -b ${{ github.triggering_actor }}/update-${{ steps.update-main.outputs.new_version }} 47 | git add tfx_addons/version.py 48 | git commit --message "Update main to ${{ steps.update-main.outputs.new_version }}" 49 | git push origin ${{ github.triggering_actor }}/update-${{ steps.update-main.outputs.new_version }} 50 | 51 | - name: Create pull request into main 52 | uses: thomaseizinger/create-pull-request@1.0.0 53 | with: 54 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 55 | head: ${{ github.triggering_actor }}/update-${{ steps.update-main.outputs.new_version }} 56 | base: main 57 | title: Update minor version to ${{ steps.update-main.outputs.new_version }} 58 | reviewers: ${{ github.triggering_actor }} 59 | body: | 60 | This is an automatic PR triggered by ${{ github.triggering_actor }} to prepare for ${{ steps.set-version.outputs.version }} release. 61 | 62 | Approve and merge in order to update main branch to ${{ steps.update-main.outputs.new_version }}. 63 | 64 | Check out [RELEASE.md](https://github.com/tensorflow/tfx-addons/blob/main/RELEASE.md) for more details. 65 | -------------------------------------------------------------------------------- /.github/workflows/prepare_minor_release.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Internal script to perform a minor release""" 16 | import logging 17 | import os 18 | import sys 19 | 20 | logging.getLogger().setLevel(logging.INFO) 21 | # Dynamically load root as module so that we can import version 22 | BASE_DIR = os.path.dirname( 23 | os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 24 | sys.path.append(BASE_DIR) 25 | 26 | import tfx_addons as tfxa # pylint: disable=wrong-import-position 27 | 28 | current_version = tfxa.__version__ 29 | major, minor, patch = current_version.split(".") 30 | 31 | with open(os.path.join(BASE_DIR, "tfx_addons", "version.py")) as f: 32 | lines = f.readlines() 33 | 34 | with open(os.path.join(BASE_DIR, "tfx_addons", "version.py"), "w") as f: 35 | for l in lines: 36 | if l.startswith("_VERSION_SUFFIX"): 37 | f.write('_VERSION_SUFFIX = "rc0"\n') 38 | else: 39 | f.write(l) 40 | 41 | print(".".join([major, minor])) 42 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release TFX Addons package to PyPI and TestPyPI 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'tfx_addons/**' 7 | - 'setup.py' 8 | - 'pyproject.toml' 9 | branches: 10 | - main 11 | - r* 12 | release: 13 | types: [published] 14 | tags: 15 | - v* 16 | 17 | jobs: 18 | build-and-publish: 19 | name: Build TFX Addons PyPI package and release to PyPI and TestPyPI 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python 3.7 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: 3.7 27 | - name: Install pypa/build 28 | run: python -m pip install build --user 29 | - name: Build a binary wheel and a source tarball 30 | run: python -m build --sdist --wheel --outdir dist/ . 31 | - name: Publish distribution TFX Addons package to Test PyPI 32 | uses: pypa/gh-action-pypi-publish@v1.5.0 33 | with: 34 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 35 | repository_url: https://test.pypi.org/legacy/ 36 | skip_existing: true 37 | - name: Publish distribution TFX Addons package to PyPI 38 | if: github.event_name == 'release' 39 | uses: pypa/gh-action-pypi-publish@v1.5.0 40 | with: 41 | password: ${{ secrets.PYPI_API_TOKEN }} 42 | - name: Upload files to a GitHub release 43 | uses: svenstaro/upload-release-action@2.2.1 44 | if: github.event_name == 'release' 45 | with: 46 | repo_token: ${{ secrets.GITHUB_TOKEN }} 47 | file: dist/* 48 | tag: ${{ github.ref }} 49 | overwrite: true 50 | file_glob: true 51 | -------------------------------------------------------------------------------- /.github/workflows/update_main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Internal script to perform a minor release""" 16 | import logging 17 | import os 18 | import sys 19 | 20 | logging.getLogger().setLevel(logging.INFO) 21 | # Dynamically load root as module so that we can import version 22 | BASE_DIR = os.path.dirname( 23 | os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 24 | sys.path.append(BASE_DIR) 25 | 26 | import tfx_addons as tfxa # pylint: disable=wrong-import-position 27 | 28 | current_version = tfxa.__version__ 29 | major, minor, patch = current_version.split(".") 30 | 31 | with open(os.path.join(BASE_DIR, "tfx_addons", "version.py")) as f: 32 | lines = f.readlines() 33 | 34 | with open(os.path.join(BASE_DIR, "tfx_addons", "version.py"), "w") as f: 35 | for l in lines: 36 | if l.startswith("_MINOR_VERSION"): 37 | next_minor = int(minor) + 1 38 | f.write(f'_MINOR_VERSION = "{next_minor}"\n') 39 | else: 40 | f.write(l) 41 | 42 | print(".".join([major, str(next_minor)])) 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # LINT.IfChange 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | **/*.py[cod] 5 | **/*$py.class 6 | 7 | # Mac folder attributes 8 | **/.DS_Store 9 | 10 | # C extensions 11 | **/*.so 12 | 13 | # Unit test 14 | .pytest_cache/ 15 | 16 | # Distribution / packaging 17 | .Python 18 | # build/ # build/ contains required files for building tfx packages. 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # Virtual environments 38 | .venv/* 39 | env/* 40 | **/env 41 | **/venv 42 | 43 | # pyenv 44 | .python-version 45 | 46 | # Editor 47 | .idea/* 48 | .vscode/* 49 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | .github/ci/.pre-commit-config.yaml -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | .github/ci/.pylintrc -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | .github/ci/.style.yapf -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Global owners for SIG TFX-Addons 2 | * @rcrowe-google @theadactyl 3 | 4 | # SIG operations: proposals and contributions guidelines 5 | /proposals/ @rcrowe-google 6 | 7 | # SIG core members 8 | /CONTRIBUTING.md @rcrowe-google @casassg @hanneshapke @codesue @deutranium @wihanbooyse @BACtaki 9 | /README.md @casassg @hanneshapke @codesue @deutranium @wihanbooyse @BACtaki 10 | 11 | # PyPi distribution files 12 | /tfx_addons/__init__.py @casassg @hanneshapke @codesue @deutranium @wihanbooyse @BACtaki 13 | /tfx_addons/version.py @casassg @hanneshapke @codesue @deutranium @wihanbooyse @BACtaki 14 | /setup.py @casassg @hanneshapke @codesue @deutranium @wihanbooyse @BACtaki 15 | /pyproject.toml @casassg @hanneshapke @codesue @deutranium 
@wihanbooyse @BACtaki 16 | 17 | # CI/CD configuration (Release team) 18 | /.github/workflows/ @casassg @hanneshapke 19 | /.github/ci @casassg @hanneshapke 20 | /RELEASE.md @casassg @hanneshapke 21 | 22 | # Sci-Kit Learn Example using the Penguins dataset 23 | /examples/sklearn_penguins/ @TheMichaelHu @1025KB 24 | 25 | # MLMD Client Library 26 | /tfx_addons/mlmd_client/ @codesue @pselden @casassg 27 | 28 | # ExampleFilter Component 29 | /tfx_addons/example_filter/ @rclough 30 | 31 | # Schema Curation Component 32 | /tfx_addons/schema_curation/ @pratishtha-abrol @FatimahAdwan @deutranium @nirzu97 33 | 34 | # XGBoost Evaluator Component 35 | /tfx_addons/xgboost_evaluator @kindalime @cent5 @casassg 36 | /examples/xgboost_penguins @kindalime @cent5 @casassg 37 | 38 | # Sampling Component 39 | /tfx_addons/sampling @kindalime @cent5 @casassg 40 | 41 | # Feast ExampleGen Component 42 | /tfx_addons/feast_examplegen @BACtaki @casassg @wihanbooyse 43 | /examples/fraud_feast @BACtaki @casassg @wihanbooyse 44 | 45 | # Feature Selection Component 46 | /tfx_addons/feature_selection @nirzu97 @pratishtha-abrol @FatimahAdwan @deutranium 47 | 48 | # Firebase Publisher 49 | /tfx_addons/firebase_publisher @deep-diver @sayakpaul 50 | 51 | # HuggingFace Pusher 52 | /tfx_addons/huggingface_pusher @deep-diver @sayakpaul 53 | 54 | # Message Exit Handler 55 | /tfx_addons/message_exit_handler @hanneshapke 56 | /tfx_addons/utils @hanneshapke 57 | 58 | # Predictions to Bigquery Component 59 | /tfx_addons/predictions_to_bigquery @hanneshapke @cfezequiel 60 | 61 | # PandasTransform Component 62 | /tfx_addons/pandas_transform @rcrowe-google 63 | 64 | # Model Card Generator Component 65 | /tfx_addons/model_card_generator @codesue @hanneshapke 66 | /examples/model_card_generator @codesue @hanneshapke 67 | 68 | # Apache Airflow Orchestrator 69 | /tfx_addons/apache_airflow @lego0901 70 | 71 | # CopyExampleGen Component 72 | /tfx_addons/copy_example_gen @alxndrnh 73 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # SIG Addons Releases 2 | 3 | TFX Addons follows the [Semantic Versioning 2.0](https://semver.org/) strategy. 4 | 5 | * See the [Release Notes](https://github.com/tensorflow/tfx-addons/releases) for current and past releases. 6 | 7 | ## Minor automatic release from main 8 | 9 | 1. Trigger [Create Minor Release](https://github.com/tensorflow/tfx-addons/actions/workflows/minor_release.yml) workflow and ensure it runs to completion. 10 | 2. Find the created [draft release](https://github.com/tensorflow/tfx-addons/releases). 11 | * Add updates for new features, enhancements, bug fixes 12 | * Add contributors using `git shortlog ..HEAD -s` 13 | 3. Publish release. 14 | * Check PyPI to ensure release candidate has been released. 15 | * Send email to mailing list for vote. 16 | 4. Find the minor version PR created above and merge it. 17 | 18 | 19 | ## Major/Minor releases 20 | 21 | 1. Create new `rX.Y` branch on https://github.com/tensorflow/tfx-addons from `main`. 22 | 2. Update `version.py` in `rX.Y` branch. 23 | * Set the correct version and suffix in [version.py](https://github.com/tensorflow/tfx-addons/blob/main/tfx_addons/version.py). 24 | * Ensure the proper minimum and maximum tested versions of TFX are set in [version.py](https://github.com/tensorflow/tfx-addons/blob/main/tfx_addons/version.py).
25 | * Ensure proper supported python libraries are set in [version.py](https://github.com/tensorflow/tfx-addons/blob/main/tfx_addons/version.py). 26 | 3. Create a [new release](https://github.com/tensorflow/tfx-addons/releases) from `rX.Y` branch. Create a tag with `vX.Y.Z` name. 27 | * Add updates for new features, enhancements, bug fixes 28 | * Add contributors using `git shortlog ..HEAD -s` 29 | 4. Create a new PR and merge an increase of `_MINOR_VERSION` number in `main` to get ready for next release. 30 | 31 | ## Patch releases 32 | 1. Cherry-pick commits to `rX.Y` branch. Release team can just port PR by commenting "/cherry-pick rX.Y" in a merged PR. 33 | 2. Create new PR with increasing `_PATCH_VERSION` in `version.py` against `rX.Y` branch. 34 | * Set the correct version and suffix in [version.py](https://github.com/tensorflow/tfx-addons/blob/main/tfx_addons/version.py). 35 | * Ensure the proper minimum and maximum tested versions of TFX are set in [version.py](https://github.com/tensorflow/tfx-addons/blob/main/tfx_addons/version.py). 36 | * Ensure proper supported python libraries are set in [version.py](https://github.com/tensorflow/tfx-addons/blob/main/tfx_addons/version.py). 37 | 3. Create a [new release](https://github.com/tensorflow/tfx-addons/releases) from `rX.Y` branch. Create a tag with `vX.Y.Z` name. 38 | * Add updates for new features, enhancements, bug fixes 39 | * Add contributors using `git shortlog ..HEAD -s` 40 | 41 | 42 | 43 | ## SIG Addons Release Team 44 | 45 | Current Release Team: 46 | 47 | - Hannes Hapke - @hanneshapke 48 | - Gerard Casas Saez - @casassg 49 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This directory contains projects which provide examples for different 4 | use-cases or design approaches using TFX. -------------------------------------------------------------------------------- /examples/example_filter/data/test_data.csv: -------------------------------------------------------------------------------- 1 | label,col1 2 | ,2 3 | ,2 4 | ,2 5 | ,2 6 | ,2 7 | ,2 8 | ,2 9 | ,2 10 | ,2 11 | ,2 12 | 1,1 13 | 1,1 14 | 1,1 15 | 1,1 16 | 1,1 17 | 1,1 18 | 1,1 19 | 1,1 20 | 1,1 21 | 1,1 22 | 1,1 23 | 1,1 24 | 1,1 25 | 1,1 26 | 1,1 27 | 1,1 28 | 1,1 29 | 1,1 30 | 1,1 31 | 1,1 32 | 1,1 33 | 1,1 34 | 0,0 35 | 0,0 36 | 0,0 37 | 0,0 38 | 0,0 39 | 0,0 40 | 0,0 41 | 0,0 42 | 0,0 43 | 0,0 44 | 0,0 45 | 0,0 46 | 0,0 47 | 0,0 48 | 0,0 49 | 0,0 50 | 0,0 51 | 0,0 -------------------------------------------------------------------------------- /examples/example_filter/filter_function.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Example filter function that keeps only the examples whose label equals 0.""" 16 | 17 | 18 | def filter_function(x_list): 19 | """Keeps only the examples whose 'label' feature equals [0]. 20 | 21 | Args: 22 | x_list: List of examples (feature dictionaries) to filter. 23 | 24 | 25 | Returns: 26 | The filtered list of examples. 27 | 28 | """ 29 | new_list = [] 30 | for element in x_list: 31 | if element['label'] == [0]: 32 | new_list.append(element) 33 | return new_list 34 | -------------------------------------------------------------------------------- /examples/fraud_feast/README.md: -------------------------------------------------------------------------------- 1 | # Fraud Feast Example 2 | 3 | Expanded the [Feast Fraud tutorial](https://github.com/feast-dev/feast-fraud-tutorial/blob/4acf205dfbb3615d2f3e913adf1c28c5f2655f4c/notebooks/Fraud_Detection_Tutorial.ipynb) to use the TFX-Addons [FeastExampleGen](/tfx_addons/feast_examplegen/README.md) component. 4 | 5 | ## Instructions 6 | 7 | Clone the tfx-addons repo and navigate to the fraud_feast directory. 8 | 9 | ```
10 | git clone https://github.com/tensorflow/tfx-addons.git
11 | cd tfx-addons/examples/fraud_feast
12 | ```
13 | 14 | Next, create a Python virtual environment for this example, activate the 15 | environment, and install dependencies. Make sure you are using a version of 16 | python supported by TFX. 17 | 18 | ```
19 | python -m venv venv
20 | source ./venv/bin/activate
21 | pip install -r requirements.txt
22 | ```
23 | 24 | ### Local Example 25 | Initialize the Feast repository and run the local pipeline: 26 | 27 | ```
28 | cd repo && feast apply && cd ..
29 | python feast_pipeline_local.py
30 | ```
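
For a quick sanity check that the Feast repository is wired up correctly before running the TFX pipeline, you can also query it directly with the Feast SDK. The snippet below is an illustrative sketch only: the entity values and feature references are assumptions (the real columns live in the public BigQuery tables referenced by `repo/driver_repo.py`), and running it requires GCP credentials with access to that data.

```python
from datetime import datetime

import pandas as pd
from feast import FeatureStore

# Point the store at the repo/ directory in which `feast apply` was run.
store = FeatureStore(repo_path="repo")

# Hypothetical entity rows; real user_id values come from the tutorial's
# BigQuery tables.
entity_df = pd.DataFrame({
    "user_id": ["some-user-id"],
    "event_timestamp": [datetime(2021, 7, 1)],
})

# Feature references use the "<feature_view>:<feature>" form; the feature name
# below is a placeholder for whatever columns the BigQuery source exposes.
training_df = store.get_historical_features(
    entity_df, ["user_account_features:credit_score"]).to_df()
print(training_df.head())
```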
31 | -------------------------------------------------------------------------------- /examples/fraud_feast/repo/.gitignore: -------------------------------------------------------------------------------- 1 | data/*.db -------------------------------------------------------------------------------- /examples/fraud_feast/repo/driver_repo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Demo repository for credit card transations fraud dataset 16 | """ 17 | from datetime import timedelta 18 | 19 | from feast import BigQuerySource, Entity, FeatureView, ValueType 20 | 21 | # Add an entity for users 22 | user_entity = Entity( 23 | name="user_id", 24 | description= 25 | "A user that has executed a transaction or received a transaction", 26 | value_type=ValueType.STRING) 27 | 28 | # Add two FeatureViews based on existing tables in BigQuery 29 | user_account_fv = FeatureView( 30 | name="user_account_features", 31 | entities=["user_id"], 32 | ttl=timedelta(weeks=52), 33 | batch_source=BigQuerySource( 34 | table_ref="feast-oss.fraud_tutorial.user_account_features", 35 | event_timestamp_column="feature_timestamp")) 36 | 37 | user_has_fraudulent_transactions_fv = FeatureView( 38 | name="user_has_fraudulent_transactions", 39 | entities=["user_id"], 40 | ttl=timedelta(weeks=52), 41 | batch_source=BigQuerySource( 42 | table_ref="feast-oss.fraud_tutorial.user_has_fraudulent_transactions", 43 | event_timestamp_column="feature_timestamp")) 44 | -------------------------------------------------------------------------------- /examples/fraud_feast/repo/feature_store.yaml: -------------------------------------------------------------------------------- 1 | project: fraud_tutorial 2 | registry: ./data/registry.db 3 | provider: gcp 4 | online_store: 5 | type: sqlite -------------------------------------------------------------------------------- /examples/fraud_feast/requirements.txt: -------------------------------------------------------------------------------- 1 | ../..[feast_examplegen] -------------------------------------------------------------------------------- /examples/model_card_generator/.gitignore: -------------------------------------------------------------------------------- 1 | # unnecessary project files 2 | census_income_constants.py 3 | census_income_trainer.py 4 | census_income_transform.py 5 | -------------------------------------------------------------------------------- /examples/pandas_transform/README.md: -------------------------------------------------------------------------------- 1 | # PandasTransform 2 | ## TL;DR 3 | PandasTransform is a TFX component which can be used instead of the standard Transform component, and allows you to work with Pandas dataframes for your feature engineering. 
Processing is distributed using Beam for scalability. Operations which require a full pass over the dataset are not currently supported. Statistics such as the standard deviation, which are required for operations such as z-score normalization, are supplied using the statistics which are captured by StatisticsGen. 4 | 5 | ## This Example 6 | This example notebook shows how to use the PandasTransform component in a TFX pipeline. Notice in particular the way that StatisticsGen is used to create statistics for both the raw dataset and the transformed dataset. 7 | 8 | Note that although this example does use a TensorFlow model, since PandasTransform does not create a Transform graph the feature engineering which is done in PandasTransform will need to be applied separately during serving. 9 | 10 | ## Project Team 11 | Robert Crowe (rcrowe-google) robertcrowe--at--google--dot--com 12 | -------------------------------------------------------------------------------- /examples/pandas_transform/requirements.txt: -------------------------------------------------------------------------------- 1 | ../..[pandas_transform] 2 | -------------------------------------------------------------------------------- /examples/sklearn_penguins/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled 2 | __pycache__/ 3 | 4 | # Unit test / coverage reports 5 | htmlcov/ 6 | .tox/ 7 | .nox/ 8 | .coverage 9 | .coverage.* 10 | .cache 11 | nosetests.xml 12 | coverage.xml 13 | *.cover 14 | *.py,cover 15 | .hypothesis/ 16 | .pytest_cache/ 17 | cover/ 18 | 19 | # Jupyter Notebook 20 | .ipynb_checkpoints 21 | 22 | # Environments 23 | venv/ 24 | 25 | # Compiled KFP pipelines. 26 | *.tar.gz 27 | 28 | # TFX artifacts 29 | serving_model/ 30 | -------------------------------------------------------------------------------- /examples/sklearn_penguins/README.md: -------------------------------------------------------------------------------- 1 |
2 | **ARCHIVED**
3 | 4 | This example is archived because there is no currently active owner. It is also basically a duplicate of the [existing example in the main TFX repo](https://github.com/tensorflow/tfx/tree/master/tfx/examples/penguin/experimental), which is maintained by the TFX team. 5 | 6 | This example could be expanded to include a broader use of Scikit-Learn. If anyone is interested in expanding or working on this, you can check out the code in [v0.3.0 release](https://github.com/tensorflow/tfx-addons/tree/v0.3.0/examples/sklearn_penguins). 7 | 8 | Please contact the TFX-Addons maintainers to request ownership. 9 | 10 |
11 | 12 | # Penguin Classification Scikit-learn Example 13 | 14 | Expanded the [TFX penguin example 15 | pipeline](https://github.com/tensorflow/tfx/tree/master/tfx/examples/penguin) 16 | with instructions for using [scikit-learn](https://scikit-learn.org/stable/) 17 | to build and train the model. 18 | 19 | ## Instructions 20 | 21 | Clone the tfx-addons repo and navigate to the sklearn_penguins directory. 22 | 23 | ```
24 | git clone https://github.com/tensorflow/tfx-addons.git
25 | cd tfx-addons/examples/sklearn_penguins
26 | ```
27 | 28 | Next, create a Python virtual environment for this example, activate the 29 | environment, and install dependencies. Make sure you are using a version of 30 | python supported by TFX. 31 | 32 | ```
33 | python -m venv venv
34 | source ./venv/bin/activate
35 | pip install -r requirements.txt
36 | ```
37 | 38 | ### Local Example 39 | Execute the pipeline python file. Output can be found at `~/tfx`: 40 | 41 | ```
42 | python penguin_pipeline_sklearn_local.py
43 | ```
44 | 45 | ### GCP Example 46 | This example uses a custom container image instead of the default TFX ones found 47 | [here](gcr.io/tfx-oss-public/tfx). This custom container ensures the proper 48 | version of scikit-learn is installed. Run the following commands to build this 49 | image and upload it to Google Container Registry (GCR). 50 | 51 | ```
52 | cd ~/penguin/experimental
53 | gcloud auth configure-docker
54 | docker build \
55 |   --tag gcr.io/[PROJECT-ID]/tfx-example-sklearn \
56 |   --build-arg TFX_VERSION=$(python -c 'import tfx; print(tfx.__version__)') \
57 |   .
58 | docker push gcr.io/[PROJECT-ID]/tfx-example-sklearn
59 | ```
60 | 61 | Note that the custom container extends an official TFX container image based on 62 | the local TFX version. If an unreleased version of TFX is being used 63 | (e.g. installing from HEAD), `Dockerfile` may need to be modified to install the 64 | unreleased version. 65 | 66 | Set the project id and bucket in `penguin_pipeline_sklearn_gcp.py`. Then, run 67 | the following commands to copy the `~/penguin` directory to GCS and execute the 68 | pipeline python file. Output can be found at `[BUCKET]/tfx`. 69 | 70 | ```
71 | vi penguin_pipeline_sklearn_gcp.py
72 | gsutil -m cp -r ~/penguin/data/* gs://[BUCKET]/penguin/data/
73 | gsutil -m cp ~/penguin/experimental/\*.py gs://[BUCKET]/penguin/experimental/
74 | 
75 | tfx pipeline create \
76 |   --engine kubeflow \
77 |   --pipeline-path penguin_pipeline_sklearn_gcp.py \
78 |   --endpoint [MY-GCP-ENDPOINT.PIPELINES.GOOGLEUSERCONTENT.COM]
79 | ```
80 | 81 | Note that 82 | `gsutil -m cp ~/penguin/experimental/*.py gs://[BUCKET]/penguin/experimental` 83 | will need to be run every time updates are made to the GCP example. 84 | Additionally, subsequent pipeline deployments should use `tfx pipeline update` 85 | instead of `tfx pipeline create`. 86 | -------------------------------------------------------------------------------- /examples/xgboost_penguins/README.md: -------------------------------------------------------------------------------- 1 | # Penguin Classification XGBoost Example 2 | 3 | Expanded the [TFX penguin example 4 | pipeline](https://github.com/tensorflow/tfx/tree/master/tfx/examples/penguin) 5 | to use [xgboost](https://xgboost.readthedocs.io/en/latest/) 6 | to build and train the model. 7 | 8 | Also see [XGBoost Evaluator](/tfx_addons/xgboost_evaluator/README.md) for more 9 | context on how the trained model can be evaluated. 10 | 11 | ## Local Example 12 | Execute the pipeline python file. Output can be found at `~/tfx`: 13 | 14 | ``` 15 | python examples/xgboost_penguins/penguin_pipeline_local.py 16 | ``` 17 | 18 | ## Run e2e test 19 | 20 | ``` 21 | pip install -e ".[all,test]" 22 | pytest examples/xgboost_penguins 23 | ``` -------------------------------------------------------------------------------- /examples/xgboost_penguins/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /examples/xgboost_penguins/penguin_pipeline_local_e2e_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """E2E Tests for penguin_pipeline_xgboost_local.""" 16 | 17 | import os 18 | from typing import Text 19 | 20 | import tensorflow as tf 21 | from tfx import v1 as tfx 22 | from tfx.orchestration import metadata 23 | 24 | from .
import penguin_pipeline_local 25 | 26 | 27 | class PenguinPipelineLocalEndToEndTest(tf.test.TestCase): 28 | def setUp(self): 29 | super().setUp() 30 | self._test_dir = os.path.join( 31 | os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()), 32 | self._testMethodName) 33 | self._penguin_root = os.path.dirname(__file__) 34 | 35 | self._pipeline_name = 'xgboost_test' 36 | self._data_root = os.path.join(self._penguin_root, 'data') 37 | self._module_file = os.path.join(self._penguin_root, 'utils.py') 38 | self._serving_model_dir = os.path.join(self._test_dir, 'serving_model') 39 | self._pipeline_root = os.path.join(self._test_dir, 'tfx', 'pipelines', 40 | self._pipeline_name) 41 | self._metadata_path = os.path.join(self._test_dir, 'tfx', 'metadata', 42 | self._pipeline_name, 'metadata.db') 43 | 44 | def assertExecutedOnce(self, component: Text) -> None: 45 | """Check the component is executed exactly once.""" 46 | component_path = os.path.join(self._pipeline_root, component) 47 | self.assertTrue(tfx.dsl.io.fileio.exists(component_path)) 48 | execution_path = os.path.join(component_path, '.system', 49 | 'executor_execution') 50 | execution = tfx.dsl.io.fileio.listdir(execution_path) 51 | self.assertLen(execution, 1) 52 | 53 | def assertPipelineExecution(self) -> None: 54 | self.assertExecutedOnce('CsvExampleGen') 55 | self.assertExecutedOnce('ExampleValidator') 56 | self.assertExecutedOnce('SchemaGen') 57 | self.assertExecutedOnce('StatisticsGen') 58 | self.assertExecutedOnce('Trainer') 59 | 60 | def testPenguinPipelineLocal(self): 61 | tfx.orchestration.LocalDagRunner().run( 62 | penguin_pipeline_local.create_pipeline( 63 | pipeline_name=self._pipeline_name, 64 | pipeline_root=self._pipeline_root, 65 | data_root=self._data_root, 66 | module_file=self._module_file, 67 | metadata_path=self._metadata_path, 68 | beam_pipeline_args=[])) 69 | 70 | self.assertTrue(tfx.dsl.io.fileio.exists(self._metadata_path)) 71 | expected_execution_count = 6 72 | metadata_config = ( 73 | tfx.orchestration.metadata.sqlite_metadata_connection_config( 74 | self._metadata_path)) 75 | with metadata.Metadata(metadata_config) as m: 76 | artifact_count = len(m.store.get_artifacts()) 77 | execution_count = len(m.store.get_executions()) 78 | self.assertGreaterEqual(artifact_count, execution_count) 79 | self.assertEqual(expected_execution_count, execution_count) 80 | 81 | self.assertPipelineExecution() 82 | 83 | 84 | if __name__ == '__main__': 85 | tf.compat.v1.enable_v2_behavior() 86 | tf.test.main() 87 | -------------------------------------------------------------------------------- /examples/xgboost_penguins/requirements.txt: -------------------------------------------------------------------------------- 1 | ../..[xgboost_evaluator] 2 | xgboost>=1.0.0 -------------------------------------------------------------------------------- /proposals/20210404-sklearn_example.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | 3 | # Project Proposal 4 | 5 | **Your name:** Michael Hu 6 | 7 | **Your email:** humichael@google.com 8 | 9 | **Your company/organization:** Google 10 | 11 | **Project name:** Scikit-learn Penguin Classification 12 | 13 | ## Project Description 14 | Demonstrates training a scikit-learn MLPClassifier model in a TFX pipeline. The pipeline can either run locally or on GCP using CAIP, Dataflow, and Kubeflow Pipelines. 
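
To make the idea concrete, here is a minimal, hypothetical sketch of the core training step described in this proposal: features parsed from Example protos are converted to NumPy arrays, an `MLPClassifier` is trained, and the model is written out as a pickle so that a custom Evaluator module and CAIP serving can load it. This is not the example's actual trainer module; names and hyperparameters are illustrative.

```python
import pickle

import numpy as np
from sklearn.neural_network import MLPClassifier


def train_and_export(features: np.ndarray, labels: np.ndarray,
                     model_path: str) -> None:
  """Trains a small scikit-learn MLP and pickles it (illustrative only)."""
  model = MLPClassifier(hidden_layer_sizes=(8, 8), max_iter=500)
  model.fit(features, labels)
  # A pickled estimator is what both the custom Evaluator module and
  # CAIP's scikit-learn serving runtime expect to load.
  with open(model_path, "wb") as f:
    pickle.dump(model, f)
```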
15 | 16 | ## Project Category 17 | Example 18 | 19 | ## Project Use-Case(s) 20 | This example can be used to push any scikit-learn model to CAIP with minimal custom code to acquire standard TFX benefits like orchestration, data validation, gated retraining, etc. This example is currently not used within my organization. 21 | 22 | ## Project Implementation 23 | Scikit-learn will be integrated with TFX by using the following approach: 24 | Create a custom trainer module for training the scikit-learn model using example protos. 25 | 26 | Tensors parsed from examples will be converted to Numpy arrays. 27 | The model artifact will be stored as a pickle, which both the custom evaluator module and CAIP serving will be able to load. 28 | 29 | Create a custom evaluator module for making predictions against the model in Evaluator. 30 | 31 | Build a Docker container extending a TFX image for managing the scikit-learn version and dependencies when training on CAIP. This container will be hosted in the user's Google Container Registry on GCP. 32 | 33 | CAIP supports serving scikit-learn models out of the box. 34 | 35 | The project will not be packaged. Instead, users just need to clone the source code to run the example. 36 | 37 | ## Project Dependencies 38 | 'scikit-learn>=0.23,<0.24' 39 | kfp 40 | 41 | ## Project Team 42 | Michael Hu, humichael@google.com 43 | 44 | Jiayi Zhao, jyzhao@google.com 45 | 46 | -------------------------------------------------------------------------------- /proposals/20210507-mlmd_client_lib.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal 3 | 4 | **Your name:** Gerard Casas Saez 5 | 6 | **Your email:** gerard@twitter.com 7 | 8 | **Your company/organization:** Twitter 9 | 10 | **Project name:** TFX MLMD Client Library 11 | 12 | ## Project Description 13 | 14 | Client library to inspect content in ML Metadata populated by TFX pipelines. Library will be written in Python and distributed through PyPi. 15 | Given metadata connection information, it should provide easy to use methods to introspect the Metadata DB. 16 | 17 | Idea from [#12](https://github.com/tensorflow/tfx-addons/issues/12) 18 | 19 | ## Project Category 20 | 21 | Client Library 22 | 23 | ## Project Use-Case(s) 24 | 25 | 26 | ML Metadata contains all the metadata for TFX pipelines (pipeline state, component execution, artifact lineage...). 27 | However currently to query pipeline information you need to write custom code every time, as there is no common library that provides an abstraction layer on top 28 | of the raw ML Metadata library. 29 | 30 | Several libraries have implemented their own implementation of this library as seen in [ModelCards](https://github.com/tensorflow/model-card-toolkit/blob/master/model_card_toolkit/utils/tfx_util.py), [NitroML](https://github.com/google/nitroml/tree/master/nitroml/analytics) 31 | or [Airflow example](https://github.com/tensorflow/tfx/blob/master/tfx/examples/airflow_workshop/notebooks/tfx_utils.py) in TFX repository. 32 | 33 | Twitter already has a small implementation of this library used to track pipeline state from interactive environments. 34 | 35 | Project will need close collaboration with TFX team to stabilize the context types ids used by TFX to track its jobs in ML Metadata. 36 | 37 | ## Project Implementation 38 | 39 | _Distribution:_ 40 | - Python library `tfx-addons-metadata-client` released to PyPi. 
(potentially `tfx-addons` if we want to include more projects in the future). 41 | - Automatic release and packaging using GitHub Actions. Versioning will depend on TFX stability for MLMD types. 42 | - Folder: `tfx/addons/metadata-client` (we will likely also need to create some .github files for automatic testing and automatic release). 43 | 44 | _Project implementation:_ 45 | 46 | - Python client library for ML Metadata, using ML Metadata Python SDK to query the database. 47 | - Main skeleton will be 3 model classes for Pipeline, PipelineRun and ComponentRun to introspect their status. 48 | - Artifact class methods to obtain artifacts generated by each ComponentRun, PipelineRun and ComponentRun (with optional filter by ArtifactType). 49 | - Lineage tracking for Artifact class: Obtain all artifacts that helped generate this Artifact, and check all downstream Artifacts generated by current artifact. 50 | 51 | To be heavily based on the existing libraries by NitroML, ModelCard (see above) and [tensorflow/tfx#2415](https://github.com/tensorflow/tfx/pull/2415). 52 | 53 | 54 | ## Project Dependencies 55 | `ml-metadata>=0.26` - Used to query the database. 56 | `ml-pipelines-sdk>=0.26` - This will be needed to pull the type names used by TFX on ML Metadata. 57 | 58 | ## Project Team 59 | Suzen Fylke, sue@twitter.com 60 | Vincent Nguyen, [[To be filled]] 61 | Paul Selden, paul.selden@openx.com 62 | [[TFX team member TBD]] 63 | -------------------------------------------------------------------------------- /proposals/20210525-examplefilter.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal 3 | 4 | **Your name:** Ryan Clough 5 | 6 | **Your email:** rclough@spotify.com 7 | 8 | **Your company/organization:** Spotify 9 | 10 | **Project name:** Example Filter 11 | 12 | ## Project Description 13 | Beam based component that can filter Examples based on a user-defined predicate function. 14 | 15 | ## Project Category 16 | Choose 1: Component 17 | 18 | ## Project Use-Case(s) 19 | Data can be imported into TFX in a number of ways, and indeed, sometimes the dataset you wish to load is not under your direct 20 | control. In cases like these, it is useful to have a component that can filter your input data with simple rules. Ex: filter 21 | all records where `feature_a >= 1`. 22 | 23 | Our organization currently has a component for this purpose that is in active use. It is not as robust as it could be. 24 | 25 | It is also worth conidering that we may wish to try and promote this functionality to be included in the TFX core base ExampleGen, 26 | so that the filtering could be done within any ExampleGen based component. 27 | 28 | ## Project Implementation 29 | Spotify can provide the current implementation, which is based off of an old version of Tensorflow Transform. At a high level, use 30 | of the component looks like: 31 | 32 | ```python 33 | def predicate_fn(example) 34 | # Throw out Examples that used a credit card 35 | if b'Credit Card' in example['payment_type']: 36 | return False 37 | return True 38 | ... 39 | 40 | filtered_examples = ExampleFilter( 41 | examples=examples.output, 42 | schema=schema.output, 43 | module_file=filter_module, 44 | ) 45 | ``` 46 | 47 | ## Packaging 48 | 49 | Given that it's a Beam component, I think it will have to be a fully custom component. 50 | 51 | In terms of packaging and providing, we can provide the code, and a sample docker file and example pipeline for the component. 
52 | 53 | ## Future Considerations 54 | 55 | For the purposes of this proposal, the `ExampleFilter` component will be submitted as-is, as to not let "perfect" become the 56 | enemy of "good enough". There are a number of potential improvements that could be made to the component, but working 57 | through them should be a separate process from this initial proposal to get a working MVP. 58 | 59 | The current implementation is a bit dated and not so robust. It depends on a deprecated TFT proto coder, and only works on 60 | TF Records, as it does not make use of TFXIO. As part of bringing this to TFX-addons, I think it is worth iterating on the 61 | current design. Some initial ideas for change might be: 62 | 63 | * Implementing it more flexibly in TFXIO 64 | * Determine if there's a way to implement it without requiring a schema 65 | * Making the predicate_fn operate on true data types rather than bytes (see example above) 66 | * Adding an input that allows the user to specify splits (currently applies to all splits) 67 | 68 | ## Project Dependencies 69 | Current implementation uses a [proto decoder](https://github.com/tensorflow/transform/blob/v0.24.1/tensorflow_transform/coders/example_proto_coder.py#L329-L339) 70 | deprecated from TFX 0.25 onwards. Otherwise the project uses standard TFX dependencies. 71 | 72 | ## Project Team 73 | * Ryan Clough, rclough@spotify.com, @rclough 74 | * TBD 75 | -------------------------------------------------------------------------------- /proposals/20210605-schema_curation_custom_component.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal 3 | 4 | **Your name:** Pratishtha Abrol 5 | 6 | **Your email:** pratishthaabrol@gmail.com 7 | 8 | **Your company/organization:** Outreachy 9 | 10 | **Project name:** [Schema curation custom component](https://github.com/tensorflow/tfx-addons/issues/8) 11 | 12 | ## Project Description 13 | This project applies Python user code from a user-supplied module file to a schema produced by SchemaGen, to curate the schema based on domain knowledge. 14 | 15 | ## Project Category 16 | Component 17 | 18 | ## Project Use-Case(s) 19 | This project will allow the user to add a custom component that modifies the schema generated by SchemaGen component according to user knowledge, for example, fixing domain limits that were inferred wrongly by the SchemaGen component. 20 | 21 | ## Project Implementation 22 | Implementation of the Schema Curation Custom Component can be done using the following approach: 23 | - Get the base Schema using SchemaGen component of TFX 24 | - User supplies a module file with a fully-custom component that defines the additions/changes to the initially generated schema through SchemaGen. 25 | - And execution script would run on the module file, which sets and modifies variables accordingly. 26 | - The base schema gets modified according to the module file and used further along the pipeline 27 | 28 | ## Project Dependencies 29 | The implementation will use the [TFDV library](https://www.tensorflow.org/tfx/data_validation/api_docs/python/tfdv) for validation and modification of schema objects according to the module file provided by the user. 
The following two methods would be of special focus: 30 | - [tfdv.set_domain](https://www.tensorflow.org/tfx/data_validation/api_docs/python/tfdv/set_domain) 31 | - [tfdv.write_schema_text](https://www.tensorflow.org/tfx/data_validation/api_docs/python/tfdv/write_schema_text) 32 | 33 | A similar implementation can be seen in the [Transform library](https://github.com/tensorflow/transform). Particularly, the [schema_utils](https://github.com/tensorflow/transform/blob/master/tensorflow_transform/tf_metadata/schema_utils.py) method could come in useful. 34 | 35 | ## Project Team 36 | **Project Leader** : Pratishtha Abrol, pratishtha-abrol, pratishthaabrol@gmail.com 37 | 1. Fatimah Adwan, FatimahAdwan, akilahafaf72@gmail.com 38 | 2. Kshitijaa Jaglan, deutranium, jaglan.kshitijaa2@gmail.com 39 | 3. Nirzari Gupta, nirzu97, nirzu97@gmail.com 40 | -------------------------------------------------------------------------------- /proposals/20210721-sampling_component.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | 3 | # Project Proposal 4 | 5 | ------ 6 | 7 | **Your name:** Daniel Kim 8 | 9 | **Your email:** danielk@twitter.com 10 | 11 | **Your company/organization:** Twitter 12 | 13 | **Project name:** Sampling Component 14 | 15 | ## Project Description 16 | 17 | This project will be a fully custom component that inputs an artifact in `tfRecord` format of `tf.Example`s and randomly undersamples or randomly oversamples it, reducing the data to the lowest- or highest-frequency class. It will primarily use an underlying Apache Beam pipeline that will be wrapped inside the TensorFlow component. 18 | 19 | ## Project Category 20 | 21 | Component 22 | 23 | ## Project Use-Case(s) 24 | 25 | As this project represents a very general operation used widely in machine learning data processing, we anticipate that it will have wide-ranging use cases, the most evident being in cases where dependent variable classes have wildly different relative frequencies and under/oversampling is needed to help effectively train a classifier. The potential impact will likely be large due to this, and our organization will likely utilize this project in the future. 26 | 27 | ## Project Solutions 28 | 29 | We considered multiple possible solutions and implementations for this project before deciding on an Apache Beam-based pipeline, including standard Python code and the utilization of a BigQuery query in order to perform the random under/oversampling task. Using a pure Python-based algorithm with `multiprocessing` will likely be inefficient for the purposes of a parallelizable computation such as this one, and utilizing solutions such as Dask would introduce unnecessary dependencies into our project. 30 | 31 | BigQuery is also a very good option, and a great fallback in case Apache Beam turns out to be infeasible for this project, but Apache Beam has better Python integration through custom `DoFn`s that may help us with our implementation of other algorithms later on. In this case, we would load the data in and out of a BigQuery table and perform our operations within this table. The component would then either utilize a schema generated from `SchemaGen` or infer one on its own, potentially adding an unneeded dependency into the component or performing unnecessary inference.
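To make the Beam option concrete, the core random-undersampling step could be prototyped with a handful of Beam transforms, as in the toy sketch below; it is deliberately simpler than the component described in the next section, which operates on serialized `tf.Example`s rather than `(label, value)` tuples.

```python
import random

import apache_beam as beam


def _undersample(keyed_records, target_count):
  """Randomly keeps at most `target_count` records for a single class key."""
  key, records = keyed_records
  records = list(records)
  random.shuffle(records)
  return [(key, record) for record in records[:target_count]]


with beam.Pipeline() as pipeline:
  _ = (pipeline
       | 'CreateToyData' >> beam.Create([('a', 1), ('a', 2), ('a', 3), ('b', 4)])
       | 'GroupByClass' >> beam.GroupByKey()
       | 'Undersample' >> beam.FlatMap(_undersample, target_count=1)
       | 'Print' >> beam.Map(print))
```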
32 | 33 | ## Project Implementation 34 | 35 | At a high level, the plan is to use Apache Beam to ingest a `tfRecord` of `tf.Examples`, shuffle them, convert them into a key-value `PCollection` with keys as class values and values as data points, and then perform the actual under/oversampling. Null values (and values that have key classes that are specified by the user) will not be part of the over/undersampling step; they will be separated and added back into the sampled dataset. The algorithm will be written as an Apache Beam pipeline, which will be wrapped into a TensorFlow custom component (with custom executor and spec) to use with TFX pipelines. The component would be written as inputting a `TFRecord` artifact of `tf.Examples` and exporting a similar `TFRecord` artifact, making its placement in a pipeline nearly ubiquitous. 36 | 37 | Later additions to the project could include the integration through Apache Beam of one or more other, more complex undersampling or ovesampling algorithms. Our likely focus would be SMOTE for oversampling and either ENN or Tomek Links for undersampling. These would likely be implemented as custom Python functions within the Apache Beam pipeline, although the focus for now is currently the initial random sampling component. 38 | 39 | ## Project Dependencies 40 | 41 | tensorflow, TFX, Apache Beam 42 | 43 | ## Project Team 44 | 45 | List the members of the project team. Include their names, Github user IDs, and email addresses. Identify project leaders. 46 | 47 | * Daniel Kim, kindalime, danielk@twitter.com -------------------------------------------------------------------------------- /proposals/20210723-feature_selection_custom_component.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal 3 | 4 | **Your name:** Nirzari Gupta 5 | 6 | **Your email:** nirzu97@gmail.com 7 | 8 | **Your company/organization:** Outreachy 9 | 10 | **Project name:** [Feature selection custom component](https://github.com/tensorflow/tfx-addons/issues/7) 11 | 12 | ## Project Description 13 | This project provides a facility to perform various feature selection algorithms on datasets in TFX pipelines. Additionally, feature scores for selected features will also be generated as a custom artifact. 14 | 15 | ## Project Category 16 | Component 17 | 18 | ## Project Use-Case(s) 19 | This project will allow the user to select different algorithms for performing feature selection on datasets artifacts in TFX pipelines. 20 | 21 | ## Project Implementation 22 | Feature Selection Custom Component will be implemented as Python function-based component. 23 | Implementation of the Feature Selection Custom Component can be done using the following approach: 24 | - Get dataset artifact generated by ExampleGen 25 | - Convert it into the format compatible with Scikit-Learn functions 26 | - Perform univariate feature selection using parameters given by users 27 | - Remove not selected features from the dataset 28 | - Provide feature scores of the selected features as a custom artifact 29 | 30 | ## Project Dependencies 31 | The implementation will use the [Scikit-learn feature selection functions](https://scikit-learn.org/stable/modules/feature_selection.html) 32 | 33 | ## Project Team 34 | **Project Leader** : Nirzari Gupta, nirzu97, nirzu97@gmail.com 35 | 1. Fatimah Adwan, FatimahAdwan, akilahafaf72@gmail.com 36 | 2. Kshitijaa Jaglan, deutranium, jaglan.kshitijaa2@gmail.com 37 | 3. 
Pratishtha Abrol, pratishtha-abrol, pratishthaabrol@gmail.com 38 | -------------------------------------------------------------------------------- /proposals/20210817-firebase_ml_publisher_component.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal 3 | 4 | **Your name:** Chansung Park 5 | 6 | **Your email:** deep.diver.csp@gmail.com 7 | 8 | **Your company/organization:** Individual(ML GDE) 9 | 10 | **Project name:** [Firebase ML Publisher](https://github.com/tensorflow/tfx-addons/issues/59) 11 | 12 | ## Project Description 13 | This project defines a custom TFX component to publish/update ML models to [Firebase ML](https://firebase.google.com/products/ml). This is another type of pusher component, and the input model is assumed to be in TFLite format. 14 | 15 | ## Project Category 16 | Component 17 | 18 | ## Project Use-Case(s) 19 | This project helps users to publish trained models directly to Firebase ML. 20 | 21 | With Firebase ML, we can guarantee that mobile devices can be equipped with the latest ML model without explicitly embedding the binary at the project compilation stage. We can even A/B test different versions of a model with Google Analytics when the model is published on Firebase ML. 22 | 23 | ## Project Implementation 24 | The Firebase ML Publisher component will be implemented as a Python function-based component. You can find the [actual source code](https://github.com/sayakpaul/Dual-Deployments-on-Vertex-AI/blob/main/custom_components/firebase_publisher.py) in my personal project. Please note this is a personal implementation, and it will be enhanced as an official TFX Addons component. 25 | 26 | The implementation details: 27 | - Define a custom Python function-based TFX component. It takes the following parameters from a previous component. 28 | - It should follow the standard Pusher's interface since this is another custom pusher. 29 | - Additionally, it takes meta information to manage the published model for Firebase ML, such as `display name` and `tags`. 30 | - Download the saved TFLite model file by referencing the output from a previous component. 31 | - The Firebase SDK doesn't allow publishing models from GCS directly. 32 | - Initialize Firebase Admin with the credential and the Firebase temporary-use GCS bucket. 33 | - Firebase credentials can be set up via [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) for GKE or the [Mounting Secret API in the TFX runner](https://github.com/tensorflow/tfx/blob/d989bbd7fc366c73ad833428ce6b5cf57a587432/tfx/orchestration/kubeflow/kubeflow_dag_runner.py#L78). 34 | - Search whether any model with the same `display name` has already been published. 35 | - if yes, update the existing Firebase ML model, then publish it 36 | - if no, create a new Firebase ML model, then publish it 37 | - Return `tfx.dsl.components.OutputDict` to indicate whether the job was successful, and whether the job created a new Firebase ML model or updated the existing Firebase ML model. 38 | 39 | ## Project Dependencies 40 | The implementation will use the following libraries. 41 | - [Firebase Admin Python SDK](https://github.com/firebase/firebase-admin-python) >= 5.0.2 42 | - [Python Client for Google Cloud Storage](https://github.com/googleapis/python-storage) >= 1.42.0 43 | 44 | ## Project Team 45 | **Project Leader** : Chansung Park, deep-diver, deep.diver.csp@gmail.com 46 | 1. 
Sayak Paul, sayakpaul, spsayakpaul@gmail.com 47 | -------------------------------------------------------------------------------- /proposals/20220117-exit-handler-slack.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal for Slack Exit Handler for TFX Pipelines 3 | 4 | **Your name:** Hannes Max Hapke 5 | 6 | **Your email:** hannes@digits.com 7 | 8 | **Your company/organization:** Digits Financial, Inc. 9 | 10 | **Project name:** Slack Exit Handler for TFX Pipelines 11 | 12 | ## Project Description 13 | 14 | The component provides an exit handler for TFX pipelines which notifies the user about the final state of the pipeline (failed or succeeded) via a Slack message. If the pipeline failed, the component will provide the error message. 15 | 16 | ## Project Category 17 | 18 | Component 19 | 20 | ## Project Use-Case(s) 21 | 22 | The exit handler notifies Digits' ML team about the final state of a pipeline. Instead of constantly polling the pipeline status via the Vertex CLI, the exit handler notifies us. 23 | 24 | The implementation can be extended to cover other communication services (e.g. SMS via Twilio) too. 25 | 26 | Furthermore, the implementation can be seen as an example implementation for an exit handler. Other users could use the same setup to trigger downstream pipelines or trigger other post-run actions. 27 | 28 | ## Project Implementation 29 | 30 | The existing implementation is Python-based and it uses the `tfx.orchestration.experimental.exit_handler` decorator. 31 | 32 | The component accepts 4 parameters: 33 | * final_status 34 | * slack_token 35 | * slack_channel_id 36 | * on_failure_only 37 | 38 | `final_status` is the JSON string of the pipeline status, provided by TFX. The Slack parameters contain the credentials to submit the message. And `on_failure_only` is a configuration for frequently run pipelines to only alert on failures. We have a number of pipelines where this option was useful. 39 | 40 | The component parses the status, and composes a message based on the content. 41 | 42 | ``` 43 | job_id = status["pipelineJobResourceName"].split("/")[-1] 44 | if status["state"] == "SUCCEEDED": 45 | message = f":tada: Pipeline job *{job_id}* completed successfully.\n" 46 | else: 47 | message = f":scream: Pipeline job *{job_id}* failed." 48 | message += f"\n>{status['error']['message']}" 49 | ``` 50 | 51 | A Slack web client object is then created and the message is submitted via the object. 52 | 53 | Overall, the implementation is minimal, but it serves as a great exit handler example. 54 | 55 | ### Current Digits Implementation 56 | 57 | #### Pipeline Success Message 58 | ![Screen_Shot_2022-01-05_at_3_23_43_PM_2](https://user-images.githubusercontent.com/1234819/148304418-9232fe68-57a3-4976-bd01-8d3e14bbf00b.png) 59 | 60 | #### Pipeline Failure Message 61 | ![_Screen_Shot_2022-01-05_at_2_45_47_PM](https://user-images.githubusercontent.com/1234819/148301546-b8ae19e3-ff71-4ec6-9969-06e71672b2e2.png) 62 | 63 | #### Visualization in Google Cloud Vertex Pipelines 64 | ![Screen_Shot_2022-01-05_at_3_28_06_PM_2](https://user-images.githubusercontent.com/1234819/148304482-22347d1f-fb9c-4744-92ef-1d020c79f2fc.png) 65 | 66 | 67 | ## Project Dependencies 68 | 69 | The component requires: 70 | * TFX version >= 1.4.0 71 | * Slack Python client 72 | 73 | The component will also require Google Cloud's Vertex pipelines as its orchestrator.
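For readers who want to adapt this pattern, a minimal sketch of such an exit handler is shown below. The `slack_sdk` client and the exact parameter wiring are assumptions based on the description above, not the Digits implementation.

```python
import json

from slack_sdk import WebClient
from tfx import v1 as tfx


@tfx.orchestration.experimental.exit_handler
def slack_exit_handler(final_status: tfx.dsl.components.Parameter[str],
                       slack_token: tfx.dsl.components.Parameter[str],
                       slack_channel_id: tfx.dsl.components.Parameter[str],
                       on_failure_only: tfx.dsl.components.Parameter[int] = 0):
  """Posts the final pipeline state to a Slack channel."""
  status = json.loads(final_status)
  job_id = status["pipelineJobResourceName"].split("/")[-1]

  if status["state"] == "SUCCEEDED":
    if on_failure_only:
      return  # Skip notifications for successful runs.
    message = f":tada: Pipeline job *{job_id}* completed successfully."
  else:
    message = f":scream: Pipeline job *{job_id}* failed."
    message += f"\n>{status['error']['message']}"

  WebClient(token=slack_token).chat_postMessage(channel=slack_channel_id,
                                                text=message)
```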
74 | 75 | ## Project Team 76 | 77 | * Hannes Hapke (@hanneshapke), hannes -at- digits.com 78 | 79 | # Note 80 | 81 | Please be aware of the processes and requirements which are outlined here: 82 | 83 | * [SIG-TFX-Addons](https://github.com/tensorflow/tfx-addons) 84 | * [Contributing Guidelines](https://github.com/tensorflow/tfx-addons/blob/main/CONTRIBUTING.md) 85 | * [TensorFlow Code of Conduct](https://github.com/tensorflow/tfx-addons/blob/main/CODE_OF_CONDUCT.md) 86 | -------------------------------------------------------------------------------- /proposals/20220118-upload_predictions_to_bigquery.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal for Upload Predictions to BigQuery 3 | 4 | **Your name:** Hannes Max Hapke 5 | 6 | **Your email:** hannes@digits.com 7 | 8 | **Your company/organization:** Digits Financial, Inc. 9 | 10 | **Project name:** Upload Predictions to BigQuery component 11 | 12 | ## Project Description 13 | 14 | The project addresses the project idea #78. The TFX `BulkInferrer` allows pipeline to apply an ML model (loaded or trained in the pipeline) and generates predictions for the provided inference data. 15 | 16 | This project will provide a component which receives the predictions from the `BulkInferrer` and writes the results to BigQuery. 17 | 18 | ## Project Category 19 | 20 | Component 21 | 22 | ## Project Use-Case(s) 23 | 24 | Such a component is useful for generating predictions within the pipeline or for two-step pipelines producing semi-supervised ML models. 25 | 26 | ## Project Implementation 27 | 28 | The existing implementation was written as a "traditional" TFX component with its `ComponentSpec`, `Executor`, etc. to run efficiently on Apache Beam. 29 | 30 | The implementation receives 3 artifacts: 31 | * transform_graph 32 | * inference_results 33 | * schema 34 | 35 | The `transform_graph` is used to convert classification probabilities to a label. The TFX `schema` is used to generate the BigQuery schema for the table inserts. And the `inference_results` contain the information provided from the upstream `BulkInferrer` component. 36 | 37 | In addition, the component accepts a number of parameters to customize the BigQuery inserts: 38 | * bq_table_name - Table name 39 | * filter_threshold - threshold to filter results with low confidence 40 | * table_suffix - suffix for daily inferences 41 | * table_partitioning - BQ partitioning setting for newly created tables 42 | * expiration_time_delta - BQ expiration time after which the table will expire 43 | 44 | The component processes the inference results, converts the class likelihoods into class labels, and then generates a tables schema from the TFX schema information, before it writes the information to Big Query. 45 | 46 | The writing to Big Query is done via Apache Beam. 
47 | 48 | ``` 49 | with self._make_beam_pipeline() as pipeline: 50 | _ = (pipeline 51 | | 'Read Prediction Log' >> beam.io.ReadFromTFRecord( 52 | prediction_log_path, 53 | coder=prediction_log_decoder) 54 | | 'Filter and Convert to Dict' >> beam.ParDo( 55 | FilterPredictionToDictFn( 56 | labels=labels, 57 | features=features, 58 | ts=ts, 59 | filter_threshold=exec_properties['filter_threshold'], 60 | ) 61 | ) 62 | | 'Write Dict to BQ' >> beam.io.gcp.bigquery.WriteToBigQuery( 63 | table=bq_table_name, 64 | schema=bq_schema, 65 | additional_bq_parameters=_ADDITIONAL_BQ_PARAMETERS, 66 | create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, 67 | write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE) 68 | ) 69 | ``` 70 | 71 | After the completion of the datat insert, the component returns the `generated_bq_table_name` as a string artifact for downstream components. 72 | 73 | ## Project Dependencies 74 | 75 | The component requires: 76 | * TFX version >= 1.0.0 77 | * Apache Beam 78 | * TensorFlow Transform 79 | 80 | The component implicitly requires Google Cloud as a Dependency due to the writing operation to BigQuery. 81 | 82 | ## Project Team 83 | 84 | * Hannes Hapke (@hanneshapke), hannes -at- digits.com 85 | * Ukjae Jeong (@jeongukjae) 86 | 87 | # Note 88 | 89 | Please be aware of the processes and requirements which are outlined here: 90 | 91 | * [SIG-TFX-Addons](https://github.com/tensorflow/tfx-addons) 92 | * [Contributing Guidelines](https://github.com/tensorflow/tfx-addons/blob/main/CONTRIBUTING.md) 93 | * [TensorFlow Code of Conduct](https://github.com/tensorflow/tfx-addons/blob/main/CODE_OF_CONDUCT.md) 94 | -------------------------------------------------------------------------------- /proposals/20220513-pandas_transform.md: -------------------------------------------------------------------------------- 1 | **Your name:** Robert Crowe 2 | 3 | **Your email:** robertcrowe--at--google--dot--com 4 | 5 | **Your company/organization:** Google 6 | 7 | **Project name:** PandasTransform 8 | 9 | ## Project Description 10 | This project will develop a new TFX component which can be used instead of the standard Transform component, and allows developers to work with Pandas dataframes for their feature engineering. Processing will be distributed using Beam for scalability. Operations which require a full pass over the dataset will not be supported in the first release. 11 | 12 | ## Project Category 13 | Component 14 | 15 | ## Project Use-Case(s) 16 | The primary use cases are: 17 | * Developers who are not modeling in TensorFlow 18 | * Developers who are prototyping and are more comfortable working with dataframes, at least initially, and may not deploy their model for inference 19 | * Developers whose feature engineering can work with the basic statistics of the dataset (min, max, etc) and do not need to make full passes over the data 20 | 21 | ## Project Implementation 22 | This will be implemented as a Python-function component, using Beam for processing. Like the Transform component the user will supply a module file with their user code in a `preprocessing_fn`. Their code will be supplied with their dataset as a Pandas dataframe, and they will return their results as a Pandas dataframe. Their code will also be supplied with the basic statistics for their dataset, generated by StatisticsGen, and formatted as a Python dictionary. Their code will also be supplied with the schema of their dataset, generated by SchemaGen, and formatted as a Python dictionary. 
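Concretely, a user-supplied module file for this component might look like the sketch below; the exact `preprocessing_fn` signature and the `fare` feature are illustrative assumptions rather than the final API.

```python
from typing import Any, Dict

import pandas as pd


def preprocessing_fn(dataframe: pd.DataFrame, statistics: Dict[str, Any],
                     schema: Dict[str, Any]) -> pd.DataFrame:
  """Min-max scales `fare` using dataset statistics from StatisticsGen."""
  del schema  # Available for type-aware feature engineering; unused here.
  fare_min = statistics['fare']['min']
  fare_max = statistics['fare']['max']
  dataframe['fare_scaled'] = ((dataframe['fare'] - fare_min) /
                              (fare_max - fare_min))
  return dataframe.drop(columns=['fare'])
```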
23 | 24 | **Caveats:** It's important to note that each invocation of their `preprocessing_fn` will only be supplied with part of their dataset, to enable distributed processing. That means that full passes over their dataset by their user code will not be possible, so operations which require a full pass will not be supported in the first release. A future release may or may not enable full pass operations, TBD. 25 | 26 | **Additional Notes:** It's also important to note that unlike the standard Transform component, this PandasTransform component does not output the 27 | modified schema and statistics for the altered dataset. To generate a schema and statistics which reflect any changes that you've made to your 28 | dataset, you should follow the PandasTransform component with StatisticsGen and SchemaGen components in your pipeline. 29 | 30 | ## Project Dependencies 31 | Apache Beam 32 | PyArrow 33 | Pandas 34 | TensorFlow 35 | TensorFlow Data Validation 36 | TFX 37 | 38 | ## Project Team 39 | Robert Crowe (rcrowe-google) robertcrowe--at--google--dot--com 40 | 41 | # Note 42 | Please be aware of the processes and requirements which are outlined here: 43 | 44 | * [SIG-TFX-Addons](https://github.com/tensorflow/tfx-addons) 45 | * [Contributing Guidelines](https://github.com/tensorflow/tfx-addons/blob/main/CONTRIBUTING.md) 46 | * [TensorFlow Code of Conduct](https://github.com/tensorflow/tfx-addons/blob/main/CODE_OF_CONDUCT.md) 47 | -------------------------------------------------------------------------------- /proposals/20220802-project_pytorch_example.md: -------------------------------------------------------------------------------- 1 | **Your name:** Hannes Hapke 2 | 3 | **Your email:** hannes--at--digits--dot--com 4 | 5 | **Your company/organization:** Digits Financial Inc 6 | 7 | **Project name:** TFX PyTorch Example 8 | 9 | ## Project Description 10 | Adding a TFX pipeline example for PyTorch models to the TFX Addons repository. 11 | 12 | ## Project Category 13 | Example 14 | 15 | ## Project Use-Case(s) 16 | While there are a few non-TF model-based examples for TFX (e.g. JAX or Scikit), there isn't a maintained example for PyTorch models. 17 | 18 | ## Project Implementation 19 | The pipeline example includes the following components: 20 | - Load a known dataset, e.g. 
MNIST, via the CSVExampleGen component 21 | - Run the standard statistics and schema steps via StatisticsGen and SchemaGen 22 | - Perform a pseudo transformation (passthrough of the values) with the new PandasTransform component from tfx-addons 23 | - Add a custom run_fn function for PyTorch for the Trainer component 24 | - Add a TFMA example showing how to analyze PyTorch models to obtain a model blessing 25 | - Push the models to a local path 26 | 27 | ## Project Dependencies 28 | The example will depend on TFX (1.9.1), TFX addons (0.2), Apache Beam, and PyTorch (1.0.2) 29 | 30 | ## Project Team 31 | Hannes Hapke (gh: hanneshapke, email: hannes--at--digits--dot--com) 32 | More contributors are more than welcome 33 | 34 | # Note 35 | Please be aware of the processes and requirements which are outlined here: 36 | 37 | * [SIG-TFX-Addons](https://github.com/tensorflow/tfx-addons) 38 | * [Contributing Guidelines](https://github.com/tensorflow/tfx-addons/blob/main/CONTRIBUTING.md) 39 | * [TensorFlow Code of Conduct](https://github.com/tensorflow/tfx-addons/blob/main/CODE_OF_CONDUCT.md) 40 | -------------------------------------------------------------------------------- /proposals/20230328-airflow_orchestration.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal 3 | 4 | **Your name:** Woosung Song 5 | 6 | **Your email:** wssong@google.com 7 | 8 | **Your company/organization:** Google 9 | 10 | **Project name:** Apache Airflow for Pipeline Orchestration 11 | 12 | ## Project Description 13 | Apache Airflow for pipeline orchestration is going to be migrated from the 14 | official TFX to Addons. 15 | 16 | ## Project Category 17 | Other (Orchestration) 18 | 19 | ## Project Use-Case(s) 20 | In order to simplify core TFX for users who are not using Airflow, we would like 21 | to separate out support for the Airflow orchestrator into a pluggable module and 22 | make it available through TFX-Addons. This will help simplify the core TFX 23 | install, dependencies, and tests, and decrease the size of the installed 24 | payload. 25 | 26 | The functionality of the orchestrator will be retained, but users will need to 27 | update the import paths. To make the transition smoother, it will coexist on 28 | both the official TFX and Addons for a while, and the official one will be 29 | deprecated from the 1.14.0 release. 30 | 31 | ## Project Implementation 32 | The basic implementation and API signatures will follow the original methods, 33 | but the internal dependencies and testing will be reimplemented. 34 | 35 | The import path will be moved from `tfx.orchestration.airflow` to 36 | `tfx_addons.airflow_orchestration`. 37 | 38 | ```python 39 | from tfx_addons.airflow_orchestration import airflow_dag_runner 40 | 41 | def _create_pipeline(): 42 | ... 43 | return [example_gen, statistics_gen, trainer, evaluator, pusher] 44 | 45 | runner = airflow_dag_runner.AirflowDagRunner(_airflow_dag_config) 46 | result = runner.run(_create_pipeline()) 47 | ``` 48 | 49 | ## Project Dependencies 50 | It introduces `apache-airflow[mysql]>=1.10.14,<3` as a dependency. 51 | 52 | ## Project Team 53 | **Project Leader** : Woosung Song, lego0901, wssong@google.com 54 | 1. 
Woosung Song, wssong@google.com, @wssong 55 | -------------------------------------------------------------------------------- /proposals/README.md: -------------------------------------------------------------------------------- 1 | # SIG TFX-Addons Project Proposals 2 | 3 | This directory contains current and past project proposals that either are, 4 | or have been previously, under consideration for approval. 5 | 6 | Projects start as project ideas, which are submitted as 7 | [issues marked with the `Project:Idea` tag](https://github.com/tensorflow/tfx-addons/issues?q=is%3Aissue+is%3Aopen+label%3A%22Project%3A+Idea%22). 8 | They are then discussed by the group, and if a team 9 | of contributors decides to volunteer to implement a project then a project 10 | proposal is written. 11 | 12 | To have your project proposal considered for approval, copy and complete the 13 | [project template](yyyymmdd-project_template.md) and create a pull request to 14 | place it in this directory. -------------------------------------------------------------------------------- /proposals/yyyymmdd-project_template.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal 3 | ## Instructions 4 | Copy this template, naming it with the date in `yyyymmdd` format followed by a short name, and fill in the blanks below. 5 | When you're ready for review, place it in `/proposals` and create a pull request. 6 | 7 | --- 8 | 9 | **Your name:** ________________ 10 | 11 | **Your email:** ________________ 12 | 13 | **Your company/organization:** ____________________ 14 | 15 | **Project name:** ____________________ 16 | 17 | ## Project Description 18 | Describe the basics of your project (1-2 sentences). 19 | 20 | ## Project Category 21 | Choose 1: Component | Example | Other 22 | 23 | ## Project Use-Case(s) 24 | Describe at least one use case for your project. Please mention whether your organization will or will not use this project, or if 25 | they already use it. Please also include the potential impact, and any overlap, dependencies, or synergies with other projects. 26 | 27 | ## Project Implementation 28 | Describe at a high-level how you plan to implement your project. If you plan to use containers and/or languages other than Python, 29 | please indicate that. Please also include information about packaging and releasing this project. Please remember that the project 30 | team will be responsible for packaging and releases. 31 | 32 | ## Project Dependencies 33 | Please list any imports and dependencies that you plan to use. Please 34 | highlight any dependencies which are not open-source. 35 | 36 | ## Project Team 37 | List the members of the project team. Include their names, Github user IDs, and email addresses. Identify project leaders. 
38 | 39 | # Note 40 | Please be aware of the processes and requirements which are outlined here: 41 | 42 | * [SIG-TFX-Addons](https://github.com/tensorflow/tfx-addons) 43 | * [Contributing Guidelines](https://github.com/tensorflow/tfx-addons/blob/main/CONTRIBUTING.md) 44 | * [TensorFlow Code of Conduct](https://github.com/tensorflow/tfx-addons/blob/main/CODE_OF_CONDUCT.md) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [tool.pytest.ini_options] 9 | addopts = "--verbose" 10 | python_files = "*_test.py" 11 | norecursedirs = ["env", "proposals"] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Package Setup script for TFX Addons.""" 16 | import itertools 17 | import os 18 | 19 | from setuptools import find_namespace_packages, setup 20 | 21 | PROJECT_NAME = "tfx-addons" 22 | 23 | 24 | def get_pkg_metadata(): 25 | # Version 26 | context = {} 27 | base_dir = os.path.dirname(os.path.abspath(__file__)) 28 | with open(os.path.join(base_dir, "tfx_addons", "version.py")) as fp: 29 | exec(fp.read(), context) # pylint: disable=exec-used 30 | 31 | return context["_PKG_METADATA"] 32 | 33 | 34 | def get_version(): 35 | # Version 36 | context = {} 37 | base_dir = os.path.dirname(os.path.abspath(__file__)) 38 | with open(os.path.join(base_dir, "tfx_addons", "version.py")) as fp: 39 | exec(fp.read(), context) # pylint: disable=exec-used 40 | 41 | return context["__version__"] 42 | 43 | 44 | def get_ci_constraints(): 45 | # Version 46 | context = {} 47 | base_dir = os.path.dirname(os.path.abspath(__file__)) 48 | with open(os.path.join(base_dir, "tfx_addons", "version.py")) as fp: 49 | exec(fp.read(), context) # pylint: disable=exec-used 50 | 51 | return context["_CI_MIN_CONSTRAINTS"], context["_CI_MAX_CONSTRAINTS"] 52 | 53 | 54 | def get_long_description(): 55 | base_dir = os.path.dirname(os.path.abspath(__file__)) 56 | with open(os.path.join(base_dir, "README.md")) as fp: 57 | return fp.read() 58 | 59 | 60 | TESTS_REQUIRE = ["pytest", "pylint", "pre-commit", "isort", "yapf"] 61 | 62 | PKG_REQUIRES = get_pkg_metadata() 63 | EXTRAS_REQUIRE = PKG_REQUIRES.copy() 64 | EXTRAS_REQUIRE["all"] = list( 65 | set(itertools.chain.from_iterable(list(PKG_REQUIRES.values())))) 66 | EXTRAS_REQUIRE["test"] = TESTS_REQUIRE 67 | CI_MIN_CONSTRAINTS, CI_MAX_CONSTRAINTS = get_ci_constraints() 68 | EXTRAS_REQUIRE["ci_min"] = CI_MIN_CONSTRAINTS 69 | EXTRAS_REQUIRE["ci_max"] = CI_MAX_CONSTRAINTS 70 | 71 | setup( 72 | 
name=PROJECT_NAME, 73 | version=get_version(), 74 | description="TFX Addons libraries", 75 | author="The Tensorflow Authors", 76 | long_description=get_long_description(), 77 | long_description_content_type='text/markdown', 78 | url="https://github.com/tensorflow/tfx-addons", 79 | project_urls={ 80 | # ToDo(gcasassaez): To add docs once we have some docs integrated. 81 | # "Documentation": "", 82 | "Bug Tracker": "https://github.com/tensorflow/tfx-addons/issues", 83 | }, 84 | extras_require=EXTRAS_REQUIRE, 85 | tests_require=TESTS_REQUIRE, 86 | packages=find_namespace_packages(include=[ 87 | # Add here new library package 88 | "tfx_addons", 89 | ] + [f"tfx_addons.{m}.*" 90 | for m in PKG_REQUIRES] + [f"tfx_addons.{m}" for m in PKG_REQUIRES]), 91 | classifiers=[ 92 | "Intended Audience :: Developers", 93 | "Intended Audience :: Education", 94 | "Intended Audience :: Science/Research", 95 | "License :: OSI Approved :: Apache Software License", 96 | "Programming Language :: Python :: 3", 97 | "Programming Language :: Python :: 3.7", 98 | "Programming Language :: Python :: 3.8", 99 | "Programming Language :: Python :: 3.9", 100 | "Topic :: Scientific/Engineering", 101 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 102 | "Topic :: Software Development", 103 | "Topic :: Software Development :: Libraries", 104 | "Topic :: Software Development :: Libraries :: Python Modules", 105 | ], 106 | python_requires=">=3.7", 107 | include_package_data=True, 108 | ) 109 | -------------------------------------------------------------------------------- /tfx_addons/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Init module for TFX.""" 16 | 17 | import importlib as _importlib 18 | 19 | from .version import _PKG_METADATA, __version__ 20 | 21 | _ACTIVE_MODULES = [ 22 | "__version__", 23 | ] + list(_PKG_METADATA.keys()) 24 | 25 | 26 | def __getattr__(name): # pylint: disable=C0103 27 | # PEP-562: Lazy loaded attributes on python modules 28 | # NB(gcasassaez): We lazy load to avoid issues with dependencies not installed 29 | # for some subpackes 30 | if name in _ACTIVE_MODULES: 31 | return _importlib.import_module("." + name, __name__) 32 | raise AttributeError(f"module {__name__!r} has no attribute {name!r}") 33 | -------------------------------------------------------------------------------- /tfx_addons/apache_airflow/README.md: -------------------------------------------------------------------------------- 1 | # Apache Airflow Orchestrator 2 | 3 | (Please fill in a description of the project, usage instructions, etc.) 
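Until that description lands, usage is expected to mirror the snippet in `proposals/20230328-airflow_orchestration.md`; the DAG settings and the `create_pipeline()` function below are placeholders, not a finalized API.

```python
import datetime

from tfx_addons.airflow_orchestration import airflow_dag_runner

# Placeholder Airflow DAG settings; `create_pipeline()` should return your
# `tfx.dsl.Pipeline`.
airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2023, 1, 1),
}

DAG = airflow_dag_runner.AirflowDagRunner(
    airflow_dag_runner.AirflowPipelineConfig(airflow_config)).run(
        create_pipeline())
```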
4 | -------------------------------------------------------------------------------- /tfx_addons/copy_example_gen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/copy_example_gen/component_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ 16 | Tests for tfx_addons.copy_example_gen.component. 17 | """ 18 | from unittest import mock 19 | 20 | import tensorflow as tf 21 | 22 | from tfx_addons.copy_example_gen import component 23 | 24 | 25 | class TestCopyExampleGen(tf.test.TestCase): 26 | """Test module for CopyExampleGen.""" 27 | def setUp(self): 28 | self.input_json_str = """ 29 | { 30 | "label1": "fakeuri", 31 | "label2": "fakeuri2", 32 | } 33 | """ 34 | 35 | def test_empty_input(self) -> None: 36 | empty_input_json_str = "" 37 | expected_error = ( 38 | "Input string is not provided. Expected format is Split label (key) " 39 | "and Split URI (value).") 40 | 41 | with self.assertRaises(ValueError, msg=expected_error): 42 | # pylint: disable=protected-access 43 | component._create_input_dictionary(input_json_str=empty_input_json_str) 44 | 45 | def test_non_dictionary_input(self) -> None: 46 | non_dictionary_input = "'a', 'b', 'c'" 47 | expected_error = ( 48 | f"Input string {non_dictionary_input} is not provided as a dictionary. " 49 | "Expected format is Split label (key) and Split URI (value).") 50 | 51 | with self.assertRaises(ValueError, msg=expected_error): 52 | # pylint: disable=protected-access 53 | component._create_input_dictionary(input_json_str=non_dictionary_input) 54 | 55 | def test_empty_dictionary(self) -> None: 56 | empty_input_json_str = "{}" 57 | expected_error = ( 58 | "Input dictionary is empty. 
Expected format is Split label (key) " 59 | "and Split URI (value).") 60 | 61 | with self.assertRaises(ValueError, msg=expected_error): 62 | # pylint: disable=protected-access 63 | component._create_input_dictionary(input_json_str=empty_input_json_str) 64 | 65 | def test_valid_input(self) -> None: 66 | with mock.patch('tfx_addons.copy_example_gen.component.fileio'): 67 | # pylint: disable=protected-access 68 | component.CopyExampleGen(input_json_str=self.input_json_str) 69 | 70 | def test_empty_gcs_directory(self) -> None: 71 | with mock.patch( 72 | 'tfx_addons.copy_example_gen.component.fileio') as mock_fileio: 73 | # Returns an empty list indicating no matching files in that location. 74 | mock_fileio.glob.return_value = [] 75 | with self.assertLogs() as warning_msg: 76 | # pylint: disable=protected-access 77 | component._copy_examples(split_tfrecords_uri="mock_uri", 78 | split_value_uri="mock_uri_2") 79 | expected_msg = ( 80 | "WARNING:root:Directory mock_uri does not contain files with .gz " 81 | "suffix.") 82 | self.assertEqual(warning_msg.output, [expected_msg]) 83 | -------------------------------------------------------------------------------- /tfx_addons/example_filter/README.md: -------------------------------------------------------------------------------- 1 | #### SIG TFX-Addons 2 | # Project Proposal 3 | 4 | **Your name:** Ryan Clough 5 | 6 | **Your email:** rclough@spotify.com 7 | 8 | **Your company/organization:** Spotify 9 | 10 | **Project name:** Example Filter 11 | 12 | ## Project Description 13 | Beam based component that can filter Examples based on a user-defined predicate function. 14 | 15 | ## Project Category 16 | Choose 1: Component 17 | 18 | ## Project Use-Case(s) 19 | Data can be imported into TFX in a number of ways, and indeed, sometimes the dataset you wish to load is not under your direct 20 | control. In cases like these, it is useful to have a component that can filter your input data with simple rules. Ex: filter 21 | all records where `feature_a >= 1`. 22 | 23 | Our organization currently has a component for this purpose that is in active use. It is not as robust as it could be. 24 | 25 | It is also worth conidering that we may wish to try and promote this functionality to be included in the TFX core base ExampleGen, 26 | so that the filtering could be done within any ExampleGen based component. 27 | 28 | ## Project Implementation 29 | Spotify can provide the current implementation, which is based off of an old version of Tensorflow Transform. At a high level, use 30 | of the component looks like: 31 | 32 | ```python 33 | def predicate_fn(example) 34 | # Throw out Examples that used a credit card 35 | if b'Credit Card' in example['payment_type']: 36 | return False 37 | return True 38 | ... 39 | 40 | filtered_examples = ExampleFilter( 41 | examples=examples.output, 42 | schema=schema.output, 43 | module_file=filter_module, 44 | ) 45 | ``` 46 | 47 | ## Packaging 48 | 49 | Given that it's a Beam component, I think it will have to be a fully custom component. 50 | 51 | In terms of packaging and providing, we can provide the code, and a sample docker file and example pipeline for the component. 52 | 53 | ## Future Considerations 54 | 55 | For the purposes of this proposal, the `ExampleFilter` component will be submitted as-is, as to not let "perfect" become the 56 | enemy of "good enough". 
There are a number of potential improvements that could be made to the component, but working 57 | through them should be a separate process from this initial proposal to get a working MVP. 58 | 59 | The current implementation is a bit dated and not so robust. It depends on a deprecated TFT proto coder, and only works on 60 | TF Records, as it does not make use of TFXIO. As part of bringing this to TFX-addons, I think it is worth iterating on the 61 | current design. Some initial ideas for change might be: 62 | 63 | * Implementing it more flexibly in TFXIO 64 | * Determine if there's a way to implement it without requiring a schema 65 | * Making the predicate_fn operate on true data types rather than bytes (see example above) 66 | * Adding an input that allows the user to specify splits (currently applies to all splits) 67 | 68 | ## Project Dependencies 69 | Current implementation uses a [proto decoder](https://github.com/tensorflow/transform/blob/v0.24.1/tensorflow_transform/coders/example_proto_coder.py#L329-L339) 70 | deprecated from TFX 0.25 onwards. Otherwise the project uses standard TFX dependencies. 71 | 72 | ## Project Team 73 | * Ryan Clough, rclough@spotify.com, @rclough 74 | * TBD -------------------------------------------------------------------------------- /tfx_addons/example_filter/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Current Version (Still in Development) 2 | 3 | ### Last Update: 15 September 2021 4 | 5 | ## Major Features and Improvements 6 | 7 | * None at this time 8 | 9 | ## Breaking Changes 10 | 11 | * None at this time 12 | 13 | ## Deprecations 14 | 15 | * None at this time 16 | 17 | ## Bug Fixes and Other Changes 18 | 19 | * None at this time 20 | 21 | ## Documentation Updates 22 | 23 | * None at this time -------------------------------------------------------------------------------- /tfx_addons/example_filter/component.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """ 16 | the component for filter addon 17 | """ 18 | 19 | import importlib 20 | import os 21 | 22 | import tensorflow as tf 23 | from tfx.dsl.component.experimental.annotations import OutputDict 24 | from tfx.dsl.io.fileio import listdir 25 | from tfx.types import standard_artifacts 26 | from tfx.v1.dsl.components import InputArtifact, Parameter 27 | from tfx_bsl.coders import example_coder 28 | 29 | 30 | def _get_data_from_tfrecords(train_uri: str): 31 | ''' 32 | Reads and returns data from TFRecords at URI as a list 33 | of dictionaries with values as numpy arrays 34 | Example: 35 | _get_data_from_tfrecords('path_to_TFRecords') 36 | ''' 37 | train_uri = [ 38 | os.path.join(train_uri, file_path) for file_path in listdir(train_uri) 39 | ] 40 | raw_dataset = tf.data.TFRecordDataset(train_uri, compression_type='GZIP') 41 | 42 | np_dataset = [] 43 | for tfrecord in raw_dataset: 44 | serialized_example = tfrecord.numpy() 45 | example = example_coder.ExampleToNumpyDict(serialized_example) 46 | np_dataset.append(example) 47 | 48 | return np_dataset 49 | 50 | 51 | def filter_component(input_data: InputArtifact[standard_artifacts.Examples], 52 | filter_function_str: Parameter[str], 53 | output_file: Parameter[str]) -> OutputDict(list_len=int): 54 | """Filters the data from input data by using the filter function. 55 | 56 | Args: 57 | input_data: Input list of data to be filtered. 58 | output_file: the name of the file to be saved to. 59 | filter_function_str: Module name of the function that will be used to 60 | filter the data. 61 | Example for the function 62 | my_example/my_filter.py: 63 | 64 | # filter module must have filter_function implemented 65 | def filter_function(input_list: Array): 66 | output_list = [] 67 | for element in input_list: 68 | if element.something: 69 | output_list.append(element) 70 | return output_list 71 | 72 | pipeline.py: 73 | filter_component(input_data ,'my_example.my_filter',output_data) 74 | 75 | Returns: 76 | len of the list after the filter 77 | { 78 | 'list_len': len(output_list) 79 | } 80 | 81 | """ 82 | records = _get_data_from_tfrecords(input_data.uri + "/Split-train") 83 | filter_function = importlib.import_module( 84 | filter_function_str).filter_function 85 | filtered_data = filter_function(records) 86 | result_len = len(filtered_data) 87 | new_data = [] 88 | for key in list(filtered_data[0].keys()): 89 | local_list = [] 90 | for i in range(result_len): 91 | local_list.append(str(filtered_data[i][key][0])) 92 | new_data.append(str(local_list)) 93 | writer = tf.io.TFRecordWriter(output_file) 94 | writer.write(tf.data.Dataset.from_tensor_slices(new_data).map(lambda x: x)) 95 | 96 | return {'list_len': result_len} 97 | -------------------------------------------------------------------------------- /tfx_addons/example_filter/component_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Component test for the filter component.""" 16 | 17 | import os 18 | 19 | import tensorflow as tf 20 | from absl.testing import absltest 21 | from tfx.types import artifact_utils, standard_artifacts 22 | 23 | from tfx_addons.example_filter.component import filter_component 24 | 25 | 26 | class ComponentTest(absltest.TestCase): 27 | def testConstructWithOptions(self): 28 | source_data_dir = os.path.join(os.path.dirname(__file__), 'data') 29 | 30 | examples = standard_artifacts.Examples() 31 | examples.uri = os.path.join(source_data_dir, "example_gen") 32 | examples.split_names = artifact_utils.encode_split_names(['train', 'eval']) 33 | 34 | params = { 35 | "input_data": examples, 36 | "filter_function_str": 'filter_function', 37 | "output_file": 'output', 38 | } 39 | filter_component(**params) 40 | 41 | 42 | if __name__ == '__main__': 43 | tf.test.main() 44 | -------------------------------------------------------------------------------- /tfx_addons/example_filter/data/test_data.csv: -------------------------------------------------------------------------------- 1 | label,col1 2 | ,2 3 | ,2 4 | ,2 5 | ,2 6 | ,2 7 | ,2 8 | ,2 9 | ,2 10 | ,2 11 | ,2 12 | 1,1 13 | 1,1 14 | 1,1 15 | 1,1 16 | 1,1 17 | 1,1 18 | 1,1 19 | 1,1 20 | 1,1 21 | 1,1 22 | 1,1 23 | 1,1 24 | 1,1 25 | 1,1 26 | 1,1 27 | 1,1 28 | 1,1 29 | 1,1 30 | 1,1 31 | 1,1 32 | 1,1 33 | 1,1 34 | 0,0 35 | 0,0 36 | 0,0 37 | 0,0 38 | 0,0 39 | 0,0 40 | 0,0 41 | 0,0 42 | 0,0 43 | 0,0 44 | 0,0 45 | 0,0 46 | 0,0 47 | 0,0 48 | 0,0 49 | 0,0 50 | 0,0 51 | 0,0 -------------------------------------------------------------------------------- /tfx_addons/example_filter/filter_function.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Example function to demonstrate the filter functionality of the module.""" 16 | 17 | 18 | def filter_function(x_list): 19 | """Filters the data from input data by using the filter function. 20 | 21 | Args: 22 | x_list: Input list of data to be filtered. 23 | 24 | 25 | Returns: 26 | filtered list 27 | 28 | """ 29 | new_list = [] 30 | for element in x_list: 31 | if element['label'] == [0]: 32 | new_list.append(element) 33 | return new_list 34 | -------------------------------------------------------------------------------- /tfx_addons/feast_examplegen/README.md: -------------------------------------------------------------------------------- 1 | # FeastExampleGen 2 | 3 | ExampleGen for Feast feature store. 4 | 5 | This component generates a Dataset out of a Feast entity_query and either a list of features or a feature service key. 
6 | 7 | ## Installation 8 | 9 | ```sh 10 | pip install tfx-addons[feast_examplegen] 11 | ``` 12 | 13 | ## Example usage 14 | 15 | ```python 16 | example_gen = FeastExampleGen( 17 | repo_config=RepoConfig(register="gs://..."), 18 | entity_query="SELECT user, timestamp from some_user_dataset", 19 | features=["f1", "f2"], 20 | ) 21 | ``` 22 | Component can be configured the same way as any [QueryBasedExampleGen](https://www.tensorflow.org/tfx/guide/examplegen#query-based_examplegen_customization_experimental). 23 | 24 | Component `outputs` contains: 25 | - `examples`: Channel of type `standard_artifacts.Examples` for output train 26 | and eval examples. 27 | 28 | ## Extra information 29 | 30 | - [Proposal](https://github.com/tensorflow/tfx-addons/blob/main/proposals/20210525-feast_example_gen.md) 31 | - [Example usage](https://github.com/tensorflow/tfx-addons/tree/main/examples/fraud_feast) 32 | -------------------------------------------------------------------------------- /tfx_addons/feast_examplegen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Init module for feast examplegen""" 16 | 17 | from tfx_addons.feast_examplegen.component import FeastExampleGen 18 | -------------------------------------------------------------------------------- /tfx_addons/feast_examplegen/component_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ 16 | Tests for tfx_addons.feast_examplegen.component. 
17 | """ 18 | 19 | import pytest 20 | 21 | try: 22 | import feast 23 | except ImportError: 24 | pytest.skip("feast not available, skipping", allow_module_level=True) 25 | 26 | from tfx.v1.proto import Input 27 | 28 | from tfx_addons.feast_examplegen.component import FeastExampleGen 29 | 30 | 31 | def test_init_valid(): 32 | entity_query = 'SELECT user FROM fake_db' 33 | repo_config = feast.RepoConfig(provider='local', project='default') 34 | FeastExampleGen(repo_config=repo_config, 35 | features=['feature1', 'feature2'], 36 | entity_query='SELECT user FROM fake_db') 37 | FeastExampleGen(repo_config=repo_config, 38 | features='feature_service1', 39 | entity_query='SELECT user FROM fake_db') 40 | FeastExampleGen(repo_config=repo_config, 41 | features=['feature1', 'feature2'], 42 | input_config=Input(splits=[ 43 | Input.Split(name='train', pattern=entity_query), 44 | Input.Split(name='eval', pattern=entity_query), 45 | ])) 46 | 47 | 48 | def test_input_and_entity(): 49 | entity_query = 'SELECT user FROM fake_db' 50 | repo_config = feast.RepoConfig(provider='local', project='default') 51 | with pytest.raises(RuntimeError): 52 | 53 | FeastExampleGen(repo_config=repo_config, 54 | features=['feature1', 'feature2'], 55 | entity_query=entity_query, 56 | input_config=Input(splits=[ 57 | Input.Split(name='train', pattern=entity_query), 58 | Input.Split(name='eval', pattern=entity_query), 59 | ])) 60 | -------------------------------------------------------------------------------- /tfx_addons/feature_selection/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guidelines 2 | 3 | ## Directory Structure 4 | The repo contains three main directories as follows: 5 | - **[Component](./component):** Contains the main component code with a separate file for the executor code 6 | - **[Data](./data):** Containing the sample data to be used for testing 7 | - **[Example](./example):** Contains example codes to test our component with the CSVs present in [data](./data) 8 | 9 | ## A few Git and GitHub practices 10 | 11 | ### Commits 12 | Commits serve as checkpoints during your workflow and can be used to **revert back** in case something gets messed up. 13 | - **When to commit:** Try not to pile up many changes in multiple commits while ensuring that you don't make too many commits for fixing a small issue. 14 | - **Commit messages:** Commit messages should be descriptive enough for an external person to get an idea of what it accomplished while ensuring they don't exceed 50 characters. 15 | 16 | Check out [this](https://gist.github.com/turbo/efb8d57c145e00dc38907f9526b60f17) for more information about the good practices 17 | 18 | ### Branches 19 | Branches are a good way to simulataniously work on different features at the same time. Check out [git-scm](https://git-scm.com/book/en/v2/Git-Branching-Basic-Branching-and-Merging) to know more about various concepts involved in the same. 20 | 21 | For descriptive branch names, it is a good idea to follow the following format: 22 | **`name/keyword/short-description`** 23 | - **Name:** Name of the person/s working on the branch. This can be ignored if many people(>2) are expected to work on it. 24 | - **Keyword:** This describes what "type" of work this branch is supposed to do. These are typically named as: 25 | - `feature`: Adding/expanding a feature 26 | - `base`: Adding boilerplate/readme/templates etc. 
27 | - `bug`: Fixes a bug
28 | - `junk`: Throwaway branch created to experiment
29 | - **Short description:** As the name suggests, this contains a short description of the branch, usually no longer than 2-3 words separated by a hyphen (`-`).
30 | 
31 | P.S. If multiple branches are being used to work on the same issue (say issue `#n`), they can be named as `name/keyword/#n-short-description`
32 | 
33 | ### Issues
34 | The following points should be considered while creating new issues:
35 | - Use relevant labels like `bug`, `feature` etc.
36 | - If the team has decided who will work on an issue, it should be **assigned** to that person as soon as possible to prevent the same work from being done twice.
37 | - The issue should be linked in the **project** if needed and its status should be kept up to date as the work progresses.
38 | 
39 | ### Pull Requests
40 | It is always a good idea to ensure the following are present in your Pull Request description:
41 | - Relevant issue/s
42 | - What it accomplished
43 | - Mention `[WIP]` in the title and make it a `Draft Pull Request` if it is a work in progress
44 | - Once the pull request is final, it should be **requested for review** from the concerned people
45 | 
--------------------------------------------------------------------------------
/tfx_addons/feature_selection/README.md:
--------------------------------------------------------------------------------
1 | #### SIG TFX-Addons
2 | # Project Proposal
3 | 
4 | **Your name:** Nirzari Gupta
5 | 
6 | **Your email:** nirzu97@gmail.com
7 | 
8 | **Your company/organization:** Outreachy
9 | 
10 | **Project name:** [Feature selection custom component](https://github.com/tensorflow/tfx-addons/issues/7)
11 | 
12 | ## Project Description
13 | This project provides a facility to perform various feature selection algorithms on datasets in TFX pipelines. Additionally, feature scores for selected features will also be generated as a custom artifact.
14 | 
15 | ## Project Category
16 | Component
17 | 
18 | ## Project Use-Case(s)
19 | This project will allow the user to select different algorithms for performing feature selection on dataset artifacts in TFX pipelines.
20 | 
21 | ## Project Implementation
22 | The Feature Selection Custom Component is implemented as a Python function-based component.
23 | 
24 | Implementation of the Feature Selection Custom Component is done using the following approach:
25 | - Get dataset artifact generated by ExampleGen
26 | - Convert it into a format compatible with Scikit-Learn functions (TFRecord to numpy dictionaries)
27 | - Perform univariate feature selection with `SelectorFunc` specified in the module file
28 | - Output the following two artifacts:
29 |   - `updated_data`: Duplicate of the input `Example` artifact, but with updated URI and data values
30 |   - `feature_selection`: Contains data about the feature selection process with the following values available:
31 |     - `scores`: Metric scores from the selector
32 |     - `p_values`: Calculated p-values from the selector
33 |     - `selected_features`: List of selected columns after feature selection
34 | 
35 | ## Module file
36 | #### Structure
37 | The module file is required to have a structure with the following three values:
38 | - `SELECTOR_PARAMS`: Parameters for `SelectorFunc`
39 | - `TARGET_FEATURE`: The target feature in the dataset
40 | - `SelectorFunc`: Univariate function for feature selection
41 | 
42 | #### Example module file
43 | In the below example, we have used sklearn functions directly for simplicity.
You may define custom functions while ensuring that the overall i/o structure is the same. 44 | ``` python 45 | from sklearn.feature_selection import SelectKBest as SelectorFunc 46 | from sklearn.feature_selection import chi2 47 | 48 | SELECTOR_PARAMS = {"score_func": chi2, "k": 2} 49 | TARGET_FEATURE = 'species' 50 | ``` 51 | 52 | ## Example usage 53 | You may use the feature selection component in a way similar to [StatisticsGen](https://www.tensorflow.org/tfx/guide/statsgen) 54 | ``` python 55 | feature_selector = FeatureSelection( 56 | orig_examples = example_gen.outputs['examples'], 57 | module_file='example.modules.iris_module_file' 58 | ) 59 | ``` 60 | 61 | 62 | ## Project Dependencies 63 | The implementation will use the [Scikit-learn feature selection functions](https://scikit-learn.org/stable/modules/feature_selection.html) 64 | 65 | ## Project Team 66 | **Project Leader** : Nirzari Gupta, nirzu97, nirzu97@gmail.com 67 | 1. Fatimah Adwan, FatimahAdwan, akilahafaf72@gmail.com 68 | 2. Kshitijaa Jaglan, deutranium, jaglan.kshitijaa2@gmail.com 69 | 3. Pratishtha Abrol, pratishtha-abrol, pratishthaabrol@gmail.com -------------------------------------------------------------------------------- /tfx_addons/feature_selection/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Current Version (v 1.0.0) 2 | 3 | ### Last Update: 3 June 2022 4 | 5 | ## Major Features and Improvements 6 | * Add feature selection functionality through custom or sklearn functions (to be accessed through a module file) 7 | * Compatible with any number of splits in the data 8 | * Returns two different artifacts - `updated_data` for the overall data processed (structured like the input Example artifact) and `feature_selection` for scores, p_values and selected features for reference 9 | -------------------------------------------------------------------------------- /tfx_addons/feature_selection/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/feature_selection/data/data.csv: -------------------------------------------------------------------------------- 1 | pickup_community_area,fare,trip_start_month,trip_start_hour,trip_start_day,trip_start_timestamp,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,trip_miles,pickup_census_tract,dropoff_census_tract,payment_type,company,trip_seconds,dropoff_community_area,tips 2 | 60,27.05,10,2,3,1380593700,41.836150155,-87.648787952,,,12.6,,,Cash,Taxi Affiliation Services,1380,,0.0 3 | 10,5.85,10,1,2,1382319000,41.985015101,-87.804532006,,,0.0,,,Cash,Taxi Affiliation Services,180,,0.0 4 | 14,16.65,5,7,5,1369897200,41.968069,-87.721559063,,,0.0,,,Cash,Dispatch Taxi Affiliation,1080,,0.0 5 | 13,16.45,11,12,3,1446554700,41.983636307,-87.723583185,,,6.9,,,Cash,,780,,0.0 6 | -------------------------------------------------------------------------------- /tfx_addons/feature_selection/example/modules/iris_module_file.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Supplement for iris species example with specifics feature modification. 16 | This module file will be used in the feature selection component example. 17 | """ 18 | 19 | from sklearn.feature_selection import \ 20 | SelectKBest as SelectorFunc # pylint: disable=W0611 21 | from sklearn.feature_selection import chi2 22 | 23 | SELECTOR_PARAMS = {"score_func": chi2, "k": 2} 24 | TARGET_FEATURE = 'species' 25 | -------------------------------------------------------------------------------- /tfx_addons/feature_selection/example/modules/penguins_module.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Supplement for palmer penguins example with specific feature modification. 16 | This module file will be used in the feature selection component example. 
17 | """ 18 | from sklearn.feature_selection import \ 19 | SelectKBest as SelectorFunc # pylint: disable=W0611 20 | from sklearn.feature_selection import chi2 21 | 22 | SELECTOR_PARAMS = {"score_func": chi2, "k": 2} 23 | TARGET_FEATURE = 'species' 24 | -------------------------------------------------------------------------------- /tfx_addons/feature_selection/example/modules/pima_indians_module_file.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Supplement for prima Indians Diabetics example with specifics feature modification. 16 | This module file will be used in the feature selection component example. 17 | """ 18 | from sklearn.feature_selection import \ 19 | SelectKBest as SelectorFunc # pylint: disable=W0611 20 | from sklearn.feature_selection import chi2 21 | 22 | SELECTOR_PARAMS = {"score_func": chi2, "k": 3} 23 | TARGET_FEATURE = 'Outcome' 24 | -------------------------------------------------------------------------------- /tfx_addons/feature_selection/nb/Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "408bf10c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from component import FeatureSelection\n", 11 | "from tfx.components import CsvExampleGen\n", 12 | "from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "95533af7", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "context = InteractiveContext()" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "d1e35dbe", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import urllib.request\n", 33 | "import tempfile\n", 34 | "import os\n", 35 | "\n", 36 | "# getting data and setup CsvExampleGen\n", 37 | "DATA_ROOT = tempfile.mkdtemp(prefix='tfx-data') # Create a temporary directory.\n", 38 | "_data_url = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/penguin/data/labelled/penguins_processed.csv'\n", 39 | "_data_filepath = os.path.join(DATA_ROOT, \"data.csv\")\n", 40 | "urllib.request.urlretrieve(_data_url, _data_filepath)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "36c3d298", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "example_gen = CsvExampleGen(input_base=DATA_ROOT)\n", 51 | "context.run(example_gen)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "fa28bcd8", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# give path to the module file\n", 62 | "feature_selector = 
FeatureSelection(orig_examples = example_gen.outputs['examples'],\n", 63 | " module_file=\"module_file\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "9afcfe7f", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "context.run(feature_selector)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "b088c2c8", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [] 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3 (ipykernel)", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.7.10" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 5 106 | } 107 | -------------------------------------------------------------------------------- /tfx_addons/feature_selection/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit_learn==1.1.2 2 | tensorflow 3 | tfx 4 | tfx_bsl==1.9.0 5 | -------------------------------------------------------------------------------- /tfx_addons/firebase_publisher/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Current Version (Still in Development) 2 | 3 | ### Last Update: 15 September 2021 4 | 5 | ## Major Features and Improvements 6 | 7 | * None at this time 8 | 9 | ## Breaking Changes 10 | 11 | * None at this time 12 | 13 | ## Deprecations 14 | 15 | * None at this time 16 | 17 | ## Bug Fixes and Other Changes 18 | 19 | * None at this time 20 | 21 | ## Documentation Updates 22 | 23 | * None at this time -------------------------------------------------------------------------------- /tfx_addons/firebase_publisher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Init module for FirebasePublisher""" 16 | 17 | from tfx_addons.firebase_publisher.component import FirebasePublisher 18 | -------------------------------------------------------------------------------- /tfx_addons/firebase_publisher/component_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Tests for TFX Firebase Publisher Custom Component.""" 16 | 17 | import tensorflow as tf 18 | from tfx.types import standard_artifacts 19 | 20 | from tfx_addons.firebase_publisher.component import FirebasePublisher 21 | 22 | 23 | class FirebasePublisherTest(tf.test.TestCase): 24 | def testConstruct(self): 25 | firebase_publisher = FirebasePublisher(display_name="test_display_name", 26 | storage_bucket="storage_bucket") 27 | 28 | self.assertEqual(standard_artifacts.PushedModel.TYPE_NAME, 29 | firebase_publisher.outputs['pushed_model'].type_name) 30 | 31 | 32 | if __name__ == '__main__': 33 | tf.test.main() 34 | -------------------------------------------------------------------------------- /tfx_addons/firebase_publisher/executor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Firebase Publisher TFX Component Executor. 16 | 17 | The Firebase Publisher Executor calls the workflow handler 18 | runner.deploy_model_for_firebase_ml(). 19 | """ 20 | 21 | import time 22 | from typing import Any, Dict, List 23 | 24 | from tfx import types 25 | from tfx.components.pusher import executor as tfx_pusher_executor 26 | from tfx.types import artifact_utils, standard_component_specs 27 | 28 | from tfx_addons.firebase_publisher import runner 29 | 30 | _APP_NAME_KEY = "app_name" 31 | _DISPLAY_NAME_KEY = "display_name" 32 | _STORAGE_BUCKET_KEY = "storage_bucket" 33 | _TAGS_KEY = "tags" 34 | _OPTIONS_KEY = "options" 35 | _CREDENTIAL_PATH_KEY = "credential_path" 36 | 37 | 38 | class Executor(tfx_pusher_executor.Executor): 39 | """Pushes a model to Firebase ML.""" 40 | def Do( 41 | self, 42 | input_dict: Dict[str, List[types.Artifact]], 43 | output_dict: Dict[str, List[types.Artifact]], 44 | exec_properties: Dict[str, Any], 45 | ): 46 | """Overrides the tfx_pusher_executor to leverage some of utility methods 47 | 48 | Args: 49 | input_dict: Input dict from input key to a list of artifacts, including: 50 | - model_export: a TFX input channel containing a Model artifact. 51 | - model_blessing: a TFX input channel containing a ModelBlessing 52 | artifact. 53 | output_dict: Output dict from key to a list of artifacts, including: 54 | - pushed_model: a TFX output channel containing a PushedModel artifact. 
55 | It contains information where the model is published at and whether 56 | the model is pushed or not. 57 | exec_properties: An optional dict of execution properties, including: 58 | - display_name: name to identify a hosted model in Firebase ML. 59 | this should be a unique value because it will be used to search 60 | a existing model to update. 61 | - storage_bucket: GCS bucket where the hosted model will be stored. 62 | - app_name: the name of Firebase app to determine the scope. 63 | - tags: tags to be attached to the hosted ML model. 64 | - credential_path: location of GCS or local file system where the 65 | Service Account(SA) Key file is. 66 | - options: additional configurations to be passed to initialize Firebase 67 | app. 68 | 69 | Raises: 70 | RuntimeError: when the size of model exceeds 40mb. 71 | """ 72 | self._log_startup(input_dict, output_dict, exec_properties) 73 | 74 | model_push = artifact_utils.get_single_instance( 75 | output_dict[standard_component_specs.PUSHED_MODEL_KEY]) 76 | if not self.CheckBlessing(input_dict): 77 | self._MarkNotPushed(model_push) 78 | return 79 | model_path = self.GetModelPath(input_dict) 80 | model_version_name = f"v{int(time.time())}" 81 | 82 | pushed_model_path = runner.deploy_model_for_firebase_ml( 83 | app_name=exec_properties.get(_APP_NAME_KEY, '[DEFAULT]'), 84 | display_name=exec_properties.get(_DISPLAY_NAME_KEY), 85 | storage_bucket=exec_properties.get(_STORAGE_BUCKET_KEY), 86 | credential_path=exec_properties.get(_CREDENTIAL_PATH_KEY, None), 87 | tags=exec_properties.get(_TAGS_KEY, []), 88 | options=exec_properties.get(_OPTIONS_KEY, {}), 89 | model_path=model_path, 90 | model_version=model_version_name, 91 | ) 92 | 93 | self._MarkPushed(model_push, pushed_destination=pushed_model_path) 94 | -------------------------------------------------------------------------------- /tfx_addons/firebase_publisher/runner_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Tests for tfx_addons.firebase_publisher.runner.""" 16 | 17 | from unittest import mock 18 | from unittest.mock import Mock 19 | 20 | import tensorflow as tf 21 | 22 | from tfx_addons.firebase_publisher import runner 23 | 24 | 25 | class RunnerTest(tf.test.TestCase): 26 | def testModelExistancy(self): 27 | model_list = Mock() 28 | model_list.models = ['model1'] 29 | self.assertTrue(runner.is_model_present(model_list)) 30 | 31 | model_list.models = [] 32 | self.assertFalse(runner.is_model_present(model_list)) 33 | 34 | @mock.patch('tfx_addons.firebase_publisher.runner.glob.glob') 35 | def testModelPathAndType(self, mock_glob): 36 | tmp_model_path = "/tmp/saved_model" 37 | 38 | mock_glob.return_value = [f"{tmp_model_path}/model.tflite"] 39 | is_tflite, model_path = runner.get_model_path_and_type(tmp_model_path) 40 | self.assertTrue(is_tflite) 41 | self.assertEquals(f"{tmp_model_path}/model.tflite", model_path) 42 | 43 | mock_glob.return_value = [] 44 | is_tflite, model_path = runner.get_model_path_and_type(tmp_model_path) 45 | self.assertFalse(is_tflite) 46 | self.assertEquals(tmp_model_path, model_path) 47 | 48 | @mock.patch('tfx_addons.firebase_publisher.runner.fileio') 49 | @mock.patch('tfx_addons.firebase_publisher.runner.tf.io.gfile.GFile') 50 | def testCheckModelSize(self, mock_gfile, mock_fileio): 51 | mock_source = Mock() 52 | mock_source.as_dict.get.return_value = "mock_return" 53 | 54 | mock_gfile().__enter__.return_value.size.return_value = 83886080 55 | mock_gfile().__exit__ = Mock(return_value=False) 56 | 57 | try: 58 | runner.check_model_size(mock_source) 59 | except RuntimeError: 60 | self.fail("Runtime error occured unexpectedly") 61 | 62 | mock_fileio.remove() 63 | mock_gfile().__enter__.return_value.size.return_value = 83886081 64 | mock_gfile().__exit__ = Mock(return_value=False) 65 | with self.assertRaises(RuntimeError): 66 | runner.check_model_size(mock_source) 67 | 68 | 69 | if __name__ == "__main__": 70 | tf.test.main() 71 | -------------------------------------------------------------------------------- /tfx_addons/huggingface_pusher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/huggingface_pusher/component_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Tests for TFX HuggingFace Pusher Custom Component.""" 16 | 17 | import tensorflow as tf 18 | from tfx.types import channel_utils, standard_artifacts 19 | 20 | from tfx_addons.huggingface_pusher.component import HFPusher 21 | 22 | 23 | class HFPusherTest(tf.test.TestCase): 24 | def testConstruct(self): 25 | test_model = channel_utils.as_channel([standard_artifacts.Model()]) 26 | hf_pusher = HFPusher( 27 | username="test_username", 28 | access_token="test_access_token", 29 | repo_name="test_repo_name", 30 | model=test_model, 31 | space_config={ 32 | "repo_name": "test_repo_name", # default: same as model repo_name 33 | "app_path": "app.gradio", # or app/gradio 34 | "space_sdk": "gradio", # or streamlit, default: gradio 35 | "placeholders": { 36 | # look for $MODEL_REPO_ID, $MODEL_REPO_URL, $MODEL_VERSION 37 | # tokens in files to replace with appropriate values at runtime 38 | "MODEL_REPO_ID": "$MODEL_REPO_ID", 39 | "MODEL_REPO_URL": "$MODEL_REPO_URL", 40 | "MODEL_VERSION": "$MODEL_VERSION", 41 | } 42 | }) 43 | 44 | self.assertEqual( 45 | standard_artifacts.PushedModel.TYPE_NAME, 46 | hf_pusher.outputs["pushed_model"].type_name, 47 | ) 48 | 49 | 50 | if __name__ == "__main__": 51 | tf.test.main() 52 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Current Version (Still in Development) 2 | ### Last Update: 19 March 2022 3 | 4 | 22 | 23 | # Version 1.0.0 (Initial release) 24 | 25 | ## Major Features and Improvements 26 | 27 | * Initial component release. The exit handler component supports two messaging providers: 28 | * Slack 29 | * Logging 30 | * Contributions to support more messaging providers (e.g. Twilio, Microsoft Teams) are welcomed and encouraged. 31 | * Initial release only supports Google Cloud Vertex deployments. 32 | * Initial documentation 33 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/component.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ Message Exit Handler component """ 16 | 17 | import json 18 | 19 | from absl import logging 20 | from kfp.pipeline_spec import pipeline_spec_pb2 21 | from tfx import v1 as tfx 22 | from tfx.utils import proto_utils 23 | 24 | from tfx_addons.message_exit_handler import constants 25 | from tfx_addons.message_exit_handler.message_providers.base_provider import \ 26 | MessagingType 27 | from tfx_addons.message_exit_handler.message_providers.logging_provider import \ 28 | LoggingMessageProvider 29 | from tfx_addons.message_exit_handler.message_providers.slack_provider import \ 30 | SlackMessageProvider 31 | 32 | 33 | @tfx.orchestration.experimental.exit_handler 34 | def MessageExitHandler( 35 | final_status: tfx.dsl.components.Parameter[str], 36 | on_failure_only: tfx.dsl.components.Parameter[bool] = False, 37 | message_type: tfx.dsl.components.Parameter[str] = MessagingType.LOGGING. 38 | value, 39 | slack_credentials: tfx.dsl.components.Parameter[str] = None, 40 | decrypt_fn: tfx.dsl.components.Parameter[str] = None, 41 | ): 42 | """ 43 | Exit handler component for TFX pipelines originally developed by 44 | Digits Financial, Inc. 45 | The handler notifies the user of the final pipeline status via Slack. 46 | 47 | Args: 48 | final_status: The final status of the pipeline. 49 | slack_credentials: (Optional) The credentials to use for the 50 | Slack API calls, json format. 51 | on_failure_only: (Optional) Whether to notify only on failure. 52 | False is the default. 53 | message_type: (Optional) The type of message to send. 54 | Logging is the default. 
55 | decrypt_fn: (Optional) The function to use to decrypt the credentials, 56 | 'tfx_addons.message_exit_handler.component_tests.fake_decryption_fn' 57 | 58 | """ 59 | 60 | # parse the final status 61 | pipeline_task_status = pipeline_spec_pb2.PipelineTaskFinalStatus() 62 | proto_utils.json_to_proto(final_status, pipeline_task_status) 63 | logging.debug(f"MessageExitHandler: {final_status}") 64 | status = json.loads(final_status) 65 | 66 | # leave the exit handler if pipeline succeeded and on_failure_only is True 67 | if on_failure_only and status["state"] == constants.SUCCESS_STATUS: 68 | logging.info("MessageExitHandler: Skipping notification on success.") 69 | return 70 | 71 | # create the message provider 72 | if message_type == MessagingType.SLACK.value: 73 | # parse slack credentials 74 | if not slack_credentials: 75 | raise ValueError("Slack credentials not provided.") 76 | provider = SlackMessageProvider(status=status, 77 | credentials=slack_credentials, 78 | decrypt_fn=decrypt_fn) 79 | elif message_type == MessagingType.LOGGING.value: 80 | provider = LoggingMessageProvider(status=status) 81 | else: 82 | raise ValueError( 83 | f"MessageExitHandler: Unknown message type: {message_type}") 84 | 85 | provider.send_message() 86 | message = provider.get_message() 87 | logging.info(f"MessageExitHandler: {message}") 88 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ Constants for the Message Exit Handler """ 16 | 17 | SUCCESS_STATUS = "SUCCEEDED" 18 | FAILURE_STATUS = "FAILED" 19 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/message_providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/message_providers/base_provider.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Message Providers supported by the Message Exit Handler component. 16 | 17 | Currently supported: 18 | * Logging 19 | * Slack 20 | 21 | """ 22 | 23 | import enum 24 | from typing import Dict, Text 25 | 26 | from tfx_addons.message_exit_handler import constants 27 | 28 | 29 | class MessagingType(enum.Enum): 30 | """Determines the type of message to send.""" 31 | 32 | LOGGING = "logging" 33 | SLACK = "slack" 34 | 35 | 36 | class BaseProvider: 37 | """Message provider interface.""" 38 | def __init__(self, status: Dict) -> None: 39 | self._status = status 40 | self._message = self.set_message(status) 41 | 42 | @staticmethod 43 | def set_message(status) -> str: 44 | """Set the message to be sent.""" 45 | # parse the Vertex paths 46 | # structure: projects/{project}/locations/{location}/pipelineJobs/{pipeline_job} 47 | elements = status["pipelineJobResourceName"].split("/") 48 | project = elements[1] 49 | location = elements[3] 50 | job_id = elements[-1] 51 | 52 | # Generate message 53 | if status["state"] == constants.SUCCESS_STATUS: 54 | message = ( 55 | ":tada: " 56 | f"Pipeline job *{job_id}* ({project}) completed successfully.\n") 57 | else: 58 | message = f":scream: Pipeline job *{job_id}* ({project}) failed." 59 | message += f"\n>{status['error']['message']}" 60 | 61 | message += f"\nhttps://console.cloud.google.com/vertex-ai/locations/{location}/pipelines/runs/{job_id}" 62 | return message 63 | 64 | def get_message(self) -> Text: 65 | """Get the message to be sent.""" 66 | return self._message 67 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/message_providers/base_provider_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Tests for Base Provider functions.""" 16 | 17 | import tensorflow as tf 18 | 19 | from tfx_addons.message_exit_handler import constants 20 | from tfx_addons.message_exit_handler.message_providers import base_provider 21 | 22 | SUCCESS_MESSAGE = """:tada: Pipeline job *test-pipeline-job* (test-project) completed successfully. 23 | 24 | https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/test-pipeline-job""" 25 | 26 | FAILURE_MESSAGE = """:scream: Pipeline job *test-pipeline-job* (test-project) failed. 27 | >test error 28 | https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/test-pipeline-job""" 29 | 30 | 31 | class MessageProviderTest(tf.test.TestCase): 32 | @staticmethod 33 | def get_final_status(state: str = constants.SUCCESS_STATUS, 34 | error: str = "") -> str: 35 | """Assemble final status for tests""" 36 | status = { 37 | "state": 38 | state, 39 | "error": 40 | error, 41 | "pipelineJobResourceName": 42 | ("projects/test-project/locations/" 43 | "us-central1/pipelineJobs/test-pipeline-job"), 44 | } 45 | if error: 46 | status.update({"error": {"message": error}}) 47 | return status 48 | 49 | def test_message_provider_success(self): 50 | final_status = self.get_final_status() 51 | test_provider = base_provider.BaseProvider(final_status) 52 | self.assertEqual(SUCCESS_MESSAGE, test_provider.get_message()) 53 | 54 | def test_message_provider_failure(self): 55 | final_status = self.get_final_status(state=constants.FAILURE_STATUS, 56 | error="test error") 57 | test_provider = base_provider.BaseProvider(final_status) 58 | self.assertEqual(FAILURE_MESSAGE, test_provider.get_message()) 59 | 60 | 61 | if __name__ == "__main__": 62 | tf.test.main() 63 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/message_providers/logging_provider.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ Message provider interface for logging messages. 
""" 16 | 17 | from typing import Dict, Optional 18 | 19 | from absl import logging 20 | 21 | from tfx_addons.message_exit_handler.message_providers.base_provider import \ 22 | BaseProvider 23 | 24 | 25 | class LoggingMessageProvider(BaseProvider): 26 | """Logging message provider.""" 27 | def __init__( 28 | self, 29 | status: Dict, 30 | log_level: Optional[int] = logging.INFO, 31 | ) -> None: 32 | super().__init__(status=status) 33 | self._log_level = log_level 34 | 35 | def send_message(self) -> None: 36 | logging.log(self._log_level, f"MessageExitHandler: {self._message}") 37 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/message_providers/logging_provider_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Tests for Logging Provider functions.""" 16 | 17 | import tensorflow as tf 18 | 19 | from tfx_addons.message_exit_handler import constants 20 | from tfx_addons.message_exit_handler.message_providers import logging_provider 21 | 22 | SUCCESS_MESSAGE = """:tada: Pipeline job *test-pipeline-job* (test-project) completed successfully. 23 | 24 | https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/test-pipeline-job""" 25 | 26 | FAILURE_MESSAGE = """:scream: Pipeline job *test-pipeline-job* (test-project) failed. 
27 | >test error 28 | https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/test-pipeline-job""" 29 | 30 | 31 | class LoggingProviderTest(tf.test.TestCase): 32 | @staticmethod 33 | def get_final_status(state: str = constants.SUCCESS_STATUS, 34 | error: str = "") -> str: 35 | """Assemble final status for tests""" 36 | status = { 37 | "state": 38 | state, 39 | "error": 40 | error, 41 | "pipelineJobResourceName": 42 | ("projects/test-project/locations/" 43 | "us-central1/pipelineJobs/test-pipeline-job"), 44 | } 45 | if error: 46 | status.update({"error": {"message": error}}) 47 | return status 48 | 49 | def test_logging_message_provider_success(self): 50 | final_status = self.get_final_status() 51 | with self.assertLogs(level="INFO") as logs: 52 | message_provider = logging_provider.LoggingMessageProvider(final_status) 53 | message_provider.send_message() 54 | self.assertLen(logs.output, 1) 55 | self.assertEqual( 56 | "INFO:absl:MessageExitHandler: " + SUCCESS_MESSAGE, 57 | logs.output[0], 58 | ) 59 | 60 | def test_logging_message_provider_failure(self): 61 | final_status = self.get_final_status(state=constants.FAILURE_STATUS, 62 | error="test error") 63 | with self.assertLogs(level="INFO") as logs: 64 | message_provider = logging_provider.LoggingMessageProvider(final_status) 65 | message_provider.send_message() 66 | self.assertLen(logs.output, 1) 67 | self.assertEqual( 68 | "INFO:absl:MessageExitHandler: " + FAILURE_MESSAGE, 69 | logs.output[0], 70 | ) 71 | 72 | 73 | if __name__ == "__main__": 74 | tf.test.main() 75 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/message_providers/slack_provider.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ Message provider interface for slack messages. 
""" 16 | 17 | from typing import Dict, Optional 18 | 19 | from absl import logging 20 | from pydantic import BaseModel 21 | from slack import WebClient 22 | from slack.errors import SlackApiError 23 | from tfx.utils import import_utils 24 | 25 | from tfx_addons.message_exit_handler.message_providers.base_provider import \ 26 | BaseProvider 27 | 28 | 29 | class SlackCredentials(BaseModel): 30 | """Pydantic class to de/serialize the slack credentials.""" 31 | slack_token: str 32 | slack_channel_id: str 33 | 34 | 35 | class SlackMessageProvider(BaseProvider): 36 | """Slack message provider.""" 37 | def __init__(self, 38 | status: Dict, 39 | credentials: str, 40 | decrypt_fn: Optional[str] = None) -> None: 41 | super().__init__(status=status) 42 | 43 | if not credentials: 44 | raise ValueError("Slack credentials not provided.") 45 | 46 | credentials = SlackCredentials.parse_raw(credentials) 47 | self._slack_channel_id = credentials.slack_channel_id 48 | self._slack_token = credentials.slack_token 49 | 50 | if decrypt_fn: 51 | module_path, fn_name = decrypt_fn.rsplit(".", 1) 52 | logging.info( 53 | f"MessageExitHandler: Importing {fn_name} from {module_path} " 54 | "to decrypt credentials.") 55 | fn = import_utils.import_func_from_module(module_path, fn_name) 56 | self._slack_channel_id = fn(self._slack_channel_id) 57 | self._slack_token = fn(self._slack_token) 58 | 59 | self._client = WebClient(token=self._slack_token) 60 | 61 | def send_message(self) -> None: 62 | try: 63 | response = self._client.chat_postMessage(channel=self._slack_channel_id, 64 | text=self._message) 65 | logging.info(f"MessageExitHandler: Slack response: {response}") 66 | except SlackApiError as e: 67 | logging.error( 68 | f"MessageExitHandler: Slack API error: {e.response['error']}") 69 | -------------------------------------------------------------------------------- /tfx_addons/message_exit_handler/message_providers/slack_provider_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Tests for Slack Provider functions.""" 16 | 17 | from unittest.mock import patch 18 | 19 | import tensorflow as tf 20 | 21 | from tfx_addons.message_exit_handler import constants 22 | from tfx_addons.message_exit_handler.message_providers import slack_provider 23 | 24 | 25 | class SlackProviderTest(tf.test.TestCase): 26 | @staticmethod 27 | def get_final_status(state: str = constants.SUCCESS_STATUS, 28 | error: str = "") -> str: 29 | """Assemble final status for tests""" 30 | status = { 31 | "state": 32 | state, 33 | "error": 34 | error, 35 | "pipelineJobResourceName": 36 | ("projects/test-project/locations/" 37 | "us-central1/pipelineJobs/test-pipeline-job"), 38 | } 39 | if error: 40 | status.update({"error": {"message": error}}) 41 | return status 42 | 43 | @patch( 44 | 'tfx_addons.message_exit_handler.message_providers.slack_provider.WebClient' 45 | ) 46 | def test_slack_message_provider(self, web_client_mock): 47 | final_status = self.get_final_status() 48 | credentials = slack_provider.SlackCredentials( 49 | slack_token="test-token", slack_channel_id="test-channel").json() 50 | 51 | message_provider = slack_provider.SlackMessageProvider( 52 | final_status, credentials) 53 | message_provider.send_message() 54 | web_client_mock.assert_called_once() 55 | web_client_mock.assert_called_with(token='test-token') 56 | 57 | @patch( 58 | 'tfx_addons.message_exit_handler.message_providers.slack_provider.WebClient' 59 | ) 60 | def test_slack_message_provider_with_decrypt_fn(self, mock_web_client): 61 | final_status = self.get_final_status() 62 | credentials = slack_provider.SlackCredentials( 63 | slack_token="test-token", slack_channel_id="test-channel").json() 64 | 65 | message_provider = slack_provider.SlackMessageProvider( 66 | final_status, 67 | credentials, 68 | decrypt_fn= 69 | 'tfx_addons.message_exit_handler.component_test.fake_decryption_fn') 70 | message_provider.send_message() 71 | mock_web_client.assert_called_once() 72 | mock_web_client.assert_called_with(token='TEST-TOKEN') 73 | 74 | 75 | if __name__ == "__main__": 76 | tf.test.main() 77 | -------------------------------------------------------------------------------- /tfx_addons/mlmd_client/README.md: -------------------------------------------------------------------------------- 1 | # TFX MLMD Client Library 2 | 3 | ## Project Description 4 | 5 | Client library to inspect content in ML Metadata populated by TFX pipelines. Library will be written in Python and distributed through PyPi. 6 | Given metadata connection information, it should provide easy to use methods to introspect the Metadata DB. 
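A minimal sketch of the intended usage, based on `client_test.py` in this package (it assumes `pipeline` is a TFX `Pipeline` that has already been run with a metadata connection configured):

```python
from tfx_addons.mlmd_client import MetadataClient

# `pipeline` is a tfx.orchestration.pipeline.Pipeline that has already run,
# so its metadata store contains the recorded artifacts.
pipeline_context = MetadataClient.from_pipeline(pipeline)

# Introspect artifacts recorded for this pipeline, e.g. by artifact type name.
string_artifacts = pipeline_context.get_artifact_by_type_name("String")
```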
7 | 8 | Idea from [#12](https://github.com/tensorflow/tfx-addons/issues/12) 9 | 10 | **Status**: Paused 11 | -------------------------------------------------------------------------------- /tfx_addons/mlmd_client/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Current Version (Still in Development) 2 | 3 | ### Last Update: 15 September 2021 4 | 5 | ## Major Features and Improvements 6 | 7 | * None at this time 8 | 9 | ## Breaking Changes 10 | 11 | * None at this time 12 | 13 | ## Deprecations 14 | 15 | * None at this time 16 | 17 | ## Bug Fixes and Other Changes 18 | 19 | * None at this time 20 | 21 | ## Documentation Updates 22 | 23 | * None at this time -------------------------------------------------------------------------------- /tfx_addons/mlmd_client/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """MLMDClient module""" 16 | __all__ = ["MetadataClient"] 17 | from .client import MetadataClient 18 | -------------------------------------------------------------------------------- /tfx_addons/mlmd_client/client_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Tests for tfx_addons.mlmd_client.client.""" 16 | import os 17 | 18 | from ml_metadata.proto import metadata_store_pb2 19 | from tfx.dsl.component.experimental.annotations import (OutputArtifact, 20 | Parameter) 21 | from tfx.dsl.component.experimental.decorators import component 22 | from tfx.orchestration.local.local_dag_runner import LocalDagRunner 23 | from tfx.orchestration.pipeline import Pipeline 24 | from tfx.types.standard_artifacts import String 25 | 26 | from tfx_addons.mlmd_client import client 27 | 28 | 29 | @component 30 | def print_component(word: Parameter[str], word_out: OutputArtifact[String]): 31 | print(word) 32 | word_out.value = word 33 | 34 | 35 | def _create_pipeline(root_dir: str): 36 | comp = print_component(word="test") 37 | connection_config = metadata_store_pb2.ConnectionConfig() 38 | connection_config.sqlite.filename_uri = os.path.join(root_dir, "db.sqlite") 39 | connection_config.sqlite.connection_mode = 3 # READWRITE_OPENCREATE 40 | return Pipeline( 41 | pipeline_root=root_dir, 42 | pipeline_name="client_test", 43 | metadata_connection_config=connection_config, 44 | components=[comp], 45 | ) 46 | 47 | 48 | def test_pipeline_exists(tmpdir): 49 | pipeline = _create_pipeline(tmpdir.mkdir("test").strpath) 50 | LocalDagRunner().run(pipeline) 51 | p = client.MetadataClient.from_pipeline(pipeline) 52 | assert isinstance(p, client.PipelineContext) 53 | 54 | 55 | def test_get_artifacts(tmpdir): 56 | pipeline = _create_pipeline(tmpdir.mkdir("test").strpath) 57 | LocalDagRunner().run(pipeline) 58 | p = client.MetadataClient.from_pipeline(pipeline) 59 | assert isinstance(p, client.PipelineContext) 60 | assert len(p.get_artifact_by_type_name('String')) == 1 61 | -------------------------------------------------------------------------------- /tfx_addons/model_card_generator/README.md: -------------------------------------------------------------------------------- 1 | # TFX Model Card Generator 2 | 3 | Idea from [#82](https://github.com/tensorflow/tfx-addons/issues/82) 4 | 5 | **Status**: Active 6 | 7 | Created by @shuklak13 8 | 9 | The ModelCardGenerator TFX pipeline component generates model cards. 10 | 11 | For the detailed model card format, see the 12 | [Model Card API](https://www.tensorflow.org/responsible_ai/model_card_toolkit/api_docs/python/model_card_toolkit/ModelCard). 13 | 14 | For more general information about TFX, please see the 15 | [TFX User Guide](https://www.tensorflow.org/tfx/guide). 16 | 17 | ## Configuring the ModelCardGenerator Component 18 | 19 | The ModelCardGenerator takes 20 | [dataset statistics](https://www.tensorflow.org/tfx/guide/statsgen), 21 | [model evaluation](https://www.tensorflow.org/tfx/guide/evaluator), and a 22 | [pushed model](https://www.tensorflow.org/tfx/guide/pusher) to automatically 23 | populate parts of a model card. 24 | 25 | [Model card fields](https://www.tensorflow.org/responsible_ai/model_card_toolkit/api_docs/python/model_card_toolkit/ModelCard) 26 | can also be explicitly populated with a JSON string (this can be generated using 27 | the [`json`](https://docs.python.org/3/library/json.html) module, see Example 28 | below). If a field is populated both by TFX and JSON, the JSON value will 29 | overwrite the TFX value. 30 | 31 | The ModelCardGenerator writes model card documents to the `model_card/` 32 | directory of its artifact output.
It uses a default HTML model card template, 33 | which is used to generate `model_card.html`. Custom 34 | [templates](https://www.tensorflow.org/responsible_ai/model_card_toolkit/guide/templates) 35 | can also be used; each template input must be accompanied by a file name output 36 | in the `template_io` arg. 37 | 38 | ### Example 39 | 40 | ```py 41 | import json 42 | 43 | from tfx_addons.model_card_generator.component import ModelCardGenerator 44 | 45 | ... 46 | model_card_fields = { 47 | 'model_details': { 48 | 'name': 'my_model', 49 | 'owners': 'Google', 50 | 'version': 'v0.1' 51 | }, 52 | 'considerations': { 53 | 'limitations': 'This is a demo model.' 54 | } 55 | } 56 | mc_gen = ModelCardGenerator( 57 | statistics=statistics_gen.outputs['statistics'], 58 | evaluation=evaluator.outputs['evaluation'], 59 | pushed_model=pusher.outputs['pushed_model'], 60 | json=json.dumps(model_card_fields), 61 | template_io=[ 62 | ('html/default_template.html.jinja', 'model_card.html'), 63 | ('md/default_template.md.jinja', 'model_card.md') 64 | ] 65 | ) 66 | ``` 67 | 68 | More details are available in the 69 | [ModelCardGenerator](https://www.tensorflow.org/responsible_ai/model_card_toolkit/api_docs/python/model_card_toolkit/ModelCardGenerator) 70 | API reference. 71 | 72 | See our 73 | [end-to-end demo](https://www.tensorflow.org/responsible_ai/model_card_toolkit/examples/MLMD_Model_Card_Toolkit_Demo) 74 | for a full working example. 75 | -------------------------------------------------------------------------------- /tfx_addons/model_card_generator/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Current Version 2 | 3 | ### Last Update: 3 April 2023 4 | 5 | ## Major Features and Improvements 6 | 7 | * Model Card Generator for TFX pipelines using the `model-card-toolkit` 8 | 9 | ## Breaking Changes 10 | 11 | * None at this time 12 | 13 | ## Deprecations 14 | 15 | * None at this time 16 | 17 | ## Bug Fixes and Other Changes 18 | 19 | * None at this time 20 | 21 | ## Documentation Updates 22 | 23 | * None at this time 24 | -------------------------------------------------------------------------------- /tfx_addons/model_card_generator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Model Card Generator""" 16 | -------------------------------------------------------------------------------- /tfx_addons/model_card_generator/artifact.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """The ModelCard TFX/MLMD artifact.""" 16 | 17 | import datetime 18 | 19 | import ml_metadata as mlmd 20 | from absl import logging 21 | from ml_metadata import errors 22 | from ml_metadata.proto import metadata_store_pb2 23 | from tfx.types.artifact import Artifact 24 | from tfx.types.system_artifacts import Metrics 25 | 26 | 27 | class ModelCard(Artifact): 28 | """A [TFX/MLMD artifact](https://www.tensorflow.org/tfx/guide/mlmd#data_model) to model card assets. 29 | 30 | Assets include: 31 | * a data file containing the model card fields, located at 32 | `/data/model_card.proto`. 33 | * the model card itself, located at the `/model_card/ directory`. 34 | """ 35 | TYPE_NAME = 'ModelCard' 36 | TYPE_ANNOTATION = Metrics 37 | 38 | 39 | def create_and_save_artifact( 40 | artifact_name: str, artifact_uri: str, 41 | store: mlmd.MetadataStore) -> metadata_store_pb2.Artifact: 42 | """Generates and saves a ModelCard artifact to the specified MetadataStore. 43 | 44 | Args: 45 | artifact_name: The name for the ModelCard artifact. A timestamp will be 46 | appended to this to distinguish model cards created from the same job. 47 | artifact_uri: The uri for the ModelCard artifact. 48 | store: The MetadataStore where the ModelCard artifact and artifact type are 49 | saved. 50 | 51 | Returns: 52 | The saved artifact, which can be used to store model card assets. 53 | """ 54 | 55 | try: 56 | type_id = store.get_artifact_type(ModelCard.TYPE_NAME).id 57 | except errors.NotFoundError: 58 | type_id = store.put_artifact_type( 59 | metadata_store_pb2.ArtifactType(name=ModelCard.TYPE_NAME)) 60 | name = ''.join( 61 | [artifact_name, '_', 62 | datetime.datetime.now().strftime('%H:%M:%S')]) 63 | 64 | # Save artifact to store. Also populates the artifact's id. 65 | artifact_id = store.put_artifacts([ 66 | metadata_store_pb2.Artifact(type=ModelCard.TYPE_NAME, 67 | type_id=type_id, 68 | uri=artifact_uri, 69 | name=name) 70 | ])[0] 71 | artifact = store.get_artifacts_by_id([artifact_id])[0] 72 | logging.info( 73 | 'Successfully saved ModelCard artifact %s with uri=%s and id=%s.', 74 | artifact.name, artifact.uri, artifact.id) 75 | return artifact 76 | -------------------------------------------------------------------------------- /tfx_addons/model_card_generator/artifact_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Tests for artifact.""" 16 | 17 | import ml_metadata as mlmd 18 | from absl.testing import absltest 19 | from ml_metadata.proto import metadata_store_pb2 20 | 21 | from tfx_addons.model_card_generator import artifact 22 | 23 | 24 | class ArtifactTest(absltest.TestCase): 25 | def setUp(self): 26 | super(ArtifactTest, self).setUp() 27 | connection_config = metadata_store_pb2.ConnectionConfig() 28 | connection_config.fake_database.SetInParent() 29 | self.store = mlmd.MetadataStore(connection_config) 30 | 31 | def test_create_and_save_artifact(self): 32 | mc_artifact = artifact.create_and_save_artifact( 33 | artifact_name='my model', 34 | artifact_uri='/path/to/model/card/assets', 35 | store=self.store) 36 | 37 | with self.subTest('saved_to_mlmd'): 38 | self.assertCountEqual([mc_artifact], 39 | self.store.get_artifacts_by_id([mc_artifact.id])) 40 | with self.subTest('properties'): 41 | with self.subTest('type_id'): 42 | self.assertEqual(mc_artifact.type_id, 43 | self.store.get_artifact_type('ModelCard').id) 44 | with self.subTest('uri'): 45 | self.assertEqual(mc_artifact.uri, '/path/to/model/card/assets') 46 | with self.subTest('name'): 47 | self.assertStartsWith(mc_artifact.name, 'my model_') 48 | 49 | 50 | if __name__ == '__main__': 51 | absltest.main() 52 | -------------------------------------------------------------------------------- /tfx_addons/pandas_transform/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Current Version - 1.0.0 2 | 3 | ### Last Update: 12 August 2022 4 | 5 | ## Major Features and Improvements 6 | 7 | * Implements core functionality using native Pandas and Apache Beam 8 | * Does not implement a full pass over the dataset to gather statistics. Statistics are captured by StatisticsGen. 9 | 10 | ## Breaking Changes 11 | 12 | * None at this time 13 | 14 | ## Deprecations 15 | 16 | * None at this time 17 | 18 | ## Bug Fixes and Other Changes 19 | 20 | * None at this time 21 | 22 | ## Documentation Updates 23 | 24 | * None at this time -------------------------------------------------------------------------------- /tfx_addons/pandas_transform/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Init module for PandasTransform""" 16 | 17 | from tfx_addons.pandas_transform.component import PandasTransform 18 | 19 | __version__ = '1.0.0' 20 | -------------------------------------------------------------------------------- /tfx_addons/pandas_transform/null_preprocessing.py: -------------------------------------------------------------------------------- 1 | """ Null preprocessing, for minimal testing """ 2 | from absl import logging 3 | 4 | 5 | # pylint: disable=unused-argument 6 | def preprocessing_fn(df, schema, statistics): 7 | logging.info('Running null preprocessing') 8 | return df 9 | -------------------------------------------------------------------------------- /tfx_addons/predictions_to_bigquery/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/predictions_to_bigquery/test_component.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | # This code was originally written by Hannes Hapke (Digits Financial Inc.) 16 | # on Feb. 6, 2023. 17 | """ 18 | Tests around Digits Prediction-to-BigQuery component. 19 | """ 20 | 21 | import tensorflow as tf 22 | from tfx.types import channel_utils, standard_artifacts 23 | 24 | from . 
import component 25 | 26 | 27 | class ComponentTest(tf.test.TestCase): 28 | def setUp(self): 29 | super(ComponentTest, self).setUp() 30 | self._transform_graph = channel_utils.as_channel( 31 | [standard_artifacts.TransformGraph()]) 32 | self._inference_results = channel_utils.as_channel( 33 | [standard_artifacts.InferenceResult()]) 34 | self._schema = channel_utils.as_channel([standard_artifacts.Schema()]) 35 | 36 | def testConstruct(self): 37 | # not a real test, just checking if the component can be 38 | # instantiated 39 | _ = component.AnnotateUnlabeledCategoryDataComponent( 40 | transform_graph=self._transform_graph, 41 | inference_results=self._inference_results, 42 | schema=self._schema, 43 | bq_table_name="gcp_project:bq_database.table", 44 | vocab_label_file="vocab_txt", 45 | filter_threshold=0.1, 46 | table_suffix="%Y", 47 | table_partitioning=False, 48 | ) 49 | 50 | 51 | if __name__ == "__main__": 52 | tf.test.main() 53 | -------------------------------------------------------------------------------- /tfx_addons/sampling/README.md: -------------------------------------------------------------------------------- 1 | # Sampler component 2 | 3 | A TFX component to sample examples. 4 | 5 | The sampling component wraps an Apache Beam pipeline to process 6 | data in a TFX pipeline. This component loads TFRecord files from 7 | an earlier example artifact, processes the 'train' split by default, 8 | samples the split by a given label's classes, and stores the new 9 | set of sampled examples in its own example artifact in 10 | TFRecord format. 11 | 12 | The sampling is a probabilistic approximation rather than an exact 13 | resampling, so on small datasets the resulting class balance may 14 | deviate noticeably from the target. 15 | 16 | By default, the component will ignore all examples with a null value 17 | (more precisely, a value that evaluates to False) for the given label, 18 | although more values can be added as necessary. Additionally, it will 19 | copy all non-'train' splits, though this behavior can be changed as well. 20 | The component will save the examples in a user-specified number of files, 21 | and it can be given a custom name as well. 22 | 23 | ## Example usage 24 | ``` 25 | import tfx_addons as tfxa 26 | 27 | under = tfxa.sampling.Sampler( 28 | examples=example_gen.outputs['examples'], 29 | sampling_strategy=tfxa.sampling.SamplingStrategy.UNDERSAMPLE 30 | ) 31 | ``` 32 | 33 | Component `outputs` contains: 34 | - `sampled_examples`: Channel of type `standard_artifacts.Examples` for 35 | materialized sampled examples, based on the input splits, which includes 36 | copied splits unless otherwise specified by `copy_others`. 37 | 38 | [Initial Proposal](https://github.com/tensorflow/tfx-addons/blob/main/proposals/20210721-sampling_component.md) 39 | -------------------------------------------------------------------------------- /tfx_addons/sampling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Sampling component""" 16 | __all__ = ["Sampler", "SamplingStrategy"] 17 | 18 | from .component import Sampler 19 | from .spec import SamplingStrategy 20 | -------------------------------------------------------------------------------- /tfx_addons/sampling/component_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Component test for the sampling component.""" 16 | 17 | import tensorflow as tf 18 | from absl.testing import absltest 19 | from tfx.types import artifact_utils, channel_utils, standard_artifacts 20 | from tfx.utils import json_utils 21 | 22 | from tfx_addons.sampling import component, spec 23 | 24 | 25 | class ComponentTest(absltest.TestCase): 26 | def testConstruct(self): 27 | examples = standard_artifacts.Examples() 28 | examples.split_names = artifact_utils.encode_split_names(['train', 'eval']) 29 | params = { 30 | spec.SAMPLER_INPUT_KEY: channel_utils.as_channel([examples]), 31 | spec.SAMPLER_SPLIT_KEY: ['train'], 32 | spec.SAMPLER_LABEL_KEY: 'label' 33 | } 34 | 35 | under = component.Sampler(**params) 36 | 37 | self.assertEqual(standard_artifacts.Examples.TYPE_NAME, 38 | under.outputs[spec.SAMPLER_OUTPUT_KEY].type_name) 39 | self.assertEqual(under.spec.exec_properties[spec.SAMPLER_SPLIT_KEY], 40 | json_utils.dumps(['train'])) 41 | self.assertEqual(under.spec.exec_properties[spec.SAMPLER_LABEL_KEY], 42 | 'label') 43 | 44 | def testConstructWithOptions(self): 45 | examples = standard_artifacts.Examples() 46 | examples.split_names = artifact_utils.encode_split_names(['train', 'eval']) 47 | params = { 48 | spec.SAMPLER_INPUT_KEY: channel_utils.as_channel([examples]), 49 | spec.SAMPLER_LABEL_KEY: 'test_label', 50 | spec.SAMPLER_SPLIT_KEY: ['train', 'eval'], 51 | spec.SAMPLER_COPY_KEY: False, 52 | spec.SAMPLER_SHARDS_KEY: 10, 53 | spec.SAMPLER_CLASSES_KEY: ['label'] 54 | } 55 | 56 | under = component.Sampler(**params) 57 | 58 | self.assertEqual(standard_artifacts.Examples.TYPE_NAME, 59 | under.outputs[spec.SAMPLER_OUTPUT_KEY].type_name) 60 | self.assertEqual(under.spec.exec_properties[spec.SAMPLER_LABEL_KEY], 61 | 'test_label') 62 | self.assertEqual(under.spec.exec_properties[spec.SAMPLER_SPLIT_KEY], 63 | json_utils.dumps(['train', 'eval'])) 64 | 
self.assertEqual(under.spec.exec_properties[spec.SAMPLER_COPY_KEY], False) 65 | self.assertEqual(under.spec.exec_properties[spec.SAMPLER_SHARDS_KEY], 10) 66 | self.assertEqual(under.spec.exec_properties[spec.SAMPLER_CLASSES_KEY], 67 | json_utils.dumps(['label'])) 68 | 69 | 70 | if __name__ == '__main__': 71 | tf.test.main() 72 | -------------------------------------------------------------------------------- /tfx_addons/sampling/data/example_gen/Split-eval/data_tfrecord-00000-of-00001.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/tfx-addons/724a2c095cc9aeb868b45ecf40a9c8832c94caaf/tfx_addons/sampling/data/example_gen/Split-eval/data_tfrecord-00000-of-00001.gz -------------------------------------------------------------------------------- /tfx_addons/sampling/data/example_gen/Split-train/data_tfrecord-00000-of-00001.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/tfx-addons/724a2c095cc9aeb868b45ecf40a9c8832c94caaf/tfx_addons/sampling/data/example_gen/Split-train/data_tfrecord-00000-of-00001.gz -------------------------------------------------------------------------------- /tfx_addons/sampling/data/test_data.csv: -------------------------------------------------------------------------------- 1 | label,col1 2 | ,2 3 | ,2 4 | ,2 5 | ,2 6 | ,2 7 | ,2 8 | ,2 9 | ,2 10 | ,2 11 | ,2 12 | 1,1 13 | 1,1 14 | 1,1 15 | 1,1 16 | 1,1 17 | 1,1 18 | 1,1 19 | 1,1 20 | 1,1 21 | 1,1 22 | 1,1 23 | 1,1 24 | 1,1 25 | 1,1 26 | 1,1 27 | 1,1 28 | 1,1 29 | 1,1 30 | 1,1 31 | 1,1 32 | 1,1 33 | 1,1 34 | 0,0 35 | 0,0 36 | 0,0 37 | 0,0 38 | 0,0 39 | 0,0 40 | 0,0 41 | 0,0 42 | 0,0 43 | 0,0 44 | 0,0 45 | 0,0 46 | 0,0 47 | 0,0 48 | 0,0 49 | 0,0 50 | 0,0 51 | 0,0 -------------------------------------------------------------------------------- /tfx_addons/sampling/example/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/sampling/spec.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Sampling component definition.""" 16 | 17 | import enum 18 | 19 | from tfx import types 20 | from tfx.types import standard_artifacts 21 | from tfx.types.component_spec import ChannelParameter, ExecutionParameter 22 | 23 | SAMPLER_INPUT_KEY = 'input_data' 24 | SAMPLER_OUTPUT_KEY = 'output_data' 25 | SAMPLER_LABEL_KEY = 'label' 26 | SAMPLER_SPLIT_KEY = 'splits' 27 | SAMPLER_COPY_KEY = 'copy_others' 28 | SAMPLER_SHARDS_KEY = 'shards' 29 | SAMPLER_CLASSES_KEY = 'null_classes' 30 | SAMPLER_SAMPLE_KEY = 'sampling_strategy' 31 | 32 | 33 | class SamplingStrategy(enum.IntEnum): 34 | """Determines which kind of sampling to perform.""" 35 | UNDERSAMPLE = 1 36 | OVERSAMPLE = 2 37 | 38 | 39 | class SamplerSpec(types.ComponentSpec): 40 | """Sampling component spec.""" 41 | 42 | PARAMETERS = { 43 | SAMPLER_LABEL_KEY: ExecutionParameter(type=str), 44 | SAMPLER_SPLIT_KEY: ExecutionParameter(type=str, optional=True), 45 | SAMPLER_COPY_KEY: ExecutionParameter(type=int, optional=True), 46 | SAMPLER_SHARDS_KEY: ExecutionParameter(type=int, optional=True), 47 | SAMPLER_CLASSES_KEY: ExecutionParameter(type=str, optional=True), 48 | SAMPLER_SAMPLE_KEY: ExecutionParameter(type=int, optional=True), 49 | } 50 | INPUTS = { 51 | SAMPLER_INPUT_KEY: ChannelParameter(type=standard_artifacts.Examples), 52 | } 53 | OUTPUTS = { 54 | SAMPLER_OUTPUT_KEY: ChannelParameter(type=standard_artifacts.Examples), 55 | } 56 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution Guidelines 2 | 3 | ## Directory Structure 4 | The repo contains three main directories as follows: 5 | - **[Component](./component):** Contains the main component code, with a separate file for the executor code 6 | - **[Data](./data):** Contains the sample data to be used for testing 7 | - **[Example](./example):** Contains example code to test the component with the CSVs present in [data](./data) 8 | 9 | ## A few Git and GitHub practices 10 | 11 | ### Commits 12 | Commits serve as checkpoints during your workflow and can be used to **revert** in case something goes wrong. 13 | - **When to commit:** Try not to pile up too many changes into a single commit, while also ensuring that you don't make too many commits to fix a small issue. 14 | - **Commit messages:** Commit messages should be descriptive enough for an external person to get an idea of what the commit accomplished, while ensuring they don't exceed 50 characters. 15 | 16 | Check out [this gist](https://gist.github.com/turbo/efb8d57c145e00dc38907f9526b60f17) for more information about good practices. 17 | 18 | ### Branches 19 | Branches are a good way to work on different features simultaneously. Check out [git-scm](https://git-scm.com/book/en/v2/Git-Branching-Basic-Branching-and-Merging) to learn more about the concepts involved. 20 | 21 | For descriptive branch names, it is a good idea to use the following format: 22 | **`name/keyword/short-description`** 23 | - **Name:** Name of the person/s working on the branch. This can be omitted if many people (>2) are expected to work on it. 24 | - **Keyword:** This describes what "type" of work this branch is supposed to do.
These are typically named as: 25 | - `feature`: Adding/expanding a feature 26 | - `base`: Adding boilerplate/readme/templates etc. 27 | - `bug`: Fixes a bug 28 | - `junk`: Throwaway branch created to experiment 29 | - **Short description:** As the name suggests, this contains a short description of the branch, usually no longer than 2-3 words separated by a hyphen (`-`). 30 | 31 | P.S. If multiple branches are being used to work on the same issue (say issue `#n`), they can be named as `name/keyword/#n-short-description` 32 | 33 | ### Issues 34 | The following points should be considered while creating new issues: 35 | - Use relevant labels like `bug`, `feature` etc. 36 | - If the team has decided who will work on an issue, it should be **assigned** to that person as soon as possible to prevent the same work being done twice. 37 | - The issue should be linked in the **project** if needed, and its status should be kept up to date as the work progresses. 38 | 39 | ### Pull Requests 40 | It is always a good idea to ensure the following are present in your Pull Request description: 41 | - Relevant issue/s 42 | - What it accomplishes 43 | - Mention `[WIP]` in the title and make it a `Draft Pull Request` if it is a work in progress 44 | - Once the pull request is final, it should be **requested for review** from the concerned people 45 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/README.md: -------------------------------------------------------------------------------- 1 | # Schema Curation Custom Component 2 | 3 | [![Python](https://img.shields.io/pypi/pyversions/tfx.svg?style=plastic)](https://github.com/tensorflow/tfx) 4 | [![TensorFlow](https://img.shields.io/badge/TFX-orange)](https://www.tensorflow.org/tfx) 5 | 6 | This is a TFX component that allows its users to apply user code to a schema produced by the [SchemaGen](https://www.tensorflow.org/tfx/guide/schemagen) component and curate it based on domain knowledge. It fits seamlessly into an ML pipeline made with TFX, and allows schema manipulation based on a module file provided by the user. 7 | 8 | ## Usage 9 | ### Examples demonstrating how to use the schema curation component 10 | 11 | To run locally: [taxi_example_local.py](https://github.com/rcrowe-google/schemacomponent/blob/main/example/taxi_example_local.py) 12 | To run in Colab: [taxi_example_colab.ipynb](https://github.com/rcrowe-google/schemacomponent/blob/main/example/taxi_example_colab.ipynb) 13 | 14 | ## Documentation 15 | 16 | ### Inputs: 17 | The custom component takes as input the user *module file* and the *schema* generated by the SchemaGen component on the specified data. 18 | 19 | ### Output: 20 | On running the component, it outputs the *modified schema* based on the code provided in the module file. 21 | 22 | ## Module file 23 | 24 | ### The Schema Curation *schema_fn*: 25 | The Schema Curation component provides a way to curate the schema based on user knowledge. As a user, you only have to define a single function called `schema_fn`. In `schema_fn` you define a series of operations that manipulate the input schema to produce the required one. 26 | 27 | An example is: 28 | 29 | ``` 30 | def schema_fn(schema): 31 | """modifies the inferred schema.
32 | Args: 33 | schema: schema generated by the SchemaGen component of TFX 34 | """ 35 | # changing "tips" into an optional feature 36 | feature = tfdv.get_feature(schema, 'tips') 37 | feature.presence.min_fraction = 0.9 38 | 39 | return schema 40 | ``` 41 | 42 | ## Project Structure 43 | 44 | ### Directory Structure 45 | ``` 46 | schemacomponent 47 | ├── component 48 | │ ├── component.py 49 | │ ├── component_test.py 50 | │ ├── executor.py 51 | │ ├── executor_test.py 52 | │ ├── __init__.py 53 | ├── CONTRIBUTING.md 54 | ├── example 55 | │ ├── __init__.py 56 | │ ├── module_file.py 57 | │ ├── taxi_example_colab.ipynb 58 | │ ├── taxi_example_local.py 59 | ├── __init__.py 60 | ├── PROPOSAL.md 61 | └── README.md 62 | ``` 63 | 64 | 65 | The project follows the structure specified by the [TFX](https://www.tensorflow.org/tfx) documentation for a [TFX fully custom component](https://www.tensorflow.org/tfx/guide/custom_component). 66 | 67 | The `SchemaCurationSpec` class defines the input, output and execution parameters required by the component. 68 | 69 | The `Executor` class, a subclass of `base_executor.BaseExecutor`, defines the behavior of the component by overriding the `Do` function. 70 | 71 | Finally, the `SchemaCuration` class integrates the fully custom component into the ML pipeline. 72 | 73 | ### Unit Tests 74 | 75 | Separate unit tests are included for the component and the executor. 76 | 77 | 78 | ## Credits 79 | 80 | The Schema Curation Custom Component was made as a part of [TFX-Addons](https://github.com/tensorflow/tfx-addons/) through the [Outreachy](https://www.outreachy.org/outreachy-may-2021-internship-round/communities/tensorflow/#create-custom-components-and-tools-for-tensorflow-) program. You may view the linked Pull Request in TFX-Addons [here](https://github.com/tensorflow/tfx-addons/pull/32) and the issue [here](https://github.com/tensorflow/tfx-addons/issues/8) for relevant discussions related to the project. 81 | 82 | ## The Team: 83 | ### Mentors: 84 | - Robert Crowe 85 | - Thea Lamkin 86 | - Josh Gordon 87 | 88 | ### Interns: 89 | - [Pratishtha Abrol](https://github.com/pratishtha-abrol) **(Team Leader)** 90 | - [Fatimah Adwan](https://github.com/FatimahAdwan/) 91 | - [Kshitijaa Jaglan](https://github.com/deutranium/) 92 | - [Nirzari Gupta](https://github.com/Nirzu97)
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/component/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/component/component.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """TFX Schema Curation Custom Component 16 | """ 17 | 18 | from typing import Optional, Text, Union 19 | 20 | from tfx import types 21 | from tfx.dsl.components.base import base_component, executor_spec 22 | from tfx.orchestration import data_types 23 | from tfx.types import standard_artifacts 24 | from tfx.types.component_spec import ChannelParameter, ExecutionParameter 25 | 26 | from tfx_addons.schema_curation.component import executor 27 | 28 | 29 | class SchemaCurationSpec(types.ComponentSpec): 30 | """ComponentSpec for TFX Schema Curation Custom Component.""" 31 | 32 | PARAMETERS = { 33 | 'module_file': ExecutionParameter(type=str, optional=True), 34 | 'module_path': ExecutionParameter(type=str, optional=True), 35 | 'schema_fn': ExecutionParameter(type=str, optional=True) 36 | } 37 | INPUTS = { 38 | 'schema': 39 | ChannelParameter(type=standard_artifacts.Schema 40 | ), # Schema artifact produced by the SchemaGen component 41 | } 42 | OUTPUTS = { 43 | 'custom_schema': 44 | ChannelParameter(type=standard_artifacts.Schema 45 | ) # Curated schema produced by this component 46 | } 47 | 48 | 49 | class SchemaCuration(base_component.BaseComponent): 50 | """Custom TFX Schema Curation Component. 51 | 52 | The SchemaCuration component is used to apply user code to a schema 53 | generated by SchemaGen in order to curate the schema based on 54 | domain knowledge. 55 | 56 | Component `outputs` contains: 57 | - `custom_schema`: Channel of type `standard_artifacts.Schema` 58 | """ 59 | 60 | SPEC_CLASS = SchemaCurationSpec 61 | EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(executor.Executor) 62 | 63 | def __init__( 64 | self, 65 | schema: types.Channel, 66 | module_file: Optional[Union[Text, data_types.RuntimeParameter]] = None, 67 | module_path: Optional[Union[Text, data_types.RuntimeParameter]] = None, 68 | schema_fn: Optional[Union[Text, data_types.RuntimeParameter]] = None): 69 | """Construct a SchemaCuration component. 70 | 71 | Args: 72 | schema: A Channel of type `standard_artifacts.Schema`, typically the 73 | schema generated by the SchemaGen component of TFX. 74 | module_file: Optional path to the module file that defines the 75 | user-provided `schema_fn` used to curate the schema. 76 | """ 77 | 78 | custom_schema = types.Channel(type=standard_artifacts.Schema) 79 | 80 | spec = SchemaCurationSpec(schema=schema, 81 | custom_schema=custom_schema, 82 | module_file=module_file, 83 | module_path=module_path, 84 | schema_fn=schema_fn) 85 | super().__init__(spec=spec) 86 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/component/component_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # ============================================================================== 15 | """Tests for TFX Schema Curation Custom Component.""" 16 | 17 | import tensorflow as tf 18 | from tfx.types import channel_utils, standard_artifacts 19 | 20 | from tfx_addons.schema_curation.component import component 21 | 22 | 23 | class SchemaCurationTest(tf.test.TestCase): 24 | def testConstruct(self): 25 | schema_curation = component.SchemaCuration(schema=channel_utils.as_channel( 26 | [standard_artifacts.Schema()]), ) 27 | self.assertEqual(standard_artifacts.Schema.TYPE_NAME, 28 | schema_curation.outputs['custom_schema'].type_name) 29 | 30 | 31 | if __name__ == '__main__': 32 | tf.test.main() 33 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/example/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/example/module_file.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Supplement for chicago taxi pipeline example with specifics schema modification. 16 | This module file will be used in the custom schema curation component. 17 | """ 18 | 19 | import tensorflow_data_validation as tfdv 20 | 21 | # TFX schema curation component will call this function. 22 | 23 | 24 | def schema_fn(schema): 25 | """modifies the infered schema. 26 | Args: 27 | schema:schema generated by SchemaGen component of tfx 28 | """ 29 | #changing "tips" into optional feature 30 | feature = tfdv.get_feature(schema, 'tips') 31 | feature.presence.min_fraction = 0.9 32 | 33 | return schema 34 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/example/taxi_example_local.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Chicago taxi example using TFX schema curation custom component. 16 | base code taken from: https://github.com/tensorflow/tfx/blob/master/tfx/examples/custom_components/hello_world/example/taxi_pipeline_hello.py 17 | 18 | This example demonstrate the use of schema curation custom component. 19 | user defined function `schema_fn` defined in `module_file.py` is used 20 | to change feature `tips` from required to optional. 21 | 22 | """ 23 | 24 | import os 25 | import tempfile 26 | import urllib 27 | from typing import Text 28 | 29 | import absl 30 | import tfx 31 | from tfx.components import CsvExampleGen, SchemaGen, StatisticsGen 32 | from tfx.orchestration import metadata, pipeline 33 | from tfx.orchestration.local import local_dag_runner 34 | 35 | from tfx_addons.schema_curation.component import component 36 | 37 | # downloading data and setting up required paths 38 | _data_root = tempfile.mkdtemp(prefix='tfx-data') 39 | DATA_PATH = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/chicago_taxi_pipeline/data/simple/data.csv' 40 | _data_filepath = os.path.join(_data_root, "data.csv") 41 | urllib.request.urlretrieve(DATA_PATH, _data_filepath) 42 | 43 | _pipeline_name = 'taxi_pipeline' 44 | _tfx_root = tfx.__path__[0] 45 | _pipeline_root = os.path.join(_tfx_root, 'pipelines', _pipeline_name) 46 | _metadata_path = os.path.join(_tfx_root, 'metadata', _pipeline_name, 47 | 'metadata.db') 48 | 49 | 50 | def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text, 51 | metadata_path: Text) -> pipeline.Pipeline: 52 | """Implements the chicago taxi pipeline with TFX.""" 53 | 54 | # Brings data into the pipeline or otherwise joins/converts training data. 55 | example_gen = CsvExampleGen(input_base=data_root) 56 | 57 | # Computes statistics over data for visualization and example validation. 
58 | statistics_gen = StatisticsGen(examples=example_gen.outputs['examples']) 59 | 60 | # inferes a schema 61 | schema_gen = SchemaGen(statistics=statistics_gen.outputs['statistics'], 62 | infer_feature_shape=True) 63 | 64 | # modifies infered schema with use of udf `schema_fn` defined in module file 65 | schema_curation = component.SchemaCuration( 66 | schema=schema_gen.outputs['schema'], 67 | module_file=os.path.join('schemacomponent', 'example', 'module_file.py')) 68 | 69 | return pipeline.Pipeline( 70 | pipeline_name=pipeline_name, 71 | pipeline_root=pipeline_root, 72 | components=[example_gen, statistics_gen, schema_gen, schema_curation], 73 | enable_cache=True, 74 | metadata_connection_config=metadata.sqlite_metadata_connection_config( 75 | metadata_path)) 76 | 77 | 78 | # To run this pipeline from the python CLI: 79 | # $python taxi_pipeline_hello.py 80 | if __name__ == '__main__': 81 | absl.logging.set_verbosity(absl.logging.INFO) 82 | local_dag_runner.LocalDagRunner().run( 83 | _create_pipeline(pipeline_name=_pipeline_name, 84 | pipeline_root=_pipeline_root, 85 | data_root=_data_root, 86 | metadata_path=_metadata_path)) 87 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/test_data/module_file/module_file.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Supplement for chicago taxi pipeline example with specifics schema modification. 16 | This module file will be used in the custom schema curation component. 17 | """ 18 | 19 | import tensorflow_data_validation as tfdv 20 | 21 | # TFX schema curation component will call this function. 22 | 23 | 24 | def schema_fn(schema): 25 | """modifies the infered schema. 26 | Args: 27 | schema:schema generated by SchemaGen component of tfx 28 | """ 29 | #changing "tips" into optional feature 30 | feature = tfdv.get_feature(schema, 'tips') 31 | feature.presence.min_fraction = 0.9 32 | 33 | return schema 34 | -------------------------------------------------------------------------------- /tfx_addons/schema_curation/test_data/schema_gen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | -------------------------------------------------------------------------------- /tfx_addons/utils/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ Util functions to assist with the TFX Addons tests """ 16 | 17 | from typing import List 18 | 19 | 20 | def get_tfx_version(version: str) -> List[int]: 21 | """ 22 | Returns the TFX version as integers. 23 | """ 24 | # NB(gcasassaez): Remove suffix to avoid parsing issues 25 | version = version.split("-")[0] 26 | return tuple([int(x) for x in version.split(".")]) # pylint: disable=R1728 27 | -------------------------------------------------------------------------------- /tfx_addons/utils/test_utils_tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Tests for TFX Addons test util functions.""" 16 | 17 | import unittest 18 | 19 | from tfx_addons.utils import test_utils 20 | 21 | MESSAGE_FN_CALLED = "test_fn called" 22 | EXPECTED_WARNING_MESSAGE = ( 23 | "WARNING:absl:test_fn has been disabled due to incompatible TFX version.") 24 | 25 | 26 | def test_fn(): 27 | return MESSAGE_FN_CALLED 28 | 29 | 30 | class TestUtilTest(unittest.TestCase): 31 | def test_get_tfx_version(self): 32 | tfx_version = "1.4.0" 33 | self.assertEqual(test_utils.get_tfx_version(tfx_version), (1, 4, 0)) 34 | 35 | 36 | if __name__ == "__main__": 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /tfx_addons/version.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Define TFX Addons version information.""" 16 | 17 | # We follow Semantic Versioning (https://semver.org/) 18 | _MAJOR_VERSION = "0" 19 | _MINOR_VERSION = "7" 20 | _PATCH_VERSION = "0" 21 | 22 | # When building releases, we can update this value on the release branch to 23 | # reflect the current release candidate ('rc0', 'rc1') or, finally, the official 24 | # stable release (indicated by `_VERSION_SUFFIX = ''`). Outside the context of a 25 | # release branch, the current version is by default assumed to be a 26 | # 'development' version, labeled 'dev'. 27 | _VERSION_SUFFIX = "dev" 28 | 29 | # Example, '0.1.0-dev' 30 | __version__ = ".".join([_MAJOR_VERSION, _MINOR_VERSION, _PATCH_VERSION]) 31 | if _VERSION_SUFFIX: 32 | __version__ = "{}-{}".format(__version__, _VERSION_SUFFIX) 33 | 34 | # Required TFX version [min, max) 35 | _INCLUSIVE_MIN_TFX_VERSION = "1.6.0" 36 | _EXCLUSIVE_MAX_TFX_VERSION = "1.11.0" 37 | _TFXVERSION_CONSTRAINT = ( 38 | f">={_INCLUSIVE_MIN_TFX_VERSION},<{_EXCLUSIVE_MAX_TFX_VERSION}") 39 | _CI_MAX_CONSTRAINTS = ["tfx~=1.10.0", "tensorflow~=2.9.0"] 40 | _CI_MIN_CONSTRAINTS = [ 41 | f"tfx~={_INCLUSIVE_MIN_TFX_VERSION}", 42 | "tensorflow~=2.8.0", 43 | ] 44 | # This is a list of officially maintained projects with their dependencies. 45 | # Any project added here will be automatically picked up on release. 46 | # - Key: Project name that corresponds to folder tfx_addons.{} namespace. 47 | # - Value: Python dependencies needed for project to work. 48 | _PKG_METADATA = { 49 | # Add dependencies here for your project. Avoid using install_requires. 
50 | "mlmd_client": [ 51 | f"ml_pipelines_sdk{_TFXVERSION_CONSTRAINT}", 52 | f"ml_metadata{_TFXVERSION_CONSTRAINT}" 53 | ], 54 | "schema_curation": [ 55 | f"tfx{_TFXVERSION_CONSTRAINT}", 56 | ], 57 | "feature_selection": 58 | [f"tfx{_TFXVERSION_CONSTRAINT}", "scikit_learn>=1.0.2,<2.0.0"], 59 | "feast_examplegen": [ 60 | f"tfx{_TFXVERSION_CONSTRAINT}", 61 | # ToDo(gcasassaez): Relax this once we stop supporting python 3.7 62 | # feast>=0.23 upgrades to numpy>=1.22 which does not work on 3.7 63 | "feast>=0.21.3,<0.23.0", 64 | ], 65 | "xgboost_evaluator": [ 66 | f"tfx{_TFXVERSION_CONSTRAINT}", 67 | "xgboost>=1.0.0", 68 | ], 69 | "sampling": [f"tfx{_TFXVERSION_CONSTRAINT}", "tensorflow>=2.0.0"], 70 | "message_exit_handler": [ 71 | f"tfx{_TFXVERSION_CONSTRAINT}", 72 | "kfp>=1.8,<2.0", 73 | "slackclient>=2.9.0,<3.0", 74 | "pydantic>=1.8.0,<2.0", 75 | ], 76 | "pandas_transform": [f"tfx{_TFXVERSION_CONSTRAINT}", "pandas>=1.0.0,<2.0"], 77 | "firebase_publisher": 78 | [f"tfx{_TFXVERSION_CONSTRAINT}", "firebase-admin>=5.0.0,<6.0.0"], 79 | "huggingface_pusher": 80 | [f"tfx{_TFXVERSION_CONSTRAINT}", "huggingface-hub>=0.10.0,<1.0.0"], 81 | "model_card_generator": 82 | [f"tfx{_TFXVERSION_CONSTRAINT}", "model-card-toolkit>=2.0.0,<3.0.0"], 83 | "predictions_to_bigquery": [f"tfx{_TFXVERSION_CONSTRAINT}"], 84 | "copy_example_gen": [ 85 | f"tfx{_TFXVERSION_CONSTRAINT}", 86 | ], 87 | } 88 | -------------------------------------------------------------------------------- /tfx_addons/xgboost_evaluator/RELEASE.md: -------------------------------------------------------------------------------- 1 | # Current Version (Still in Development) 2 | 3 | ### Last Update: 15 September 2021 4 | 5 | ## Major Features and Improvements 6 | 7 | * None at this time 8 | 9 | ## Breaking Changes 10 | 11 | * None at this time 12 | 13 | ## Deprecations 14 | 15 | * None at this time 16 | 17 | ## Bug Fixes and Other Changes 18 | 19 | * None at this time 20 | 21 | ## Documentation Updates 22 | 23 | * None at this time -------------------------------------------------------------------------------- /tfx_addons/xgboost_evaluator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """XGBoost evaluator module""" 16 | __all__ = ["XGBoostEvaluator"] 17 | from .component import XGBoostEvaluator 18 | -------------------------------------------------------------------------------- /tfx_addons/xgboost_evaluator/component.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """XGBoost Evaluator component.""" 16 | 17 | from tfx import v1 as tfx 18 | 19 | from tfx_addons.xgboost_evaluator import xgboost_predict_extractor 20 | 21 | 22 | class XGBoostEvaluator(tfx.components.Evaluator): 23 | """A custom Evaluator component for XGBoost models. It behaves like the standard 24 | Evaluator, except that it supplies the custom module file containing the XGBoost prediction extractor.""" 25 | def __init__(self, **kwargs): 26 | if 'module_file' in kwargs: 27 | raise ValueError('XGBoostEvaluator does not accept a custom module_file') 28 | super().__init__(module_file=xgboost_predict_extractor.get_module_file(), 29 | **kwargs) 30 | --------------------------------------------------------------------------------
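Usage note: a minimal sketch of how the XGBoostEvaluator above could be wired into a TFX pipeline. The `example_gen` and `trainer` arguments, the 'tips' label key, and the metric choice are illustrative assumptions rather than values taken from this repository; the one repository-specific detail is that no `module_file` is passed, because the component injects its own XGBoost prediction extractor.

import tensorflow_model_analysis as tfma

from tfx_addons.xgboost_evaluator import XGBoostEvaluator


def make_xgboost_evaluator(example_gen, trainer):
  """Builds an XGBoostEvaluator wired to upstream pipeline components.

  `example_gen` and `trainer` are assumed to be components created elsewhere
  in the surrounding pipeline; the eval config below is a placeholder.
  """
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(label_key='tips')],  # hypothetical label key
      slicing_specs=[tfma.SlicingSpec()],  # overall (unsliced) metrics
      metrics_specs=[
          tfma.MetricsSpec(
              metrics=[tfma.MetricConfig(class_name='ExampleCount')])
      ])
  # No module_file argument here: XGBoostEvaluator supplies its own.
  return XGBoostEvaluator(examples=example_gen.outputs['examples'],
                          model=trainer.outputs['model'],
                          eval_config=eval_config)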