├── .bazelrc ├── .bazelversion ├── .github ├── reusable-build │ └── action.yml └── workflows │ ├── build.yml │ ├── ci-lint.yml │ ├── docs.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── BUILD ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── RELEASE.md ├── WORKSPACE ├── docker-compose.yml ├── docs ├── _toc.yaml ├── anomalies.md ├── api.md ├── custom_data_validation.md ├── get_started.md ├── images │ ├── anomaly.png │ ├── feature_stats.png │ ├── schema.png │ ├── serving_anomaly.png │ ├── skew_anomaly.png │ ├── stats.png │ ├── tf_full_color_primary_icon.svg │ ├── unbalanced.png │ ├── uniform.png │ ├── uniform_cumulative.png │ └── zero_length.png ├── index.md ├── install.md ├── javascripts │ └── mathjax.js └── stylesheets │ └── extra.css ├── google3 └── third_party │ └── py │ └── tensorflow_data_validation │ ├── build_macros.bzl │ └── opensource_only │ └── BUILD ├── mkdocs.yml ├── pyproject.toml ├── requirements-docs.txt ├── setup.py ├── tensorflow_data_validation ├── BUILD ├── __init__.py ├── anomalies │ ├── BUILD │ ├── __init__.py │ ├── bool_domain_test.cc │ ├── bool_domain_util.cc │ ├── bool_domain_util.h │ ├── custom_domain_util.cc │ ├── custom_domain_util.h │ ├── custom_domain_util_test.cc │ ├── dataset_constraints_util.cc │ ├── dataset_constraints_util.h │ ├── dataset_constraints_util_test.cc │ ├── diff_util.cc │ ├── diff_util.h │ ├── feature_statistics_validator.cc │ ├── feature_statistics_validator.h │ ├── feature_statistics_validator_test.cc │ ├── feature_util.cc │ ├── feature_util.h │ ├── feature_util_test.cc │ ├── features_needed.cc │ ├── features_needed.h │ ├── features_needed_test.cc │ ├── float_domain_test.cc │ ├── float_domain_util.cc │ ├── float_domain_util.h │ ├── image_domain_test.cc │ ├── image_domain_util.cc │ ├── image_domain_util.h │ ├── int_domain_test.cc │ ├── int_domain_util.cc │ ├── int_domain_util.h │ ├── internal_types.h │ ├── map_util.cc │ ├── map_util.h │ ├── map_util_test.cc │ ├── metrics.cc │ ├── metrics.h │ ├── 
metrics_test.cc │ ├── natural_language_domain_util.cc │ ├── natural_language_domain_util.h │ ├── path.cc │ ├── path.h │ ├── path_test.cc │ ├── proto │ │ ├── BUILD │ │ ├── __init__.py │ │ ├── feature_statistics_to_proto.proto │ │ ├── validation_config.proto │ │ └── validation_metadata.proto │ ├── schema.cc │ ├── schema.h │ ├── schema_anomalies.cc │ ├── schema_anomalies.h │ ├── schema_anomalies_test.cc │ ├── schema_test.cc │ ├── schema_util.cc │ ├── schema_util.h │ ├── schema_util_test.cc │ ├── statistics_view.cc │ ├── statistics_view.h │ ├── statistics_view_test.cc │ ├── statistics_view_test_util.cc │ ├── statistics_view_test_util.h │ ├── status_util.h │ ├── string_domain_test.cc │ ├── string_domain_util.cc │ ├── string_domain_util.h │ ├── telemetry.cc │ ├── telemetry.h │ ├── test_schema_protos.cc │ ├── test_schema_protos.h │ ├── test_util.cc │ ├── test_util.h │ └── test_util_test.cc ├── api │ ├── __init__.py │ ├── stats_api.py │ ├── stats_api_test.py │ ├── validation_api.py │ ├── validation_api_test.py │ ├── validation_options.py │ └── validation_options_test.py ├── arrow │ ├── __init__.py │ ├── arrow_util.py │ ├── arrow_util_test.py │ ├── decoded_examples_to_arrow.py │ └── decoded_examples_to_arrow_test.py ├── build_macros.bzl ├── coders │ ├── __init__.py │ ├── csv_decoder.py │ └── csv_decoder_test.py ├── constants.py ├── data_validation.bzl ├── integration_tests │ ├── drift_skew_metrics_test.py │ └── sequence_example_e2e_test.py ├── move_generated_files.sh ├── pywrap │ ├── BUILD │ ├── __init__.py │ ├── tensorflow_data_validation_extension.cc │ ├── validation_submodule.cc │ └── validation_submodule.h ├── skew │ ├── __init__.py │ ├── feature_skew_detector.py │ ├── feature_skew_detector_test.py │ └── protos │ │ ├── BUILD │ │ ├── __init__.py │ │ └── feature_skew_results.proto ├── statistics │ ├── __init__.py │ ├── generators │ │ ├── __init__.py │ │ ├── basic_stats_generator.py │ │ ├── basic_stats_generator_test.py │ │ ├── constituents │ │ │ ├── __init__.py │ │ │ ├── 
count_missing_generator.py │ │ │ ├── count_missing_generator_test.py │ │ │ ├── length_diff_generator.py │ │ │ └── length_diff_generator_test.py │ │ ├── cross_feature_stats_generator.py │ │ ├── cross_feature_stats_generator_test.py │ │ ├── empty_value_counter_generator.py │ │ ├── empty_value_counter_generator_test.py │ │ ├── image_stats_generator.py │ │ ├── image_stats_generator_test.py │ │ ├── input_batch.py │ │ ├── input_batch_test.py │ │ ├── lift_stats_generator.py │ │ ├── lift_stats_generator_test.py │ │ ├── mutual_information.py │ │ ├── mutual_information_test.py │ │ ├── natural_language_domain_inferring_stats_generator.py │ │ ├── natural_language_domain_inferring_stats_generator_test.py │ │ ├── natural_language_stats_generator.py │ │ ├── natural_language_stats_generator_test.py │ │ ├── partitioned_stats_generator.py │ │ ├── partitioned_stats_generator_test.py │ │ ├── sklearn_mutual_information.py │ │ ├── sklearn_mutual_information_test.py │ │ ├── sparse_feature_stats_generator.py │ │ ├── sparse_feature_stats_generator_test.py │ │ ├── stats_generator.py │ │ ├── testdata │ │ │ ├── image1.gif │ │ │ ├── image2.png │ │ │ ├── image3.bmp │ │ │ ├── image4.png │ │ │ ├── image5.jpg │ │ │ ├── image6.jpg │ │ │ └── not_a_image.abc │ │ ├── time_stats_generator.py │ │ ├── time_stats_generator_test.py │ │ ├── top_k_uniques_sketch_stats_generator.py │ │ ├── top_k_uniques_sketch_stats_generator_test.py │ │ ├── top_k_uniques_stats_generator.py │ │ ├── top_k_uniques_stats_generator_test.py │ │ ├── weighted_feature_stats_generator.py │ │ └── weighted_feature_stats_generator_test.py │ ├── stats_impl.py │ ├── stats_impl_test.py │ ├── stats_options.py │ └── stats_options_test.py ├── tools │ ├── BUILD │ ├── README.md │ ├── build_docs.py │ └── docker_build │ │ ├── Dockerfile.manylinux2010 │ │ └── build_manylinux.sh ├── types.py ├── types_test.py ├── utils │ ├── __init__.py │ ├── anomalies_util.py │ ├── anomalies_util_test.py │ ├── artifacts_io_impl.py │ ├── artifacts_io_impl_test.py │ 
├── batch_util.py │ ├── batch_util_test.py │ ├── beam_runner_util.py │ ├── bin_util.py │ ├── bin_util_test.py │ ├── display_util.py │ ├── display_util_test.py │ ├── example_weight_map.py │ ├── example_weight_map_test.py │ ├── feature_partition_util.py │ ├── feature_partition_util_test.py │ ├── io_util.py │ ├── io_util_test.py │ ├── metrics_util.py │ ├── mutual_information_util.py │ ├── mutual_information_util_test.py │ ├── path.py │ ├── preprocessing_util.py │ ├── quantiles_util.py │ ├── quantiles_util_test.py │ ├── schema_util.py │ ├── schema_util_test.py │ ├── slicing_util.py │ ├── slicing_util_test.py │ ├── stats_gen_lib.py │ ├── stats_gen_lib_test.py │ ├── stats_util.py │ ├── stats_util_test.py │ ├── test_util.py │ ├── test_util_test.py │ ├── top_k_uniques_stats_util.py │ ├── top_k_uniques_stats_util_test.py │ ├── validation_lib.py │ ├── validation_lib_test.py │ ├── variance_util.py │ ├── variance_util_test.py │ ├── vocab_util.py │ └── vocab_util_test.py ├── version.py └── workspace.bzl └── third_party ├── BUILD ├── arrow.BUILD ├── farmhash.BUILD ├── googleapis.patch ├── local_python.BUILD.tpl ├── pybind11.BUILD ├── python_configure.bzl ├── rules_foreign_cc.patch └── six.BUILD /.bazelrc: -------------------------------------------------------------------------------- 1 | # Needed to work with ZetaSQL dependency. 2 | # Zetasql is removed. 3 | # This is a candidate for removal 4 | build --cxxopt="-std=c++17" 5 | 6 | # Needed to avoid zetasql proto error. 7 | # Zetasql is removed. 8 | # This is a candidate for removal 9 | build --protocopt=--experimental_allow_proto3_optional 10 | 11 | # icu@: In create_linking_context: in call to create_linking_context(), 12 | # parameter 'user_link_flags' is deprecated and will be removed soon. 
13 | # It may be temporarily re-enabled by setting --incompatible_require_linker_input_cc_api=false 14 | build --incompatible_require_linker_input_cc_api=false 15 | -------------------------------------------------------------------------------- /.bazelversion: -------------------------------------------------------------------------------- 1 | 6.5.0 2 | -------------------------------------------------------------------------------- /.github/reusable-build/action.yml: -------------------------------------------------------------------------------- 1 | name: Reusable steps to build data-validation 2 | 3 | inputs: 4 | python-version: 5 | description: 'Python version' 6 | required: true 7 | upload-artifact: 8 | description: 'Should upload build artifact or not' 9 | default: false 10 | 11 | runs: 12 | using: 'composite' 13 | steps: 14 | - name: Set up Python ${{ inputs.python-version }} 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: ${{ inputs.python-version }} 18 | 19 | - name: Build the package for Python ${{ inputs.python-version }} 20 | shell: bash 21 | run: | 22 | version="${{ matrix.python-version }}" 23 | docker compose run -e PYTHON_VERSION=$(echo "$version" | sed 's/\.//') manylinux2010 24 | 25 | - name: Upload wheel artifact for Python ${{ matrix.python-version }} 26 | if: ${{ inputs.upload-artifact == 'true' }} 27 | uses: actions/upload-artifact@v4 28 | with: 29 | name: data-validation-wheel-py${{ matrix.python-version }} 30 | path: dist/*.whl 31 | 32 | - name: Check the wheel 33 | shell: bash 34 | run: | 35 | pip install twine 36 | twine check dist/* 37 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | workflow_dispatch: 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | 
strategy: 16 | matrix: 17 | python-version: ["3.9", "3.10", "3.11"] 18 | 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | 23 | - name: Build data-validation 24 | id: build-data-validation 25 | uses: ./.github/reusable-build 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | upload-artifact: true 29 | 30 | upload_to_pypi: 31 | name: Upload to PyPI 32 | runs-on: ubuntu-latest 33 | if: (github.event_name == 'release' && startsWith(github.ref, 'refs/tags')) || (github.event_name == 'workflow_dispatch') 34 | needs: [build] 35 | environment: 36 | name: pypi 37 | url: https://pypi.org/p/tensorflow-data-validation/ 38 | permissions: 39 | id-token: write 40 | steps: 41 | - name: Retrieve wheels 42 | uses: actions/download-artifact@v4.1.8 43 | with: 44 | merge-multiple: true 45 | path: wheels 46 | 47 | - name: List the build artifacts 48 | run: | 49 | ls -lAs wheels/ 50 | 51 | - name: Upload to PyPI 52 | uses: pypa/gh-action-pypi-publish@release/v1.9 53 | with: 54 | packages_dir: wheels/ 55 | -------------------------------------------------------------------------------- /.github/workflows/ci-lint.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [master] 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4.1.7 13 | with: 14 | # Ensure the full history is fetched 15 | # This is required to run pre-commit on a specific set of commits 16 | # TODO: Remove this when all the pre-commit issues are fixed 17 | fetch-depth: 0 18 | - uses: actions/setup-python@v5.1.1 19 | with: 20 | python-version: 3.13 21 | - uses: pre-commit/action@v3.0.1 22 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs 2 | on: 3 | workflow_dispatch: 4 | 
push: 5 | branches: 6 | - 'master' 7 | pull_request: 8 | permissions: 9 | contents: write 10 | jobs: 11 | deploy: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout repo 15 | uses: actions/checkout@v4 16 | 17 | - name: Configure Git Credentials 18 | run: | 19 | git config user.name github-actions[bot] 20 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 21 | if: (github.event_name != 'pull_request') 22 | 23 | - name: Set up Python 3.9 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: '3.9' 27 | cache: 'pip' 28 | cache-dependency-path: | 29 | setup.py 30 | requirements-docs.txt 31 | 32 | - name: Save time for cache for mkdocs 33 | run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 34 | 35 | - name: Caching 36 | uses: actions/cache@v4 37 | with: 38 | key: mkdocs-material-${{ env.cache_id }} 39 | path: .cache 40 | restore-keys: | 41 | mkdocs-material- 42 | 43 | - name: Install Dependencies 44 | run: pip install -r requirements-docs.txt 45 | 46 | - name: Deploy to GitHub Pages 47 | run: mkdocs gh-deploy --force 48 | if: (github.event_name != 'pull_request') 49 | 50 | - name: Build docs to check for errors 51 | run: mkdocs build 52 | if: (github.event_name == 'pull_request') 53 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | workflow_dispatch: 11 | 12 | jobs: 13 | test: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.9", "3.10", "3.11"] 18 | 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | 23 | - name: Build data-validation 24 | id: build-data-validation 25 | uses: ./.github/reusable-build 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Install built wheel 30 | shell: bash 
31 | run: | 32 | PYTHON_VERSION_TAG="cp$(echo ${{ matrix.python-version }} | sed 's/\.//')" 33 | WHEEL_FILE=$(ls dist/*${PYTHON_VERSION_TAG}*.whl) 34 | pip install "${WHEEL_FILE}[test]" 35 | 36 | - name: Run Test 37 | run: | 38 | rm -rf bazel-* 39 | # run tests 40 | pytest -vv 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .env 100 | .venv 101 | env/ 102 | venv/ 103 | ENV/ 104 | env.bak/ 105 | venv.bak/ 106 | 107 | # Spyder project settings 108 | .spyderproject 109 | .spyproject 110 | 111 | # Rope project settings 112 | .ropeproject 113 | 114 | # Intellij project settings 115 | .idea 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | 128 | # pb2.py files 129 | *_pb2.py 130 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # pre-commit is a tool to perform a predefined set of tasks manually and/or 2 | # automatically before git commits are made. 3 | # 4 | # Config reference: https://pre-commit.com/#pre-commit-configyaml---top-level 5 | # 6 | # Common tasks 7 | # 8 | # - Register git hooks: pre-commit install --install-hooks 9 | # - Run on all files: pre-commit run --all-files 10 | # 11 | # These pre-commit hooks are run as CI. 12 | # 13 | # NOTE: if it can be avoided, add configs/args in pyproject.toml or below instead of creating a new `.config.file`. 
14 | # https://pre-commit.ci/#configuration 15 | ci: 16 | autoupdate_schedule: monthly 17 | autofix_commit_msg: | 18 | [pre-commit.ci] Apply automatic pre-commit fixes 19 | 20 | repos: 21 | # general 22 | - repo: https://github.com/pre-commit/pre-commit-hooks 23 | rev: v4.6.0 24 | hooks: 25 | - id: end-of-file-fixer 26 | exclude: '\.svg$|\.patch$' 27 | - id: trailing-whitespace 28 | exclude: '\.svg$|\.patch$' 29 | - id: check-json 30 | - id: check-yaml 31 | args: [--allow-multiple-documents, --unsafe] 32 | - id: check-toml 33 | 34 | - repo: https://github.com/astral-sh/ruff-pre-commit 35 | rev: v0.5.6 36 | hooks: 37 | - id: ruff 38 | args: ["--fix"] 39 | - id: ruff-format 40 | -------------------------------------------------------------------------------- /BUILD: -------------------------------------------------------------------------------- 1 | load("@bazel_gazelle//:def.bzl", "gazelle") 2 | 3 | package( 4 | default_visibility = [":__subpackages__"], 5 | ) 6 | 7 | licenses(["notice"]) 8 | 9 | exports_files(["LICENSE"]) 10 | 11 | gazelle( 12 | name = "gazelle-update-repos", 13 | args = [ 14 | "-from_file=go.mod", 15 | "-to_macro=deps.bzl%go_dependencies", 16 | ], 17 | command = "update-repos", 18 | ) 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution, 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 
13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | TFDV follows the [Google Python Style Guide] 26 | (http://google.github.io/styleguide/pyguide.html). 27 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | version: '3.1' 16 | 17 | # Extensions are not supported until 3.4 thus the repeated boilerplate below. 18 | 19 | # We mount the TFDV project root at /build (which is the WORKDIR of the image) 20 | # in the container. 21 | services: 22 | manylinux2010: 23 | image: tfdv-build:manylinux2010 24 | build: 25 | context: . 
26 | dockerfile: tensorflow_data_validation/tools/docker_build/Dockerfile.manylinux2010 27 | volumes: 28 | - .:/build:delegated 29 | -------------------------------------------------------------------------------- /docs/_toc.yaml: -------------------------------------------------------------------------------- 1 | toc: 2 | - title: "Install" 3 | path: /tfx/data_validation/install 4 | - title: "Get started" 5 | path: /tfx/data_validation/get_started 6 | -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | # TensorFlow Data Validation API Documentation 2 | 3 | 4 | ::: tensorflow_data_validation 5 | -------------------------------------------------------------------------------- /docs/custom_data_validation.md: -------------------------------------------------------------------------------- 1 | # Custom Data Validation 2 | 3 | 6 | 7 | TFDV supports custom data validation using SQL. You can run custom data 8 | validation using 9 | [validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py#L236) 10 | or 11 | [custom_validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py#L535). 12 | Use `validate_statistics` to run standard, schema-based data validation along 13 | with custom validation. Use `custom_validate_statistics` to run only custom 14 | validation. 15 | 16 | ## Configuring Custom Data Validation 17 | 18 | Use the 19 | [CustomValidationConfig](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto) 20 | to define custom validations to run. For each validation, provide an 21 | SQL expression, which returns a boolean value. Each SQL expression is run 22 | against the summary statistics for the specified feature. 
If the expression 23 | returns false, TFDV generates a custom anomaly using the provided severity and 24 | anomaly description. 25 | 26 | You may configure custom validations that run against individual features or 27 | feature pairs. For each feature, specify both the dataset (i.e., slice) and the 28 | feature path to use, though you may leave the dataset name blank if you want to 29 | validate the default slice (i.e., all examples). For single feature validations, 30 | the feature statistics are bound to `feature`. For feature pair validations, the 31 | test feature statistics are bound to `feature_test` and the base feature 32 | statistics are bound to `feature_base`. See the section below for example 33 | queries. 34 | 35 | If a custom validation triggers an anomaly, TFDV will return an Anomalies proto 36 | with the reason(s) for the anomaly. Each reason will have a short description, 37 | which is user configured, and a description with the query that caused the 38 | anomaly, the dataset names on which the query was run, and the base feature path 39 | (if running a feature-pair validation). See the section below for example 40 | results of custom validation. 41 | 42 | See the 43 | [documentation](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto) 44 | in the `CustomValidationConfig` proto for example 45 | configurations. 
46 | -------------------------------------------------------------------------------- /docs/images/anomaly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/anomaly.png -------------------------------------------------------------------------------- /docs/images/feature_stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/feature_stats.png -------------------------------------------------------------------------------- /docs/images/schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/schema.png -------------------------------------------------------------------------------- /docs/images/serving_anomaly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/serving_anomaly.png -------------------------------------------------------------------------------- /docs/images/skew_anomaly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/skew_anomaly.png -------------------------------------------------------------------------------- /docs/images/stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/stats.png 
-------------------------------------------------------------------------------- /docs/images/tf_full_color_primary_icon.svg: -------------------------------------------------------------------------------- 1 | FullColorPrimary Icon -------------------------------------------------------------------------------- /docs/images/unbalanced.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/unbalanced.png -------------------------------------------------------------------------------- /docs/images/uniform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/uniform.png -------------------------------------------------------------------------------- /docs/images/uniform_cumulative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/uniform_cumulative.png -------------------------------------------------------------------------------- /docs/images/zero_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/zero_length.png -------------------------------------------------------------------------------- /docs/javascripts/mathjax.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | 
document$.subscribe(() => { 15 | MathJax.startup.output.clearCache() 16 | MathJax.typesetClear() 17 | MathJax.texReset() 18 | MathJax.typesetPromise() 19 | }) 20 | -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #FFA800; 3 | --md-primary-fg-color--light: #CCCCCC; 4 | --md-primary-fg-color--dark: #425066; 5 | } 6 | 7 | .video-wrapper { 8 | max-width: 240px; 9 | display: flex; 10 | flex-direction: row; 11 | } 12 | .video-wrapper > iframe { 13 | width: 100%; 14 | aspect-ratio: 16 / 9; 15 | } 16 | 17 | .buttons-wrapper { 18 | flex-wrap: wrap; 19 | gap: 1em; 20 | display: flex; 21 | /* flex-grow: 1; */ 22 | /* justify-content: center; */ 23 | /* align-content: center; */ 24 | } 25 | 26 | .buttons-wrapper > a { 27 | justify-content: center; 28 | align-content: center; 29 | flex-wrap: nowrap; 30 | /* gap: 1em; */ 31 | align-items: center; 32 | text-align: center; 33 | flex: 1 1 30%; 34 | display: flex; 35 | } 36 | 37 | .md-button > .buttons-content { 38 | align-items: center; 39 | justify-content: center; 40 | display: flex; 41 | gap: 1em; 42 | } 43 | -------------------------------------------------------------------------------- /google3/third_party/py/tensorflow_data_validation/build_macros.bzl: -------------------------------------------------------------------------------- 1 | """BUILD macros.""" 2 | 3 | load("//third_party/bazel_rules/rules_python/python:py_extension.bzl", "py_extension") 4 | 5 | def tfdv_pybind_extension( 6 | name, 7 | srcs, 8 | module_name, 9 | deps = [], 10 | visibility = None): 11 | py_extension( 12 | name = name, 13 | module_name = module_name, 14 | srcs = srcs, 15 | srcs_version = "PY3ONLY", 16 | copts = [ 17 | "-fno-strict-aliasing", 18 | "-fexceptions", 19 | ], 20 | features = ["-use_header_modules"], 21 | deps = deps, 22 | visibility = visibility, 23 | ) 24 | 
-------------------------------------------------------------------------------- /google3/third_party/py/tensorflow_data_validation/opensource_only/BUILD: -------------------------------------------------------------------------------- 1 | load("//tools/build_defs/testing:bzl_library.bzl", "bzl_library") 2 | 3 | bzl_library( 4 | name = "build_macros_bzl", 5 | srcs = ["build_macros.bzl"], 6 | parse_tests = False, 7 | visibility = ["//visibility:private"], 8 | ) 9 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: TensorFlow Data Validation 2 | repo_name: "data-validation" 3 | repo_url: 4 | 5 | theme: 6 | logo: images/tf_full_color_primary_icon.svg 7 | name: material 8 | palette: 9 | # Palette toggle for automatic mode 10 | - media: "(prefers-color-scheme)" 11 | primary: custom 12 | accent: custom 13 | toggle: 14 | icon: material/brightness-auto 15 | name: Switch to light mode 16 | 17 | # Palette toggle for light mode 18 | - media: "(prefers-color-scheme: light)" 19 | primary: custom 20 | accent: custom 21 | scheme: default 22 | toggle: 23 | icon: material/brightness-7 24 | name: Switch to dark mode 25 | 26 | # Palette toggle for dark mode 27 | - media: "(prefers-color-scheme: dark)" 28 | primary: custom 29 | accent: custom 30 | scheme: slate 31 | toggle: 32 | icon: material/brightness-4 33 | name: Switch to system preference 34 | favicon: images/tf_full_color_primary_icon.svg 35 | 36 | features: 37 | - content.code.copy 38 | - content.code.select 39 | - content.action.edit 40 | 41 | plugins: 42 | - search 43 | - autorefs 44 | - mkdocstrings: 45 | default_handler: python 46 | handlers: 47 | python: 48 | options: 49 | show_source: true 50 | show_root_heading: true 51 | unwrap_annotated: true 52 | show_symbol_type_toc: true 53 | show_if_no_docstring: true 54 | show_symbol_type_heading: true 55 | merge_init_into_class: true 56 | 
show_signature_annotations: true 57 | separate_signature: true 58 | signature_crossrefs: true 59 | group_by_category: true 60 | show_category_heading: true 61 | show_submodules: false 62 | show_root_full_path: true 63 | docstring_section_style: "spacy" 64 | inherited_members: true 65 | summary: false 66 | filters: 67 | - "!^_" 68 | - "^__init__$" 69 | - "^__call__$" 70 | - "^__version__$" 71 | - "!^logger" 72 | - "!^test_" 73 | - "!_test$" 74 | extensions: 75 | - griffe_inherited_docstrings 76 | import: 77 | - https://docs.python.org/3/objects.inv 78 | 79 | extra_css: 80 | - stylesheets/extra.css 81 | 82 | extra_javascript: 83 | - javascripts/mathjax.js 84 | - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js 85 | 86 | markdown_extensions: 87 | - admonition 88 | - attr_list 89 | - def_list 90 | - tables 91 | - toc: 92 | permalink: true 93 | - pymdownx.highlight: 94 | anchor_linenums: true 95 | linenums: false 96 | line_spans: __span 97 | pygments_lang_class: true 98 | - pymdownx.inlinehilite 99 | - pymdownx.snippets 100 | - pymdownx.superfences 101 | - pymdownx.arithmatex: 102 | generic: true 103 | - pymdownx.critic 104 | - pymdownx.caret 105 | - pymdownx.keys 106 | - pymdownx.mark 107 | - pymdownx.tilde 108 | - md_in_html 109 | - pymdownx.emoji: 110 | emoji_index: !!python/name:material.extensions.emoji.twemoji 111 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 112 | watch: 113 | - tensorflow_data_validation 114 | 115 | nav: 116 | - Home: index.md 117 | - Install: install.md 118 | - Getting Started: get_started.md 119 | - Anomalies: anomalies.md 120 | - API: api.md 121 | -------------------------------------------------------------------------------- /requirements-docs.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-material 3 | mkdocstrings[python] 4 | griffe-inherited-docstrings 5 | mkdocs-autorefs 6 | ruff 7 | 
-------------------------------------------------------------------------------- /tensorflow_data_validation/BUILD: -------------------------------------------------------------------------------- 1 | load("@bazel_skylib//lib:selects.bzl", "selects") 2 | 3 | licenses(["notice"]) # Apache 2.0 4 | 5 | config_setting( 6 | name = "macos_x86_64", 7 | values = { 8 | "apple_platform_type": "macos", 9 | "cpu": "darwin", 10 | }, 11 | ) 12 | 13 | config_setting( 14 | name = "macos_arm64", 15 | values = { 16 | "apple_platform_type": "macos", 17 | "cpu": "darwin_arm64", 18 | }, 19 | ) 20 | 21 | selects.config_setting_group( 22 | name = "macos", 23 | match_any = [ 24 | ":macos_x86_64", 25 | ":macos_arm64", 26 | ], 27 | ) 28 | 29 | sh_binary( 30 | name = "move_generated_files", 31 | srcs = ["move_generated_files.sh"], 32 | data = select({ 33 | "//conditions:default": [ 34 | "//tensorflow_data_validation/anomalies/proto:validation_config_proto_py_pb2", 35 | "//tensorflow_data_validation/anomalies/proto:validation_metadata_proto_py_pb2", 36 | "//tensorflow_data_validation/pywrap:tensorflow_data_validation_extension.so", 37 | "//tensorflow_data_validation/skew/protos:feature_skew_results_proto_py_pb2", 38 | ], 39 | }), 40 | ) 41 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/bool_domain_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_BOOL_DOMAIN_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_BOOL_DOMAIN_UTIL_H_ 18 | 19 | #include 20 | 21 | #include "tensorflow_data_validation/anomalies/internal_types.h" 22 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 23 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 24 | 25 | namespace tensorflow { 26 | namespace data_validation { 27 | 28 | // Update a BoolDomain by itself. Namely, if the string values corresponding to 29 | // true and false in the domain are the same, clear the value for false. 30 | std::vector UpdateBoolDomainSelf( 31 | tensorflow::metadata::v0::BoolDomain* bool_domain); 32 | 33 | // This updates bool_domain. Should only be called if bool_domain is set. 34 | // If the type is INT and the min and max are out of the range {0,1}, 35 | // this will set int_domain. 
36 | std::vector UpdateBoolDomain( 37 | const FeatureStatsView& feature_stats, 38 | tensorflow::metadata::v0::Feature* feature); 39 | 40 | // Determine if this could be a BoolDomain. 41 | // Note this takes precedence over IntDomain and StringDomain. 42 | bool IsBoolDomainCandidate(const FeatureStatsView& feature_stats); 43 | 44 | // Generate a BoolDomain from the stats. 45 | // The behavior is undefined if IsBoolDomainCandidate(stats) is false. 46 | tensorflow::metadata::v0::BoolDomain BoolDomainFromStats( 47 | const FeatureStatsView& stats); 48 | 49 | } // namespace data_validation 50 | } // namespace tensorflow 51 | 52 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_BOOL_DOMAIN_UTIL_H_ 53 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/custom_domain_util.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/custom_domain_util.h" 17 | 18 | #include 19 | 20 | #include "google/protobuf/text_format.h" 21 | #include "absl/base/log_severity.h" 22 | #include "absl/log/log.h" 23 | 24 | namespace tensorflow { 25 | namespace data_validation { 26 | namespace { 27 | 28 | using std::string; 29 | 30 | // LINT.IfChange 31 | constexpr char kDomainInfo[] = "domain_info"; 32 | // LINT.ThenChange(../utils/stats_util.py) 33 | 34 | bool ParseCustomDomainInfo(const string& domain_info, 35 | tensorflow::metadata::v0::Feature* feature) { 36 | // Temporary feature for parsing domain_info. 37 | tensorflow::metadata::v0::Feature domain_info_feature; 38 | if (!google::protobuf::TextFormat::ParseFromString( 39 | domain_info, &domain_info_feature)) { 40 | return false; 41 | } 42 | // Ensure only one field is set 43 | std::vector fields_set; 44 | feature->GetReflection()->ListFields(domain_info_feature, &fields_set); 45 | // Ensure only one field is set, which is part of the domain_info oneof. 
46 | if (fields_set.size() != 1 || fields_set[0]->containing_oneof() == nullptr || 47 | fields_set[0]->containing_oneof()->name() != kDomainInfo) { 48 | return false; 49 | } else { 50 | feature->MergeFrom(domain_info_feature); 51 | return true; 52 | } 53 | } 54 | 55 | } // namespace 56 | 57 | bool BestEffortUpdateCustomDomain( 58 | const std::vector& custom_stats, 59 | tensorflow::metadata::v0::Feature* feature) { 60 | string domain_info; 61 | for (const auto& custom_stat : custom_stats) { 62 | if (custom_stat.name() == kDomainInfo) { 63 | if (!domain_info.empty()) { 64 | LOG(ERROR) << "Duplicate 'domain_info' custom_stat [" << domain_info 65 | << ", " << custom_stat.str() << "], this is a stats bug."; 66 | return false; 67 | } else { 68 | domain_info = custom_stat.str(); 69 | } 70 | } 71 | } 72 | if (domain_info.empty()) { 73 | return false; 74 | } 75 | // Never override existing domain_infos with a custom domain_info for safety. 76 | if (feature->domain_info_case() != 77 | tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET) { 78 | LOG(INFO) << "Valid custom domain_info: " << domain_info 79 | << " ignored due to existing domain, for feature :" 80 | << feature->DebugString(); 81 | return false; 82 | } 83 | if (!ParseCustomDomainInfo(domain_info, feature)) { 84 | LOG(ERROR) << "Could not parse 'domain_info' custom_stat: " << domain_info 85 | << ". 
It is expected to contain exactly one field of the " 86 | << "Feature.domain_info oneof, e.g.: 'mid_domain {}'."; 87 | return false; 88 | } 89 | return true; 90 | } 91 | 92 | } // namespace data_validation 93 | } // namespace tensorflow 94 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/custom_domain_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_CUSTOM_DOMAIN_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_CUSTOM_DOMAIN_UTIL_H_ 18 | 19 | #include 20 | 21 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 22 | #include "tensorflow_metadata/proto/v0/statistics.pb.h" 23 | 24 | namespace tensorflow { 25 | namespace data_validation { 26 | 27 | // Semantic domains like image_domain, url_domain, ... can be detected by 28 | // heuristics in stats generation. 
If such a domain is detected the feature 29 | // stats are associated with a CustomStatistic with name: 'domain_info' and a 30 | // str value with the text representation of the detected domain, e.g: 31 | // custom_stats: {name: 'domain_info' str: 'mid_domain {}'} 32 | // 33 | // This method provides a best-effort update of the semantic type of `feature` 34 | // based on `custom_stats` and returns true iff a valid custom domain was 35 | // detected and successfully updated `feature`. The logic is currently 36 | // conservative: 37 | // - Never modify `feature` if it has an existing domain 38 | // - If a feature is associated with multiple custom_stats for 'domain_info' 39 | // they are ignored 40 | // - If the value of the 'domain_info' custom stat is invalid or does not set 41 | // exactly one field of the Feature.domain_info oneof it is ignored 42 | bool BestEffortUpdateCustomDomain( 43 | const std::vector& custom_stats, 44 | tensorflow::metadata::v0::Feature* feature); 45 | 46 | } // namespace data_validation 47 | } // namespace tensorflow 48 | 49 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_CUSTOM_DOMAIN_UTIL_H_ 50 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/custom_domain_util_test.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | #include "tensorflow_data_validation/anomalies/custom_domain_util.h" 16 | 17 | #include 18 | #include 19 | #include "tensorflow_data_validation/anomalies/test_util.h" 20 | 21 | namespace tensorflow { 22 | namespace data_validation { 23 | 24 | namespace { 25 | 26 | using std::string; 27 | using ::tensorflow::metadata::v0::CustomStatistic; 28 | using ::tensorflow::metadata::v0::Feature; 29 | using ::testing::Test; 30 | 31 | CustomStatistic DomainInfoStatistic(const string& value) { 32 | CustomStatistic custom_stat; 33 | custom_stat.set_name("domain_info"); 34 | custom_stat.set_str(value); 35 | return custom_stat; 36 | } 37 | 38 | TEST(CustomDomainUtilTest, FailureOnNoDomainInfoCustomStat) { 39 | Feature feature; 40 | CustomStatistic custom_stat; 41 | custom_stat.set_name("some_other_name"); 42 | custom_stat.set_str("natural_language_domain"); 43 | EXPECT_FALSE(BestEffortUpdateCustomDomain( 44 | std::vector({custom_stat}), &feature)); 45 | EXPECT_EQ(feature.domain_info_case(), 46 | tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET); 47 | } 48 | 49 | TEST(CustomDomainUtilTest, SuccessOnEmptyFeature) { 50 | Feature feature; 51 | EXPECT_TRUE(BestEffortUpdateCustomDomain( 52 | std::vector( 53 | {DomainInfoStatistic("natural_language_domain {}")}), 54 | &feature)); 55 | EXPECT_TRUE(feature.has_natural_language_domain()); 56 | } 57 | 58 | TEST(CustomDomainUtilTest, FailureOnFeatureWithDomain) { 59 | Feature feature; 60 | feature.mutable_string_domain(); 61 | EXPECT_FALSE(BestEffortUpdateCustomDomain( 62 | std::vector( 63 | {DomainInfoStatistic("natural_language_domain {}")}), 64 | &feature)); 65 | EXPECT_TRUE(feature.has_string_domain()); 66 | } 67 | 68 | TEST(CustomDomainUtilTest, FailureOnMultipleDomainInfosFeature) { 69 | Feature feature; 70 | EXPECT_FALSE(BestEffortUpdateCustomDomain( 71 | std::vector( 72 | {DomainInfoStatistic("natural_language_domain {}"), 73 | 
DomainInfoStatistic("natural_language_domain {}")}), 74 | &feature)); 75 | EXPECT_EQ(feature.domain_info_case(), 76 | tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET); 77 | } 78 | 79 | TEST(CustomDomainUtilTest, FailureOnInvalidDomainValue) { 80 | Feature feature; 81 | 82 | EXPECT_FALSE(BestEffortUpdateCustomDomain( 83 | std::vector({DomainInfoStatistic("")}), &feature)); 84 | EXPECT_EQ(feature.domain_info_case(), 85 | tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET); 86 | 87 | EXPECT_FALSE(BestEffortUpdateCustomDomain( 88 | std::vector({DomainInfoStatistic("This is not valid")}), 89 | &feature)); 90 | EXPECT_EQ(feature.domain_info_case(), 91 | tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET); 92 | 93 | EXPECT_FALSE(BestEffortUpdateCustomDomain( 94 | std::vector({DomainInfoStatistic( 95 | "name: 'It should not set other fields!' image_domain {} ")}), 96 | &feature)); 97 | EXPECT_EQ(feature.domain_info_case(), 98 | tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET); 99 | } 100 | 101 | } // namespace 102 | 103 | } // namespace data_validation 104 | } // namespace tensorflow 105 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/dataset_constraints_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | // Utilities to modify a dataset constraint in the schema. 16 | #ifndef THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_DATASET_CONSTRAINTS_UTIL_H_ 17 | #define THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_DATASET_CONSTRAINTS_UTIL_H_ 18 | 19 | #include "tensorflow_data_validation/anomalies/internal_types.h" 20 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 21 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 22 | #include "tensorflow_metadata/proto/v0/statistics.pb.h" 23 | 24 | #endif // THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_DATASET_CONSTRAINTS_UTIL_H_ 25 | 26 | namespace tensorflow { 27 | namespace data_validation { 28 | // Specifies whether the dataset constraints has a comparator of the specified 29 | // type. 30 | bool DatasetConstraintsHasComparator( 31 | const tensorflow::metadata::v0::DatasetConstraints& dataset_contraints, 32 | DatasetComparatorType comparator_type); 33 | 34 | // Gets the num examples comparator of the specified type, creating it if it 35 | // does not exist. 36 | tensorflow::metadata::v0::NumericValueComparator* GetNumExamplesComparator( 37 | tensorflow::metadata::v0::DatasetConstraints* dataset_constraints, 38 | DatasetComparatorType comparator_type); 39 | 40 | // Updates the num examples comparator from the dataset constraints. 41 | std::vector UpdateNumExamplesComparatorDirect( 42 | const DatasetStatsView& stats, DatasetComparatorType comparator_type, 43 | tensorflow::metadata::v0::NumericValueComparator* comparator); 44 | 45 | // Updates the min and max examples count from the dataset constraints. 
46 | std::vector UpdateExamplesCount( 47 | const DatasetStatsView& stats, 48 | tensorflow::metadata::v0::DatasetConstraints* dataset_constraints); 49 | 50 | } // namespace data_validation 51 | } // namespace tensorflow 52 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/diff_util.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/diff_util.h" 17 | #include "absl/log/check.h" 18 | 19 | namespace tensorflow { 20 | namespace data_validation { 21 | 22 | std::vector ComputeDiff( 23 | const std::vector& a_lines, 24 | const std::vector& b_lines) { 25 | CHECK(false) << "Schema diff is currently not supported."; 26 | } 27 | 28 | } // namespace data_validation 29 | } // namespace tensorflow 30 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/diff_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_DIFF_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_DIFF_UTIL_H_ 18 | 19 | #include 20 | 21 | #include "absl/strings/string_view.h" 22 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h" 23 | 24 | namespace tensorflow { 25 | namespace data_validation { 26 | 27 | // The schema diff computation functionality is currently not supported. 28 | // Tracked in https://github.com/tensorflow/data-validation/issues/39 29 | std::vector ComputeDiff( 30 | const std::vector& a_lines, 31 | const std::vector& b_lines); 32 | 33 | } // namespace data_validation 34 | } // namespace tensorflow 35 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_DIFF_UTIL_H_ 36 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/features_needed.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/features_needed.h" 17 | 18 | #include 19 | 20 | #include "absl/status/status.h" 21 | #include "tensorflow_data_validation/anomalies/path.h" 22 | #include "tensorflow_data_validation/anomalies/proto/validation_metadata.pb.h" 23 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 24 | 25 | namespace tensorflow { 26 | namespace data_validation { 27 | 28 | absl::Status ToFeaturesNeededProto(const FeaturesNeeded& feature_needed, 29 | FeaturesNeededProto* result) { 30 | for (const auto& entry : feature_needed) { 31 | PathAndReasonFeatureNeeded path_and_reason_feature_need; 32 | *path_and_reason_feature_need.mutable_path() = entry.first.AsProto(); 33 | for (const auto& reason_feature_needed : entry.second) { 34 | *path_and_reason_feature_need.add_reason_feature_needed() = 35 | reason_feature_needed; 36 | } 37 | *result->add_path_and_reason_feature_need() = path_and_reason_feature_need; 38 | } 39 | 40 | return absl::OkStatus(); 41 | } 42 | 43 | absl::Status FromFeaturesNeededProto( 44 | const FeaturesNeededProto& feature_needed_proto, FeaturesNeeded* result) { 45 | for (const auto& entry : 46 | feature_needed_proto.path_and_reason_feature_need()) { 47 | Path key(entry.path()); 48 | std::vector value = { 49 | entry.reason_feature_needed().begin(), 50 | entry.reason_feature_needed().end()}; 51 | (*result)[key] = value; 52 | } 53 | 54 | return absl::OkStatus(); 55 | } 56 | 57 | } // namespace data_validation 58 | } // namespace tensorflow 59 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/features_needed.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache 
License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_FEATURES_NEEDED_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_FEATURES_NEEDED_H_ 18 | 19 | #include "absl/status/status.h" 20 | #include "tensorflow_data_validation/anomalies/path.h" 21 | #include "tensorflow_data_validation/anomalies/proto/validation_metadata.pb.h" 22 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 23 | 24 | namespace tensorflow { 25 | namespace data_validation { 26 | 27 | using FeaturesNeeded = std::map>; 28 | 29 | absl::Status ToFeaturesNeededProto(const FeaturesNeeded& feature_needed, 30 | FeaturesNeededProto* result); 31 | 32 | absl::Status FromFeaturesNeededProto( 33 | const FeaturesNeededProto& feature_needed_proto, FeaturesNeeded* result); 34 | 35 | } // namespace data_validation 36 | } // namespace tensorflow 37 | 38 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_FEATURES_NEEDED_H_ 39 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/features_needed_test.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/features_needed.h" 17 | 18 | #include 19 | #include 20 | #include "tensorflow_data_validation/anomalies/path.h" 21 | #include "tensorflow_data_validation/anomalies/proto/validation_metadata.pb.h" 22 | #include "tensorflow_data_validation/anomalies/test_util.h" 23 | 24 | namespace tensorflow { 25 | namespace data_validation { 26 | namespace { 27 | 28 | using ::testing::ElementsAre; 29 | using testing::EqualsProto; 30 | using ::testing::Pair; 31 | using testing::ParseTextProtoOrDie; 32 | using ::testing::Test; 33 | using ::testing::UnorderedElementsAre; 34 | 35 | TEST(FeaturesNeededTest, CppToProtoToCpp) { 36 | Path path({"a", "b", "c"}); 37 | 38 | auto reason1 = ParseTextProtoOrDie("comment: 'test1'"); 39 | auto reason2 = ParseTextProtoOrDie("comment: 'test2'"); 40 | FeaturesNeeded features_need; 41 | features_need[path] = {reason1, reason2}; 42 | 43 | FeaturesNeededProto proto_format; 44 | EXPECT_TRUE(ToFeaturesNeededProto(features_need, &proto_format).ok()); 45 | 46 | // Verify that proto_format are correctly generated. 47 | EXPECT_THAT(proto_format, testing::EqualsProto(R"( 48 | path_and_reason_feature_need { 49 | path { step: "a" step: "b" step: "c" } 50 | reason_feature_needed { comment: "test1" } 51 | reason_feature_needed { comment: "test2" } 52 | } 53 | )")); 54 | 55 | // Verify that C++ -> Proto -> C++ is a noop. 
56 | FeaturesNeeded generated_features_need; 57 | EXPECT_TRUE( 58 | FromFeaturesNeededProto(proto_format, &generated_features_need).ok()); 59 | EXPECT_THAT( 60 | generated_features_need, 61 | UnorderedElementsAre( 62 | Pair(path, ElementsAre(EqualsProto(reason1), EqualsProto(reason2))))); 63 | } 64 | 65 | TEST(FeaturesNeededTest, ProtoToCppToProto) { 66 | auto original_proto = ParseTextProtoOrDie(R"( 67 | path_and_reason_feature_need { 68 | path { step: "a" step: "b" step: "c" } 69 | reason_feature_needed { comment: "test1" } 70 | reason_feature_needed { comment: "test2" } 71 | } 72 | )"); 73 | FeaturesNeeded features_need; 74 | EXPECT_TRUE(FromFeaturesNeededProto(original_proto, &features_need).ok()); 75 | 76 | // Verify that C++ object are expected. 77 | Path expeceted_path({"a", "b", "c"}); 78 | auto expected_reason1 = 79 | ParseTextProtoOrDie("comment: 'test1'"); 80 | auto expected_reason2 = 81 | ParseTextProtoOrDie("comment: 'test2'"); 82 | EXPECT_THAT(features_need, 83 | UnorderedElementsAre(Pair( 84 | expeceted_path, ElementsAre(EqualsProto(expected_reason1), 85 | EqualsProto(expected_reason2))))); 86 | 87 | // Verify that Proto -> C++ -> Proto is a noop. 88 | FeaturesNeededProto generated_proto_format; 89 | EXPECT_TRUE( 90 | ToFeaturesNeededProto(features_need, &generated_proto_format).ok()); 91 | 92 | EXPECT_THAT(original_proto, EqualsProto(generated_proto_format)); 93 | } 94 | 95 | } // namespace 96 | } // namespace data_validation 97 | } // namespace tensorflow 98 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/float_domain_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_ 18 | 19 | #include "tensorflow_data_validation/anomalies/internal_types.h" 20 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 21 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 22 | 23 | namespace tensorflow { 24 | namespace data_validation { 25 | 26 | // Updates the float_domain based upon the range of values in `stats`, be they 27 | // STRING or FLOAT. 28 | // Will recommend the field be cleared if the type is STRING or BYTES but 29 | // the strings do not represent floats. Undefined behavior if the data is INT. 30 | UpdateSummary UpdateFloatDomain( 31 | const FeatureStatsView& stats, 32 | tensorflow::metadata::v0::FloatDomain* float_domain); 33 | 34 | // Returns true if feature_stats is a STRING field that has only floats and no 35 | // non-UTF8 strings.
36 | bool IsFloatDomainCandidate(const FeatureStatsView& feature_stats); 37 | 38 | } // namespace data_validation 39 | } // namespace tensorflow 40 | 41 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_ 42 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/image_domain_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2020 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_IMAGE_DOMAIN_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_IMAGE_DOMAIN_UTIL_H_ 18 | 19 | #include 20 | 21 | #include "tensorflow_data_validation/anomalies/internal_types.h" 22 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 23 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 24 | 25 | namespace tensorflow { 26 | namespace data_validation { 27 | 28 | // This updates image_domain. Should only be called if image_domain is set. 
29 | std::vector UpdateImageDomain( 30 | const FeatureStatsView& feature_stats, 31 | tensorflow::metadata::v0::Feature* feature); 32 | 33 | } // namespace data_validation 34 | } // namespace tensorflow 35 | 36 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_IMAGE_DOMAIN_UTIL_H_ 37 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/int_domain_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_ 18 | 19 | #include "tensorflow_data_validation/anomalies/internal_types.h" 20 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 21 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 22 | 23 | namespace tensorflow { 24 | namespace data_validation { 25 | 26 | // Updates the int_domain based upon the range of values in feature_stats, be they 27 | // STRING or INT. 28 | // Will recommend the field be cleared if the type is STRING or BYTES but 29 | // the strings do not represent ints. Undefined behavior if the data is FLOAT.
30 | UpdateSummary UpdateIntDomain(const FeatureStatsView& feature_stats, 31 | tensorflow::metadata::v0::IntDomain* int_domain); 32 | 33 | // Returns true if feature_stats is a STRING field that has only ints and no 34 | // non-UTF8 strings. 35 | bool IsIntDomainCandidate(const FeatureStatsView& feature_stats); 36 | 37 | } // namespace data_validation 38 | } // namespace tensorflow 39 | 40 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_ 41 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/internal_types.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_INTERNAL_TYPES_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_INTERNAL_TYPES_H_ 18 | 19 | #include 20 | #include 21 | 22 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h" 23 | 24 | namespace tensorflow { 25 | namespace data_validation { 26 | 27 | using std::string; 28 | 29 | // Represents the description of an anomaly, in short and long form.
30 | struct Description { 31 | tensorflow::metadata::v0::AnomalyInfo::Type type; 32 | string short_description, long_description; 33 | 34 | friend bool operator==(const Description& a, const Description& b) { 35 | return (a.type == b.type && a.short_description == b.short_description && 36 | a.long_description == b.long_description); 37 | } 38 | 39 | friend std::ostream& operator<<(std::ostream& strm, const Description& a) { 40 | return (strm << "{" << a.type << ", " << a.short_description << ", " << 41 | a.long_description << "}"); 42 | } 43 | }; 44 | 45 | // UpdateSummary for a field. 46 | struct UpdateSummary { 47 | // Clear the field in question. If this is a ``shared'' enum, 48 | // then the field is dropped. 49 | UpdateSummary() { clear_field = false; } 50 | bool clear_field; 51 | std::vector descriptions; 52 | }; 53 | 54 | // Enum for comparators used in feature-level comparisons. 55 | enum class FeatureComparatorType { 56 | SKEW, // Compares serving and training data. 57 | DRIFT // Compares previous and current spans. 58 | }; 59 | // Enum for comparators used in dataset-level comparisons. 60 | enum class DatasetComparatorType { 61 | DRIFT, // Compares previous and current spans. 62 | VERSION // Compares previous and current versions. 63 | }; 64 | 65 | // The context for a tensorflow::metadata::v0::FeatureComparator. 66 | // In tensorflow::metadata::v0::Feature, there are two comparisons: 67 | // skew_comparator (that compares serving and training) and 68 | // drift_comparator (that compares previous and current). This struct 69 | // allows us to annotate the objects based upon this information. 
70 | struct ComparatorContext { 71 | string control_name; 72 | string treatment_name; 73 | }; 74 | 75 | } // namespace data_validation 76 | } // namespace tensorflow 77 | 78 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_INTERNAL_TYPES_H_ 79 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/map_util.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/map_util.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | 25 | namespace tensorflow { 26 | namespace data_validation { 27 | using std::string; 28 | 29 | double SumValues(const std::map& input) { 30 | std::vector values = GetValuesFromMap(input); 31 | return std::accumulate(values.begin(), values.end(), 0.0); 32 | } 33 | 34 | std::vector GetValuesFromMap(const std::map& input) { 35 | std::vector values; 36 | values.reserve(input.size()); 37 | for (const auto& pair : input) { 38 | values.push_back(pair.second); 39 | } 40 | return values; 41 | } 42 | 43 | std::vector GetKeysFromMap(const std::map& input) { 44 | std::vector keys; 45 | keys.reserve(input.size()); 46 | for (const auto& pair : input) { 47 | keys.push_back(pair.first); 48 | } 49 | return keys; 50 | } 51 | 52 | std::map ScaleBy(const std::map& input, 53 | double scale) { 54 | if (scale == 0) return input; 55 | std::map result; 56 | for (const auto& pair : input) { 57 | const string& key = pair.first; 58 | const double value = pair.second; 59 | result[key] = value / scale; 60 | } 61 | return result; 62 | } 63 | 64 | std::map Normalize(const std::map& input) { 65 | double sum = SumValues(input); 66 | if (sum == 0.0) { 67 | return input; 68 | } 69 | return ScaleBy(input, sum); 70 | } 71 | 72 | std::map GetDifference(const std::map& a, 73 | const std::map& b) { 74 | std::map result = a; 75 | for (const auto& pair_b : b) { 76 | const string& key_b = pair_b.first; 77 | const double value_b = pair_b.second; 78 | // If the key is not present, this will initialize it to zero. 
79 | result[key_b] -= value_b; 80 | } 81 | return result; 82 | } 83 | 84 | void IncrementMap(const std::map& a, 85 | std::map* b) { 86 | for (const auto& pair_a : a) { 87 | const string& key_a = pair_a.first; 88 | const double value_a = pair_a.second; 89 | // If the key is not present, this will initialize it to zero. 90 | (*b)[key_a] += value_a; 91 | } 92 | } 93 | 94 | std::map GetSum(const std::map& a, 95 | const std::map& b) { 96 | std::map result = a; 97 | IncrementMap(b, &result); 98 | return result; 99 | } 100 | 101 | std::map MapValues(const std::map& input, 102 | const std::function& mapFn) { 103 | std::map result; 104 | for (const auto& pair : input) { 105 | result[pair.first] = mapFn(pair.second); 106 | } 107 | return result; 108 | } 109 | 110 | std::map IntMapToDoubleMap( 111 | const std::map& int_map) { 112 | std::map result; 113 | for (const auto& pair : int_map) { 114 | result[pair.first] = pair.second; 115 | } 116 | return result; 117 | } 118 | 119 | } // namespace data_validation 120 | } // namespace tensorflow 121 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/map_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_MAP_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_MAP_UTIL_H_ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | 25 | namespace tensorflow { 26 | namespace data_validation { 27 | using std::string; 28 | 29 | // Returns true if and only if the given container contains the given key. 30 | template 31 | bool ContainsKey(const Container& container, const Key& key) { 32 | return container.find(key) != container.end(); 33 | } 34 | 35 | // Returns the sum of the values in the map. 36 | double SumValues(const std::map& input); 37 | 38 | // Gets the keys from the map. The order of the keys is the same as in 39 | // the map. 40 | std::vector GetKeysFromMap(const std::map& input); 41 | 42 | // Gets the values from the map. The order of the values is the same as in 43 | // the map. 44 | std::vector GetValuesFromMap(const std::map& input); 45 | 46 | // Normalizes the values, such that the sum of the values is 1. 47 | // If the values sum to zero, returns the input map unchanged. 48 | std::map Normalize(const std::map& input); 49 | 50 | // Divides every value in the map by scale. Returns input unmodified if scale is 51 | // zero. 52 | std::map ScaleBy(const std::map& input, 53 | double scale); 54 | 55 | // Gets the difference (a - b) of the values of two maps. Values that are not 56 | // present are treated as zero. 57 | std::map GetDifference(const std::map& a, 58 | const std::map& b); 59 | 60 | // Gets the sum of the values of two maps. Values that are not 61 | // present are treated as zero. 62 | std::map GetSum(const std::map& a, 63 | const std::map& b); 64 | 65 | // Increments map b in place by the values of map a. Values that are not 66 | // present are treated as zero. 67 | void IncrementMap(const std::map& a, 68 | std::map* b); 69 | 70 | // Applies a function to all the values in the map.
71 | std::map MapValues(const std::map& input, 72 | const std::function& mapFn); 73 | 74 | // Cast the values from int64 to double. Notice that this might lose some 75 | // information. 76 | std::map IntMapToDoubleMap( 77 | const std::map& int_map); 78 | 79 | } // namespace data_validation 80 | } // namespace tensorflow 81 | 82 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_MAP_UTIL_H_ 83 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/metrics.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_METRICS_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_METRICS_H_ 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 24 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 25 | #include "tensorflow_metadata/proto/v0/statistics.pb.h" 26 | 27 | namespace tensorflow { 28 | namespace data_validation { 29 | 30 | // Computes the L-infinity distance between the (weighted) histograms of the 31 | // features. 32 | // Only takes into account how many times the feature are present, 33 | // and scales the histograms so that they sum to 1. 
34 | // The first value returned is the element with highest deviation, and 35 | // the second value returned is the L infinity distance itself. 36 | std::pair LInftyDistance(const FeatureStatsView& a, 37 | const FeatureStatsView& b); 38 | 39 | // Defines how rank histograms are normalized for count comparison. 40 | // kSeparateTotal: Normalizes each rank histogram by its sum. 41 | // kCombinedTotal: Normalizes both rank histogram by their combined sum. 42 | enum class NormalizationMode { kSeparateTotal, kCombinedTotal }; 43 | 44 | // Returns the element with the largest difference in normalized count between 45 | // two rank histograms, as well as the absolute difference. 46 | std::pair MaxNormalizedDifference( 47 | const std::map& counts_a, 48 | const std::map& counts_b, 49 | NormalizationMode normalization_mode); 50 | 51 | // Computes the NormalizedAbsoluteDifference (see schema.proto) between the 52 | // (weighted) histograms of features. 53 | // The first value returned is the element with highest deviation, and 54 | // the second value returned is the distance itself. 55 | std::pair NormalizedAbsoluteDifference( 56 | const FeatureStatsView& a, const FeatureStatsView& b); 57 | 58 | // Computes the approximate Jensen-Shannon divergence 59 | // (https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence) between the 60 | // (weighted) histograms of the features. 
61 | absl::Status JensenShannonDivergence( 62 | const FeatureStatsView& a, const FeatureStatsView& b, 63 | const tensorflow::metadata::v0::HistogramSelection& source, double& result); 64 | 65 | absl::Status JensenShannonDivergence( 66 | ::tensorflow::metadata::v0::Histogram& histogram_1, 67 | ::tensorflow::metadata::v0::Histogram& histogram_2, double& result); 68 | 69 | } // namespace data_validation 70 | } // namespace tensorflow 71 | 72 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_METRICS_H_ 73 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/natural_language_domain_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2021 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_NATURAL_LANGUAGE_DOMAIN_UTIL_H_ 17 | #define THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_NATURAL_LANGUAGE_DOMAIN_UTIL_H_ 18 | 19 | #include 20 | 21 | #include "tensorflow_data_validation/anomalies/internal_types.h" 22 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 23 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 24 | 25 | namespace tensorflow { 26 | namespace data_validation { 27 | 28 | // This updates natural language domain. 
Should only be called if natural 29 | // language domain is set. 30 | std::vector UpdateNaturalLanguageDomain( 31 | const FeatureStatsView& feature_stats, 32 | tensorflow::metadata::v0::Feature* feature); 33 | 34 | } // namespace data_validation 35 | } // namespace tensorflow 36 | 37 | #endif // THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_NATURAL_LANGUAGE_DOMAIN_UTIL_H_ 38 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/proto/BUILD: -------------------------------------------------------------------------------- 1 | load("//tensorflow_data_validation:data_validation.bzl", "tfdv_proto_library", "tfdv_proto_library_py") 2 | 3 | package( 4 | default_visibility = ["//tensorflow_data_validation:__subpackages__"], 5 | ) 6 | 7 | licenses(["notice"]) 8 | 9 | tfdv_proto_library( 10 | name = "feature_statistics_to_proto_proto", 11 | srcs = ["feature_statistics_to_proto.proto"], 12 | deps = [ 13 | ":validation_config_proto", 14 | "@com_github_tensorflow_metadata//tensorflow_metadata/proto/v0:metadata_v0_proto", 15 | ], 16 | ) 17 | 18 | tfdv_proto_library( 19 | name = "validation_config_proto", 20 | srcs = ["validation_config.proto"], 21 | deps = ["@com_github_tensorflow_metadata//tensorflow_metadata/proto/v0:metadata_v0_proto"], 22 | ) 23 | 24 | tfdv_proto_library_py( 25 | name = "validation_config_proto_py_pb2", 26 | deps = [":validation_config_proto"], 27 | ) 28 | 29 | tfdv_proto_library( 30 | name = "validation_metadata_proto", 31 | srcs = ["validation_metadata.proto"], 32 | deps = ["@com_github_tensorflow_metadata//tensorflow_metadata/proto/v0:metadata_v0_proto"], 33 | ) 34 | 35 | tfdv_proto_library_py( 36 | name = "validation_metadata_proto_py_pb2", 37 | deps = [":validation_metadata_proto"], 38 | ) 39 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/proto/__init__.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/proto/feature_statistics_to_proto.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | syntax = "proto2"; 17 | 18 | package tensorflow.data_validation; 19 | 20 | import "tensorflow_data_validation/anomalies/proto/validation_config.proto"; 21 | import "tensorflow_metadata/proto/v0/path.proto"; 22 | 23 | // Manual constraints on the automatic generation of a schema. 
24 | message ColumnConstraint { 25 | // A column constraint can apply to multiple columns. 26 | repeated tensorflow.metadata.v0.Path column_path = 3; 27 | // The name of the enum representing all the columns, if present. 28 | optional string enum_name = 2; 29 | } 30 | 31 | // See EnumType::IsSimilar(...). 32 | message EnumsSimilarConfig { 33 | // Equal or below this count, two enums must be identical to be considered 34 | // "similar". 35 | optional int32 min_count = 1 [default = 10]; 36 | // Jaccard similarity is the ratio of the intersection to the union. 37 | // The enum types are viewed as sets, then two enums are similar if they both 38 | // have more than min_similar_count elements and a Jaccard similarity higher 39 | // than min_jaccard_similarity. 40 | optional double min_jaccard_similarity = 2 [default = 0.5]; 41 | } 42 | 43 | // Configuration for creating the first version of a schema or a new field 44 | // within a schema during validation. 45 | message FeatureStatisticsToProtoConfig { 46 | // Deleted fields. 47 | reserved 3, 4; 48 | 49 | // If a string field has less than this number of entries, it will be 50 | // interpreted as an enum. 51 | optional int32 enum_threshold = 1; 52 | 53 | // If a StringDomain has at least this number of entries, it will be 54 | // deleted. 55 | optional int32 enum_delete_threshold = 8; 56 | 57 | optional EnumsSimilarConfig enums_similar_config = 2; 58 | // Constraints on various columns. 59 | repeated ColumnConstraint column_constraint = 5; 60 | // Ignore the following columns. 61 | repeated string column_to_ignore = 6; 62 | // Sets the severity of an anomaly which indicates a new feature. 63 | // Deprecated. Prefer severity_overrides. 64 | optional bool new_features_are_warnings = 7 [deprecated = true]; 65 | // Overrides for the severity of different anomalies. If not specified, the 66 | // default severities are used. 
67 | repeated SeverityOverride severity_overrides = 9; 68 | 69 | // If true, Feature.shape will be inferred from stats (if feasible). 70 | // Note that existing Feature.shape field in the schema will always be 71 | // validated despite this flag. 72 | optional bool infer_feature_shape = 10; 73 | } 74 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/proto/validation_config.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | syntax = "proto2"; 17 | 18 | package tensorflow.data_validation; 19 | 20 | import "tensorflow_metadata/proto/v0/anomalies.proto"; 21 | 22 | // Configuration for example statistics validation. 23 | message ValidationConfig { 24 | // If true then validation will mark new features (i.e., those that are not 25 | // covered in the schema) as warnings instead of errors. The distinction is 26 | // that warnings do not cause alerts to fire. 27 | // Deprecated. Prefer severity_overrides. 28 | optional bool new_features_are_warnings = 1 [deprecated = true]; 29 | 30 | // Overrides for the severity of different anomalies. If not specified, the 31 | // default severities are used. 
Note: if multiple anomaly types are observed, 32 | // the maximum severity takes precedence for the overall severity. 33 | repeated SeverityOverride severity_overrides = 2; 34 | } 35 | 36 | message SeverityOverride { 37 | optional metadata.v0.AnomalyInfo.Type type = 1; 38 | optional metadata.v0.AnomalyInfo.Severity severity = 2; 39 | } 40 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/proto/validation_metadata.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | syntax = "proto3"; 17 | 18 | import "tensorflow_metadata/proto/v0/path.proto"; 19 | 20 | package tensorflow.data_validation; 21 | 22 | // TODO(b/148429669): consider adding an environment. 23 | // TODO(b/148429669): consider adding a LifecycleStage. 24 | message ReasonFeatureNeeded { 25 | // If there is an issue in creating the field, the comment should help 26 | // explain why. 
27 | // Example: "This is needed for transform XYZ (see /A/B/C)" 28 | string comment = 1; 29 | } 30 | 31 | message PathAndReasonFeatureNeeded { 32 | tensorflow.metadata.v0.Path path = 1; 33 | repeated ReasonFeatureNeeded reason_feature_needed = 2; 34 | } 35 | 36 | // A proto representation of tensorflow::data_validation::FeaturesNeeded 37 | // used for serialization and deserialization. 38 | message FeaturesNeededProto { 39 | repeated PathAndReasonFeatureNeeded path_and_reason_feature_need = 2; 40 | reserved 1; 41 | } 42 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/schema_util.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/schema_util.h" 17 | 18 | #include "absl/base/log_severity.h" 19 | #include "absl/log/log.h" 20 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h" 21 | 22 | namespace tensorflow { 23 | namespace data_validation { 24 | 25 | namespace { 26 | int NumericalSeverity(tensorflow::metadata::v0::AnomalyInfo::Severity a) { 27 | switch (a) { 28 | case tensorflow::metadata::v0::AnomalyInfo::UNKNOWN: 29 | return 0; 30 | case tensorflow::metadata::v0::AnomalyInfo::WARNING: 31 | return 1; 32 | case tensorflow::metadata::v0::AnomalyInfo::ERROR: 33 | return 2; 34 | default: 35 | LOG(FATAL) << "Unknown severity: " << a; 36 | } 37 | } 38 | } // namespace 39 | // For internal use only. 40 | tensorflow::metadata::v0::AnomalyInfo::Severity MaxSeverity( 41 | tensorflow::metadata::v0::AnomalyInfo::Severity a, 42 | tensorflow::metadata::v0::AnomalyInfo::Severity b) { 43 | return (NumericalSeverity(a) > NumericalSeverity(b)) ? a : b; 44 | } 45 | 46 | } // namespace data_validation 47 | } // namespace tensorflow 48 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/schema_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_SCHEMA_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_SCHEMA_UTIL_H_ 18 | 19 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h" 20 | 21 | namespace tensorflow { 22 | namespace data_validation { 23 | 24 | // Returns the maximum (more serious) severity. 25 | tensorflow::metadata::v0::AnomalyInfo::Severity MaxSeverity( 26 | tensorflow::metadata::v0::AnomalyInfo::Severity a, 27 | tensorflow::metadata::v0::AnomalyInfo::Severity b); 28 | 29 | } // namespace data_validation 30 | } // namespace tensorflow 31 | 32 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_SCHEMA_UTIL_H_ 33 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/schema_util_test.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/schema_util.h" 17 | 18 | #include 19 | #include 20 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h" 21 | 22 | namespace tensorflow { 23 | namespace data_validation { 24 | 25 | namespace { 26 | using tensorflow::metadata::v0::AnomalyInfo; 27 | 28 | // Since this method only has nine possible inputs, it is easiest to test them 29 | // all directly. 30 | TEST(MaxSeverity, MaxSeverity) { 31 | EXPECT_EQ(AnomalyInfo::UNKNOWN, 32 | MaxSeverity(AnomalyInfo::UNKNOWN, AnomalyInfo::UNKNOWN)); 33 | EXPECT_EQ(AnomalyInfo::WARNING, 34 | MaxSeverity(AnomalyInfo::UNKNOWN, AnomalyInfo::WARNING)); 35 | EXPECT_EQ(AnomalyInfo::ERROR, 36 | MaxSeverity(AnomalyInfo::UNKNOWN, AnomalyInfo::ERROR)); 37 | EXPECT_EQ(AnomalyInfo::WARNING, 38 | MaxSeverity(AnomalyInfo::WARNING, AnomalyInfo::UNKNOWN)); 39 | EXPECT_EQ(AnomalyInfo::WARNING, 40 | MaxSeverity(AnomalyInfo::WARNING, AnomalyInfo::WARNING)); 41 | EXPECT_EQ(AnomalyInfo::ERROR, 42 | MaxSeverity(AnomalyInfo::WARNING, AnomalyInfo::ERROR)); 43 | EXPECT_EQ(AnomalyInfo::ERROR, 44 | MaxSeverity(AnomalyInfo::ERROR, AnomalyInfo::UNKNOWN)); 45 | EXPECT_EQ(AnomalyInfo::ERROR, 46 | MaxSeverity(AnomalyInfo::ERROR, AnomalyInfo::WARNING)); 47 | EXPECT_EQ(AnomalyInfo::ERROR, 48 | MaxSeverity(AnomalyInfo::ERROR, AnomalyInfo::ERROR)); 49 | } 50 | 51 | } // namespace 52 | 53 | } // namespace data_validation 54 | } // namespace tensorflow 55 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/statistics_view_test_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_STATISTICS_VIEW_TEST_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_STATISTICS_VIEW_TEST_UTIL_H_ 18 | 19 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 20 | #include "tensorflow_metadata/proto/v0/statistics.pb.h" 21 | 22 | namespace tensorflow { 23 | namespace data_validation { 24 | namespace testing { 25 | 26 | // Makes a dataset with one feature. Assumes global counts match the 27 | // count for the feature. 28 | tensorflow::metadata::v0::DatasetFeatureStatistics 29 | GetDatasetFeatureStatisticsForTesting( 30 | const tensorflow::metadata::v0::FeatureNameStatistics& feature_name_stats); 31 | 32 | // For testing, we often just have information for one feature. 33 | // However, DatasetStatsView and FeatureStatsView point to other objects. 34 | // This structure allows us to set all that up in one call. 35 | // Here is a pattern: 36 | // FuncToTest(DatasetForTesting(stats).feature_stats_view()) 37 | // Here is an anti-pattern. It will make the resulting object point to a 38 | // destroyed object (very bad). 
39 | // const FeatureStatsView& MyShortcut( 40 | // const tensorflow::metadata::v0::FeatureNameStatistics& stats) { 41 | // return DatasetForTesting(stats).feature_stats_view(); 42 | // } 43 | class DatasetForTesting { 44 | public: 45 | explicit DatasetForTesting( 46 | const tensorflow::metadata::v0::FeatureNameStatistics& 47 | feature_name_stats); 48 | DatasetForTesting( 49 | const tensorflow::metadata::v0::FeatureNameStatistics& feature_name_stats, 50 | bool by_weight); 51 | 52 | DatasetForTesting(const tensorflow::metadata::v0::DatasetFeatureStatistics& 53 | dataset_feature_stats, 54 | bool by_weight); 55 | 56 | // DatasetForTesting is neither copyable nor movable, as DatasetStatsView 57 | // is neither copyable nor movable. 58 | DatasetForTesting(const DatasetForTesting&) = delete; 59 | DatasetForTesting& operator=(const DatasetForTesting&) = delete; 60 | 61 | const DatasetStatsView& dataset_stats_view() const { 62 | return dataset_stats_view_; 63 | } 64 | 65 | const FeatureStatsView& feature_stats_view() const { 66 | return feature_stats_view_; 67 | } 68 | 69 | private: 70 | // Notice that the destructor will destroy the objects from bottom to top, 71 | // respecting the proper order of destruction. 
72 | const tensorflow::metadata::v0::DatasetFeatureStatistics 73 | dataset_feature_statistics_; 74 | const DatasetStatsView dataset_stats_view_; 75 | const FeatureStatsView feature_stats_view_; 76 | }; 77 | 78 | DatasetForTesting GetDatasetForTesting( 79 | const tensorflow::metadata::v0::FeatureNameStatistics& feature_name_stats); 80 | 81 | tensorflow::metadata::v0::FeatureNameStatistics AddWeightedStats( 82 | const tensorflow::metadata::v0::FeatureNameStatistics& original); 83 | 84 | } // namespace testing 85 | } // namespace data_validation 86 | } // namespace tensorflow 87 | 88 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_STATISTICS_VIEW_TEST_UTIL_H_ 89 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/status_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_STATUS_UTIL_H_ 17 | #define THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_STATUS_UTIL_H_ 18 | 19 | #include "absl/base/log_severity.h" 20 | #include "absl/base/optimization.h" 21 | #include "absl/log/log.h" 22 | 23 | namespace tensorflow { 24 | namespace data_validation { 25 | 26 | // For propagating errors when calling a function. 
27 | #define TFDV_RETURN_IF_ERROR(...) \ 28 | do { \ 29 | const absl::Status _status = (__VA_ARGS__); \ 30 | if (ABSL_PREDICT_FALSE(!_status.ok())) return _status; \ 31 | } while (0) 32 | 33 | #ifndef CHECK_NOTNULL 34 | template 35 | T&& CheckNotNull(const char* file, int line, const char* exprtext, T&& t) { 36 | if (t == nullptr) { 37 | LOG(FATAL).AtLocation(file, line) 38 | << std::string(exprtext); 39 | } 40 | return std::forward(t); 41 | } 42 | 43 | #define CHECK_NOTNULL(val) \ 44 | ::tensorflow::data_validation::CheckNotNull( \ 45 | __FILE__, __LINE__, "'" #val "' Must be non NULL", (val)) 46 | 47 | #endif // CHECK_NOTNULL 48 | 49 | } // namespace data_validation 50 | } // namespace tensorflow 51 | 52 | #endif // THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_STATUS_UTIL_H_ 53 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/string_domain_util.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_STRING_DOMAIN_UTIL_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_STRING_DOMAIN_UTIL_H_ 18 | 19 | #include 20 | 21 | #include "tensorflow_data_validation/anomalies/internal_types.h" 22 | #include "tensorflow_data_validation/anomalies/proto/feature_statistics_to_proto.pb.h" 23 | #include "tensorflow_data_validation/anomalies/schema.h" 24 | #include "tensorflow_data_validation/anomalies/statistics_view.h" 25 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 26 | 27 | namespace tensorflow { 28 | namespace data_validation { 29 | 30 | // True if two domains are similar. If they are "small" according to the 31 | // config.min_count, then they must be identical. Otherwise, they must 32 | // have a large jaccard similarity. 33 | bool IsSimilarStringDomain(const tensorflow::metadata::v0::StringDomain& a, 34 | const tensorflow::metadata::v0::StringDomain& b, 35 | const EnumsSimilarConfig& config); 36 | 37 | // Returns true if this feature_stats has less than enum_threshold number of 38 | // unique string values. 39 | bool IsStringDomainCandidate(const FeatureStatsView& feature_stats, 40 | const int enum_threshold); 41 | 42 | 43 | // If there are any values that are repeated, remove them. 44 | std::vector UpdateStringDomainSelf( 45 | tensorflow::metadata::v0::StringDomain* string_domain); 46 | 47 | // Update a string domain. 48 | // updater: configuration used to determine if the string domain needs to be 49 | // deleted. 50 | // stats: the statistics of the string domain. 51 | // max_off_domain: the maximum fraction of mass allowed to be off the domain. 52 | // string_domain: string_domain to be modified. 
53 | UpdateSummary UpdateStringDomain( 54 | const Schema::Updater& updater, 55 | const FeatureStatsView& stats, double max_off_domain, 56 | tensorflow::metadata::v0::StringDomain* string_domain); 57 | 58 | } // namespace data_validation 59 | } // namespace tensorflow 60 | 61 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_STRING_DOMAIN_UTIL_H_ 62 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/telemetry.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/telemetry.h" 17 | 18 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h" 19 | 20 | 21 | namespace tensorflow { 22 | namespace data_validation { 23 | 24 | void UpdateTelemetry(const metadata::v0::Anomalies& result) {} 25 | 26 | } // namespace data_validation 27 | } // namespace tensorflow 28 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/telemetry.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_TELEMETRY_H_ 17 | #define THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_TELEMETRY_H_ 18 | 19 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h" 20 | 21 | namespace tensorflow { 22 | namespace data_validation { 23 | 24 | void UpdateTelemetry(const metadata::v0::Anomalies& result); 25 | 26 | } // namespace data_validation 27 | } // namespace tensorflow 28 | 29 | #endif // THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_TELEMETRY_H_ 30 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/test_schema_protos.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_TEST_SCHEMA_PROTOS_H_ 17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_TEST_SCHEMA_PROTOS_H_ 18 | 19 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 20 | 21 | namespace tensorflow { 22 | namespace data_validation { 23 | namespace testing { 24 | 25 | tensorflow::metadata::v0::Schema GetTestAllTypesMessage(); 26 | tensorflow::metadata::v0::Schema GetAnnotatedFieldsMessage(); 27 | tensorflow::metadata::v0::Schema GetTestSchemaAlone(); 28 | 29 | } // namespace testing 30 | } // namespace data_validation 31 | } // namespace tensorflow 32 | 33 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_TEST_SCHEMA_PROTOS_H_ 34 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/test_util.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/test_util.h" 17 | 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include "absl/strings/str_cat.h" 26 | #include "tensorflow_data_validation/anomalies/map_util.h" 27 | #include "tensorflow_data_validation/anomalies/path.h" 28 | #include "tensorflow/tsl/platform/protobuf.h" 29 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h" 30 | #include "tensorflow_metadata/proto/v0/schema.pb.h" 31 | 32 | namespace tensorflow { 33 | namespace data_validation { 34 | namespace testing { 35 | using std::vector; 36 | 37 | ProtoStringMatcher::ProtoStringMatcher(const string& expected) 38 | : expected_(expected) {} 39 | ProtoStringMatcher::ProtoStringMatcher(const google::protobuf::Message& expected) 40 | : expected_([&]() -> std::string { 41 | std::string result; 42 | tsl::protobuf::TextFormat::PrintToString(expected, &result); 43 | return result; 44 | }()) {} 45 | 46 | void TestAnomalies( 47 | const tensorflow::metadata::v0::Anomalies& actual, 48 | const tensorflow::metadata::v0::Schema& old_schema, 49 | const std::map& expected_anomalies, 50 | const std::vector& 51 | expected_drift_skew_infos) { 52 | EXPECT_THAT(actual.baseline(), EqualsProto(old_schema)); 53 | 54 | for (const auto& pair : expected_anomalies) { 55 | const string& name = pair.first; 56 | const ExpectedAnomalyInfo& expected = pair.second; 57 | ASSERT_TRUE(ContainsKey(actual.anomaly_info(), name)) 58 | << "Expected anomaly for feature name: " << name 59 | << " not found in Anomalies: " << actual.DebugString(); 60 | TestAnomalyInfo(actual.anomaly_info().at(name), expected, 61 | absl::StrCat(" column: ", name)); 62 | } 63 | for (const auto& pair : actual.anomaly_info()) { 64 | const string& name = pair.first; 65 | metadata::v0::AnomalyInfo simple_anomaly_info = pair.second; 66 | EXPECT_TRUE(ContainsKey(expected_anomalies, name)) 67 | << 
"Unexpected anomaly: " << name << " " 68 | << simple_anomaly_info.DebugString(); 69 | } 70 | std::map 71 | path_to_expected_drift_skew_info; 72 | for (const auto& drift_skew_info : expected_drift_skew_infos) { 73 | path_to_expected_drift_skew_info[Path(drift_skew_info.path())] = 74 | drift_skew_info; 75 | } 76 | EXPECT_EQ(path_to_expected_drift_skew_info.size(), 77 | actual.drift_skew_info_size()) 78 | << actual.DebugString(); 79 | for (const auto& actual_drift_skew_info : actual.drift_skew_info()) { 80 | const Path path(actual_drift_skew_info.path()); 81 | ASSERT_TRUE(ContainsKey(path_to_expected_drift_skew_info, path)); 82 | EXPECT_THAT(actual_drift_skew_info, 83 | EqualsProto(path_to_expected_drift_skew_info.at(path))); 84 | } 85 | } 86 | 87 | void TestAnomalyInfo(const tensorflow::metadata::v0::AnomalyInfo& actual, 88 | const ExpectedAnomalyInfo& expected, 89 | const string& comment) { 90 | // It is expected that diff_regions will not be populated in unit tests; such 91 | // regions will not be checked. 92 | ASSERT_TRUE(actual.diff_regions().empty()); 93 | EXPECT_THAT(actual, EqualsProto(expected.expected_info_without_diff)) 94 | << comment; 95 | } 96 | 97 | } // namespace testing 98 | } // namespace data_validation 99 | } // namespace tensorflow 100 | -------------------------------------------------------------------------------- /tensorflow_data_validation/anomalies/test_util_test.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Google LLC 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | https://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #include "tensorflow_data_validation/anomalies/test_util.h" 17 | 18 | #include 19 | #include 20 | 21 | namespace tensorflow { 22 | namespace data_validation { 23 | namespace testing { 24 | namespace { 25 | 26 | TEST(TestAnomalies, Basic) { 27 | const tensorflow::metadata::v0::Schema original = 28 | ParseTextProtoOrDie(R"( 29 | feature { 30 | name: "feature_name" 31 | type: INT 32 | skew_comparator: { infinity_norm: { threshold: 0.1 } } 33 | })"); 34 | 35 | tensorflow::metadata::v0::Anomalies result; 36 | *result.mutable_baseline() = original; 37 | TestAnomalies(result, original, std::map()); 38 | } 39 | 40 | } // namespace 41 | } // namespace testing 42 | } // namespace data_validation 43 | } // namespace tensorflow 44 | -------------------------------------------------------------------------------- /tensorflow_data_validation/api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tensorflow_data_validation/api/validation_options.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Validation options.""" 16 | 17 | from typing import List, Mapping, Optional 18 | 19 | # TODO(https://issues.apache.org/jira/browse/SPARK-22674): Switch to 20 | # `collections.namedtuple` or `typing.NamedTuple` once the Spark issue is 21 | # resolved. 
class ReasonFeatureNeeded(
    tfx_namedtuple.namedtuple("ReasonFeatureNeeded", ["comment"])
):
    """A named tuple to indicate why a feature is needed for struct2tensor."""

    def __new__(cls, comment: str):
        return super(ReasonFeatureNeeded, cls).__new__(cls, comment=comment)


class ValidationOptions:
    """Options for example validation.

    The options are fixed at construction time and exposed through read-only
    properties.
    """

    def __init__(
        self,
        features_needed: Optional[
            Mapping[FeaturePath, List[ReasonFeatureNeeded]]
        ] = None,
        new_features_are_warnings: Optional[bool] = False,
        severity_overrides: Optional[
            List[validation_config_pb2.SeverityOverride]
        ] = None,
    ):
        """Initializes the options.

        Args:
        ----
          features_needed: optional mapping from a feature path to the reasons
            that feature is needed. Stored as given; may be None.
          new_features_are_warnings: flag exposed through the property of the
            same name. None is accepted for backward compatibility and is
            treated as False.
          severity_overrides: optional list of SeverityOverride protos. None
            (or an empty list) results in a new empty list.
        """
        self._features_needed = features_needed
        # Normalise once here so the property really returns a bool, as its
        # annotation promises, even when the caller passed None.
        self._new_features_are_warnings = bool(new_features_are_warnings)
        self._severity_overrides = severity_overrides or []

    @property
    def features_needed(
        self,
    ) -> Optional[Mapping[FeaturePath, List[ReasonFeatureNeeded]]]:
        """The `features_needed` mapping passed to the constructor, or None."""
        return self._features_needed

    @property
    def new_features_are_warnings(self) -> bool:
        """The `new_features_are_warnings` flag; always a bool."""
        return self._new_features_are_warnings

    @property
    def severity_overrides(self) -> List[validation_config_pb2.SeverityOverride]:
        """The severity overrides; an empty list when none were given."""
        return self._severity_overrides
class ValidationOptionsTest(absltest.TestCase):
    """Checks that ValidationOptions returns what it was constructed with."""

    def test_access_attributes(self):
        reasons = [
            validation_options.ReasonFeatureNeeded(comment="reason1"),
            validation_options.ReasonFeatureNeeded(comment="reason2"),
        ]
        expected_features_needed = {FeaturePath(["a", "b"]): reasons}
        expected_new_features_are_warnings = True
        expected_severity_overrides = []

        # Arguments are passed positionally on purpose: the order is part of
        # the constructor's interface.
        options = validation_options.ValidationOptions(
            expected_features_needed,
            expected_new_features_are_warnings,
            expected_severity_overrides,
        )

        # Test getters
        self.assertEqual(expected_features_needed, options.features_needed)
        self.assertEqual(
            expected_new_features_are_warnings, options.new_features_are_warnings
        )
        self.assertEqual(expected_severity_overrides, options.severity_overrides)


if __name__ == "__main__":
    absltest.main()
def DecodedExamplesToRecordBatch(
    decoded_examples: List[types.LegacyExample],
) -> pa.RecordBatch:
    """Builds an Arrow RecordBatch out of legacy examples given as dicts.

    The batch has one row per example and one column per distinct feature seen
    across the examples. Every column is either a ListArray or a NullArray.

    Handling of None and of missing features:
      - a feature whose value is None in an example yields a null at that
        example's position in the feature's column.
      - a feature whose value is None in every example yields a NullArray
        column.
      - a feature that an example does not contain at all (but some other
        example does) yields a null at that example's position.

    Args:
    ----
      decoded_examples: a list of LegacyExamples.

    Returns:
    -------
      a pa.RecordBatch.

    Raises:
    ------
      ValueError: when the conversion fails.
      TypeError: when some of the output columns are not of supported types.
    """
    if not decoded_examples:
        return pa.RecordBatch.from_arrays([], [])

    # Arrow infers a struct array from the list of dicts: one struct field per
    # feature.
    examples_as_struct = pa.array(decoded_examples)
    if not pa.types.is_struct(examples_as_struct.type):
        raise ValueError("Unexpected Arrow type created from input")

    column_names = [field.name for field in examples_as_struct.type]
    if not column_names:
        # Examples were given, but none of them has any feature.
        return _GetEmptyRecordBatch(len(decoded_examples))

    columns = examples_as_struct.flatten()
    for name, array in zip(column_names, columns):
        # An all-null column is acceptable as is.
        if pa.types.is_null(array.type):
            continue
        if not array_util.is_list_like(array.type):
            raise TypeError(
                f"Expected list arrays for field {name} but got {array.type}"
            )
        element_type = array.type.value_type
        is_supported = (
            pa.types.is_integer(element_type)
            or pa.types.is_floating(element_type)
            or arrow_util.is_binary_like(element_type)
            or pa.types.is_null(element_type)
        )
        if not is_supported:
            raise TypeError(f"Type not supported: {name} {array.type}")

    return pa.RecordBatch.from_arrays(columns, column_names)


def _GetEmptyRecordBatch(num_rows: int) -> pa.RecordBatch:
    """Returns a RecordBatch that has `num_rows` rows and no columns."""
    assert num_rows > 0
    # pyarrow doesn't provide an API to create a record batch with zero column
    # but non zero rows. The workaround is a table with one throwaway null
    # column, which is dropped again before converting to batches.
    placeholder = pa.array([None] * num_rows, type=pa.null())
    table = pa.Table.from_arrays([placeholder], ["dummy"])
    batches = table.remove_column(0).to_batches()
    assert len(batches) == 1
    return batches[0]
17 | """ 18 | p = name.rfind("/") 19 | if p == -1: 20 | sname = name 21 | prefix = "" 22 | else: 23 | sname = name[p + 1:] 24 | prefix = name[:p + 1] 25 | so_file = "%s%s.so" % (prefix, sname) 26 | pyd_file = "%s%s.pyd" % (prefix, sname) 27 | exported_symbols = [ 28 | "init%s" % sname, 29 | "init_%s" % sname, 30 | "PyInit_%s" % sname, 31 | ] 32 | 33 | exported_symbols_file = "%s-exported-symbols.lds" % name 34 | version_script_file = "%s-version-script.lds" % name 35 | 36 | exported_symbols_output = "\n".join(["_%s" % symbol for symbol in exported_symbols]) 37 | version_script_output = "\n".join([" %s;" % symbol for symbol in exported_symbols]) 38 | 39 | native.genrule( 40 | name = name + "_exported_symbols", 41 | outs = [exported_symbols_file], 42 | cmd = "echo '%s' >$@" % exported_symbols_output, 43 | output_licenses = ["unencumbered"], 44 | visibility = ["//visibility:private"], 45 | ) 46 | 47 | native.genrule( 48 | name = name + "_version_script", 49 | outs = [version_script_file], 50 | cmd = "echo '{global:\n%s\n local: *;};' >$@" % version_script_output, 51 | output_licenses = ["unencumbered"], 52 | visibility = ["//visibility:private"], 53 | ) 54 | 55 | native.cc_binary( 56 | name = so_file, 57 | srcs = srcs, 58 | copts = [ 59 | "-fno-strict-aliasing", 60 | "-fexceptions", 61 | ] + select({ 62 | "//conditions:default": [ 63 | "-fvisibility=hidden", 64 | ], 65 | }), 66 | linkopts = select({ 67 | "//tensorflow_data_validation:macos": [ 68 | # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols 69 | # not being exported. There should be a better way to deal with this. 
70 | # "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", 71 | "-Wl,-w", 72 | "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, 73 | ], 74 | "//conditions:default": [ 75 | "-Wl,--version-script", 76 | "$(location %s)" % version_script_file, 77 | ], 78 | }), 79 | deps = deps + [ 80 | exported_symbols_file, 81 | version_script_file, 82 | ], 83 | features = ["-use_header_modules"], 84 | linkshared = 1, 85 | visibility = visibility, 86 | ) 87 | native.genrule( 88 | name = name + "_pyd_copy", 89 | srcs = [so_file], 90 | outs = [pyd_file], 91 | cmd = "cp $< $@", 92 | output_to_bindir = True, 93 | visibility = visibility, 94 | ) 95 | native.py_library( 96 | name = name, 97 | data = select({ 98 | "//conditions:default": [so_file], 99 | }), 100 | srcs_version = "PY3", 101 | visibility = visibility, 102 | ) 103 | -------------------------------------------------------------------------------- /tensorflow_data_validation/coders/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
@beam.typehints.with_input_types(str)
@beam.typehints.with_output_types(pa.RecordBatch)
class DecodeCSV(beam.PTransform):
    """Beam transform that turns CSV lines into Arrow RecordBatches.

    DEPRECATED: please use tfx_bsl.public.CsvTFXIO instead.

    All decoding is delegated to `tfx_bsl.coders.csv_decoder.CSVToRecordBatch`;
    this class only stores the configuration and forwards it.
    """

    def __init__(
        self,
        column_names: List[types.FeatureName],
        delimiter: str = ",",
        skip_blank_lines: bool = True,
        schema: Optional[schema_pb2.Schema] = None,
        desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE,
        multivalent_columns: Optional[List[types.FeatureName]] = None,
        secondary_delimiter: Optional[Union[str, bytes]] = None,
    ):
        """Stores the decoder configuration.

        Args:
          column_names: List of feature names, in the same order as the
            columns of the CSV file.
          delimiter: A one-character string used to separate fields.
          skip_blank_lines: Whether blank lines are skipped instead of being
            interpreted as missing values.
          schema: An optional schema of the input data. If provided, types
            are inferred from it and its feature names must equal
            `column_names`.
          desired_batch_size: Batch size. The output Arrow RecordBatches will
            have as many rows as the `desired_batch_size`.
          multivalent_columns: Names of columns that can contain multiple
            values.
          secondary_delimiter: Delimiter used for parsing multivalent columns.

        Raises:
          TypeError: If `column_names` is not a list.
        """
        # Reject anything that is not a list before storing any state.
        if not isinstance(column_names, list):
            actual_type = type(column_names).__name__
            raise TypeError(
                "column_names is of type %s, should be a list" % actual_type
            )

        # Settings describing how a single line is split into cells.
        self._delimiter = delimiter
        self._secondary_delimiter = secondary_delimiter
        self._skip_blank_lines = skip_blank_lines

        # Settings describing the columns and their types.
        self._column_names = column_names
        self._multivalent_columns = multivalent_columns
        self._schema = schema

        # Setting controlling the size of the emitted batches.
        self._desired_batch_size = desired_batch_size

    def expand(self, lines: beam.pvalue.PCollection):
        """Applies the CSV-to-RecordBatch conversion to `lines`.

        Args:
          lines: A PCollection of strings representing the lines in the CSV
            file.

        Returns:
          A PCollection of RecordBatches representing the CSV records.
        """
        to_record_batch = csv_decoder.CSVToRecordBatch(
            column_names=self._column_names,
            delimiter=self._delimiter,
            skip_blank_lines=self._skip_blank_lines,
            schema=self._schema,
            desired_batch_size=self._desired_batch_size,
            multivalent_columns=self._multivalent_columns,
            secondary_delimiter=self._secondary_delimiter,
        )
        return lines | "CSVToRecordBatch" >> to_record_batch
def tfdv_proto_library(name, **kwargs):
    """Declares a proto_library and a cc_proto_library that wraps it.

    Args:
      name: Name of the proto_library target. The cc_proto_library is named
        `name + "_cc_pb2"`.
      **kwargs: Attributes passed to the proto_library. The well-known
        protobuf types are appended to `deps`. `visibility`, `testonly` and
        `compatible_with` are also passed to the cc_proto_library when they
        are present.
    """

    # Every proto library gets the well-known types in addition to the deps
    # supplied by the caller.
    proto_deps = kwargs.pop("deps", []) + [
        "@com_google_protobuf//:any_proto",
        "@com_google_protobuf//:duration_proto",
        "@com_google_protobuf//:timestamp_proto",
        "@com_google_protobuf//:struct_proto",
        "@com_google_protobuf//:empty_proto",
        "@com_google_protobuf//:wrappers_proto",
    ]
    native.proto_library(name = name, deps = proto_deps, **kwargs)  # buildifier: disable=native-proto

    # The C++ library depends only on the proto_library declared above and
    # inherits a fixed set of attributes from it when the caller set them.
    cc_attrs = {"deps": [":" + name]}
    for attr in ["visibility", "testonly", "compatible_with"]:
        if attr in kwargs:
            cc_attrs[attr] = kwargs[attr]
    cc_proto_library(name = name + "_cc_pb2", **cc_attrs)
def get_js(
    array1: np.ndarray,
    array2: np.ndarray,
    hist_type: schema_pb2.HistogramSelection.Type,
    quantiles_buckets: int = 10,
) -> float:
    """Returns the drift measurement TFDV reports between two arrays.

    Each array becomes a single-column dataframe with a feature named "foo".
    A schema is inferred from the statistics of `array1`, a Jensen-Shannon
    divergence drift comparator with threshold 0 is attached to "foo", and the
    statistics of `array1` are validated against those of `array2` as the
    previous statistics.

    Args:
      array1: Values whose statistics are validated.
      array2: Values whose statistics are used as the previous statistics.
      hist_type: Histogram type the drift comparator takes as its source.
      quantiles_buckets: Value assigned to
        `StatsOptions.num_quantiles_histogram_buckets`.

    Returns:
      The value of the first drift measurement of the first drift/skew info
      entry in the resulting anomalies.
    """
    stats_options = tfdv.StatsOptions()
    stats_options.num_quantiles_histogram_buckets = quantiles_buckets

    def _stats_for(values):
        # Both arrays are wrapped under the same feature name so that their
        # statistics can be compared.
        frame = pd.DataFrame({"foo": values})
        return tfdv.generate_statistics_from_dataframe(
            frame, stats_options=stats_options
        )

    current_stats = _stats_for(array1)
    previous_stats = _stats_for(array2)

    schema = tfdv.infer_schema(current_stats)
    js_comparator = tfdv.get_feature(
        schema, "foo"
    ).drift_comparator.jensen_shannon_divergence
    js_comparator.threshold = 0
    js_comparator.source.type = hist_type

    anomalies = tfdv.validate_statistics(
        current_stats, schema, previous_statistics=previous_stats
    )
    first_info = anomalies.drift_skew_info[0]
    return first_info.drift_measurements[0].value
# Copies the bazel generated files needed for packaging the wheel into the
# source tree: the pywrap extension shared library from bazel-bin and the
# generated *_pb2.py proto modules from the current working directory.
#
# Requires BUILD_WORKSPACE_DIRECTORY to be set in the environment.
function tfdv::move_generated_files() {
  PYWRAP_TFDV="tensorflow_data_validation/pywrap/tensorflow_data_validation_extension.so"
  cp -f "${BUILD_WORKSPACE_DIRECTORY}/bazel-bin/${PYWRAP_TFDV}" \
    "${BUILD_WORKSPACE_DIRECTORY}/${PYWRAP_TFDV}"

  # If run by "bazel run", $(pwd) is the .runfiles dir that contains all the
  # data dependencies.
  RUNFILES_DIR="$(pwd)"

  # Every expansion below is quoted so that a workspace or runfiles path
  # containing spaces or glob characters is passed to cp as a single argument.
  cp -f "${RUNFILES_DIR}/tensorflow_data_validation/skew/protos/feature_skew_results_pb2.py" \
    "${BUILD_WORKSPACE_DIRECTORY}/tensorflow_data_validation/skew/protos"
  cp -f "${RUNFILES_DIR}/tensorflow_data_validation/anomalies/proto/validation_config_pb2.py" \
    "${BUILD_WORKSPACE_DIRECTORY}/tensorflow_data_validation/anomalies/proto"
  cp -f "${RUNFILES_DIR}/tensorflow_data_validation/anomalies/proto/validation_metadata_pb2.py" \
    "${BUILD_WORKSPACE_DIRECTORY}/tensorflow_data_validation/anomalies/proto"

  # Make the copied shared library writable.
  chmod +w "${BUILD_WORKSPACE_DIRECTORY}/${PYWRAP_TFDV}"
}
| ], 47 | ) 48 | -------------------------------------------------------------------------------- /tensorflow_data_validation/pywrap/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tensorflow_data_validation/pywrap/tensorflow_data_validation_extension.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Defines the tensorflow_data_validation extension module. We aim at having 16 | // only one extension module (i.e. dynamic shared library), therefore all the 17 | // TFDV C++ APIs must be added here. 
18 | // This C++ object has exception (-fexception) enabled (to work with 19 | // pybind11). -fexception may harm performance and increase the binary size, 20 | // therefore do not put any non-trivial logic here. 21 | 22 | #include "tensorflow_data_validation/pywrap/validation_submodule.h" 23 | #include "include/pybind11/pybind11.h" 24 | 25 | namespace tensorflow { 26 | namespace data_validation { 27 | 28 | PYBIND11_MODULE( 29 | tensorflow_data_validation_extension, // this must be kept the same as the 30 | // "extension_name" param in the 31 | // build rule 32 | m) { 33 | m.doc() = "TensorFlow Data Validation extension module"; 34 | DefineValidationSubmodule(m); 35 | } 36 | 37 | } // namespace data_validation 38 | } // namespace tensorflow 39 | -------------------------------------------------------------------------------- /tensorflow_data_validation/pywrap/validation_submodule.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #include "tensorflow_data_validation/pywrap/validation_submodule.h" 15 | 16 | #include "tensorflow_data_validation/anomalies/feature_statistics_validator.h" 17 | #include "include/pybind11/pybind11.h" 18 | 19 | namespace tensorflow { 20 | namespace data_validation { 21 | namespace py = pybind11; 22 | 23 | void DefineValidationSubmodule(py::module main_module) { 24 | auto m = main_module.def_submodule("validation"); 25 | m.doc() = "Validation API."; 26 | 27 | m.def("InferSchema", 28 | [](const std::string& statistics_proto_string, 29 | int max_string_domain_size, bool infer_feature_shape) -> py::object { 30 | std::string schema_proto_string; 31 | const absl::Status status = 32 | InferSchema(statistics_proto_string, max_string_domain_size, 33 | infer_feature_shape, &schema_proto_string); 34 | if (!status.ok()) { 35 | throw std::runtime_error(status.ToString()); 36 | } 37 | return py::bytes(schema_proto_string); 38 | }); 39 | 40 | m.def("UpdateSchema", 41 | [](const std::string& schema_proto_string, 42 | const std::string& statistics_proto_string, 43 | int max_string_domain_size) -> py::object { 44 | std::string output_schema_proto_string; 45 | const absl::Status status = 46 | UpdateSchema( 47 | schema_proto_string, statistics_proto_string, 48 | max_string_domain_size, &output_schema_proto_string); 49 | if (!status.ok()) { 50 | throw std::runtime_error(status.ToString()); 51 | } 52 | return py::bytes(output_schema_proto_string); 53 | }); 54 | 55 | m.def("ValidateFeatureStatistics", 56 | [](const std::string& statistics_proto_string, 57 | const std::string& schema_proto_string, 58 | const std::string& environment, 59 | const std::string& previous_span_statistics_proto_string, 60 | const std::string& serving_statistics_proto_string, 61 | const std::string& previous_version_statistics_proto_string, 62 | const std::string& feature_needed_string, 63 | const std::string& validation_config_string, 64 | const bool enable_diff_regions) -> py::object { 65 | std::string 
anomalies_proto_string; 66 | const absl::Status status = \ 67 | ValidateFeatureStatisticsWithSerializedInputs( 68 | statistics_proto_string, schema_proto_string, environment, 69 | previous_span_statistics_proto_string, 70 | serving_statistics_proto_string, 71 | previous_version_statistics_proto_string, 72 | feature_needed_string, validation_config_string, 73 | enable_diff_regions, &anomalies_proto_string); 74 | if (!status.ok()) { 75 | throw std::runtime_error(status.ToString()); 76 | } 77 | return py::bytes(anomalies_proto_string); 78 | }); 79 | } 80 | 81 | } // namespace data_validation 82 | } // namespace tensorflow 83 | -------------------------------------------------------------------------------- /tensorflow_data_validation/pywrap/validation_submodule.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // https://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | #ifndef TENSORFLOW_DATA_VALIDATION_PYWRAP_VALIDATION_SUBMODULE_H_ 15 | #define TENSORFLOW_DATA_VALIDATION_PYWRAP_VALIDATION_SUBMODULE_H_ 16 | 17 | #include "include/pybind11/pybind11.h" 18 | 19 | namespace tensorflow { 20 | namespace data_validation { 21 | 22 | void DefineValidationSubmodule(pybind11::module main_module); 23 | 24 | } // namespace data_validation 25 | } // namespace tensorflow 26 | 27 | #endif // TENSORFLOW_DATA_VALIDATION_PYWRAP_VALIDATION_SUBMODULE_H_ 28 | -------------------------------------------------------------------------------- /tensorflow_data_validation/skew/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tensorflow_data_validation/skew/protos/BUILD: -------------------------------------------------------------------------------- 1 | load("//tensorflow_data_validation:data_validation.bzl", "tfdv_proto_library", "tfdv_proto_library_py") 2 | 3 | package( 4 | default_visibility = [ 5 | "//tensorflow_data_validation:__subpackages__", 6 | ], 7 | ) 8 | 9 | licenses(["notice"]) 10 | 11 | tfdv_proto_library( 12 | name = "feature_skew_results_proto", 13 | srcs = ["feature_skew_results.proto"], 14 | ) 15 | 16 | tfdv_proto_library_py( 17 | name = "feature_skew_results_proto_py_pb2", 18 | deps = [":feature_skew_results_proto"], 19 | ) 20 | -------------------------------------------------------------------------------- /tensorflow_data_validation/skew/protos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/tensorflow_data_validation/skew/protos/__init__.py -------------------------------------------------------------------------------- /tensorflow_data_validation/skew/protos/feature_skew_results.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2022 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // ============================================================================= 15 | syntax = "proto3"; 16 | 17 | package tensorflow.data_validation; 18 | 19 | // Skew information for a particular feature. 20 | message FeatureSkew { 21 | // Name of the feature. 22 | string feature_name = 1; 23 | 24 | // All of the counts are with respect to the set of tensorflow examples 25 | // which are found in **BOTH** the baseline and testing data. 26 | // Number of examples in baseline and test where this feature is present. 27 | uint64 base_count = 2; 28 | uint64 test_count = 3; 29 | 30 | // Number of examples where feature is present in both baseline and 31 | // test data, and the feature value matches exactly. 32 | uint64 match_count = 4; 33 | 34 | // Number of times feature is present for baseline or test only. 35 | uint64 base_only = 5; 36 | uint64 test_only = 6; 37 | 38 | // Total number of examples where the feature is present in both 39 | // baseline and test but the feature value does not match. 40 | uint64 mismatch_count = 7; 41 | 42 | // diff_count == base_only + test_only + mismatch_count 43 | uint64 diff_count = 8; 44 | } 45 | 46 | // Message to store a pair of tensorflow examples which exhibit some skew. 47 | message SkewPair { 48 | // Serialized example as it appears in baseline data. 49 | bytes base = 1; 50 | 51 | // Serialized example from test data with the same identifier as that of the 52 | // baseline. 53 | bytes test = 2; 54 | 55 | // Features that appear in the baseline example but not in test. 56 | repeated string base_only_features = 3; 57 | 58 | // Features that appear in the test example but not in baseline. 59 | repeated string test_only_features = 4; 60 | 61 | // Features that appear in both baseline and test example, and their 62 | // feature values are matched. 63 | repeated string matched_features = 5; 64 | 65 | // Features that appear in both baseline and test example, and their 66 | // feature values do not match. 
67 | repeated string mismatched_features = 6; 68 | } 69 | 70 | // Represents the frequency of a particular baseline and test feature value 71 | // joined by identifier. 72 | message ConfusionCount { 73 | message Value { 74 | bytes bytes_value = 1; 75 | } 76 | string feature_name = 1; 77 | Value base = 2; 78 | Value test = 3; 79 | uint64 count = 4; 80 | } 81 | 82 | // Overall statistics on match rates and example counts. 83 | message MatchStats { 84 | uint64 base_with_id_count = 1; 85 | uint64 test_with_id_count = 2; 86 | uint64 identifiers_count = 3; 87 | uint64 ids_missing_in_base_count = 4; 88 | uint64 ids_missing_in_test_count = 5; 89 | uint64 matching_pairs_count = 6; 90 | uint64 duplicate_id_count = 9; 91 | uint64 base_missing_id_count = 7; 92 | uint64 test_missing_id_count = 8; 93 | } 94 | -------------------------------------------------------------------------------- /tensorflow_data_validation/statistics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tensorflow_data_validation/statistics/generators/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tensorflow_data_validation/statistics/generators/constituents/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class CountMissingGeneratorTest(absltest.TestCase):
    """Tests the keys and the counts produced by `CountMissingGenerator`."""

    def test_count_missing_generator_key(self):
        """Instance key and class-level key both equal (name, path)."""
        feature_path = types.FeaturePath(["feature"])
        under_test = count_missing_generator.CountMissingGenerator(feature_path)
        want = {("CountMissingGenerator", feature_path): None}
        # use assertDictEqual to make failures readable while checking hash value.
        self.assertDictEqual(want, {under_test.get_key(): None})
        class_level_key = count_missing_generator.CountMissingGenerator.key(
            feature_path
        )
        self.assertDictEqual(want, {class_level_key: None})

    def test_count_missing_generator_key_with_required(self):
        """A required path is appended to the key after the main path."""
        index_path = types.FeaturePath(["index"])
        value_path = types.FeaturePath(["value"])
        under_test = count_missing_generator.CountMissingGenerator(
            index_path, [value_path]
        )
        want = {("CountMissingGenerator", index_path, value_path): None}
        self.assertDictEqual(want, {under_test.get_key(): None})
        class_level_key = count_missing_generator.CountMissingGenerator.key(
            index_path, [value_path]
        )
        self.assertDictEqual(want, {class_level_key: None})

    def test_count_missing_generator_single_batch(self):
        """A column holding [[1], None, []] yields a count of 1."""
        record_batch = pa.RecordBatch.from_arrays(
            [pa.array([[1], None, []])], ["feature"]
        )
        batch = input_batch.InputBatch(record_batch)
        under_test = count_missing_generator.CountMissingGenerator(
            types.FeaturePath(["feature"])
        )
        state = under_test.add_input(under_test.create_accumulator(), batch)
        self.assertEqual(1, under_test.extract_output(state))

    def test_count_missing_generator_required_path(self):
        """With an identical required column, the count is 0."""
        columns = [pa.array([[1], None, []]), pa.array([[1], None, []])]
        record_batch = pa.RecordBatch.from_arrays(columns, ["index", "value"])
        batch = input_batch.InputBatch(record_batch)
        under_test = count_missing_generator.CountMissingGenerator(
            types.FeaturePath(["index"]), [types.FeaturePath(["value"])]
        )
        state = under_test.add_input(under_test.create_accumulator(), batch)
        self.assertEqual(0, under_test.extract_output(state))
class EmptyValueCounterGeneratorTest(test_util.CombinerFeatureStatsGeneratorTest):
    """Checks the custom statistic emitted by EmptyValueCounterGenerator per type."""

    def _assert_single_custom_stat(self, batches, stat_name, stat_num):
        """Runs a fresh generator over `batches` and expects one custom stat."""
        expected = statistics_pb2.FeatureNameStatistics(
            custom_stats=[
                statistics_pb2.CustomStatistic(name=stat_name, num=stat_num),
            ]
        )
        self.assertCombinerOutputEqual(
            batches,
            empty_value_counter_generator.EmptyValueCounterGenerator(),
            expected,
        )

    def test_empty_value_counter_generator_for_string(self):
        string_batches = [
            pa.array([["abc"], [""]]),
            pa.array([[""], ["def"]]),
            pa.array([[""], None]),
        ]
        self._assert_single_custom_stat(string_batches, "str_empty", 3)

    def test_empty_value_counter_generator_for_ints(self):
        int_batches = [
            pa.array([[0], [-1], [10]]),
            pa.array([[0], [-1], None]),
            pa.array([[2], [-1], [-1], [100]]),
        ]
        self._assert_single_custom_stat(int_batches, "int_-1", 4)

    def test_empty_value_counter_generator_for_lists(self):
        nested_batches = [
            pa.array([[[]], None, [["abc", "foo"]]]),
            pa.array([[["foo"]], None, [[]], [[]], [[]], [["", "jk", "tst"]]]),
        ]
        self._assert_single_custom_stat(nested_batches, "list_empty", 4)


if __name__ == "__main__":
    absltest.main()
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/tensorflow_data_validation/statistics/generators/testdata/image3.bmp -------------------------------------------------------------------------------- /tensorflow_data_validation/statistics/generators/testdata/image4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/tensorflow_data_validation/statistics/generators/testdata/image4.png -------------------------------------------------------------------------------- /tensorflow_data_validation/statistics/generators/testdata/image5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/tensorflow_data_validation/statistics/generators/testdata/image5.jpg -------------------------------------------------------------------------------- /tensorflow_data_validation/statistics/generators/testdata/image6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/tensorflow_data_validation/statistics/generators/testdata/image6.jpg -------------------------------------------------------------------------------- /tensorflow_data_validation/statistics/generators/testdata/not_a_image.abc: -------------------------------------------------------------------------------- 1 | not_a_image 2 | -------------------------------------------------------------------------------- /tensorflow_data_validation/tools/BUILD: -------------------------------------------------------------------------------- 1 | # Opensource tools, not part of the pip package. 
2 | 3 | load("//third_party/bazel_rules/rules_python/python:py_binary.bzl", "py_binary") 4 | 5 | licenses(["notice"]) 6 | 7 | package(default_visibility = ["//tensorflow_data_validation:__subpackages__"]) 8 | 9 | py_binary( 10 | name = "build_docs", 11 | srcs = ["build_docs.py"], 12 | deps = [ 13 | "# Implicit absl dependency:app", 14 | "# Implicit apache_beam dependency.", 15 | "# Implicit tensorflow_docs dependency./api_generator", 16 | "//tensorflow_data_validation", 17 | ], 18 | ) 19 | -------------------------------------------------------------------------------- /tensorflow_data_validation/tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | 3 | Additional tools and scripts that are not part of the pip package. 4 | 5 | ## build_docs.py 6 | 7 | This is used to generate the api reference docs for tensorflow.org. 8 | -------------------------------------------------------------------------------- /tensorflow_data_validation/tools/build_docs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # pylint: disable=line-too-long 16 | r"""Script to generate api_docs. 
import __future__

# Objects that must not be documented.  The bare names `absolute_import`,
# `division` and `print_function` are not bound by any import in this module,
# so they are resolved through the `__future__` module instead of raising a
# NameError when the script is loaded.
supress_docs_for = [
    __future__.absolute_import,
    __future__.division,
    __future__.print_function,
]


def _filter_class_attributes(path, parent, children):
    """Filter out class attributes that are part of the PTransform API.

    Args:
      path: Unused.
      parent: The object whose members are being visited.
      children: A list of `(name, child)` pairs for `parent`.

    Returns:
      `children` unchanged when `parent` is not a class; otherwise a new list
      without the entries whose name is in the skip set.
    """
    del path  # Unused.
    skip_class_attributes = {
        "expand",
        "label",
        "from_runner_api",
        "register_urn",
        "side_inputs",
    }
    if not inspect.isclass(parent):
        return children
    return [
        (name, child)
        for (name, child) in children
        if name not in skip_class_attributes
    ]


def main(args):
    """Builds the API docs for the `tfdv` package into `--output_dir`.

    Args:
      args: Command line arguments left over after flag parsing; anything
        beyond the program name is rejected.

    Returns:
      Whatever `DocGenerator.build` returns.

    Raises:
      ValueError: If unexpected positional arguments are present.
    """
    if args[1:]:
        raise ValueError("Unrecognized Command line args", args[1:])

    for obj in supress_docs_for:
        doc_controls.do_not_generate_docs(obj)

    # Submodules reachable from the top-level package are not documented.
    for _, value in inspect.getmembers(tfdv):
        if inspect.ismodule(value):
            doc_controls.do_not_generate_docs(value)

    for name, value in inspect.getmembers(beam.PTransform):
        # This ensures that the methods of PTransform are not documented in any
        # derived classes.
        if name == "__init__":
            continue
        try:
            doc_controls.do_not_doc_inheritable(value)
        except (TypeError, AttributeError):
            # Best effort: members that cannot be tagged are left as they are.
            pass

    doc_generator = generate_lib.DocGenerator(
        root_title="TensorFlow Data Validation",
        py_modules=[("tfdv", tfdv)],
        code_url_prefix=FLAGS.code_url_prefix,
        search_hints=FLAGS.search_hints,
        site_path=FLAGS.site_path,
        # local_definitions_filter ensures that shared modules are only
        # documented in the location that defines them, instead of every location
        # that imports them.
        callbacks=[public_api.local_definitions_filter, _filter_class_attributes],
    )

    return doc_generator.build(output_dir=FLAGS.output_dir)


if __name__ == "__main__":
    app.run(main)
# Build image: Ubuntu 20.04 plus the deadsnakes Python interpreters, GCC and
# Bazel.
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive

# Register the PPAs that provide the extra Python versions and the newer GCC.
RUN apt-get update -qq && \
    apt-get install -y -qq software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    add-apt-repository ppa:ubuntu-toolchain-r/test && \
    apt-get update -qq

# Install prerequisites for Python versions
RUN apt-get -qq install -y bash curl tar make rename git \
    build-essential wget python zip unzip default-jre default-jdk \
    python3-distutils python3-pip \
    python3.9-distutils python3.10-distutils python3.11-distutils \
    python3.9 python3.10 python3.11 python3.9-venv python3.10-venv python3.11-venv \
    python3.9-dev python3.10-dev python3.11-dev

# Unfortunately ZetaSQL has issues with clang (default bazel compiler), so
# we install GCC. Also install make for rules_foreign_cc bazel rules.
# Similar to manylinux2014
# NOTE(review): the trailing `CC=... CXX=...` below only assigns shell
# variables inside this RUN step; no ENV is declared for them, so they are not
# visible to later layers or to the container at run time. Confirm whether an
# ENV instruction was intended.
ENV GCC_VERSION=10
RUN apt-get -qq install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} && \
    apt-get -qq install -y ca-certificates libgnutls30 && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${GCC_VERSION} 90 \
    --slave /usr/bin/g++ g++ /usr/bin/g++-${GCC_VERSION} && \
    update-alternatives --set gcc /usr/bin/gcc-${GCC_VERSION} && \
    CC=/usr/bin/gcc CXX=/usr/bin/g++

# Make python3 resolve to Python 3.9.
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1000

# One virtualenv per supported interpreter, all under /opt/python;
# build_manylinux.sh picks one of them based on PYTHON_VERSION.
ENV VIRTUAL_ENV=/opt/python
RUN python3.9 -m venv ${VIRTUAL_ENV}/cp39-cp39
RUN python3.10 -m venv ${VIRTUAL_ENV}/cp310-cp310
RUN python3.11 -m venv ${VIRTUAL_ENV}/cp311-cp311

# Install bazel
ENV BAZEL_LINKLIBS=-lstdc++:-lm
RUN wget "https://github.com/bazelbuild/bazel/releases/download/6.5.0/bazel-6.5.0-installer-linux-x86_64.sh" && \
    /bin/bash ./bazel-6.5.0-installer-linux-x86_64.sh
RUN chmod u+x /usr/local/bin/bazel
ENV PATH=$PATH:/usr/bin:/usr/local/bin
ENV EXTRA_BAZEL_ARGS="--tool_java_runtime_version=local_jdk"

ENV BAZEL_CXXOPTS="-std=c++17"
# The default command builds the wheel from the project mounted at /build.
WORKDIR /build
CMD ["tensorflow_data_validation/tools/docker_build/build_manylinux.sh"]
# This script is expected to run in the docker container defined in
# Dockerfile.manylinux2010
# Assumptions:
# - Ubuntu environment with GCC installed (see the Dockerfile).
# - $PWD is TFDV's project root.
# - One virtualenv per Python version exists under ${VIRTUAL_ENV}
#   (cp39-cp39, cp310-cp310, cp311-cp311).
# - zip and bazel are installed and in $PATH.
# - patchelf is only relevant to the currently disabled stamp_wheel step
#   (TODO: confirm it is provisioned before re-enabling that step).

WORKING_DIR=$PWD

# Activates the virtualenv matching PYTHON_VERSION and installs the Python
# packages needed to build the wheel. Exits with status 1 when PYTHON_VERSION
# is unset or not one of the supported values.
function setup_environment() {
  # Re-enable for CentOS container if needed.
  # source scl_source enable devtoolset-10
  # source scl_source enable rh-python38

  # A single `case` covers both the "unset" and the "unsupported" situation.
  case "${PYTHON_VERSION}" in
    39|310|311)
      PYTHON_DIR="${VIRTUAL_ENV}/cp${PYTHON_VERSION}-cp${PYTHON_VERSION}"
      ;;
    *)
      echo "Must set PYTHON_VERSION env to 39|310|311" >&2
      exit 1
      ;;
  esac

  source "${PYTHON_DIR}/bin/activate"
  # Bazel will use PYTHON_BIN_PATH to determine the right python library.
  export PYTHON_BIN_PATH="${PYTHON_DIR}/bin/python"
  pip3 install --upgrade pip setuptools
  pip3 install wheel
  pip3 install "numpy~=1.22.0" --force
  pip3 install auditwheel
}

# Builds the wheel into ./dist, discarding any previous build output.
function build_wheel() {
  rm -rf dist
  "${PYTHON_BIN_PATH}" setup.py bdist_wheel
}

# Repairs the built wheel with auditwheel and removes the unrepaired file.
# No scratch directory is created here: nothing in this step needs one.
function stamp_wheel() {
  WHEEL_PATH="$(ls "$PWD"/dist/*.whl)"
  WHEEL_DIR=$(dirname "${WHEEL_PATH}")
  auditwheel repair --plat manylinux2014_x86_64 -w "${WHEEL_DIR}" "${WHEEL_PATH}"
  rm "${WHEEL_PATH}"
}

set -x
bazel clean --expunge
# stamp_wheel is currently disabled; the chain is setup -> build -> clean.
setup_environment && \
  build_wheel && \
  bazel clean --expunge
set +x
class PerFeatureStatsConfig:
    """Supports enabling / disabling stats per-feature. Experimental.

    NOTE: disabling histograms *also* disables median calculation for numeric
    features.
    """

    INCLUDE = "include"
    EXCLUDE = "exclude"
    # Instance state is kept in the underscore-prefixed attributes assigned in
    # __init__; the paths are stored as a set for membership tests.
    _histogram_paths: set[FeaturePath]
    _histogram_mode: str

    def __init__(
        self,
        histogram_paths: list[FeaturePath],
        histogram_mode: str,
    ):
        """Creates a config.

        Args:
          histogram_paths: Feature paths the mode applies to.
          histogram_mode: `INCLUDE` to compute histograms only for
            `histogram_paths`, `EXCLUDE` to compute them for every other path.
            The value is checked when `should_compute_histograms` is called,
            not here.
        """
        self._histogram_paths = set(histogram_paths)
        self._histogram_mode = histogram_mode

    @classmethod
    def default(cls) -> "PerFeatureStatsConfig":
        """Returns a config that excludes nothing."""
        return cls([], PerFeatureStatsConfig.EXCLUDE)

    def should_compute_histograms(self, p: FeaturePath) -> bool:
        """Returns whether histograms should be computed for path `p`.

        Raises:
          ValueError: If the configured mode is neither `INCLUDE` nor `EXCLUDE`.
        """
        if self._histogram_mode == self.INCLUDE:
            return p in self._histogram_paths
        if self._histogram_mode == self.EXCLUDE:
            return p not in self._histogram_paths
        raise ValueError(f"Unknown quantiles histogram mode: {self._histogram_mode}")


# TODO(b/190756453): Make this into the upstream
# (preference: Arrow, Beam, tfx_bsl).
class _ArrowRecordBatchCoder(beam.coders.Coder):
    """Custom coder for Arrow record batches."""

    def encode(self, value: pa.RecordBatch) -> bytes:
        """Serializes `value` as an Arrow IPC stream holding one batch."""
        sink = pa.BufferOutputStream()
        # The context manager closes the writer even if write_batch raises.
        with pa.ipc.new_stream(
            sink, value.schema, options=_ARROW_CODER_IPC_OPTIONS
        ) as writer:
            writer.write_batch(value)
        return sink.getvalue().to_pybytes()

    def decode(self, encoded: bytes) -> pa.RecordBatch:
        """Reads back the single record batch stored in `encoded`.

        Raises:
          ValueError: If the stream holds more than one record batch.
        """
        reader = pa.ipc.open_stream(encoded)
        result = reader.read_next_batch()
        # A second read must signal end-of-stream; anything else means the
        # payload holds more than one batch.
        try:
            reader.read_next_batch()
        except StopIteration:
            pass
        else:
            raise ValueError("Expected only one RecordBatch in the stream.")
        return result

    def to_type_hint(self):
        """Returns the type this coder handles, `pa.RecordBatch`."""
        return pa.RecordBatch


beam.coders.typecoders.registry.register_coder(pa.RecordBatch, _ArrowRecordBatchCoder)
def _make_record_batch(num_cols, num_rows):
    """Builds a record batch of `num_cols` identical large_list<large_binary> columns."""
    columns = [
        pa.array([[b"kk"]] * num_rows, type=pa.large_list(pa.large_binary()))
        for _ in range(num_cols)
    ]
    column_names = [f"col{c}" for c in range(num_cols)]
    return pa.record_batch(columns, column_names)


class _Tracker:
    """A singleton to track whether _TrackedCoder.encode/decode is called."""

    _instance = None

    def reset(self):
        """Clears both call flags."""
        self.encode_called = False
        self.decode_called = False

    def __new__(cls):
        # Create the shared instance on first use and start with cleared flags.
        if cls._instance is None:
            cls._instance = object.__new__(cls)
            cls._instance.reset()
        return cls._instance


class _TrackedCoder(types._ArrowRecordBatchCoder):
    """Record batch coder that notes each encode/decode call on `_Tracker`."""

    def encode(self, value):
        _Tracker().encode_called = True
        return super().encode(value)

    def decode(self, encoded):
        _Tracker().decode_called = True
        return super().decode(encoded)


class TypesTest(absltest.TestCase):
    """Tests for the Arrow record batch coder registered by `types`."""

    def test_coder(self):
        """A record batch survives an encode/decode round trip."""
        rb = _make_record_batch(10, 10)
        coder = types._ArrowRecordBatchCoder()
        self.assertTrue(coder.decode(coder.encode(rb)).equals(rb))

    @pytest.mark.xfail(run=False, reason="This test fails and needs to be fixed.")
    def test_coder_end_to_end(self):
        """The registered coder is used when a pipeline moves record batches."""
        # First check that the registration is done.
        self.assertIsInstance(
            beam.coders.typecoders.registry.get_coder(pa.RecordBatch),
            types._ArrowRecordBatchCoder,
        )
        # Then replace the registered coder with our patched one to track whether
        # encode() / decode() are called.
        beam.coders.typecoders.registry.register_coder(pa.RecordBatch, _TrackedCoder)
        # Put the real coder back even when an assertion below fails, so the
        # patched coder cannot leak into other tests through the global registry.
        self.addCleanup(
            beam.coders.typecoders.registry.register_coder,
            pa.RecordBatch,
            types._ArrowRecordBatchCoder,
        )
        rb = _make_record_batch(1000, 1)

        def pipeline(root):
            sample = (
                root
                | beam.Create([rb] * 20)
                | beam.combiners.Sample.FixedSizeGlobally(5)
            )

            def matcher(actual):
                self.assertLen(actual, 1)
                actual = actual[0]
                self.assertLen(actual, 5)
                for actual_rb in actual:
                    self.assertTrue(actual_rb.equals(rb))

            util.assert_that(sample, matcher)

        _Tracker().reset()
        beam.runners.DirectRunner().run(pipeline)
        self.assertTrue(_Tracker().encode_called)
        self.assertTrue(_Tracker().decode_called)


if __name__ == "__main__":
    absltest.main()
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tensorflow_data_validation/utils/artifacts_io_impl_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class RecordSinkAndSourceTest(absltest.TestCase):
    """Round-trips statistics protos through the "tfrecords" IO provider."""

    def test_write_and_read_records(self):
        """Records written by the sink are read back, ignoring order."""
        datasets = [
            statistics_pb2.DatasetFeatureStatisticsList(
                datasets=[statistics_pb2.DatasetFeatureStatistics(name="d1")]
            ),
            statistics_pb2.DatasetFeatureStatisticsList(
                datasets=[statistics_pb2.DatasetFeatureStatistics(name="d2")]
            ),
        ]
        provider = artifacts_io_impl.get_io_provider("tfrecords")

        # TemporaryDirectory removes the written files when the block exits;
        # a bare mkdtemp() directory would be left behind after every run.
        with tempfile.TemporaryDirectory() as tmp_dir:
            output_prefix = tmp_dir + "/statistics"

            with beam.Pipeline() as p:
                _ = p | beam.Create(datasets) | provider.record_sink_impl(output_prefix)

            # Read and compare while the directory still exists.
            got = provider.record_iterator_impl(provider.glob(output_prefix))
            self.assertCountEqual(datasets, got)


if __name__ == "__main__":
    absltest.main()
# TODO(b/221152546): Deprecate this.
@beam.ptransform_fn
def BatchExamplesToArrowRecordBatches(
    examples: beam.PCollection[types.LegacyExample],
    desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE,
) -> beam.PCollection[pa.RecordBatch]:
    """Groups example dicts into batches and converts each batch to Arrow.

    Args:
      examples: A PCollection of example dicts.
      desired_batch_size: Requested batch size; forwarded to
        `batch_util.GetBatchElementsKwargs` to configure `beam.BatchElements`.

    Returns:
      A PCollection of Arrow record batches, one per batch of examples.
    """
    batching_kwargs = batch_util.GetBatchElementsKwargs(desired_batch_size)
    batched_examples = examples | "BatchBeamExamples" >> beam.BatchElements(
        **batching_kwargs
    )
    return batched_examples | "DecodeExamplesToRecordBatch" >> beam.Map(
        # pylint: disable=unnecessary-lambda
        lambda example_batch: decoded_examples_to_arrow.DecodedExamplesToRecordBatch(
            example_batch
        )
        # pylint: enable=unnecessary-lambda
    )
class BatchUtilTest(absltest.TestCase):
    """Exercises BatchExamplesToArrowRecordBatches on a small in-memory input."""

    @pytest.mark.xfail(run=False, reason="This test fails and needs to be fixed.")
    def test_batch_examples(self):
        """Five example dicts batched in pairs should give three record batches."""
        input_examples = [
            {
                "a": np.array([1.0, 2.0], dtype=np.float32),
                "b": np.array(["a", "b", "c", "e"]),
            },
            {
                "a": np.array([3.0, 4.0, 5.0], dtype=np.float32),
            },
            {
                "b": np.array(["d", "e", "f"]),
                "d": np.array([10, 20, 30], dtype=np.int64),
            },
            {"b": np.array(["a", "b", "c"])},
            {"c": np.array(["d", "e", "f"])},
        ]

        # One expected record batch per consecutive pair of examples, plus one
        # for the leftover fifth example.
        first_pair = pa.RecordBatch.from_arrays(
            [
                pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]], type=pa.list_(pa.float32())),
                pa.array([["a", "b", "c", "e"], None]),
            ],
            ["a", "b"],
        )
        second_pair = pa.RecordBatch.from_arrays(
            [
                pa.array([["d", "e", "f"], ["a", "b", "c"]]),
                pa.array([[10, 20, 30], None], type=pa.list_(pa.int64())),
            ],
            ["b", "d"],
        )
        leftover = pa.RecordBatch.from_arrays([pa.array([["d", "e", "f"]])], ["c"])
        expected_record_batches = [first_pair, second_pair, leftover]

        with beam.Pipeline() as pipeline:
            source = pipeline | beam.Create(input_examples, reshuffle=False)
            batches = source | batch_util.BatchExamplesToArrowRecordBatches(
                desired_batch_size=2
            )
            util.assert_that(
                batches,
                test_util.make_arrow_record_batches_equal_fn(
                    self, expected_record_batches
                ),
            )


if __name__ == "__main__":
    absltest.main()
def bin_array(
    array: pa.Array, boundaries: Sequence[float]
) -> Tuple[np.ndarray, np.ndarray]:
    """Converts an array to an array of bin indices using provided boundaries.

    Given n boundaries there are n + 1 half-open bins, indexed 0..n:

      * bin 0 is [-infinity, boundaries[0]),
      * bin j (0 < j < n) is [boundaries[j - 1], boundaries[j]),
      * bin n is [boundaries[-1], infinity).

    A value equal to a boundary therefore belongs to the bin that starts at
    that boundary. Null and NaN values do not belong to any bin and are left
    out of the result, including when `boundaries` is empty.

    To convert a bin index back into its [low, high) interval, see
    get_boundaries().

    Args:
    ----
      array: An array of numeric values to convert to bin indices. Elements are
        compared against the boundaries independently of one another, so this
        implementation does not rely on the array being sorted.
      boundaries: A list of bin boundaries to use, excluding the implicit lower
        bound (-infinity) and upper bound (infinity). Expected to be in
        ascending order; otherwise a value may match several bins or none.

    Returns:
    -------
      (element_indices, bins): A pair of numpy integer arrays in which the first
      element is the indices of input array elements with well-defined bins
      (i.e. non-null) and the second element is the bin index for the element at
      the corresponding index within the element indices array. In other words,
      the bin for array[element_indices[i]] is bins[i].
    """
    if pa.types.is_null(array.type):
        # Nothing can be binned. Return integer arrays so the dtype matches the
        # index arrays that nonzero() yields on the regular path.
        return np.array([], dtype=np.intp), np.array([], dtype=np.intp)

    # Given an array with shape (n, 1) and a list of boundaries of shape (1, b),
    # np.less (and np.greater_equal) returns an (n, b) shape matrix of boolean
    # values where the entry at (i, j) indicates whether the ith array element is
    # less than (or greater than or equal to) the jth boundary.
    array_column = np.expand_dims(np.asarray(array, dtype=float), axis=1)
    lower_bound_masks = np.greater_equal(array_column, boundaries)
    upper_bound_masks = np.less(array_column, boundaries)

    # Add two open interval buckets on the ends and shift mask indexing so that
    # lower_bound_masks[i, j] indicates that array[i] >= boundaries[j-1]
    # and upper_bound_masks[i,j] indicates that array[i] < boundaries[j], where
    # the first boundary is implicitly negative infinity and the last boundary is
    # implicitly positive infinity.
    true_mask = np.ones(array_column.shape, dtype=bool)
    lower_bound_masks = np.hstack([true_mask, lower_bound_masks])
    upper_bound_masks = np.hstack([upper_bound_masks, true_mask])

    # Nulls are expected to surface as NaN after the float conversion above.
    # Every comparison against NaN is False, which already excludes such rows
    # whenever at least one boundary exists. With no boundaries only the two
    # implicit all-True columns remain, so NaN rows must be masked explicitly
    # or they would be reported as members of bin 0.
    has_value = ~np.isnan(array_column)

    # bin_mask[i,j] = (array[i] >= boundaries[j-1]) && (array[i] < boundaries[j])
    bin_masks = lower_bound_masks & upper_bound_masks & has_value

    # Find the indices of the nonzero elements: (row indices, column indices),
    # i.e. (element indices, bin indices).
    return bin_masks.nonzero()


def get_boundaries(bin_index: int, boundaries: Sequence[float]) -> Tuple[float, float]:
    """Returns the bucket [min, max) corresponding to the provided bin_index.

    Args:
    ----
      bin_index: A bin index returned by bin_array, i.e. a value in
        [0, len(boundaries)].
      boundaries: The same boundaries passed to bin_array.

    Returns:
    -------
      The low and high boundaries of the bin corresponding to bin_index. The low
      boundary of bin 0 is -infinity and the high boundary of bin
      len(boundaries) is +infinity.
    """
    inf = float("inf")
    low_value = -inf if bin_index == 0 else boundaries[bin_index - 1]
    high_value = inf if bin_index == len(boundaries) else boundaries[bin_index]
    return low_value, high_value
class BinArrayTest(parameterized.TestCase):
    """Tests for bin_array."""

    # Each case is a tuple of:
    #   (test name, input array, boundaries, expected element indices,
    #    expected bin index for each of those elements).
    @parameterized.named_parameters(
        [
            # 0.75 equals the last boundary and lands in the bin above it.
            ("simple", pa.array([0.1, 0.5, 0.75]), [0.25, 0.75], [0, 1, 2], [0, 1, 2]),
            (
                "negative_values",
                pa.array([-0.8, -0.5, -0.1]),
                [0.25],
                [0, 1, 2],
                [0, 0, 0],
            ),
            # Infinities fall into the two open-ended outer bins.
            (
                "inf_values",
                pa.array([float("-inf"), 0.5, float("inf")]),
                [0.25, 0.75],
                [0, 1, 2],
                [0, 1, 2],
            ),
            # NaN is skipped: only element 1 is reported.
            ("nan_values", pa.array([np.nan, 0.5]), [0.25, 0.75], [1], [1]),
            (
                "negative_boundaries",
                pa.array([-0.8, -0.5]),
                [-0.75, -0.25],
                [0, 1],
                [0, 1],
            ),
            ("empty_array", pa.array([]), [0.25], [], []),
            # None is skipped just like NaN.
            ("none_value", pa.array([None, 0.5]), [0.25], [1], [1]),
            ("null_array", pa.array([None, None], type=pa.null()), [0.25], [], []),
        ]
    )
    def test_bin_array(self, array, boundaries, expected_indices, expected_bins):
        """Checks both arrays returned by bin_array against the expectation."""
        indices, bins = bin_util.bin_array(array, boundaries)
        np.testing.assert_array_equal(expected_indices, indices)
        np.testing.assert_array_equal(expected_bins, bins)
# Implementation notes:
# At present this behaves much like a dict with a default value. Richer
# semantics for nested structures may be added later (for example, an override
# given for path ["x", "y"] could also apply to every child of that path).
class ExampleWeightMap:
    """Resolves the weight feature to use for a given feature path.

    The map is built from an optional "global" weight feature plus optional
    per-path overrides. Looking up a FeaturePath yields its override when one
    is present, otherwise the "global" weight feature.
    """

    def __init__(
        self,
        weight_feature: Optional[types.FeatureName] = None,
        per_feature_override: Optional[
            Mapping[types.FeaturePath, types.FeatureName]
        ] = None,
    ):
        self._weight_feature = weight_feature
        self._per_feature_override = per_feature_override

        # Precompute the set of every weight feature that get() can return.
        override_features = (
            () if per_feature_override is None else per_feature_override.values()
        )
        global_features = () if weight_feature is None else (weight_feature,)
        self._all_weight_features = frozenset([*override_features, *global_features])

    def get(self, feature_path: types.FeaturePath) -> Optional[types.FeatureName]:
        """Returns the weight feature for `feature_path`, or None if there is none."""
        overrides = self._per_feature_override
        if overrides is not None:
            override = overrides.get(feature_path)
            if override is not None:
                return override
        # No override table, or no usable entry for this path: use the global.
        return self._weight_feature

    def all_weight_features(self) -> FrozenSet[types.FeatureName]:
        """Returns every weight feature known to this map (global and overrides)."""
        return self._all_weight_features
class ExampleWeightMapTest(absltest.TestCase):
    """Tests for example_weight_map.ExampleWeightMap."""

    def test_no_weight_feature(self):
        """With nothing configured, lookups return None and the set is empty."""
        m = example_weight_map.ExampleWeightMap()
        self.assertIsNone(m.get(types.FeaturePath(["feature"])))
        self.assertEmpty(m.all_weight_features())

    def test_only_global_weight_feature(self):
        """Every path resolves to the global weight feature."""
        m = example_weight_map.ExampleWeightMap(weight_feature="w")
        self.assertEqual(m.get(types.FeaturePath(["feature"])), "w")
        self.assertEqual(m.all_weight_features(), frozenset(["w"]))

    def test_per_feature_override(self):
        """Overrides win for their paths; other paths use the global feature."""
        m = example_weight_map.ExampleWeightMap(
            weight_feature="w",
            per_feature_override={
                types.FeaturePath(["foo"]): "w1",
                types.FeaturePath(["bar"]): "w2",
            },
        )
        self.assertEqual("w1", m.get(types.FeaturePath(["foo"])))
        self.assertEqual("w2", m.get(types.FeaturePath(["bar"])))
        # No override for this path, so the global weight feature applies.
        self.assertEqual("w", m.get(types.FeaturePath(["feature"])))
        self.assertEqual(m.all_weight_features(), frozenset(["w", "w1", "w2"]))

    def test_only_per_feature_override(self):
        """Without a global feature, paths lacking an override resolve to None."""
        m = example_weight_map.ExampleWeightMap(
            per_feature_override={
                types.FeaturePath(["foo"]): "w1",
            }
        )
        self.assertEqual("w1", m.get(types.FeaturePath(["foo"])))
        self.assertIsNone(m.get(types.FeaturePath(["feature"])))
        self.assertEqual(m.all_weight_features(), frozenset(["w1"]))
class MaterializerTest(absltest.TestCase):
    """Tests for io_util.Materializer."""

    def test_write_then_read(self):
        """Values written through writer() are returned by reader()."""
        values = ["abcd", 91, {"x": "y"}]
        temp_dir = tempfile.mkdtemp()
        materializer = io_util.Materializer(temp_dir)
        # Leaving the `with` block runs the pipeline, so the read below sees
        # the written output.
        with beam.Pipeline() as p:
            _ = p | beam.Create(values) | materializer.writer()
        got_values = []
        for val in materializer.reader():
            got_values.append(val)
        # Order is not part of the contract being tested here.
        self.assertCountEqual(values, got_values)

    def test_cleanup(self):
        """cleanup() removes the output files and disables further use."""
        values = ["abcd", 91, {"x": "y"}]
        temp_dir = tempfile.mkdtemp()
        materializer = io_util.Materializer(temp_dir)
        with beam.Pipeline() as p:
            _ = p | beam.Create(values) | materializer.writer()
        self.assertNotEmpty(materializer._output_files())
        materializer.cleanup()
        self.assertEmpty(materializer._output_files())
        with self.assertRaisesRegex(
            ValueError, "Materializer must not be used after cleanup."
        ):
            materializer.reader()

    def test_context_manager(self):
        """Used as a context manager, the Materializer cleans up on exit."""
        with io_util.Materializer(tempfile.mkdtemp()) as materializer:
            values = ["abcd", 91, {"x": "y"}]
            with beam.Pipeline() as p:
                _ = p | beam.Create(values) | materializer.writer()
            got_values = []
            for val in materializer.reader():
                got_values.append(val)
            self.assertCountEqual(values, got_values)
        # Checked after the outer `with` has exited.
        self.assertEmpty(materializer._output_files())
class IncrementJobCounters(beam.PTransform):
    """Bumps Beam counters by amounts that are known at graph construction."""

    def __init__(self, values: Mapping[str, int]):
        # Maps a counter name to the amount it should be incremented by.
        self._values = values

    def expand(self, pcoll: beam.PCollection):
        """Adds a one-element branch to the pipeline that increments the counters.

        The contents of `pcoll` are not consumed; only its pipeline is used as
        the root for the new branch. Nothing is returned.
        """

        def _bump_counters(unused_element):
            # Runs once for the singleton element created below.
            for counter_name, amount in self._values.items():
                counter = beam.metrics.Metrics.counter(
                    constants.METRICS_NAMESPACE, counter_name
                )
                counter.inc(amount)

        singleton = pcoll.pipeline | "CreateSingleton" >> beam.Create([1])
        _ = singleton | "IncrementCounters" >> beam.Map(_bump_counters)
class FeaturePath:
    """Represents the path to a feature in an input example.

    An input example might contain nested structure. FeaturePath is to identify
    a node in such a structure. A path is an immutable tuple of steps (feature
    names); the empty tuple denotes the root.

    Instances are hashable and ordered lexicographically by their steps.
    """

    # Instances only ever carry `_steps`; declaring it in __slots__ avoids a
    # per-instance __dict__.
    __slots__ = ["_steps"]

    def __init__(self, steps: Iterable[FeatureName]):
        self._steps = tuple(steps)

    def to_proto(self) -> path_pb2.Path:
        """Returns this path as a `path_pb2.Path` proto."""
        return path_pb2.Path(step=self._steps)

    def to_json(self) -> str:
        """Returns the steps serialized as a JSON array of strings."""
        return json.dumps(self._steps)

    @staticmethod
    def from_proto(path_proto: path_pb2.Path) -> "FeaturePath":
        """Creates a FeaturePath from the steps of a `path_pb2.Path` proto."""
        return FeaturePath(path_proto.step)

    @staticmethod
    def from_json(path_json: str) -> "FeaturePath":
        """Creates a FeaturePath from the output of to_json().

        Raises
        ------
          TypeError: If the JSON value is not a list made up only of strings.
        """
        steps = json.loads(path_json)
        if not isinstance(steps, list) or not all(isinstance(s, str) for s in steps):
            raise TypeError("Invalid FeaturePath json: %s" % path_json)
        return FeaturePath(steps)

    @staticmethod
    def from_string(path_string: str) -> "FeaturePath":
        """Creates a FeaturePath by splitting `path_string` on "."."""
        steps = path_string.split(".")
        return FeaturePath(steps)

    def steps(self) -> FeaturePathTuple:
        """Returns the steps of this path as a tuple."""
        return self._steps

    def parent(self) -> "FeaturePath":
        """Returns the path without its last step.

        Raises
        ------
          ValueError: If this path is the root (has no steps).
        """
        if not self._steps:
            raise ValueError("Root does not have parent.")
        return FeaturePath(self._steps[:-1])

    def child(self, child_step: FeatureName) -> "FeaturePath":
        """Returns a new path with `child_step` appended."""
        return FeaturePath(self._steps + (child_step,))

    def __str__(self) -> str:
        return ".".join(self._steps)

    def __repr__(self) -> str:
        return repr(self._steps)

    def __eq__(self, other) -> bool:
        # Defer to Python's fallback for foreign types so that e.g.
        # `path == "a.b"` is False rather than an AttributeError.
        if not isinstance(other, FeaturePath):
            return NotImplemented
        return self._steps == other._steps  # pylint: disable=protected-access

    def __lt__(self, other) -> bool:
        # Ordering against foreign types is undefined; returning NotImplemented
        # makes Python raise the conventional TypeError.
        if not isinstance(other, FeaturePath):
            return NotImplemented
        # lexicographic order.
        return self._steps < other._steps  # pylint: disable=protected-access

    def __hash__(self) -> int:
        return hash(self._steps)

    def __len__(self) -> int:
        return len(self._steps)

    def __bool__(self) -> bool:
        return bool(self._steps)


# --- tensorflow_data_validation/utils/preprocessing_util.py ---
# pylint: disable=unused-argument
def add_derived_features(pcoll, schema):
    """Placeholder hook: returns `pcoll` unchanged together with False.

    `schema` is currently unused.
    NOTE(review): presumably the boolean tells callers whether any derived
    features were added -- confirm against the call sites.
    """
    return pcoll, False


def get_metadata_generator():
    """Placeholder hook: always returns None."""
    return None
def load_vocab(path: str) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Reads a vocabulary file into forward and reverse lookup tables.

    Every entry is decoded to text and stripped of surrounding whitespace; its
    zero-based position in the file becomes its integer id.

    Args:
    ----
      path: Location of the vocabulary file. A path ending in "tfrecord.gz" is
        read as a GZIP-compressed TFRecord file; any other path is read as a
        text file with one entry per line.

    Returns:
    -------
      A `(vocab, reverse_vocab)` tuple: `vocab` maps each string token to its
      integer id and `reverse_vocab` maps each integer id back to its token.

    Raises:
    ------
      ValueError: Vocabulary path does not exist.
    """
    if not tf.io.gfile.exists(path):
        raise ValueError("Vocabulary path: %s does not exist" % path)

    token_to_id = {}
    id_to_token = {}

    def _record_entries(entries):
        # Fills both tables from an iterable of raw (bytes or text) entries.
        for position, raw_entry in enumerate(entries):
            token = six.ensure_text(raw_entry).strip()
            token_to_id[token] = position
            id_to_token[position] = token

    if path.endswith("tfrecord.gz"):
        gzip_options = tf.io.TFRecordOptions(compression_type="GZIP")
        _record_entries(tf.compat.v1.io.tf_record_iterator(path, gzip_options))
    else:
        # The file has to stay open while its lines are being consumed.
        with tf.io.gfile.GFile(path) as vocab_file:
            _record_entries(vocab_file)

    return token_to_id, id_to_token
class VocabUtilTest(absltest.TestCase):
    """Tests for vocab_util.load_vocab."""

    def test_text_file(self):
        """Loads a newline-delimited text vocabulary."""
        with tempfile.NamedTemporaryFile() as f:
            f.write(b"Foo\nBar\n")
            # Flush so load_vocab, which reopens the file by name, sees the data.
            f.flush()

            vocab, reverse_vocab = vocab_util.load_vocab(f.name)
            self.assertEqual(vocab, {"Foo": 0, "Bar": 1})
            self.assertEqual(reverse_vocab, {0: "Foo", 1: "Bar"})

    def test_gz_recordio_file(self):
        """Loads a GZIP-compressed TFRecord vocabulary."""
        # The ".tfrecord.gz" suffix is what makes load_vocab take the TFRecord
        # branch.
        with tempfile.NamedTemporaryFile(suffix=".tfrecord.gz") as f:
            writer = tf.io.TFRecordWriter(f.name, options="GZIP")
            for element in [b"Foo", b"Bar"]:
                writer.write(element)
            writer.flush()
            f.flush()

            vocab, reverse_vocab = vocab_util.load_vocab(f.name)
            self.assertEqual(vocab, {"Foo": 0, "Bar": 1})
            self.assertEqual(reverse_vocab, {0: "Foo", 1: "Bar"})
def tf_data_validation_workspace():
    """Declares all TensorFlow Data Validation external dependencies.

    Intended to be called from a WORKSPACE file. It registers two git
    repositories: `com_github_tensorflow_metadata` and `com_github_tfx_bsl`.
    """

    # Both repositories follow the `master` branch rather than a pinned commit
    # or tag, so the fetched sources can change between builds.
    git_repository(
        name = "com_github_tensorflow_metadata",
        branch = "master",
        remote = "https://github.com/tensorflow/metadata.git",
    )

    git_repository(
        name = "com_github_tfx_bsl",
        branch = "master",
        remote = "https://github.com/tensorflow/tfx-bsl",
    )
22 | # ZetaSQL is removed 23 | # This is a candidate for deletion 24 | defines = ["NAMESPACE_FOR_HASH_FUNCTIONS=farmhash"], 25 | includes = ["src/."], 26 | visibility = ["//visibility:public"], 27 | ) 28 | -------------------------------------------------------------------------------- /third_party/googleapis.patch: -------------------------------------------------------------------------------- 1 | --- a/google/rpc/BUILD.bazel 2 | +++ b/google/rpc/BUILD.bazel 3 | @@ -80,6 +80,7 @@ go_proto_library( 4 | go_proto_library( 5 | name = "status_go_proto", 6 | importpath = "google.golang.org/genproto/googleapis/rpc/status", 7 | + importmap = "unknown", 8 | protos = [":status_proto"], 9 | ) 10 | -------------------------------------------------------------------------------- /third_party/local_python.BUILD.tpl: -------------------------------------------------------------------------------- 1 | licenses(["restricted"]) 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | # Point both runtimes to the same python binary to ensure we always 6 | # use the python binary specified by ./configure.py script. 
7 | load("@bazel_tools//tools/python:toolchain.bzl", "py_runtime_pair") 8 | 9 | py_runtime( 10 | name = "py2_runtime", 11 | interpreter_path = "%{PYTHON_BIN_PATH}", 12 | python_version = "PY2", 13 | ) 14 | 15 | py_runtime( 16 | name = "py3_runtime", 17 | interpreter_path = "%{PYTHON_BIN_PATH}", 18 | python_version = "PY3", 19 | ) 20 | 21 | py_runtime_pair( 22 | name = "py_runtime_pair", 23 | py2_runtime = ":py2_runtime", 24 | py3_runtime = ":py3_runtime", 25 | ) 26 | 27 | toolchain( 28 | name = "py_toolchain", 29 | toolchain = ":py_runtime_pair", 30 | toolchain_type = "@bazel_tools//tools/python:toolchain_type", 31 | target_compatible_with = [%{PLATFORM_CONSTRAINT}], 32 | exec_compatible_with = [%{PLATFORM_CONSTRAINT}], 33 | ) 34 | 35 | # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib 36 | # See https://docs.python.org/3/extending/windows.html 37 | cc_import( 38 | name = "python_lib", 39 | interface_library = select({ 40 | ":windows": ":python_import_lib", 41 | # A placeholder for Unix platforms which makes --no_build happy. 
42 | "//conditions:default": "not-existing.lib", 43 | }), 44 | system_provided = 1, 45 | ) 46 | 47 | cc_library( 48 | name = "python_headers", 49 | hdrs = [":python_include"], 50 | deps = select({ 51 | ":windows": [":python_lib"], 52 | "//conditions:default": [], 53 | }), 54 | includes = ["python_include"], 55 | ) 56 | 57 | cc_library( 58 | name = "numpy_headers", 59 | hdrs = [":numpy_include"], 60 | includes = ["numpy_include"], 61 | ) 62 | 63 | config_setting( 64 | name = "windows", 65 | values = {"cpu": "x64_windows"}, 66 | visibility = ["//visibility:public"], 67 | ) 68 | 69 | %{PYTHON_INCLUDE_GENRULE} 70 | %{NUMPY_INCLUDE_GENRULE} 71 | %{PYTHON_IMPORT_LIB_GENRULE} 72 | -------------------------------------------------------------------------------- /third_party/pybind11.BUILD: -------------------------------------------------------------------------------- 1 | # pybind11 - Seamless operability between C++11 and Python. 2 | load("@rules_cc//cc:defs.bzl", "cc_library") 3 | 4 | package(default_visibility = ["//visibility:public"]) 5 | 6 | licenses(["notice"]) 7 | 8 | exports_files(["LICENSE"]) 9 | 10 | OPTIONS = [ 11 | "-fexceptions", 12 | # Useless warnings 13 | "-Xclang-only=-Wno-undefined-inline", 14 | "-Xclang-only=-Wno-pragma-once-outside-header", 15 | "-Xgcc-only=-Wno-error", # no way to just disable the pragma-once warning in gcc 16 | ] 17 | 18 | INCLUDES = [ 19 | "include/pybind11/*.h", 20 | "include/pybind11/detail/*.h", 21 | ] 22 | 23 | EXCLUDES = [ 24 | # Deprecated file that just emits a warning 25 | "include/pybind11/common.h", 26 | ] 27 | 28 | cc_library( 29 | name = "pybind11", 30 | hdrs = glob( 31 | INCLUDES, 32 | exclude = EXCLUDES, 33 | ), 34 | copts = OPTIONS, 35 | includes = ["include"], 36 | deps = ["@local_config_python//:python_headers"], 37 | ) 38 | -------------------------------------------------------------------------------- /third_party/rules_foreign_cc.patch: 
--------------------------------------------------------------------------------
1 | # Patch for foreign_cc/private/cc_toolchain_util.bzl.
2 | # Adds _maybe_convert_gcc(), which rewrites a tool path whose last
3 | # "/"-separated component is exactly "gcc" so that it ends in "g++" and
4 | # returns every other path unchanged, then passes the `cxx` tool looked up in
5 | # get_tools_info() through that helper.
6 | # NOTE(review): assumes the patch tool in use skips text that precedes the
7 | # first "diff --git" header - confirm before relying on this preamble.
8 | diff --git foreign_cc/private/cc_toolchain_util.bzl foreign_cc/private/cc_toolchain_util.bzl
9 | index 1bd872d..7a7880d 100644
10 | --- foreign_cc/private/cc_toolchain_util.bzl
11 | +++ foreign_cc/private/cc_toolchain_util.bzl
12 | @@ -46,6 +46,14 @@ FOREIGN_CC_DISABLED_FEATURES = [
13 |      "thin_lto",
14 |  ]
15 |  
16 | +def _maybe_convert_gcc(gcc_path):
17 | +    gcc_path_steps = gcc_path.split("/")
18 | +    if (gcc_path_steps[-1] == "gcc"):
19 | +        gcc_path_steps[-1] = "g++"
20 | +        return "/".join(gcc_path_steps)
21 | +    else:
22 | +        return gcc_path
23 | +
24 |  def _to_list(element):
25 |      if element == None:
26 |          return []
27 | @@ -270,10 +278,10 @@ def get_tools_info(ctx):
28 |              feature_configuration = feature_configuration,
29 |              action_name = ACTION_NAMES.c_compile,
30 |          ),
31 | -        cxx = cc_common.get_tool_for_action(
32 | +        cxx = _maybe_convert_gcc(cc_common.get_tool_for_action(
33 |              feature_configuration = feature_configuration,
34 |              action_name = ACTION_NAMES.cpp_compile,
35 | -        ),
36 | +        )),
37 |          cxx_linker_static = cc_common.get_tool_for_action(
38 |              feature_configuration = feature_configuration,
39 |              action_name = ACTION_NAMES.cpp_link_static_library,
--------------------------------------------------------------------------------
/third_party/six.BUILD:
--------------------------------------------------------------------------------
1 | # This file is copied from https://github.com/abseil/abseil-py/blob/master/third_party/six.BUILD.
2 | # It is needed to get TFDV to build with a dependency on ZetaSQL.
3 | # Description:
4 | # Six provides simple utilities for wrapping over differences between Python 2
5 | # and Python 3.
6 |
7 | licenses(["notice"])  # MIT
8 |
9 | exports_files(["LICENSE"])
10 |
11 | # Publicly visible library wrapping the single source file six.py.
12 | # srcs_version = "PY2AND3" marks the source as usable from Python 2 and 3.
13 | py_library(
14 |     name = "six",
15 |     srcs = ["six.py"],
16 |     srcs_version = "PY2AND3",
17 |     visibility = ["//visibility:public"],
18 | )
19 |
--------------------------------------------------------------------------------