├── .bazelrc
├── .bazelversion
├── .github
├── reusable-build
│ └── action.yml
└── workflows
│ ├── build.yml
│ ├── ci-lint.yml
│ ├── docs.yml
│ └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── BUILD
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── RELEASE.md
├── WORKSPACE
├── docker-compose.yml
├── docs
├── _toc.yaml
├── anomalies.md
├── api.md
├── custom_data_validation.md
├── get_started.md
├── images
│ ├── anomaly.png
│ ├── feature_stats.png
│ ├── schema.png
│ ├── serving_anomaly.png
│ ├── skew_anomaly.png
│ ├── stats.png
│ ├── tf_full_color_primary_icon.svg
│ ├── unbalanced.png
│ ├── uniform.png
│ ├── uniform_cumulative.png
│ └── zero_length.png
├── index.md
├── install.md
├── javascripts
│ └── mathjax.js
└── stylesheets
│ └── extra.css
├── google3
└── third_party
│ └── py
│ └── tensorflow_data_validation
│ ├── build_macros.bzl
│ └── opensource_only
│ └── BUILD
├── mkdocs.yml
├── pyproject.toml
├── requirements-docs.txt
├── setup.py
├── tensorflow_data_validation
├── BUILD
├── __init__.py
├── anomalies
│ ├── BUILD
│ ├── __init__.py
│ ├── bool_domain_test.cc
│ ├── bool_domain_util.cc
│ ├── bool_domain_util.h
│ ├── custom_domain_util.cc
│ ├── custom_domain_util.h
│ ├── custom_domain_util_test.cc
│ ├── dataset_constraints_util.cc
│ ├── dataset_constraints_util.h
│ ├── dataset_constraints_util_test.cc
│ ├── diff_util.cc
│ ├── diff_util.h
│ ├── feature_statistics_validator.cc
│ ├── feature_statistics_validator.h
│ ├── feature_statistics_validator_test.cc
│ ├── feature_util.cc
│ ├── feature_util.h
│ ├── feature_util_test.cc
│ ├── features_needed.cc
│ ├── features_needed.h
│ ├── features_needed_test.cc
│ ├── float_domain_test.cc
│ ├── float_domain_util.cc
│ ├── float_domain_util.h
│ ├── image_domain_test.cc
│ ├── image_domain_util.cc
│ ├── image_domain_util.h
│ ├── int_domain_test.cc
│ ├── int_domain_util.cc
│ ├── int_domain_util.h
│ ├── internal_types.h
│ ├── map_util.cc
│ ├── map_util.h
│ ├── map_util_test.cc
│ ├── metrics.cc
│ ├── metrics.h
│ ├── metrics_test.cc
│ ├── natural_language_domain_util.cc
│ ├── natural_language_domain_util.h
│ ├── path.cc
│ ├── path.h
│ ├── path_test.cc
│ ├── proto
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ ├── feature_statistics_to_proto.proto
│ │ ├── validation_config.proto
│ │ └── validation_metadata.proto
│ ├── schema.cc
│ ├── schema.h
│ ├── schema_anomalies.cc
│ ├── schema_anomalies.h
│ ├── schema_anomalies_test.cc
│ ├── schema_test.cc
│ ├── schema_util.cc
│ ├── schema_util.h
│ ├── schema_util_test.cc
│ ├── statistics_view.cc
│ ├── statistics_view.h
│ ├── statistics_view_test.cc
│ ├── statistics_view_test_util.cc
│ ├── statistics_view_test_util.h
│ ├── status_util.h
│ ├── string_domain_test.cc
│ ├── string_domain_util.cc
│ ├── string_domain_util.h
│ ├── telemetry.cc
│ ├── telemetry.h
│ ├── test_schema_protos.cc
│ ├── test_schema_protos.h
│ ├── test_util.cc
│ ├── test_util.h
│ └── test_util_test.cc
├── api
│ ├── __init__.py
│ ├── stats_api.py
│ ├── stats_api_test.py
│ ├── validation_api.py
│ ├── validation_api_test.py
│ ├── validation_options.py
│ └── validation_options_test.py
├── arrow
│ ├── __init__.py
│ ├── arrow_util.py
│ ├── arrow_util_test.py
│ ├── decoded_examples_to_arrow.py
│ └── decoded_examples_to_arrow_test.py
├── build_macros.bzl
├── coders
│ ├── __init__.py
│ ├── csv_decoder.py
│ └── csv_decoder_test.py
├── constants.py
├── data_validation.bzl
├── integration_tests
│ ├── drift_skew_metrics_test.py
│ └── sequence_example_e2e_test.py
├── move_generated_files.sh
├── pywrap
│ ├── BUILD
│ ├── __init__.py
│ ├── tensorflow_data_validation_extension.cc
│ ├── validation_submodule.cc
│ └── validation_submodule.h
├── skew
│ ├── __init__.py
│ ├── feature_skew_detector.py
│ ├── feature_skew_detector_test.py
│ └── protos
│ │ ├── BUILD
│ │ ├── __init__.py
│ │ └── feature_skew_results.proto
├── statistics
│ ├── __init__.py
│ ├── generators
│ │ ├── __init__.py
│ │ ├── basic_stats_generator.py
│ │ ├── basic_stats_generator_test.py
│ │ ├── constituents
│ │ │ ├── __init__.py
│ │ │ ├── count_missing_generator.py
│ │ │ ├── count_missing_generator_test.py
│ │ │ ├── length_diff_generator.py
│ │ │ └── length_diff_generator_test.py
│ │ ├── cross_feature_stats_generator.py
│ │ ├── cross_feature_stats_generator_test.py
│ │ ├── empty_value_counter_generator.py
│ │ ├── empty_value_counter_generator_test.py
│ │ ├── image_stats_generator.py
│ │ ├── image_stats_generator_test.py
│ │ ├── input_batch.py
│ │ ├── input_batch_test.py
│ │ ├── lift_stats_generator.py
│ │ ├── lift_stats_generator_test.py
│ │ ├── mutual_information.py
│ │ ├── mutual_information_test.py
│ │ ├── natural_language_domain_inferring_stats_generator.py
│ │ ├── natural_language_domain_inferring_stats_generator_test.py
│ │ ├── natural_language_stats_generator.py
│ │ ├── natural_language_stats_generator_test.py
│ │ ├── partitioned_stats_generator.py
│ │ ├── partitioned_stats_generator_test.py
│ │ ├── sklearn_mutual_information.py
│ │ ├── sklearn_mutual_information_test.py
│ │ ├── sparse_feature_stats_generator.py
│ │ ├── sparse_feature_stats_generator_test.py
│ │ ├── stats_generator.py
│ │ ├── testdata
│ │ │ ├── image1.gif
│ │ │ ├── image2.png
│ │ │ ├── image3.bmp
│ │ │ ├── image4.png
│ │ │ ├── image5.jpg
│ │ │ ├── image6.jpg
│ │ │ └── not_a_image.abc
│ │ ├── time_stats_generator.py
│ │ ├── time_stats_generator_test.py
│ │ ├── top_k_uniques_sketch_stats_generator.py
│ │ ├── top_k_uniques_sketch_stats_generator_test.py
│ │ ├── top_k_uniques_stats_generator.py
│ │ ├── top_k_uniques_stats_generator_test.py
│ │ ├── weighted_feature_stats_generator.py
│ │ └── weighted_feature_stats_generator_test.py
│ ├── stats_impl.py
│ ├── stats_impl_test.py
│ ├── stats_options.py
│ └── stats_options_test.py
├── tools
│ ├── BUILD
│ ├── README.md
│ ├── build_docs.py
│ └── docker_build
│ │ ├── Dockerfile.manylinux2010
│ │ └── build_manylinux.sh
├── types.py
├── types_test.py
├── utils
│ ├── __init__.py
│ ├── anomalies_util.py
│ ├── anomalies_util_test.py
│ ├── artifacts_io_impl.py
│ ├── artifacts_io_impl_test.py
│ ├── batch_util.py
│ ├── batch_util_test.py
│ ├── beam_runner_util.py
│ ├── bin_util.py
│ ├── bin_util_test.py
│ ├── display_util.py
│ ├── display_util_test.py
│ ├── example_weight_map.py
│ ├── example_weight_map_test.py
│ ├── feature_partition_util.py
│ ├── feature_partition_util_test.py
│ ├── io_util.py
│ ├── io_util_test.py
│ ├── metrics_util.py
│ ├── mutual_information_util.py
│ ├── mutual_information_util_test.py
│ ├── path.py
│ ├── preprocessing_util.py
│ ├── quantiles_util.py
│ ├── quantiles_util_test.py
│ ├── schema_util.py
│ ├── schema_util_test.py
│ ├── slicing_util.py
│ ├── slicing_util_test.py
│ ├── stats_gen_lib.py
│ ├── stats_gen_lib_test.py
│ ├── stats_util.py
│ ├── stats_util_test.py
│ ├── test_util.py
│ ├── test_util_test.py
│ ├── top_k_uniques_stats_util.py
│ ├── top_k_uniques_stats_util_test.py
│ ├── validation_lib.py
│ ├── validation_lib_test.py
│ ├── variance_util.py
│ ├── variance_util_test.py
│ ├── vocab_util.py
│ └── vocab_util_test.py
├── version.py
└── workspace.bzl
└── third_party
├── BUILD
├── arrow.BUILD
├── farmhash.BUILD
├── googleapis.patch
├── local_python.BUILD.tpl
├── pybind11.BUILD
├── python_configure.bzl
├── rules_foreign_cc.patch
└── six.BUILD
/.bazelrc:
--------------------------------------------------------------------------------
1 | # Needed to work with ZetaSQL dependency.
2 | # Zetasql is removed.
3 | # This is a candidate for removal
4 | build --cxxopt="-std=c++17"
5 |
6 | # Needed to avoid zetasql proto error.
7 | # Zetasql is removed.
8 | # This is a candidate for removal
9 | build --protocopt=--experimental_allow_proto3_optional
10 |
11 | # icu@: In create_linking_context: in call to create_linking_context(),
12 | # parameter 'user_link_flags' is deprecated and will be removed soon.
13 | # It may be temporarily re-enabled by setting --incompatible_require_linker_input_cc_api=false
14 | build --incompatible_require_linker_input_cc_api=false
15 |
--------------------------------------------------------------------------------
/.bazelversion:
--------------------------------------------------------------------------------
1 | 6.5.0
2 |
--------------------------------------------------------------------------------
/.github/reusable-build/action.yml:
--------------------------------------------------------------------------------
1 | name: Reusable steps to build data-validation
2 |
3 | inputs:
4 | python-version:
5 | description: 'Python version'
6 | required: true
7 | upload-artifact:
8 | description: 'Should upload build artifact or not'
9 | default: false
10 |
11 | runs:
12 | using: 'composite'
13 | steps:
14 | - name: Set up Python ${{ inputs.python-version }}
15 | uses: actions/setup-python@v5
16 | with:
17 | python-version: ${{ inputs.python-version }}
18 |
19 | - name: Build the package for Python ${{ inputs.python-version }}
20 | shell: bash
21 | run: |
22 | version="${{ matrix.python-version }}"
23 | docker compose run -e PYTHON_VERSION=$(echo "$version" | sed 's/\.//') manylinux2010
24 |
25 | - name: Upload wheel artifact for Python ${{ matrix.python-version }}
26 | if: ${{ inputs.upload-artifact == 'true' }}
27 | uses: actions/upload-artifact@v4
28 | with:
29 | name: data-validation-wheel-py${{ matrix.python-version }}
30 | path: dist/*.whl
31 |
32 | - name: Check the wheel
33 | shell: bash
34 | run: |
35 | pip install twine
36 | twine check dist/*
37 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 | branches:
9 | - master
10 | workflow_dispatch:
11 |
12 | jobs:
13 | build:
14 | runs-on: ubuntu-latest
15 | strategy:
16 | matrix:
17 | python-version: ["3.9", "3.10", "3.11"]
18 |
19 | steps:
20 | - name: Checkout
21 | uses: actions/checkout@v4
22 |
23 | - name: Build data-validation
24 | id: build-data-validation
25 | uses: ./.github/reusable-build
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | upload-artifact: true
29 |
30 | upload_to_pypi:
31 | name: Upload to PyPI
32 | runs-on: ubuntu-latest
33 | if: (github.event_name == 'release' && startsWith(github.ref, 'refs/tags')) || (github.event_name == 'workflow_dispatch')
34 | needs: [build]
35 | environment:
36 | name: pypi
37 | url: https://pypi.org/p/tensorflow-data-validation/
38 | permissions:
39 | id-token: write
40 | steps:
41 | - name: Retrieve wheels
42 | uses: actions/download-artifact@v4.1.8
43 | with:
44 | merge-multiple: true
45 | path: wheels
46 |
47 | - name: List the build artifacts
48 | run: |
49 | ls -lAs wheels/
50 |
51 | - name: Upload to PyPI
52 | uses: pypa/gh-action-pypi-publish@release/v1.9
53 | with:
54 | packages_dir: wheels/
55 |
--------------------------------------------------------------------------------
/.github/workflows/ci-lint.yml:
--------------------------------------------------------------------------------
1 | name: pre-commit
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches: [master]
7 |
8 | jobs:
9 | pre-commit:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v4.1.7
13 | with:
14 | # Ensure the full history is fetched
15 | # This is required to run pre-commit on a specific set of commits
16 | # TODO: Remove this when all the pre-commit issues are fixed
17 | fetch-depth: 0
18 | - uses: actions/setup-python@v5.1.1
19 | with:
20 | python-version: 3.13
21 | - uses: pre-commit/action@v3.0.1
22 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: Deploy docs
2 | on:
3 | workflow_dispatch:
4 | push:
5 | branches:
6 | - 'master'
7 | pull_request:
8 | permissions:
9 | contents: write
10 | jobs:
11 | deploy:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - name: Checkout repo
15 | uses: actions/checkout@v4
16 |
17 | - name: Configure Git Credentials
18 | run: |
19 | git config user.name github-actions[bot]
20 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com
21 | if: (github.event_name != 'pull_request')
22 |
23 | - name: Set up Python 3.9
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: '3.9'
27 | cache: 'pip'
28 | cache-dependency-path: |
29 | setup.py
30 | requirements-docs.txt
31 |
32 | - name: Save time for cache for mkdocs
33 | run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
34 |
35 | - name: Caching
36 | uses: actions/cache@v4
37 | with:
38 | key: mkdocs-material-${{ env.cache_id }}
39 | path: .cache
40 | restore-keys: |
41 | mkdocs-material-
42 |
43 | - name: Install Dependencies
44 | run: pip install -r requirements-docs.txt
45 |
46 | - name: Deploy to GitHub Pages
47 | run: mkdocs gh-deploy --force
48 | if: (github.event_name != 'pull_request')
49 |
50 | - name: Build docs to check for errors
51 | run: mkdocs build
52 | if: (github.event_name == 'pull_request')
53 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 | branches:
9 | - master
10 | workflow_dispatch:
11 |
12 | jobs:
13 | test:
14 | runs-on: ubuntu-latest
15 | strategy:
16 | matrix:
17 | python-version: ["3.9", "3.10", "3.11"]
18 |
19 | steps:
20 | - name: Checkout
21 | uses: actions/checkout@v4
22 |
23 | - name: Build data-validation
24 | id: build-data-validation
25 | uses: ./.github/reusable-build
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 |
29 | - name: Install built wheel
30 | shell: bash
31 | run: |
32 | PYTHON_VERSION_TAG="cp$(echo ${{ matrix.python-version }} | sed 's/\.//')"
33 | WHEEL_FILE=$(ls dist/*${PYTHON_VERSION_TAG}*.whl)
34 | pip install "${WHEEL_FILE}[test]"
35 |
36 | - name: Run Test
37 | run: |
38 | rm -rf bazel-*
39 | # run tests
40 | pytest -vv
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # pipenv
86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not
89 | # install all needed dependencies.
90 | #Pipfile.lock
91 |
92 | # celery beat schedule file
93 | celerybeat-schedule
94 |
95 | # SageMath parsed files
96 | *.sage.py
97 |
98 | # Environments
99 | .env
100 | .venv
101 | env/
102 | venv/
103 | ENV/
104 | env.bak/
105 | venv.bak/
106 |
107 | # Spyder project settings
108 | .spyderproject
109 | .spyproject
110 |
111 | # Rope project settings
112 | .ropeproject
113 |
114 | # Intellij project settings
115 | .idea
116 |
117 | # mkdocs documentation
118 | /site
119 |
120 | # mypy
121 | .mypy_cache/
122 | .dmypy.json
123 | dmypy.json
124 |
125 | # Pyre type checker
126 | .pyre/
127 |
128 | # pb2.py files
129 | *_pb2.py
130 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # pre-commit is a tool to perform a predefined set of tasks manually and/or
2 | # automatically before git commits are made.
3 | #
4 | # Config reference: https://pre-commit.com/#pre-commit-configyaml---top-level
5 | #
6 | # Common tasks
7 | #
8 | # - Register git hooks: pre-commit install --install-hooks
9 | # - Run on all files: pre-commit run --all-files
10 | #
11 | # These pre-commit hooks are run as CI.
12 | #
13 | # NOTE: if it can be avoided, add configs/args in pyproject.toml or below instead of creating a new `.config.file`.
14 | # https://pre-commit.ci/#configuration
15 | ci:
16 | autoupdate_schedule: monthly
17 | autofix_commit_msg: |
18 | [pre-commit.ci] Apply automatic pre-commit fixes
19 |
20 | repos:
21 | # general
22 | - repo: https://github.com/pre-commit/pre-commit-hooks
23 | rev: v4.6.0
24 | hooks:
25 | - id: end-of-file-fixer
26 | exclude: '\.svg$|\.patch$'
27 | - id: trailing-whitespace
28 | exclude: '\.svg$|\.patch$'
29 | - id: check-json
30 | - id: check-yaml
31 | args: [--allow-multiple-documents, --unsafe]
32 | - id: check-toml
33 |
34 | - repo: https://github.com/astral-sh/ruff-pre-commit
35 | rev: v0.5.6
36 | hooks:
37 | - id: ruff
38 | args: ["--fix"]
39 | - id: ruff-format
40 |
--------------------------------------------------------------------------------
/BUILD:
--------------------------------------------------------------------------------
1 | load("@bazel_gazelle//:def.bzl", "gazelle")
2 |
3 | package(
4 | default_visibility = [":__subpackages__"],
5 | )
6 |
7 | licenses(["notice"])
8 |
9 | exports_files(["LICENSE"])
10 |
11 | gazelle(
12 | name = "gazelle-update-repos",
13 | args = [
14 | "-from_file=go.mod",
15 | "-to_macro=deps.bzl%go_dependencies",
16 | ],
17 | command = "update-repos",
18 | )
19 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution,
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | TFDV follows the
26 | [Google Python Style Guide](http://google.github.io/styleguide/pyguide.html).
27 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | version: '3.1'
16 |
17 | # Extensions are not supported until 3.4, thus the repeated boilerplate below.
18 |
19 | # We mount the TFDV project root at /build (which is the WORKDIR of the image)
20 | # in the container.
21 | services:
22 | manylinux2010:
23 | image: tfdv-build:manylinux2010
24 | build:
25 | context: .
26 | dockerfile: tensorflow_data_validation/tools/docker_build/Dockerfile.manylinux2010
27 | volumes:
28 | - .:/build:delegated
29 |
--------------------------------------------------------------------------------
/docs/_toc.yaml:
--------------------------------------------------------------------------------
1 | toc:
2 | - title: "Install"
3 | path: /tfx/data_validation/install
4 | - title: "Get started"
5 | path: /tfx/data_validation/get_started
6 |
--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
1 | # TensorFlow Data Validation API Documentation
2 |
3 |
4 | ::: tensorflow_data_validation
5 |
--------------------------------------------------------------------------------
/docs/custom_data_validation.md:
--------------------------------------------------------------------------------
1 | # Custom Data Validation
2 |
3 |
6 |
7 | TFDV supports custom data validation using SQL. You can run custom data
8 | validation using
9 | [validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py#L236)
10 | or
11 | [custom_validate_statistics](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/api/validation_api.py#L535).
12 | Use `validate_statistics` to run standard, schema-based data validation along
13 | with custom validation. Use `custom_validate_statistics` to run only custom
14 | validation.
15 |
16 | ## Configuring Custom Data Validation
17 |
18 | Use the
19 | [CustomValidationConfig](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto)
20 | to define custom validations to run. For each validation, provide an
21 | SQL expression, which returns a boolean value. Each SQL expression is run
22 | against the summary statistics for the specified feature. If the expression
23 | returns false, TFDV generates a custom anomaly using the provided severity and
24 | anomaly description.
25 |
26 | You may configure custom validations that run against individual features or
27 | feature pairs. For each feature, specify both the dataset (i.e., slice) and the
28 | feature path to use, though you may leave the dataset name blank if you want to
29 | validate the default slice (i.e., all examples). For single feature validations,
30 | the feature statistics are bound to `feature`. For feature pair validations, the
31 | test feature statistics are bound to `feature_test` and the base feature
32 | statistics are bound to `feature_base`. See the section below for example
33 | queries.
34 |
35 | If a custom validation triggers an anomaly, TFDV will return an Anomalies proto
36 | with the reason(s) for the anomaly. Each reason will have a short description,
37 | which is user configured, and a description with the query that caused the
38 | anomaly, the dataset names on which the query was run, and the base feature path
39 | (if running a feature-pair validation). See the section below for example
40 | results of custom validation.
41 |
42 | See the
43 | [documentation](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/anomalies/proto/custom_validation_config.proto)
44 | in the `CustomValidationConfig` proto for example
45 | configurations.
46 |
--------------------------------------------------------------------------------
/docs/images/anomaly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/anomaly.png
--------------------------------------------------------------------------------
/docs/images/feature_stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/feature_stats.png
--------------------------------------------------------------------------------
/docs/images/schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/schema.png
--------------------------------------------------------------------------------
/docs/images/serving_anomaly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/serving_anomaly.png
--------------------------------------------------------------------------------
/docs/images/skew_anomaly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/skew_anomaly.png
--------------------------------------------------------------------------------
/docs/images/stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/stats.png
--------------------------------------------------------------------------------
/docs/images/tf_full_color_primary_icon.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/images/unbalanced.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/unbalanced.png
--------------------------------------------------------------------------------
/docs/images/uniform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/uniform.png
--------------------------------------------------------------------------------
/docs/images/uniform_cumulative.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/uniform_cumulative.png
--------------------------------------------------------------------------------
/docs/images/zero_length.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorflow/data-validation/59bb2c8f067ecfd0d68fe222a72292a91a52812d/docs/images/zero_length.png
--------------------------------------------------------------------------------
/docs/javascripts/mathjax.js:
--------------------------------------------------------------------------------
1 | window.MathJax = {
2 | tex: {
3 | inlineMath: [["\\(", "\\)"]],
4 | displayMath: [["\\[", "\\]"]],
5 | processEscapes: true,
6 | processEnvironments: true
7 | },
8 | options: {
9 | ignoreHtmlClass: ".*|",
10 | processHtmlClass: "arithmatex"
11 | }
12 | };
13 |
14 | document$.subscribe(() => {
15 | MathJax.startup.output.clearCache()
16 | MathJax.typesetClear()
17 | MathJax.texReset()
18 | MathJax.typesetPromise()
19 | })
20 |
--------------------------------------------------------------------------------
/docs/stylesheets/extra.css:
--------------------------------------------------------------------------------
1 | :root {
2 | --md-primary-fg-color: #FFA800;
3 | --md-primary-fg-color--light: #CCCCCC;
4 | --md-primary-fg-color--dark: #425066;
5 | }
6 |
7 | .video-wrapper {
8 | max-width: 240px;
9 | display: flex;
10 | flex-direction: row;
11 | }
12 | .video-wrapper > iframe {
13 | width: 100%;
14 | aspect-ratio: 16 / 9;
15 | }
16 |
17 | .buttons-wrapper {
18 | flex-wrap: wrap;
19 | gap: 1em;
20 | display: flex;
21 | /* flex-grow: 1; */
22 | /* justify-content: center; */
23 | /* align-content: center; */
24 | }
25 |
26 | .buttons-wrapper > a {
27 | justify-content: center;
28 | align-content: center;
29 | flex-wrap: nowrap;
30 | /* gap: 1em; */
31 | align-items: center;
32 | text-align: center;
33 | flex: 1 1 30%;
34 | display: flex;
35 | }
36 |
37 | .md-button > .buttons-content {
38 | align-items: center;
39 | justify-content: center;
40 | display: flex;
41 | gap: 1em;
42 | }
43 |
--------------------------------------------------------------------------------
/google3/third_party/py/tensorflow_data_validation/build_macros.bzl:
--------------------------------------------------------------------------------
1 | """BUILD macros."""
2 |
3 | load("//third_party/bazel_rules/rules_python/python:py_extension.bzl", "py_extension")
4 |
5 | def tfdv_pybind_extension(
6 | name,
7 | srcs,
8 | module_name,
9 | deps = [],
10 | visibility = None):
11 | py_extension(
12 | name = name,
13 | module_name = module_name,
14 | srcs = srcs,
15 | srcs_version = "PY3ONLY",
16 | copts = [
17 | "-fno-strict-aliasing",
18 | "-fexceptions",
19 | ],
20 | features = ["-use_header_modules"],
21 | deps = deps,
22 | visibility = visibility,
23 | )
24 |
--------------------------------------------------------------------------------
/google3/third_party/py/tensorflow_data_validation/opensource_only/BUILD:
--------------------------------------------------------------------------------
1 | load("//tools/build_defs/testing:bzl_library.bzl", "bzl_library")
2 |
3 | bzl_library(
4 | name = "build_macros_bzl",
5 | srcs = ["build_macros.bzl"],
6 | parse_tests = False,
7 | visibility = ["//visibility:private"],
8 | )
9 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: TensorFlow Data Validation
2 | repo_name: "data-validation"
3 | repo_url: https://github.com/tensorflow/data-validation
4 |
5 | theme:
6 | logo: images/tf_full_color_primary_icon.svg
7 | name: material
8 | palette:
9 | # Palette toggle for automatic mode
10 | - media: "(prefers-color-scheme)"
11 | primary: custom
12 | accent: custom
13 | toggle:
14 | icon: material/brightness-auto
15 | name: Switch to light mode
16 |
17 | # Palette toggle for light mode
18 | - media: "(prefers-color-scheme: light)"
19 | primary: custom
20 | accent: custom
21 | scheme: default
22 | toggle:
23 | icon: material/brightness-7
24 | name: Switch to dark mode
25 |
26 | # Palette toggle for dark mode
27 | - media: "(prefers-color-scheme: dark)"
28 | primary: custom
29 | accent: custom
30 | scheme: slate
31 | toggle:
32 | icon: material/brightness-4
33 | name: Switch to system preference
34 | favicon: images/tf_full_color_primary_icon.svg
35 |
36 | features:
37 | - content.code.copy
38 | - content.code.select
39 | - content.action.edit
40 |
41 | plugins:
42 | - search
43 | - autorefs
44 | - mkdocstrings:
45 | default_handler: python
46 | handlers:
47 | python:
48 | options:
49 | show_source: true
50 | show_root_heading: true
51 | unwrap_annotated: true
52 | show_symbol_type_toc: true
53 | show_if_no_docstring: true
54 | show_symbol_type_heading: true
55 | merge_init_into_class: true
56 | show_signature_annotations: true
57 | separate_signature: true
58 | signature_crossrefs: true
59 | group_by_category: true
60 | show_category_heading: true
61 | show_submodules: false
62 | show_root_full_path: true
63 | docstring_section_style: "spacy"
64 | inherited_members: true
65 | summary: false
66 | filters:
67 | - "!^_"
68 | - "^__init__$"
69 | - "^__call__$"
70 | - "^__version__$"
71 | - "!^logger"
72 | - "!^test_"
73 | - "!_test$"
74 | extensions:
75 | - griffe_inherited_docstrings
76 | import:
77 | - https://docs.python.org/3/objects.inv
78 |
79 | extra_css:
80 | - stylesheets/extra.css
81 |
82 | extra_javascript:
83 | - javascripts/mathjax.js
84 | - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js
85 |
86 | markdown_extensions:
87 | - admonition
88 | - attr_list
89 | - def_list
90 | - tables
91 | - toc:
92 | permalink: true
93 | - pymdownx.highlight:
94 | anchor_linenums: true
95 | linenums: false
96 | line_spans: __span
97 | pygments_lang_class: true
98 | - pymdownx.inlinehilite
99 | - pymdownx.snippets
100 | - pymdownx.superfences
101 | - pymdownx.arithmatex:
102 | generic: true
103 | - pymdownx.critic
104 | - pymdownx.caret
105 | - pymdownx.keys
106 | - pymdownx.mark
107 | - pymdownx.tilde
108 | - md_in_html
109 | - pymdownx.emoji:
110 | emoji_index: !!python/name:material.extensions.emoji.twemoji
111 | emoji_generator: !!python/name:material.extensions.emoji.to_svg
112 | watch:
113 | - tensorflow_data_validation
114 |
115 | nav:
116 | - Home: index.md
117 | - Install: install.md
118 | - Getting Started: get_started.md
119 | - Anomalies: anomalies.md
120 | - API: api.md
121 |
--------------------------------------------------------------------------------
/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | mkdocs
2 | mkdocs-material
3 | mkdocstrings[python]
4 | griffe-inherited-docstrings
5 | mkdocs-autorefs
6 | ruff
7 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/BUILD:
--------------------------------------------------------------------------------
1 | load("@bazel_skylib//lib:selects.bzl", "selects")
2 |
3 | licenses(["notice"]) # Apache 2.0
4 |
5 | config_setting(  # Matches macOS builds whose cpu value is "darwin".
6 | name = "macos_x86_64",
7 | values = {
8 | "apple_platform_type": "macos",
9 | "cpu": "darwin",
10 | },
11 | )
12 |
13 | config_setting(  # Matches macOS builds whose cpu value is "darwin_arm64".
14 | name = "macos_arm64",
15 | values = {
16 | "apple_platform_type": "macos",
17 | "cpu": "darwin_arm64",
18 | },
19 | )
20 |
21 | selects.config_setting_group(  # Matches when either macOS setting above matches.
22 | name = "macos",
23 | match_any = [
24 | ":macos_x86_64",
25 | ":macos_arm64",
26 | ],
27 | )
28 |
29 | sh_binary(  # Shell entry point; `data` lists the targets it depends on at run time.
30 | name = "move_generated_files",
31 | srcs = ["move_generated_files.sh"],
32 | data = select({  # Only a default branch exists, so every configuration gets the same list.
33 | "//conditions:default": [
34 | "//tensorflow_data_validation/anomalies/proto:validation_config_proto_py_pb2",
35 | "//tensorflow_data_validation/anomalies/proto:validation_metadata_proto_py_pb2",
36 | "//tensorflow_data_validation/pywrap:tensorflow_data_validation_extension.so",
37 | "//tensorflow_data_validation/skew/protos:feature_skew_results_proto_py_pb2",
38 | ],
39 | }),
40 | )
41 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/bool_domain_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_BOOL_DOMAIN_UTIL_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_BOOL_DOMAIN_UTIL_H_
18 |
19 | #include <vector>
20 |
21 | #include "tensorflow_data_validation/anomalies/internal_types.h"
22 | #include "tensorflow_data_validation/anomalies/statistics_view.h"
23 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
24 |
25 | namespace tensorflow {
26 | namespace data_validation {
27 |
28 | // Updates a BoolDomain using only its own contents. Namely, if the string
29 | // values corresponding to true and false in the domain are the same, the
30 | std::vector<Description> UpdateBoolDomainSelf(  // value for false is cleared.
31 |     tensorflow::metadata::v0::BoolDomain* bool_domain);
32 |
33 | // This updates bool_domain. Should only be called if bool_domain is set.
34 | // If the type is INT and the min and max are out of the range {0,1},
35 | // this will set int_domain.
36 | std::vector<Description> UpdateBoolDomain(
37 |     const FeatureStatsView& feature_stats,
38 |     tensorflow::metadata::v0::Feature* feature);
39 |
40 | // Determine if this could be a BoolDomain.
41 | // Note this takes precedence over IntDomain and StringDomain.
42 | bool IsBoolDomainCandidate(const FeatureStatsView& feature_stats);
43 |
44 | // Generate a BoolDomain from the stats.
45 | // The behavior is undefined if IsBoolDomainCandidate(stats) is false.
46 | tensorflow::metadata::v0::BoolDomain BoolDomainFromStats(
47 |     const FeatureStatsView& stats);
48 |
49 | }  // namespace data_validation
50 | }  // namespace tensorflow
51 |
52 | #endif  // TENSORFLOW_DATA_VALIDATION_ANOMALIES_BOOL_DOMAIN_UTIL_H_
53 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/custom_domain_util.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "tensorflow_data_validation/anomalies/custom_domain_util.h"
17 |
18 | #include <string>
19 | #include <vector>
20 |
21 | #include "google/protobuf/descriptor.h"
22 | #include "google/protobuf/text_format.h"
23 | #include "absl/base/log_severity.h"
24 | #include "absl/log/log.h"
25 |
26 | namespace tensorflow {
27 | namespace data_validation {
28 | namespace {
29 |
30 | using std::string;
31 |
32 | // LINT.IfChange
33 | constexpr char kDomainInfo[] = "domain_info";
34 | // LINT.ThenChange(../utils/stats_util.py)
35 |
36 | // Parses `domain_info` as a text-format Feature and merges it into `feature`.
37 | // Returns true only when the text parses, sets exactly one field, and that
38 | // field belongs to the `domain_info` oneof. On failure `feature` is untouched.
39 | bool ParseCustomDomainInfo(const string& domain_info,
40 |                            tensorflow::metadata::v0::Feature* feature) {
41 |   // Temporary feature for parsing domain_info.
42 |   tensorflow::metadata::v0::Feature domain_info_feature;
43 |   if (!google::protobuf::TextFormat::ParseFromString(
44 |           domain_info, &domain_info_feature)) {
45 |     return false;
46 |   }
47 |   // List the populated fields of the parsed message, using the reflection
48 |   // object of the very message being inspected.
49 |   std::vector<const google::protobuf::FieldDescriptor*> fields_set;
50 |   domain_info_feature.GetReflection()->ListFields(domain_info_feature,
51 |                                                   &fields_set);
52 |   // Ensure only one field is set, which is part of the domain_info oneof.
53 |   if (fields_set.size() != 1) {
54 |     return false;
55 |   }
56 |   const google::protobuf::OneofDescriptor* oneof =
57 |       fields_set[0]->containing_oneof();
58 |   if (oneof == nullptr || oneof->name() != kDomainInfo) {
59 |     return false;
60 |   }
61 |   feature->MergeFrom(domain_info_feature);
62 |   return true;
63 | }
64 |
65 | }  // namespace
66 |
67 | // Applies the 'domain_info' custom statistic, if any, to `feature`.
68 | // Returns true iff exactly one such statistic exists, `feature` has no domain
69 | // yet, and the statistic's value parses into a single domain_info field.
70 | bool BestEffortUpdateCustomDomain(
71 |     const std::vector<tensorflow::metadata::v0::CustomStatistic>& custom_stats,
72 |     tensorflow::metadata::v0::Feature* feature) {
73 |   // The first 'domain_info' statistic seen. It is tracked by pointer rather
74 |   // than by value so a duplicate is reported even when the first value is an
75 |   // empty string.
76 |   const tensorflow::metadata::v0::CustomStatistic* domain_info_stat = nullptr;
77 |   for (const auto& custom_stat : custom_stats) {
78 |     if (custom_stat.name() != kDomainInfo) {
79 |       continue;
80 |     }
81 |     if (domain_info_stat != nullptr) {
82 |       LOG(ERROR) << "Duplicate 'domain_info' custom_stat ["
83 |                  << domain_info_stat->str() << ", " << custom_stat.str()
84 |                  << "], this is a stats bug.";
85 |       return false;
86 |     }
87 |     domain_info_stat = &custom_stat;
88 |   }
89 |   if (domain_info_stat == nullptr || domain_info_stat->str().empty()) {
90 |     return false;
91 |   }
92 |   const string& domain_info = domain_info_stat->str();
93 |   // Never override existing domain_infos with a custom domain_info for safety.
94 |   if (feature->domain_info_case() !=
95 |       tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET) {
96 |     LOG(INFO) << "Valid custom domain_info: " << domain_info
97 |               << " ignored due to existing domain, for feature :"
98 |               << feature->DebugString();
99 |     return false;
100 |   }
101 |   if (!ParseCustomDomainInfo(domain_info, feature)) {
102 |     LOG(ERROR) << "Could not parse 'domain_info' custom_stat: " << domain_info
103 |                << ". It is expected to contain exactly one field of the "
104 |                << "Feature.domain_info oneof, e.g.: 'mid_domain {}'.";
105 |     return false;
106 |   }
107 |   return true;
108 | }
109 |
110 | }  // namespace data_validation
111 | }  // namespace tensorflow
112 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/custom_domain_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_CUSTOM_DOMAIN_UTIL_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_CUSTOM_DOMAIN_UTIL_H_
18 |
19 | #include <vector>
20 |
21 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
22 | #include "tensorflow_metadata/proto/v0/statistics.pb.h"
23 |
24 | namespace tensorflow {
25 | namespace data_validation {
26 |
27 | // Semantic domains like image_domain, url_domain, ... can be detected by
28 | // heuristics in stats generation. If such a domain is detected the feature
29 | // stats are associated with a CustomStatistic with name: 'domain_info' and a
30 | // str value with the text representation of the detected domain, e.g:
31 | // custom_stats: {name: 'domain_info' str: 'mid_domain {}'}
32 | //
33 | // This method provides a best-effort update of the semantic type of `feature`
34 | // based on `custom_stats` and returns true iff a valid custom domain was
35 | // detected and successfully updated `feature`. The logic is currently
36 | // conservative:
37 | // - Never modify `feature` if it has an existing domain
38 | // - If a feature is associated with multiple custom_stats for 'domain_info'
39 | //   they are ignored
40 | // - If the value of the 'domain_info' custom stat is invalid or does not set
41 | //   exactly one field of the Feature.domain_info oneof it is ignored
42 | bool BestEffortUpdateCustomDomain(
43 |     const std::vector<tensorflow::metadata::v0::CustomStatistic>& custom_stats,
44 |     tensorflow::metadata::v0::Feature* feature);
45 |
46 | }  // namespace data_validation
47 | }  // namespace tensorflow
48 |
49 | #endif  // TENSORFLOW_DATA_VALIDATION_ANOMALIES_CUSTOM_DOMAIN_UTIL_H_
50 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/custom_domain_util_test.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #include "tensorflow_data_validation/anomalies/custom_domain_util.h"
16 |
17 | #include <string>
18 | #include <vector>
19 |
20 | #include <gmock/gmock.h>
21 | #include <gtest/gtest.h>
22 | #include "tensorflow_data_validation/anomalies/test_util.h"
23 |
24 | namespace tensorflow {
25 | namespace data_validation {
26 |
27 | namespace {
28 |
29 | using std::string;
30 | using ::tensorflow::metadata::v0::CustomStatistic;
31 | using ::tensorflow::metadata::v0::Feature;
32 | using ::testing::Test;
33 |
34 | // Builds a CustomStatistic named 'domain_info' whose str payload is `value`.
35 | CustomStatistic DomainInfoStatistic(const string& value) {
36 |   CustomStatistic custom_stat;
37 |   custom_stat.set_name("domain_info");
38 |   custom_stat.set_str(value);
39 |   return custom_stat;
40 | }
41 |
42 | // A custom statistic under any other name must leave the feature untouched.
43 | TEST(CustomDomainUtilTest, FailureOnNoDomainInfoCustomStat) {
44 |   Feature feature;
45 |   CustomStatistic custom_stat;
46 |   custom_stat.set_name("some_other_name");
47 |   custom_stat.set_str("natural_language_domain");
48 |   EXPECT_FALSE(BestEffortUpdateCustomDomain(
49 |       std::vector<CustomStatistic>({custom_stat}), &feature));
50 |   EXPECT_EQ(feature.domain_info_case(),
51 |             tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET);
52 | }
53 |
54 | // A single valid domain_info statistic sets the domain on an empty feature.
55 | TEST(CustomDomainUtilTest, SuccessOnEmptyFeature) {
56 |   Feature feature;
57 |   EXPECT_TRUE(BestEffortUpdateCustomDomain(
58 |       std::vector<CustomStatistic>(
59 |           {DomainInfoStatistic("natural_language_domain {}")}),
60 |       &feature));
61 |   EXPECT_TRUE(feature.has_natural_language_domain());
62 | }
63 |
64 | // An existing domain is never overridden.
65 | TEST(CustomDomainUtilTest, FailureOnFeatureWithDomain) {
66 |   Feature feature;
67 |   feature.mutable_string_domain();
68 |   EXPECT_FALSE(BestEffortUpdateCustomDomain(
69 |       std::vector<CustomStatistic>(
70 |           {DomainInfoStatistic("natural_language_domain {}")}),
71 |       &feature));
72 |   EXPECT_TRUE(feature.has_string_domain());
73 | }
74 |
75 | // Two domain_info statistics on one feature are rejected.
76 | TEST(CustomDomainUtilTest, FailureOnMultipleDomainInfosFeature) {
77 |   Feature feature;
78 |   EXPECT_FALSE(BestEffortUpdateCustomDomain(
79 |       std::vector<CustomStatistic>(
80 |           {DomainInfoStatistic("natural_language_domain {}"),
81 |            DomainInfoStatistic("natural_language_domain {}")}),
82 |       &feature));
83 |   EXPECT_EQ(feature.domain_info_case(),
84 |             tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET);
85 | }
86 |
87 | // Empty, unparseable, and over-specified values are all rejected.
88 | TEST(CustomDomainUtilTest, FailureOnInvalidDomainValue) {
89 |   Feature feature;
90 |
91 |   EXPECT_FALSE(BestEffortUpdateCustomDomain(
92 |       std::vector<CustomStatistic>({DomainInfoStatistic("")}), &feature));
93 |   EXPECT_EQ(feature.domain_info_case(),
94 |             tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET);
95 |
96 |   EXPECT_FALSE(BestEffortUpdateCustomDomain(
97 |       std::vector<CustomStatistic>({DomainInfoStatistic("This is not valid")}),
98 |       &feature));
99 |   EXPECT_EQ(feature.domain_info_case(),
100 |             tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET);
101 |
102 |   EXPECT_FALSE(BestEffortUpdateCustomDomain(
103 |       std::vector<CustomStatistic>({DomainInfoStatistic(
104 |           "name: 'It should not set other fields!' image_domain {} ")}),
105 |       &feature));
106 |   EXPECT_EQ(feature.domain_info_case(),
107 |             tensorflow::metadata::v0::Feature::DOMAIN_INFO_NOT_SET);
108 | }
109 |
110 | }  // namespace
111 |
112 | }  // namespace data_validation
113 | }  // namespace tensorflow
114 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/dataset_constraints_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | // Utilities to modify a dataset constraint in the schema.
16 | #ifndef THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_DATASET_CONSTRAINTS_UTIL_H_
17 | #define THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_DATASET_CONSTRAINTS_UTIL_H_
18 |
19 | #include <vector>
20 |
21 | #include "tensorflow_data_validation/anomalies/internal_types.h"
22 | #include "tensorflow_data_validation/anomalies/statistics_view.h"
23 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
24 | #include "tensorflow_metadata/proto/v0/statistics.pb.h"
25 |
26 | namespace tensorflow {
27 | namespace data_validation {
28 | // Specifies whether the dataset constraints has a comparator of the specified
29 | // type.
30 | bool DatasetConstraintsHasComparator(
31 |     const tensorflow::metadata::v0::DatasetConstraints& dataset_constraints,
32 |     DatasetComparatorType comparator_type);
33 |
34 | // Gets the num examples comparator of the specified type, creating it if it
35 | // does not exist.
36 | tensorflow::metadata::v0::NumericValueComparator* GetNumExamplesComparator(
37 |     tensorflow::metadata::v0::DatasetConstraints* dataset_constraints,
38 |     DatasetComparatorType comparator_type);
39 |
40 | // Updates the num examples comparator from the dataset constraints.
41 | std::vector<Description> UpdateNumExamplesComparatorDirect(
42 |     const DatasetStatsView& stats, DatasetComparatorType comparator_type,
43 |     tensorflow::metadata::v0::NumericValueComparator* comparator);
44 |
45 | // Updates the min and max examples count from the dataset constraints.
46 | std::vector<Description> UpdateExamplesCount(
47 |     const DatasetStatsView& stats,
48 |     tensorflow::metadata::v0::DatasetConstraints* dataset_constraints);
49 |
50 | }  // namespace data_validation
51 | }  // namespace tensorflow
52 |
53 | // The guard is closed here, after the declarations, so that it covers the
54 | // whole header rather than only the include list.
55 | #endif  // THIRD_PARTY_PY_TENSORFLOW_DATA_VALIDATION_ANOMALIES_DATASET_CONSTRAINTS_UTIL_H_
56 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/diff_util.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "tensorflow_data_validation/anomalies/diff_util.h"
17 |
18 | #include <vector>
19 |
20 | #include "absl/log/check.h"
21 |
22 | namespace tensorflow {
23 | namespace data_validation {
24 |
25 | // Placeholder implementation: schema diffing is not available, so any call
26 | // terminates the process through the failed CHECK below.
27 | std::vector<tensorflow::metadata::v0::DiffRegion> ComputeDiff(
28 |     const std::vector<absl::string_view>& a_lines,
29 |     const std::vector<absl::string_view>& b_lines) {
30 |   CHECK(false) << "Schema diff is currently not supported.";
31 |   // Not reached. Present so that every control path of this non-void function
32 |   // ends in a return statement regardless of how CHECK is expanded.
33 |   return {};
34 | }
35 |
36 | }  // namespace data_validation
37 | }  // namespace tensorflow
38 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/diff_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_DIFF_UTIL_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_DIFF_UTIL_H_
18 |
19 | #include <vector>
20 |
21 | #include "absl/strings/string_view.h"
22 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h"
23 |
24 | namespace tensorflow {
25 | namespace data_validation {
26 |
27 | // The schema diff computation functionality is currently not supported.
28 | // Tracked in https://github.com/tensorflow/data-validation/issues/39
29 | std::vector<tensorflow::metadata::v0::DiffRegion> ComputeDiff(
30 |     const std::vector<absl::string_view>& a_lines,
31 |     const std::vector<absl::string_view>& b_lines);
32 |
33 | }  // namespace data_validation
34 | }  // namespace tensorflow
35 | #endif  // TENSORFLOW_DATA_VALIDATION_ANOMALIES_DIFF_UTIL_H_
36 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/features_needed.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "tensorflow_data_validation/anomalies/features_needed.h"
17 |
18 | #include <vector>
19 |
20 | #include "absl/status/status.h"
21 | #include "tensorflow_data_validation/anomalies/path.h"
22 | #include "tensorflow_data_validation/anomalies/proto/validation_metadata.pb.h"
23 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
24 |
25 | namespace tensorflow {
26 | namespace data_validation {
27 |
28 | // Appends one path_and_reason_feature_need entry to `result` for every
29 | // (path, reasons) pair of `feature_needed`. Always returns OkStatus.
30 | absl::Status ToFeaturesNeededProto(const FeaturesNeeded& feature_needed,
31 |                                    FeaturesNeededProto* result) {
32 |   for (const auto& entry : feature_needed) {
33 |     // Fill the new element in place rather than building a temporary message
34 |     // and copying it into the repeated field.
35 |     PathAndReasonFeatureNeeded* path_and_reason =
36 |         result->add_path_and_reason_feature_need();
37 |     *path_and_reason->mutable_path() = entry.first.AsProto();
38 |     for (const auto& reason_feature_needed : entry.second) {
39 |       *path_and_reason->add_reason_feature_needed() = reason_feature_needed;
40 |     }
41 |   }
42 |
43 |   return absl::OkStatus();
44 | }
45 |
46 | // Stores the reasons of every entry of `feature_needed_proto` in `result`
47 | // under the entry's path. An entry whose path is already present in `result`
48 | // replaces the stored reasons. Always returns OkStatus.
49 | absl::Status FromFeaturesNeededProto(
50 |     const FeaturesNeededProto& feature_needed_proto, FeaturesNeeded* result) {
51 |   for (const auto& entry :
52 |        feature_needed_proto.path_and_reason_feature_need()) {
53 |     (*result)[Path(entry.path())] = std::vector<ReasonFeatureNeeded>(
54 |         entry.reason_feature_needed().begin(),
55 |         entry.reason_feature_needed().end());
56 |   }
57 |
58 |   return absl::OkStatus();
59 | }
60 |
61 | }  // namespace data_validation
62 | }  // namespace tensorflow
63 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/features_needed.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_FEATURES_NEEDED_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_FEATURES_NEEDED_H_
18 |
19 | #include <map>
20 | #include <vector>
21 |
22 | #include "absl/status/status.h"
23 | #include "tensorflow_data_validation/anomalies/path.h"
24 | #include "tensorflow_data_validation/anomalies/proto/validation_metadata.pb.h"
25 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
26 |
27 | namespace tensorflow {
28 | namespace data_validation {
29 |
30 | // Maps a feature path to the reasons that feature is needed.
31 | using FeaturesNeeded = std::map<Path, std::vector<ReasonFeatureNeeded>>;
32 |
33 | // Converts the in-memory map to its proto form, appending to `result`.
34 | absl::Status ToFeaturesNeededProto(const FeaturesNeeded& feature_needed,
35 |                                    FeaturesNeededProto* result);
36 |
37 | // Converts the proto form back to the in-memory map, writing into `result`.
38 | absl::Status FromFeaturesNeededProto(
39 |     const FeaturesNeededProto& feature_needed_proto, FeaturesNeeded* result);
40 |
41 | }  // namespace data_validation
42 | }  // namespace tensorflow
43 |
44 | #endif  // TENSORFLOW_DATA_VALIDATION_ANOMALIES_FEATURES_NEEDED_H_
45 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/features_needed_test.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2019 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "tensorflow_data_validation/anomalies/features_needed.h"
17 |
18 | #include <gmock/gmock.h>
19 | #include <gtest/gtest.h>
20 | #include "tensorflow_data_validation/anomalies/path.h"
21 | #include "tensorflow_data_validation/anomalies/proto/validation_metadata.pb.h"
22 | #include "tensorflow_data_validation/anomalies/test_util.h"
23 |
24 | namespace tensorflow {
25 | namespace data_validation {
26 | namespace {
27 |
28 | using ::testing::ElementsAre;
29 | using testing::EqualsProto;
30 | using ::testing::Pair;
31 | using testing::ParseTextProtoOrDie;
32 | using ::testing::Test;
33 | using ::testing::UnorderedElementsAre;
34 |
35 | // Round trip starting from the C++ map.
36 | TEST(FeaturesNeededTest, CppToProtoToCpp) {
37 |   Path path({"a", "b", "c"});
38 |
39 |   auto reason1 = ParseTextProtoOrDie<ReasonFeatureNeeded>("comment: 'test1'");
40 |   auto reason2 = ParseTextProtoOrDie<ReasonFeatureNeeded>("comment: 'test2'");
41 |   FeaturesNeeded features_need;
42 |   features_need[path] = {reason1, reason2};
43 |
44 |   FeaturesNeededProto proto_format;
45 |   EXPECT_TRUE(ToFeaturesNeededProto(features_need, &proto_format).ok());
46 |
47 |   // Verify that proto_format is correctly generated.
48 |   EXPECT_THAT(proto_format, testing::EqualsProto(R"(
49 |                 path_and_reason_feature_need {
50 |                   path { step: "a" step: "b" step: "c" }
51 |                   reason_feature_needed { comment: "test1" }
52 |                   reason_feature_needed { comment: "test2" }
53 |                 }
54 |               )"));
55 |
56 |   // Verify that C++ -> Proto -> C++ is a noop.
57 |   FeaturesNeeded generated_features_need;
58 |   EXPECT_TRUE(
59 |       FromFeaturesNeededProto(proto_format, &generated_features_need).ok());
60 |   EXPECT_THAT(
61 |       generated_features_need,
62 |       UnorderedElementsAre(
63 |           Pair(path, ElementsAre(EqualsProto(reason1), EqualsProto(reason2)))));
64 | }
65 |
66 | // Round trip starting from the proto.
67 | TEST(FeaturesNeededTest, ProtoToCppToProto) {
68 |   auto original_proto = ParseTextProtoOrDie<FeaturesNeededProto>(R"(
69 |     path_and_reason_feature_need {
70 |       path { step: "a" step: "b" step: "c" }
71 |       reason_feature_needed { comment: "test1" }
72 |       reason_feature_needed { comment: "test2" }
73 |     }
74 |   )");
75 |   FeaturesNeeded features_need;
76 |   EXPECT_TRUE(FromFeaturesNeededProto(original_proto, &features_need).ok());
77 |
78 |   // Verify that the C++ objects are as expected.
79 |   Path expected_path({"a", "b", "c"});
80 |   auto expected_reason1 =
81 |       ParseTextProtoOrDie<ReasonFeatureNeeded>("comment: 'test1'");
82 |   auto expected_reason2 =
83 |       ParseTextProtoOrDie<ReasonFeatureNeeded>("comment: 'test2'");
84 |   EXPECT_THAT(features_need,
85 |               UnorderedElementsAre(Pair(
86 |                   expected_path, ElementsAre(EqualsProto(expected_reason1),
87 |                                              EqualsProto(expected_reason2)))));
88 |
89 |   // Verify that Proto -> C++ -> Proto is a noop.
90 |   FeaturesNeededProto generated_proto_format;
91 |   EXPECT_TRUE(
92 |       ToFeaturesNeededProto(features_need, &generated_proto_format).ok());
93 |
94 |   EXPECT_THAT(original_proto, EqualsProto(generated_proto_format));
95 | }
96 |
97 | }  // namespace
98 | }  // namespace data_validation
99 | }  // namespace tensorflow
100 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/float_domain_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_
18 |
19 | #include "tensorflow_data_validation/anomalies/internal_types.h"
20 | #include "tensorflow_data_validation/anomalies/statistics_view.h"
21 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
22 |
23 | namespace tensorflow {
24 | namespace data_validation {
25 |
26 | // Updates the float_domain based upon the range of values in |stats|, be they
27 | // STRING or FLOAT.
28 | // Will recommend the field be cleared if the type is STRING or BYTES but
29 | // the strings do not represent floats. Undefined behavior if the data is INT.
30 | UpdateSummary UpdateFloatDomain(
31 | const FeatureStatsView& stats,
32 | tensorflow::metadata::v0::FloatDomain* float_domain);
33 |
34 | // Returns true if feature_stats is a STRING field that has only floats and
35 | // no non-UTF8 strings.
36 | bool IsFloatDomainCandidate(const FeatureStatsView& feature_stats);
37 |
38 | } // namespace data_validation
39 | } // namespace tensorflow
40 |
41 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_
42 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/image_domain_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2020 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_IMAGE_DOMAIN_UTIL_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_IMAGE_DOMAIN_UTIL_H_
18 |
19 | #include <vector>
20 |
21 | #include "tensorflow_data_validation/anomalies/internal_types.h"
22 | #include "tensorflow_data_validation/anomalies/statistics_view.h"
23 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
24 |
25 | namespace tensorflow {
26 | namespace data_validation {
27 |
28 | // This updates image_domain. Should only be called if image_domain is set.
29 | std::vector<Description> UpdateImageDomain(
30 | const FeatureStatsView& feature_stats,
31 | tensorflow::metadata::v0::Feature* feature);
32 |
33 | } // namespace data_validation
34 | } // namespace tensorflow
35 |
36 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_IMAGE_DOMAIN_UTIL_H_
37 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/int_domain_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_
18 |
19 | #include "tensorflow_data_validation/anomalies/internal_types.h"
20 | #include "tensorflow_data_validation/anomalies/statistics_view.h"
21 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
22 |
23 | namespace tensorflow {
24 | namespace data_validation {
25 |
26 | // Updates the int_domain based upon the range of values in |feature_stats|,
27 | // be they STRING or INT. Will recommend the field be cleared if the type is
28 | // STRING or BYTES but the strings do not represent ints (was "floats" --
29 | // TODO confirm). Undefined behavior if the data is FLOAT.
30 | UpdateSummary UpdateIntDomain(const FeatureStatsView& feature_stats,
31 | tensorflow::metadata::v0::IntDomain* int_domain);
32 |
33 | // Returns true if feature_stats is a STRING field that has only ints and no
34 | // non-UTF8 strings. NOTE(review): comment formerly said "floats" -- confirm.
35 | bool IsIntDomainCandidate(const FeatureStatsView& feature_stats);
36 |
37 | } // namespace data_validation
38 | } // namespace tensorflow
39 |
40 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_
41 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/internal_types.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_INTERNAL_TYPES_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_INTERNAL_TYPES_H_
18 |
19 | #include <string>
20 | #include <vector>
21 |
22 | #include "tensorflow_metadata/proto/v0/anomalies.pb.h"
23 |
24 | namespace tensorflow {
25 | namespace data_validation {
26 |
27 | using std::string;
28 |
29 | // Represents the description of an anomaly, in short and long form.
30 | struct Description {
31 | tensorflow::metadata::v0::AnomalyInfo::Type type;
32 | string short_description, long_description;
33 | // Equal iff the type and both description strings are equal.
34 | friend bool operator==(const Description& a, const Description& b) {
35 | return (a.type == b.type && a.short_description == b.short_description &&
36 | a.long_description == b.long_description);
37 | }
38 | // Streams the value as "{type, short_description, long_description}".
39 | friend std::ostream& operator<<(std::ostream& strm, const Description& a) {
40 | return (strm << "{" << a.type << ", " << a.short_description << ", " <<
41 | a.long_description << "}");
42 | }
43 | };
44 |
45 | // UpdateSummary for a field.
46 | struct UpdateSummary {
47 | // clear_field: clear the field in question. If this is a ``shared'' enum,
48 | // then the field is dropped. The constructor initializes it to false.
49 | UpdateSummary() : clear_field(false) {}
50 | bool clear_field;
51 | std::vector<Description> descriptions;  // Starts out empty.
52 | };
53 |
54 | // Kinds of comparator used in feature-level comparisons.
55 | enum class FeatureComparatorType {
56 | SKEW, // Compares serving and training data.
57 | DRIFT // Compares previous and current spans.
58 | };
59 | // Kinds of comparator used in dataset-level comparisons.
60 | enum class DatasetComparatorType {
61 | DRIFT, // Compares previous and current spans.
62 | VERSION // Compares previous and current versions.
63 | };
64 |
65 | // The context for a tensorflow::metadata::v0::FeatureComparator.
66 | // In tensorflow::metadata::v0::Feature, there are two comparisons:
67 | // skew_comparator (which compares serving and training) and
68 | // drift_comparator (which compares previous and current). This struct
69 | // allows us to annotate the objects based upon this information.
70 | struct ComparatorContext {
71 | string control_name;    // Presumably names the baseline side -- confirm.
72 | string treatment_name;  // Presumably names the compared side -- confirm.
73 | };
74 |
75 | } // namespace data_validation
76 | } // namespace tensorflow
77 |
78 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_INTERNAL_TYPES_H_
79 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/map_util.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "tensorflow_data_validation/anomalies/map_util.h"
17 |
18 | #include