├── g3doc
├── images
│ ├── stats.png
│ ├── anomaly.png
│ ├── schema.png
│ ├── skew_anomaly.png
│ └── serving_anomaly.png
├── _toc.yaml
└── index.md
├── tensorflow_data_validation
├── BUILD
├── anomalies
│ ├── proto
│ │ ├── BUILD
│ │ ├── validation_config.proto
│ │ └── feature_statistics_to_proto.proto
│ ├── __init__.py
│ ├── test_schema_protos.h
│ ├── float_domain_util.h
│ ├── int_domain_util.h
│ ├── test_util_test.cc
│ ├── metrics.h
│ ├── bool_domain_util.h
│ ├── metrics.cc
│ ├── string_domain_util.h
│ ├── internal_types.h
│ ├── validation_api.i
│ ├── test_util.cc
│ ├── map_util.h
│ ├── statistics_view_test_util.h
│ ├── metrics_test.cc
│ ├── map_util.cc
│ ├── float_domain_test.cc
│ ├── path.h
│ ├── feature_util.h
│ ├── test_util.h
│ ├── statistics_view_test_util.cc
│ ├── path_test.cc
│ ├── schema_anomalies.h
│ ├── float_domain_util.cc
│ ├── int_domain_util.cc
│ ├── map_util_test.cc
│ └── path.cc
├── api
│ └── __init__.py
├── coders
│ ├── __init__.py
│ ├── tf_example_decoder.py
│ └── tf_example_decoder_test.py
├── utils
│ ├── __init__.py
│ ├── stats_util.py
│ ├── batch_util.py
│ ├── batch_util_test.py
│ ├── profile_util_test.py
│ ├── stats_util_test.py
│ ├── schema_util.py
│ ├── profile_util.py
│ ├── test_util.py
│ ├── quantiles_util_test.py
│ └── schema_util_test.py
├── statistics
│ ├── __init__.py
│ ├── generators
│ │ ├── __init__.py
│ │ ├── stats_generator.py
│ │ ├── uniques_stats_generator.py
│ │ └── string_stats_generator.py
│ ├── stats_impl_test.py
│ └── stats_impl.py
├── workspace.bzl
├── types.py
├── version.py
├── types_compat.py
├── build_pip_package.sh
├── repo.bzl
├── data_validation.bzl
└── __init__.py
├── RELEASE.md
├── .gitignore
├── CONTRIBUTING.md
├── WORKSPACE
├── setup.py
└── README.md
/g3doc/images/stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dynamicwebpaige/data-validation/master/g3doc/images/stats.png
--------------------------------------------------------------------------------
/g3doc/images/anomaly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dynamicwebpaige/data-validation/master/g3doc/images/anomaly.png
--------------------------------------------------------------------------------
/g3doc/images/schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dynamicwebpaige/data-validation/master/g3doc/images/schema.png
--------------------------------------------------------------------------------
/g3doc/images/skew_anomaly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dynamicwebpaige/data-validation/master/g3doc/images/skew_anomaly.png
--------------------------------------------------------------------------------
/g3doc/images/serving_anomaly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dynamicwebpaige/data-validation/master/g3doc/images/serving_anomaly.png
--------------------------------------------------------------------------------
/tensorflow_data_validation/BUILD:
--------------------------------------------------------------------------------
1 | licenses(["notice"]) # Apache 2.0
2 |
3 | sh_binary(
4 | name = "build_pip_package",
5 | srcs = ["build_pip_package.sh"],
6 | data = [
7 | "//tensorflow_data_validation/anomalies:_pywrap_tensorflow_data_validation.so",
8 | "//tensorflow_data_validation/anomalies:pywrap_tensorflow_data_validation.py",
9 | ],
10 | )
11 |
--------------------------------------------------------------------------------
/g3doc/_toc.yaml:
--------------------------------------------------------------------------------
1 | toc:
2 | - title: Get Started
3 | path: /tfx/data_validation/get_started
4 |
5 | - heading: Examples
6 | - title: Chicago Taxi
7 | path: https://github.com/tensorflow/data-validation/blob/master/examples/chicago_taxi/chicago_taxi_tfdv.ipynb
8 | status: external
9 | - title: Chicago Taxi (end-to-end)
10 | path: https://github.com/tensorflow/model-analysis/tree/master/examples/chicago_taxi
11 | status: external
12 |
--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
1 | # Current version (not yet released; still in development)
2 |
3 | ## Major Features and Improvements
4 |
5 | * Add support for computing weighted common statistics.
6 |
7 | ## Bug Fixes and Other Changes
8 |
9 | * Fix bug in clearing oneof domain\_info field in Feature proto.
10 | * Fix overflow error for large integers by casting them to STRING type.
11 |
12 | ## Breaking changes
13 |
14 | ## Deprecations
15 |
16 | # Release 0.9.0
17 |
18 | * Initial release of TensorFlow Data Validation.
19 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/proto/BUILD:
--------------------------------------------------------------------------------
1 | package(default_visibility = ["//tensorflow_data_validation:__subpackages__"])
2 |
3 | licenses(["notice"]) # Apache 2.0
4 |
5 | exports_files(["LICENSE"])
6 |
7 | load("//tensorflow_data_validation:data_validation.bzl", "tfdv_proto_library")
8 |
9 | tfdv_proto_library(
10 | name = "feature_statistics_to_proto_proto",
11 | srcs = ["feature_statistics_to_proto.proto"],
12 | cc_api_version = 2,
13 | )
14 |
15 | tfdv_proto_library(
16 | name = "validation_config_proto",
17 | srcs = ["validation_config.proto"],
18 | cc_api_version = 2,
19 | )
20 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/api/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/coders/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/statistics/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/statistics/generators/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .ipynb_checkpoints
3 | node_modules
4 | /.bazelrc
5 | /.tf_configure.bazelrc
6 | /bazel-*
7 | /bazel_pip
8 | /tools/python_bin_path.sh
9 | /pip_test
10 | /_python_build
11 | *.pyc
12 | __pycache__
13 | *.swp
14 | .vscode/
15 | cmake_build/
16 | .idea/**
17 | /build/
18 | [Bb]uild/
19 | Pods
20 | Podfile.lock
21 | *.pbxproj
22 | *.xcworkspacedata
23 | xcuserdata/**
24 | dist/
25 | tensorflow_data_validation.egg-info/
26 | tensorflow_data_validation/anomalies/_pywrap_tensorflow_data_validation.so
27 | tensorflow_data_validation/anomalies/pywrap_tensorflow_data_validation.py
28 |
29 | # Android
30 | .gradle
31 | .idea
32 | .project
33 | *.iml
34 | local.properties
35 | gradleBuild
36 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/workspace.bzl:
--------------------------------------------------------------------------------
1 | """TensorFlow Data Validation external dependencies that can be loaded in WORKSPACE files.
2 | """
3 |
4 | load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace")
5 |
6 | def tf_data_validation_workspace():
7 | """Loads TensorFlow's workspace dependencies and the tf.Metadata repository."""
8 | tf_workspace(
9 | path_prefix = "",
10 | tf_repo_name = "org_tensorflow",
11 | )
12 |
13 | # Fetch the tf.Metadata repo from GitHub, pinned to the commit below.
14 | native.git_repository(
15 | name = "com_github_tensorflow_metadata",
16 | # v0.9.0dev
17 | commit = "223923d04c75de71ae782c51872d0e14ce7e657d",
18 | remote = "https://github.com/tensorflow/metadata.git",
19 | )
20 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/types.py:
--------------------------------------------------------------------------------
1 | """Types."""
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 |
6 | from __future__ import print_function
7 |
8 | import apache_beam as beam
9 | import numpy as np
10 |
11 | from tensorflow_data_validation.types_compat import Dict, Text, Union
12 |
13 | FeatureName = Union[bytes, Text]
14 |
15 | # Feature type enum value.
16 | FeatureNameStatisticsType = int
17 |
18 | # Type of the input batch.
19 | ExampleBatch = Dict[FeatureName, np.ndarray]
20 |
21 | # For use in Beam type annotations, because Beam's support for Python types
22 | # in Beam type annotations is not complete.
23 | BeamFeatureName = beam.typehints.Union[bytes, Text]
24 | # pylint: enable=invalid-name
25 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/version.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Contains the version string of TFDV."""
16 |
17 | # Note that setup.py uses this version.
18 | __version__ = '0.9.0'
19 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/types_compat.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Types for backwards compatibility with versions that don't support typing."""
15 |
16 |
17 | from apache_beam.typehints import Any, Dict, Generator, List, Optional, Set, Tuple, Union # pylint: disable=unused-import,g-multiple-import
18 |
19 | # pylint: disable=invalid-name
20 | Callable = None
21 | Text = Any
22 | TypeVar = None
23 |
24 | # pylint: enable=invalid-name
25 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/proto/validation_config.proto:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Google LLC
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 |
16 | syntax = "proto3";
17 |
18 | package tensorflow.data_validation;
19 |
20 | // Configuration for example statistics validation.
21 | message ValidationConfig {
22 | // If true then validation will mark new features (i.e., those that are not
23 | // covered in the schema) as warnings instead of errors. The distinction is
24 | // that warnings do not cause alerts to fire.
25 | bool new_features_are_warnings = 1;
26 | }
27 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/build_pip_package.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 Google LLC
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Convenience binary to build TFDV from source.
17 |
18 | # Fail fast: stop on any command error (-e) or unset variable (-u); trace (-x).
19 | set -e -u -x
20 |
21 | # Put wrapped c++ files in place
22 | cp -f tensorflow_data_validation/anomalies/pywrap_tensorflow_data_validation.py \
23 |   "${BUILD_WORKSPACE_DIRECTORY}/tensorflow_data_validation/anomalies"
24 | cp -f tensorflow_data_validation/anomalies/_pywrap_tensorflow_data_validation.so \
25 |   "${BUILD_WORKSPACE_DIRECTORY}/tensorflow_data_validation/anomalies"
26 |
27 | # Create the wheel
28 | cd "${BUILD_WORKSPACE_DIRECTORY}"
29 |
30 | python setup.py bdist_wheel
31 |
32 | # Cleanup
33 | cd -
34 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/test_schema_protos.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_TEST_SCHEMA_PROTOS_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_TEST_SCHEMA_PROTOS_H_
18 |
19 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
20 |
21 | namespace tensorflow {
22 | namespace data_validation {
23 | namespace testing {
24 |
25 | tensorflow::metadata::v0::Schema GetTestAllTypesMessage();
26 | tensorflow::metadata::v0::Schema GetAnnotatedFieldsMessage();
27 | tensorflow::metadata::v0::Schema GetTestSchemaAlone();
28 |
29 | } // namespace testing
30 | } // namespace data_validation
31 | } // namespace tensorflow
32 |
33 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_TEST_SCHEMA_PROTOS_H_
34 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/repo.bzl:
--------------------------------------------------------------------------------
1 | """ TensorFlow Http Archive
2 |
3 | Modified http_archive that allows us to override the TensorFlow commit that is
4 | downloaded by setting an environment variable. This override is to be used for
5 | testing purposes.
6 |
7 | Add the following to your Bazel build command in order to override the
8 | TensorFlow revision.
9 |
10 | build: --action_env TF_REVISION="<git commit hash>"
11 |
12 | * `TF_REVISION`: tensorflow revision override (git commit hash)
13 | """
14 |
15 | _TF_REVISION = "TF_REVISION"
16 |
17 | def _tensorflow_http_archive(ctx):
18 |   """Downloads and extracts the TensorFlow source archive for one commit."""
19 |   revision = ctx.attr.git_commit
20 |   checksum = ctx.attr.sha256
21 |
22 |   # A non-empty TF_REVISION replaces the configured commit; the configured
23 |   # sha256 is cleared with it (presumably it only describes the pinned archive).
24 |   env_revision = ctx.os.environ.get(_TF_REVISION)
25 |   if env_revision:
26 |     revision, checksum = env_revision, ""
27 |
28 |   ctx.download_and_extract(
29 |       [
30 |           "https://mirror.bazel.build/github.com/tensorflow/tensorflow/archive/%s.tar.gz" % revision,
31 |           "https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % revision,
32 |       ],
33 |       "",
34 |       checksum,
35 |       "",
36 |       "tensorflow-%s" % revision)
37 |
38 | # Repository rule that fetches TensorFlow at `git_commit` (verified against
39 | # `sha256`), unless the TF_REVISION environment variable overrides the commit.
40 | tensorflow_http_archive = repository_rule(
41 |     implementation = _tensorflow_http_archive,
42 |     attrs = {
43 |         "git_commit": attr.string(mandatory = True),
44 |         "sha256": attr.string(mandatory = True),
45 |     },
46 |     # Declare the variable the implementation reads, so that changing
47 |     # TF_REVISION causes the repository to be fetched again.
48 |     environ = [_TF_REVISION],
49 | )
50 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/float_domain_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_
18 |
19 | #include "tensorflow_data_validation/anomalies/internal_types.h"
20 | #include "tensorflow_data_validation/anomalies/statistics_view.h"
21 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
22 |
23 | namespace tensorflow {
24 | namespace data_validation {
25 |
26 | // Updates the float_domain based upon the range of values in stats, be they
27 | // STRING or FLOAT.
28 | // Will recommend the field be cleared if the type is STRING or BYTES but
29 | // the strings do not represent floats. Undefined behavior if the data is INT.
30 | UpdateSummary UpdateFloatDomain(
31 | const FeatureStatsView& stats,
32 | tensorflow::metadata::v0::FloatDomain* float_domain);
33 |
34 | // Returns true if feature_stats is a STRING field that has only floats and no
35 | // non-UTF8 strings.
36 | bool IsFloatDomainCandidate(const FeatureStatsView& feature_stats);
37 |
38 | } // namespace data_validation
39 | } // namespace tensorflow
40 |
41 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_FLOAT_DOMAIN_UTIL_H_
42 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/int_domain_util.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_
18 |
19 | #include "tensorflow_data_validation/anomalies/internal_types.h"
20 | #include "tensorflow_data_validation/anomalies/statistics_view.h"
21 | #include "tensorflow_metadata/proto/v0/schema.pb.h"
22 |
23 | namespace tensorflow {
24 | namespace data_validation {
25 |
26 | // Updates the int_domain based upon the range of values in feature_stats, be
27 | // they STRING or INT.
28 | // Will recommend the field be cleared if the type is STRING or BYTES but
29 | // the strings do not represent integers. Undefined behavior if the data is FLOAT.
30 | UpdateSummary UpdateIntDomain(const FeatureStatsView& feature_stats,
31 | tensorflow::metadata::v0::IntDomain* int_domain);
32 |
33 | // Returns true if feature_stats is a STRING field that has only integers and no
34 | // non-UTF8 strings.
35 | bool IsIntDomainCandidate(const FeatureStatsView& feature_stats);
36 |
37 | } // namespace data_validation
38 | } // namespace tensorflow
39 |
40 | #endif // TENSORFLOW_DATA_VALIDATION_ANOMALIES_INT_DOMAIN_UTIL_H_
41 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/test_util_test.cc:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #include "tensorflow_data_validation/anomalies/test_util.h"
17 |
18 | #include <map>
19 | #include <string>
20 |
21 | #include <gmock/gmock.h>
22 | #include <gtest/gtest.h>
23 | #include "absl/strings/str_split.h"
24 | #include "tensorflow/core/lib/core/status.h"
25 | #include "tensorflow/core/lib/core/status_test_util.h"
26 | #include "tensorflow/core/platform/logging.h"
27 | #include "tensorflow/core/platform/types.h"
28 |
29 | namespace tensorflow {
30 | namespace data_validation {
31 | namespace testing {
32 | namespace {
33 |
34 | // Checks TestAnomalies with an unchanged baseline and no expected anomalies.
35 | TEST(TestAnomalies, Basic) {
36 |   const tensorflow::metadata::v0::Schema original =
37 |       ParseTextProtoOrDie<tensorflow::metadata::v0::Schema>(R"(
38 | feature {
39 | name: "feature_name"
40 | type: INT
41 | skew_comparator: { infinity_norm: { threshold: 0.1 } }
42 | })");
43 |
44 |   tensorflow::metadata::v0::Anomalies result;
45 |   *result.mutable_baseline() = original;
46 |   TestAnomalies(result, original, std::map<string, ExpectedAnomalyInfo>());
47 | }
48 |
49 | } // namespace
50 | } // namespace testing
51 | } // namespace data_validation
52 | } // namespace tensorflow
53 |
--------------------------------------------------------------------------------
/tensorflow_data_validation/anomalies/metrics.h:
--------------------------------------------------------------------------------
1 | /* Copyright 2018 Google LLC
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | https://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 |
16 | #ifndef TENSORFLOW_DATA_VALIDATION_ANOMALIES_METRICS_H_
17 | #define TENSORFLOW_DATA_VALIDATION_ANOMALIES_METRICS_H_
18 |
19 | #include