├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── RELEASE.md ├── WORKSPACE ├── google3 └── third_party │ └── tensorflow_metadata │ └── proto │ └── v0 │ └── README.md ├── setup.py ├── tensorflow_metadata ├── BUILD ├── __init__.py ├── move_generated_files.sh ├── proto │ ├── __init__.py │ └── v0 │ │ ├── BUILD │ │ ├── __init__.py │ │ ├── anomalies.proto │ │ ├── derived_feature.proto │ │ ├── metric.proto │ │ ├── path.proto │ │ ├── problem_statement.proto │ │ ├── schema.proto │ │ └── statistics.proto ├── python │ └── proto_test.py └── version.py └── tools └── build_tfmd_docs.py /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing guidelines 2 | 3 | ## How to become a contributor and submit your own code 4 | 5 | ### Contributor License Agreements 6 | 7 | We'd love to accept your patches! Before we can take them, we have to jump a couple of legal hurdles. 8 | 9 | Please fill out either the individual or corporate Contributor License Agreement (CLA). 10 | 11 | * If you are an individual writing original source code and you're sure you own the intellectual property, then you'll need to sign an [individual CLA](http://code.google.com/legal/individual-cla-v1.0.html). 12 | * If you work for a company that wants to allow you to contribute your work, then you'll need to sign a [corporate CLA](http://code.google.com/legal/corporate-cla-v1.0.html). 13 | 14 | Follow either of the two links above to access the appropriate CLA and instructions for how to sign and return it. Once we receive it, we'll be able to accept your pull requests. 15 | 16 | ***NOTE***: Only original source code from you and other people that have signed the CLA can be accepted into the main repository. 17 | 18 | ### Contributing code 19 | 20 | If you have improvements to TensorFlow Metadata, send us your pull requests! 21 | For those just getting started, GitHub has a [howto](https://help.github.com/articles/using-pull-requests/). 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2017 The tf.Metadata Authors. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 
31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. 
If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2017, The tf.Metadata Authors. 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow Metadata 2 | 3 | [![Python](https://img.shields.io/badge/python-3.9%7C3.10%7C3.11-blue)](https://github.com/tensorflow/metadata) 4 | [![PyPI](https://badge.fury.io/py/tensorflow-metadata.svg)](https://badge.fury.io/py/tensorflow-metadata) 5 | 6 | TensorFlow Metadata provides standard representations for metadata that are 7 | useful when training machine learning models with TensorFlow. 8 | 9 | The metadata serialization formats include: 10 | 11 | * A schema describing tabular data (e.g., tf.Examples). 12 | * A collection of summary statistics over such datasets. 13 | * A problem statement quantifying the objectives of a model. 14 | 15 | The metadata may be produced by hand or automatically during input data 16 | analysis, and may be consumed for data validation, exploration, and 17 | transformation. 18 |
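For a concrete picture of the first of these formats, here is a minimal, hand-built schema using the generated proto classes (a sketch only; the feature name and its constraints below are illustrative, not prescribed):

```python
from tensorflow_metadata.proto.v0 import schema_pb2

# Describe a dataset with one required, integer-valued feature.
schema = schema_pb2.Schema()
feature = schema.feature.add()
feature.name = "age"                 # illustrative feature name
feature.type = schema_pb2.INT
feature.presence.min_fraction = 1.0  # the feature must appear in every example
feature.value_count.min = 1          # with exactly one value each time
feature.value_count.max = 1

# A schema serializes like any other protocol buffer.
serialized = schema.SerializeToString()
```

Statistics and problem statements are built analogously from `statistics_pb2` and `problem_statement_pb2`.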
-------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Current Version (not yet released; still in development) 4 | 5 | ## Major Features and Improvements 6 | 7 | ## Bug Fixes and Other Changes 8 | 9 | * Relax dependency on Protobuf to include version 6.x 10 | * Remove upper bound for Protobuf dependency 11 | 12 | ## Breaking Changes 13 | 14 | ## Deprecations 15 | 16 | # Version 1.17.1 17 | 18 | ## Major Features and Improvements 19 | 20 | * N/A 21 | 22 | ## Bug Fixes and Other Changes 23 | 24 | * Bumped the minimum bazel version required to build `tfmd` to 6.5.0. 25 | 26 | ## Breaking Changes 27 | 28 | * N/A 29 | 30 | ## Deprecations 31 | 32 | * N/A 33 | 34 | # Version 1.17.0 35 | 36 | ## Major Features and Improvements 37 | 38 | * N/A 39 | 40 | ## Bug Fixes and Other Changes 41 | 42 | * Add Audio as a schema domain. 43 | * Add Video as a schema domain. 44 | * Resolve issue where pre-release versions of protobuf are installed. 45 | * Depends on `protobuf>=4.25.2,<5` for Python 3.11 and on 46 | `protobuf>=4.21.6,<4.22` for 3.9 and 3.10. 47 | 48 | ## Breaking Changes 49 | 50 | * N/A 51 | 52 | ## Deprecations 53 | 54 | * N/A 55 | 56 | # Version 1.16.1 57 | 58 | ## Major Features and Improvements 59 | 60 | ## Bug Fixes and Other Changes 61 | 62 | * Relax dependency on Protobuf to include version 5.x 63 | 64 | ## Breaking Changes 65 | 66 | ## Deprecations 67 | 68 | # Version 1.16.0 69 | 70 | ## Major Features and Improvements 71 | 72 | * N/A 73 | 74 | ## Bug Fixes and Other Changes 75 | 76 | * For nested features with N nested levels (N > 1), the statistics counting 77 | the number of values in `CommonStatistics` and `WeightedCommonStatistics` 78 | will rely on the innermost level. 79 | 80 | ## Breaking Changes 81 | 82 | * N/A 83 | 84 | ## Deprecations 85 | 86 | * N/A 87 | 88 | # Version 1.15.0 89 | 90 | ## Major Features and Improvements 91 | 92 | * N/A 93 | 94 | ## Bug Fixes and Other Changes 95 | 96 | * Bump the Ubuntu version on which TFMD is tested to 20.04 (previously 97 | was 16.04). 98 | * Bumped the minimum bazel version required to build `tfmd` to 6.1.0. 99 | * Depends on `protobuf>=4.25.2,<5` for Python 3.11 and on 100 | `protobuf>3.20.3,<4.21` for 3.9 and 3.10. 101 | * Depends on `googleapis-common-protos>=1.56.4,<2` for Python 3.11 and on 102 | `googleapis-common-protos>=1.52.0,<2` for 3.9 and 3.10. 103 | * Relax dependency on `absl-py` to include version 2. 104 | 105 | ## Breaking Changes 106 | 107 | * Removed `NaturalLanguageDomain.location_constraint_regex`. 108 | It was documented as "please do not use" and never implemented. 109 | * Change to the semantics of min/max/avg/tot num-values for nested features 110 | (see above). 111 | 112 | ## Deprecations 113 | 114 | * Deprecated Python 3.8 support. 115 | 116 | # Version 1.14.0 117 | 118 | ## Major Features and Improvements 119 | 120 | * N/A 121 | 122 | ## Bug Fixes and Other Changes 123 | * Add `joint_group` to `SequenceMetadata` to specify which group this sequence 124 | feature belongs to, so that features in the same group can be modeled jointly. 125 | * Add `BOOL_TYPE_INVALID_CONFIG` anomaly type. 126 | * Add `embedding_dim` to `FloatDomain` to specify the embedding dimension, 127 | which is useful for use cases such as restoring shapes for flattened 128 | sequences of embeddings. 129 | * Add `sequence_truncation_limit` to `SequenceMetadata` to specify the maximum 130 | sequence length that should be processed. 131 | * Depends on `protobuf>=3.20.3,<4.21`. Upper bound is required to avoid 132 | breaking changes. 133 | * Add `embedding_type` to `FloatDomain` to specify the semantic type of the 134 | embedding. This is useful for use cases where the embedding dimension is 135 | inferred from the embedding type. 136 | 137 | ## Breaking Changes 138 | 139 | * N/A 140 | 141 | ## Deprecations 142 | 143 | * N/A 144 | 145 | # Version 1.13.1 146 | 147 | ## Major Features and Improvements 148 | 149 | * N/A 150 | 151 | ## Bug Fixes and Other Changes 152 | 153 | * Depends on `protobuf>=3.20.3,<5`. 154 | 155 | ## Breaking Changes 156 | 157 | * N/A 158 | 159 | ## Deprecations 160 | 161 | * N/A 162 | 163 | # Version 1.13.0 164 | 165 | ## Major Features and Improvements 166 | 167 | * Introduce `Schema.represent_variable_length_as_ragged` knob to automatically 168 | generate `RaggedTensor`s for variable length features. 169 | * Introduces a Schema option `HistogramSelection` to allow numeric drift/skew 170 | calculations to use QUANTILES histograms, which are more robust to outliers. 171 | 172 | ## Bug Fixes and Other Changes 173 | 174 | * N/A 175 | 176 | ## Breaking Changes 177 | 178 | * N/A 179 | 180 | ## Deprecations 181 | 182 | * Deprecated Python 3.7 support. 183 | 184 | # Version 1.12.0 185 | 186 | ## Major Features and Improvements 187 | 188 | * N/A 189 | 190 | ## Bug Fixes and Other Changes 191 | 192 | * N/A 193 | 194 | ## Breaking Changes 195 | 196 | * N/A 197 | 198 | ## Deprecations 199 | 200 | * N/A 201 | 202 | # Version 1.11.0 203 | 204 | ## Major Features and Improvements 205 | 206 | * N/A 207 | 208 | ## Bug Fixes and Other Changes 209 | 210 | * Add a categorical indicator to the schema for `StringDomain`. 211 | * Add ProblemStatement Task.is_auxiliary field to allow specifying auxiliary 212 | tasks in multi-task learning problems. 213 | * Add the SequenceMetadata field to the schema to specify if this feature 214 | could be treated as a sequence feature. 215 | * Add a `CUSTOM_VALIDATION` Type in anomalies.proto. 216 | 217 | ## Breaking Changes 218 | 219 | * Histogram Buckets include their upper bound instead of their lower bound. 220 | 221 | ## Deprecations 222 | 223 | * N/A 224 | 225 | # Version 1.10.0 226 | 227 | ## Major Features and Improvements 228 | 229 | * N/A 230 | 231 | ## Bug Fixes and Other Changes 232 | 233 | * ThresholdConfig.threshold field is made into a oneof. 234 | * Clarifies the meaning of num_non_missing in statistics.proto.
235 | 236 | ## Breaking Changes 237 | 238 | * N/A 239 | 240 | ## Deprecations 241 | * ProblemStatement Task.task_weight and MetaOptimizationTarget.weight are 242 | deprecated. 243 | 244 | # Version 1.9.0 245 | 246 | ## Major Features and Improvements 247 | 248 | * N/A 249 | 250 | ## Bug Fixes and Other Changes 251 | 252 | * N/A 253 | 254 | ## Breaking Changes 255 | 256 | * N/A 257 | 258 | ## Deprecations 259 | 260 | * N/A 261 | 262 | # Version 1.8.0 263 | 264 | ## Major Features and Improvements 265 | 266 | * N/A 267 | 268 | ## Bug Fixes and Other Changes 269 | * Adds experimental support within statistics.proto and schema.proto for 270 | marking features that are derived during statistics generation for data 271 | exploration or validation, but not actually present in input data. 272 | * Adds an experimental DERIVED_FEATURE_BAD_LIFECYCLE and 273 | DERIVED_FEATURE_INVALID_SOURCE anomaly type. 274 | 275 | ## Breaking Changes 276 | 277 | * N/A 278 | 279 | ## Deprecations 280 | 281 | * N/A 282 | 283 | # Version 1.7.0 284 | 285 | ## Major Features and Improvements 286 | 287 | * N/A 288 | 289 | ## Bug Fixes and Other Changes 290 | 291 | * N/A 292 | 293 | ## Breaking Changes 294 | 295 | * N/A 296 | 297 | ## Deprecations 298 | 299 | * N/A 300 | 301 | # Version 1.6.0 302 | 303 | ## Major Features and Improvements 304 | 305 | * N/A 306 | 307 | ## Bug Fixes and Other Changes 308 | 309 | * statistics.proto: Includes a field `invalid_utf8_count` in `StringStatistics` 310 | to store the number of non-utf8 encoded strings for a feature. 311 | * Depends on `absl-py>=0.9,<2.0.0`. 312 | 313 | ## Breaking Changes 314 | 315 | * Removes deprecated field `objective_function` from ProblemStatement. 316 | 317 | ## Deprecations 318 | 319 | * Deprecates `multi_objective` field in ProblemStatement. 320 | * Deprecates several unused PerformanceMetrics. 321 | 322 | # Version 1.5.0 323 | 324 | ## Major Features and Improvements 325 | 326 | * N/A 327 | 328 | ## Bug Fixes and Other Changes 329 | 330 | * A `threshold_config` is added to MetaOptimizationTarget to allow for 331 | expressing thresholded optimization goals. 332 | 333 | ## Breaking Changes 334 | 335 | * N/A 336 | 337 | ## Deprecations 338 | 339 | * N/A 340 | 341 | # Version 1.4.0 342 | 343 | ## Major Features and Improvements 344 | 345 | * N/A 346 | 347 | ## Bug Fixes and Other Changes 348 | 349 | * Added a new field to `FloatDomain` in schema to allow expression of 350 | categorical floats. 351 | 352 | ## Breaking Changes 353 | 354 | * N/A 355 | 356 | ## Deprecations 357 | 358 | * Deprecated Python 3.6 support. 359 | 360 | # Version 1.3.x (skipped) 361 | 362 | * To maintain version consistency among TFX Family libraries we skipped 363 | the 1.3.x release for TFMD library. 364 | 365 | # Version 1.2.0 366 | 367 | ## Major Features and Improvements 368 | 369 | * Added `PositiveNegativeSpec` to `ProblemStatement.BinaryClassification` for 370 | specifying positive and negative class values. 371 | 372 | ## Bug Fixes and Other Changes 373 | 374 | * N/A 375 | 376 | ## Breaking Changes 377 | 378 | * N/A 379 | 380 | ## Deprecations 381 | 382 | * N/A 383 | 384 | # Version 1.1.0 385 | 386 | ## Major Features and Improvements 387 | 388 | * N/A 389 | 390 | ## Bug Fixes and Other Changes 391 | 392 | * Depends on `protobuf>=3.13,<4`. 
393 | 394 | ## Breaking Changes 395 | 396 | * N/A 397 | 398 | ## Deprecations 399 | 400 | * N/A 401 | 402 | # Version 1.0.0 403 | 404 | ## Major Features and Improvements 405 | 406 | * Added public python interface for proto/* in proto/__init__.py 407 | 408 | ## Bug Fixes and Other Changes 409 | 410 | * N/A 411 | 412 | ## Breaking Changes 413 | 414 | * N/A 415 | 416 | ## Deprecations 417 | 418 | * N/A 419 | 420 | # Version 0.30.0 421 | 422 | ## Major Features and Improvements 423 | 424 | * N/A 425 | 426 | ## Bug Fixes and Other Changes 427 | 428 | * Added new anomaly types: `MULTIPLE_REASONS` and 429 | `INVALID_DOMAIN_SPECIFICATION`. 430 | * Added new anomaly type: `STATS_NOT_AVAILABLE`. 431 | 432 | ## Breaking Changes 433 | 434 | * N/A 435 | 436 | ## Deprecations 437 | 438 | * N/A 439 | 440 | # Version 0.29.0 441 | 442 | ## Major Features and Improvements 443 | 444 | * Adding the ability to specify and detect sequence length issues. 445 | 446 | ## Bug Fixes and Other Changes 447 | 448 | * Depends on `absl-py>=0.9,<0.13`. 449 | 450 | ## Breaking Changes 451 | 452 | * N/A 453 | 454 | ## Deprecations 455 | 456 | * N/A 457 | 458 | # Version 0.28.0 459 | 460 | ## Major Features and Improvements 461 | 462 | * Added new anomaly type `MAX_IMAGE_BYTE_SIZE_EXCEEDED` for image_domain. 463 | * Added new anomaly type `INVALID_FEATURE_SHAPE`. 464 | * The `RaggedTensor` TensorRepresentation now supports additional partitions. 465 | 466 | ## Bug Fixes and Other Changes 467 | 468 | * N/A 469 | 470 | ## Breaking Changes 471 | 472 | * N/A 473 | 474 | ## Deprecations 475 | 476 | * N/A 477 | 478 | # Version 0.27.0 479 | 480 | ## Major Features and Improvements 481 | 482 | * Added new anomaly types to AnomalyInfo to report data issues with NL 483 | features. 484 | 485 | ## Bug Fixes and Other Changes 486 | 487 | * Added new FloatDomain field and anomaly type to designate and validate 488 | features that represent fixed dimensional embeddings. 489 | 490 | ## Breaking changes 491 | 492 | * N/A 493 | 494 | ## Deprecations 495 | 496 | * N/A 497 | 498 | # Version 0.26.0 499 | 500 | ## Major Features and Improvements 501 | 502 | * Added new fields to NaturalLanguageDomain message in the schema, including 503 | support for specifying vocabularies, constraints on sequence values 504 | (SequenceValueConstraints), constraints on vocabulary coverage 505 | (FeatureCoverageConstraints), and constraints on token location 506 | (location_constraints_regex). 507 | * Added new NaturalLanguageStatistics message to the statistics.proto so that 508 | we can compute statistics corresponding to Natural Language features. 509 | 510 | ## Bug Fixes and Other Changes 511 | 512 | * N/A 513 | 514 | ## Breaking changes 515 | 516 | * N/A 517 | 518 | ## Deprecations 519 | 520 | * N/A 521 | 522 | # Version 0.25.0 523 | 524 | ## Major Features and Improvements 525 | 526 | * Added new Anomaly and Schema field to support drift and distribution skew 527 | detection for numeric features. 528 | * Added a new field in Anomalies proto to report the raw measurements of 529 | distribution skew detection. 530 | * From this release, TFMD will also be hosting nightly packages on 531 | https://pypi-nightly.tensorflow.org. To install the nightly package, use the 532 | following command: 533 | 534 | ``` 535 | pip install --extra-index-url https://pypi-nightly.tensorflow.org/simple tensorflow-metadata 536 | ``` 537 | 538 | Note: These nightly packages are unstable and breakages are likely to 539 | happen.
The fix could often take a week or more, depending on the complexity 540 | involved, before the wheels are available on the PyPI cloud service. You can 541 | always use the stable version of TFMD available on PyPI by running the 542 | command `pip install tensorflow-metadata`. 543 | 544 | ## Bug Fixes and Other Changes 545 | 546 | * Added new Anomaly type to describe when a domain is incompatible with the 547 | data type. 548 | * Added new Anomaly types for invalid schema configurations (missing name, 549 | missing type, etc). 550 | * Added new Anomaly type to describe when type does not match the data. 551 | * Added new LifecycleStage:DISABLED. 552 | 553 | ## Breaking changes 554 | 555 | * N/A 556 | 557 | ## Deprecations 558 | 559 | * N/A 560 | 561 | # Version 0.24.0 562 | 563 | ## Major Features and Improvements 564 | 565 | * From this version we will be releasing Python 3.8 wheels. 566 | 567 | ## Bug Fixes and Other Changes 568 | 569 | * When installing from source, you don't need any steps other than `pip 570 | install` (needs Bazel). 571 | * Labels can be specified as Paths in addition to string names. 572 | * Depends on `absl-py>=0.9,<0.11`. 573 | * Depends on `googleapis-common-protos>=1.52.0,<2`. 574 | 575 | ## Breaking changes 576 | 577 | * N/A 578 | 579 | ## Deprecations 580 | 581 | * Deprecated Python 3.5 support. 582 | 583 | # Version 0.23.0 584 | 585 | ## Major Features and Improvements 586 | 587 | * Added disallow_inf to FloatDomain message in schema.proto. 588 | * Added new Anomaly type to describe data that has unexpected Infs / -Infs. 589 | * Added new Anomaly and Schema field for specifying ratio of supported images. 590 | * Added value_counts field to Feature message in schema.proto, which describes 591 | the number of values for features that have more than one nestedness level. 592 | * Added new anomaly type VALUE_NESTEDNESS_MISMATCH to describe data that has a 593 | nestedness level that does not match the schema. 594 | * Added new Any type value to CustomStatistic. 595 | 596 | ## Bug Fixes and Other Changes 597 | 598 | * Add ProblemStatement and Metric Python proto stubs. 599 | * Use absltest instead of unittest. 600 | 601 | ## Breaking changes 602 | 603 | * N/A 604 | 605 | ## Deprecations 606 | 607 | * Drops Python 2 support. 608 | * Note: We plan to remove Python 3.5 support after this release. 609 | 610 | # Version 0.22.2 611 | 612 | ## Major Features and Improvements 613 | 614 | * Added UniqueConstraints to Feature message in schema.proto. 615 | * Added new Anomaly types to describe data that does not conform to 616 | UniqueConstraints. 617 | * Added PresenceAndValencyStatistics to CommonStatistics. 618 | 619 | ## Bug Fixes and Other Changes 620 | 621 | ## Breaking changes 622 | 623 | ## Deprecations 624 | 625 | # Version 0.22.1 626 | 627 | ## Major Features and Improvements 628 | 629 | * Added RaggedTensor in TensorRepresentation 630 | 631 | ## Bug Fixes and Other Changes 632 | 633 | ## Breaking changes 634 | 635 | ## Deprecations 636 | 637 | # Version 0.22.0 638 | 639 | ## Major Features and Improvements 640 | 641 | ## Bug Fixes and Other Changes 642 | 643 | * Added a new type of Anomaly: DATASET_HIGH_NUM_EXAMPLES 644 | * Added a new field to dataset_constraints: max_examples_count 645 | * Added a multi-label TaskType.
646 | * Removed ProblemStatementNamespace proto 647 | * Removed ProblemStatementReference proto 648 | * Removed field ProblemStatement.implements 649 | 650 | ## Breaking Changes 651 | 652 | ## Deprecations 653 | 654 | # Version 0.21.2 655 | 656 | ## Major Features and Improvements 657 | 658 | ## Bug Fixes and Other Changes 659 | 660 | * Fixed a compatibility issue with newer bazel versions. 661 | * Started pulling TF 1.15.2 source for building. 662 | 663 | ## Breaking Changes 664 | 665 | ## Deprecations 666 | 667 | # Version 0.21.1 668 | 669 | ## Major Features and Improvements 670 | 671 | ## Bug Fixes and Other Changes 672 | 673 | * Added support for specifying behavior of rare / OOV multiclass labels. 674 | * Added anomaly types related to weighted features. 675 | * Added support for storing lift stats on weighted examples. 676 | 677 | ## Breaking changes 678 | 679 | * The removal of `lift_series` from `CategoricalCrossStats` and the change of 680 | type of `LiftSeries.LiftValue.lift` from float to double will cause parsing 681 | failures for serialized protos written by version 0.21.0, which 682 | contained the deleted or changed fields. 683 | 684 | ## Deprecations 685 | 686 | # Version 0.21.0 687 | 688 | ## Major Features and Improvements 689 | 690 | ## Bug Fixes and Other Changes 691 | 692 | * Added protos for categorical cross statistics using lift. 693 | * Added a new type of Anomaly: FLOAT_TYPE_HAS_NAN 694 | * Added a new field to float_domain: disallow_nans 695 | 696 | ## Breaking Changes 697 | 698 | ## Deprecations 699 | 700 | # Version 0.15.2 701 | 702 | ## Major Features and Improvements 703 | 704 | ## Bug Fixes and Other Changes 705 | 706 | * Added SparseTensor to TensorRepresentation. 707 | * Added a new type of Anomaly. 708 | 709 | ## Breaking Changes 710 | 711 | ## Deprecations 712 | 713 | # Version 0.15.1 714 | 715 | ## Bug Fixes and Other Changes 716 | 717 | * Add WeightedFeature to schema. 718 | * Add min_examples_count to DatasetConstraints and DATASET_LOW_NUM_EXAMPLES 719 | anomaly type. 720 | * Add TimeOfDay domain and UNIX_DAY granularity for TimeDomain in schema. 721 | * Added TensorRepresentation to schema. 722 | 723 | # Version 0.15.0 724 | 725 | No significant changes. Upgrading to keep version alignment. 726 | 727 | ## Major Features and Improvements 728 | 729 | ## Bug Fixes and Other Changes 730 | 731 | * Adding CustomMetric to PerformanceMetric. 732 | 733 | ## Breaking changes 734 | 735 | ## Deprecations 736 | 737 | # Version 0.14.0 738 | 739 | ## Major Features and Improvements 740 | 741 | ## Bug Fixes and Other Changes 742 | 743 | * Added an Any field to Schema Feature, for storing arbitrary structured data. 744 | 745 | ## Breaking changes 746 | 747 | * Refactoring ProblemStatement and related protos. At present, these are not 748 | stable. 749 | 750 | ## Deprecations 751 | 752 | # Version 0.13.0 753 | 754 | ## Major Features and Improvements 755 | 756 | * Added ProblemStatement. 757 | 758 | ## Bug Fixes and Other Changes 759 | 760 | ## Breaking changes 761 | 762 | ## Deprecations 763 | 764 | # Version 0.12.0 765 | 766 | ## Major Features and Improvements 767 | 768 | * Add support for declaring sparse features. 769 | * Add support for schema diff regions. 770 | 771 | ## Bug Fixes and Other Changes 772 | 773 | ## Breaking changes 774 | 775 | ## Deprecations 776 | 777 | # Version 0.9.0 778 | 779 | ## Major Features and Improvements 780 | 781 | * Adding functionality for handling structured data.
782 | 783 | ## Bug Fixes and Other Changes 784 | 785 | * StructStatistics.common_statistics changed to StructStatistics.common_stats 786 | to agree with Facets. 787 | 788 | ## Breaking changes 789 | 790 | * The change from StructStatistics.common_statistics to 791 | StructStatistics.common_stats may break code that had this field set and was 792 | serializing to some text format. The wire format should be fine. 793 | 794 | # Version 0.6.0 795 | 796 | ## Major Features and Improvements 797 | 798 | * Use the same version of protobuf as tensorflow. 799 | * Added support for structural statistics. 800 | * Added new error types. 801 | * Removed DiffRegion. 802 | * added RankHistogram to CustomStatistics. 803 | 804 | ## Bug Fixes and Other Changes 805 | 806 | ## Breaking changes 807 | 808 | * Removed DiffRegion. 809 | 810 | # Version 0.5.0 811 | 812 | ## Major Features and Improvements 813 | 814 | * Established tf.Metadata as a standalone package. 815 | 816 | ## Bug Fixes and Other Changes 817 | 818 | ## Breaking changes 819 | 820 | * Moved tf.Metadata code out of TF-Transform code tree, requiring package 821 | dependency updates and import updates. 822 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- 1 | workspace(name = "tensorflow_metadata") 2 | 3 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") 4 | 5 | http_archive( 6 | name = "bazel_skylib", 7 | sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44", 8 | urls = [ 9 | "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz", 10 | "https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz", 11 | ], 12 | ) 13 | 14 | _PROTOBUF_VERSION = "4.25.6" 15 | 16 | http_archive( 17 | name = "com_google_protobuf", 18 | sha256 = "ff6e9c3db65f985461d200c96c771328b6186ee0b10bc7cb2bbc87cf02ebd864", 19 | strip_prefix = "protobuf-%s" % _PROTOBUF_VERSION, 20 | urls = [ 21 | "https://github.com/protocolbuffers/protobuf/archive/v%s.zip" % _PROTOBUF_VERSION, 22 | ], 23 | ) 24 | 25 | # Needed by com_google_protobuf. 26 | http_archive( 27 | name = "zlib", 28 | build_file = "@com_google_protobuf//:third_party/zlib.BUILD", 29 | sha256 = "d8688496ea40fb61787500e863cc63c9afcbc524468cedeb478068924eb54932", 30 | strip_prefix = "zlib-1.2.12", 31 | urls = ["https://github.com/madler/zlib/archive/v1.2.12.tar.gz"], 32 | ) 33 | 34 | # Needed by com_google_protobuf. 35 | http_archive( 36 | name = "six_archive", 37 | build_file = "@com_google_protobuf//:six.BUILD", 38 | sha256 = "105f8d68616f8248e24bf0e9372ef04d3cc10104f1980f54d57b2ce73a5ad56a", 39 | urls = ["https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz#md5=34eed507548117b2ab523ab14b2f8b55"], 40 | ) 41 | 42 | # Needed by com_google_protobuf. 
43 | bind( 44 | name = "six", 45 | actual = "@six_archive//:six", 46 | ) 47 | 48 | load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps") 49 | protobuf_deps() 50 | 51 | load("@bazel_skylib//lib:versions.bzl", "versions") 52 | versions.check("6.5.0") 53 | -------------------------------------------------------------------------------- /google3/third_party/tensorflow_metadata/proto/v0/README.md: -------------------------------------------------------------------------------- 1 | Please use the public Python API. 2 | 3 | E.g., use 4 | 5 | ``` 6 | from tensorflow_metadata.proto import schema_pb2 7 | ``` 8 | 9 | instead of 10 | 11 | ``` 12 | from tensorflow_metadata.proto.v0 import schema_pb2 13 | ``` 14 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Package Setup script for tf.Metadata.""" 15 | 16 | import os 17 | import platform 18 | import shutil 19 | import subprocess 20 | 21 | import setuptools 22 | from setuptools import find_packages 23 | from setuptools import setup 24 | # pylint: disable=g-bad-import-order 25 | # It is recommended to import setuptools prior to importing distutils to avoid 26 | # using legacy behavior from distutils. 27 | # https://setuptools.readthedocs.io/en/latest/history.html#v48-0-0 28 | from distutils.command import build 29 | # pylint: enable=g-bad-import-order 30 | 31 | 32 | class _BuildCommand(build.build): 33 | """Build everything that is needed to install. 34 | 35 | This overrides the original distutils "build" command to run the bazel_build 36 | command before any sub_commands. 37 | 38 | The build command is also invoked from the bdist_wheel and install commands; 39 | therefore this implementation covers the following commands: 40 | - pip install . (which invokes bdist_wheel) 41 | - python setup.py install (which invokes install command) 42 | - python setup.py bdist_wheel (which invokes bdist_wheel command) 43 | """ 44 | 45 | def _build_cc_extensions(self): 46 | return True 47 | 48 | # Add "bazel_build" command as the first sub_command of "build". Each 49 | # sub_command of "build" (e.g. "build_py", "build_ext", etc.) is executed 50 | # sequentially when running a "build" command, if the second item in the tuple 51 | # (predicate method) is evaluated to true. 52 | sub_commands = [ 53 | ('bazel_build', _build_cc_extensions), 54 | ] + build.build.sub_commands 55 | 56 | 57 | class _BazelBuildCommand(setuptools.Command): 58 | """Build Bazel artifacts and move generated files to the source tree.""" 59 | 60 | def initialize_options(self): 61 | pass 62 | 63 | def finalize_options(self): 64 | self._bazel_cmd = shutil.which('bazel') 65 | if not self._bazel_cmd: 66 | raise RuntimeError( 67 | 'Could not find "bazel" binary.
Please visit ' 68 | 'https://docs.bazel.build/versions/master/install.html for ' 69 | 'installation instructions.') 70 | if platform.system() == 'Windows': 71 | self._additional_build_options = ['--copt=-DWIN32_LEAN_AND_MEAN'] 72 | else: 73 | self._additional_build_options = [] 74 | 75 | def run(self): 76 | subprocess.check_call( 77 | [self._bazel_cmd, 'run', 78 | '--compilation_mode', 'opt', 79 | *self._additional_build_options, 80 | '//tensorflow_metadata:move_generated_files'], 81 | # Bazel should be invoked in a directory containing bazel WORKSPACE 82 | # file, which is the root directory. 83 | cwd=os.path.dirname(os.path.realpath(__file__)),) 84 | 85 | 86 | with open('tensorflow_metadata/version.py') as fp: 87 | globals_dict = {} 88 | exec(fp.read(), globals_dict) # pylint: disable=exec-used 89 | 90 | # tf.Metadata version. 91 | __version__ = globals_dict['__version__'] 92 | 93 | 94 | # Note: In order for the README to be rendered correctly, make sure to have the 95 | # following minimum required versions of the respective packages when building 96 | # and uploading the zip/wheel package to PyPI: 97 | # setuptools >= 38.6.0, wheel >= 0.31.0, twine >= 1.11.0 98 | # Get the long description from the README file. 99 | with open('README.md') as fp: 100 | _LONG_DESCRIPTION = fp.read() 101 | 102 | setup( 103 | name='tensorflow-metadata', 104 | version=__version__, 105 | author='Google Inc.', 106 | author_email='tensorflow-extended-dev@googlegroups.com', 107 | license='Apache 2.0', 108 | classifiers=[ 109 | 'Development Status :: 5 - Production/Stable', 110 | 'Intended Audience :: Developers', 111 | 'Intended Audience :: Education', 112 | 'Intended Audience :: Science/Research', 113 | 'License :: OSI Approved :: Apache Software License', 114 | 'Operating System :: OS Independent', 115 | 'Programming Language :: Python', 116 | 'Programming Language :: Python :: 3', 117 | 'Programming Language :: Python :: 3.9', 118 | 'Programming Language :: Python :: 3.10', 119 | 'Programming Language :: Python :: 3.11', 120 | 'Programming Language :: Python :: 3 :: Only', 121 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 122 | 'Topic :: Scientific/Engineering :: Mathematics', 123 | 'Topic :: Software Development', 124 | 'Topic :: Software Development :: Libraries', 125 | 'Topic :: Software Development :: Libraries :: Python Modules', 126 | ], 127 | namespace_packages=[], 128 | install_requires=[ 129 | 'absl-py>=0.9,<3.0.0', 130 | 'googleapis-common-protos>=1.56.4,<2;python_version>="3.11"', 131 | 'protobuf>=4.25.2;python_version>="3.11"', 132 | 'protobuf>=4.21.6,<4.22;python_version<"3.11"', 133 | ], 134 | python_requires='>=3.9,<4', 135 | packages=find_packages(), 136 | include_package_data=True, 137 | description='Library and standards for schema and statistics.', 138 | long_description=_LONG_DESCRIPTION, 139 | long_description_content_type='text/markdown', 140 | keywords='tensorflow metadata tfx', 141 | download_url='https://github.com/tensorflow/metadata/tags', 142 | requires=[], 143 | cmdclass={ 144 | 'build': _BuildCommand, 145 | 'bazel_build': _BazelBuildCommand, 146 | }, 147 | ) 148 | -------------------------------------------------------------------------------- /tensorflow_metadata/BUILD: -------------------------------------------------------------------------------- 1 | licenses(["notice"]) # Apache 2.0 2 | 3 | sh_binary( 4 | name = "move_generated_files", 5 | srcs = ["move_generated_files.sh"], 6 | data = [ 7 | "//tensorflow_metadata/proto/v0:metadata_v0_proto_py_pb2", 8 | ],
9 | ) 10 | -------------------------------------------------------------------------------- /tensorflow_metadata/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Init module for tf.Metadata.""" 15 | -------------------------------------------------------------------------------- /tensorflow_metadata/move_generated_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Moves the bazel generated files needed for packaging the wheel to the source 17 | # tree. 18 | 19 | function _is_windows() { 20 | [[ "$(uname -s | tr 'A-Z' 'a-z')" =~ (cygwin|mingw32|mingw64|msys)_nt* ]] 21 | } 22 | 23 | function tfmd::move_generated_files() { 24 | set -eux 25 | if _is_windows; then 26 | # Newer bazel does not create bazel-genfiles any more ( 27 | # https://github.com/bazelbuild/bazel/issues/6761). It's merged with bazel-bin 28 | GENFILES=bazel-genfiles 29 | if [[ ! -d ${BUILD_WORKSPACE_DIRECTORY}/${GENFILES} ]]; then 30 | GENFILES=bazel-bin 31 | fi 32 | for f in ${BUILD_WORKSPACE_DIRECTORY}/${GENFILES}/tensorflow_metadata/proto/v0/*.py 33 | do 34 | cp -f "$f" \ 35 | ${BUILD_WORKSPACE_DIRECTORY}/tensorflow_metadata/proto/v0 36 | done 37 | else 38 | for f in tensorflow_metadata/proto/v0/*.py 39 | do 40 | cp -f "$f" \ 41 | ${BUILD_WORKSPACE_DIRECTORY}/tensorflow_metadata/proto/v0 42 | done 43 | fi 44 | } 45 | 46 | tfmd::move_generated_files 47 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Init module for tf.Metadata protos.""" 15 | 16 | from tensorflow_metadata.proto.v0 import anomalies_pb2 17 | from tensorflow_metadata.proto.v0 import derived_feature_pb2 18 | from tensorflow_metadata.proto.v0 import metric_pb2 19 | from tensorflow_metadata.proto.v0 import path_pb2 20 | from tensorflow_metadata.proto.v0 import problem_statement_pb2 21 | from tensorflow_metadata.proto.v0 import schema_pb2 22 | from tensorflow_metadata.proto.v0 import statistics_pb2 23 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/BUILD: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | 17 | load("@com_google_protobuf//bazel:py_proto_library.bzl", "py_proto_library") 18 | 19 | licenses(["notice"]) # Apache 2.0 20 | 21 | package(default_visibility = ["//visibility:public"]) 22 | 23 | proto_library( 24 | name = "metadata_v0_proto", 25 | srcs = [ 26 | "anomalies.proto", 27 | "derived_feature.proto", 28 | "metric.proto", 29 | "path.proto", 30 | "problem_statement.proto", 31 | "schema.proto", 32 | "statistics.proto", 33 | ], 34 | deps = [ 35 | # For well-known proto types like protobuf.Any. 36 | "@com_google_protobuf//:any_proto", 37 | "@com_google_protobuf//:api_proto", 38 | "@com_google_protobuf//:compiler_plugin_proto", 39 | "@com_google_protobuf//:descriptor_proto", 40 | "@com_google_protobuf//:duration_proto", 41 | "@com_google_protobuf//:empty_proto", 42 | "@com_google_protobuf//:field_mask_proto", 43 | "@com_google_protobuf//:source_context_proto", 44 | "@com_google_protobuf//:struct_proto", 45 | "@com_google_protobuf//:timestamp_proto", 46 | "@com_google_protobuf//:type_proto", 47 | "@com_google_protobuf//:wrappers_proto", 48 | ], 49 | ) 50 | 51 | cc_proto_library( 52 | name = "cc_metadata_v0_proto_cc", 53 | deps = [ 54 | ":metadata_v0_proto", 55 | ], 56 | ) 57 | 58 | cc_library( 59 | name = "metadata_v0_proto_cc_pb2", 60 | deps = [":cc_metadata_v0_proto_cc"], 61 | ) 62 | 63 | py_proto_library( 64 | name = "metadata_v0_proto_py_pb2", 65 | deps = [ 66 | ":metadata_v0_proto", 67 | ], 68 | ) 69 | 70 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Init module for tf.Metadata v0 protos.""" 15 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/anomalies.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | syntax = "proto2"; 17 | 18 | package tensorflow.metadata.v0; 19 | 20 | // GOOGLE-LEGACY option jspb_use_correct_proto2_semantics = false; 21 | option cc_enable_arenas = true; 22 | option java_package = "org.tensorflow.metadata.v0"; 23 | option java_multiple_files = true; 24 | 25 | // TODO(b/123519907): Remove this. 26 | // GOOGLE-LEGACY import "net/proto2/bridge/proto/message_set.proto"; 27 | import "tensorflow_metadata/proto/v0/path.proto"; 28 | import "tensorflow_metadata/proto/v0/schema.proto"; 29 | 30 | // Message to represent information about an individual anomaly. 31 | message AnomalyInfo { 32 | // Deleted fields. 33 | reserved 1, 3; 34 | 35 | // A path indicating where the anomaly occurred. 36 | // Dataset-level anomalies do not have a path. 37 | optional Path path = 8; 38 | 39 | enum Severity { 40 | UNKNOWN = 0; 41 | WARNING = 1; 42 | ERROR = 2; 43 | } 44 | optional Severity severity = 5; 45 | // A description of the entire anomaly. 46 | optional string description = 2; 47 | // A shorter description, suitable for UI presentation. 48 | // If there is a single reason for the anomaly, identical to 49 | // reason[0].short_description. Otherwise, summarizes all the reasons. 50 | optional string short_description = 6; 51 | // The comparison between the existing schema and the fixed schema. 52 | repeated DiffRegion diff_regions = 4; 53 | 54 | // Next ID: 89 55 | // LINT.IfChange 56 | enum Type { 57 | UNKNOWN_TYPE = 0; 58 | // Multiple reasons for anomaly. 
59 | MULTIPLE_REASONS = 82; 60 | // Integer larger than 1 61 | BOOL_TYPE_BIG_INT = 1; 62 | // BYTES type when expected INT type 63 | BOOL_TYPE_BYTES_NOT_INT = 2; 64 | // BYTES type when expected STRING type 65 | BOOL_TYPE_BYTES_NOT_STRING = 3; 66 | // FLOAT type when expected INT type 67 | BOOL_TYPE_FLOAT_NOT_INT = 4; 68 | // FLOAT type when expected STRING type 69 | BOOL_TYPE_FLOAT_NOT_STRING = 5; 70 | // INT type when expected STRING type 71 | BOOL_TYPE_INT_NOT_STRING = 6; 72 | // Integer smaller than 0 73 | BOOL_TYPE_SMALL_INT = 7; 74 | // STRING type when expected INT type 75 | BOOL_TYPE_STRING_NOT_INT = 8; 76 | // Expected a string, but not the string seen 77 | BOOL_TYPE_UNEXPECTED_STRING = 9; 78 | // Boolean had float values other than 0 and 1. 79 | BOOL_TYPE_UNEXPECTED_FLOAT = 52; 80 | // BoolDomain has invalid configuration. 81 | BOOL_TYPE_INVALID_CONFIG = 88; 82 | // BYTES type when expected STRING type 83 | ENUM_TYPE_BYTES_NOT_STRING = 10; 84 | // FLOAT type when expected STRING type 85 | ENUM_TYPE_FLOAT_NOT_STRING = 11; 86 | // INT type when expected STRING type 87 | ENUM_TYPE_INT_NOT_STRING = 12; 88 | // Invalid UTF8 string observed 89 | ENUM_TYPE_INVALID_UTF8 = 13; 90 | // Unexpected string values 91 | ENUM_TYPE_UNEXPECTED_STRING_VALUES = 14; 92 | // The number of values in a given example is too large 93 | FEATURE_TYPE_HIGH_NUMBER_VALUES = 15; 94 | // The fraction of examples containing a feature is too small 95 | FEATURE_TYPE_LOW_FRACTION_PRESENT = 16; 96 | // The number of examples containing a feature is too small 97 | FEATURE_TYPE_LOW_NUMBER_PRESENT = 17; 98 | // The number of values in a given example is too small 99 | FEATURE_TYPE_LOW_NUMBER_VALUES = 18; 100 | // No examples contain the value 101 | FEATURE_TYPE_NOT_PRESENT = 19; 102 | // The feature is present as an empty list 103 | FEATURE_TYPE_NO_VALUES = 20; 104 | // The feature is repeated in an example, but was expected to be a singleton 105 | FEATURE_TYPE_UNEXPECTED_REPEATED = 21; 106 | // The feature had too many unique values (string and categorical features 107 | // only). 108 | FEATURE_TYPE_HIGH_UNIQUE = 59; 109 | // The feature had too few unique values (string and categorical features 110 | // only). 111 | FEATURE_TYPE_LOW_UNIQUE = 60; 112 | // The feature has a constraint on the number of unique values but is not of 113 | // a type that has the number of unique values counted (i.e., is not string 114 | // or categorical). 115 | FEATURE_TYPE_NO_UNIQUE = 61; 116 | // There is a float value that is too high 117 | FLOAT_TYPE_BIG_FLOAT = 22; 118 | // The type is not FLOAT 119 | FLOAT_TYPE_NOT_FLOAT = 23; 120 | // There is a float value that is too low 121 | FLOAT_TYPE_SMALL_FLOAT = 24; 122 | // The feature is supposed to be floats encoded as strings, but there is 123 | // a string that is not a float 124 | FLOAT_TYPE_STRING_NOT_FLOAT = 25; 125 | // The feature is supposed to be floats encoded as strings, but it was 126 | // some other type (INT, BYTES, FLOAT) 127 | FLOAT_TYPE_NON_STRING = 26; 128 | // The type is completely unknown 129 | FLOAT_TYPE_UNKNOWN_TYPE_NUMBER = 27; 130 | // Float feature includes NaN values. 131 | FLOAT_TYPE_HAS_NAN = 53; 132 | // Float feature includes Inf or -Inf values. 133 | FLOAT_TYPE_HAS_INF = 62; 134 | // There is an unexpectedly large integer 135 | INT_TYPE_BIG_INT = 28; 136 | // The type was supposed to be INT, but it was not. 137 | INT_TYPE_INT_EXPECTED = 29; 138 | // The feature is supposed to be ints encoded as strings, but some string 139 | // was not an int. 
140 | INT_TYPE_NOT_INT_STRING = 30; 141 | // The type was supposed to be STRING, but it was not. 142 | INT_TYPE_NOT_STRING = 31; 143 | // There is an unexpectedly small integer 144 | INT_TYPE_SMALL_INT = 32; 145 | // The feature is supposed to be ints encoded as strings, but it was 146 | // some other type (INT, BYTES, FLOAT) 147 | INT_TYPE_STRING_EXPECTED = 33; 148 | // Unknown type in stats proto 149 | INT_TYPE_UNKNOWN_TYPE_NUMBER = 34; 150 | // The fraction of examples containing TensorFlow supported images is lower 151 | // than the threshold set in the Schema. 152 | LOW_SUPPORTED_IMAGE_FRACTION = 64; 153 | // There are no stats for a column at all 154 | SCHEMA_MISSING_COLUMN = 35; 155 | // There is a new column that is not in the schema. 156 | SCHEMA_NEW_COLUMN = 36; 157 | // Training serving skew issue 158 | SCHEMA_TRAINING_SERVING_SKEW = 37; 159 | // Expected STRING type, but it was FLOAT. 160 | STRING_TYPE_NOW_FLOAT = 38; 161 | // Expected STRING type, but it was INT. 162 | STRING_TYPE_NOW_INT = 39; 163 | // Control data is missing (either scoring data or previous day). 164 | COMPARATOR_CONTROL_DATA_MISSING = 40; 165 | // Treatment data is missing (either treatment data or current day). 166 | COMPARATOR_TREATMENT_DATA_MISSING = 41; 167 | // L infinity between treatment and control is high. 168 | COMPARATOR_L_INFTY_HIGH = 42; 169 | // Approximate Jensen-Shannon divergence between treatment and control is 170 | // high. 171 | COMPARATOR_JENSEN_SHANNON_DIVERGENCE_HIGH = 63; 172 | // The normalized absolute difference between treatment and control is high. 173 | COMPARATOR_NORMALIZED_ABSOLUTE_DIFFERENCE_HIGH = 87; 174 | // No examples in the span. 175 | NO_DATA_IN_SPAN = 43; 176 | // The value feature of a sparse feature is missing and at least one 177 | // feature defining the sparse feature is present. 178 | SPARSE_FEATURE_MISSING_VALUE = 44; 179 | // An index feature of a sparse feature is missing and at least one 180 | // feature defining the sparse feature is present. 181 | SPARSE_FEATURE_MISSING_INDEX = 45; 182 | // The length of the features representing a sparse feature does not match. 183 | SPARSE_FEATURE_LENGTH_MISMATCH = 46; 184 | // Name collision between a sparse feature and raw feature. 185 | SPARSE_FEATURE_NAME_COLLISION = 47; 186 | // Invalid custom semantic domain. 187 | SEMANTIC_DOMAIN_UPDATE = 48; 188 | // There are not enough examples in the current data as compared to a 189 | // control dataset. 190 | COMPARATOR_LOW_NUM_EXAMPLES = 49; 191 | // There are too many examples in the current data as compared to a control 192 | // dataset. 193 | COMPARATOR_HIGH_NUM_EXAMPLES = 50; 194 | // There are not enough examples in the dataset. 195 | DATASET_LOW_NUM_EXAMPLES = 51; 196 | // There are too many examples in the dataset. 197 | DATASET_HIGH_NUM_EXAMPLES = 58; 198 | // Name collision between a weighted feature and a raw feature. 199 | WEIGHTED_FEATURE_NAME_COLLISION = 54; 200 | // The value feature of a weighted feature is missing on examples where the 201 | // weight feature is present. 202 | WEIGHTED_FEATURE_MISSING_VALUE = 55; 203 | // The weight feature of a weighted feature is missing on examples where the 204 | // value feature is present. 205 | WEIGHTED_FEATURE_MISSING_WEIGHT = 56; 206 | // The length of the features representing a weighted feature does not 207 | // match. 208 | WEIGHTED_FEATURE_LENGTH_MISMATCH = 57; 209 | // The nesting level of the feature values does not match. 
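    // For example, the schema expects a flat list of values but the data
    // contains lists of lists, or vice versa (illustrative).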
210 |     VALUE_NESTEDNESS_MISMATCH = 65;
211 |     // The domain specified is not compatible with the physical type.
212 |     DOMAIN_INVALID_FOR_TYPE = 66;
213 |     // Feature on schema has no name.
214 |     FEATURE_MISSING_NAME = 67;
215 |     // Feature on schema has no type.
216 |     FEATURE_MISSING_TYPE = 68;
217 |     // Triggered for invalid schema specifications, e.g. min_fraction < 0.
218 |     INVALID_SCHEMA_SPECIFICATION = 69;
219 |     // Triggered for invalid domain specifications in schema.
220 |     INVALID_DOMAIN_SPECIFICATION = 81;
221 |     // The type of the data is inconsistent with the specified type.
222 |     UNEXPECTED_DATA_TYPE = 70;
223 |     // A value did not show up at least the min number of times within a
224 |     // sequence.
224 |     SEQUENCE_VALUE_TOO_FEW_OCCURRENCES = 71;
225 |     // A value showed up more than the max number of times within a sequence.
226 |     SEQUENCE_VALUE_TOO_MANY_OCCURRENCES = 72;
227 |     // A value did not show up in at least the min fraction of sequences.
228 |     SEQUENCE_VALUE_TOO_SMALL_FRACTION = 73;
229 |     // A value showed up in greater than the max fraction of sequences.
230 |     SEQUENCE_VALUE_TOO_LARGE_FRACTION = 74;
231 |     // Too small a fraction of feature values matched vocab entries.
232 |     FEATURE_COVERAGE_TOO_LOW = 75;
233 |     // The average token length was too short.
234 |     FEATURE_COVERAGE_TOO_SHORT_AVG_TOKEN_LENGTH = 76;
235 |     // A sequence violated the location constraint.
236 |     NLP_WRONG_LOCATION = 77;
237 |     // A feature was specified as an embedding but was not a fixed dimension.
238 |     EMBEDDING_SHAPE_INVALID = 78;
239 |     // A feature contains an image that has more bytes than the max byte size.
240 |     MAX_IMAGE_BYTE_SIZE_EXCEEDED = 79;
241 |     // A feature is supposed to be of a fixed shape but its valency stats
242 |     // do not agree.
243 |     INVALID_FEATURE_SHAPE = 80;
244 |     // Constraints are specified within the schema but cannot be verified
245 |     // because the corresponding stats are not available.
246 |     STATS_NOT_AVAILABLE = 83;
247 | 
248 |     // The following are experimental and subject to change.
249 |     // A derived feature had a schema lifecycle other than VALIDATION_DERIVED
250 |     // or DISABLED.
251 |     DERIVED_FEATURE_BAD_LIFECYCLE = 84;
252 |     // A derived feature is represented in the schema with an invalid or missing
253 |     // validation_derived_source.
254 |     DERIVED_FEATURE_INVALID_SOURCE = 85;
255 | 
256 |     // The following type is experimental and subject to change.
257 |     // The statistics did not specify a custom validation condition.
258 |     CUSTOM_VALIDATION = 86;
259 |   }
260 |   // LINT.ThenChange(//tensorflow_data_validation/g3doc/anomalies.md)
261 |   // Reason for the anomaly. There may be more than one reason,
262 |   // e.g. the field might be missing sometimes AND a new value is
263 |   // present.
264 |   message Reason {
265 |     optional Type type = 1 [default = UNKNOWN_TYPE];
266 |     // A short description of an anomaly, suitable for UI presentation.
267 |     optional string short_description = 2;
268 |     // A longer description of an anomaly.
269 |     optional string description = 3;
270 |   }
271 |   repeated Reason reason = 7;
272 | }
273 | 
274 | // Message to contain the result of the drift/skew measurements for a feature.
275 | message DriftSkewInfo {
276 |   message Measurement {
277 |     enum Type {
278 |       UNKNOWN = 0;
279 |       L_INFTY = 1;
280 |       JENSEN_SHANNON_DIVERGENCE = 2;
281 |       NORMALIZED_ABSOLUTE_DIFFERENCE = 3;
282 |     }
283 |     // Type of the measurement.
284 |     optional Type type = 1;
285 |     // Value of the measurement.
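    // For example, for L_INFTY this is the (approximate) L-infinity distance
    // between the two compared distributions.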
286 |     optional double value = 2;
287 |     // Threshold used to determine whether the measurement results in an
288 |     // anomaly.
289 |     optional double threshold = 3;
290 |   }
291 | 
292 |   // Identifies the feature.
293 |   optional Path path = 1;
294 | 
295 |   // The drift/skew may be measured in the same invocation of TFDV, in which
296 |   // case both of the following fields are populated.
297 |   // Also the drift/skew may be quantified by different measurements, thus
298 |   // repeated.
299 |   repeated Measurement drift_measurements = 2;
300 |   repeated Measurement skew_measurements = 3;
301 | }
302 | 
303 | // Message to represent the anomalies, which describe the mismatches (if any)
304 | // between the stats and the schema.
305 | message Anomalies {
306 |   // Deleted fields.
307 |   reserved 4;
308 | 
309 |   // The baseline schema that is used.
310 |   oneof baseline_schema {
311 |     tensorflow.metadata.v0.Schema baseline = 1;
312 |     tensorflow.metadata.v0.Schema baseline_v1 = 6 [deprecated = true];
313 |   }
314 | 
315 |   // Map from a column to the difference that it represents.
316 |   enum AnomalyNameFormat {
317 |     // At present, this indicates that the keys in anomaly_info
318 |     // refer to the raw field name in the Schema.
319 |     UNKNOWN = 0;
320 |     // The serialized path to a struct.
321 |     SERIALIZED_PATH = 1;
322 |   }
323 | 
324 |   // The format of the keys in anomaly_info.
325 |   // If absent, the default is UNKNOWN.
326 |   optional AnomalyNameFormat anomaly_name_format = 7;
327 |   // Information about feature-level anomalies.
328 |   map<string, AnomalyInfo> anomaly_info = 2;
329 |   // Information about dataset-level anomalies.
330 |   optional AnomalyInfo dataset_anomaly_info = 8;
331 |   // True if numExamples == 0.
332 |   optional bool data_missing = 3;
333 | 
334 |   // If drift / skew detection was conducted, this field will hold the
335 |   // comparison results for all the features compared, regardless of whether a
336 |   // related anomaly was reported.
337 |   repeated DriftSkewInfo drift_skew_info = 9;
338 |   // TODO(b/123519907): Remove this.
339 |   // The hook to attach any usage and tool specific metadata. Example:
340 |   //   message SchemaStamp {
341 |   //     // extension ID is any CL number that has not been used in an extension.
342 |   //     extend proto2.bridge.MessageSet {
343 |   //       optional StampedSchemaDiff message_set_extension = 123445554;
344 |   //     }
345 |   //     optional string schema_stamp = 1;
346 |   //   }
347 |   //
348 |   // then, the following proto msg encodes an Anomalies with an embedded
349 |   // SchemaStamp:
350 |   //
351 |   //   Anomalies {
352 |   //     metadata {
353 |   //       [SchemaStamp]: {
354 |   //         schema_stamp: "stamp"
355 |   //       }
356 |   //     }
357 |   //   }
358 |   // GOOGLE-LEGACY optional proto2.bridge.MessageSet metadata = 5;
359 | }
360 | 
361 | // Describes a region in the comparison between two text artifacts. Note that
362 | // a region also contains the contents of the two artifacts that correspond to
363 | // the region.
364 | message DiffRegion {
365 |   // Details for the chunk.
366 |   oneof details {
367 |     // An unchanged region of lines.
368 |     UnchangedRegion unchanged = 1;
369 |     // A region of lines removed from the left.
370 |     OneSideRegion removed = 2;
371 |     // A region of lines added to the right.
372 |     OneSideRegion added = 3;
373 |     // A region of lines that are different in the two artifacts.
374 |     ChangedRegion changed = 4;
375 |     // An unchanged region of lines whose contents are just hidden.
376 |     HiddenRegion hidden = 5;
377 |   }
378 | }
379 | 
380 | // Describes a chunk that is the same in the two artifacts.
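// An illustrative (non-normative) text-format example of a region of two
// identical lines starting at line 3 on both sides (the contents shown are
// hypothetical):
//   left_start: 3
//   right_start: 3
//   contents: "feature {"
//   contents: "  name: \"age\""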
381 | message UnchangedRegion { 382 | // The starting lines of the chunk in the two artifacts. 383 | optional int32 left_start = 1; 384 | optional int32 right_start = 2; 385 | // The contents of the chunk. These are the same in both artifacts. 386 | repeated string contents = 3; 387 | } 388 | 389 | // Describes a chunk that applies to only one of the two artifacts. 390 | message OneSideRegion { 391 | // Starting line. 392 | optional int32 start = 1; 393 | // Contents. 394 | repeated string contents = 2; 395 | } 396 | 397 | // Describes a chunk that represents changes in both artifacts over the same 398 | // number of lines. 399 | message ChangedRegion { 400 | // Changed region in the left artifact, in terms of starting line number and 401 | // contents. 402 | optional int32 left_start = 1; 403 | repeated string left_contents = 2; 404 | // Ditto for the right artifact. 405 | optional int32 right_start = 3; 406 | repeated string right_contents = 4; 407 | } 408 | 409 | // A chunk that represents identical lines, whose contents are hidden. 410 | message HiddenRegion { 411 | // Starting lines in the two artifacts. 412 | optional int32 left_start = 1; 413 | optional int32 right_start = 2; 414 | // Size of the region in terms of lines. 415 | optional int32 size = 3; 416 | } 417 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/derived_feature.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | syntax = "proto3"; 17 | 18 | package tensorflow.metadata.v0; 19 | 20 | import "tensorflow_metadata/proto/v0/path.proto"; 21 | 22 | option cc_enable_arenas = true; 23 | option java_package = "org.tensorflow.metadata.v0"; 24 | option java_multiple_files = true; 25 | 26 | // DerivedFeatureSource tracks information about the source of a derived 27 | // feature. Derived features are computed from ordinary features for the 28 | // purposes of statistics collection and validation, but do not exist in the 29 | // dataset. 30 | // Experimental and subject to change. 31 | // LINT.IfChange 32 | message DerivedFeatureSource { 33 | // The name of the deriver that generated this feature. 34 | string deriver_name = 1; 35 | // An optional description of the transformation. 36 | string description = 2; 37 | // The constituent features that went into generating this derived feature. 38 | repeated Path source_path = 3; 39 | // A DerivedFeatureSource that is declaratively configured represents an 40 | // intent for downstream processing to generate a derived feature 41 | // (in the schema), or tracks that a feature was generated from such a 42 | // configuration (in statistics). 43 | bool declaratively_configured = 4; 44 | // Optional configuration for canned derivers. 
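  // An illustrative (non-normative) text-format example of a
  // DerivedFeatureSource that uses one of the canned derivers configured
  // below (the deriver and feature names here are hypothetical):
  //   deriver_name: "argmax_top_k"
  //   source_path { step: "scores" }
  //   config { argmax_top_k { k: 3 } }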
45 | DerivedFeatureConfig config = 5; 46 | } 47 | // LINT.ThenChange(//tfx_bsl/cc/statistics/merge_util.cc) 48 | 49 | // Stores configuration for a variety of canned feature derivers. 50 | // TODO(b/227478330): Consider validating config in merge_util.cc. 51 | message DerivedFeatureConfig { 52 | oneof type { 53 | AllowlistDeriver allowlist = 1; 54 | ArgmaxTopK argmax_top_k = 2; 55 | ReduceOp reduce_op = 3; 56 | SliceSql slice_sql = 4; 57 | ImageQualityDeriver image_quality = 5; 58 | } 59 | } 60 | 61 | message AllowlistDeriver { 62 | repeated bytes allowed_bytes_value = 1; 63 | bytes placeholder_value = 2; // If unset, placeholders will be dropped. 64 | } 65 | 66 | message ArgmaxTopK { 67 | uint32 k = 1; 68 | } 69 | 70 | message ReduceOp { 71 | string op_name = 1; 72 | } 73 | 74 | enum SliceValueTypes { 75 | VALUE_TYPE_DEFAULT = 0; // Default type is string 76 | VALUE_TYPE_INTEGER = 1; 77 | VALUE_TYPE_FLOAT = 2; 78 | VALUE_TYPE_UNSUPPORTED = 3; 79 | } 80 | 81 | message SliceSql { 82 | // Sql expression used to create a derived feature based on the extracted 83 | // slice keys. It must return result of STRUCT type. 84 | string expression = 1; 85 | 86 | // Value type of the derived feature. The default type is string. 87 | SliceValueTypes feature_value_type = 2; 88 | 89 | // Indicates whether to drop struct name in the generated output. 90 | bool drop_struct_name = 3; 91 | 92 | // Set default feature value when slice query fails. If the slice query fails 93 | // and no default value is provided, the TFDV statistics generation pipeline 94 | // will fail. 95 | oneof default_feature_value_for_failed_sql { 96 | int64 int64_default_feature_value = 4; 97 | float float_default_feature_value = 5; 98 | string string_default_feature_value = 6; 99 | } 100 | } 101 | 102 | message ImageQualityDeriver { 103 | string model_name = 1; 104 | } 105 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/metric.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | syntax = "proto3"; 17 | 18 | package tensorflow.metadata.v0; 19 | 20 | import "google/protobuf/any.proto"; 21 | import "google/protobuf/wrappers.proto"; 22 | import "google/protobuf/descriptor.proto"; 23 | import "tensorflow_metadata/proto/v0/path.proto"; 24 | 25 | option cc_enable_arenas = true; 26 | option java_package = "org.tensorflow.metadata.v0"; 27 | option java_multiple_files = true; 28 | 29 | // Metric type indicates which direction of a real-valued metric is "better". 30 | // For most message types, this is invariant. For custom message types, 31 | // is_maximized == true is like MAXIMIZE, and otherwise MINIMIZE. 32 | enum MetricType { 33 | UNKNOWN = 0; 34 | // Maximize the metric (i.e. a utility). 
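  // For example, AUC or binary accuracy.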
35 | MAXIMIZE = 1; 36 | // Minimize the metric (i.e. a loss). 37 | MINIMIZE = 2; 38 | // Look for a field is_maximized. 39 | CUSTOM = 3; 40 | } 41 | 42 | extend google.protobuf.MessageOptions { 43 | MetricType metric_type = 227673489; 44 | } 45 | 46 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/binary_accuracy 47 | message BinaryAccuracy { 48 | option (metric_type) = MAXIMIZE; 49 | } 50 | 51 | // categorical_accuracy(...) 52 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/categorical_accuracy 53 | message CategoricalAccuracy { 54 | option (metric_type) = MAXIMIZE; 55 | } 56 | 57 | // categorical_crossentropy(...) 58 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/categorical_crossentropy 59 | message CategoricalCrossEntropy { 60 | option (metric_type) = MINIMIZE; 61 | } 62 | 63 | // cosine(...) 64 | // cosine_proximity(...) 65 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/cosine_proximity 66 | // DEPRECATED 67 | message Cosine { 68 | option (metric_type) = MINIMIZE; 69 | } 70 | 71 | // Linear Hinge Loss 72 | // hinge(...) 73 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/hinge 74 | // DEPRECATED 75 | message Hinge { 76 | option (metric_type) = MINIMIZE; 77 | } 78 | 79 | // kld(...) 80 | // kullback_leibler_divergence(...) 81 | // KLD(...) 82 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/kullback_leibler_divergence 83 | // DEPRECATED 84 | message KullbackLeiblerDivergence { 85 | option (metric_type) = MINIMIZE; 86 | } 87 | 88 | // MAE(...) 89 | // mae(...) 90 | // mean_absolute_error(...) 91 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/mean_absolute_error 92 | message MeanAbsoluteError { 93 | option (metric_type) = MINIMIZE; 94 | } 95 | 96 | // MAPE(...) 97 | // mape(...) 98 | // mean_absolute_percentage_error(...) 99 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/mean_absolute_percentage_error 100 | message MeanAbsolutePercentageError { 101 | option (metric_type) = MINIMIZE; 102 | } 103 | 104 | // MSE(...) 105 | // mse(...) 106 | // mean_squared_error(...) 107 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/mean_squared_error 108 | message MeanSquaredError { 109 | option (metric_type) = MINIMIZE; 110 | } 111 | 112 | // msle(...) 113 | // MSLE(...) 114 | // mean_squared_logarithmic_error(...) 115 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/mean_squared_logarithmic_error 116 | message MeanSquaredLogarithmicError { 117 | option (metric_type) = MINIMIZE; 118 | } 119 | 120 | // poisson(...) 121 | // DEPRECATED 122 | message Poisson { 123 | option (metric_type) = MINIMIZE; 124 | } 125 | 126 | // squared_hinge(...) 127 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/squared_hinge 128 | // DEPRECATED 129 | message SquaredHinge { 130 | option (metric_type) = MINIMIZE; 131 | } 132 | 133 | // top_k_categorical_accuracy(...) 134 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/top_k_categorical_accuracy 135 | message TopKCategoricalAccuracy { 136 | option (metric_type) = MAXIMIZE; 137 | } 138 | 139 | // sparse_top_k_categorical_accuracy(...) 140 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/sparse_top_k_categorical_accuracy 141 | // DEPRECATED 142 | message SparseTopKCategoricalAccuracy { 143 | option (metric_type) = MAXIMIZE; 144 | } 145 | 146 | // Binary cross entropy as a metric is equal to the negative log likelihood 147 | // (see logistic regression). 
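// For a label y in {0,1} and a predicted probability p in (0,1), the
// per-example value is -y*ln(p) - (1-y)*ln(1-p).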
148 | // In addition, when used to solve a binary classification task, binary cross 149 | // entropy implies that the binary label will maximize binary accuracy. 150 | // binary_crossentropy(...) 151 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/binary_crossentropy 152 | message BinaryCrossEntropy { 153 | option (metric_type) = MINIMIZE; 154 | } 155 | 156 | // AKA the negative log likelihood or log loss. 157 | // Given a label y\in {0,1} and a predicted probability p in [0,1]: 158 | // -yln(p)-(1-y)ln(1-p) 159 | // TODO(martinz): if this is interpreted the same as binary_cross_entropy, 160 | // we may need to revisit the semantics. 161 | // DEPRECATED 162 | message LogisticRegression { 163 | option (metric_type) = MINIMIZE; 164 | } 165 | 166 | // Area under curve for the ROC-curve. 167 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/AUC 168 | message AUC { 169 | option (metric_type) = MAXIMIZE; 170 | } 171 | 172 | // Area under curve for the precision-recall-curve. 173 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/AUC 174 | message AUCPrecisionRecall { 175 | option (metric_type) = MAXIMIZE; 176 | } 177 | 178 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/SensitivityAtSpecificity 179 | message SensitivityAtSpecificity { 180 | option (metric_type) = MAXIMIZE; 181 | 182 | // Minimal required specificity, (0.0, 1.0). 183 | google.protobuf.DoubleValue specificity = 1; 184 | } 185 | 186 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/SpecificityAtSensitivity 187 | message SpecificityAtSensitivity { 188 | option (metric_type) = MAXIMIZE; 189 | 190 | // Minimal required sensitivity, (0.0, 1.0). 191 | google.protobuf.DoubleValue sensitivity = 1; 192 | } 193 | 194 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/PrecisionAtRecall 195 | message PrecisionAtRecall { 196 | option (metric_type) = MAXIMIZE; 197 | 198 | // Minimal required recall, (0.0, 1.0). 199 | google.protobuf.DoubleValue recall = 1; 200 | } 201 | 202 | // https://www.tensorflow.org/api_docs/python/tf/keras/metrics/RecallAtPrecision 203 | message RecallAtPrecision { 204 | option (metric_type) = MAXIMIZE; 205 | 206 | // Minimal required precision, (0.0, 1.0). 207 | google.protobuf.DoubleValue precision = 1; 208 | } 209 | 210 | message FalseNegativeRateAtThreshold { 211 | option (metric_type) = MAXIMIZE; 212 | 213 | // Threshold to apply to a prediction to determine positive vs negative. 214 | // Note: if the model is calibrated, the threshold can be thought of as a 215 | // probability so the threshold has a stable, intuitive semantic. 216 | // However, not all solutions may be calibrated, and not all computations of 217 | // the metric may operate on a calibrated score. In AutoTFX, the final model 218 | // metrics are computed on a calibrated score, but the metrics computed within 219 | // the model selection process are uncalibrated. Be aware of this possible 220 | // skew in the metrics between model selection and final model evaluation. 221 | google.protobuf.DoubleValue threshold = 1; 222 | } 223 | 224 | message FalsePositiveRateAtThreshold { 225 | option (metric_type) = MAXIMIZE; 226 | 227 | // Threshold to apply to a prediction to determine positive vs negative. 228 | // Note: if the model is calibrated, the threshold can be thought of as a 229 | // probability so the threshold has a stable, intuitive semantic. 
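  // For example, with a threshold of 0.7, a calibrated score of 0.82 counts
  // as positive and a score of 0.55 counts as negative (illustrative values).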
230 |   // However, not all solutions may be calibrated, and not all computations of
231 |   // the metric may operate on a calibrated score. In AutoTFX, the final model
232 |   // metrics are computed on a calibrated score, but the metrics computed within
233 |   // the model selection process are uncalibrated. Be aware of this possible
234 |   // skew in the metrics between model selection and final model evaluation.
235 |   google.protobuf.DoubleValue threshold = 1;
236 | }
237 | 
238 | message PrecisionAtK {
239 |   option (metric_type) = MAXIMIZE;
240 | }
241 | 
242 | message MeanReciprocalRank {}
243 | 
244 | // https://www.tensorflow.org/responsible_ai/model_remediation/api_docs/python/model_remediation/min_diff/losses/MMDLoss
245 | message MaximumMeanDiscrepancy {
246 |   option (metric_type) = MINIMIZE;
247 | 
248 |   // Kernel to apply to the predictions. Currently supported values are
249 |   // 'gaussian' and 'laplace'. Defaults to 'gaussian'.
250 |   string kernel = 1;
251 | }
252 | 
253 | // The mean of the prediction across the dataset.
254 | message PredictionMean {}
255 | 
256 | // Area under ROC-curve calculated globally for MultiClassClassification (model
257 | // predicts a single label) or MultiLabelClassification (model predicts class
258 | // probabilities). The area is calculated by treating the entire set of data as
259 | // an aggregate result, and computing a single metric rather than k metrics
260 | // (one for each target label) that get averaged together. For example, the FPR
261 | // and TPR at a given point on the AUC curve for k target labels are:
262 | // FPR = (FP1 + FP2 + ... + FPk) / ((FP1 + FP2 + ... + FPk) +
263 | //       (TN1 + TN2 + ... + TNk))
264 | // TPR = (TP1 + TP2 + ... + TPk) / ((TP1 + TP2 + ... + TPk) +
265 | //       (FN1 + FN2 + ... + FNk))
266 | message MicroAUC {
267 |   option (metric_type) = MAXIMIZE;
268 | }
269 | 
270 | // Cross entropy for MultiLabelClassification where each target and
271 | // prediction is the probability of belonging to that class independent of other
272 | // classes.
273 | message MultilabelCrossEntropy {
274 |   option (metric_type) = MINIMIZE;
275 | }
276 | 
277 | // DEPRECATED
278 | message BlockUtility {
279 |   option (metric_type) = MAXIMIZE;
280 | 
281 |   repeated double weight = 1;
282 | }
283 | 
284 | // A custom metric.
285 | // Prefer using or adding an explicit metric message
286 | // and only use this generic message as a last resort.
287 | // NEXT_TAG: 4
288 | message CustomMetric {
289 |   option (metric_type) = CUSTOM;
290 | 
291 |   // The display name of a metric computed by the model. The name should match
292 |   // ^[a-zA-Z0-9\s]{1,25}$ and must be unique across all performance metrics.
293 |   // Trailing and leading spaces will be truncated before matching.
294 |   string name = 1;
295 | 
296 |   // True if the metric is maximized; false if it is minimized.
297 |   // Must be specified if the CustomMetric is used as an objective.
298 |   bool is_maximized = 2;
299 | 
300 |   // RegistrySpec is a full specification of the custom metric and its
301 |   // construction based on the binary’s metric registry. New custom metrics must
302 |   // be linked to the binary and registered in its metric registry to be
303 |   // identifiable via this specification.
304 |   message RegistrySpec {
305 |     // Identifier of the metric class in the metric registry of the binary.
306 |     string key = 1;
307 | 
308 |     // Generic proto describing the configuration for the metric to be computed.
309 |     // It's up to the implementer of the metric to parse this configuration.
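    // For example, the Any may pack a metric-specific options message that the
    // registered implementation unpacks when the metric is constructed.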
310 | google.protobuf.Any config = 2; 311 | } 312 | 313 | // Specification of the metric in the binary’s metric registry. 314 | RegistrySpec registry_spec = 3; 315 | } 316 | 317 | // Performance metrics measure the quality of a model. They need not be 318 | // differentiable. 319 | message PerformanceMetric { 320 | oneof performance_metric { 321 | AUC auc = 1; 322 | AUCPrecisionRecall auc_precision_recall = 26; 323 | BinaryAccuracy binary_accuracy = 2; 324 | BinaryCrossEntropy binary_cross_entropy = 3; 325 | BlockUtility block_utility = 4 [deprecated = true]; 326 | CategoricalAccuracy categorical_accuracy = 5; 327 | CategoricalCrossEntropy categorical_cross_entropy = 6; 328 | Cosine cosine = 7 [deprecated = true]; 329 | Hinge hinge = 8 [deprecated = true]; 330 | KullbackLeiblerDivergence kullback_leibler_divergence = 9 331 | [deprecated = true]; 332 | LogisticRegression logistic_regression = 10 [deprecated = true]; 333 | MeanAbsoluteError mean_absolute_error = 11; 334 | MeanAbsolutePercentageError mean_absolute_percentage_error = 12; 335 | MeanSquaredError squared_error = 13; 336 | MeanSquaredLogarithmicError mean_squared_logarithmic_error = 14; 337 | MeanReciprocalRank mean_reciprocal_rank = 15; 338 | MicroAUC micro_auc = 27; 339 | MultilabelCrossEntropy multi_label_cross_entropy = 28; 340 | Poisson poisson = 16 [deprecated = true]; 341 | PrecisionAtK precision_at_k = 17; 342 | SquaredHinge squared_hinge = 18 [deprecated = true]; 343 | SparseTopKCategoricalAccuracy sparse_top_k_categorical_accuracy = 19 344 | [deprecated = true]; 345 | TopKCategoricalAccuracy top_k_categorical_accuracy = 20; 346 | CustomMetric custom_metric = 21; 347 | SensitivityAtSpecificity sensitivity_at_specificity = 22; 348 | SpecificityAtSensitivity specificity_at_sensitivity = 23; 349 | PrecisionAtRecall precision_at_recall = 24; 350 | RecallAtPrecision recall_at_precision = 25; 351 | } 352 | // NEXT_TAG: 37; 353 | } 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/path.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | syntax = "proto2"; 17 | 18 | package tensorflow.metadata.v0; 19 | 20 | option cc_enable_arenas = true; 21 | option java_package = "org.tensorflow.metadata.v0"; 22 | option java_multiple_files = true; 23 | 24 | // A path is a more general substitute for the name of a field or feature that 25 | // can be used for flat examples as well as structured data. 
For example, if 26 | // we had data in a protocol buffer: 27 | // message Person { 28 | // int age = 1; 29 | // optional string gender = 2; 30 | // repeated Person parent = 3; 31 | // } 32 | // Thus, here the path {step:["parent", "age"]} in statistics would refer to the 33 | // age of a parent, and {step:["parent", "parent", "age"]} would refer to the 34 | // age of a grandparent. This allows us to distinguish between the statistics 35 | // of parents' ages and grandparents' ages. In general, repeated messages are 36 | // to be preferred to linked lists of arbitrary length. 37 | // For SequenceExample, if we have a feature list "foo", this is represented 38 | // by {step:["##SEQUENCE##", "foo"]}. 39 | message Path { 40 | // Any string is a valid step. 41 | // However, whenever possible have a step be [A-Za-z0-9_]+. 42 | repeated string step = 1; 43 | } 44 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/problem_statement.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | syntax = "proto3"; 17 | 18 | package tensorflow.metadata.v0; 19 | 20 | import "google/protobuf/struct.proto"; 21 | import "google/protobuf/descriptor.proto"; 22 | import "tensorflow_metadata/proto/v0/metric.proto"; 23 | import "tensorflow_metadata/proto/v0/path.proto"; 24 | 25 | option cc_enable_arenas = true; 26 | option java_package = "org.tensorflow.metadata.v0"; 27 | option java_multiple_files = true; 28 | 29 | enum TaskType { 30 | UNKNOWN_TYPE = 0; 31 | BINARY_CLASSIFICATION = 1; 32 | MULTI_CLASS_CLASSIFICATION = 2; 33 | TOP_K_CLASSIFICATION = 3; 34 | ONE_DIMENSIONAL_REGRESSION = 4; 35 | MULTI_LABEL_CLASSIFICATION = 5; 36 | MULTI_DIMENSIONAL_REGRESSION = 6; 37 | TEXT_GENERATION = 7; 38 | } 39 | 40 | extend google.protobuf.MessageOptions { 41 | TaskType task_type = 241943395; 42 | } 43 | 44 | // Configuration for a binary classification task. 45 | // The output is one of two possible class labels, encoded as the same type 46 | // as the label column. 47 | // BinaryClassification is the same as MultiClassClassification with 48 | // n_classes = 2. 49 | message BinaryClassification { 50 | option (task_type) = BINARY_CLASSIFICATION; 51 | 52 | // The label column. 53 | oneof label_id { 54 | // The name of the label. Assumes the label is a flat, top-level field. 55 | string label = 1; 56 | // A path can be used instead of a flat string if the label is nested. 57 | Path label_path = 3; 58 | } 59 | // (optional) The weight column. 60 | string example_weight = 2; 61 | 62 | // Defines which label value is the positive and/or negative class. 
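  // An illustrative (non-normative) example for a string label column with
  // hypothetical values "clicked" and "ignored":
  //   positive_class_value { string_value: "clicked" }
  //   negative_class_value { string_value: "ignored" }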
63 | message PositiveNegativeSpec { 64 | // Specifies a label's value which can be used for positive/negative class 65 | // specification. 66 | message LabelValue { 67 | oneof value_type { 68 | string string_value = 1; 69 | } 70 | } 71 | // This value is the positive class. 72 | LabelValue positive_class_value = 1; 73 | // This value is the negative class. 74 | LabelValue negative_class_value = 2; 75 | } 76 | 77 | // (optional) specification of the positive and/or negative class value. 78 | PositiveNegativeSpec positive_negative_spec = 4; 79 | } 80 | 81 | // Specifies a dynamic multiclass/multi-label problem where the number of label 82 | // classes is inferred from the data. 83 | message DynamicClassSpec { 84 | // Note: it is up to a solution provider to implement support for OOV labels. 85 | // Note: both a frequency_threshold and a top_k may be set. A class is grouped 86 | // into the OOV class if it fails to meet either of the criteria below. 87 | message OovClassSpec { 88 | // If set, labels are grouped into the "OOV" class if they occur less than 89 | // frequency_threshold times in the training dataset. If 0, labels 90 | // that appear in test / validation splits but not in training would be 91 | // still classified as the "OOV" class. 92 | int64 frequency_threshold = 1; 93 | // If set, only the top_k labels in the training set are used and all others 94 | // are grouped into an "OOV" class. 95 | int64 top_k = 2; 96 | } 97 | // Optional. If specified, an Out-Of-Vocabulary (OOV) class is created and 98 | // populated based on frequencies in the training set. If no OOV class is 99 | // specified, the model's label vocabulary should consist of all labels that 100 | // appear in the training set. 101 | OovClassSpec oov_class_spec = 1; 102 | } 103 | // Configuration for a multi-class classification task. 104 | // In this problem type, there are n_classes possible label values, and the 105 | // model predicts a single label. 106 | // The output is one of the class labels, out of n_classes possible classes. 107 | // The output type will correspond to the label column type. 108 | message MultiClassClassification { 109 | option (task_type) = MULTI_CLASS_CLASSIFICATION; 110 | 111 | // The label column. There's only a single label per example. 112 | // If the label column is a BoolDomain, use the BinaryClassification Type 113 | // instead. 114 | oneof label_id { 115 | // The name of the label. Assumes the label is a flat, top-level field. 116 | string label = 1; 117 | // A path can be used instead of a flat string if the label is nested. 118 | Path label_path = 5; 119 | } 120 | // The weight column. 121 | string example_weight = 2; 122 | oneof class_spec { 123 | // The exact number of label classes. 124 | uint64 n_classes = 3; 125 | // The number of label classes that should be inferred dynamically from the 126 | // data. 127 | DynamicClassSpec dynamic_class_spec = 4; 128 | } 129 | } 130 | // Configuration for a multi-label classification task. 131 | // In this problem type there are n_classes unique possible label values 132 | // overall. There can be from zero up to n_classes unique labels per example. 133 | // The output, which is of type real number, is class probabilities associated 134 | // with each class. It will be of n_classes dimension for each example, if 135 | // n_classes is specified. Otherwise, the dimension will be set to the number 136 | // of unique class labels that are dynamically inferred from the data based on 137 | // dynamic_class_spec. 
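// For example, with n_classes = 4, an example carrying labels {1, 3} yields a
// 4-dimensional vector of class probabilities.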
138 | message MultiLabelClassification { 139 | option (task_type) = MULTI_LABEL_CLASSIFICATION; 140 | 141 | // The label column. There can be one or more labels per example. 142 | oneof label_id { 143 | // The name of the label. Assumes the label is a flat, top-level field. 144 | string label = 1; 145 | // A path can be used instead of a flat string if the label is nested. 146 | Path label_path = 5; 147 | } 148 | // The weight column. 149 | string example_weight = 2; 150 | oneof class_spec { 151 | // The exact number of unique class labels. 152 | uint64 n_classes = 3; 153 | // The maximal number of label classes that should be inferred dynamically 154 | // from the data. 155 | DynamicClassSpec dynamic_class_spec = 4; 156 | } 157 | } 158 | 159 | // Configuration for a top-K classification task. 160 | // In this problem type, there are n_classes possible label values, and the 161 | // model predicts n_predicted_labels labels. 162 | // The output is a sequence of n_predicted_labels labels, out of n_classes 163 | // possible classes. The order of the predicted output labels is determined 164 | // by the predictions_order field. 165 | // (*) MultiClassClassification is the same as TopKClassification with 166 | // n_predicted_labels = 1. 167 | // (*) TopKClassification does NOT mean multi-class multi-label classification: 168 | // e.g., the output contains a sequence of labels, all coming from the same 169 | // label column in the data. 170 | message TopKClassification { 171 | option (task_type) = TOP_K_CLASSIFICATION; 172 | 173 | // The label column. 174 | oneof label_id { 175 | // The name of the label. Assumes the label is a flat, top-level field. 176 | string label = 1; 177 | // A path can be used instead of a flat string if the label is nested. 178 | Path label_path = 6; 179 | } 180 | // (optional) The weight column. 181 | string example_weight = 2; 182 | // (optional) The number of label classes. If unset, the solution provider 183 | // is expected to infer the number of classes from the data. 184 | uint64 n_classes = 3; 185 | // (optional) The number of class labels to predict. If unset, we assume 1. 186 | uint64 n_predicted_labels = 4; 187 | enum Order { 188 | UNSPECIFIED = 0; 189 | // Predictions are ordered from the most likely to least likely. 190 | SCORE_DESC = 1; 191 | // Predictions are ordered from the least likely to most likely. 192 | SCORE_ASC = 2; 193 | } 194 | Order predictions_order = 5; 195 | } 196 | 197 | // A one-dimensional regression task. 198 | // The output is a single real number, whose range is dependent upon the 199 | // objective. 200 | message OneDimensionalRegression { 201 | option (task_type) = ONE_DIMENSIONAL_REGRESSION; 202 | 203 | // The label column. 204 | oneof label_id { // oneof label_id is required. 205 | // The name of the label. Assumes the label is a flat, top-level field. 206 | string label = 1; 207 | // A path can be used instead of a flat string if the label is nested. 208 | Path label_path = 3; 209 | } 210 | // (optional) The weight column. 211 | string weight = 2; 212 | 213 | // Defines a regression problem where labels are in [0, 1] and represent a 214 | // probability (e.g: probability of click). 215 | message Probability {} 216 | 217 | // Defines a regression problem where the labels are counts i.e. integers >=0. 218 | message Counts {} 219 | 220 | oneof label_type { 221 | // When set means the label is a probability in range [0..1]. 
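    // For example, an observed probability of click.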
222 |     Probability probability = 4;
223 |     // When set, the label corresponds to counts from a Poisson distribution.
224 |     // E.g., the number of googlers contributing to memegen each year.
225 |     Counts counts = 5;
226 |   }
227 | }
228 | 
229 | // A multi-dimensional regression task.
230 | // Similar to OneDimensionalRegression, MultiDimensionalRegression predicts
231 | // continuous real numbers. However, instead of predicting a single scalar value
232 | // per example, we predict a fixed dimensional vector of values. By default the
233 | // range is any float from -inf to inf, but specific sub-types (e.g. probability)
234 | // define narrower ranges.
235 | message MultiDimensionalRegression {
236 |   option (task_type) = MULTI_DIMENSIONAL_REGRESSION;
237 | 
238 |   // The label column.
239 |   oneof label_id {  // oneof label_id is required.
240 |     // The name of the label. Assumes the label is a flat, top-level field.
241 |     string label = 1;
242 |     // A path can be used instead of a flat string if the label is nested.
243 |     Path label_path = 3;
244 |   }
245 |   // (optional) The weight column.
246 |   string weight = 2;
247 | 
248 |   // Defines a regression problem where labels are in [0, 1] and represent a
249 |   // probability (e.g., probability of click).
250 |   message Probability {
251 |     // By default, MultiDimensionalRegression assumes that each value in the
252 |     // predicted vector is independent. If predictions_sum_to_1 is true, this
253 |     // indicates that the vector of values represents mutually exclusive rather
254 |     // than independent probabilities (for example, the probabilities of
255 |     // classes in a multi-class scenario). When this is set to true, we use
256 |     // softmax instead of sigmoid in the loss function.
257 |     bool predictions_sum_to_1 = 1;
258 |   }
259 | 
260 |   oneof label_type {
261 |     // When set means the label is a probability in range [0..1].
262 |     Probability probability = 4;
263 |   }
264 | }
265 | 
266 | // Configuration for a text generation task where the model should predict
267 | // a sequence of natural language text.
268 | message TextGeneration {
269 |   option (task_type) = TEXT_GENERATION;
270 | 
271 |   string targets = 1;
272 | 
273 |   // (optional) The weight column.
274 |   string example_weight = 2;
275 | }
276 | 
277 | // The type of a head or meta-objective. Specifies the label, weight,
278 | // and output type of the head.
279 | // TODO(martinz): add logistic regression.
280 | message Type {
281 |   oneof task_type {
282 |     BinaryClassification binary_classification = 1;
283 |     OneDimensionalRegression one_dimensional_regression = 2;
284 |     MultiClassClassification multi_class_classification = 3;
285 |     TopKClassification top_k_classification = 4;
286 |     MultiLabelClassification multi_label_classification = 5;
287 |     TextGeneration text_generation = 6;
288 |   }
289 | }
290 | 
291 | 
292 | // Describes a single task in a model and all its properties.
293 | // A task corresponds to a single output of the model.
294 | // Multiple tasks in the same problem statement correspond to different outputs
295 | // of the model.
296 | message Task {
297 |   reserved 3;
298 |   // Specification of the label and weight columns, and the type of the
299 |   // prediction or classification.
300 |   Type type = 1;
301 | 
302 |   // The task name. Tasks within the same ProblemStatement should have unique
303 |   // names. This is a REQUIRED field in case of multi-task learning problems.
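  // For example, a two-task model might use the (hypothetical) names
  // "ctr_task" and "conversion_task".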
304 |   string name = 5;
305 | 
306 |   // If a Problem is composed of multiple sub-tasks, the weight of each task
307 |   // determines the importance of solving each sub-task. It is used to
308 |   // rank and select the best solution for multi-task problems.
309 |   // Not meaningful for a problem with one task.
310 |   // If the problem has multiple tasks and all task_weight=0 (unset) then all
311 |   // tasks are weighted equally.
312 |   double task_weight = 2;
313 | 
314 |   // This field includes performance metrics of this head that are important to
315 |   // the problem owner and need to be monitored and reported. However, unlike
316 |   // fields such as "meta_optimization_target", these metrics are not
317 |   // automatically used in meta-optimization.
318 |   repeated PerformanceMetric performance_metric = 4;
319 | 
320 |   // True to indicate the task is an auxiliary task in a multi-task setting.
321 |   // Auxiliary tasks are of minor relevance for the application and they are
322 |   // added only to improve the performance on a primary task (by providing
323 |   // additional regularization or data augmentation), and thus are not
324 |   // considered in the meta optimization process (but may be utilized in the
325 |   // learner optimization).
326 |   bool is_auxiliary = 6;
327 | 
328 | }
329 | 
330 | // The high-level objectives described by this problem statement. These
331 | // objectives provide a basis for ranking models and can be optimized by a meta
332 | // optimizer (e.g. a grid search over hyperparameters). A solution provider may
333 | // also directly use the meta optimization targets to heuristically select
334 | // losses, etc. without any meta-optimization process. If not specified, the
335 | // high-level meta optimization target is inferred from the task. These
336 | // objectives do not need to be differentiable, as the solution provider may use
337 | // a proxy function to optimize model weights. Target definitions include tasks,
338 | // metrics, and any weighted combination of them.
339 | message MetaOptimizationTarget {
340 |   reserved 2;
341 |   // The name of a task in this problem statement producing the
342 |   // prediction or classification for the metric.
343 |   string task_name = 1;
344 | 
345 |   // The performance metric to be evaluated.
346 |   // The prediction or classification is based upon the task.
347 |   // The label is from the type of the task, or from the override_task.
348 |   PerformanceMetric performance_metric = 3;
349 | 
350 |   // Configuration for thresholded meta-optimization targets.
351 |   message ThresholdConfig {
352 |     oneof type {
353 |       // If specified, indicates a threshold that the user wishes the metric to
354 |       // stay under (for MINIMIZE type), or above (for MAXIMIZE type). The
355 |       // optimization process need not prefer models that are higher (or lower)
356 |       // on the thresholded metric so long as the threshold is respected.
357 |       // E.g., if `threshold` for a MAXIMIZE type metric X is .9, the
358 |       // optimization process will prefer a solution with X = .92 over a
359 |       // solution with X = .88, but may not prefer a solution with X = .95 over
360 |       // a solution with X = .92. Unless otherwise specified by the
361 |       // PerformanceMetric, threshold is best effort. It does not provide a hard
362 |       // guarantee about the properties of the final model, but rather serves as
363 |       // a "target" to guide the optimization process. The user is responsible
364 |       // for validating that final model metrics are in an acceptable range for
365 |       // the application.
A problem statement may, however, be rejected if the 366 | // specified target is impossible to achieve. Keep this in mind if running 367 | // the optimization on a recurring basis, as shifts in the data could push 368 | // a previously achievable target to being unachievable (and thus yield no 369 | // solution). The units and range for the threshold will be the same as 370 | // the valid output range of the associated performance_metric. 371 | double threshold = 1; 372 | 373 | } 374 | } 375 | 376 | // Describes how to combine with other objectives. 377 | oneof objective_combination { 378 | // If a model spec has multiple meta optimization targets, the weight 379 | // of each can be specified. The final objective is then a weighted 380 | // combination of the multiple objectives. If not specified, value is 1. 381 | double weight = 4 [deprecated = true]; 382 | 383 | // Secondary meta optimization targets can be thresholded, meaning that the 384 | // optimization process prefers solutions above (or below) the threshold, 385 | // but need not prefer solutions higher (or lower) on the metric if the 386 | // threshold is met. 387 | ThresholdConfig threshold_config = 5; 388 | } 389 | } 390 | 391 | message ProblemStatement { 392 | // Description of the problem statement. For example, should describe how 393 | // the problem statement was arrived at: what experiments were run, what 394 | // side-by-sides were considered. 395 | string description = 2; 396 | repeated string owner = 3; 397 | 398 | // The environment of the ProblemStatement (optional). Specifies an 399 | // environment string in the SchemaProto. 400 | string environment = 4; 401 | 402 | // The target used for meta-optimization. This is used to compare multiple 403 | // solutions for this problem. For example, if two solutions have different 404 | // candidates, a tuning tool can use meta_optimization_target to decide which 405 | // candidate performs the best. 406 | // A repeated meta-optimization target implies the weighted sum of the 407 | // meta_optimization targets of any non-thresholded metrics. 408 | repeated MetaOptimizationTarget meta_optimization_target = 7; 409 | bool multi_objective = 8 [deprecated = true]; 410 | 411 | reserved 5; 412 | 413 | // Tasks for heads of the generated model. This field is repeated because some 414 | // models are multi-task models. Each task should have a unique name. 415 | // If you wish to directly optimize this problem statement, you need 416 | // to specify the objective in the task. 417 | repeated Task tasks = 9; 418 | } 419 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/schema.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // ============================================================================= 15 | 16 | syntax = "proto2"; 17 | 18 | package tensorflow.metadata.v0; 19 | 20 | import "google/protobuf/any.proto"; 21 | import "tensorflow_metadata/proto/v0/derived_feature.proto"; 22 | import "tensorflow_metadata/proto/v0/path.proto"; 23 | 24 | // GOOGLE-LEGACY option jspb_use_correct_proto2_semantics = false; 25 | option cc_enable_arenas = true; 26 | option java_package = "org.tensorflow.metadata.v0"; 27 | option java_multiple_files = true; 28 | 29 | // LifecycleStage. Only UNKNOWN_STAGE, BETA, PRODUCTION, and VALIDATION_DERIVED 30 | // features are actually validated. 31 | // PLANNED, ALPHA, DISABLED, and DEBUG are treated as DEPRECATED. 32 | enum LifecycleStage { 33 | // Unknown stage. 34 | UNKNOWN_STAGE = 0; 35 | 36 | // Planned feature, may not be created yet. 37 | PLANNED = 1; 38 | 39 | // Prototype feature, not used in experiments yet. 40 | ALPHA = 2; 41 | 42 | // Used in user-facing experiments. 43 | BETA = 3; 44 | 45 | // Used in a significant fraction of user traffic. 46 | PRODUCTION = 4; 47 | 48 | // No longer supported: do not use in new models. 49 | DEPRECATED = 5; 50 | 51 | // Only exists for debugging purposes. 52 | DEBUG_ONLY = 6; 53 | 54 | // Generic indication that feature is disabled / excluded 55 | // from models, regardless of specific reason. 56 | DISABLED = 7; 57 | 58 | // Indicates that this feature was derived from ordinary 59 | // features for the purposes of statistics generation or 60 | // validation. Consumers should expect that this feature 61 | // may be present in DatasetFeatureStatistics, but not in 62 | // input data. 63 | // Experimental and subject to change. 64 | VALIDATION_DERIVED = 9; 65 | 66 | reserved 8; 67 | } 68 | 69 | // 70 | // Message to represent schema information. 71 | // NextID: 15 72 | message Schema { 73 | // Features described in this schema. 74 | repeated Feature feature = 1; 75 | 76 | // Sparse features described in this schema. 77 | repeated SparseFeature sparse_feature = 6; 78 | 79 | // Weighted features described in this schema. 80 | repeated WeightedFeature weighted_feature = 12; 81 | 82 | // Use StructDomain instead. 83 | // Sequences described in this schema. A sequence may be described in terms of 84 | // several features. Any features appearing within a sequence must *not* be 85 | // declared as top-level features in . 86 | // GOOGLE-LEGACY repeated Sequence sequence = 2; 87 | 88 | // String domains referenced in the features. 89 | repeated StringDomain string_domain = 4; 90 | 91 | // TOP LEVEL FLOAT AND INT DOMAINS ARE UNSUPPORTED IN TFDV. 92 | // TODO(b/63664182): Support this. 93 | // top level float domains that can be reused by features 94 | repeated FloatDomain float_domain = 9; 95 | 96 | // top level int domains that can be reused by features 97 | repeated IntDomain int_domain = 10; 98 | 99 | // Default environments for each feature. 100 | // An environment represents both a type of location (e.g. a server or phone) 101 | // and a time (e.g. right before model X is run). In the standard scenario, 102 | // 99% of the features should be in the default environments TRAINING, 103 | // SERVING, and the LABEL (or labels) AND WEIGHT is only available at TRAINING 104 | // (not at serving). 105 | // Other possible variations: 106 | // 1. There may be TRAINING_MOBILE, SERVING_MOBILE, TRAINING_SERVICE, 107 | // and SERVING_SERVICE. 108 | // 2. 
If one is ensembling three models, where the predictions of the first
109 |   //      three models are available for the ensemble model, there may be
110 |   //      TRAINING, SERVING_INITIAL, SERVING_ENSEMBLE.
111 |   // See FeatureProto::not_in_environment and FeatureProto::in_environment.
112 |   repeated string default_environment = 5;
113 | 
114 |   /* BEGIN GOOGLE-LEGACY
115 |   // TODO(b/73109633): Change default to false, before removing this field.
116 |   optional bool generate_legacy_feature_spec = 7 [default = true];
117 |   END GOOGLE-LEGACY */
118 | 
119 |   // Whether to represent variable-length features as RaggedTensors. By default
120 |   // they are represented as ragged left-aligned SparseTensors. RaggedTensor
121 |   // representation is more memory efficient. Therefore, turning this on will
122 |   // likely yield a data processing performance improvement.
123 |   // Experimental and may be subject to change.
124 |   optional bool represent_variable_length_as_ragged = 14;
125 | 
126 |   // Additional information about the schema as a whole. Features may also
127 |   // be annotated individually.
128 |   optional Annotation annotation = 8;
129 | 
130 |   // Dataset-level constraints. This is currently used for specifying
131 |   // information about changes in num_examples.
132 |   optional DatasetConstraints dataset_constraints = 11;
133 | 
134 |   // TensorRepresentation groups. The keys are the names of the groups.
135 |   // Key "" (empty string) denotes the "default" group, which is what should
136 |   // be used when a group name is not provided.
137 |   // See the documentation at TensorRepresentationGroup for more info.
138 |   // Under development.
139 |   map<string, TensorRepresentationGroup> tensor_representation_group = 13;
140 | }
141 | 
142 | message ValueCountList {
143 |   repeated ValueCount value_count = 1;
144 | }
145 | 
146 | // Describes schema-level information about a specific feature.
147 | // NextID: 39
148 | message Feature {
149 |   // The name of the feature.
150 |   optional string name = 1;  // required
151 | 
152 |   // This field is no longer supported. Instead, use:
153 |   // lifecycle_stage: DEPRECATED
154 |   // TODO(b/111450258): remove this.
155 |   optional bool deprecated = 2 [deprecated = true];
156 | 
157 |   // Comment field for a human readable description of the field.
158 |   // TODO(b/123518108): remove this.
159 |   // GOOGLE-LEGACY optional string comment = 3 [deprecated = true];
160 | 
161 |   oneof presence_constraints {
162 |     // Constraints on the presence of this feature in the examples.
163 |     FeaturePresence presence = 14;
164 |     // Only used within a "group" context, e.g., inside a sequence.
165 |     FeaturePresenceWithinGroup group_presence = 17;
166 |   }
167 | 
168 |   // The shape of the feature which governs the number of values that appear in
169 |   // each example.
170 |   oneof shape_type {
171 |     // The feature has a fixed shape corresponding to a multi-dimensional
172 |     // tensor.
173 |     FixedShape shape = 23;
174 |     // The feature doesn't have a well defined shape. All we know are limits on
175 |     // the minimum and maximum number of values.
176 |     ValueCount value_count = 5;
177 |     // Captures the same information as value_count but for features with
178 |     // nested values. A ValueCount is provided for each nest level.
179 |     ValueCountList value_counts = 32;
180 |   }
181 | 
182 |   // Physical type of the feature's values.
183 |   // Note that you can have:
184 |   // type: BYTES
185 |   // int_domain: {
186 |   //   min: 0
187 |   //   max: 3
188 |   // }
189 |   // This would be a field that is syntactically BYTES (i.e.
181 | 182 | // Physical type of the feature's values. 183 | // Note that you can have: 184 | // type: BYTES 185 | // int_domain: { 186 | // min: 0 187 | // max: 3 188 | // } 189 | // This would be a field that is syntactically BYTES (i.e. strings), but 190 | // semantically an int, i.e. it would be "0", "1", "2", or "3". 191 | optional FeatureType type = 6; 192 | 193 | // Domain for the values of the feature. 194 | oneof domain_info { 195 | // Reference to a domain defined at the schema level. 196 | // NOTE THAT TFDV ONLY SUPPORTS STRING DOMAINS AT THE TOP LEVEL. 197 | // TODO(b/63664182): Support this. 198 | string domain = 7; 199 | // Inline definitions of domains. 200 | IntDomain int_domain = 9; 201 | FloatDomain float_domain = 10; 202 | StringDomain string_domain = 11; 203 | BoolDomain bool_domain = 13; 204 | StructDomain struct_domain = 29; 205 | // Supported semantic domains. 206 | NaturalLanguageDomain natural_language_domain = 24; 207 | ImageDomain image_domain = 25; 208 | AudioDomain audio_domain = 36; 209 | VideoDomain video_domain = 37; 210 | ContentChunkDomain content_chunk_domain = 38; 211 | MIDDomain mid_domain = 26; 212 | URLDomain url_domain = 27; 213 | TimeDomain time_domain = 28; 214 | TimeOfDayDomain time_of_day_domain = 30; 215 | } 216 | 217 | // Constraints on the distribution of the feature values. 218 | // Only supported for StringDomains. 219 | optional DistributionConstraints distribution_constraints = 15; 220 | 221 | // Additional information about the feature for documentation purposes. 222 | optional Annotation annotation = 16; 223 | 224 | // Tests comparing the distribution to the associated serving data. 225 | optional FeatureComparator skew_comparator = 18; 226 | 227 | // Tests comparing the distribution between two consecutive spans (e.g. days). 228 | optional FeatureComparator drift_comparator = 21; 229 | 230 | // List of environments this feature is present in. 231 | // Should be disjoint from not_in_environment. 232 | // This feature is in environment "foo" if: 233 | // ("foo" is in in_environment or default_environment) AND 234 | // "foo" is not in not_in_environment. 235 | // See Schema::default_environment. 236 | repeated string in_environment = 20; 237 | 238 | // List of environments this feature is not present in. 239 | // Should be disjoint from in_environment. 240 | // See Schema::default_environment and in_environment. 241 | repeated string not_in_environment = 19; 242 | 243 | // The lifecycle stage of a feature. It can also apply to its descendants, 244 | // i.e., if a struct is DEPRECATED, its children are implicitly deprecated. 245 | optional LifecycleStage lifecycle_stage = 22; 246 | 247 | // Constraints on the number of unique values for a given feature. 248 | // This is supported for string and categorical features only. 249 | optional UniqueConstraints unique_constraints = 31; 250 | 251 | // If set, indicates that this feature is derived, and stores metadata 252 | // about its source. If this field is set, this feature should have a 253 | // disabled stage (PLANNED, ALPHA, DEPRECATED, DISABLED, DEBUG_ONLY), or 254 | // lifecycle_stage VALIDATION_DERIVED. 255 | // Experimental and subject to change. 256 | optional DerivedFeatureSource validation_derived_source = 34; 257 | reserved 33; 258 | 259 | // This field specifies whether this feature can be treated as a sequence 260 | // feature with meaningful element order. 261 | optional SequenceMetadata sequence_metadata = 35; 262 | }
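// Putting the pieces together, a hedged sketch of a complete Feature (the
// feature name and domain name are hypothetical; "country_codes" refers to
// a schema-level StringDomain declared elsewhere):
//
//   feature {
//     name: "country"
//     type: BYTES
//     domain: "country_codes"
//     presence { min_fraction: 0.95 }
//     value_count { min: 1 max: 1 }
//     distribution_constraints { min_domain_mass: 0.99 }
//   }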
263 | 264 | // Additional information about the schema or about a feature. 265 | message Annotation { 266 | // Tags can be used to mark features. For example, the tag on a user_age 267 | // feature might be `user_feature`, while the tags on a user_country feature 268 | // might be `location_feature` and `user_feature`. 269 | repeated string tag = 1; 270 | // Free-text comments. This can be used as a description of the feature, 271 | // developer notes, etc. 272 | repeated string comment = 2; 273 | // Application-specific metadata may be attached here. 274 | repeated .google.protobuf.Any extra_metadata = 3; 275 | } 276 | 277 | // Checks that the ratio of the current value to the previous value is not below 278 | // the min_fraction_threshold or above the max_fraction_threshold. That is, 279 | // previous value * min_fraction_threshold <= current value <= 280 | // previous value * max_fraction_threshold. 281 | // To specify that the value cannot change, set both min_fraction_threshold and 282 | // max_fraction_threshold to 1.0. 283 | message NumericValueComparator { 284 | optional double min_fraction_threshold = 1; 285 | optional double max_fraction_threshold = 2; 286 | } 287 | 288 | // Constraints on the entire dataset. 289 | message DatasetConstraints { 290 | // Tests differences in the number of examples between the current data and 291 | // the previous span. 292 | optional NumericValueComparator num_examples_drift_comparator = 1; 293 | // Tests comparisons in the number of examples between the current data and 294 | // the previous version of that data. 295 | optional NumericValueComparator num_examples_version_comparator = 2; 296 | // Minimum number of examples in the dataset. 297 | optional int64 min_examples_count = 3; 298 | // Maximum number of examples in the dataset. 299 | optional int64 max_examples_count = 4; 300 | }
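// A quick numeric illustration of the comparator semantics (the counts are
// hypothetical): with
//
//   dataset_constraints {
//     num_examples_version_comparator {
//       min_fraction_threshold: 0.9
//       max_fraction_threshold: 1.1
//     }
//   }
//
// a previous version containing 1,000,000 examples constrains the current
// version to the range [900,000, 1,100,000].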
301 | 302 | // Specifies a fixed shape for the feature's values. The immediate implication 303 | // is that each feature has a fixed number of values. Moreover, these values 304 | // can be parsed into a multi-dimensional tensor using the specified axis sizes. 305 | // The FixedShape defines a lexicographical ordering of the data. For instance, 306 | // if there is a FixedShape { 307 | // dim {size:3} dim {size:2} 308 | // } 309 | // then tensor[0][0]=field[0] 310 | // then tensor[0][1]=field[1] 311 | // then tensor[1][0]=field[2] 312 | // then tensor[1][1]=field[3] 313 | // then tensor[2][0]=field[4] 314 | // then tensor[2][1]=field[5] 315 | // 316 | // The FixedShape message is identical to the tensorflow.TensorShape proto 317 | // message for fully defined shapes. The FixedShape message cannot represent 318 | // unknown dimensions or an unknown rank. 319 | message FixedShape { 320 | // The dimensions that define the shape. The total number of values in each 321 | // example is the product of sizes of each dimension. 322 | repeated Dim dim = 2; 323 | 324 | // An axis in a multi-dimensional feature representation. 325 | message Dim { 326 | optional int64 size = 1; 327 | 328 | // Optional name of the tensor dimension. 329 | optional string name = 2; 330 | } 331 | } 332 | 333 | // Limits on the maximum and minimum number of values in a 334 | // single example (when the feature is present). Use this when the minimum 335 | // value count can be different from the maximum value count. Otherwise prefer 336 | // FixedShape. 337 | message ValueCount { 338 | optional int64 min = 1; 339 | optional int64 max = 2; 340 | } 341 | 342 | /* BEGIN GOOGLE-LEGACY 343 | // Constraint on the number of elements in a sequence. 344 | message LengthConstraint { 345 | optional int64 min = 1; 346 | optional int64 max = 2; 347 | } 348 | 349 | // A sequence is a logical feature that comprises several "raw" features that 350 | // encode values at different "steps" within the sequence. 351 | // TODO(b/110490010): Delete this. This is a special case of StructDomain. 352 | message Sequence { 353 | // An optional name for this sequence. Used mostly for debugging and 354 | // presentation. 355 | optional string name = 1; 356 | 357 | // Features that comprise the sequence. These features are "zipped" together 358 | // to form the values for the sequence at different steps. 359 | // - Use group_presence within each feature to encode presence constraints 360 | // within the sequence. 361 | // - If all features have the same value-count constraints then 362 | // declare this once using the shape_constraint below. 363 | repeated Feature feature = 2; 364 | 365 | // Constraints on the presence of the sequence across all examples in the 366 | // dataset. The sequence is assumed to be present if at least one of its 367 | // features is present. 368 | optional FeaturePresence presence = 3; 369 | 370 | // Shape constraints that apply to all the features that comprise the 371 | // sequence. If this is set then the value_count in 'feature' is 372 | // ignored. 373 | // TODO(martinz): delete: there is no reason to believe the shape of the 374 | // fields in a sequence will be the same. Use the fields in Feature instead. 375 | oneof shape_constraint { 376 | ValueCount value_count = 4; 377 | FixedShape fixed_shape = 5; 378 | } 379 | 380 | // Constraint on the number of elements in a sequence. 381 | optional LengthConstraint length_constraint = 6; 382 | } 383 | END GOOGLE-LEGACY */ 384 | 385 | // Represents a weighted feature that is encoded as a combination of raw base 386 | // features. The `weight_feature` should be a float feature with the same 387 | // shape as the `feature`. This is useful for representing weights associated 388 | // with categorical tokens (e.g. a TFIDF weight associated with each token). 389 | // TODO(b/142122960): Handle WeightedCategorical end to end in TFX (validation, 390 | // TFX Unit Testing, etc) 391 | message WeightedFeature { 392 | // Name for the weighted feature. This should not clash with other features in 393 | // the same schema. 394 | optional string name = 1; // required 395 | // Path of a base feature to be weighted. Required. 396 | optional Path feature = 2; 397 | // Path of the weight feature to associate with the base feature. Must be the 398 | // same shape as the feature. Required. 399 | optional Path weight_feature = 3; 400 | // The lifecycle_stage determines where a feature is expected to be used, 401 | // and therefore how important issues with it are. 402 | optional LifecycleStage lifecycle_stage = 4; 403 | }
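// For illustration, a hedged sketch of a WeightedFeature tying together two
// hypothetical features "terms" and "term_weights" (assuming Path's
// repeated `step` field from path.proto):
//
//   weighted_feature {
//     name: "weighted_terms"
//     feature { step: "terms" }
//     weight_feature { step: "term_weights" }
//   }
//
// Both referenced features must be declared elsewhere in the schema, with
// "term_weights" being a FLOAT feature of the same shape as "terms".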
404 | 405 | // A sparse feature represents a sparse tensor that is encoded with a 406 | // combination of raw features, namely index features and a value feature. Each 407 | // index feature defines a list of indices in a different dimension. 408 | message SparseFeature { 409 | reserved 11; 410 | // Name for the sparse feature. This should not clash with other features in 411 | // the same schema. 412 | optional string name = 1; // required 413 | 414 | // This field is no longer supported. Instead, use: 415 | // lifecycle_stage: DEPRECATED 416 | // TODO(b/111450258): remove this. 417 | optional bool deprecated = 2 [deprecated = true]; 418 | 419 | // The lifecycle_stage determines where a feature is expected to be used, 420 | // and therefore how important issues with it are. 421 | optional LifecycleStage lifecycle_stage = 7; 422 | 423 | // Comment field for a human readable description of the field. 424 | // TODO(martinz): delete, convert to annotation. 425 | // GOOGLE-LEGACY optional string comment = 3 [deprecated = true]; 426 | 427 | // Constraints on the presence of this feature in examples. 428 | // Deprecated, this is inferred from the referred features. 429 | optional FeaturePresence presence = 4 [deprecated = true]; 430 | 431 | // Shape of the sparse tensor that this SparseFeature represents. 432 | // Currently not supported. 433 | // TODO(b/109669962): Consider deriving this from the referred features. 434 | optional FixedShape dense_shape = 5; 435 | 436 | // Features that represent indexes. Should be integers >= 0. 437 | repeated IndexFeature index_feature = 6; // at least one 438 | message IndexFeature { 439 | // Name of the index-feature. This should be a reference to an existing 440 | // feature in the schema. 441 | optional string name = 1; 442 | } 443 | 444 | // If true then the index values are already sorted lexicographically. 445 | optional bool is_sorted = 8; 446 | 447 | optional ValueFeature value_feature = 9; // required 448 | message ValueFeature { 449 | // Name of the value-feature. This should be a reference to an existing 450 | // feature in the schema. 451 | optional string name = 1; 452 | } 453 | 454 | // Type of the value feature. 455 | // Deprecated, this is inferred from the referred features. 456 | optional FeatureType type = 10 [deprecated = true]; 457 | } 458 | 459 | // Models constraints on the distribution of a feature's values. 460 | // TODO(martinz): replace min_domain_mass with max_off_domain (but slowly). 461 | message DistributionConstraints { 462 | // The minimum fraction (in [0,1]) of values across all examples that 463 | // should come from the feature's domain, e.g.: 464 | // 1.0 => All values must come from the domain. 465 | // .9 => At least 90% of the values must come from the domain. 466 | optional double min_domain_mass = 1 [default = 1.0]; 467 | } 468 | 469 | // Encodes vocabulary coverage constraints. 470 | message FeatureCoverageConstraints { 471 | // Fraction of feature values that map to a vocab entry (i.e. are not oov). 472 | optional float min_coverage = 1; 473 | // Average length of tokens. Used for cases such as wordpiece that fall back 474 | // to character-level tokenization. 475 | optional float min_avg_token_length = 2; 476 | 477 | // String tokens to exclude when calculating min_coverage and 478 | // min_avg_token_length. Useful for tokens such as [PAD]. 479 | repeated string excluded_string_tokens = 3; 480 | 481 | // Integer tokens to exclude when calculating min_coverage and 482 | // min_avg_token_length. 483 | repeated int64 excluded_int_tokens = 4 [packed = true]; 484 | 485 | // String tokens to treat as oov tokens (e.g. [UNK]). These tokens are also 486 | // excluded when calculating avg token length. 487 | repeated string oov_string_tokens = 5; 488 | }
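// As a hedged sketch of how these coverage constraints might be configured
// for a wordpiece-style vocabulary (the vocabulary name and thresholds are
// hypothetical; NaturalLanguageDomain is defined later in this file):
//
//   natural_language_domain {
//     vocabulary: "my_vocab"
//     coverage {
//       min_coverage: 0.95
//       min_avg_token_length: 3.0
//       excluded_string_tokens: "[PAD]"
//       oov_string_tokens: "[UNK]"
//     }
//   }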
489 | 490 | // Encodes constraints on specific values in sequences. 491 | message SequenceValueConstraints { 492 | // The value for which to express constraints. Can be either an integer or 493 | // a string. 494 | oneof value { 495 | int64 int_value = 1; 496 | string string_value = 2; 497 | } 498 | 499 | // Min / max number of times the value can occur in a sequence. 500 | optional int64 min_per_sequence = 3; 501 | optional int64 max_per_sequence = 4; 502 | 503 | // Min / max fraction of sequences that must contain the value. 504 | optional float min_fraction_of_sequences = 5; 505 | optional float max_fraction_of_sequences = 6; 506 | } 507 | 508 | // Encodes constraints on sequence lengths. 509 | message SequenceLengthConstraints { 510 | // Token values (int and string) that are excluded when calculating sequence 511 | // length. 512 | repeated int64 excluded_int_value = 1; 513 | repeated string excluded_string_value = 2; 514 | 515 | // Min / max sequence length. 516 | optional int64 min_sequence_length = 3; 517 | optional int64 max_sequence_length = 4; 518 | } 519 | 520 | // Encodes information for domains of integer values. 521 | // Note that FeatureType could be either INT or BYTES. 522 | message IntDomain { 523 | // Id of the domain. Required if the domain is defined at the schema level. If 524 | // so, then the name must be unique within the schema. 525 | optional string name = 1; 526 | 527 | // Min and max values for the domain. 528 | optional int64 min = 3; 529 | optional int64 max = 4; 530 | 531 | // If true then the domain encodes categorical values (i.e., ids) rather than 532 | // ordinal values. 533 | optional bool is_categorical = 5; 534 | } 535 | 536 | // Encodes information for domains of float values. 537 | // Note that FeatureType could be either FLOAT or BYTES. 538 | message FloatDomain { 539 | // Id of the domain. Required if the domain is defined at the schema level. If 540 | // so, then the name must be unique within the schema. 541 | optional string name = 1; 542 | 543 | // Min and max values of the domain. 544 | optional float min = 3; 545 | optional float max = 4; 546 | 547 | // If true, the feature should not contain NaNs. 548 | optional bool disallow_nan = 5; 549 | // If true, the feature should not contain Inf or -Inf. 550 | optional bool disallow_inf = 6; 551 | // If true, this indicates that the feature is semantically an embedding. This 552 | // can be useful for distinguishing fixed-dimensional numeric features that 553 | // should be fed to a model unmodified. 554 | optional bool is_embedding = 7; 555 | 556 | // If true then the domain encodes categorical values (i.e., ids) rather than 557 | // continuous values. 558 | optional bool is_categorical = 8; 559 | 560 | // This field specifies the embedding dimension and is only applicable if 561 | // is_embedding is true. It is useful for use cases such as restoring shapes 562 | // for a flattened sequence of embeddings. 563 | optional int64 embedding_dim = 9; 564 | 565 | // Specifies the semantic type of the embedding, e.g. sbv4_semantic or pulsar. 566 | optional string embedding_type = 10; 567 | } 568 | 569 | // Domain for a recursive struct. 570 | // NOTE: If a feature with a StructDomain is deprecated, then all the 571 | // child features (features and sparse_features of the StructDomain) are also 572 | // considered to be deprecated. Similarly, child features can only be in 573 | // the environments of the parent feature. 574 | message StructDomain { 575 | repeated Feature feature = 1; 576 | 577 | repeated SparseFeature sparse_feature = 2; 578 | }
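// For illustration, a hedged sketch of a FloatDomain describing an
// embedding feature (the dimension is hypothetical):
//
//   float_domain {
//     disallow_nan: true
//     is_embedding: true
//     embedding_dim: 128
//   }
//
// A 128-dimensional embedding stored as a flat list of floats can then have
// its shape restored downstream.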
579 | 580 | // Encodes information for domains of string values. 581 | message StringDomain { 582 | // Id of the domain. Required if the domain is defined at the schema level. If 583 | // so, then the name must be unique within the schema. 584 | optional string name = 1; 585 | 586 | // The values appearing in the domain. 587 | repeated string value = 2; 588 | 589 | // Currently unused. 590 | // This enum allows the user to specify whether to treat the feature 591 | // as categorical. 592 | enum Categorical { 593 | CATEGORICAL_UNSPECIFIED = 0; 594 | CATEGORICAL_YES = 1; 595 | CATEGORICAL_NO = 2; 596 | } 597 | optional Categorical is_categorical = 3; 598 | } 599 | 600 | // Encodes information about the domain of a boolean attribute that encodes its 601 | // TRUE/FALSE values as strings, or 0=false, 1=true. 602 | // Note that FeatureType could be either INT or BYTES. 603 | message BoolDomain { 604 | // Id of the domain. Required if the domain is defined at the schema level. If 605 | // so, then the name must be unique within the schema. 606 | optional string name = 1; 607 | 608 | // String values for TRUE/FALSE. 609 | optional string true_value = 2; 610 | optional string false_value = 3; 611 | } 612 | 613 | // BEGIN SEMANTIC-TYPES-PROTOS 614 | // Semantic domains are specialized feature domains. For example, a string 615 | // Feature might represent a Time of a specific format. 616 | // Semantic domains are defined as protocol buffers to allow further sub-types / 617 | // specialization, e.g.: NaturalLanguageDomain can provide information on the 618 | // language of the text. 619 | 620 | // Natural language text. 621 | message NaturalLanguageDomain { 622 | // Name of the vocabulary associated with the NaturalLanguageDomain. 623 | // When computing and validating stats using TFDV, 624 | // tfdv.StatsOptions.vocab_paths should map this name to a vocabulary file. 625 | optional string vocabulary = 1; 626 | optional FeatureCoverageConstraints coverage = 2; 627 | repeated SequenceValueConstraints token_constraints = 3; 628 | optional SequenceLengthConstraints sequence_length_constraints = 5; 629 | 630 | reserved 4; 631 | } 632 | 633 | // Image data. 634 | message ImageDomain { 635 | // If set, at least this fraction of values should be TensorFlow supported 636 | // images. 637 | optional float minimum_supported_image_fraction = 1; 638 | 639 | // If set, the undecoded byte size of an image should be less than this value. 640 | optional int64 max_image_byte_size = 2; 641 | } 642 | 643 | // Audio data. 644 | message AudioDomain {} 645 | 646 | // Video data. 647 | message VideoDomain {} 648 | 649 | // ContentChunk data. 650 | message ContentChunkDomain {} 651 | 652 | // Knowledge graph ID, see: https://www.wikidata.org/wiki/Property:P646 653 | message MIDDomain {} 654 | 655 | // A URL, see: https://en.wikipedia.org/wiki/URL 656 | message URLDomain {} 657 | 658 | // Time or date representation. 659 | message TimeDomain { 660 | enum IntegerTimeFormat { 661 | FORMAT_UNKNOWN = 0; 662 | UNIX_DAYS = 5; // Number of days since 1970-01-01. 663 | UNIX_SECONDS = 1; 664 | UNIX_MILLISECONDS = 2; 665 | UNIX_MICROSECONDS = 3; 666 | UNIX_NANOSECONDS = 4; 667 | } 668 | 669 | oneof format { 670 | // Expected format that contains a combination of regular characters and 671 | // special format specifiers. Format specifiers are a subset of the 672 | // strptime standard. 673 | string string_format = 1; 674 | 675 | // Expected format of integer times. 676 | IntegerTimeFormat integer_format = 2; 677 | } 678 | }
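// Two hedged sketches of TimeDomain configurations (assuming "%Y-%m-%d" is
// among the supported strptime-style specifiers):
//
//   time_domain { string_format: "%Y-%m-%d" }
//   time_domain { integer_format: UNIX_SECONDS }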
679 | 680 | // Time of day, without a particular date. 681 | message TimeOfDayDomain { 682 | enum IntegerTimeOfDayFormat { 683 | FORMAT_UNKNOWN = 0; 684 | // Time values, containing hour/minute/second/nanos, encoded into 8-byte 685 | // bit fields following the ZetaSQL convention: 686 | // 6 5 4 3 2 1 687 | // MSB 3210987654321098765432109876543210987654321098765432109876543210 LSB 688 | // | H || M || S ||---------- nanos -----------| 689 | PACKED_64_NANOS = 1; 690 | } 691 | 692 | oneof format { 693 | // Expected format that contains a combination of regular characters and 694 | // special format specifiers. Format specifiers are a subset of the 695 | // strptime standard. 696 | string string_format = 1; 697 | 698 | // Expected format of integer times. 699 | IntegerTimeOfDayFormat integer_format = 2; 700 | } 701 | } 702 | // END SEMANTIC-TYPES-PROTOS 703 | 704 | // Describes the physical representation of a feature. 705 | // It may be different from the logical representation, which 706 | // is represented as a Domain. 707 | enum FeatureType { 708 | TYPE_UNKNOWN = 0; 709 | BYTES = 1; 710 | INT = 2; 711 | FLOAT = 3; 712 | STRUCT = 4; 713 | } 714 | 715 | // Describes constraints on the presence of the feature in the data. 716 | message FeaturePresence { 717 | // Minimum fraction of examples that have this feature. 718 | optional double min_fraction = 1; 719 | // Minimum number of examples that have this feature. 720 | optional int64 min_count = 2; 721 | } 722 | 723 | // Records constraints on the presence of a feature inside a "group" context 724 | // (e.g., .presence inside a group of features that define a sequence). 725 | message FeaturePresenceWithinGroup { 726 | optional bool required = 1; 727 | } 728 | 729 | // Checks that the L-infinity norm between the two discrete distributions is 730 | // below a certain threshold. Since this is applied to a FeatureNameStatistics, 731 | // it only considers the top k. 732 | // L_infty(p,q) = max_i |p_i-q_i| 733 | message InfinityNorm { 734 | // The InfinityNorm is in the interval [0.0, 1.0] so sensible bounds should 735 | // be in the interval [0.0, 1.0). 736 | optional double threshold = 1; 737 | } 738 | 739 | message HistogramSelection { 740 | // Type controls the source of the histogram used for numeric drift and 741 | // skew calculations. Currently the default is STANDARD. Calculations 742 | // based on QUANTILES are more robust to outliers. 743 | enum Type { 744 | DEFAULT = 0; 745 | QUANTILES = 1; 746 | STANDARD = 2; 747 | } 748 | optional Type type = 1; 749 | } 750 | 751 | // Checks that the approximate Jensen-Shannon Divergence between the two 752 | // distributions is below a certain threshold. 753 | message JensenShannonDivergence { 754 | // The JensenShannonDivergence will be in the interval [0.0, 1.0] so sensible 755 | // bounds should be in the interval [0.0, 1.0). 756 | optional double threshold = 1; 757 | optional HistogramSelection source = 2; 758 | }
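// Tying these comparators back to Feature, a hedged sketch (the feature
// name and threshold are illustrative only):
//
//   feature {
//     name: "country"
//     drift_comparator {
//       infinity_norm { threshold: 0.01 }
//     }
//   }
//
// Validation would then flag the feature whenever max_i |p_i - q_i| exceeds
// 0.01 between two consecutive spans.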
759 | 760 | // Checks that the absolute count difference relative to the total count of both 761 | // datasets is small. This metric is appropriate for comparing datasets that 762 | // are expected to have similar absolute counts, and not necessarily just 763 | // similar distributions. 764 | // Computed as max_i | x_i - y_i | / sum_i(x_i + y_i) for aligned datasets 765 | // x and y. Results will be in the interval [0.0, 1.0] so sensible bounds should 766 | // be in the interval [0.0, 1.0). 767 | message NormalizedAbsoluteDifference { 768 | optional double threshold = 1; 769 | } 770 | 771 | message FeatureComparator { 772 | optional InfinityNorm infinity_norm = 1; 773 | optional JensenShannonDivergence jensen_shannon_divergence = 2; 774 | optional NormalizedAbsoluteDifference normalized_abs_difference = 3; 775 | } 776 | 777 | // Checks that the number of unique values is greater than or equal to the min, 778 | // and less than or equal to the max. 779 | message UniqueConstraints { 780 | optional int64 min = 1; 781 | optional int64 max = 2; 782 | } 783 | 784 | // A TensorRepresentation captures the intent for converting columns in a 785 | // dataset to TensorFlow Tensors (or more generally, tf.CompositeTensors). 786 | // Note that one tf.CompositeTensor may consist of data from multiple columns, 787 | // for example, an N-dimensional tf.SparseTensor may need N + 1 columns to 788 | // provide the sparse indices and values. 789 | // Note that the "column name" that a TensorRepresentation needs is a 790 | // string, not a Path -- it means that the column name identifies a top-level 791 | // Feature in the schema (i.e. you cannot specify a Feature nested in a STRUCT 792 | // Feature). 793 | message TensorRepresentation { 794 | message DefaultValue { 795 | oneof kind { 796 | double float_value = 1; 797 | // Note that the data column might be of a shorter integral type. It's the 798 | // user's responsibility to make sure the default value fits that type. 799 | int64 int_value = 2; 800 | bytes bytes_value = 3; 801 | // uint_value should only be used if the default value can't fit in an 802 | // int64 (`int_value`). 803 | uint64 uint_value = 4; 804 | } 805 | } 806 | 807 | // A tf.Tensor. 808 | message DenseTensor { 809 | // Identifies the column in the dataset that provides the values of this 810 | // Tensor. 811 | optional string column_name = 1; 812 | // The shape of each row of the data (i.e. does not include the batch 813 | // dimension). 814 | optional FixedShape shape = 2; 815 | // If this column is missing values in a row, the default_value will be 816 | // used to fill that row. 817 | optional DefaultValue default_value = 3; 818 | } 819 | 820 | // A ragged tf.SparseTensor that models nested lists. 821 | message VarLenSparseTensor { 822 | // Identifies the column in the dataset that should be converted to the 823 | // VarLenSparseTensor. 824 | optional string column_name = 1; 825 | } 826 | 827 | // A tf.SparseTensor whose indices and values come from separate data columns. 828 | // This will replace Schema.sparse_feature eventually. 829 | // The index columns must be of INT type, and all the columns must co-occur 830 | // and have the same valency at the same row. 831 | message SparseTensor { 832 | // The dense shape of the resulting SparseTensor (does not include the batch 833 | // dimension). 834 | optional FixedShape dense_shape = 1; 835 | // The columns constitute the coordinates of the values. 836 | // indices_column[i][j] contains the coordinate of the i-th dimension of the 837 | // j-th value. 838 | repeated string index_column_names = 2; 839 | // The column that contains the values. 840 | optional string value_column_name = 3; 841 | // Specify whether the values are already sorted by their index position. 842 | optional bool already_sorted = 4; 843 | }
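// For illustration, a hedged sketch of a 2-D SparseTensor representation fed
// by hypothetical columns "idx_row", "idx_col" and "val":
//
//   sparse_tensor {
//     dense_shape { dim { size: 10 } dim { size: 10 } }
//     index_column_names: "idx_row"
//     index_column_names: "idx_col"
//     value_column_name: "val"
//   }
//
// Per the co-occurrence rule above, all three columns must have the same
// valency in every row.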
844 | 845 | // A tf.RaggedTensor that models nested lists. 846 | // Currently there is no way for the user to specify the shape of the leaf 847 | // value (the innermost value tensor of the RaggedTensor). The leaf value will 848 | // always be a 1-D tensor. 849 | message RaggedTensor { 850 | // Identifies the leaf feature that provides values of the RaggedTensor; the 851 | // path may traverse struct type sub fields. 852 | // The first step of the path refers to a top-level feature in the data. The 853 | // remaining steps refer to STRUCT features under the top-level feature, 854 | // recursively. 855 | // If the feature has N outer ragged lists, they will become the first 856 | // N dimensions of the resulting RaggedTensor and the contents will become 857 | // the flat_values. 858 | optional Path feature_path = 1; // required. 859 | 860 | // Further partition of the feature values at the leaf level. 861 | message Partition { 862 | oneof kind { 863 | // If the final element(s) of partition are uniform_row_lengths [U0, U1, 864 | // ...], then the result RaggedTensor will have their flat values (a 865 | // dense tensor) being of shape [U0, U1, ...]. Otherwise, a 866 | // uniform_row_length simply means a ragged dimension with row_lengths 867 | // [uniform_row_length]*nrows. 868 | int64 uniform_row_length = 1; 869 | // Identifies a leaf feature that shares the same parent as feature_path 870 | // and contains the partition row lengths. 871 | string row_length = 2; 872 | } 873 | } 874 | // The result RaggedTensor would be of shape: 875 | // [B, D_0, D_1, ..., D_N, P_0, P_1, ..., P_M, U_0, U_1, ..., U_P] 876 | // 877 | // Where the dimensions belong to different categories: 878 | // * B: Batch size dimension 879 | // * D_n: Dimensions specified by the nested structure of the feature path 880 | // up to the leaf node. n>=1. 881 | // * P_m: Dimensions specified by the partitions that do not define any 882 | // fixed dimension size. m>=0. 883 | // * U_p: Dimensions specified by the trailing partitions of type 884 | // uniform_row_length that define the fixed inner shape of the tensor. 885 | // If iterating the partitions from the end to the beginning, these 886 | // dimensions are defined by all the contiguous uniform_row_length 887 | // partitions present. p>=0. 888 | repeated Partition partition = 3; 889 | 890 | // The data type of the ragged tensor's row partitions. This will 891 | // default to INT64 if it is not specified. 892 | optional RowPartitionDType row_partition_dtype = 2; 893 | } 894 | 895 | // RaggedTensor consists of RowPartitions. This enum allows the user to 896 | // specify the dtype of those RowPartitions. If it is UNSPECIFIED, then we 897 | // default to INT64. 898 | enum RowPartitionDType { 899 | UNSPECIFIED = 0; 900 | INT64 = 1; 901 | INT32 = 2; 902 | } 903 | 904 | oneof kind { 905 | DenseTensor dense_tensor = 1; 906 | VarLenSparseTensor varlen_sparse_tensor = 2; 907 | SparseTensor sparse_tensor = 3; 908 | RaggedTensor ragged_tensor = 4; 909 | } 910 | }
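// For illustration, a hedged sketch of a RaggedTensor representation over a
// hypothetical leaf feature "doc.tokens" with a fixed inner dimension:
//
//   ragged_tensor {
//     feature_path { step: "doc" step: "tokens" }
//     partition { uniform_row_length: 4 }
//     row_partition_dtype: INT64
//   }
//
// If "tokens" carries one outer ragged list, the result is shaped
// [B, (ragged), 4], the trailing uniform_row_length forming the dense inner
// dimension.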
911 | 912 | // A TensorRepresentationGroup is a collection of TensorRepresentations with 913 | // names. These names may serve as identifiers when converting the dataset 914 | // to a collection of Tensors or tf.CompositeTensors. 915 | // For example, given the following group: 916 | // { 917 | // key: "dense_tensor" 918 | // tensor_representation { 919 | // dense_tensor { 920 | // column_name: "univalent_feature" 921 | // shape { 922 | // dim { 923 | // size: 1 924 | // } 925 | // } 926 | // default_value { 927 | // float_value: 0 928 | // } 929 | // } 930 | // } 931 | // } 932 | // { 933 | // key: "varlen_sparse_tensor" 934 | // tensor_representation { 935 | // varlen_sparse_tensor { 936 | // column_name: "multivalent_feature" 937 | // } 938 | // } 939 | // } 940 | // 941 | // Then the schema is expected to have features "univalent_feature" and 942 | // "multivalent_feature", and when a batch of data is converted to Tensors using 943 | // this TensorRepresentationGroup, the result may be the following dict: 944 | // { 945 | // "dense_tensor": tf.Tensor(...), 946 | // "varlen_sparse_tensor": tf.SparseTensor(...), 947 | // } 948 | message TensorRepresentationGroup { 949 | map<string, TensorRepresentation> tensor_representation = 1; 950 | } 951 | 952 | message SequenceMetadata { 953 | // This enum specifies whether to treat the feature as a sequence which has 954 | // meaningful element order. 955 | enum SequentialStatus { 956 | SEQUENTIAL_UNSPECIFIED = 0; 957 | SEQUENTIAL_YES = 1; 958 | SEQUENTIAL_NO = 2; 959 | } 960 | optional SequentialStatus sequential_status = 3; 961 | // An arbitrary string defining a "group" of features that could be modeled as 962 | // a single joint sequence. For example, consider a dataset that contains 963 | // three sequential features "purchase_time", "product_id", "purchase_price". 964 | // These belong to the same sequence of purchases and could be modeled 965 | // jointly. Specifying joint_group = "purchase" on all three sequences would 966 | // communicate that the features can be considered part of a single conceptual 967 | // sequence. 968 | optional string joint_group = 4; 969 | // Specifies the maximum sequence length that should be processed. Sequences 970 | // may exceed this limit but are expected to be truncated by modeling layers. 971 | optional int64 sequence_truncation_limit = 5; 972 | } 973 | -------------------------------------------------------------------------------- /tensorflow_metadata/proto/v0/statistics.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | // Definitions for aggregated feature statistics for datasets. 17 | // TODO(b/80075690): make a Javascript build rule for this. 18 | // TODO(b/80075691): migrate Facets to use this. 
19 | syntax = "proto3"; 20 | 21 | package tensorflow.metadata.v0; 22 | 23 | import "google/protobuf/any.proto"; 24 | import "tensorflow_metadata/proto/v0/derived_feature.proto"; 25 | import "tensorflow_metadata/proto/v0/path.proto"; 26 | 27 | option cc_enable_arenas = true; 28 | option java_package = "org.tensorflow.metadata.v0"; 29 | option java_multiple_files = true; 30 | 31 | // Copied from Facets feature_statistics.proto 32 | // Must be kept binary-compatible with the original, until all usages 33 | // are updated to use this version, or we write a proto-to-proto converter. 34 | 35 | // A list of feature statistics for different datasets. If you wish to compare 36 | // different datasets using this list, then the DatasetFeatureStatistics 37 | // entries should all contain the same list of features. 38 | // LINT.IfChange 39 | message DatasetFeatureStatisticsList { 40 | repeated DatasetFeatureStatistics datasets = 1; 41 | } 42 | 43 | // The feature statistics for a single dataset. 44 | message DatasetFeatureStatistics { 45 | // The name of the dataset. 46 | string name = 1; 47 | // The number of examples in the dataset. 48 | uint64 num_examples = 2; 49 | 50 | // Only valid if the weight feature was specified. 51 | // Treats a missing weighted feature as zero. 52 | double weighted_num_examples = 4; 53 | // The feature statistics for the dataset. 54 | repeated FeatureNameStatistics features = 3; 55 | 56 | // Cross feature statistics for the dataset. 57 | repeated CrossFeatureStatistics cross_features = 5; 58 | } 59 | 60 | // NextID: 8 61 | message CrossFeatureStatistics { 62 | // The path of feature x. 63 | Path path_x = 1; 64 | // The path of feature y. 65 | Path path_y = 2; 66 | 67 | // Number of occurrences of this feature cross in the data. If any of 68 | // the features in the cross is missing, the example is ignored. 69 | uint64 count = 3; 70 | 71 | oneof cross_stats { 72 | NumericCrossStatistics num_cross_stats = 4; 73 | CategoricalCrossStatistics categorical_cross_stats = 5; 74 | } 75 | } 76 | 77 | message NumericCrossStatistics { 78 | // Pearson product-moment correlation coefficient. 79 | float correlation = 1; 80 | // Standard covariance. E[(X-E[X])*(Y-E[Y])] 81 | float covariance = 2; 82 | } 83 | 84 | message CategoricalCrossStatistics { 85 | LiftStatistics lift = 1; 86 | } 87 | 88 | message LiftStatistics { 89 | // Lift information for each value of path_y. Lift is defined for each pair of 90 | // values (x,y) as P(path_y=y|path_x=x)/P(path_y=y). 91 | repeated LiftSeries lift_series = 1; 92 | // Weighted lift information for each value of path_y. Weighted lift is 93 | // defined for each pair of values (x,y) as P(path_y=y|path_x=x)/P(path_y=y) 94 | // where probabilities are computed over the weighted example space. 95 | repeated LiftSeries weighted_lift_series = 2; 96 | }
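// A quick numeric illustration of lift (the counts are hypothetical): if y
// appears in 100 of 1,000 examples, P(y) = 0.1; if among the 200 examples
// where x appears, y occurs 30 times, P(y|x) = 0.15; then
//   lift = P(y|x) / P(y) = 0.15 / 0.1 = 1.5,
// i.e. observing x makes y 1.5x as likely.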
97 | 98 | // Container for lift information for a specific y-value. 99 | message LiftSeries { 100 | // A bucket for referring to binned numeric features. 101 | message Bucket { 102 | // The low value of the bucket, inclusive. 103 | double low_value = 1; 104 | // The high value of the bucket, exclusive (unless the high_value is 105 | // positive infinity). 106 | double high_value = 2; 107 | } 108 | 109 | // The particular value of path_y corresponding to this LiftSeries. Each 110 | // element in lift_values corresponds to the lift for a different x_value and 111 | // this specific y_value. 112 | oneof y_value { 113 | int32 y_int = 1; 114 | string y_string = 2; 115 | Bucket y_bucket = 3; 116 | } 117 | 118 | // The number of examples in which y_value appears. 119 | oneof y_count_value { 120 | uint64 y_count = 4; 121 | double weighted_y_count = 5; 122 | } 123 | 124 | // A container for lift information about a specific value of path_x. 125 | message LiftValue { 126 | oneof x_value { 127 | int32 x_int = 1; 128 | string x_string = 2; 129 | } 130 | // P(path_y=y|path_x=x) / P(path_y=y) for x_value and the enclosing y_value. 131 | // In terms of concrete fields, this number represents: 132 | // (x_and_y_count / x_count) / (y_count / num_examples) 133 | double lift = 3; 134 | // The number of examples in which x_value appears. 135 | oneof x_count_value { 136 | uint64 x_count = 4; 137 | double weighted_x_count = 5; 138 | } 139 | // The number of examples in which x_value appears and y_value appears. 140 | oneof x_and_y_count_value { 141 | uint64 x_and_y_count = 6; 142 | double weighted_x_and_y_count = 7; 143 | } 144 | } 145 | 146 | // The lifts for each path_x value and this y_value. 147 | repeated LiftValue lift_values = 6; 148 | } 149 | 150 | // The complete set of statistics for a given feature name for a dataset. 151 | // NextID: 11 152 | message FeatureNameStatistics { 153 | // The types supported by the feature statistics. When aggregating 154 | // tf.Examples, if the bytelist contains a string, it is recommended to encode 155 | // it here as STRING instead of BYTES in order to calculate string-specific 156 | // statistical measures. 157 | enum Type { 158 | INT = 0; 159 | FLOAT = 1; 160 | STRING = 2; 161 | BYTES = 3; 162 | STRUCT = 4; 163 | } 164 | 165 | // One can identify a field either by the name (for simple fields), or by 166 | // a path (for structured fields). Note that: 167 | // name: "foo" 168 | // is equivalent to: 169 | // path: {step:"foo"} 170 | // Note: this oneof must be consistently either name or path across all 171 | // FeatureNameStatistics in one DatasetFeatureStatistics. 172 | oneof field_id { 173 | // The feature name. 174 | string name = 1; 175 | 176 | // The path of the feature. 177 | Path path = 8; 178 | } 179 | 180 | // The data type of the feature. 181 | Type type = 2; 182 | 183 | // The statistics of the values of the feature. 184 | oneof stats { 185 | NumericStatistics num_stats = 3; 186 | StringStatistics string_stats = 4; 187 | BytesStatistics bytes_stats = 5; 188 | StructStatistics struct_stats = 7; 189 | } 190 | 191 | // Any custom statistics can be stored in this list. 192 | repeated CustomStatistic custom_stats = 6; 193 | 194 | // If set, indicates that this feature is derived for validation, and 195 | // stores metadata about its source. 196 | // Experimental and subject to change. 197 | DerivedFeatureSource validation_derived_source = 10; 198 | reserved 9; 199 | } 200 | 201 | // Common weighted statistics for all feature types. Statistics counting number 202 | // of values (i.e., avg_num_values and tot_num_values) include NaNs. 203 | // If the weighted column is missing, then this counts as a weight of 1 204 | // for that example. For nested features with N nested levels (N > 1), the 205 | // statistics counting number of values will rely on the innermost level. 206 | message WeightedCommonStatistics { 207 | // Weighted number of examples not missing. 208 | double num_non_missing = 1; 209 | // Weighted number of examples missing. 210 | // Note that if the weighted column is zero, this does not count 211 | // as missing. 
212 | double num_missing = 2; 213 | // Average number of values, weighted by the number of examples. 214 | // avg_num_values = tot_num_values / num_non_missing. 215 | double avg_num_values = 3; 216 | // The total number of values in this feature. 217 | double tot_num_values = 4; 218 | } 219 | 220 | // Stores the name and value of any custom statistic. The value can be a string, 221 | // double, or histogram. 222 | message CustomStatistic { 223 | string name = 1; 224 | oneof val { 225 | double num = 2; 226 | string str = 3; 227 | Histogram histogram = 4; 228 | RankHistogram rank_histogram = 5; 229 | google.protobuf.Any any = 6; 230 | } 231 | } 232 | 233 | // Statistics for a numeric feature in a dataset. 234 | message NumericStatistics { 235 | CommonStatistics common_stats = 1; 236 | // The mean of the values. 237 | double mean = 2; 238 | // The standard deviation of the values. 239 | double std_dev = 3; 240 | // The number of values that equal 0. 241 | uint64 num_zeros = 4; 242 | // The minimum value. 243 | double min = 5; 244 | // The median value. 245 | double median = 6; 246 | // The maximum value. 247 | double max = 7; 248 | // The histogram(s) of the feature values. 249 | repeated Histogram histograms = 8; 250 | 251 | // Weighted statistics for the feature, if the values have weights. 252 | WeightedNumericStatistics weighted_numeric_stats = 9; 253 | } 254 | 255 | // Statistics for a string feature in a dataset. 256 | message StringStatistics { 257 | CommonStatistics common_stats = 1; 258 | // The number of unique values. 259 | uint64 unique = 2; 260 | 261 | message FreqAndValue { 262 | string value = 2; 263 | 264 | // The number of times the value occurs. Stored as a double to be able to 265 | // handle weighted features. 266 | double frequency = 3; 267 | 268 | // Deleted fields. 269 | reserved 1; 270 | } 271 | // A sorted list of the most-frequent values and their frequencies, with 272 | // the most-frequent being first. 273 | repeated FreqAndValue top_values = 3; 274 | 275 | // The average length of the values. 276 | float avg_length = 4; 277 | 278 | // The rank histogram for the values of the feature. 279 | // The rank is used as a measure of how commonly the value is found in the 280 | // dataset. The most common value would have a rank of 1, with the second-most 281 | // common value having a rank of 2, and so on. 282 | RankHistogram rank_histogram = 5; 283 | 284 | // Weighted statistics for the feature, if the values have weights. 285 | WeightedStringStatistics weighted_string_stats = 6; 286 | 287 | // A vocabulary file, used for vocabularies too large to store in the proto 288 | // itself. Note that the file may be relative to some context-dependent 289 | // directory. E.g. in TFX the feature statistics will live in a PPP and 290 | // vocabulary file names will be relative to this PPP. 291 | string vocabulary_file = 7; 292 | 293 | // Counts the number of invalid utf8 strings present in leaf arrays for this 294 | // feature. Validation is only performed for byte- or string-like features 295 | // (those having type BYTES or STRING). 296 | uint64 invalid_utf8_count = 8; 297 | } 298 | 299 | // Statistics for a feature containing a NL domain. 300 | message NaturalLanguageStatistics { 301 | // Fraction of feature input tokens considered in-vocab. 302 | double feature_coverage = 1; 303 | // Average token length of tokens used by the feature. 304 | double avg_token_length = 2; 305 | // Histogram containing the distribution of token lengths. 
306 | Histogram token_length_histogram = 3; 307 | // Min / max sequence lengths. 308 | int64 min_sequence_length = 10; 309 | int64 max_sequence_length = 11; 310 | // Histogram containing the distribution of sequence lengths. 311 | Histogram sequence_length_histogram = 9; 312 | // Number of sequences which do not match the location constraint. 313 | int64 location_misses = 4; 314 | 315 | // Reported sequences that are sampled from the input and have small 316 | // avg_token_length, low feature coverage, or do not match the location 317 | // regex. 318 | repeated string reported_sequences = 5; 319 | 320 | message TokenStatistics { 321 | // Token for which the statistics are reported. 322 | oneof token { 323 | string string_token = 1; 324 | int64 int_token = 2; 325 | } 326 | 327 | // The number of times the value occurs. Stored as a double to be able to 328 | // handle weighted features. 329 | double frequency = 3; 330 | 331 | // Fraction of sequences containing the token. 332 | double fraction_of_sequences = 4; 333 | // Min number of token occurrences within a sequence. 334 | double per_sequence_min_frequency = 5; 335 | // Average number of token occurrences within a sequence. 336 | double per_sequence_avg_frequency = 6; 337 | // Maximum number of token occurrences within a sequence. 338 | double per_sequence_max_frequency = 7; 339 | // Token positions within a sequence. Normalized by sequence length. 340 | // (e.g. a token that occurs in position 0.5 occurs in the middle of 341 | // a sequence). 342 | Histogram positions = 8; 343 | } 344 | 345 | // Statistics for specified tokens. TokenStatistics are only reported for 346 | // tokens specified in SequenceValueConstraints in the schema. 347 | repeated TokenStatistics token_statistics = 6; 348 | 349 | // The rank histogram for the tokens of the feature. 350 | // The rank is used as a measure of how commonly the token is found in the 351 | // dataset. The most common token would have a rank of 1, with the second-most 352 | // common token having a rank of 2, and so on. 353 | RankHistogram rank_histogram = 7; 354 | WeightedNaturalLanguageStatistics weighted_nl_statistics = 8; 355 | } 356 | 357 | // Statistics for a weighted numeric feature in a dataset. 358 | message WeightedNumericStatistics { 359 | // The weighted mean of the values. 360 | double mean = 1; 361 | // The weighted standard deviation of the values. 362 | double std_dev = 2; 363 | // The weighted median of the values. 364 | double median = 3; 365 | 366 | // The histogram(s) of the weighted feature values. 367 | repeated Histogram histograms = 4; 368 | } 369 | 370 | // Statistics for a weighted string feature in a dataset. 371 | message WeightedStringStatistics { 372 | // A sorted list of the most-frequent values and their weighted frequencies, 373 | // with the most-frequent being first. 374 | repeated StringStatistics.FreqAndValue top_values = 1; 375 | 376 | // The rank histogram for the weighted values of the feature. 377 | RankHistogram rank_histogram = 2; 378 | } 379 | 380 | // Statistics for a weighted feature with an NL domain. 381 | message WeightedNaturalLanguageStatistics { 382 | // Weighted feature coverage. 383 | double feature_coverage = 1; 384 | // Weighted average token length. 385 | double avg_token_length = 2; 386 | // Histogram containing the distribution of token lengths. 387 | Histogram token_length_histogram = 3; 388 | // Histogram containing the distribution of sequence lengths. 
389 | Histogram sequence_length_histogram = 9; 390 | // Weighted number of sequences that do not match the location constraint. 391 | double location_misses = 4; 392 | // Per-token weighted statistics. 393 | NaturalLanguageStatistics.TokenStatistics token_statistics = 5; 394 | // The rank histogram with the weighted tokens for the feature. 395 | RankHistogram rank_histogram = 6; 396 | } 397 | 398 | // Statistics for a bytes feature in a dataset. 399 | message BytesStatistics { 400 | CommonStatistics common_stats = 1; 401 | // The number of unique values. 402 | uint64 unique = 2; 403 | 404 | // The average number of bytes in a value. 405 | float avg_num_bytes = 3; 406 | // The minimum number of bytes in a value. 407 | float min_num_bytes = 4; 408 | // The maximum number of bytes in a value. 409 | float max_num_bytes = 5; 410 | // The maximum number of bytes in a value, as an int. Float will start having 411 | // a loss of precision for a large enough integer. This field preserves the 412 | // precision. 413 | int64 max_num_bytes_int = 6; 414 | } 415 | 416 | message StructStatistics { 417 | CommonStatistics common_stats = 1; 418 | } 419 | 420 | // Statistics about the presence and valency of feature values. Feature values 421 | // could be nested lists. A feature in tf.Examples or other "flat" datasets has 422 | // values of nest level 1 -- they are lists of primitives. A nest level N 423 | // (N > 1) feature value is a list of lists of nest level (N - 1). 424 | // This proto can be used to describe the presence and valency of values at each 425 | // level. 426 | message PresenceAndValencyStatistics { 427 | // Note: missing and non-missing counts are conditioned on the upper level 428 | // being non-missing (i.e. if the upper level is missing/null, all the levels 429 | // nested below are by definition missing, but not counted). 430 | // Number of non-missing (not-null) values. 431 | uint64 num_non_missing = 1; 432 | // Number of missing (null) values. 433 | uint64 num_missing = 2; 434 | // Minimum length of the values (note that nulls are not considered). 435 | uint64 min_num_values = 3; 436 | // Maximum length of the values. 437 | uint64 max_num_values = 4; 438 | // Total number of values. 439 | uint64 tot_num_values = 5; 440 | } 441 | 442 | // Common statistics for all feature types. Statistics counting number of values 443 | // (i.e., min_num_values, max_num_values, avg_num_values, and tot_num_values) 444 | // include NaNs. For nested features with N nested levels (N > 1), the 445 | // statistics counting number of values will rely on the innermost level. 446 | message CommonStatistics { 447 | // The number of examples that include this feature. Note that this includes 448 | // examples that contain this feature with an explicitly empty list of values, 449 | // which may be permitted for variable length features. 450 | uint64 num_non_missing = 1; 451 | // The number of examples missing this feature. 452 | uint64 num_missing = 2; 453 | // The minimum number of values in a single example for this feature. 454 | uint64 min_num_values = 3; 455 | // The maximum number of values in a single example for this feature. 456 | uint64 max_num_values = 4; 457 | // The average number of values in a single example for this feature. 458 | // avg_num_values = tot_num_values / num_non_missing. 459 | float avg_num_values = 5; 460 | // The total number of values in this feature. 461 | uint64 tot_num_values = 8;
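// A quick numeric check of the identity above (the counts are
// hypothetical): with tot_num_values = 300 spread over num_non_missing =
// 100 examples, avg_num_values = 300 / 100 = 3.0, regardless of how many
// examples are missing the feature entirely.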
462 | // The quantiles histogram for the number of values in this feature. 463 | Histogram num_values_histogram = 6; 464 | WeightedCommonStatistics weighted_common_stats = 7; 465 | // The histogram for the number of features in the feature list (only set if 466 | // this feature is a non-context feature from a tf.SequenceExample). 467 | // This is different from num_values_histogram, as num_values_histogram tracks 468 | // the count of all values for a feature in an example, whereas this tracks 469 | // the length of the feature list for this feature in an example (where each 470 | // feature list can contain multiple values). 471 | Histogram feature_list_length_histogram = 9; 472 | 473 | // Contains presence and valency stats for each nest level of the feature. 474 | // The first item corresponds to the outermost level, and by definition, 475 | // the stats it contains equal the corresponding stats defined above. 476 | // May not be populated if the feature is of nest level 1. 477 | repeated PresenceAndValencyStatistics presence_and_valency_stats = 10; 478 | 479 | // If not empty, it's parallel to presence_and_valency_stats. 480 | repeated WeightedCommonStatistics weighted_presence_and_valency_stats = 11; 481 | } 482 | 483 | // The data used to create a histogram of a numeric feature for a dataset. 484 | message Histogram { 485 | // Each bucket defines its low and high values along with its count. The 486 | // low and high values must be a real number or positive or negative 487 | // infinity. They cannot be NaN or undefined. Counts of those special values 488 | // can be found in the num_nan and num_undefined fields. 489 | message Bucket { 490 | // The low value of the bucket, exclusive except for the first bucket. 491 | double low_value = 1; 492 | // The high value of the bucket, inclusive. 493 | double high_value = 2; 494 | 495 | // The number of items in the bucket. Stored as a double to be able to 496 | // handle weighted histograms. 497 | double sample_count = 4; 498 | 499 | // Deleted fields. 500 | reserved 3; 501 | } 502 | 503 | // The number of NaN values in the dataset. 504 | uint64 num_nan = 1; 505 | // The number of undefined values in the dataset. 506 | uint64 num_undefined = 2; 507 | 508 | // A list of buckets in the histogram, sorted from lowest bucket to highest 509 | // bucket. 510 | repeated Bucket buckets = 3; 511 | 512 | // The type of the histogram. A standard histogram has equal-width buckets. 513 | // The quantiles type is used when the histogram message is used to store 514 | // quantile information (by using approximately equal-count buckets with 515 | // variable widths). 516 | enum HistogramType { 517 | STANDARD = 0; 518 | QUANTILES = 1; 519 | } 520 | 521 | // The type of the histogram. 522 | HistogramType type = 4; 523 | 524 | // An optional descriptive name of the histogram, to be used for labeling. 525 | string name = 5; 526 | } 527 | 528 | // The data used to create a rank histogram of a non-numeric feature of a 529 | // dataset. The rank of a value in a feature can be used as a measure of how 530 | // commonly the value is found in the entire dataset. With bucket sizes of one, 531 | // this becomes a distribution function of all feature values. 532 | message RankHistogram { 533 | // Each bucket defines its start and end ranks along with its count. 534 | message Bucket { 535 | // The low rank of the bucket, inclusive. 536 | uint64 low_rank = 1; 537 | // The high rank of the bucket, exclusive. 538 | uint64 high_rank = 2; 539 | 540 | // The label for the bucket. 
Can be used to list or summarize the values in 541 | // this rank bucket. 542 | string label = 4; 543 | 544 | // The number of items in the bucket. Stored as a double to be able to 545 | // handle weighted histograms. 546 | double sample_count = 5; 547 | 548 | // Deleted fields. 549 | reserved 3; 550 | } 551 | 552 | // A list of buckets in the histogram, sorted from lowest-ranked bucket to 553 | // highest-ranked bucket. 554 | repeated Bucket buckets = 1; 555 | 556 | // An optional descriptive name of the histogram, to be used for labeling. 557 | string name = 2; 558 | } 559 | // LINT.ThenChange(//tfx_bsl/cc/statistics/merge_util.cc) 560 | -------------------------------------------------------------------------------- /tensorflow_metadata/python/proto_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Tests for tensorflow_metadata.python.proto.""" 15 | 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | from absl.testing import absltest 21 | from tensorflow_metadata.proto.v0 import schema_pb2 22 | 23 | 24 | class ProtoTest(absltest.TestCase): 25 | 26 | def test_import_works(self): 27 | """Checks that the import of the tensorflow_metadata module works.""" 28 | # pylint:disable=unused-variable 29 | # We don't explicitly check all the symbols we know about now, because we 30 | # don't want to have to keep this test in sync with changes to the 31 | # underlying library. 32 | # Check for the presence of the Schema symbol. 33 | dummy = schema_pb2.Schema 34 | del dummy 35 | 36 | 37 | if __name__ == '__main__': 38 | absltest.main() 39 | -------------------------------------------------------------------------------- /tensorflow_metadata/version.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Contains the version string of TFMD.""" 15 | 16 | # Note that setup.py uses this version. 
17 | __version__ = '1.18.0.dev' 18 | -------------------------------------------------------------------------------- /tools/build_tfmd_docs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | r"""Generate api reference docs for `tfmd`. 15 | 16 | This requires a local installation of `tfmd` and `tensorflow_docs`: 17 | 18 | ``` 19 | $ pip install tensorflow_metadata git+https://github.com/tensorflow/docs 20 | ``` 21 | 22 | ``` 23 | python build_tfmd_docs.py --output_dir=/tmp/tfmd-api 24 | ``` 25 | 26 | """ 27 | from absl import app 28 | from absl import flags 29 | 30 | from tensorflow_docs.api_generator import generate_lib 31 | import tensorflow_metadata as tfmd 32 | 33 | # `.proto` (which contains all the classes) is not imported by default. 34 | import tensorflow_metadata.proto # pylint: disable=unused-import 35 | 36 | _OUTPUT_DIR = flags.DEFINE_string('output_dir', '/tmp/tfmd_api/', 37 | 'The path to output the files to') 38 | 39 | _CODE_URL_PREFIX = flags.DEFINE_string( 40 | 'code_url_prefix', 41 | 'https://github.com/tensorflow/metadata/tree/master/tensorflow_metadata/proto', 42 | 'The url prefix for links to code.') 43 | 44 | _SEARCH_HINTS = flags.DEFINE_bool( 45 | 'search_hints', True, 46 | 'Include metadata search hints in the generated files') 47 | 48 | _SITE_PATH = flags.DEFINE_string( 49 | 'site_path', 50 | 'tfx/tensorflow_metadata/api_docs/python', 51 | 'Path prefix in the _toc.yaml') 52 | 53 | 54 | def main(args): 55 | if args[1:]: 56 | raise ValueError('Unrecognized command line args', args[1:]) 57 | 58 | doc_generator = generate_lib.DocGenerator( 59 | root_title='TF-Metadata', 60 | py_modules=[('tfmd.proto', tfmd.proto)], 61 | code_url_prefix=_CODE_URL_PREFIX.value, 62 | search_hints=_SEARCH_HINTS.value, 63 | site_path=_SITE_PATH.value, 64 | callbacks=[]) 65 | 66 | doc_generator.build(_OUTPUT_DIR.value) 67 | 68 | 69 | if __name__ == '__main__': 70 | app.run(main) 71 | --------------------------------------------------------------------------------