├── oss_scripts ├── pip_package │ ├── MANIFEST.in │ ├── BUILD │ ├── build_pip_package.sh │ └── setup.py └── configure.sh ├── third_party ├── tensorflow │ ├── BUILD │ ├── BUILD.tpl │ └── tf_configure.bzl └── icu │ ├── BUILD │ ├── BUILD.system │ ├── workspace.bzl │ └── BUILD.bazel ├── tensorflow_text ├── workspace.bzl ├── python │ ├── __init__.py │ ├── numpy │ │ ├── __init__.py │ │ └── viterbi_decode.py │ └── ops │ │ ├── coerce_to_valid_utf8_op_test.py │ │ ├── __init__.py │ │ ├── tokenization.py │ │ ├── string_ops.py │ │ ├── normalize_ops.py │ │ ├── ngrams_op.py │ │ ├── normalize_ops_test.py │ │ ├── create_feature_bitmask_op.py │ │ ├── sentence_breaking_ops.py │ │ ├── sliding_window_op.py │ │ ├── create_feature_bitmask_op_test.py │ │ └── ngrams_op_test.py ├── __init__.py └── core │ ├── ops │ ├── normalize_ops.cc │ ├── sentence_breaking_ops.cc │ ├── whitespace_tokenize_op.cc │ ├── unicode_script_tokenize_op.cc │ ├── wordpiece_op.cc │ └── constrained_sequence_op.cc │ └── kernels │ ├── wordpiece_tokenizer.h │ ├── whitespace_tokenize_kernel_test.cc │ ├── unicode_script_tokenize_kernel_test.cc │ ├── text_kernels_test_util.cc │ ├── sentence_breaking_utils.h │ ├── wordpiece_tokenizer.cc │ ├── text_kernels_test_util.h │ └── normalize_kernels.cc ├── docs └── api_docs │ └── python │ ├── text │ ├── Reduction.md │ ├── normalize_utf8.md │ ├── wordshape.md │ ├── case_fold_utf8.md │ ├── coerce_to_structurally_valid_utf8.md │ ├── gather_with_default.md │ ├── ngrams.md │ ├── pad_along_dimension.md │ ├── sentence_fragments.md │ ├── span_overlaps.md │ ├── Tokenizer.md │ ├── sliding_window.md │ ├── TokenizerWithOffsets.md │ ├── greedy_constrained_sequence.md │ ├── viterbi_constrained_sequence.md │ ├── WhitespaceTokenizer.md │ ├── span_alignment.md │ ├── UnicodeScriptTokenizer.md │ ├── _api_cache.json │ └── WordpieceTokenizer.md │ ├── index.md │ ├── _toc.yaml │ └── text.md ├── .bazelrc ├── CONTRIBUTING.md └── WORKSPACE /oss_scripts/pip_package/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tensorflow_text/ *.so 2 | -------------------------------------------------------------------------------- /third_party/tensorflow/BUILD: -------------------------------------------------------------------------------- 1 | # Needed for Bazel to treat this directory as a package 2 | -------------------------------------------------------------------------------- /third_party/icu/BUILD: -------------------------------------------------------------------------------- 1 | # This empty BUILD file is required to make Bazel treat this directory as a package. 2 | -------------------------------------------------------------------------------- /tensorflow_text/workspace.bzl: -------------------------------------------------------------------------------- 1 | """doc""" 2 | 3 | load("//third_party/icu:workspace.bzl", icu = "repo") 4 | 5 | def initialize_third_party_archives(): 6 | icu() 7 | -------------------------------------------------------------------------------- /oss_scripts/pip_package/BUILD: -------------------------------------------------------------------------------- 1 | # Tools for building the TF.Text pip package. 
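# A typical build flow, as a sketch (paths assume the standard Bazel output
# layout; the wheel lands in /tmp/tensorflow_text_pkg by default):
#   ./oss_scripts/configure.sh
#   bazel build oss_scripts/pip_package:build_pip_package
#   bazel-bin/oss_scripts/pip_package/build_pip_package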
2 | 3 | package(default_visibility = ["//visibility:private"]) 4 | 5 | licenses(["notice"]) # Apache 2.0 6 | 7 | sh_binary( 8 | name = "build_pip_package", 9 | srcs = ["build_pip_package.sh"], 10 | data = [ 11 | "LICENSE", 12 | "MANIFEST.in", 13 | "setup.py", 14 | "//tensorflow_text:tf-text", 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /third_party/tensorflow/BUILD.tpl: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//visibility:public"]) 2 | 3 | cc_library( 4 | name = "tf_header_lib", 5 | hdrs = [":tf_header_include"], 6 | includes = ["include"], 7 | visibility = ["//visibility:public"], 8 | ) 9 | 10 | cc_library( 11 | name = "libtensorflow_framework", 12 | srcs = [":libtensorflow_framework.so.1"], 13 | #data = ["lib/libtensorflow_framework.so.1"], 14 | visibility = ["//visibility:public"], 15 | ) 16 | 17 | %{TF_HEADER_GENRULE} 18 | %{TF_SHARED_LIBRARY_GENRULE} 19 | -------------------------------------------------------------------------------- /third_party/icu/BUILD.system: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = ["//visibility:public"], 3 | ) 4 | 5 | licenses(["notice"]) # Apache 2.0 6 | 7 | filegroup( 8 | name = "icu4c/LICENSE", 9 | ) 10 | 11 | filegroup( 12 | name = "icu4j/main/shared/licenses/LICENSE", 13 | ) 14 | 15 | cc_library( 16 | name = "headers", 17 | ) 18 | 19 | cc_library( 20 | name = "common", 21 | deps = [ 22 | ":icuuc", 23 | ], 24 | ) 25 | 26 | cc_library( 27 | name = "icuuc", 28 | linkopts = ["-licuuc"], 29 | visibility = ["//visibility:private"], 30 | ) 31 | -------------------------------------------------------------------------------- /tensorflow_text/python/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Empty file required by setuptools.find_packages to recognize this as a package 17 | -------------------------------------------------------------------------------- /tensorflow_text/python/numpy/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Numpy-based code for text processing.""" 17 | 18 | from tensorflow_text.python.numpy import viterbi_decode 19 | -------------------------------------------------------------------------------- /third_party/icu/workspace.bzl: -------------------------------------------------------------------------------- 1 | """Loads a lightweight subset of the ICU library for Unicode processing.""" 2 | 3 | load("@org_tensorflow//third_party:repo.bzl", "third_party_http_archive") 4 | 5 | # Sanitize a dependency so that it works correctly from code that includes 6 | # TensorFlow as a submodule. 7 | def clean_dep(dep): 8 | return str(Label(dep)) 9 | 10 | def repo(): 11 | third_party_http_archive( 12 | name = "icu", 13 | strip_prefix = "icu-release-62-1", 14 | sha256 = "e15ffd84606323cbad5515bf9ecdf8061cc3bf80fb883b9e6aa162e485aa9761", 15 | urls = [ 16 | "http://mirror.tensorflow.org/github.com/unicode-org/icu/archive/release-62-1.tar.gz", 17 | "https://github.com/unicode-org/icu/archive/release-62-1.tar.gz", 18 | ], 19 | build_file = "//third_party/icu:BUILD.bazel", 20 | system_build_file = "//third_party/icu:BUILD.system", 21 | ) 22 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/Reduction.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 |
8 | 9 | # text.Reduction 10 | 11 | ## Class `Reduction` 12 | 13 | Type of reduction to be done by the ngram op. 14 | 15 | Defined in 16 | [`python/ops/ngrams_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/ngrams_op.py). 17 | 18 | 19 | 20 | The supported reductions are as follows: 21 | 22 | * `Reduction.SUM`: Add values in the window. 23 | * `Reduction.MEAN`: Average values in the window. 24 | * `Reduction.STRING_JOIN`: Join strings in the window. 25 | 26 | ## Class Members 27 | 28 |

* `MEAN`
29 | 30 | * `STRING_JOIN`
31 | 32 | * `SUM`
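
For orientation, a minimal sketch of passing one of these members to the ngram op (it assumes `import tensorflow as tf` and `import tensorflow_text as text`; the printed `RaggedTensor` form is abbreviated):

```python
>>> import tensorflow as tf
>>> import tensorflow_text as text
>>> data = tf.ragged.constant([[1.0, 2.0, 3.0], [4.0, 5.0]])
>>> # Sum each window of 2 adjacent values along the last axis.
>>> text.ngrams(data, width=2, axis=-1, reduction_type=text.Reduction.SUM)
<tf.RaggedTensor [[3.0, 5.0], [9.0]]>
```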

33 | 34 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/normalize_utf8.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.normalize_utf8 7 | 8 | Normalizes each UTF8 string in the input tensor using the specified rule. 9 | 10 | ``` python 11 | text.normalize_utf8( 12 | input, 13 | normalization_form='NFKC', 14 | name=None 15 | ) 16 | ``` 17 | 18 | Defined in 19 | [`python/ops/normalize_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/normalize_ops.py). 20 | 21 | 22 | 23 | See http://unicode.org/reports/tr15/ 24 | 25 | #### Args: 26 | 27 | * `input`: A `Tensor` or `RaggedTensor` of type string. (Must be 28 | UTF-8.) 29 | * `normalization_form`: One of the following string values ('NFC', 30 | 'NFKC', 'NFD', 'NFKD'). Default is 'NFKC'. 31 | * `name`: The name for this op (optional) 32 | 33 | #### Returns: 34 | 35 | A `Tensor` or `RaggedTensor` of type string, with normalized contents. 36 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/wordshape.md: -------------------------------------------------------------------------------- 1 |
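For the `text.normalize_utf8` op documented above, a minimal usage sketch (it assumes `import tensorflow_text as text` and eager execution; the printed tensor form is abbreviated):

```python
>>> import tensorflow_text as text
>>> # NFKC (the default) expands compatibility characters such as the
>>> # 'fi' ligature and fullwidth letters.
>>> text.normalize_utf8([u"ﬁne", u"ＡＢＣ"])
<tf.Tensor: ... numpy=array([b'fine', b'ABC'], dtype=object)>
```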
2 | 3 | 4 |
5 | 6 | # text.wordshape 7 | 8 | Determine wordshape features for each input string. 9 | 10 | ``` python 11 | text.wordshape( 12 | input_tensor, 13 | pattern, 14 | name=None 15 | ) 16 | ``` 17 | 18 | Defined in 19 | [`python/ops/wordshape_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/wordshape_ops.py). 20 | 21 | 22 | 23 | #### Args: 24 | 25 | * `input_tensor`: string `Tensor` with any shape. 26 | * `pattern`: A `~tftext.WordShape` or a list of WordShapes. 27 | * `name`: A name for the operation (optional). 28 | 29 | 30 | #### Returns: 31 | 32 | `[input_tensor.shape + pattern.shape]`: A tensor where 33 | `result[i1...iN, j]` is true if `input_tensor[i1...iN]` has the wordshape 34 | specified by `pattern[j]`. 35 | 36 | #### Raises: 37 | 38 | * `ValueError`: If `pattern` contains an unknown identifier. -------------------------------------------------------------------------------- /.bazelrc: -------------------------------------------------------------------------------- 1 | # TensorFlow Federated Bazel configuration. 2 | # 3 | # See https://docs.bazel.build/versions/master/user-manual.html#config for 4 | # details on the various configuration options. 5 | 6 | # Build with modular op registration support by default. 7 | build --define=framework_shared_object=true 8 | 9 | # Bazel workaround to compile gRPC with the new 'cares' package. 10 | build --define=grpc_no_ares=true 11 | 12 | # Build with optimization enabled. 13 | build --compilation_mode=opt 14 | 15 | # Processor native optimizations (depends on build host capabilities). 16 | build --copt=-march=native 17 | build --host_copt=-march=native 18 | build --copt=-O3 19 | build --copt=-Wno-sign-compare 20 | build --define with_default_optimizations=true 21 | 22 | # Disable Tensorflow extensions that are not needed for Tensorflow Federated. 23 | build --define=no_aws_support=true 24 | build --define=no_hdfs_support=true 25 | build --define=no_kafka_support=true 26 | build --define=no_ignite_support=true 27 | build --define=no_nccl_support=true 28 | 29 | # Misc configuration 30 | build:xla --define with_xla_support=true 31 | build:v2 --define=tf_api_version=2 32 | build --action_env TF_CONFIGURE_IOS="0" 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 
24 | 25 | ## Community Guidelines 26 | 27 | This project follows 28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/case_fold_utf8.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.case_fold_utf8 7 | 8 | Applies case folding to every UTF8 string in the input. 9 | 10 | ``` python 11 | text.case_fold_utf8( 12 | input, 13 | name=None 14 | ) 15 | ``` 16 | 17 | Defined in 18 | [`python/ops/normalize_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/normalize_ops.py). 19 | 20 | 21 | 22 | The input is a `Tensor` or `RaggedTensor` of any shape, and the resulting output 23 | has the same shape as the input. Note that NFKC normalization is implicitly 24 | applied to the strings. 25 | 26 | #### For example: 27 | 28 | ```python 29 | >>> case_fold_utf8(['The Quick-Brown', 30 | ... 'CAT jumped over', 31 | ... 'the lazy dog !! '] 32 | tf.Tensor(['The quick-brown' 'cat jumped over' 'the lazy dog !! '], 33 | shape=(3,), dtype=string) 34 | ``` 35 | 36 | #### Args: 37 | 38 | * `input`: A `Tensor` or `RaggedTensor` of type string. (Must be 39 | UTF-8.) 40 | * `name`: The name for this op (optional) 41 | 42 | #### Returns: 43 | 44 | A `Tensor` or `RaggedTensor` of type string, with case-folded contents. 45 | -------------------------------------------------------------------------------- /oss_scripts/configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | function write_to_bazelrc() { 18 | echo "$1" >> .bazelrc 19 | } 20 | 21 | function write_action_env_to_bazelrc() { 22 | write_to_bazelrc "build --action_env $1=\"$2\"" 23 | } 24 | 25 | if python -c "import tensorflow" &> /dev/null; then 26 | echo 'using installed tensorflow' 27 | else 28 | rm .bazelrc 29 | pip install tensorflow-2.0.0b0 30 | fi 31 | 32 | TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') ) 33 | TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') ) 34 | 35 | write_action_env_to_bazelrc "TF_HEADER_DIR" ${TF_CFLAGS:2} 36 | write_action_env_to_bazelrc "TF_SHARED_LIBRARY_DIR" ${TF_LFLAGS:2} 37 | -------------------------------------------------------------------------------- /tensorflow_text/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Various tensorflow ops related to text-processing.""" 17 | from tensorflow.python.util.all_util import remove_undocumented 18 | 19 | # pylint: disable=wildcard-import,g-import-not-at-top 20 | from tensorflow_text.python.ops import * 21 | 22 | _allowed_symbols = [ 23 | "case_fold_utf8", 24 | "coerce_to_structurally_valid_utf8", 25 | "gather_with_default", 26 | "greedy_constrained_sequence", 27 | "ngrams", 28 | "normalize_utf8", 29 | "pad_along_dimension", 30 | "Reduction", 31 | "sentence_fragments", 32 | "sliding_window", 33 | "span_alignment", 34 | "span_overlaps", 35 | "Tokenizer", 36 | "TokenizerWithOffsets", 37 | "UnicodeScriptTokenizer", 38 | "viterbi_constrained_sequence", 39 | "WhitespaceTokenizer", 40 | "wordshape", 41 | "WordShape", 42 | "WordpieceTokenizer", 43 | ] 44 | 45 | remove_undocumented(__name__, _allowed_symbols) 46 | -------------------------------------------------------------------------------- /oss_scripts/pip_package/build_pip_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Tool to build the TensorFlow Text pip package. 3 | # 4 | # Usage: 5 | # bazel build oss_scripts/pip_package:build_pip_package 6 | # bazel-bin/oss_scripts/build_pip_package 7 | # 8 | # Arguments: 9 | # output_dir: An output directory. Defaults to `/tmp/tensorflow_text_pkg`. 10 | 11 | set -e # fail and exit on any command erroring 12 | 13 | die() { 14 | echo >&2 "$@" 15 | exit 1 16 | } 17 | 18 | main() { 19 | local output_dir="$1" 20 | 21 | if [[ -z "${output_dir}" ]]; then 22 | output_dir="/tmp/tensorflow_text_pkg" 23 | fi 24 | mkdir -p ${output_dir} 25 | output_dir=$(readlink -f "${output_dir}") 26 | echo "=== Destination directory: ${output_dir}" 27 | 28 | if [[ ! -d "bazel-bin/tensorflow_text" ]]; then 29 | die "Could not find bazel-bin. Did you run from the root of the build tree?" 30 | fi 31 | 32 | local temp_dir="$(mktemp -d)" 33 | trap "rm -rf ${temp_dir}" EXIT 34 | echo "=== Using tmpdir ${temp_dir}" 35 | 36 | local runfiles="bazel-bin/oss_scripts/pip_package/build_pip_package.runfiles" 37 | cp -LR \ 38 | "${runfiles}/org_tensorflow_text/tensorflow_text" \ 39 | "${temp_dir}" 40 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/setup.py" \ 41 | "${temp_dir}" 42 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/MANIFEST.in" \ 43 | "${temp_dir}" 44 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/LICENSE" \ 45 | "${temp_dir}" 46 | 47 | pushd "${temp_dir}" > /dev/null 48 | 49 | # Build pip package 50 | python setup.py bdist_wheel --universal 51 | cp dist/*.whl "${output_dir}" 52 | } 53 | 54 | main "$@" 55 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/coerce_to_structurally_valid_utf8.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.coerce_to_structurally_valid_utf8 7 | 8 | Coerce UTF-8 input strings to structurally valid UTF-8. 9 | 10 | ``` python 11 | text.coerce_to_structurally_valid_utf8( 12 | input, 13 | replacement_char=_unichr(65533), 14 | name=None 15 | ) 16 | ``` 17 | 18 | Defined in 19 | [`python/ops/string_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/string_ops.py). 20 | 21 | 22 | 23 | Any bytes which cause the input string to be invalid UTF-8 are substituted 24 | with the provided replacement character codepoint (default 65533). Use a 25 | single byte replacement character codepoint to preserve alignment to the 26 | source input string. 27 | 28 | #### Args: 29 | 30 | * `input`: UTF-8 string tensor to coerce to valid UTF-8. 31 | * `replacement_char`: The replacement character to be used in place of 32 | any invalid byte in the input. Any valid Unicode character may be used. The 33 | default value is the default Unicode replacement character which is 0xFFFD 34 | (or U+65533). Note that passing a replacement character expressible in 1 35 | byte, such as ' ' or '?', will preserve string alignment to the source since 36 | individual invalid bytes will be replaced with a 1-byte replacement. 37 | (optional) 38 | * `name`: A name for the operation (optional). 39 | 40 | #### Returns: 41 | 42 | A tensor of type string with the same shape as the input. 43 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/gather_with_default.md: -------------------------------------------------------------------------------- 1 |
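For the `text.coerce_to_structurally_valid_utf8` op described above, a small sketch of the replacement behaviour (assuming `tensorflow_text` is imported as `text`; outputs shown as Python byte strings):

```python
>>> text.coerce_to_structurally_valid_utf8([b"abc\xfd"])        # 0xFD is not valid UTF-8
[b'abc\xef\xbf\xbd']                                            # U+FFFD replacement character
>>> text.coerce_to_structurally_valid_utf8([b"abc\xfd"], "?")   # 1-byte replacement keeps alignment
[b'abc?']
```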
2 | 3 | 4 |
5 | 6 | # text.gather_with_default 7 | 8 | Gather slices with `indices=-1` mapped to `default`. 9 | 10 | ``` python 11 | text.gather_with_default( 12 | params, 13 | indices, 14 | default, 15 | name=None, 16 | axis=0 17 | ) 18 | ``` 19 | 20 | Defined in 21 | [`python/ops/pointer_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pointer_ops.py). 22 | 23 | 24 | 25 | This operation is similar to `tf.gather()`, except that any value of `-1` 26 | in `indices` will be mapped to `default`. Example: 27 | 28 | ```python 29 | >>> gather_with_default(['a', 'b', 'c', 'd'], [2, 0, -1, 2, -1], '_').eval() 30 | array(['c', 'a', '_', 'c', '_'], dtype=object) 31 | ``` 32 | 33 | #### Args: 34 | 35 | * `params`: The `Tensor` from which to gather values. Must be at least 36 | rank `axis + 1`. 37 | * `indices`: The index `Tensor`. Must have dtype `int32` or `int64`, 38 | and values must be in the range `[-1, params.shape[axis])`. 39 | * `default`: The value to use when `indices` is `-1`. `default.shape` 40 | must be equal to `params.shape[axis + 1:]`. 41 | * `name`: A name for the operation (optional). 42 | * `axis`: The axis in `params` to gather `indices` from. Must be a 43 | scalar `int32` or `int64`. Supports negative indices. 44 | 45 | #### Returns: 46 | 47 | A `Tensor` with the same type as `param`, and with shape `params.shape[:axis] + 48 | indices.shape + params.shape[axis + 1:]`. 49 | -------------------------------------------------------------------------------- /docs/api_docs/python/index.md: -------------------------------------------------------------------------------- 1 | # All symbols in TensorFlow Text 2 | 3 | * text 4 | * text.Reduction 5 | * text.Tokenizer 6 | * text.TokenizerWithOffsets 7 | * text.UnicodeScriptTokenizer 8 | * text.WhitespaceTokenizer 9 | * text.WordShape 10 | * text.WordpieceTokenizer 11 | * text.case_fold_utf8 12 | * text.coerce_to_structurally_valid_utf8 13 | * text.gather_with_default 14 | * text.greedy_constrained_sequence 15 | * text.ngrams 16 | * text.normalize_utf8 17 | * text.pad_along_dimension 18 | * text.sentence_fragments 19 | * text.sliding_window 20 | * text.span_alignment 21 | * text.span_overlaps 22 | * text.viterbi_constrained_sequence 23 | * text.wordshape 24 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/normalize_ops.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
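// Op registrations for the UTF-8 normalization ops (CaseFoldUTF8 and
// NormalizeUTF8). Only the op interfaces and shape functions are declared
// here; the kernel implementations live in core/kernels/normalize_kernels.cc.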
14 | 15 | #include "tensorflow/core/framework/common_shape_fns.h" 16 | #include "tensorflow/core/framework/op.h" 17 | #include "tensorflow/core/framework/shape_inference.h" 18 | 19 | namespace tensorflow { 20 | namespace text { 21 | 22 | REGISTER_OP("CaseFoldUTF8") 23 | .Input("input: string") 24 | .Output("output: string") 25 | .SetShapeFn(::tensorflow::shape_inference::UnchangedShape) 26 | .Doc(R"doc( 27 | Applies case folding to every UTF8 string in input_tensor. The input is a dense 28 | tensor of any shape and the output has the same shape as the input. 29 | 30 | For example if: 31 | 32 | input = [ 'The Quick-Brown', 33 | 'CAT jumped over', 34 | 'the lazy dog !! '] 35 | 36 | output = [ 'The quick-brown', 37 | 'cat jumped over', 38 | 'the lazy dog !! '] 39 | )doc"); 40 | 41 | REGISTER_OP("NormalizeUTF8") 42 | .Input("input: string") 43 | .Attr("normalization_form: string") 44 | .Output("output: string") 45 | .SetShapeFn(::tensorflow::shape_inference::UnchangedShape) 46 | .Doc(R"doc( 47 | Normalizes each UTF8 string in the input tensor using 'normalization_form' 48 | rules. 49 | 50 | See http://unicode.org/reports/tr15/ 51 | )doc"); 52 | 53 | } // namespace text 54 | } // namespace tensorflow 55 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/sentence_breaking_ops.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
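// Registers the SentenceFragments op. Its shape function only records that
// every output is a rank-1 vector of unknown length; the fragment contents
// are computed at kernel execution time.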
14 | 15 | #include "tensorflow/core/framework/common_shape_fns.h" 16 | #include "tensorflow/core/framework/op.h" 17 | #include "tensorflow/core/framework/shape_inference.h" 18 | #include "tensorflow/core/lib/core/status.h" 19 | 20 | namespace tensorflow { 21 | namespace text { 22 | 23 | Status SentenceFragmentShapeFn( 24 | ::tensorflow::shape_inference::InferenceContext* c) { 25 | for (int i = 0; i < c->num_outputs(); ++i) { 26 | c->set_output(i, c->UnknownShapeOfRank(1)); 27 | } 28 | 29 | return Status::OK(); 30 | } 31 | 32 | REGISTER_OP("SentenceFragments") 33 | .Attr("input_encoding: string") 34 | .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'") 35 | .Attr("replacement_char: int = 65533") // 0xFFFD unicode replacement char 36 | .Attr("replace_control_characters: bool = false") 37 | .Input("row_lengths: int64") 38 | .Input("token_start: int64") 39 | .Input("token_end: int64") 40 | .Input("token_word: string") 41 | .Input("token_properties: int64") 42 | .Output("fragment_start: int64") 43 | .Output("fragment_end: int64") 44 | .Output("fragment_properties: int64") 45 | .Output("terminal_punc_token: int64") 46 | .Output("output_row_lengths: int64") 47 | .SetShapeFn(SentenceFragmentShapeFn); 48 | 49 | } // namespace text 50 | } // namespace tensorflow 51 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/coerce_to_valid_utf8_op_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # -*- coding: utf-8 -*- 17 | """Tests for Utf8Chars Op from string_ops.""" 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from tensorflow.python.platform import test 24 | from tensorflow_text.python.ops import string_ops 25 | 26 | 27 | class CoerceToUtf8Test(test.TestCase): 28 | 29 | def testCoercetoStructurallyValidOnValidInput(self): 30 | with self.test_session(): 31 | utf8 = string_ops.coerce_to_structurally_valid_utf8(["abc"]) 32 | self.assertAllEqual(utf8, ["abc"]) 33 | 34 | def testCoercetoStructurallyValidOnValidInputWithDefault(self): 35 | with self.test_session(): 36 | utf8 = string_ops.coerce_to_structurally_valid_utf8(["abc"], "?") 37 | self.assertAllEqual(utf8, ["abc"]) 38 | 39 | def testCoercetoStructurallyValidOnInvalidInput(self): 40 | with self.test_session(): 41 | utf8 = string_ops.coerce_to_structurally_valid_utf8([b"abc\xfd"]) 42 | self.assertAllEqual(utf8, ["abc�"]) 43 | 44 | def testCoercetoStructurallyValidOnInvalidInputWithDefault(self): 45 | with self.test_session(): 46 | utf8 = string_ops.coerce_to_structurally_valid_utf8([b"abc\xfd"], "?") 47 | self.assertAllEqual(utf8, ["abc?"]) 48 | 49 | 50 | if __name__ == "__main__": 51 | test.main() 52 | -------------------------------------------------------------------------------- /docs/api_docs/python/_toc.yaml: -------------------------------------------------------------------------------- 1 | # Automatically generated file; please do not edit 2 | toc: 3 | - title: text 4 | section: 5 | - title: Overview 6 | path: /text/api_docs/python/text 7 | - title: case_fold_utf8 8 | path: /text/api_docs/python/text/case_fold_utf8 9 | - title: coerce_to_structurally_valid_utf8 10 | path: /text/api_docs/python/text/coerce_to_structurally_valid_utf8 11 | - title: gather_with_default 12 | path: /text/api_docs/python/text/gather_with_default 13 | - title: greedy_constrained_sequence 14 | path: /text/api_docs/python/text/greedy_constrained_sequence 15 | - title: ngrams 16 | path: /text/api_docs/python/text/ngrams 17 | - title: normalize_utf8 18 | path: /text/api_docs/python/text/normalize_utf8 19 | - title: pad_along_dimension 20 | path: /text/api_docs/python/text/pad_along_dimension 21 | - title: Reduction 22 | path: /text/api_docs/python/text/Reduction 23 | - title: sentence_fragments 24 | path: /text/api_docs/python/text/sentence_fragments 25 | - title: sliding_window 26 | path: /text/api_docs/python/text/sliding_window 27 | - title: span_alignment 28 | path: /text/api_docs/python/text/span_alignment 29 | - title: span_overlaps 30 | path: /text/api_docs/python/text/span_overlaps 31 | - title: Tokenizer 32 | path: /text/api_docs/python/text/Tokenizer 33 | - title: TokenizerWithOffsets 34 | path: /text/api_docs/python/text/TokenizerWithOffsets 35 | - title: UnicodeScriptTokenizer 36 | path: /text/api_docs/python/text/UnicodeScriptTokenizer 37 | - title: viterbi_constrained_sequence 38 | path: /text/api_docs/python/text/viterbi_constrained_sequence 39 | - title: WhitespaceTokenizer 40 | path: /text/api_docs/python/text/WhitespaceTokenizer 41 | - title: WordpieceTokenizer 42 | path: /text/api_docs/python/text/WordpieceTokenizer 43 | - title: WordShape 44 | path: /text/api_docs/python/text/WordShape 45 | - title: wordshape 46 | path: /text/api_docs/python/text/wordshape 47 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/wordpiece_tokenizer.h: 
-------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ 16 | #define TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ 17 | 18 | #include 19 | #include "tensorflow/core/framework/lookup_interface.h" 20 | #include "tensorflow/core/lib/core/status.h" 21 | 22 | namespace tensorflow { 23 | namespace text { 24 | 25 | class WordpieceVocab { 26 | public: 27 | virtual ~WordpieceVocab() {} 28 | virtual Status Contains(const string& key, bool* value) = 0; 29 | }; 30 | 31 | class LookupTableVocab : public WordpieceVocab { 32 | public: 33 | LookupTableVocab(lookup::LookupInterface* table, OpKernelContext* ctx); 34 | 35 | virtual Status Contains(const string& key, bool* value); 36 | 37 | private: 38 | // not owned 39 | lookup::LookupInterface* table_; 40 | OpKernelContext* ctx_; 41 | Tensor default_value_; 42 | }; 43 | 44 | Status WordpieceTokenize(const string& token, const int64 max_bytes_per_token, 45 | const string& suffix_indicator, bool use_unknown_token, 46 | const string& unknown_token, 47 | LookupTableVocab* vocab_map, 48 | std::vector* subwords, 49 | std::vector* begin_offset, 50 | std::vector* end_offset, int* num_word_pieces); 51 | 52 | } // namespace text 53 | } // namespace tensorflow 54 | 55 | #endif // TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ 56 | -------------------------------------------------------------------------------- /third_party/icu/BUILD.bazel: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = ["//visibility:public"], 3 | ) 4 | 5 | licenses(["notice"]) # Apache 2.0 6 | 7 | exports_files([ 8 | "icu4c/LICENSE", 9 | "icu4j/main/shared/licenses/LICENSE", 10 | ]) 11 | 12 | cc_library( 13 | name = "headers", 14 | hdrs = glob(["icu4c/source/common/unicode/*.h"]), 15 | includes = [ 16 | "icu4c/source/common", 17 | ], 18 | deps = [ 19 | ], 20 | ) 21 | 22 | cc_library( 23 | name = "common", 24 | hdrs = glob(["icu4c/source/common/unicode/*.h"]), 25 | includes = [ 26 | "icu4c/source/common", 27 | ], 28 | deps = [ 29 | ":icuuc", 30 | ], 31 | ) 32 | 33 | cc_library( 34 | name = "icuuc", 35 | srcs = glob( 36 | [ 37 | "icu4c/source/common/*.c", 38 | "icu4c/source/common/*.cpp", 39 | "icu4c/source/stubdata/*.cpp", 40 | ], 41 | ), 42 | hdrs = glob([ 43 | "icu4c/source/common/*.h", 44 | ]), 45 | copts = [ 46 | "-DU_COMMON_IMPLEMENTATION", 47 | "-DU_HAVE_STD_ATOMICS", 48 | ] + select({ 49 | ":android": [ 50 | "-fdata-sections", 51 | "-DGOOGLE_VENDOR_SRC_BRANCH", 52 | "-DU_HAVE_NL_LANGINFO_CODESET=0", 53 | "-Wno-deprecated-declarations", 54 | ], 55 | ":apple": [ 56 | "-DGOOGLE_VENDOR_SRC_BRANCH", 57 | "-Wno-shorten-64-to-32", 58 | "-Wno-unused-variable", 59 | ], 60 | ":windows": [ 61 | "/utf-8", 62 | "/DLOCALE_ALLOW_NEUTRAL_NAMES=0", 63 | ], 64 | "//conditions:default": [], 65 | }), 66 | 
tags = ["requires-rtti"], 67 | visibility = [ 68 | "//visibility:private", 69 | ], 70 | deps = [ 71 | ":headers", 72 | ], 73 | ) 74 | 75 | config_setting( 76 | name = "android", 77 | values = {"crosstool_top": "//external:android/crosstool"}, 78 | ) 79 | 80 | config_setting( 81 | name = "apple", 82 | values = {"cpu": "darwin"}, 83 | ) 84 | 85 | config_setting( 86 | name = "windows", 87 | values = {"cpu": "x64_windows"}, 88 | ) 89 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/whitespace_tokenize_op.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "tensorflow/core/framework/common_shape_fns.h" 19 | #include "tensorflow/core/framework/op.h" 20 | #include "tensorflow/core/framework/shape_inference.h" 21 | 22 | namespace tensorflow { 23 | 24 | namespace shape_inference { 25 | class InferenceContext; 26 | } // namespace shape_inference 27 | 28 | namespace text { 29 | 30 | using shape_inference::InferenceContext; 31 | 32 | REGISTER_OP("WhitespaceTokenizeWithOffsets") 33 | .Input("input_values: int32") 34 | .Input("input_splits: Tsplits") 35 | .Output("output_values: int32") 36 | .Output("output_values_inner_splits: Tsplits") 37 | .Output("output_offset_starts: int64") 38 | .Output("output_offset_limits: int64") 39 | .Output("output_outer_splits: Tsplits") 40 | .Attr("Tsplits: {int32, int64} = DT_INT64") 41 | .SetShapeFn([](InferenceContext* c) { 42 | shape_inference::ShapeHandle unused; 43 | TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); 44 | TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused)); 45 | 46 | c->set_output(0, c->Vector(InferenceContext::kUnknownDim)); 47 | c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); 48 | c->set_output(2, c->Vector(InferenceContext::kUnknownDim)); 49 | c->set_output(3, c->Vector(InferenceContext::kUnknownDim)); 50 | c->set_output(4, c->Vector(InferenceContext::kUnknownDim)); 51 | return Status::OK(); 52 | }); 53 | 54 | } // namespace text 55 | } // namespace tensorflow 56 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/unicode_script_tokenize_op.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "tensorflow/core/framework/common_shape_fns.h" 19 | #include "tensorflow/core/framework/op.h" 20 | #include "tensorflow/core/framework/shape_inference.h" 21 | 22 | namespace tensorflow { 23 | 24 | namespace shape_inference { 25 | class InferenceContext; 26 | } // namespace shape_inference 27 | 28 | namespace text { 29 | 30 | using shape_inference::InferenceContext; 31 | 32 | REGISTER_OP("UnicodeScriptTokenizeWithOffsets") 33 | .Input("input_values: int32") 34 | .Input("input_splits: Tsplits") 35 | .Output("output_values: int32") 36 | .Output("output_values_inner_splits: Tsplits") 37 | .Output("output_offset_starts: int64") 38 | .Output("output_offset_limits: int64") 39 | .Output("output_outer_splits: Tsplits") 40 | .Attr("Tsplits: {int32, int64} = DT_INT64") 41 | .Attr("keep_whitespace: bool = false") 42 | .SetShapeFn([](InferenceContext* c) { 43 | shape_inference::ShapeHandle unused; 44 | TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); 45 | TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused)); 46 | 47 | c->set_output(0, c->Vector(InferenceContext::kUnknownDim)); 48 | c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); 49 | c->set_output(2, c->Vector(InferenceContext::kUnknownDim)); 50 | c->set_output(3, c->Vector(InferenceContext::kUnknownDim)); 51 | c->set_output(4, c->Vector(InferenceContext::kUnknownDim)); 52 | return Status::OK(); 53 | }); 54 | 55 | } // namespace text 56 | } // namespace tensorflow 57 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/ngrams.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.ngrams 7 | 8 | Create a tensor of n-grams based on the input data `data`. 9 | 10 | ``` python 11 | text.ngrams( 12 | data, 13 | width, 14 | axis=-1, 15 | reduction_type=None, 16 | string_separator=' ', 17 | name=None 18 | ) 19 | ``` 20 | 21 | Defined in 22 | [`python/ops/ngrams_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/ngrams_op.py). 23 | 24 | 25 | 26 | Creates a tensor of n-grams based on `data`. The n-grams are of width `width` 27 | and are created along axis `axis`; the n-grams are created by combining 28 | windows of `width` adjacent elements from `data` using `reduction_type`. This 29 | op is intended to cover basic use cases; more complex combinations can be 30 | created using the sliding_window op. 31 | 32 | #### Args: 33 | 34 | * `data`: The data to reduce. 35 | * `width`: The width of the ngram window. If there is not sufficient 36 | data to fill out the ngram window, the resulting ngram will be empty. 37 | * `axis`: The axis to create ngrams along. Note that for string join 38 | reductions, only axis '-1' is supported; for other reductions, any positive 39 | or negative axis can be used. Should be a constant. 40 | * `reduction_type`: A member of the Reduction enum. Should be a 41 | constant. Currently supports: 42 | 43 | * `Reduction.SUM`: Add values in the window. 44 | * `Reduction.MEAN`: Average values in the window. 45 | * `Reduction.STRING_JOIN`: Join strings in the window. Note that axis must 46 | be -1 here. 47 | 48 | * `string_separator`: The separator string used for 49 | `Reduction.STRING_JOIN`. Ignored otherwise. Must be a string constant, not a 50 | Tensor. 51 | 52 | * `name`: The op name. 53 | 54 | #### Returns: 55 | 56 | A tensor of ngrams. 57 | 58 | #### Raises: 59 | 60 | * `InvalidArgumentError`: if `reduction_type` is either None or not a 61 | Reduction, or if `reduction_type` is STRING_JOIN and `axis` is not -1. 62 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Various tensorflow ops related to text-processing.""" 17 | 18 | from tensorflow_text.python.ops.create_feature_bitmask_op import create_feature_bitmask 19 | from tensorflow_text.python.ops.greedy_constrained_sequence_op import greedy_constrained_sequence 20 | from tensorflow_text.python.ops.ngrams_op import ngrams 21 | from tensorflow_text.python.ops.ngrams_op import Reduction 22 | from tensorflow_text.python.ops.normalize_ops import case_fold_utf8 23 | from tensorflow_text.python.ops.normalize_ops import normalize_utf8 24 | from tensorflow_text.python.ops.pad_along_dimension_op import pad_along_dimension 25 | from tensorflow_text.python.ops.pointer_ops import gather_with_default 26 | from tensorflow_text.python.ops.pointer_ops import span_alignment 27 | from tensorflow_text.python.ops.pointer_ops import span_overlaps 28 | from tensorflow_text.python.ops.sentence_breaking_ops import sentence_fragments 29 | from tensorflow_text.python.ops.sliding_window_op import sliding_window 30 | from tensorflow_text.python.ops.string_ops import coerce_to_structurally_valid_utf8 31 | from tensorflow_text.python.ops.tokenization import Tokenizer 32 | from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets 33 | from tensorflow_text.python.ops.unicode_script_tokenizer import UnicodeScriptTokenizer 34 | from tensorflow_text.python.ops.viterbi_constrained_sequence_op import viterbi_constrained_sequence 35 | from tensorflow_text.python.ops.whitespace_tokenizer import WhitespaceTokenizer 36 | from tensorflow_text.python.ops.wordpiece_tokenizer import WordpieceTokenizer 37 | from tensorflow_text.python.ops.wordshape_ops import WordShape 38 | from tensorflow_text.python.ops.wordshape_ops import wordshape 39 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/pad_along_dimension.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.pad_along_dimension 7 | 8 | Add padding to the beginning and end of data in a specific dimension. 9 | 10 | ``` python 11 | text.pad_along_dimension( 12 | data, 13 | axis=-1, 14 | left_pad=None, 15 | right_pad=None, 16 | name=None 17 | ) 18 | ``` 19 | 20 | Defined in 21 | [`python/ops/pad_along_dimension_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pad_along_dimension_op.py). 22 | 23 | 24 | 25 | Returns a tensor constructed from `data`, where each row in dimension `axis` 26 | is replaced by the concatenation of the left padding followed by the row 27 | followed by the right padding. I.e., if `L=left_pad.shape[0]` and 28 | `R=right_pad.shape[0]`, then: 29 | 30 | ```python 31 | result[i1...iaxis, 0:L] = left_pad 32 | result[i1...iaxis, L:-R] = data[i0...iaxis] 33 | result[i1...iaxis, -R:] = right_pad 34 | ``` 35 | 36 | #### Args: 37 | 38 | * `data`: `[O1...ON, A, I1...IM]` A potentially ragged `K` 39 | dimensional tensor with outer dimensions of size `O1...ON`; axis dimension 40 | of size `A`; and inner dimensions of size `I1...IM`. I.e. `K = N + 1 + M`, 41 | where `N>=0` and `M>=0`. 42 | * `axis`: An integer constant specifying the axis along which padding 43 | is added. Negative axis values from `-K` to `-1` are supported. 44 | * `left_pad`: `[L, I1...IM]` An `M+1` dimensional tensor that 45 | should be prepended to each row along dimension `axis`; or `None` if no 46 | padding should be added to the left side. 47 | * `right_pad`: `[R, I1...IM]` An `M+1` dimensional tensor that 48 | should be appended to each row along dimension `axis`; or `None` if no 49 | padding should be added to the right side. 50 | * `name`: The name of this op. 51 | 52 | #### Returns: 53 | 54 | `[O1...ON, L + A + R, I1...IM]` A potentially ragged `K` dimensional 55 | tensor with outer dimensions of size `O1...ON`; padded axis dimension size 56 | `L+A+R`; and inner dimensions of size `I1...IM`. If `data` is a `RaggedTensor`, 57 | then the returned tensor is a `RaggedTensor` with the same `ragged_rank`. 58 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Base classes (abstract class) for all tokenizers.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import abc 23 | 24 | 25 | # TODO(broken): Have this extend Module when it becomes public 26 | class Tokenizer(): 27 | """Base class (abstract class) for all tokenizers.""" 28 | 29 | __metaclass__ = abc.ABCMeta 30 | 31 | @abc.abstractmethod 32 | def tokenize(self, input): # pylint: disable=redefined-builtin 33 | """Abstract function for tokenization. 
34 | 35 | Args: 36 | input: An N-dimensional UTF-8 string (or optionally integer) Tensor or 37 | RaggedTensor. 38 | 39 | Returns: 40 | An N+1-dimensional UTF-8 string or integer Tensor or RaggedTensor. 41 | """ 42 | pass 43 | 44 | 45 | class TokenizerWithOffsets(Tokenizer): 46 | """Base class (abstract class) for all tokenizers that return offsets.""" 47 | 48 | @abc.abstractmethod 49 | def tokenize_with_offsets(self, input): # pylint: disable=redefined-builtin 50 | """Abstract function for tokenization with offsets. 51 | 52 | Args: 53 | input: An N-dimensional UTF-8 string (or optionally integer) Tensor or 54 | RaggedTensor. 55 | 56 | Returns: 57 | A tuple (tokens, start_offsets, limit_offsets): 58 | * tokens is an N+1-dimensional UTF-8 string or integer Tensor or 59 | RaggedTensor. 60 | * start_offsets is an N+1-dimensional integer Tensor containing the 61 | starting indices of each token (byte indices for input strings). 62 | * limit_offsets is an N+1-dimensional integer Tensor containing the 63 | exclusive ending indices of each token (byte indices for input 64 | strings). 65 | """ 66 | pass 67 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- 1 | workspace(name = "org_tensorflow_text") 2 | 3 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") 4 | 5 | http_archive( 6 | name = "bazel_skylib", 7 | sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e", 8 | urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.8.0/bazel-skylib.0.8.0.tar.gz"], 9 | ) 10 | 11 | http_archive( 12 | name = "com_google_absl", 13 | sha256 = "0322e3a15fd119fcc8b03033e7011bb1beb7d6c4111f9e57272b7be78d56045a", 14 | strip_prefix = "abseil-cpp-2f76a9bf50046e396138cc8eeb3cdc17b7a5ac24", 15 | urls = [ 16 | "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/2f76a9bf50046e396138cc8eeb3cdc17b7a5ac24.tar.gz", 17 | "https://github.com/abseil/abseil-cpp/archive/2f76a9bf50046e396138cc8eeb3cdc17b7a5ac24.tar.gz", 18 | ], 19 | ) 20 | 21 | http_archive( 22 | name = "com_google_googletest", 23 | sha256 = "ff7a82736e158c077e76188232eac77913a15dac0b22508c390ab3f88e6d6d86", 24 | strip_prefix = "googletest-b6cd405286ed8635ece71c72f118e659f4ade3fb", 25 | urls = [ 26 | "http://mirror.tensorflow.org/github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip", 27 | "https://github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip", 28 | ], 29 | ) 30 | 31 | http_archive( 32 | name = "io_bazel_rules_closure", 33 | sha256 = "e0a111000aeed2051f29fcc7a3f83be3ad8c6c93c186e64beb1ad313f0c7f9f9", 34 | strip_prefix = "rules_closure-cf1e44edb908e9616030cc83d085989b8e6cd6df", 35 | urls = [ 36 | "http://mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/cf1e44edb908e9616030cc83d085989b8e6cd6df.tar.gz", 37 | "https://github.com/bazelbuild/rules_closure/archive/cf1e44edb908e9616030cc83d085989b8e6cd6df.tar.gz", # 2019-04-04 38 | ], 39 | 40 | ) 41 | 42 | http_archive( 43 | name = "org_tensorflow", 44 | strip_prefix = "tensorflow-2.0.0-beta0", 45 | sha256 = "9dd3b78fce445a8d01791aadda3cbb686b732d4df2d4f6563054f7d7a725fa68", 46 | urls = [ 47 | "https://github.com/tensorflow/tensorflow/archive/v2.0.0-beta0.zip" 48 | ], 49 | ) 50 | 51 | load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace") 52 | 53 | tf_workspace(tf_repo_name="@org_tensorflow") 54 | 55 | 
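# Generate the @local_config_tf repository from the locally installed
# TensorFlow package, using the TF_HEADER_DIR / TF_SHARED_LIBRARY_DIR values
# that oss_scripts/configure.sh writes into .bazelrc (the generated targets
# are templated from third_party/tensorflow/BUILD.tpl).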
load("//third_party/tensorflow:tf_configure.bzl", "tf_configure") 56 | 57 | tf_configure(name = "local_config_tf") 58 | 59 | load("//tensorflow_text:workspace.bzl", "initialize_third_party_archives") 60 | 61 | initialize_third_party_archives() 62 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/string_ops.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tensorflow operations for UTF8 strings.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import sys 23 | 24 | from tensorflow.python.ops import string_ops 25 | 26 | 27 | def _unichr(codepoint): 28 | if sys.version_info[0] == 2: 29 | return unichr(codepoint) 30 | else: 31 | return chr(codepoint) 32 | 33 | 34 | # pylint: disable=redefined-builtin 35 | def coerce_to_structurally_valid_utf8(input, 36 | replacement_char=_unichr(65533), 37 | name=None): 38 | """Coerce UTF-8 input strings to structurally valid UTF-8. 39 | 40 | Any bytes which cause the input string to be invalid UTF-8 are substituted 41 | with the provided replacement character codepoint (default 65533). Use a 42 | single byte replacement character codepoint to preserve alignment to the 43 | source input string. 44 | 45 | Args: 46 | input: UTF-8 string tensor to coerce to valid UTF-8. 47 | replacement_char: The replacement character to be used in place of any 48 | invalid byte in the input. Any valid Unicode character may be used. The 49 | default value is the default Unicode replacement character which is 50 | 0xFFFD (or U+65533). Note that passing a replacement character 51 | expressible in 1 byte, such as ' ' or '?', will preserve string 52 | alignment to the source since individual invalid bytes will be replaced 53 | with a 1-byte replacement. (optional) 54 | name: A name for the operation (optional). 55 | 56 | Returns: 57 | A tensor of type string with the same shape as the input. 58 | """ 59 | return string_ops.unicode_transcode( 60 | input, 61 | input_encoding='UTF-8', 62 | output_encoding='UTF-8', 63 | errors='replace', 64 | replacement_char=ord(replacement_char), 65 | name=name) 66 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/wordpiece_op.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "tensorflow/core/framework/op.h" 16 | #include "tensorflow/core/framework/shape_inference.h" 17 | 18 | namespace tensorflow { 19 | 20 | REGISTER_OP("WordpieceTokenizeWithOffsets") 21 | .Input("input_values: string") 22 | .Input("vocab_lookup_table: resource") 23 | .Attr("suffix_indicator: string") 24 | .Attr("max_bytes_per_word: int") 25 | .Attr("use_unknown_token: bool") 26 | .Attr("unknown_token: string") 27 | .Output("output_values: string") 28 | .Output("output_row_lengths: int64") 29 | .Output("start_values: int64") 30 | .Output("limit_values: int64") 31 | .Doc(R"doc( 32 | Tokenizes tokens into sub-word pieces based off of a vocabulary. 33 | 34 | `wordpiece_tokenize_with_offsets` returns the relative offsets. 35 | 36 | ### Example: 37 | tokens = ['don', '\'t', 'treadness'] 38 | wordpiece, start, end = wordpiece_tokenize_with_offset(tokens) 39 | wordpiece = [['don', '\'', 't'], ['tread', '##ness']] 40 | start = [[[0, 3, 4], [0, 5]]] 41 | end = [[[3, 4, 5], [5, 10]]] 42 | Args: 43 | tokens: [num_batch, (num_tokens)] a `RaggedTensor` of UTF-8 token 44 | strings 45 | vocab_lookup_table: A lookup table implementing the LookupInterface 46 | word_split_char: Character used to define prefixes in the vocab. 47 | return_ids: A bool indicating whether the op returns int64 ids or tokenized 48 | subword strings. 49 | 50 | Returns: 51 | A tuple of `RaggedTensor`s `subword`, `subword_offset_starts`, 52 | `subword_offset_limit` where: 53 | 54 | `subword`: [num_batch, (num_tokens), (num_subword_pieces)] is the 55 | wordpiece token string encoded in UTF-8. 56 | `subword_offset_starts`: [num_batch, (num_tokens), 57 | (num_subword_pieces)] is the word piece token's starting byte offset. 58 | `subword_offset_limit`: [num_batch, (num_tokens), 59 | (num_subword_pieces)] is the word piece token's ending byte offset. 60 | )doc"); 61 | 62 | } // namespace tensorflow 63 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/whitespace_tokenize_kernel_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
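// The test case below feeds two strings encoded as Unicode codepoints --
// "op t" (111, 112, 32, 116) and "uv" (117, 118), delimited by row splits
// {0, 4, 6} -- and checks the flattened tokens "op", "t", "uv" along with
// their inner/outer row splits and start/limit offsets.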
14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include "tensorflow/core/framework/fake_input.h" 20 | #include "tensorflow/core/framework/node_def_builder.h" 21 | #include "tensorflow/core/framework/tensor.h" 22 | #include "tensorflow/core/framework/tensor_shape.h" 23 | #include "tensorflow/core/kernels/ops_testutil.h" 24 | #include "tensorflow/core/lib/core/status.h" 25 | #include "tensorflow/core/lib/core/status_test_util.h" 26 | #include "tensorflow_text/core/kernels/text_kernels_test_util.h" 27 | 28 | namespace tensorflow { 29 | namespace text { 30 | 31 | using tensorflow::FakeInput; 32 | using tensorflow::NodeDefBuilder; 33 | using tensorflow::Status; 34 | using tensorflow::TensorShape; 35 | using tensorflow::text_kernels_test_util::VectorEq; 36 | 37 | class WhitespaceTokenizeWithOffsetsKernelTest 38 | : public tensorflow::OpsTestBase { 39 | public: 40 | void MakeOp() { 41 | TF_ASSERT_OK(NodeDefBuilder("tested_op", "WhitespaceTokenizeWithOffsets") 42 | .Input(FakeInput()) 43 | .Input(FakeInput()) 44 | .Finalize(node_def())); 45 | TF_ASSERT_OK(InitOp()); 46 | } 47 | }; 48 | 49 | TEST_F(WhitespaceTokenizeWithOffsetsKernelTest, Test) { 50 | MakeOp(); 51 | AddInputFromArray(TensorShape({6}), {111, 112, 32, 116, 117, 118}); 52 | AddInputFromArray(TensorShape({3}), {0, 4, 6}); 53 | TF_ASSERT_OK(RunOpKernel()); 54 | 55 | std::vector expected_values({111, 112, 116, 117, 118}); 56 | std::vector expected_values_inner_splits({0, 2, 3, 5}); 57 | std::vector expected_offset_starts({0, 3, 0}); 58 | std::vector expected_offset_limits({2, 4, 2}); 59 | std::vector output_outer_splits({0, 2, 3}); 60 | EXPECT_THAT(*GetOutput(0), VectorEq(expected_values)); 61 | EXPECT_THAT(*GetOutput(1), VectorEq(expected_values_inner_splits)); 62 | EXPECT_THAT(*GetOutput(2), VectorEq(expected_offset_starts)); 63 | EXPECT_THAT(*GetOutput(3), VectorEq(expected_offset_limits)); 64 | EXPECT_THAT(*GetOutput(4), VectorEq(output_outer_splits)); 65 | } 66 | 67 | } // namespace text 68 | } // namespace tensorflow 69 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/unicode_script_tokenize_kernel_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include "tensorflow/core/framework/fake_input.h" 20 | #include "tensorflow/core/framework/node_def_builder.h" 21 | #include "tensorflow/core/framework/tensor.h" 22 | #include "tensorflow/core/framework/tensor_shape.h" 23 | #include "tensorflow/core/kernels/ops_testutil.h" 24 | #include "tensorflow/core/lib/core/status.h" 25 | #include "tensorflow/core/lib/core/status_test_util.h" 26 | #include "tensorflow_text/core/kernels/text_kernels_test_util.h" 27 | 28 | namespace tensorflow { 29 | namespace text { 30 | 31 | using tensorflow::FakeInput; 32 | using tensorflow::NodeDefBuilder; 33 | using tensorflow::Status; 34 | using tensorflow::TensorShape; 35 | using tensorflow::text_kernels_test_util::VectorEq; 36 | 37 | class UnicodeScriptTokenizeWithOffsetsKernelTest 38 | : public tensorflow::OpsTestBase { 39 | public: 40 | void MakeOp() { 41 | TF_ASSERT_OK(NodeDefBuilder("tested_op", "UnicodeScriptTokenizeWithOffsets") 42 | .Input(FakeInput()) 43 | .Input(FakeInput()) 44 | .Finalize(node_def())); 45 | TF_ASSERT_OK(InitOp()); 46 | } 47 | }; 48 | 49 | TEST_F(UnicodeScriptTokenizeWithOffsetsKernelTest, Test) { 50 | MakeOp(); 51 | AddInputFromArray(TensorShape({6}), {111, 112, 32, 116, 117, 118}); 52 | AddInputFromArray(TensorShape({3}), {0, 4, 6}); 53 | TF_ASSERT_OK(RunOpKernel()); 54 | 55 | std::vector expected_values({111, 112, 116, 117, 118}); 56 | std::vector expected_values_inner_splits({0, 2, 3, 5}); 57 | std::vector expected_offset_starts({0, 3, 0}); 58 | std::vector expected_offset_limits({2, 4, 2}); 59 | std::vector output_outer_splits({0, 2, 3}); 60 | EXPECT_THAT(*GetOutput(0), VectorEq(expected_values)); 61 | EXPECT_THAT(*GetOutput(1), VectorEq(expected_values_inner_splits)); 62 | EXPECT_THAT(*GetOutput(2), VectorEq(expected_offset_starts)); 63 | EXPECT_THAT(*GetOutput(3), VectorEq(expected_offset_limits)); 64 | EXPECT_THAT(*GetOutput(4), VectorEq(output_outer_splits)); 65 | } 66 | 67 | } // namespace text 68 | } // namespace tensorflow 69 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/text_kernels_test_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "tensorflow_text/core/kernels/text_kernels_test_util.h" 16 | 17 | using ::testing::MakeMatcher; 18 | using ::testing::Matcher; 19 | using ::testing::MatchResultListener; 20 | 21 | namespace tensorflow { 22 | namespace text_kernels_test_util { 23 | 24 | bool TensorEqMatcher::MatchAndExplain( 25 | Tensor actual, ::testing::MatchResultListener* listener) const { 26 | string expect_values = expect_.SummarizeValue(expect_.NumElements()); 27 | string actual_values = actual.SummarizeValue(actual.NumElements()); 28 | if (expect_.dtype() != actual.dtype() || expect_.shape() != actual.shape() || 29 | expect_values != actual_values) { 30 | *listener << "\n dtype=" << DataTypeString(actual.dtype()); 31 | *listener << "\n shape=" << actual.shape().DebugString(); 32 | *listener << "\n values=" << actual_values; 33 | return false; 34 | } 35 | return true; 36 | } 37 | 38 | void TensorEqMatcher::DescribeTo(::std::ostream* gmock_os) const { 39 | *gmock_os << "dtype=" << DataTypeString(expect_.dtype()) 40 | << "\n shape=" << expect_.shape().DebugString() 41 | << "\n values=" 42 | << expect_.SummarizeValue(expect_.NumElements()); 43 | } 44 | 45 | void TensorEqMatcher::DescribeNegationTo(::std::ostream* gmock_os) const { 46 | *gmock_os << "is not equal to " << expect_.DebugString(); 47 | } 48 | 49 | bool TensorHasShapeMatcher::MatchAndExplain( 50 | Tensor actual, ::testing::MatchResultListener* listener) const { 51 | if (expect_ != actual.shape()) { 52 | *listener << "\n shape=" << actual.shape().DebugString(); 53 | return false; 54 | } 55 | return true; 56 | } 57 | 58 | void TensorHasShapeMatcher::DescribeTo(::std::ostream* gmock_os) const { 59 | *gmock_os << "shape=" << expect_.DebugString(); 60 | } 61 | 62 | void TensorHasShapeMatcher::DescribeNegationTo(::std::ostream* gmock_os) const { 63 | *gmock_os << "shape!=" << expect_.DebugString(); 64 | } 65 | 66 | Matcher TensorHasShape(const TensorShape& shape) { 67 | // MakeMatcher takes ownership of the TensorHasShapeMatcher. 68 | return MakeMatcher(new TensorHasShapeMatcher(shape)); 69 | } 70 | 71 | } // namespace text_kernels_test_util 72 | } // namespace tensorflow 73 | -------------------------------------------------------------------------------- /docs/api_docs/python/text.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # Module: text 7 | 8 | Various tensorflow ops related to text-processing. 9 | 10 | Defined in [`__init__.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/__init__.py). 11 | 12 | 13 | 14 | 15 | ## Classes 16 | 17 | [`class Reduction`](./text/Reduction.md): Type of reduction to be done by the ngram op. 18 | 19 | [`class Tokenizer`](./text/Tokenizer.md): Base class (abstract class) for all 20 | tokenizers. 21 | 22 | [`class TokenizerWithOffsets`](./text/TokenizerWithOffsets.md): Base class 23 | (abstract class) for all tokenizers that return offsets. 24 | 25 | [`class UnicodeScriptTokenizer`](./text/UnicodeScriptTokenizer.md): Tokenizes a 26 | tensor of UTF-8 strings on Unicode script boundaries. 27 | 28 | [`class WhitespaceTokenizer`](./text/WhitespaceTokenizer.md): Tokenizes a tensor 29 | of UTF-8 strings on whitespaces. 30 | 31 | [`class WordShape`](./text/WordShape.md): Values for the 'pattern' arg of the WordShape op. 32 | 33 | [`class WordpieceTokenizer`](./text/WordpieceTokenizer.md): Creates a wordpiece 34 | tokenizer. 35 | 36 | ## Functions 37 | 38 | [`case_fold_utf8(...)`](./text/case_fold_utf8.md): Applies case folding to every 39 | UTF8 string in the input. 40 | 41 | [`coerce_to_structurally_valid_utf8(...)`](./text/coerce_to_structurally_valid_utf8.md): Coerce UTF-8 input strings to structurally valid UTF-8. 42 | 43 | [`gather_with_default(...)`](./text/gather_with_default.md): Gather slices with `indices=-1` mapped to `default`. 44 | 45 | [`greedy_constrained_sequence(...)`](./text/greedy_constrained_sequence.md): Performs greedy constrained sequence on a batch of examples. 46 | 47 | [`ngrams(...)`](./text/ngrams.md): Create a tensor of n-grams based on the input data `data`. 48 | 49 | [`normalize_utf8(...)`](./text/normalize_utf8.md): Normalizes each UTF8 string in the input tensor using the specified rule. 50 | 51 | [`pad_along_dimension(...)`](./text/pad_along_dimension.md): Add padding to the beginning and end of data in a specific dimension. 52 | 53 | [`sentence_fragments(...)`](./text/sentence_fragments.md): Find the sentence fragments in a given text. 54 | 55 | [`sliding_window(...)`](./text/sliding_window.md): Builds a sliding window for `data` with a specified width. 56 | 57 | [`span_alignment(...)`](./text/span_alignment.md): Return an alignment from a set of source spans to a set of target spans. 58 | 59 | [`span_overlaps(...)`](./text/span_overlaps.md): Returns a boolean tensor indicating which source and target spans overlap. 60 | 61 | [`viterbi_constrained_sequence(...)`](./text/viterbi_constrained_sequence.md): Performs greedy constrained sequence on a batch of examples. 62 | 63 | [`wordshape(...)`](./text/wordshape.md): Determine wordshape features for each input string. 64 | 65 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/sentence_breaking_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ 16 | #define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ 17 | 18 | #include 19 | #include "absl/strings/string_view.h" 20 | #include "icu4c/source/common/unicode/ucnv.h" 21 | #include "icu4c/source/common/unicode/ucnv_err.h" 22 | #include "icu4c/source/common/unicode/utypes.h" 23 | #include "tensorflow/core/lib/core/status.h" 24 | 25 | namespace tensorflow { 26 | namespace text { 27 | 28 | // A class of utils for identifying certain classes and properties of unicode 29 | // characters. 30 | class UnicodeUtil { 31 | public: 32 | // `converter` not owned. 33 | explicit UnicodeUtil(UConverter* converter) : converter_(converter) {} 34 | 35 | // Returns true iff a string is terminal punctuation. 36 | ::tensorflow::Status IsTerminalPunc(const absl::string_view& input, 37 | bool* result) const; 38 | 39 | // Returns true iff a string is close punctuation (close quote or close 40 | // paren). 41 | ::tensorflow::Status IsClosePunc(const absl::string_view& input, 42 | bool* result) const; 43 | 44 | // Returns true iff a string is an open paren. 45 | ::tensorflow::Status IsOpenParen(const absl::string_view& input, 46 | bool* result) const; 47 | 48 | // Returns true iff a string is a close paren. 49 | ::tensorflow::Status IsCloseParen(const absl::string_view& input, 50 | bool* result) const; 51 | 52 | // Returns true iff a word is made of punctuation characters only. 53 | ::tensorflow::Status IsPunctuationWord(const absl::string_view& input, 54 | bool* result) const; 55 | 56 | // Returns true iff a string is an ellipsis token ("..."). 57 | ::tensorflow::Status IsEllipsis(const absl::string_view& input, 58 | bool* result) const; 59 | 60 | private: 61 | ::tensorflow::Status GetOneUChar(const absl::string_view&, 62 | bool* has_more_than_one_char, 63 | UChar32* result) const; 64 | 65 | // not owned. mutable because UConverter contains some internal options and 66 | // buffer. 67 | mutable UConverter* converter_; 68 | }; 69 | 70 | } // namespace text 71 | } // namespace tensorflow 72 | 73 | #endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ 74 | -------------------------------------------------------------------------------- /oss_scripts/pip_package/setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """TF.Text is a TensorFlow library of text related ops, modules, and subgraphs. 17 | 18 | TF.Text is a TensorFlow library of text related ops, modules, and subgraphs. The 19 | library can perform the preprocessing regularly required by text-based models, 20 | and includes other features useful for sequence modeling not provided by core 21 | TensorFlow. 
22 | 23 | See the README on GitHub for further documentation. 24 | http://github.com/tensorflow/text 25 | """ 26 | 27 | import os 28 | 29 | from setuptools import find_packages 30 | from setuptools import setup 31 | from setuptools.command.install import install 32 | from setuptools.dist import Distribution 33 | 34 | project_name = 'tensorflow-text' 35 | project_version = '1.0.0-beta0' 36 | 37 | 38 | class BinaryDistribution(Distribution): 39 | """This class is needed in order to create OS specific wheels.""" 40 | 41 | def is_pure(self): 42 | return False 43 | 44 | def has_ext_modules(self): 45 | return True 46 | 47 | 48 | class InstallPlatlib(install): 49 | """This is needed to mark the library as platlib compliant.""" 50 | 51 | def finalize_options(self): 52 | """For more info, see http://github.com/google/or-tools/issues/616 .""" 53 | install.finalize_options(self) 54 | self.install_lib = self.install_platlib 55 | self.install_libbase = self.install_lib 56 | self.install_lib = os.path.join(self.install_lib, self.extra_dirs) 57 | 58 | 59 | DOCLINES = __doc__.split('\n') 60 | 61 | setup( 62 | name=project_name, 63 | version=project_version.replace('-', ''), 64 | description=DOCLINES[0], 65 | long_description='\n'.join(DOCLINES[2:]), 66 | author='Google Inc.', 67 | author_email='packages@tensorflow.org', 68 | url='http://github.com/tensorflow/text', 69 | license='Apache 2.0', 70 | packages=find_packages(), 71 | include_package_data=True, 72 | zip_safe=False, 73 | cmdclass={'install': InstallPlatlib}, 74 | distclass=BinaryDistribution, 75 | install_requires=[ 76 | 'tensorflow==2.0.0b0', 77 | ], 78 | extras_require={ 79 | 'tests': [ 80 | 'absl-py', 81 | 'pytest', 82 | ], 83 | }, 84 | classifiers=[ 85 | 'Development Status :: 4 - Beta', 86 | 'Intended Audience :: Developers', 87 | 'Intended Audience :: Science/Research', 88 | 'License :: OSI Approved :: Apache Software License', 89 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 90 | ], 91 | keywords='tensorflow text machine learning', 92 | ) 93 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/normalize_ops.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | """Tensorflow lowercasing operation for UTF8 strings.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | from tensorflow.python.framework import dtypes 23 | from tensorflow.python.framework import ops 24 | from tensorflow.python.ops.ragged import ragged_tensor 25 | 26 | from tensorflow.python.framework import load_library 27 | from tensorflow.python.platform import resource_loader 28 | gen_normalize_ops = load_library.load_op_library(resource_loader.get_path_to_datafile('_normalize_ops.so')) 29 | 30 | 31 | # pylint: disable=redefined-builtin 32 | def case_fold_utf8(input, name=None): 33 | """Applies case folding to every UTF8 string in the input. 34 | 35 | The input is a `Tensor` or `RaggedTensor` of any shape, and the resulting 36 | output has the same shape as the input. Note that NFKC normalization is 37 | implicitly applied to the strings. 38 | 39 | For example: 40 | 41 | ```python 42 | >>> case_fold_utf8(['The Quick-Brown', 43 | ... 'CAT jumped over', 44 | ... 'the lazy dog !! '] 45 | tf.Tensor(['The quick-brown' 'cat jumped over' 'the lazy dog !! '], 46 | shape=(3,), dtype=string) 47 | ``` 48 | 49 | Args: 50 | input: A `Tensor` or `RaggedTensor` of type string. (Must be UTF-8.) 51 | name: The name for this op (optional) 52 | 53 | Returns: 54 | A `Tensor` or `RaggedTensor` of type string, with case-folded contents. 55 | """ 56 | with ops.name_scope(name, "CaseFoldUTF8", [input]): 57 | input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor( 58 | input, dtype=dtypes.string) 59 | if ragged_tensor.is_ragged(input_tensor): 60 | result = gen_normalize_ops.case_fold_utf8(input_tensor.flat_values) 61 | return input_tensor.with_flat_values(result) 62 | else: 63 | return gen_normalize_ops.case_fold_utf8(input_tensor) 64 | 65 | 66 | # pylint: disable=redefined-builtin) 67 | def normalize_utf8(input, normalization_form="NFKC", name=None): 68 | """Normalizes each UTF8 string in the input tensor using the specified rule. 69 | 70 | See http://unicode.org/reports/tr15/ 71 | 72 | Args: 73 | input: A `Tensor` or `RaggedTensor` of type string. (Must be UTF-8.) 74 | normalization_form: One of the following string values ('NFC', 'NFKC', 75 | 'NFD', 'NFKD'). Default is 'NFKC'. 76 | name: The name for this op (optional) 77 | 78 | Returns: 79 | A `Tensor` or `RaggedTensor` of type string, with normalized contents. 80 | """ 81 | with ops.name_scope(name, "NormalizeUTF8", [input]): 82 | input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor( 83 | input, dtype=dtypes.string) 84 | if ragged_tensor.is_ragged(input_tensor): 85 | result = gen_normalize_ops.normalize_utf8(input_tensor.flat_values, 86 | normalization_form) 87 | return input_tensor.with_flat_values(result) 88 | else: 89 | return gen_normalize_ops.normalize_utf8(input_tensor, normalization_form) 90 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/sentence_fragments.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.sentence_fragments 7 | 8 | Find the sentence fragments in a given text. 9 | 10 | ``` python 11 | text.sentence_fragments( 12 | token_word, 13 | token_starts, 14 | token_ends, 15 | token_properties, 16 | input_encoding='UTF-8', 17 | errors='replace', 18 | replacement_char=65533, 19 | replace_control_characters=False 20 | ) 21 | ``` 22 | 23 | Defined in 24 | [`python/ops/sentence_breaking_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/sentence_breaking_ops.py). 25 | 26 | 27 | 28 | A sentence fragment is a potential next sentence determined using 29 | deterministic heuristics based on punctuation, capitalization, and similar 30 | text attributes. 31 | 32 | #### Args: 33 | 34 | * `token_word`: A Tensor (w/ rank=2) or a RaggedTensor (w/ 35 | ragged_rank=1) containing the token strings. 36 | * `token_starts`: A Tensor (w/ rank=2) or a RaggedTensor (w/ 37 | ragged_rank=1) containing offsets where the token starts. 38 | * `token_ends`: A Tensor (w/ rank=2) or a RaggedTensor (w/ 39 | ragged_rank=1) containing offsets where the token ends. 40 | * `token_properties`: A Tensor (w/ rank=2) or a RaggedTensor (w/ 41 | ragged_rank=1) containing a bitmask. 42 | 43 | The values of the bitmask are: 0x01 (ILL_FORMED) - Text is ill-formed 44 | according to TextExtractor; typically applies to all tokens of a paragraph 45 | that is too short or lacks terminal punctuation. 0x40 (TITLE) 0x02 (HEADING) 46 | 0x04 (BOLD) 0x10 (UNDERLINED) 0x20 (LIST) 0x80 (EMOTICON) 0x100 (ACRONYM) - 47 | Token was identified by Lexer as an acronym. Lexer identifies period-, 48 | hyphen-, and space-separated acronyms: "U.S.", "U-S", and "U S". Lexer 49 | normalizes all three to "US", but the token word field normalizes only 50 | space-separated acronyms. 0x200 (HYPERLINK) - Indicates that the token (or 51 | part of the token) is a covered by at least one hyperlink. More information 52 | of the hyperlink is stored in the first token covered by the hyperlink. 53 | 54 | * `input_encoding`: String name for the unicode encoding that should be 55 | used to decode each string. 56 | 57 | * `errors`: Specifies the response when an input string can't be 58 | converted using the indicated encoding. One of: 59 | 60 | * `'strict'`: Raise an exception for any illegal substrings. 61 | * `'replace'`: Replace illegal substrings with `replacement_char`. 62 | * `'ignore'`: Skip illegal substrings. 63 | 64 | * `replacement_char`: The replacement codepoint to be used in place of 65 | invalid substrings in `input` when `errors='replace'`; and in place of C0 66 | control characters in `input` when `replace_control_characters=True`. 67 | 68 | * `replace_control_characters`: Whether to replace the C0 control 69 | characters `(U+0000 - U+001F)` with the `replacement_char`. 70 | 71 | #### Returns: 72 | 73 | A RaggedTensor of `fragment_start`, `fragment_end`, `fragment_properties` 74 | and `terminal_punc_token`. 75 | 76 | `fragment_properties` is an int32 bitmask whose values may contain: 77 | 1 = fragment ends with terminal punctuation 78 | 2 = fragment ends with multiple terminal punctuations (e.g. 79 | "She said what?!") 80 | 3 = Has close parenthesis (e.g. "Mushrooms (they're fungi).") 81 | 4 = Has sentential close parenthesis (e.g. 
"(Mushrooms are fungi!)" 82 | 83 | `terminal_punc_token` is a RaggedTensor containing the index of terminal 84 | punctuation token immediately following the last word in the fragment -- or 85 | index of the last word itself, if it's an acronym (since acronyms include the 86 | terminal punctuation). index of the terminal punctuation token. 87 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/span_overlaps.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.span_overlaps 7 | 8 | Returns a boolean tensor indicating which source and target spans overlap. 9 | 10 | ``` python 11 | text.span_overlaps( 12 | source_start, 13 | source_limit, 14 | target_start, 15 | target_limit, 16 | contains=False, 17 | contained_by=False, 18 | partial_overlap=False, 19 | name=None 20 | ) 21 | ``` 22 | 23 | Defined in 24 | [`python/ops/pointer_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pointer_ops.py). 25 | 26 | 27 | 28 | The source and target spans are specified using B+1 dimensional tensors, 29 | with `B>=0` batch dimensions followed by a final dimension that lists the 30 | span offsets for each span in the batch: 31 | 32 | * The `i`th source span in batch `b1...bB` starts at 33 | `source_start[b1...bB, i]` (inclusive), and extends to just before 34 | `source_limit[b1...bB, i]` (exclusive). 35 | * The `j`th target span in batch `b1...bB` starts at 36 | `target_start[b1...bB, j]` (inclusive), and extends to just before 37 | `target_limit[b1...bB, j]` (exclusive). 38 | 39 | `result[b1...bB, i, j]` is true if the `i`th source span overlaps with the 40 | `j`th target span in batch `b1...bB`, where a source span overlaps a target 41 | span if any of the following are true: 42 | 43 | * The spans are identical. 44 | * `contains` is true, and the source span contains the target span. 45 | * `contained_by` is true, and the source span is contained by the target 46 | span. 47 | * `partial_overlap` is true, and there is a non-zero overlap between the 48 | source span and the target span. 49 | 50 | #### Args: 51 | 52 | * `source_start`: A B+1 dimensional potentially ragged tensor with 53 | shape `[D1...DB, source_size]`: the start offset of each source span. 54 | * `source_limit`: A B+1 dimensional potentially ragged tensor with 55 | shape `[D1...DB, source_size]`: the limit offset of each source span. 56 | * `target_start`: A B+1 dimensional potentially ragged tensor with 57 | shape `[D1...DB, target_size]`: the start offset of each target span. 58 | * `target_limit`: A B+1 dimensional potentially ragged tensor with 59 | shape `[D1...DB, target_size]`: the limit offset of each target span. 60 | * `contains`: If true, then a source span is considered to overlap a 61 | target span when the source span contains the target span. 62 | * `contained_by`: If true, then a source span is considered to overlap 63 | a target span when the source span is contained by the target span. 64 | * `partial_overlap`: If true, then a source span is considered to 65 | overlap a target span when the source span partially overlaps the target 66 | span. 67 | * `name`: A name for the operation (optional). 68 | 69 | #### Returns: 70 | 71 | A B+2 dimensional potentially ragged boolean tensor with shape 72 | `[D1...DB, source_size, target_size]`. 73 | 74 | #### Raises: 75 | 76 | * `ValueError`: If the span tensors are incompatible. 
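A minimal call sketch (hypothetical span offsets; assumes the package is imported as `text`); the worked example in the next section illustrates the overlap semantics in more detail:

```python
import tensorflow_text as text

# Two source spans and two target spans over the same text, no batch dimensions.
source_start, source_limit = [0, 10], [5, 15]
target_start, target_limit = [0, 12], [5, 15]

# result[i, j] is True where source span i overlaps target span j.
result = text.span_overlaps(source_start, source_limit,
                            target_start, target_limit,
                            partial_overlap=True)
```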
77 | 78 | #### Example: 79 | Given the following source and target spans (with no batch dimensions): 80 | 81 | ```python 82 | # 0 5 10 15 20 25 30 35 40 83 | # |====|====|====|====|====|====|====|====| 84 | # Source: [-0-] [-1-] [2] [-3-][-4-][-5-] 85 | # Target: [-0-][-1-] [-2-] [3] [-4-][-5-] 86 | # |====|====|====|====|====|====|====|====| 87 | >>> source_start = [0, 10, 16, 20, 25, 30] 88 | >>> source_limit = [5, 15, 19, 25, 30, 35] 89 | >>> target_start = [0, 5, 15, 21, 27, 31] 90 | >>> target_limit = [5, 10, 20, 24, 32, 37] 91 | ``` 92 | 93 | `result[i, j]` will be true at the following locations: 94 | 95 | * `[0, 0]` (always) 96 | * `[2, 2]` (if contained_by=True or partial_overlap=True) 97 | * `[3, 3]` (if contains=True or partial_overlap=True) 98 | * `[4, 4]` (if partial_overlap=True) 99 | * `[5, 5]` (if partial_overlap=True) -------------------------------------------------------------------------------- /docs/api_docs/python/text/Tokenizer.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 14 | # text.Tokenizer 15 | 16 | ## Class `Tokenizer` 17 | 18 | Base class (abstract class) for all tokenizers. 19 | 20 | Defined in 21 | [`python/ops/tokenization.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/tokenization.py). 22 | 23 | 24 | 25 |

### `__init__`

26 | 27 | ```python 28 | __init__(name=None) 29 | ``` 30 | 31 | ## Properties 32 | 33 |

### `name`

34 | 35 | Returns the name of this module as passed or determined in the ctor. 36 | 37 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 38 | module names. 39 | 40 |

### `name_scope`

41 | 42 | Returns a `tf.name_scope` instance for this class. 43 | 44 |

### `submodules`

45 | 46 | Sequence of all sub-modules. 47 | 48 | Submodules are modules which are properties of this module, or found as 49 | properties of modules which are properties of this module (and so on). 50 | 51 | ``` 52 | a = tf.Module() 53 | b = tf.Module() 54 | c = tf.Module() 55 | a.b = b 56 | b.c = c 57 | assert list(a.submodules) == [b, c] 58 | assert list(b.submodules) == [c] 59 | assert list(c.submodules) == [] 60 | ``` 61 | 62 | #### Returns: 63 | 64 | A sequence of all submodules. 65 | 66 |

### `trainable_variables`

67 | 68 | Sequence of variables owned by this module and its submodules. 69 | 70 | Note: this method uses reflection to find variables on the current instance and 71 | submodules. For performance reasons you may wish to cache the result of calling 72 | this method if you don't expect the return value to change. 73 | 74 | #### Returns: 75 | 76 | A sequence of variables for the current module (sorted by attribute name) 77 | followed by variables from all submodules recursively (breadth first). 78 | 79 |

### `variables`

80 | 81 | Sequence of variables owned by this module and its submodules. 82 | 83 | Note: this method uses reflection to find variables on the current instance and 84 | submodules. For performance reasons you may wish to cache the result of calling 85 | this method if you don't expect the return value to change. 86 | 87 | #### Returns: 88 | 89 | A sequence of variables for the current module (sorted by attribute name) 90 | followed by variables from all submodules recursively (breadth first). 91 | 92 | ## Methods 93 | 94 |

### `tokenize`

95 | 96 | ```python 97 | tokenize(input) 98 | ``` 99 | 100 | Abstract function for tokenization. 101 | 102 | #### Args: 103 | 104 | * `input`: An N-dimensional UTF-8 string (or optionally integer) Tensor 105 | or RaggedTensor. 106 | 107 | #### Returns: 108 | 109 | An N+1-dimensional UTF-8 string or integer Tensor or RaggedTensor. 110 | 111 |
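For a concrete subclass the call looks like this (an illustrative sketch using the `WhitespaceTokenizer` documented elsewhere in these docs):

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
# Adds one ragged dimension of tokens, returned as a RaggedTensor.
tokens = tokenizer.tokenize(["the quick brown fox", "jumped over"])
```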

### `with_name_scope`

112 | 113 | ```python 114 | with_name_scope( 115 | cls, 116 | method 117 | ) 118 | ``` 119 | 120 | Decorator to automatically enter the module name scope. 121 | 122 | ``` 123 | class MyModule(tf.Module): 124 | @tf.Module.with_name_scope 125 | def __call__(self, x): 126 | if not hasattr(self, 'w'): 127 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 128 | return tf.matmul(x, self.w) 129 | ``` 130 | 131 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 132 | included the module name: 133 | 134 | ``` 135 | mod = MyModule() 136 | mod(tf.ones([8, 32])) 137 | # ==> 138 | mod.w 139 | # ==> 140 | ``` 141 | 142 | #### Args: 143 | 144 | * `method`: The method to wrap. 145 | 146 | #### Returns: 147 | 148 | The original method wrapped such that it enters the module's name scope. 149 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/constrained_sequence_op.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "tensorflow/core/framework/op.h" 16 | #include "tensorflow/core/framework/shape_inference.h" 17 | 18 | namespace tensorflow { 19 | 20 | REGISTER_OP("ConstrainedSequence") 21 | .Attr("Tin: {int32, int64}") 22 | .Attr("Tsplits: {int32, int64} = DT_INT64") 23 | .Attr("use_viterbi: bool") 24 | .Attr("use_log_space: bool") 25 | .Attr("use_start_and_end_states: bool") 26 | .Input("scores: float") 27 | .Input("sequence_lengths: Tin") 28 | .Input("allowed_transitions: bool") 29 | .Input("transition_weights: float") 30 | .Output("states: int32") 31 | .Output("states_splits: Tsplits") 32 | 33 | // TODO(b/122968457): Implement a shape function. 34 | .Doc(R"doc( 35 | Constrains a set of predictions based on a set of legal transitions and/or a 36 | set of transition weights, returning the legal sequence that maximizes the 37 | product of the state scores and the transition weights using the chained 38 | conditional random field algorithm. (In case of a tie, the state with a higher 39 | index will be chosen.) 40 | 41 | This op takes in a set of scores and outputs the most likely legal sequence 42 | for each batch element, where the most likely legal sequence is determined by 43 | the optional 'allowed_transitions' and 'transition_weights' tensors. 44 | 45 | The 'allowed_transition' tensor may be omitted; if it is, all sequence states 46 | will be allowed to transition to all other sequence states. If the tensor is 47 | provided it must be of the size [num_states+1][num_states+1]. 48 | 49 | allowed_transitions[i][j] is true if the transition from state i to state 50 | j is allowed for i and j in 0...(num_states). 51 | allowed_transitions[num_states][j] is true if the sequence is allowed to 52 | start from state j. 53 | allowed_transitions[i][num_states] is true if the sequence is allowed to 54 | end on state i. 
55 | allowed_transitions[num_states][num_states] is ignored. 56 | 57 | The 'transition_weights' tensor may be omitted; if it is, all transitions will 58 | be weighted with a value of 1.0. If the tensor is provided it must be of the 59 | size [num_states+1][num_states+1]. 60 | 61 | transition_weights[i][j] is the coefficient that a candidate transition score 62 | will be multiplied by if that transition is from state i to state j. 63 | transition_weights[num_states][j] is the coefficient that will be used 64 | if the transition starts with state j. 65 | transition_weights[i][num_states] is the coefficient that will be used 66 | if the final state in the sequence is state i. 67 | transition_weights[num_states][num_states] is ignored. 68 | 69 | This op outputs a RaggedTensor value and splits pair. 70 | 71 | scores: [batch_size, num_steps, |num_states|] A tensor of scores, where 72 | `scores[b, t, s]` is the predicted score for transitioning to state `s` 73 | at step `t` for batch `b`. The |num_states| dimension must correspond 74 | to the num_states attribute for this op. 75 | sequence_lengths: <{int32, int64}>[batch_size] A tensor containing the length 76 | of each sequence in the batch. 77 | allowed_transitions: [num_states+1, num_states+1] A boolean matrix of 78 | allowed transitions, or an empty matrix '[]' to allow all transitions. 79 | transition_weights: [num_states+1, num_states+1] A float matrix of score 80 | coefficients, or an empty matrix '[]' to weight all transitions equally. 81 | states: [batch_size, max_sequence_length] OR [total_num_states] 82 | A set of sequence outputs representing the most likely valid sequences 83 | for each batch. If `output_ragged_tensor` is false, this will be in 84 | [batch_size, max_sequence_length] form; if `output_ragged_tensor` is 85 | true, this will be a RaggedTensor data vector of shape 86 | [total_num_states]. 87 | states_splits: [batch_size+1] A RaggedTensor splits vector. If 88 | `output_ragged_tensor` is true, then the state sequence for input `i` 89 | is stored in `states[states_splits[i]:states_splits[i+1]]`. If 90 | `output_ragged_tensor` is false, this tensor will be empty and can be 91 | ignored. 92 | )doc"); 93 | 94 | } // namespace tensorflow 95 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/sliding_window.md: -------------------------------------------------------------------------------- 1 |
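The `ConstrainedSequence` op above is exposed in Python through `text.greedy_constrained_sequence` and `text.viterbi_constrained_sequence` (documented later in these docs). A minimal sketch with hypothetical scores and transition constraints:

```python
import numpy as np
import tensorflow_text as text

# One batch element, two steps, three candidate states (hypothetical values).
scores = np.array([[[10.0, 12.0, 7.0],
                    [4.0, 3.0, 11.0]]], dtype=np.float32)

# allowed[i][j]: may state i transition to state j? (no implicit start/end states here)
allowed = np.array([[True, True, False],
                    [True, False, True],
                    [False, True, True]])

states = text.greedy_constrained_sequence(
    scores, allowed_transitions=allowed,
    use_log_space=True, use_start_and_end_states=False)
```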
2 | 3 | 4 |
5 | 6 | # text.sliding_window 7 | 8 | Builds a sliding window for `data` with a specified width. 9 | 10 | ``` python 11 | text.sliding_window( 12 | data, 13 | width, 14 | axis=-1, 15 | name=None 16 | ) 17 | ``` 18 | 19 | Defined in 20 | [`python/ops/sliding_window_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/sliding_window_op.py). 21 | 22 | 23 | 24 | Returns a tensor constructed from `data`, where each element in 25 | dimension `axis` is a slice of `data` starting at the corresponding 26 | position, with the given width and step size. I.e.: 27 | 28 | * `result.shape.ndims = data.shape.ndims + 1` 29 | * `result[i1..iaxis, a] = data[i1..iaxis, a:a+width]` 30 | (where `0 <= a < data[i1...iaxis].shape[0] - (width - 1)`). 31 | 32 | Note that each result row (along dimension `axis`) has `width - 1` fewer items 33 | than the corresponding `data` row. If a `data` row has fewer than `width` 34 | items, then the corresponding `result` row will be empty. If you wish for 35 | the `result` rows to be the same size as the `data` rows, you can use 36 | `pad_along_dimension` to add `width - 1` padding elements before calling 37 | this op. 38 | 39 | #### Args: 40 | 41 | * `data`: ` [O1...ON, A, I1...IM]` A potentially ragged 42 | K-dimensional tensor with outer dimensions of size `O1...ON`; axis dimension 43 | of size `A`; and inner dimensions of size `I1...IM`. I.e. `K = N + 1 + M`, 44 | where `N>=0` and `M>=0`. 45 | 46 | * `width`: An integer constant specifying the width of the window. Must 47 | be greater than zero. 48 | 49 | * `axis`: An integer constant specifying the axis along which sliding 50 | window is computed. Negative axis values from `-K` to `-1` are supported. 51 | 52 | * `name`: The name for this op (optional) 53 | 54 | #### Returns: 55 | 56 | A `K+1` dimensional tensor with the same dtype as `data`, where: 57 | 58 | * `result[i1..iaxis, a]` = `data[i1..iaxis, a:a+width]` 59 | * `result.shape[:axis]` = `data.shape[:axis]` 60 | * `result.shape[axis]` = `data.shape[axis] - (width - 1)` 61 | * `result.shape[axis + 1]` = `width` 62 | * `result.shape[axis + 2:]` = `data.shape[axis + 1:]` 63 | 64 | #### Examples: 65 | 66 | Sliding window (width=3) across a sequence of tokens: 67 | 68 | ```python 69 | >>> # input: [sequence_length] 70 | >>> input = tf.constant(["one", "two", "three", "four", "five", "six"]) 71 | >>> # output: [sequence_length-2, 3] 72 | >>> output = sliding_window(data=input, width=3, axis=0) 73 | >>> print output.eval() 74 | [["one", "two", "three"], 75 | ["two", "three", "four"], 76 | ["three", "four", "five"], 77 | ["four", "five", "six"]] 78 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 79 | Shape: (6,) -> (4, 3) 80 | ``` 81 | 82 | Sliding window (width=2) across the inner dimension of a ragged matrix 83 | containing a batch of token sequences: 84 | 85 | ```python 86 | >>> # input: [num_sentences, (num_words)] 87 | >>> input = tf.ragged.constant( 88 | ... [['Up', 'high', 'in', 'the', 'air'], 89 | ... ['Down', 'under', 'water'], 90 | ... ['Away', 'to', 'outer', 'space']] 91 | >>> # output: [num_sentences, (num_word-1), 2] 92 | >>> output = sliding_window(input, width=2, axis=-1) 93 | >>> print output.eval() 94 | [[['Up', 'high'], ['high', 'in'], ['in', 'the'], ['the', 'air']], 95 | [['Down', 'under'], ['under', 'water']], 96 | [['Away', 'to'], ['to', 'outer'], ['outer', 'space']]] 97 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 98 | Shape: (3, ?) 
-> (3, ?, 2) 99 | ``` 100 | 101 | Sliding window across the second dimension of a 3-D tensor containing 102 | batches of sequences of embedding vectors: 103 | 104 | ```python 105 | >>> # input: [num_sequences, sequence_length, embedding_size] 106 | >>> input = tf.constant([ 107 | ... [[1, 1, 1], [2, 2, 1], [3, 3, 1], [4, 4, 1], [5, 5, 1]], 108 | ... [[1, 1, 2], [2, 2, 2], [3, 3, 2], [4, 4, 2], [5, 5, 2]]]) 109 | >>> # output: [num_sequences, sequence_length-1, 2, embedding_size] 110 | >>> output = sliding_window(data=input, width=2, axis=1) 111 | >>> print output.eval() 112 | [[[[1, 1, 1], [2, 2, 1]], 113 | [[2, 2, 1], [3, 3, 1]], 114 | [[3, 3, 1], [4, 4, 1]], 115 | [[4, 4, 1], [5, 5, 1]]], 116 | [[[1, 1, 2], [2, 2, 2]], 117 | [[2, 2, 2], [3, 3, 2]], 118 | [[3, 3, 2], [4, 4, 2]], 119 | [[4, 4, 2], [5, 5, 2]]]] 120 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 121 | Shape: (2, 5, 3) -> (2, 4, 2, 3) 122 | ``` 123 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/ngrams_op.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tensorflow ngram operations.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import enum 23 | 24 | from tensorflow.python.framework import errors 25 | from tensorflow.python.framework import ops 26 | from tensorflow.python.ops import math_ops 27 | from tensorflow.python.ops import string_ops 28 | from tensorflow.python.ops.ragged import ragged_functional_ops 29 | from tensorflow.python.ops.ragged import ragged_tensor 30 | from tensorflow_text.python.ops.sliding_window_op import sliding_window 31 | 32 | 33 | class Reduction(enum.Enum): 34 | """Type of reduction to be done by the ngram op. 35 | 36 | The supported reductions are as follows: 37 | 38 | * `Reduction.SUM`: Add values in the window. 39 | * `Reduction.MEAN`: Average values in the window. 40 | * `Reduction.STRING_JOIN`: Join strings in the window. 41 | """ 42 | 43 | SUM = 1 44 | MEAN = 2 45 | STRING_JOIN = 3 46 | 47 | 48 | def ngrams(data, 49 | width, 50 | axis=-1, 51 | reduction_type=None, 52 | string_separator=" ", 53 | name=None): 54 | """Create a tensor of n-grams based on the input data `data`. 55 | 56 | Creates a tensor of n-grams based on `data`. The n-grams are of width `width` 57 | and are created along axis `axis`; the n-grams are created by combining 58 | windows of `width` adjacent elements from `data` using `reduction_type`. This 59 | op is intended to cover basic use cases; more complex combinations can be 60 | created using the sliding_window op. 61 | 62 | Args: 63 | data: The data to reduce. 64 | width: The width of the ngram window. If there is not sufficient data to 65 | fill out the ngram window, the resulting ngram will be empty. 
66 | axis: The axis to create ngrams along. Note that for string join reductions, 67 | only axis '-1' is supported; for other reductions, any positive or 68 | negative axis can be used. Should be a constant. 69 | reduction_type: A member of the Reduction enum. Should be a constant. 70 | Currently supports: 71 | 72 | * `Reduction.SUM`: Add values in the window. 73 | * `Reduction.MEAN`: Average values in the window. 74 | * `Reduction.STRING_JOIN`: Join strings in the window. 75 | Note that axis must be -1 here. 76 | 77 | string_separator: The separator string used for `Reduction.STRING_JOIN`. 78 | Ignored otherwise. Must be a string constant, not a Tensor. 79 | name: The op name. 80 | 81 | Returns: 82 | A tensor of ngrams. 83 | 84 | Raises: 85 | InvalidArgumentError: if `reduction_type` is either None or not a Reduction, 86 | or if `reduction_type` is STRING_JOIN and `axis` is not -1. 87 | """ 88 | 89 | with ops.name_scope(name, "NGrams", [data, width]): 90 | if reduction_type is None: 91 | raise errors.InvalidArgumentError(None, None, 92 | "reduction_type must be specified.") 93 | 94 | if not isinstance(reduction_type, Reduction): 95 | raise errors.InvalidArgumentError(None, None, 96 | "reduction_type must be a Reduction.") 97 | 98 | # TODO(b/122967921): Lift this restriction after ragged_reduce_join is done. 99 | if reduction_type is Reduction.STRING_JOIN and axis != -1: 100 | raise errors.InvalidArgumentError( 101 | None, None, "%s requires that ngrams' 'axis' parameter be -1." % 102 | Reduction.STRING_JOIN.name) 103 | 104 | windowed_data = sliding_window(data, width, axis) 105 | 106 | if axis < 0: 107 | reduction_axis = axis 108 | else: 109 | reduction_axis = axis + 1 110 | 111 | # Ragged reduction ops work on both Tensor and RaggedTensor, so we can 112 | # use them here regardless of the type of tensor in 'windowed_data'. 113 | if reduction_type is Reduction.SUM: 114 | return math_ops.reduce_sum(windowed_data, reduction_axis) 115 | elif reduction_type is Reduction.MEAN: 116 | return math_ops.reduce_mean(windowed_data, reduction_axis) 117 | elif reduction_type is Reduction.STRING_JOIN: 118 | if isinstance(data, ragged_tensor.RaggedTensor): 119 | return ragged_functional_ops.map_flat_values( 120 | string_ops.reduce_join, 121 | windowed_data, 122 | axis=axis, 123 | separator=string_separator) 124 | else: 125 | return string_ops.reduce_join( 126 | windowed_data, axis=axis, separator=string_separator) 127 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/normalize_ops_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # coding=utf-8 17 | """Tests for normalization ops in tensorflow_text.""" 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from tensorflow.python.framework import errors 24 | from tensorflow.python.framework import test_util 25 | from tensorflow.python.ops.ragged import ragged_factory_ops 26 | from tensorflow.python.ops.ragged import ragged_test_util 27 | from tensorflow.python.platform import test 28 | from tensorflow_text.python.ops import normalize_ops 29 | 30 | 31 | @test_util.run_all_in_graph_and_eager_modes 32 | class NormalizeOpsTest(ragged_test_util.RaggedTensorTestCase): 33 | 34 | def test_lowercase_one_string(self): 35 | txt = [ 36 | " TExt to loWERcase! ", 37 | ] 38 | expected = [ 39 | " text to lowercase! ", 40 | ] 41 | self.assertAllEqual(expected, normalize_ops.case_fold_utf8(txt)) 42 | 43 | def test_lowercase_text(self): 44 | txt = [ 45 | "Punctuation and digits: -*/+$#%@%$123456789#^$*%&", 46 | "Non-latin UTF8 chars: ΘͽʦȺЩ", 47 | "Accented chars: ĎÔPQRŔSŠoóôpqrŕsštťuúvwxyý", 48 | "Non-UTF8-letters: e.g. ◆, ♥, and the emoji symbol ( ͡° ͜ʖ ͡°)", 49 | "Folded: ßς", 50 | "" 51 | ] 52 | expected = [ 53 | "punctuation and digits: -*/+$#%@%$123456789#^$*%&", 54 | "non-latin utf8 chars: θͽʦⱥщ", 55 | "accented chars: ďôpqrŕsšoóôpqrŕsštťuúvwxyý", 56 | "non-utf8-letters: e.g. ◆, ♥, and the emoji symbol ( ͡° ͜ʖ ͡°)", 57 | "folded: ssσ", 58 | "" 59 | ] 60 | self.assertAllEqual(expected, normalize_ops.case_fold_utf8(txt)) 61 | 62 | def test_lowercase_one_string_ragged(self): 63 | txt = ragged_factory_ops.constant([[" TExt ", "to", " loWERcase! "], 64 | [" TExt to loWERcase! "]]) 65 | expected = [[" text ", "to", " lowercase! "], [" text to lowercase! 
"]] 66 | self.assertRaggedEqual(expected, normalize_ops.case_fold_utf8(txt)) 67 | 68 | def test_lowercase_empty_string(self): 69 | txt = [ 70 | "", 71 | ] 72 | expected = [ 73 | "", 74 | ] 75 | self.assertAllEqual(expected, normalize_ops.case_fold_utf8(txt)) 76 | 77 | def test_normalize_nfkc(self): 78 | txt = [ 79 | u"\u1e9b\u0323", 80 | ] 81 | expected = [ 82 | u"ṩ".encode("utf-8"), 83 | ] 84 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFKC")) 85 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfkc")) 86 | 87 | def test_normalize_nfkc_batch(self): 88 | txt = [ 89 | u"\u1e9b\u0323", 90 | u"\ufb01", 91 | ] 92 | expected = [ 93 | u"ṩ".encode("utf-8"), 94 | "fi", 95 | ] 96 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFKC")) 97 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfkc")) 98 | 99 | def test_normalize_nfkc_ragged(self): 100 | txt = ragged_factory_ops.constant([[[u"\u1e9b\u0323 \ufb01"], []], 101 | [[u"\u1e9b\u0323", u"\ufb01"]]]) 102 | expected = [[[u"ṩ fi".encode("utf-8")], []], [[u"ṩ".encode("utf-8"), "fi"]]] 103 | self.assertRaggedEqual(expected, normalize_ops.normalize_utf8(txt, "NFKC")) 104 | self.assertRaggedEqual(expected, normalize_ops.normalize_utf8(txt, "nfkc")) 105 | 106 | def test_normalize_nfc(self): 107 | txt = [ 108 | u"\u1e9b\u0323", 109 | ] 110 | expected = [ 111 | u"\u1e9b\u0323".encode("utf-8"), 112 | ] 113 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFC")) 114 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfc")) 115 | 116 | def test_normalize_nfd(self): 117 | txt = [u"\u1e9b\u0323"] 118 | expected = [ 119 | u"\u017f\u0323\u0307".encode("utf-8"), 120 | ] 121 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFD")) 122 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfd")) 123 | 124 | def test_normalize_nfkd(self): 125 | txt = [ 126 | u"\u1e9b\u0323", 127 | ] 128 | expected = [ 129 | u"\u0073\u0323\u0307".encode("utf-8"), 130 | ] 131 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFKD")) 132 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfkd")) 133 | 134 | def test_unknown_normalization_form(self): 135 | with self.assertRaises(errors.InvalidArgumentError): 136 | bomb = normalize_ops.normalize_utf8(["cant readme", "wont read me"], 137 | "cantfindme") 138 | self.evaluate(bomb) 139 | 140 | 141 | if __name__ == "__main__": 142 | test.main() 143 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/TokenizerWithOffsets.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | # text.TokenizerWithOffsets 16 | 17 | ## Class `TokenizerWithOffsets` 18 | 19 | Base class (abstract class) for all tokenizers that return offsets. 20 | 21 | Inherits From: [`Tokenizer`](../text/Tokenizer.md) 22 | 23 | Defined in 24 | [`python/ops/tokenization.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/tokenization.py). 25 | 26 | 27 | 28 |

### `__init__`

29 | 30 | ```python 31 | __init__(name=None) 32 | ``` 33 | 34 | ## Properties 35 | 36 |

### `name`

37 | 38 | Returns the name of this module as passed or determined in the ctor. 39 | 40 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 41 | module names. 42 | 43 |

### `name_scope`

44 | 45 | Returns a `tf.name_scope` instance for this class. 46 | 47 |

### `submodules`

48 | 49 | Sequence of all sub-modules. 50 | 51 | Submodules are modules which are properties of this module, or found as 52 | properties of modules which are properties of this module (and so on). 53 | 54 | ``` 55 | a = tf.Module() 56 | b = tf.Module() 57 | c = tf.Module() 58 | a.b = b 59 | b.c = c 60 | assert list(a.submodules) == [b, c] 61 | assert list(b.submodules) == [c] 62 | assert list(c.submodules) == [] 63 | ``` 64 | 65 | #### Returns: 66 | 67 | A sequence of all submodules. 68 | 69 |

### `trainable_variables`

70 | 71 | Sequence of variables owned by this module and its submodules. 72 | 73 | Note: this method uses reflection to find variables on the current instance and 74 | submodules. For performance reasons you may wish to cache the result of calling 75 | this method if you don't expect the return value to change. 76 | 77 | #### Returns: 78 | 79 | A sequence of variables for the current module (sorted by attribute name) 80 | followed by variables from all submodules recursively (breadth first). 81 | 82 |

### `variables`

83 | 84 | Sequence of variables owned by this module and its submodules. 85 | 86 | Note: this method uses reflection to find variables on the current instance and 87 | submodules. For performance reasons you may wish to cache the result of calling 88 | this method if you don't expect the return value to change. 89 | 90 | #### Returns: 91 | 92 | A sequence of variables for the current module (sorted by attribute name) 93 | followed by variables from all submodules recursively (breadth first). 94 | 95 | ## Methods 96 | 97 |

### `tokenize`

98 | 99 | ```python 100 | tokenize(input) 101 | ``` 102 | 103 | Abstract function for tokenization. 104 | 105 | #### Args: 106 | 107 | * `input`: An N-dimensional UTF-8 string (or optionally integer) Tensor 108 | or RaggedTensor. 109 | 110 | #### Returns: 111 | 112 | An N+1-dimensional UTF-8 string or integer Tensor or RaggedTensor. 113 | 114 |

### `tokenize_with_offsets`

115 | 116 | ```python 117 | tokenize_with_offsets(input) 118 | ``` 119 | 120 | Abstract function for tokenization with offsets. 121 | 122 | #### Args: 123 | 124 | * `input`: An N-dimensional UTF-8 string (or optionally integer) Tensor 125 | or RaggedTensor. 126 | 127 | #### Returns: 128 | 129 | A tuple (tokens, start_offsets, limit_offsets): * tokens is an N+1-dimensional 130 | UTF-8 string or integer Tensor or RaggedTensor. * start_offsets is an 131 | N+1-dimensional integer Tensor containing the starting indices of each token 132 | (byte indices for input strings). * limit_offsets is an N+1-dimensional integer 133 | Tensor containing the exclusive ending indices of each token (byte indices for 134 | input strings). 135 | 136 |
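For a concrete subclass the offsets variant looks like this (an illustrative sketch using the `WhitespaceTokenizer` documented elsewhere in these docs):

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
# starts/limits are byte offsets of each token into the original strings.
tokens, starts, limits = tokenizer.tokenize_with_offsets(["the quick brown fox"])
```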

### `with_name_scope`

137 | 138 | ```python 139 | with_name_scope( 140 | cls, 141 | method 142 | ) 143 | ``` 144 | 145 | Decorator to automatically enter the module name scope. 146 | 147 | ``` 148 | class MyModule(tf.Module): 149 | @tf.Module.with_name_scope 150 | def __call__(self, x): 151 | if not hasattr(self, 'w'): 152 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 153 | return tf.matmul(x, self.w) 154 | ``` 155 | 156 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 157 | included the module name: 158 | 159 | ``` 160 | mod = MyModule() 161 | mod(tf.ones([8, 32])) 162 | # ==> 163 | mod.w 164 | # ==> 165 | ``` 166 | 167 | #### Args: 168 | 169 | * `method`: The method to wrap. 170 | 171 | #### Returns: 172 | 173 | The original method wrapped such that it enters the module's name scope. 174 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/greedy_constrained_sequence.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.greedy_constrained_sequence 7 | 8 | Performs greedy constrained sequence on a batch of examples. 9 | 10 | ``` python 11 | text.greedy_constrained_sequence( 12 | scores, 13 | sequence_length=None, 14 | allowed_transitions=None, 15 | transition_weights=None, 16 | use_log_space=False, 17 | use_start_and_end_states=False, 18 | name=None 19 | ) 20 | ``` 21 | 22 | Defined in 23 | [`python/ops/greedy_constrained_sequence_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/greedy_constrained_sequence_op.py). 24 | 25 | 26 | 27 | Constrains a set of predictions based on a set of legal transitions 28 | and/or a set of transition weights, returning the legal sequence that 29 | maximizes the product or sum of the state scores and the transition weights 30 | at each step. If use_log_space is true, the sum is used; if false, the 31 | product is used. 32 | 33 | This op also takes a parameter 'use_start_and_end_states', which when true 34 | will add an implicit start and end state to each sequence. These implicit 35 | states allow the user to specify additional weights and permitted transitions 36 | to start and end a sequence (so, for instance, if you wanted to forbid your 37 | output from ending in a certain set of states you could do so). 38 | 39 | Inputs to this op can take one of three forms: a single Tensorflow tensor 40 | of scores with no sequence lengths, a Tensorflow tensor of scores along 41 | with a Tensorflow tensor of sequence lengths, or a RaggedTensor. If only the 42 | scores tensor is passed, this op will assume that the sequence lengths are 43 | equal to the size of the tensor (and so use all the data provided). If a 44 | scores tensor and sequence_lengths tensor is provided, the op will only 45 | use the data in the scores tensor as specified by the sequence_lengths tensor. 46 | Finally, if a RaggedTensor is provided, the sequence_lengths will be ignored 47 | and the variable length sequences in the RaggedTensor will be used. 48 | 49 | #### Args: 50 | 51 | * `scores`: ` [batch_size, num_steps, |num_states|]` A tensor 52 | of scores, where `scores[b, t, s]` is the predicted score for transitioning 53 | to state `s` at step `t` for batch `b`. The |num_states| dimension must 54 | correspond to the num_states attribute for this op. This input may be 55 | ragged; if it is ragged, the ragged tensor should have the same structure 56 | [b, t, s] and only axis 1 should be ragged. 57 | 58 | * `sequence_length`: `<{int32, int64}>[batch_size]` A rank-1 tensor 59 | representing the length of the output sequence. If None, and the 'scores' 60 | input is not ragged, sequence lengths will be assumed to be the length of 61 | the score tensor. 62 | 63 | * `allowed_transitions`: if use_start_and_end_states is TRUE: 64 | `[num_states+1, num_states+1]` if use_start_and_end_states is FALSE: 65 | `[num_states, num_states]` A rank-2 tensor representing allowed 66 | transitions. 67 | 68 | - allowed_transitions[i][j] is true if the transition from state i to 69 | state j is allowed for i and j in 0...(num_states). 70 | - allowed_transitions[num_states][num_states] is ignored. If 71 | use_start_and_end_states is TRUE: 72 | - allowed_transitions[num_states][j] is true if the sequence is allowed to 73 | start from state j. 74 | - allowed_transitions[i][num_states] is true if the sequence is allowed to 75 | end on state i. Default - An empty tensor. This allows all sequence 76 | states to transition to all other sequence states. 
77 | 78 | * `transition_weights`: if use_start_and_end_states is TRUE: 79 | `[num_states+1, num_states+1]` if use_start_and_end_states is 80 | FALSE: `[num_states, num_states]` A rank-2 tensor representing 81 | transition weights. 82 | 83 | - transition_weights[i][j] is the coefficient that a candidate transition 84 | score will be multiplied by if that transition is from state i to state 85 | j. 86 | - transition_weights[num_states][num_states] is ignored. If 87 | use_start_and_end_states is TRUE: 88 | - transition_weights[num_states][j] is the coefficient that will be used 89 | if the transition starts with state j. 90 | - transition_weights[i][num_states] is the coefficient that will be used 91 | if the final state in the sequence is state i. Default - An empty 92 | tensor. This assigns a weight of 1.0 to all transitions. 93 | 94 | * `use_log_space`: Whether to use log space for the calculation. If 95 | false, calculations will be done in exp-space. 96 | 97 | * `use_start_and_end_states`: If True, sequences will have an implicit 98 | start and end state added. 99 | 100 | * `name`: The name scope within which this op should be constructed. 101 | 102 | #### Returns: 103 | 104 | An [batch_size, (num_steps)] ragged tensor containing the appropriate 105 | sequence of transitions. If a sequence is impossible, the value of the 106 | RaggedTensor for that and all following transitions in that sequence shall be 107 | '-1'. 108 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/create_feature_bitmask_op.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Create feature bitmask op. 17 | 18 | Packs the innermost dimension of a boolean tensor into integer bitmask values. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | from tensorflow.python.framework import constant_op 26 | from tensorflow.python.framework import dtypes 27 | from tensorflow.python.framework import errors 28 | from tensorflow.python.framework import ops 29 | from tensorflow.python.ops import array_ops 30 | from tensorflow.python.ops import check_ops 31 | from tensorflow.python.ops import math_ops 32 | 33 | # The maximum number of bits that can be encoded by create_feature_bitmask 34 | # in each datatype. 35 | _max_bits = { 36 | dtypes.uint8: 8, 37 | dtypes.int8: 7, 38 | dtypes.uint16: 16, 39 | dtypes.int16: 15, 40 | dtypes.int32: 31, 41 | dtypes.int64: 63, 42 | } 43 | 44 | 45 | def create_feature_bitmask(tensor, dtype=dtypes.int32, name=None): 46 | """Packs the innermost dimension of a boolean tensor into integer values. 47 | 48 | `result[i1...iN]` is the integer formed by interpreting the booleans 49 | `tensor[i1...iN, 0:num_bits]` as individual bits, with big-endian order.
50 | E.g., if `tensor[i1...iN, 0:num_bits] = [True, False, False, True, False]`, 51 | then `result[i1...iN] = 0b10010 = 18`. The return tensor is of type `dtype`, 52 | if specified; if `dtype` is not set, `int32` will be used. 53 | 54 | If `num_bits` is too large to fit in `dtype`, then an exception is raised 55 | when this op is called (if `num_bits` is statically known) or when it is 56 | evaluated (if `num_bits` is not statically known). 57 | 58 | Args: 59 | tensor: `[D1...DN, num_bits]` The boolean tensor whose innermost 60 | dimension should be packed to form integer values. 61 | dtype: The datatype to output for this op (optional). 62 | name: The name for this op (optional). 63 | 64 | Returns: 65 | ` [D1...DN]` 66 | An integer tensor formed by interpreting the innermost dimension of 67 | `tensor` as individual bits. 68 | 69 | Raises: 70 | ValueError: If the data to be packed is too large for the chosen data 71 | type. 72 | ValueError: If the data to be packed is not boolean. 73 | InvalidArgumentError: If the input tensor is a list, or the dtype is not a 74 | supported integer type. 75 | 76 | Examples: 77 | ```python 78 | >>> assert create_feature_bitmask([True, False, False, True]) == 0b1001 79 | >>> create_feature_bitmask([[True, False], [False, True], [True, True]]) 80 | [0b10, 0b01, 0b11] 81 | ``` 82 | """ 83 | with ops.name_scope(name, 'CreateFeatureBitmask', [tensor]): 84 | if (isinstance(tensor, (list, tuple)) and tensor and 85 | isinstance(tensor[0], ops.Tensor)): 86 | raise errors.InvalidArgumentError( 87 | None, None, 88 | 'CreateFeatureBitmask does not support lists of tensors. Consider ' 89 | 'using tf.stack(list,-1) to create a single tensor before invoking ' 90 | 'this op.') 91 | 92 | tensor = ops.convert_to_tensor(tensor, dtypes.bool, 'tensor') 93 | 94 | if dtype not in _max_bits.keys(): 95 | raise errors.InvalidArgumentError( 96 | None, None, 'dtype must be one of: [%s], was %s' % 97 | (sorted(_max_bits), dtype.name)) 98 | 99 | integer_data = math_ops.cast(tensor, dtype=dtype) 100 | shape = tensor.shape 101 | if shape.ndims is not None and shape.dims[-1].value is not None: 102 | num_bits = shape.dims[-1].value 103 | if num_bits > 63: 104 | raise ValueError( 105 | 'data.shape[-1] must be less than 64, is %d.' % num_bits) 106 | elif num_bits > _max_bits[dtype]: 107 | raise ValueError( 108 | 'data.shape[-1] is too large for %s (was %d, cannot exceed %d); ' 109 | 'consider switching condense_boolean_tensor to a larger ' 110 | 'dtype.' % (dtype.name, num_bits, _max_bits[dtype])) 111 | bit_masks = constant_op.constant( 112 | [2**pos for pos in range(num_bits - 1, -1, -1)], dtype) 113 | else: 114 | bit_masks = constant_op.constant( 115 | [2**pos for pos in range(_max_bits[dtype] - 1, -1, -1)], dtype) 116 | num_bits = array_ops.shape(tensor)[-1] 117 | with ops.control_dependencies([ 118 | check_ops.assert_less_equal( 119 | num_bits, 120 | _max_bits[dtype], 121 | message='data.shape[-1] is too large for %s (cannot exceed %s)' % 122 | (dtype.name, _max_bits[dtype])) 123 | ]): 124 | # The second slice ("[:num_bits]") is a no-op unless num_bits==0. 125 | bit_masks = bit_masks[-num_bits:][:num_bits] 126 | return math_ops.reduce_sum(integer_data * bit_masks, axis=-1) 127 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/viterbi_constrained_sequence.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.viterbi_constrained_sequence 7 | 8 | Performs Viterbi constrained sequence decoding on a batch of examples. 9 | 10 | ``` python 11 | text.viterbi_constrained_sequence( 12 | scores, 13 | sequence_length=None, 14 | allowed_transitions=None, 15 | transition_weights=None, 16 | use_log_space=False, 17 | use_start_and_end_states=True, 18 | name=None 19 | ) 20 | ``` 21 | 22 | Defined in 23 | [`python/ops/viterbi_constrained_sequence_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/viterbi_constrained_sequence_op.py). 24 | 25 | 26 | 27 | Constrains a set of predictions based on a set of legal transitions 28 | and/or a set of transition weights, returning the legal sequence that 29 | maximizes the product of the state scores and the transition weights 30 | according to the Viterbi algorithm. If use_log_space is True, the Viterbi 31 | calculation will be performed in log space (with sums); if it is False, 32 | the Viterbi calculation will be performed in exp space (with normalized 33 | products). 34 | 35 | This op also takes a parameter 'use_start_and_end_states', which when true 36 | will add an implicit start and end state to each sequence. These implicit 37 | states allow the user to specify additional weights and permitted transitions 38 | to start and end a sequence (so, for instance, if you wanted to forbid your 39 | output from ending in a certain set of states you could do so). 40 | 41 | Inputs to this op can take one of three forms: a single Tensorflow tensor 42 | of scores with no sequence lengths, a Tensorflow tensor of scores along 43 | with a Tensorflow tensor of sequence lengths, or a RaggedTensor. If only the 44 | scores tensor is passed, this op will assume that the sequence lengths are 45 | equal to the size of the tensor (and so use all the data provided). If a 46 | scores tensor and sequence_lengths tensor is provided, the op will only 47 | use the data in the scores tensor as specified by the sequence_lengths tensor. 48 | Finally, if a RaggedTensor is provided, the sequence_lengths will be ignored 49 | and the variable length sequences in the RaggedTensor will be used. 50 | 51 | #### Args: 52 | 53 | * `scores`: ` [batch_size, num_steps, |num_states|]` A tensor 54 | of scores, where `scores[b, t, s]` is the predicted score for transitioning 55 | to state `s` at step `t` for batch `b`. The |num_states| dimension must 56 | correspond to the num_states attribute for this op. This input may be 57 | ragged; if it is ragged, the ragged tensor should have the same structure 58 | [b, t, s] and only axis 1 should be ragged. 59 | 60 | * `sequence_length`: `<{int32, int64}>[batch_size]` A rank-1 tensor 61 | representing the length of the output sequence. If None, and the 'scores' 62 | input is not ragged, sequence lengths will be assumed to be the length of 63 | the score tensor. 64 | 65 | * `allowed_transitions`: if use_start_and_end_states is TRUE: 66 | `[num_states+1, num_states+1]` if use_start_and_end_states is FALSE: 67 | `[num_states, num_states]` A rank-2 tensor representing allowed 68 | transitions. 69 | 70 | - allowed_transitions[i][j] is true if the transition from state i to 71 | state j is allowed for i and j in 0...(num_states). 72 | - allowed_transitions[num_states][num_states] is ignored. If 73 | use_start_and_end_states is TRUE: 74 | - allowed_transitions[num_states][j] is true if the sequence is allowed to 75 | start from state j. 76 | - allowed_transitions[i][num_states] is true if the sequence is allowed to 77 | end on state i.
Default - An empty tensor. This allows all sequence 78 | states to transition to all other sequence states. 79 | 80 | * `transition_weights`: if use_start_and_end_states is TRUE: 81 | `[num_states+1, num_states+1]` if use_start_and_end_states is 82 | FALSE: `[num_states, num_states]` A rank-2 tensor representing 83 | transition weights. 84 | 85 | - transition_weights[i][j] is the coefficient that a candidate transition 86 | score will be multiplied by if that transition is from state i to state 87 | j. 88 | - transition_weights[num_states][num_states] is ignored. If 89 | use_start_and_end_states is TRUE: 90 | - transition_weights[num_states][j] is the coefficient that will be used 91 | if the transition starts with state j. 92 | - transition_weights[i][num_states] is the coefficient that will be used 93 | if the final state in the sequence is state i. Default - An empty 94 | tensor. This assigns a weight of 1.0 to all transitions. 95 | 96 | * `use_log_space`: Whether to use log space for the calculation. If 97 | false, calculations will be done in exp-space. 98 | 99 | * `use_start_and_end_states`: If True, sequences will have an implicit 100 | start and end state added. 101 | 102 | * `name`: The name scope within which this op should be constructed. 103 | 104 | #### Returns: 105 | 106 | An [batch_size, (num_steps)] ragged tensor containing the appropriate 107 | sequence of transitions. If a sequence is impossible, the value of the 108 | RaggedTensor for that and all following transitions in that sequence shall be 109 | '-1'. 110 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/wordpiece_tokenizer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #include "tensorflow_text/core/kernels/wordpiece_tokenizer.h" 16 | 17 | #include "absl/strings/str_cat.h" 18 | #include "absl/strings/string_view.h" 19 | #include "icu4c/source/common/unicode/schriter.h" 20 | #include "icu4c/source/common/unicode/unistr.h" 21 | #include "icu4c/source/common/unicode/utf8.h" 22 | #include "tensorflow/core/framework/tensor_shape.h" 23 | #include "tensorflow/core/lib/core/errors.h" 24 | 25 | namespace tensorflow { 26 | namespace text { 27 | 28 | constexpr int64 kOutOfVocabValue = -1; 29 | 30 | LookupTableVocab::LookupTableVocab(lookup::LookupInterface* table, 31 | OpKernelContext* ctx) 32 | : table_(table), ctx_(ctx), default_value_(DT_INT64, TensorShape({1})) { 33 | default_value_.flat()(0) = kOutOfVocabValue; 34 | } 35 | 36 | Status LookupTableVocab::Contains(const string& key, bool* value) { 37 | if (value == nullptr) { 38 | return errors::InvalidArgument("Bad 'value' param."); 39 | } 40 | Tensor keys(DT_STRING, TensorShape({1})); 41 | keys.flat()(0) = key; 42 | Tensor values(DT_INT64, TensorShape({1})); 43 | TF_RETURN_IF_ERROR(table_->Find(ctx_, keys, &values, default_value_)); 44 | 45 | if (static_cast(values.flat()(0)) != kOutOfVocabValue) { 46 | *value = true; 47 | return Status::OK(); 48 | } 49 | *value = false; 50 | return Status::OK(); 51 | } 52 | 53 | Status WordpieceTokenize(const string& token, const int64 max_bytes_per_token, 54 | const string& suffix_indicator, bool use_unknown_token, 55 | const string& unknown_token, 56 | LookupTableVocab* vocab_map, 57 | std::vector* subwords, 58 | std::vector* begin_offset, 59 | std::vector* end_offset, int* num_word_pieces) { 60 | if (token.size() > max_bytes_per_token) { 61 | if (use_unknown_token) { 62 | subwords->emplace_back(unknown_token); 63 | end_offset->push_back(unknown_token.size()); 64 | } else { 65 | subwords->emplace_back(token); 66 | end_offset->push_back(token.size()); 67 | } 68 | begin_offset->push_back(0); 69 | *num_word_pieces = 1; 70 | return Status::OK(); 71 | } 72 | 73 | icu::UnicodeString token_unicode = icu::UnicodeString::fromUTF8(token); 74 | bool is_bad = false; 75 | int start = 0; 76 | int byte_offset_start = 0; 77 | std::vector sub_tokens; 78 | std::vector sub_tokens_begin_offset; 79 | std::vector sub_tokens_end_offset; 80 | while (start < token_unicode.length()) { 81 | string cur_substr; 82 | int end = token_unicode.length(); 83 | int num_subword_bytes = token.size() - byte_offset_start; 84 | icu::StringCharacterIterator backward_iter(token_unicode, start, end, 85 | start); 86 | backward_iter.last32(); 87 | 88 | while (num_subword_bytes > 0) { 89 | absl::string_view substr(token.data() + byte_offset_start, 90 | num_subword_bytes); 91 | string lookup_value; 92 | if (byte_offset_start > 0) { 93 | lookup_value = absl::StrCat(suffix_indicator, substr); 94 | } else { 95 | // absl::CopyToString 96 | lookup_value.assign(substr.begin(), substr.end()); 97 | } 98 | 99 | bool found_in_vocab; 100 | TF_RETURN_IF_ERROR(vocab_map->Contains(lookup_value, &found_in_vocab)); 101 | if (found_in_vocab) { 102 | cur_substr.swap(lookup_value); 103 | break; 104 | } 105 | --end; 106 | num_subword_bytes -= U8_LENGTH(backward_iter.current32()); 107 | backward_iter.previous32(); 108 | } 109 | if (cur_substr.empty()) { 110 | is_bad = true; 111 | break; 112 | } 113 | 114 | sub_tokens.emplace_back(cur_substr); 115 | sub_tokens_begin_offset.emplace_back(byte_offset_start); 116 | sub_tokens_end_offset.emplace_back(byte_offset_start + num_subword_bytes); 117 | start = end; 118 | byte_offset_start += 
num_subword_bytes; 119 | } 120 | if (is_bad) { 121 | if (use_unknown_token) { 122 | subwords->emplace_back(unknown_token); 123 | } else { 124 | subwords->emplace_back(token); 125 | } 126 | begin_offset->emplace_back(0); 127 | end_offset->emplace_back(token.size()); 128 | *num_word_pieces = 1; 129 | } else { 130 | subwords->insert(subwords->end(), sub_tokens.begin(), sub_tokens.end()); 131 | begin_offset->insert(begin_offset->end(), sub_tokens_begin_offset.begin(), 132 | sub_tokens_begin_offset.end()); 133 | end_offset->insert(end_offset->end(), sub_tokens_end_offset.begin(), 134 | sub_tokens_end_offset.end()); 135 | *num_word_pieces = sub_tokens.size(); 136 | } 137 | return Status::OK(); 138 | } 139 | 140 | } // namespace text 141 | } // namespace tensorflow 142 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/WhitespaceTokenizer.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | # text.WhitespaceTokenizer 16 | 17 | ## Class `WhitespaceTokenizer` 18 | 19 | Tokenizes a tensor of UTF-8 strings on whitespaces. 20 | 21 | Inherits From: [`TokenizerWithOffsets`](../text/TokenizerWithOffsets.md) 22 | 23 | Defined in 24 | [`python/ops/whitespace_tokenizer.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/whitespace_tokenizer.py). 25 | 26 | 27 | 28 |
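For orientation, a minimal usage sketch (assuming eager execution; the sample sentences are illustrative):

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
# Each input string becomes one row of tokens; the whitespace itself is dropped.
tokens = tokenizer.tokenize(['everything not saved will be lost.', 'Sad☹'])
# tokens ==> [['everything', 'not', 'saved', 'will', 'be', 'lost.'], ['Sad☹']]
```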

__init__

29 | 30 | ```python 31 | __init__(name=None) 32 | ``` 33 | 34 | ## Properties 35 | 36 |

name

37 | 38 | Returns the name of this module as passed or determined in the ctor. 39 | 40 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 41 | module names. 42 | 43 |

name_scope

44 | 45 | Returns a `tf.name_scope` instance for this class. 46 | 47 |

submodules

48 | 49 | Sequence of all sub-modules. 50 | 51 | Submodules are modules which are properties of this module, or found as 52 | properties of modules which are properties of this module (and so on). 53 | 54 | ``` 55 | a = tf.Module() 56 | b = tf.Module() 57 | c = tf.Module() 58 | a.b = b 59 | b.c = c 60 | assert list(a.submodules) == [b, c] 61 | assert list(b.submodules) == [c] 62 | assert list(c.submodules) == [] 63 | ``` 64 | 65 | #### Returns: 66 | 67 | A sequence of all submodules. 68 | 69 |

trainable_variables

70 | 71 | Sequence of variables owned by this module and its submodules. 72 | 73 | Note: this method uses reflection to find variables on the current instance and 74 | submodules. For performance reasons you may wish to cache the result of calling 75 | this method if you don't expect the return value to change. 76 | 77 | #### Returns: 78 | 79 | A sequence of variables for the current module (sorted by attribute name) 80 | followed by variables from all submodules recursively (breadth first). 81 | 82 |

variables

83 | 84 | Sequence of variables owned by this module and its submodules. 85 | 86 | Note: this method uses reflection to find variables on the current instance and 87 | submodules. For performance reasons you may wish to cache the result of calling 88 | this method if you don't expect the return value to change. 89 | 90 | #### Returns: 91 | 92 | A sequence of variables for the current module (sorted by attribute name) 93 | followed by variables from all submodules recursively (breadth first). 94 | 95 | ## Methods 96 | 97 |

tokenize

98 | 99 | ```python 100 | tokenize(input) 101 | ``` 102 | 103 | Tokenizes a tensor of UTF-8 strings on whitespaces. 104 | 105 | The strings are split when an ICU-defined whitespace character is encountered. These 106 | whitespace characters are dropped. 107 | 108 | #### Args: 109 | 110 | * `input`: A `RaggedTensor` or `Tensor` of UTF-8 strings with any 111 | shape. 112 | 113 | #### Returns: 114 | 115 | A RaggedTensor of tokenized text. The returned shape is the shape of the input 116 | tensor with an added ragged dimension for tokens of each string. 117 | 118 |
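A short sketch of the shape behaviour described above (input values are illustrative): a rank-2 string input produces a rank-3 ragged result, with the new innermost dimension holding the tokens of each string.

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
# Shape [2, 1] input -> ragged output of shape [2, 1, (num_tokens)].
tokens = tokenizer.tokenize([['a b c'], ['d e']])
# tokens ==> [[['a', 'b', 'c']], [['d', 'e']]]
```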

tokenize_with_offsets

119 | 120 | ```python 121 | tokenize_with_offsets(input) 122 | ``` 123 | 124 | Tokenizes a tensor of UTF-8 strings on whitespaces. 125 | 126 | The strings are split when an ICU-defined whitespace character is encountered. These 127 | whitespace characters are dropped. 128 | 129 | #### Args: 130 | 131 | * `input`: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape. 132 | 133 | #### Returns: 134 | 135 | A tuple of `RaggedTensor`s `tokens`, `start_offsets`, and `limit_offsets`, where: 136 | 137 | * `tokens`: A `RaggedTensor` of tokenized text. 138 | * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offset. 139 | * `limit_offsets`: A `RaggedTensor` of the tokens' ending byte offset. 140 | 141 |
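Because the offsets are byte positions into the original strings, they can be used to slice the tokens back out, as in this sketch (assuming eager execution; the sentence is illustrative):

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
sentence = 'never odd or even'
tokens, starts, limits = tokenizer.tokenize_with_offsets([sentence])
encoded = sentence.encode('utf-8')
# Each (start, limit) pair is a byte span into the original string.
spans = [encoded[s:l] for s, l in zip(starts[0].numpy(), limits[0].numpy())]
# spans ==> [b'never', b'odd', b'or', b'even'], matching tokens[0]
```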

with_name_scope

142 | 143 | ```python 144 | with_name_scope( 145 | cls, 146 | method 147 | ) 148 | ``` 149 | 150 | Decorator to automatically enter the module name scope. 151 | 152 | ``` 153 | class MyModule(tf.Module): 154 | @tf.Module.with_name_scope 155 | def __call__(self, x): 156 | if not hasattr(self, 'w'): 157 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 158 | return tf.matmul(x, self.w) 159 | ``` 160 | 161 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 162 | included the module name: 163 | 164 | ``` 165 | mod = MyModule() 166 | mod(tf.ones([8, 32])) 167 | # ==> 168 | mod.w 169 | # ==> 170 | ``` 171 | 172 | #### Args: 173 | 174 | * `method`: The method to wrap. 175 | 176 | #### Returns: 177 | 178 | The original method wrapped such that it enters the module's name scope. 179 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/text_kernels_test_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // GMock matchers for testing text kernels: 16 | // TensorHasShapeAndValues({dim1, ..., dimN}, {v1, v2, ..., vN}); 17 | // VectorEq({v1, v2, ..., vN}); 18 | // MatrixEq({{v1_1, ..., v1_M}, ..., {vN_1, ..., vN_M}}); 19 | // TensorHasShape({dim1, ..., dimN}); 20 | 21 | #ifndef TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ 22 | #define TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ 23 | 24 | #include 25 | #include "tensorflow/core/framework/tensor.h" 26 | #include "tensorflow/core/framework/tensor_shape.h" 27 | #include "tensorflow/core/framework/tensor_testutil.h" 28 | 29 | namespace tensorflow { 30 | namespace text_kernels_test_util { 31 | 32 | // GMock MatcherInterface for testing tensor equality. 33 | class TensorEqMatcher : public ::testing::MatcherInterface { 34 | public: 35 | explicit TensorEqMatcher(const Tensor& expect) : expect_(expect) {} 36 | bool MatchAndExplain(Tensor actual, 37 | ::testing::MatchResultListener* listener) const override; 38 | void DescribeTo(::std::ostream* gmock_os) const override; 39 | void DescribeNegationTo(::std::ostream* gmock_os) const override; 40 | 41 | private: 42 | Tensor expect_; 43 | }; 44 | 45 | // GMock MatcherInterface for testing tensor shapes. 46 | class TensorHasShapeMatcher : public ::testing::MatcherInterface { 47 | public: 48 | explicit TensorHasShapeMatcher(const TensorShape& expect) : expect_(expect) {} 49 | bool MatchAndExplain(Tensor actual, 50 | ::testing::MatchResultListener* listener) const override; 51 | void DescribeTo(::std::ostream* gmock_os) const override; 52 | void DescribeNegationTo(::std::ostream* gmock_os) const override; 53 | 54 | private: 55 | TensorShape expect_; 56 | }; 57 | 58 | // Returns a gmock matcher that checks whether a given tensor has the specified 59 | // dtype, values, and shape. 
dtype is specified using the template parameter. 60 | // values are specified as a flattened vector. 61 | // Example: 62 | // EXPECT_THAT(*GetOutput(0), 63 | // TensorHasShapeAndValues({3, 2}, {1, 2, 3, 4, 5, 6}); 64 | template 65 | ::testing::Matcher TensorHasShapeAndValues( 66 | const TensorShape& shape, const std::vector& values) { 67 | Tensor expect = test::AsTensor(values, shape); 68 | // MakeMatcher takes ownership of the TensorEqMatcher. 69 | return ::testing::MakeMatcher(new TensorEqMatcher(expect)); 70 | } 71 | 72 | // Returns a gmock matcher that checks whether a given tensor is a 1-D tensor 73 | // with the specified dtype and values. dtype is specified using the template 74 | // parameter. 75 | // Example: 76 | // EXPECT_THAT(*GetOutput(0), 77 | // VectorEq({1, 2, 3, 4, 5, 6}); 78 | template 79 | ::testing::Matcher VectorEq(const std::vector& values) { 80 | int64 nvals = values.size(); 81 | Tensor expect = test::AsTensor(values, {nvals}); 82 | // MakeMatcher takes ownership of the TensorEqMatcher. 83 | return ::testing::MakeMatcher(new TensorEqMatcher(expect)); 84 | } 85 | 86 | // Returns a gmock matcher that checks whether a given tensor is a 2-D tensor 87 | // with the specified dtype and values. dtype is specified using the template 88 | // parameter. values are specified as a nested vector. All rows of the values 89 | // vector must have the same length. The values vector may not be empty, 90 | // since we can't infer the number of columns for an empty matrix; to test 91 | // empty matrices, use the more general TensorHasShapeAndValues() instead. 92 | // Example: 93 | // EXPECT_THAT(*GetOutput(0), 94 | // MatrixEq({{1, 2, 3}, {4, 5, 6}}); 95 | template 96 | ::testing::Matcher MatrixEq( 97 | const std::vector>& values) { 98 | int64 nrows = values.size(); 99 | CHECK_GT(nrows, 0) // Crash OK 100 | << "Invalid use of MatrixEq: to test empty matrices, use " 101 | << "TensorHasShapeAndValues{{0, ndims}, {}} instead."; 102 | int64 ncols = values[0].size(); 103 | std::vector flat; 104 | for (const auto& row : values) { 105 | CHECK_EQ(ncols, row.size()) // Crash OK 106 | << "Invalid use of MatrixEq: all rows must have equal length"; 107 | flat.insert(flat.end(), row.begin(), row.end()); 108 | } 109 | Tensor expect = test::AsTensor(flat, TensorShape({nrows, ncols})); 110 | // MakeMatcher takes ownership of the TensorEqMatcher. 111 | return ::testing::MakeMatcher(new TensorEqMatcher(expect)); 112 | } 113 | 114 | // Returns a gmock matcher that checks whether a given tensor has a specified 115 | // shape. 116 | // Example: 117 | // EXPECT_THAT(*GetOutput(0), TensorHasShape({2, 8}); 118 | ::testing::Matcher TensorHasShape(const TensorShape& shape); 119 | 120 | } // namespace text_kernels_test_util 121 | } // namespace tensorflow 122 | 123 | #endif // TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ 124 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/span_alignment.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.span_alignment 7 | 8 | Return an alignment from a set of source spans to a set of target spans. 9 | 10 | ``` python 11 | text.span_alignment( 12 | source_start, 13 | source_limit, 14 | target_start, 15 | target_limit, 16 | contains=False, 17 | contained_by=False, 18 | partial_overlap=False, 19 | multivalent_result=False, 20 | name=None 21 | ) 22 | ``` 23 | 24 | Defined in 25 | [`python/ops/pointer_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pointer_ops.py). 26 | 27 | 28 | 29 | The source and target spans are specified using B+1 dimensional tensors, 30 | with `B>=0` batch dimensions followed by a final dimension that lists the 31 | span offsets for each span in the batch: 32 | 33 | * The `i`th source span in batch `b1...bB` starts at 34 | `source_start[b1...bB, i]` (inclusive), and extends to just before 35 | `source_limit[b1...bB, i]` (exclusive). 36 | * The `j`th target span in batch `b1...bB` starts at 37 | `target_start[b1...bB, j]` (inclusive), and extends to just before 38 | `target_limit[b1...bB, j]` (exclusive). 39 | 40 | `result[b1...bB, i]` contains the index (or indices) of the target span that 41 | overlaps with the `i`th source span in batch `b1...bB`. The 42 | `multivalent_result` parameter indicates whether the result should contain 43 | a single span that aligns with the source span, or all spans that align with 44 | the source span. 45 | 46 | * If `multivalent_result` is false (the default), then `result[b1...bB, i]=j` 47 | indicates that the `j`th target span overlaps with the `i`th source span 48 | in batch `b1...bB`. If no target spans overlap with the `i`th target span, 49 | then `result[b1...bB, i]=-1`. 50 | 51 | * If `multivalent_result` is true, then `result[b1...bB, i, n]=j` indicates 52 | that the `j`th target span is the `n`th span that overlaps with the `i`th 53 | source span in in batch `b1...bB`. 54 | 55 | For a definition of span overlap, see the docstring for `span_overlaps()`. 56 | 57 | #### Args: 58 | 59 | * `source_start`: A B+1 dimensional potentially ragged tensor with 60 | shape `[D1...DB, source_size]`: the start offset of each source span. 61 | * `source_limit`: A B+1 dimensional potentially ragged tensor with 62 | shape `[D1...DB, source_size]`: the limit offset of each source span. 63 | * `target_start`: A B+1 dimensional potentially ragged tensor with 64 | shape `[D1...DB, target_size]`: the start offset of each target span. 65 | * `target_limit`: A B+1 dimensional potentially ragged tensor with 66 | shape `[D1...DB, target_size]`: the limit offset of each target span. 67 | * `contains`: If true, then a source span is considered to overlap a 68 | target span when the source span contains the target span. 69 | * `contained_by`: If true, then a source span is considered to overlap 70 | a target span when the source span is contained by the target span. 71 | * `partial_overlap`: If true, then a source span is considered to 72 | overlap a target span when the source span partially overlaps the target 73 | span. 74 | * `multivalent_result`: Whether the result should contain a single 75 | target span index (if `multivalent_result=False`) or a list of target span 76 | indices (if `multivalent_result=True`) for each source span. 77 | * `name`: A name for the operation (optional). 78 | 79 | #### Returns: 80 | 81 | An int64 tensor with values in the range: `-1 <= result < target_size`. 
If 82 | `multivalent_result=False`, then the returned tensor has shape `[source_size]`, 83 | where `source_size` is the length of the `source_start` and `source_limit` input 84 | tensors. If `multivalent_result=True`, then the returned tensor has shape 85 | `[source_size, (num_aligned_target_spans)]. 86 | 87 | #### Examples: 88 | 89 | Given the following source and target spans (with no batch dimensions): 90 | 91 | ```python 92 | >>> # 0 5 10 15 20 25 30 35 40 45 50 55 60 93 | >>> # |====|====|====|====|====|====|====|====|====|====|====|====| 94 | >>> # Source: [-0-] [-1-] [2] [3] [4][-5-][-6-][-7-][-8-][-9-] 95 | >>> # Target: [-0-][-1-] [-2-][-3-][-4-] [5] [6] [7] [-8-][-9-][10] 96 | >>> # |====|====|====|====|====|====|====|====|====|====|====|====| 97 | >>> source_start=[0, 10, 16, 20, 27, 30, 35, 40, 45, 50] 98 | >>> source_limit=[5, 15, 19, 23, 30, 35, 40, 45, 50, 55] 99 | >>> target_start=[0, 5, 15, 20, 25, 31, 35, 42, 47, 52, 57] 100 | >>> target_limit=[5, 10, 20, 25, 30, 34, 38, 45, 52, 57, 61] 101 | 102 | >>> span_alignment_lists(source_starts, source_limits, 103 | target_starts, target_limits) 104 | [0, -1, -1, -1, -1, -1, -1, -1, -1, -1] 105 | >>> span_alignment_lists(source_starts, source_limits, 106 | ... target_starts, target_limits, 107 | ... multivalent_result=True) 108 | [[0], [], [], [], [], [], [], [], [], []] 109 | 110 | >>> span_alignment_lists(source_starts, source_limits, 111 | ... target_starts, target_limits, 112 | ... contains=True) 113 | [ 0, -1, -1, -1, -1, 5, 6, 7, -1, -1] 114 | 115 | >>> span_alignment_lists(source_starts, source_limits, 116 | ... target_starts, target_limits, 117 | ... partial_overlap=True, 118 | ... multivalent_result=True) 119 | [[0], [], [2], [3], [4], [5], [6], [7], [8], [8, 9]] -------------------------------------------------------------------------------- /tensorflow_text/python/ops/sentence_breaking_ops.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Break sentence ops.""" 17 | 18 | from tensorflow.python.ops.ragged import ragged_tensor 19 | 20 | from tensorflow.python.framework import load_library 21 | from tensorflow.python.platform import resource_loader 22 | gen_sentence_breaking_ops = load_library.load_op_library(resource_loader.get_path_to_datafile('_sentence_breaking_ops.so')) 23 | 24 | 25 | def sentence_fragments(token_word, 26 | token_starts, 27 | token_ends, 28 | token_properties, 29 | input_encoding='UTF-8', 30 | errors='replace', 31 | replacement_char=0xFFFD, 32 | replace_control_characters=False): 33 | """Find the sentence fragments in a given text. 34 | 35 | A sentence fragment is a potential next sentence determined using 36 | deterministic heuristics based on punctuation, capitalization, and similar 37 | text attributes. 
38 | 39 | Args: 40 | token_word: A Tensor (w/ rank=2) or a RaggedTensor (w/ ragged_rank=1) 41 | containing the token strings. 42 | token_starts: A Tensor (w/ rank=2) or a RaggedTensor (w/ ragged_rank=1) 43 | containing offsets where the token starts. 44 | token_ends: A Tensor (w/ rank=2) or a RaggedTensor (w/ ragged_rank=1) 45 | containing offsets where the token ends. 46 | token_properties: A Tensor (w/ rank=2) or a RaggedTensor (w/ ragged_rank=1) 47 | containing a bitmask. 48 | 49 | The values of the bitmask are: 50 | 0x01 (ILL_FORMED) - Text is ill-formed according to TextExtractor; 51 | typically applies to all tokens of a paragraph that is too short or 52 | lacks terminal punctuation. 0x40 (TITLE) 53 | 0x02 (HEADING) 54 | 0x04 (BOLD) 55 | 0x10 (UNDERLINED) 56 | 0x20 (LIST) 57 | 0x80 (EMOTICON) 58 | 0x100 (ACRONYM) - Token was identified by Lexer as an acronym. Lexer 59 | identifies period-, hyphen-, and space-separated acronyms: "U.S.", 60 | "U-S", and "U S". Lexer normalizes all three to "US", but the token 61 | word field normalizes only space-separated acronyms. 62 | 0x200 (HYPERLINK) - Indicates that the token (or part of the token) is a 63 | covered by at least one hyperlink. More information of the hyperlink 64 | is stored in the first token covered by the hyperlink. 65 | input_encoding: String name for the unicode encoding that should be used to 66 | decode each string. 67 | errors: Specifies the response when an input string can't be converted 68 | using the indicated encoding. One of: 69 | * `'strict'`: Raise an exception for any illegal substrings. 70 | * `'replace'`: Replace illegal substrings with `replacement_char`. 71 | * `'ignore'`: Skip illegal substrings. 72 | replacement_char: The replacement codepoint to be used in place of invalid 73 | substrings in `input` when `errors='replace'`; and in place of C0 control 74 | characters in `input` when `replace_control_characters=True`. 75 | replace_control_characters: Whether to replace the C0 control characters 76 | `(U+0000 - U+001F)` with the `replacement_char`. 77 | Returns: 78 | A RaggedTensor of `fragment_start`, `fragment_end`, `fragment_properties` 79 | and `terminal_punc_token`. 80 | 81 | `fragment_properties` is an int32 bitmask whose values may contain: 82 | 1 = fragment ends with terminal punctuation 83 | 2 = fragment ends with multiple terminal punctuations (e.g. 84 | "She said what?!") 85 | 3 = Has close parenthesis (e.g. "Mushrooms (they're fungi).") 86 | 4 = Has sentential close parenthesis (e.g. "(Mushrooms are fungi!)" 87 | 88 | `terminal_punc_token` is a RaggedTensor containing the index of terminal 89 | punctuation token immediately following the last word in the fragment 90 | -- or index of the last word itself, if it's an acronym (since acronyms 91 | include the terminal punctuation). index of the terminal punctuation 92 | token. 
93 | """ 94 | if not isinstance(token_starts, ragged_tensor.RaggedTensor): 95 | token_starts = ragged_tensor.RaggedTensor.from_tensor(token_starts) 96 | if not isinstance(token_ends, ragged_tensor.RaggedTensor): 97 | token_ends = ragged_tensor.RaggedTensor.from_tensor(token_ends) 98 | if not isinstance(token_word, ragged_tensor.RaggedTensor): 99 | token_word = ragged_tensor.RaggedTensor.from_tensor(token_word) 100 | if not isinstance(token_properties, ragged_tensor.RaggedTensor): 101 | token_properties = ragged_tensor.RaggedTensor.from_tensor(token_properties) 102 | 103 | fragment = gen_sentence_breaking_ops.sentence_fragments( 104 | errors=errors, 105 | replacement_char=replacement_char, 106 | replace_control_characters=replace_control_characters, 107 | input_encoding=input_encoding, 108 | row_lengths=token_starts.row_lengths(), 109 | token_start=token_starts.flat_values, 110 | token_end=token_ends.flat_values, 111 | token_word=token_word.flat_values, 112 | token_properties=token_properties.flat_values) 113 | start, end, properties, terminal_punc_token, row_lengths = fragment 114 | return tuple( 115 | ragged_tensor.RaggedTensor.from_row_lengths(value, row_lengths) 116 | for value in [start, end, properties, terminal_punc_token]) 117 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/UnicodeScriptTokenizer.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | # text.UnicodeScriptTokenizer 16 | 17 | ## Class `UnicodeScriptTokenizer` 18 | 19 | Tokenizes a tensor of UTF-8 strings on Unicode script boundaries. 20 | 21 | Inherits From: [`TokenizerWithOffsets`](../text/TokenizerWithOffsets.md) 22 | 23 | Defined in 24 | [`python/ops/unicode_script_tokenizer.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/unicode_script_tokenizer.py). 25 | 26 | 27 | 28 |
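A minimal usage sketch (the mixed-script input is illustrative): tokens are split wherever the Unicode script changes, and whitespace is dropped.

```python
import tensorflow_text as text

tokenizer = text.UnicodeScriptTokenizer()
# Latin text, Common-script punctuation, and Han characters are separated.
tokens = tokenizer.tokenize(['Hello, 世界!'])
# tokens ==> roughly [['Hello', ',', '世界', '!']]
```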

__init__

29 | 30 | ```python 31 | __init__(name=None) 32 | ``` 33 | 34 | ## Properties 35 | 36 |

name

37 | 38 | Returns the name of this module as passed or determined in the ctor. 39 | 40 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 41 | module names. 42 | 43 |

name_scope

44 | 45 | Returns a `tf.name_scope` instance for this class. 46 | 47 |

submodules

48 | 49 | Sequence of all sub-modules. 50 | 51 | Submodules are modules which are properties of this module, or found as 52 | properties of modules which are properties of this module (and so on). 53 | 54 | ``` 55 | a = tf.Module() 56 | b = tf.Module() 57 | c = tf.Module() 58 | a.b = b 59 | b.c = c 60 | assert list(a.submodules) == [b, c] 61 | assert list(b.submodules) == [c] 62 | assert list(c.submodules) == [] 63 | ``` 64 | 65 | #### Returns: 66 | 67 | A sequence of all submodules. 68 | 69 |

trainable_variables

70 | 71 | Sequence of variables owned by this module and its submodules. 72 | 73 | Note: this method uses reflection to find variables on the current instance and 74 | submodules. For performance reasons you may wish to cache the result of calling 75 | this method if you don't expect the return value to change. 76 | 77 | #### Returns: 78 | 79 | A sequence of variables for the current module (sorted by attribute name) 80 | followed by variables from all submodules recursively (breadth first). 81 | 82 |

variables

83 | 84 | Sequence of variables owned by this module and its submodules. 85 | 86 | Note: this method uses reflection to find variables on the current instance and 87 | submodules. For performance reasons you may wish to cache the result of calling 88 | this method if you don't expect the return value to change. 89 | 90 | #### Returns: 91 | 92 | A sequence of variables for the current module (sorted by attribute name) 93 | followed by variables from all submodules recursively (breadth first). 94 | 95 | ## Methods 96 | 97 |

tokenize

98 | 99 | ```python 100 | tokenize(input) 101 | ``` 102 | 103 | Tokenizes a tensor of UTF-8 strings on Unicode script boundaries. 104 | 105 | The strings are split when a change in the Unicode script is detected between 106 | sequential tokens. The script codes used correspond to International Components 107 | for Unicode (ICU) UScriptCode values. See: 108 | http://icu-project.org/apiref/icu4c/uscript_8h.html 109 | 110 | ICU-defined whitespace characters are dropped. 111 | 112 | #### Args: 113 | 114 | * `input`: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape. 115 | 116 | #### Returns: 117 | 118 | A RaggedTensor of tokenized text. The returned shape is the shape of the input 119 | tensor with an added ragged dimension for tokens of each string. 120 | 121 |

tokenize_with_offsets

122 | 123 | ```python 124 | tokenize_with_offsets(input) 125 | ``` 126 | 127 | Tokenizes a tensor of UTF-8 strings on Unicode script boundaries. 128 | 129 | The strings are split when a change in the Unicode script is detected between 130 | sequential tokens. The script codes used correspond to International Components 131 | for Unicode (ICU) UScriptCode values. See: 132 | http://icu-project.org/apiref/icu4c/uscript_8h.html 133 | 134 | ICU-defined whitespace characters are dropped. 135 | 136 | #### Args: 137 | 138 | * `input`: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape. 139 | 140 | #### Returns: 141 | 142 | A tuple of `RaggedTensor`s `tokens`, `start_offsets`, and `limit_offsets`, where: 143 | 144 | * `tokens`: A `RaggedTensor` of tokenized text. 145 | * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offset. 146 | * `limit_offsets`: A `RaggedTensor` of the tokens' ending byte offset. 147 | 148 |
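As with the other tokenizers, the returned offsets are byte (not character) positions, which matters for multi-byte UTF-8 characters; a short sketch with an assumed input:

```python
import tensorflow_text as text

tokenizer = text.UnicodeScriptTokenizer()
tokens, starts, limits = tokenizer.tokenize_with_offsets(['I ♥ tokens'])
# '♥' occupies 3 bytes in UTF-8, so the offsets below count bytes.
# tokens ==> roughly [['I', '♥', 'tokens']]
# starts ==> [[0, 2, 6]]
# limits ==> [[1, 5, 12]]
```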

with_name_scope

149 | 150 | ```python 151 | with_name_scope( 152 | cls, 153 | method 154 | ) 155 | ``` 156 | 157 | Decorator to automatically enter the module name scope. 158 | 159 | ``` 160 | class MyModule(tf.Module): 161 | @tf.Module.with_name_scope 162 | def __call__(self, x): 163 | if not hasattr(self, 'w'): 164 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 165 | return tf.matmul(x, self.w) 166 | ``` 167 | 168 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 169 | included the module name: 170 | 171 | ``` 172 | mod = MyModule() 173 | mod(tf.ones([8, 32])) 174 | # ==> 175 | mod.w 176 | # ==> 177 | ``` 178 | 179 | #### Args: 180 | 181 | * `method`: The method to wrap. 182 | 183 | #### Returns: 184 | 185 | The original method wrapped such that it enters the module's name scope. 186 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/normalize_kernels.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "absl/strings/ascii.h" 19 | #include "absl/strings/str_cat.h" 20 | #include "icu4c/source/common/unicode/errorcode.h" 21 | #include "icu4c/source/common/unicode/normalizer2.h" 22 | #include "icu4c/source/common/unicode/utypes.h" 23 | #include "tensorflow/core/framework/op_kernel.h" 24 | 25 | namespace tensorflow { 26 | namespace text { 27 | 28 | class CaseFoldUTF8Op : public tensorflow::OpKernel { 29 | public: 30 | explicit CaseFoldUTF8Op(tensorflow::OpKernelConstruction* context) 31 | : tensorflow::OpKernel(context) {} 32 | 33 | void Compute(tensorflow::OpKernelContext* context) override { 34 | const tensorflow::Tensor* input_tensor; 35 | OP_REQUIRES_OK(context, context->input("input", &input_tensor)); 36 | const auto& input_vec = input_tensor->flat(); 37 | 38 | // TODO(gregbillock): support forwarding 39 | tensorflow::Tensor* output_tensor; 40 | OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), 41 | &output_tensor)); 42 | auto output_vec = output_tensor->flat(); 43 | 44 | icu::ErrorCode icu_error; 45 | const icu::Normalizer2* nfkc_cf = icu::Normalizer2::getNFKCCasefoldInstance( 46 | icu_error); 47 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 48 | absl::StrCat(icu_error.errorName(), 49 | ": Could not retrieve ICU NFKC_CaseFold normalizer"))); 50 | 51 | for (int64 i = 0; i < input_vec.size(); ++i) { 52 | string output_text; 53 | icu::StringByteSink byte_sink(&output_text); 54 | nfkc_cf->normalizeUTF8(0, input_vec(i), byte_sink, nullptr, icu_error); 55 | OP_REQUIRES(context, !U_FAILURE(icu_error), errors::Internal( 56 | "Could not normalize input string: " + input_vec(i))); 57 | output_vec(i) = output_text; 58 | } 59 | } 60 | }; 61 | 62 | REGISTER_KERNEL_BUILDER(Name("CaseFoldUTF8").Device(tensorflow::DEVICE_CPU), 63 | CaseFoldUTF8Op); 64 | 
65 | namespace { 66 | 67 | string GetNormalizationForm(OpKernelConstruction* context) { 68 | string normalization_form; 69 | ([=](string* c) -> void { 70 | OP_REQUIRES_OK(context, context->GetAttr("normalization_form", c)); 71 | })(&normalization_form); 72 | return absl::AsciiStrToUpper(normalization_form); 73 | } 74 | 75 | } // namespace 76 | 77 | class NormalizeUTF8Op : public tensorflow::OpKernel { 78 | public: 79 | explicit NormalizeUTF8Op(tensorflow::OpKernelConstruction* context) 80 | : tensorflow::OpKernel(context), 81 | normalization_form_(GetNormalizationForm(context)) {} 82 | 83 | void Compute(tensorflow::OpKernelContext* context) override { 84 | const tensorflow::Tensor* input_tensor; 85 | OP_REQUIRES_OK(context, context->input("input", &input_tensor)); 86 | const auto& input_vec = input_tensor->flat(); 87 | 88 | tensorflow::Tensor* output_tensor; 89 | OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), 90 | &output_tensor)); 91 | auto output_vec = output_tensor->flat(); 92 | 93 | icu::ErrorCode icu_error; 94 | const icu::Normalizer2* normalizer = nullptr; 95 | if (normalization_form_ == "NFKC") { 96 | normalizer = icu::Normalizer2::getNFKCInstance(icu_error); 97 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 98 | absl::StrCat(icu_error.errorName(), 99 | ": Could not retrieve ICU NFKC normalizer"))); 100 | } else if (normalization_form_ == "NFC") { 101 | normalizer = icu::Normalizer2::getNFCInstance(icu_error); 102 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 103 | absl::StrCat(icu_error.errorName(), 104 | ": Could not retrieve ICU NFC normalizer"))); 105 | } else if (normalization_form_ == "NFD") { 106 | normalizer = icu::Normalizer2::getNFDInstance(icu_error); 107 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 108 | absl::StrCat(icu_error.errorName(), 109 | ": Could not retrieve ICU NFD normalizer"))); 110 | } else if (normalization_form_ == "NFKD") { 111 | normalizer = icu::Normalizer2::getNFKDInstance(icu_error); 112 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 113 | absl::StrCat(icu_error.errorName(), 114 | ": Could not retrieve ICU NFKd normalizer"))); 115 | } else { 116 | OP_REQUIRES( 117 | context, false, 118 | errors::InvalidArgument(absl::StrCat( 119 | "Unknown normalization form requrested: ", normalization_form_))); 120 | } 121 | 122 | for (int64 i = 0; i < input_vec.size(); ++i) { 123 | string output_text; 124 | icu::StringByteSink byte_sink(&output_text); 125 | normalizer->normalizeUTF8(0, input_vec(i), byte_sink, nullptr, icu_error); 126 | OP_REQUIRES(context, !U_FAILURE(icu_error), errors::Internal( 127 | absl::StrCat(icu_error.errorName(), 128 | ": Could not normalize input string: ", input_vec(i)))); 129 | output_vec(i) = output_text; 130 | } 131 | } 132 | 133 | private: 134 | string normalization_form_; 135 | }; 136 | 137 | REGISTER_KERNEL_BUILDER(Name("NormalizeUTF8").Device(tensorflow::DEVICE_CPU), 138 | NormalizeUTF8Op); 139 | 140 | } // namespace text 141 | } // namespace tensorflow 142 | -------------------------------------------------------------------------------- /tensorflow_text/python/numpy/viterbi_decode.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Helper functions for decoding Viterbi sequences outside of Tensorflow. 17 | 18 | viterbi_decode provides known-tested snippets for Viterbi decoding in log and 19 | standard space for use outside of a Tensorflow graph. 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import numpy as np 27 | 28 | 29 | def decode(score, 30 | transition_params=None, 31 | allowed_transitions=None, 32 | use_log_space=True, 33 | use_start_and_end_states=False): 34 | """Decode the highest scoring sequence of tags. 35 | 36 | This function uses numpy instead of Tensorflow ops, and so cannot be used 37 | inside a Tensorflow graph or function. 38 | 39 | Args: 40 | score: A [seq_len, num_tags] matrix of unary potentials. 41 | transition_params: A [num_tags, num_tags] matrix of binary potentials. 42 | allowed_transitions: A [num_tags, num_tags] matrix where FALSE indicates 43 | a transition that cannot be taken. 44 | use_log_space: Whether to perform the Viterbi calculation in logarithmic 45 | space. 46 | use_start_and_end_states: If True, add an implicit 'start' and 'end' state 47 | to the start and end of the given sequence. If this is True, 48 | transition_params should contain an extra row and column, representing 49 | potentials for starting/ending a sequence with a given state. These values 50 | should occupy the outermost row and column of the transition_params 51 | matrix. 52 | 53 | Returns: 54 | viterbi: A [seq_len] list of integers containing the highest scoring tag 55 | indices. 56 | viterbi_score: A float containing the score for the Viterbi sequence. 57 | """ 58 | if transition_params is None: 59 | num_tags = score.shape[-1] 60 | if use_log_space: 61 | transition_params = np.zeros((num_tags, num_tags)) 62 | else: 63 | transition_params = np.ones((num_tags, num_tags)) 64 | 65 | if allowed_transitions is not None: 66 | if use_log_space: 67 | transition_mask = np.where(allowed_transitions, 1, -float("inf")) 68 | else: 69 | transition_mask = np.where(allowed_transitions, 1, 0.0) 70 | 71 | transition_params = transition_params * transition_mask 72 | 73 | if use_log_space: 74 | return _decode_in_log_space(score, transition_params, 75 | use_start_and_end_states) 76 | else: 77 | return _decode_in_exp_space(score, transition_params, 78 | use_start_and_end_states) 79 | 80 | 81 | def _decode_in_log_space(score, transition_params, use_start_and_end_states): 82 | """Perform Viterbi decoding in log space.""" 83 | trellis = np.zeros_like(score) 84 | backpointers = np.zeros_like(score, dtype=np.int32) 85 | 86 | if use_start_and_end_states: 87 | start_potentials = transition_params[-1, :-1] 88 | end_potentials = transition_params[:-1, -1] 89 | transition_potentials = transition_params[:-1, :-1] 90 | else: 91 | transition_potentials = transition_params 92 | 93 | # Calculate the start value. 94 | if use_start_and_end_states: 95 | trellis[0] = score[0] + start_potentials 96 | else: 97 | trellis[0] = score[0] 98 | 99 | # Calculate intermediate values.
100 | for t in range(1, score.shape[0]): 101 | v = np.expand_dims(trellis[t - 1], 1) + transition_potentials 102 | trellis[t] = score[t] + np.max(v, 0) 103 | backpointers[t] = np.argmax(v, 0) 104 | 105 | # If we are using explicit start and end states, change the final scores 106 | # based on the final state's potentials. 107 | if use_start_and_end_states: 108 | final_scores = trellis[-1] + end_potentials 109 | else: 110 | final_scores = trellis[-1] 111 | 112 | viterbi = [np.argmax(final_scores)] 113 | for bp in reversed(backpointers[1:]): 114 | viterbi.append(bp[viterbi[-1]]) 115 | viterbi.reverse() 116 | 117 | viterbi_score = np.max(final_scores) 118 | 119 | return viterbi, viterbi_score 120 | 121 | 122 | def _decode_in_exp_space(score, transition_params, use_start_and_end_states): 123 | """Perform Viterbi decoding in exp space.""" 124 | if np.any(transition_params < 0): 125 | raise ValueError("Transition params must be non-negative in exp space.") 126 | trellis = np.zeros_like(score) 127 | backpointers = np.zeros_like(score, dtype=np.int32) 128 | max_scores = np.zeros(score.shape[0]) 129 | 130 | if use_start_and_end_states: 131 | start_potentials = transition_params[-1, :-1] 132 | end_potentials = transition_params[:-1, -1] 133 | transition_potentials = transition_params[:-1, :-1] 134 | else: 135 | transition_potentials = transition_params 136 | 137 | # Calculate the start value. 138 | if use_start_and_end_states: 139 | trellis[0] = score[0] * start_potentials 140 | else: 141 | trellis[0] = score[0] 142 | 143 | max_scores[0] = np.max(trellis[0]) 144 | trellis[0] = trellis[0] / max_scores[0] 145 | 146 | # Calculate intermediate values. 147 | for t in range(1, score.shape[0]): 148 | v = np.expand_dims(trellis[t - 1], 1) * transition_potentials 149 | trellis[t] = score[t] * np.max(v, 0) 150 | backpointers[t] = np.argmax(v, 0) 151 | max_scores[t] = np.max(trellis[t]) 152 | trellis[t] = trellis[t] / max_scores[t] 153 | 154 | # If we are using explicit start and end states, change the final scores 155 | # based on the final state's potentials. 
156 | if use_start_and_end_states: 157 | final_scores = trellis[-1] * end_potentials 158 | else: 159 | final_scores = trellis[-1] 160 | 161 | viterbi = [np.argmax(final_scores)] 162 | for bp in reversed(backpointers[1:]): 163 | viterbi.append(bp[viterbi[-1]]) 164 | viterbi.reverse() 165 | 166 | viterbi_score = np.max(final_scores) * np.prod(max_scores) 167 | return viterbi, viterbi_score 168 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/_api_cache.json: -------------------------------------------------------------------------------- 1 | { 2 | "duplicate_of": { 3 | "text.TokenizerWithOffsets.name": "text.Tokenizer.name", 4 | "text.TokenizerWithOffsets.name_scope": "text.Tokenizer.name_scope", 5 | "text.TokenizerWithOffsets.submodules": "text.Tokenizer.submodules", 6 | "text.TokenizerWithOffsets.trainable_variables": "text.Tokenizer.trainable_variables", 7 | "text.TokenizerWithOffsets.variables": "text.Tokenizer.variables", 8 | "text.UnicodeScriptTokenizer.name": "text.Tokenizer.name", 9 | "text.UnicodeScriptTokenizer.name_scope": "text.Tokenizer.name_scope", 10 | "text.UnicodeScriptTokenizer.submodules": "text.Tokenizer.submodules", 11 | "text.UnicodeScriptTokenizer.trainable_variables": "text.Tokenizer.trainable_variables", 12 | "text.UnicodeScriptTokenizer.variables": "text.Tokenizer.variables", 13 | "text.WhitespaceTokenizer.name": "text.Tokenizer.name", 14 | "text.WhitespaceTokenizer.name_scope": "text.Tokenizer.name_scope", 15 | "text.WhitespaceTokenizer.submodules": "text.Tokenizer.submodules", 16 | "text.WhitespaceTokenizer.trainable_variables": "text.Tokenizer.trainable_variables", 17 | "text.WhitespaceTokenizer.variables": "text.Tokenizer.variables", 18 | "text.WordpieceTokenizer.name": "text.Tokenizer.name", 19 | "text.WordpieceTokenizer.name_scope": "text.Tokenizer.name_scope", 20 | "text.WordpieceTokenizer.submodules": "text.Tokenizer.submodules", 21 | "text.WordpieceTokenizer.trainable_variables": "text.Tokenizer.trainable_variables", 22 | "text.WordpieceTokenizer.variables": "text.Tokenizer.variables" 23 | }, 24 | "is_fragment": { 25 | "text": false, 26 | "text.Reduction": false, 27 | "text.Reduction.MEAN": true, 28 | "text.Reduction.STRING_JOIN": true, 29 | "text.Reduction.SUM": true, 30 | "text.Tokenizer": false, 31 | "text.Tokenizer.__init__": true, 32 | "text.Tokenizer.name": true, 33 | "text.Tokenizer.name_scope": true, 34 | "text.Tokenizer.submodules": true, 35 | "text.Tokenizer.tokenize": true, 36 | "text.Tokenizer.trainable_variables": true, 37 | "text.Tokenizer.variables": true, 38 | "text.Tokenizer.with_name_scope": true, 39 | "text.TokenizerWithOffsets": false, 40 | "text.TokenizerWithOffsets.__init__": true, 41 | "text.TokenizerWithOffsets.name": true, 42 | "text.TokenizerWithOffsets.name_scope": true, 43 | "text.TokenizerWithOffsets.submodules": true, 44 | "text.TokenizerWithOffsets.tokenize": true, 45 | "text.TokenizerWithOffsets.tokenize_with_offsets": true, 46 | "text.TokenizerWithOffsets.trainable_variables": true, 47 | "text.TokenizerWithOffsets.variables": true, 48 | "text.TokenizerWithOffsets.with_name_scope": true, 49 | "text.UnicodeScriptTokenizer": false, 50 | "text.UnicodeScriptTokenizer.__init__": true, 51 | "text.UnicodeScriptTokenizer.name": true, 52 | "text.UnicodeScriptTokenizer.name_scope": true, 53 | "text.UnicodeScriptTokenizer.submodules": true, 54 | "text.UnicodeScriptTokenizer.tokenize": true, 55 | "text.UnicodeScriptTokenizer.tokenize_with_offsets": true, 56 | 
"text.UnicodeScriptTokenizer.trainable_variables": true, 57 | "text.UnicodeScriptTokenizer.variables": true, 58 | "text.UnicodeScriptTokenizer.with_name_scope": true, 59 | "text.WhitespaceTokenizer": false, 60 | "text.WhitespaceTokenizer.__init__": true, 61 | "text.WhitespaceTokenizer.name": true, 62 | "text.WhitespaceTokenizer.name_scope": true, 63 | "text.WhitespaceTokenizer.submodules": true, 64 | "text.WhitespaceTokenizer.tokenize": true, 65 | "text.WhitespaceTokenizer.tokenize_with_offsets": true, 66 | "text.WhitespaceTokenizer.trainable_variables": true, 67 | "text.WhitespaceTokenizer.variables": true, 68 | "text.WhitespaceTokenizer.with_name_scope": true, 69 | "text.WordShape": false, 70 | "text.WordShape.BEGINS_WITH_OPEN_QUOTE": true, 71 | "text.WordShape.BEGINS_WITH_PUNCT_OR_SYMBOL": true, 72 | "text.WordShape.ENDS_WITH_CLOSE_QUOTE": true, 73 | "text.WordShape.ENDS_WITH_ELLIPSIS": true, 74 | "text.WordShape.ENDS_WITH_EMOTICON": true, 75 | "text.WordShape.ENDS_WITH_MULTIPLE_SENTENCE_TERMINAL": true, 76 | "text.WordShape.ENDS_WITH_MULTIPLE_TERMINAL_PUNCT": true, 77 | "text.WordShape.ENDS_WITH_PUNCT_OR_SYMBOL": true, 78 | "text.WordShape.ENDS_WITH_SENTENCE_TERMINAL": true, 79 | "text.WordShape.ENDS_WITH_TERMINAL_PUNCT": true, 80 | "text.WordShape.HAS_CURRENCY_SYMBOL": true, 81 | "text.WordShape.HAS_EMOJI": true, 82 | "text.WordShape.HAS_MATH_SYMBOL": true, 83 | "text.WordShape.HAS_MIXED_CASE": true, 84 | "text.WordShape.HAS_NON_LETTER": true, 85 | "text.WordShape.HAS_NO_DIGITS": true, 86 | "text.WordShape.HAS_NO_PUNCT_OR_SYMBOL": true, 87 | "text.WordShape.HAS_NO_QUOTES": true, 88 | "text.WordShape.HAS_ONLY_DIGITS": true, 89 | "text.WordShape.HAS_PUNCTUATION_DASH": true, 90 | "text.WordShape.HAS_QUOTE": true, 91 | "text.WordShape.HAS_SOME_DIGITS": true, 92 | "text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL": true, 93 | "text.WordShape.HAS_TITLE_CASE": true, 94 | "text.WordShape.IS_ACRONYM_WITH_PERIODS": true, 95 | "text.WordShape.IS_EMOTICON": true, 96 | "text.WordShape.IS_LOWERCASE": true, 97 | "text.WordShape.IS_MIXED_CASE_LETTERS": true, 98 | "text.WordShape.IS_NUMERIC_VALUE": true, 99 | "text.WordShape.IS_PUNCT_OR_SYMBOL": true, 100 | "text.WordShape.IS_UPPERCASE": true, 101 | "text.WordShape.IS_WHITESPACE": true, 102 | "text.WordpieceTokenizer": false, 103 | "text.WordpieceTokenizer.__init__": true, 104 | "text.WordpieceTokenizer.name": true, 105 | "text.WordpieceTokenizer.name_scope": true, 106 | "text.WordpieceTokenizer.submodules": true, 107 | "text.WordpieceTokenizer.tokenize": true, 108 | "text.WordpieceTokenizer.tokenize_with_offsets": true, 109 | "text.WordpieceTokenizer.trainable_variables": true, 110 | "text.WordpieceTokenizer.variables": true, 111 | "text.WordpieceTokenizer.with_name_scope": true, 112 | "text.case_fold_utf8": false, 113 | "text.coerce_to_structurally_valid_utf8": false, 114 | "text.gather_with_default": false, 115 | "text.greedy_constrained_sequence": false, 116 | "text.ngrams": false, 117 | "text.normalize_utf8": false, 118 | "text.pad_along_dimension": false, 119 | "text.sentence_fragments": false, 120 | "text.sliding_window": false, 121 | "text.span_alignment": false, 122 | "text.span_overlaps": false, 123 | "text.viterbi_constrained_sequence": false, 124 | "text.wordshape": false 125 | }, 126 | "py_module_names": [ 127 | "text" 128 | ] 129 | } -------------------------------------------------------------------------------- /tensorflow_text/python/ops/sliding_window_op.py: -------------------------------------------------------------------------------- 1 | 
# coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Sliding window op. 17 | 18 | Returns a sliding window of data with a specified width. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import print_function 23 | 24 | from tensorflow.python.framework import errors 25 | from tensorflow.python.framework import ops 26 | from tensorflow.python.ops import array_ops 27 | from tensorflow.python.ops.ragged import ragged_tensor 28 | 29 | 30 | def sliding_window(data, width, axis=-1, name=None): 31 | """Builds a sliding window for `data` with a specified width. 32 | 33 | Returns a tensor constructed from `data`, where each element in 34 | dimension `axis` is a slice of `data` starting at the corresponding 35 | position, with the given width and step size. I.e.: 36 | 37 | * `result.shape.ndims = data.shape.ndims + 1` 38 | * `result[i1..iaxis, a] = data[i1..iaxis, a:a+width]` 39 | (where `0 <= a < data[i1...iaxis].shape[0] - (width - 1)`). 40 | 41 | Note that each result row (along dimension `axis`) has `width - 1` fewer items 42 | than the corresponding `data` row. If a `data` row has fewer than `width` 43 | items, then the corresponding `result` row will be empty. If you wish for 44 | the `result` rows to be the same size as the `data` rows, you can use 45 | `pad_along_dimension` to add `width - 1` padding elements before calling 46 | this op. 47 | 48 | Args: 49 | data: ` [O1...ON, A, I1...IM]` 50 | A potentially ragged K-dimensional tensor with outer dimensions of size 51 | `O1...ON`; axis dimension of size `A`; and inner dimensions of size 52 | `I1...IM`. I.e. `K = N + 1 + M`, where `N>=0` and `M>=0`. 53 | 54 | width: An integer constant specifying the width of the window. Must be 55 | greater than zero. 56 | 57 | axis: An integer constant specifying the axis along which sliding window 58 | is computed. Negative axis values from `-K` to `-1` are supported. 
59 | 60 | name: The name for this op (optional) 61 | 62 | Returns: 63 | A `K+1` dimensional tensor with the same dtype as `data`, where: 64 | 65 | * `result[i1..iaxis, a]` = `data[i1..iaxis, a:a+width]` 66 | * `result.shape[:axis]` = `data.shape[:axis]` 67 | * `result.shape[axis]` = `data.shape[axis] - (width - 1)` 68 | * `result.shape[axis + 1]` = `width` 69 | * `result.shape[axis + 2:]` = `data.shape[axis + 1:]` 70 | 71 | #### Examples: 72 | 73 | Sliding window (width=3) across a sequence of tokens: 74 | 75 | ```python 76 | >>> # input: [sequence_length] 77 | >>> input = tf.constant(["one", "two", "three", "four", "five", "six"]) 78 | >>> # output: [sequence_length-2, 3] 79 | >>> output = sliding_window(data=input, width=3, axis=0) 80 | >>> print output.eval() 81 | [["one", "two", "three"], 82 | ["two", "three", "four"], 83 | ["three", "four", "five"], 84 | ["four", "five", "six"]] 85 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 86 | Shape: (6,) -> (4, 3) 87 | ``` 88 | 89 | Sliding window (width=2) across the inner dimension of a ragged matrix 90 | containing a batch of token sequences: 91 | 92 | ```python 93 | >>> # input: [num_sentences, (num_words)] 94 | >>> input = tf.ragged.constant( 95 | ... [['Up', 'high', 'in', 'the', 'air'], 96 | ... ['Down', 'under', 'water'], 97 | ... ['Away', 'to', 'outer', 'space']]) 98 | >>> # output: [num_sentences, (num_words-1), 2] 99 | >>> output = sliding_window(input, width=2, axis=-1) 100 | >>> print output.eval() 101 | [[['Up', 'high'], ['high', 'in'], ['in', 'the'], ['the', 'air']], 102 | [['Down', 'under'], ['under', 'water']], 103 | [['Away', 'to'], ['to', 'outer'], ['outer', 'space']]] 104 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 105 | Shape: (3, ?) -> (3, ?, 2) 106 | ``` 107 | 108 | Sliding window across the second dimension of a 3-D tensor containing 109 | batches of sequences of embedding vectors: 110 | 111 | ```python 112 | >>> # input: [num_sequences, sequence_length, embedding_size] 113 | >>> input = tf.constant([ 114 | ... [[1, 1, 1], [2, 2, 1], [3, 3, 1], [4, 4, 1], [5, 5, 1]], 115 | ...
[[1, 1, 2], [2, 2, 2], [3, 3, 2], [4, 4, 2], [5, 5, 2]]]) 116 | >>> # output: [num_sequences, sequence_length-1, 2, embedding_size] 117 | >>> output = sliding_window(data=input, width=2, axis=1) 118 | >>> print output.eval() 119 | [[[[1, 1, 1], [2, 2, 1]], 120 | [[2, 2, 1], [3, 3, 1]], 121 | [[3, 3, 1], [4, 4, 1]], 122 | [[4, 4, 1], [5, 5, 1]]], 123 | [[[1, 1, 2], [2, 2, 2]], 124 | [[2, 2, 2], [3, 3, 2]], 125 | [[3, 3, 2], [4, 4, 2]], 126 | [[4, 4, 2], [5, 5, 2]]]] 127 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 128 | Shape: (2, 5, 3) -> (2, 4, 2, 3) 129 | ``` 130 | """ 131 | with ops.name_scope(name, "SlidingWindow", [data, axis]): 132 | data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name="data") 133 | 134 | if not isinstance(axis, int): 135 | raise TypeError("axis must be an int") 136 | 137 | if not isinstance(width, int): 138 | raise TypeError("width must be an int") 139 | 140 | if data.shape.ndims is not None and (axis < -data.shape.ndims or 141 | axis >= data.shape.ndims): 142 | raise errors.InvalidArgumentError( 143 | None, None, "axis must be between -k <= axis <= -1 OR 0 <= axis < k") 144 | 145 | if width <= 0: 146 | raise errors.InvalidArgumentError( 147 | None, None, "width must be an integer greater than 0") 148 | 149 | slices = [] 150 | for start in range(width): 151 | stop = None if start - width + 1 == 0 else start - width + 1 152 | if axis >= 0: 153 | idx = [slice(None)] * axis + [slice(start, stop)] 154 | else: 155 | idx = [Ellipsis, slice(start, stop)] + [slice(None)] * (-axis - 1) 156 | slices.append(data[idx]) 157 | 158 | # Stack the slices. 159 | stack_axis = axis + 1 if axis >= 0 else axis 160 | return array_ops.stack(slices, stack_axis) 161 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/create_feature_bitmask_op_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Tests for create_feature_bitmask_op.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | from tensorflow.python.framework import constant_op 23 | from tensorflow.python.framework import dtypes 24 | from tensorflow.python.framework import errors 25 | from tensorflow.python.framework import test_util 26 | from tensorflow.python.ops import array_ops 27 | from tensorflow.python.platform import test 28 | from tensorflow_text.python.ops import create_feature_bitmask_op 29 | 30 | 31 | @test_util.run_all_in_graph_and_eager_modes 32 | class CreateFeatureBitmaskOpTest(test_util.TensorFlowTestCase): 33 | 34 | def test_docstring_example1(self): 35 | data = [True, False, False, True] 36 | result = create_feature_bitmask_op.create_feature_bitmask(data) 37 | self.assertAllEqual(result, 0b1001) 38 | 39 | def test_docstring_example2(self): 40 | data = [[True, False], [False, True], [True, True]] 41 | result = create_feature_bitmask_op.create_feature_bitmask(data) 42 | expected_result = constant_op.constant([0b10, 0b01, 0b11]) 43 | self.assertAllEqual(result, expected_result) 44 | 45 | def test_feature_bitmask_single_dim_single_tensor(self): 46 | """Test that the op can reduce a single-dimension tensor to a constant.""" 47 | data = constant_op.constant([True, False]) 48 | result = create_feature_bitmask_op.create_feature_bitmask(data) 49 | 50 | expected_result = constant_op.constant(2) 51 | self.assertAllEqual(expected_result, result) 52 | 53 | def test_feature_bitmask_multiple_tensors_stack(self): 54 | """Test that the op can reduce a stacked list of tensors.""" 55 | data_1 = constant_op.constant([True, False]) 56 | data_2 = constant_op.constant([False, True]) 57 | stack_data = array_ops.stack([data_1, data_2], -1) 58 | 59 | expected_result = constant_op.constant([2, 1]) 60 | result = create_feature_bitmask_op.create_feature_bitmask(stack_data) 61 | self.assertAllEqual(expected_result, result) 62 | 63 | def test_feature_bitmask_multi_dim_single_tensor(self): 64 | """Test that the op can reduce a multi-dimension tensor.""" 65 | data = constant_op.constant([[True, True, False], [True, False, False]]) 66 | result = create_feature_bitmask_op.create_feature_bitmask(data) 67 | 68 | expected_result = constant_op.constant([6, 4]) 69 | self.assertAllEqual(expected_result, result) 70 | 71 | def test_feature_bitmask_3_dim_single_tensor(self): 72 | """Test that the op can reduce a 3-dimension tensor.""" 73 | data = constant_op.constant([[[True, True, False], [True, False, False]], 74 | [[False, False, True], [True, False, True]]]) 75 | result = create_feature_bitmask_op.create_feature_bitmask(data) 76 | 77 | expected_result = constant_op.constant([[6, 4], [1, 5]]) 78 | self.assertAllEqual(expected_result, result) 79 | 80 | def test_feature_bitmask_multiple_tensors_multi_dim_stack(self): 81 | """Test that the op can reduce a stacked list of multi-dim tensors.""" 82 | data_1 = constant_op.constant([[True, False], [False, True]]) 83 | data_2 = constant_op.constant([[False, True], [True, True]]) 84 | stack_data = array_ops.stack([data_1, data_2], -1) 85 | 86 | expected_result = constant_op.constant([[2, 1], [1, 3]]) 87 | result = create_feature_bitmask_op.create_feature_bitmask(stack_data) 88 | self.assertAllEqual(expected_result, result) 89 | 90 | def test_supports_tensors_with_unknown_shape(self): 91 | """Test that the op handles tensors with unknown shape.""" 92 | data = array_ops.placeholder_with_default( 93 | 
constant_op.constant([[[True, True, False], [True, False, False]], 94 | [[False, False, True], [True, False, True]]]), 95 | shape=None) 96 | result = create_feature_bitmask_op.create_feature_bitmask(data) 97 | 98 | expected_result = constant_op.constant([[6, 4], [1, 5]]) 99 | 100 | self.assertAllEqual(expected_result, result) 101 | 102 | def test_feature_bitmask_multiple_tensors_error(self): 103 | """Test that the op errors when presented with a single tensor.""" 104 | data_1 = constant_op.constant([True, False]) 105 | data_2 = constant_op.constant([True, True]) 106 | list_data = [data_1, data_2] 107 | error_message = 'CreateFeatureBitmask does not support lists of tensors.*' 108 | 109 | with self.assertRaisesRegexp(errors.InvalidArgumentError, error_message): 110 | _ = create_feature_bitmask_op.create_feature_bitmask(list_data) 111 | 112 | def test_unsupported_dtype_type(self): 113 | data = constant_op.constant([True, False]) 114 | bad_dtype = dtypes.uint32 115 | error_message = 'dtype must be one of: .*, was %s' % bad_dtype.name 116 | 117 | with self.assertRaisesRegexp(errors.InvalidArgumentError, error_message): 118 | _ = create_feature_bitmask_op.create_feature_bitmask( 119 | data, dtype=bad_dtype) 120 | 121 | def test_unsupported_input_type(self): 122 | data = constant_op.constant([1.0, 0.0]) 123 | error_message = ('Tensor conversion requested dtype bool for Tensor' 124 | ' with dtype float32: .*') 125 | 126 | with self.assertRaisesRegexp(ValueError, error_message): 127 | _ = create_feature_bitmask_op.create_feature_bitmask(data) 128 | 129 | def test_larger_than_max_shape(self): 130 | data = array_ops.fill([2, 64], False) 131 | error_message = r'data.shape\[-1\] must be less than 64, is 64.' 132 | 133 | with self.assertRaisesRegexp(ValueError, error_message): 134 | _ = create_feature_bitmask_op.create_feature_bitmask(data) 135 | 136 | def test_larger_than_dtype_shape(self): 137 | data = array_ops.fill([2, 9], False) 138 | error_message = (r'data.shape\[-1\] is too large for %s \(was 9, cannot ' 139 | r'exceed 8\).*') % dtypes.uint8.name 140 | 141 | with self.assertRaisesRegexp(ValueError, error_message): 142 | _ = create_feature_bitmask_op.create_feature_bitmask( 143 | data, dtype=dtypes.uint8) 144 | 145 | def test_larger_than_dtype_shape_at_runtime(self): 146 | data = array_ops.placeholder_with_default( 147 | array_ops.fill([2, 9], False), shape=None) 148 | error_message = (r'.*data.shape\[-1\] is too large for %s.*' % 149 | dtypes.uint8.name) 150 | 151 | with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), 152 | error_message): 153 | self.evaluate( 154 | create_feature_bitmask_op.create_feature_bitmask( 155 | data, dtype=dtypes.uint8)) 156 | 157 | 158 | if __name__ == '__main__': 159 | test.main() 160 | -------------------------------------------------------------------------------- /third_party/tensorflow/tf_configure.bzl: -------------------------------------------------------------------------------- 1 | """Setup TensorFlow as external dependency. 2 | 3 | This is used for the generation of the dynamic libraries used for custom ops. 
4 | See: http://github.com/tensorflow/custom-op 5 | """ 6 | 7 | _TF_HEADER_DIR = "TF_HEADER_DIR" 8 | _TF_SHARED_LIBRARY_DIR = "TF_SHARED_LIBRARY_DIR" 9 | 10 | def _tpl(repository_ctx, tpl, substitutions = {}, out = None): 11 | if not out: 12 | out = tpl 13 | repository_ctx.template( 14 | out, 15 | Label("//third_party/tensorflow:%s.tpl" % tpl), 16 | substitutions, 17 | ) 18 | 19 | def _fail(msg): 20 | """Output failure message when auto configuration fails.""" 21 | red = "\033[0;31m" 22 | no_color = "\033[0m" 23 | fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg)) 24 | 25 | def _is_windows(repository_ctx): 26 | """Returns true if the host operating system is windows.""" 27 | os_name = repository_ctx.os.name.lower() 28 | if os_name.find("windows") != -1: 29 | return True 30 | return False 31 | 32 | def _execute( 33 | repository_ctx, 34 | cmdline, 35 | error_msg = None, 36 | error_details = None, 37 | empty_stdout_fine = False): 38 | """Executes an arbitrary shell command. 39 | 40 | Helper for executes an arbitrary shell command. 41 | 42 | Args: 43 | repository_ctx: the repository_ctx object. 44 | cmdline: list of strings, the command to execute. 45 | error_msg: string, a summary of the error if the command fails. 46 | error_details: string, details about the error or steps to fix it. 47 | empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise 48 | it's an error. 49 | 50 | Returns: 51 | The result of repository_ctx.execute(cmdline). 52 | """ 53 | result = repository_ctx.execute(cmdline) 54 | if result.stderr or not (empty_stdout_fine or result.stdout): 55 | _fail("\n".join([ 56 | error_msg.strip() if error_msg else "Repository command failed", 57 | result.stderr.strip(), 58 | error_details if error_details else "", 59 | ])) 60 | return result 61 | 62 | def _read_dir(repository_ctx, src_dir): 63 | """Returns a string with all files in a directory. 64 | 65 | Finds all files inside a directory, traversing subfolders and following 66 | symlinks. The returned string contains the full path of all files 67 | separated by line breaks. 68 | 69 | Args: 70 | repository_ctx: the repository_ctx object. 71 | src_dir: directory to find files from. 72 | 73 | Returns: 74 | A string of all files inside the given dir. 75 | """ 76 | if _is_windows(repository_ctx): 77 | src_dir = src_dir.replace("/", "\\") 78 | find_result = _execute( 79 | repository_ctx, 80 | ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], 81 | empty_stdout_fine = True, 82 | ) 83 | 84 | # src_files will be used in genrule.outs where the paths must 85 | # use forward slashes. 86 | result = find_result.stdout.replace("\\", "/") 87 | else: 88 | find_result = _execute( 89 | repository_ctx, 90 | ["find", src_dir, "-follow", "-type", "f"], 91 | empty_stdout_fine = True, 92 | ) 93 | result = find_result.stdout 94 | return result 95 | 96 | def _genrule(genrule_name, command, outs): 97 | """Returns a string with a genrule. 98 | 99 | Genrule executes the given command and produces the given outputs. 100 | 101 | Args: 102 | genrule_name: A unique name for genrule target. 103 | command: The command to run. 104 | outs: A list of files generated by this rule. 105 | 106 | Returns: 107 | A genrule target. 
108 | """ 109 | return ( 110 | "genrule(\n" + 111 | ' name = "' + 112 | genrule_name + '",\n' + 113 | " outs = [\n" + 114 | outs + 115 | "\n ],\n" + 116 | ' cmd = """\n' + 117 | command + 118 | '\n """,\n' + 119 | ")\n" 120 | ) 121 | 122 | def _norm_path(path): 123 | """Returns a path with '/' and remove the trailing slash.""" 124 | path = path.replace("\\", "/") 125 | if path[-1] == "/": 126 | path = path[:-1] 127 | return path 128 | 129 | def _symlink_genrule_for_dir( 130 | repository_ctx, 131 | src_dir, 132 | dest_dir, 133 | genrule_name, 134 | src_files = [], 135 | dest_files = []): 136 | """Returns a genrule to symlink(or copy if on Windows) a set of files. 137 | 138 | If src_dir is passed, files will be read from the given directory; otherwise 139 | we assume files are in src_files and dest_files. 140 | 141 | Args: 142 | repository_ctx: the repository_ctx object. 143 | src_dir: source directory. 144 | dest_dir: directory to create symlink in. 145 | genrule_name: genrule name. 146 | src_files: list of source files instead of src_dir. 147 | dest_files: list of corresonding destination files. 148 | 149 | Returns: 150 | genrule target that creates the symlinks. 151 | """ 152 | if src_dir != None: 153 | src_dir = _norm_path(src_dir) 154 | dest_dir = _norm_path(dest_dir) 155 | files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines())) 156 | 157 | # Create a list with the src_dir stripped to use for outputs. 158 | dest_files = files.replace(src_dir, "").splitlines() 159 | src_files = files.splitlines() 160 | command = [] 161 | outs = [] 162 | for i in range(len(dest_files)): 163 | if dest_files[i] != "": 164 | # If we have only one file to link we do not want to use the dest_dir, as 165 | # $(@D) will include the full path to the file. 166 | dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i] 167 | 168 | # Copy the headers to create a sandboxable setup. 169 | cmd = "cp -f" 170 | command.append(cmd + ' "%s" "%s"' % (src_files[i], dest)) 171 | outs.append(' "' + dest_dir + dest_files[i] + '",') 172 | genrule = _genrule( 173 | genrule_name, 174 | " && ".join(command), 175 | "\n".join(outs), 176 | ) 177 | return genrule 178 | 179 | def _tf_pip_impl(repository_ctx): 180 | tf_header_dir = repository_ctx.os.environ[_TF_HEADER_DIR] 181 | tf_header_rule = _symlink_genrule_for_dir( 182 | repository_ctx, 183 | tf_header_dir, 184 | "include", 185 | "tf_header_include", 186 | ) 187 | 188 | tf_shared_library_dir = repository_ctx.os.environ[_TF_SHARED_LIBRARY_DIR] 189 | tf_shared_library_path = "%s/libtensorflow_framework.so.1" % tf_shared_library_dir 190 | tf_shared_library_rule = _symlink_genrule_for_dir( 191 | repository_ctx, 192 | None, 193 | "", 194 | "libtensorflow_framework.so.1", 195 | [tf_shared_library_path], 196 | ["libtensorflow_framework.so.1"], 197 | ) 198 | 199 | _tpl(repository_ctx, "BUILD", { 200 | "%{TF_HEADER_GENRULE}": tf_header_rule, 201 | "%{TF_SHARED_LIBRARY_GENRULE}": tf_shared_library_rule, 202 | }) 203 | 204 | tf_configure = repository_rule( 205 | implementation = _tf_pip_impl, 206 | environ = [ 207 | _TF_HEADER_DIR, 208 | _TF_SHARED_LIBRARY_DIR, 209 | ], 210 | ) 211 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/WordpieceTokenizer.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | # text.WordpieceTokenizer 16 | 17 | ## Class `WordpieceTokenizer` 18 | 19 | Creates a wordpiece tokenizer. 20 | 21 | Inherits From: [`TokenizerWithOffsets`](../text/TokenizerWithOffsets.md) 22 | 23 | Defined in 24 | [`python/ops/wordpiece_tokenizer.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/wordpiece_tokenizer.py). 25 | 26 | 27 | 28 | It tokenizes utf-8 encoded tokens into subword pieces based off of a vocab. 29 | 30 |

### `__init__`

31 | 32 | ```python 33 | __init__( 34 | vocab_lookup_table, 35 | suffix_indicator='##', 36 | max_bytes_per_word=100, 37 | token_out_type=dtypes.int64, 38 | unknown_token='[UNK]' 39 | ) 40 | ``` 41 | 42 | Initializes the WordpieceTokenizer. 43 | 44 | #### Args: 45 | 46 | * `vocab_lookup_table`: A lookup table implementing the LookupInterface 47 | containing the vocabulary of subwords. 48 | * `suffix_indicator`: (optional) The characters prepended to a 49 | wordpiece to indicate that it is a suffix to another subword. Default is 50 | '##'. 51 | * `max_bytes_per_word`: (optional) Max size of input token. Default 52 | is 100. 53 | * `token_out_type`: (optional) The type of the token to return. This 54 | can be `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`. 55 | * `unknown_token`: (optional) The value to use when an unknown token is 56 | found. Default is "[UNK]". If this is set to a string, and `token_out_type` 57 | is `tf.int64`, the `vocab_lookup_table` is used to convert the 58 | `unknown_token` to an integer. If this is set to `None`, out-of-vocabulary 59 | tokens are left as is. 60 | 61 | ## Properties 62 | 63 |

### `name`

64 | 65 | Returns the name of this module as passed or determined in the ctor. 66 | 67 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 68 | module names. 69 | 70 |

### `name_scope`

71 | 72 | Returns a `tf.name_scope` instance for this class. 73 | 74 |
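Both properties come from `tf.Module`. A brief, hedged sketch of inspecting them follows; the default names printed here are illustrative only (they depend on the implementation), and `vocab_table` is assumed to be built as in the `tokenize` example further below:

```python
import tensorflow as tf
import tensorflow_text as text

# Assumes a `vocab_table` lookup table built elsewhere (see the tokenize
# example later in this page).
tokenizer = text.WordpieceTokenizer(vocab_table, token_out_type=tf.string)

print(tokenizer.name)  # The module's own name, e.g. 'wordpiece_tokenizer'.
with tokenizer.name_scope:
  # Ops created inside this block are prefixed with the tokenizer's name scope.
  token_ids = tf.constant([1, 2, 3], name="token_ids")
```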

### `submodules`

75 | 76 | Sequence of all sub-modules. 77 | 78 | Submodules are modules which are properties of this module, or found as 79 | properties of modules which are properties of this module (and so on). 80 | 81 | ``` 82 | a = tf.Module() 83 | b = tf.Module() 84 | c = tf.Module() 85 | a.b = b 86 | b.c = c 87 | assert list(a.submodules) == [b, c] 88 | assert list(b.submodules) == [c] 89 | assert list(c.submodules) == [] 90 | ``` 91 | 92 | #### Returns: 93 | 94 | A sequence of all submodules. 95 | 96 |

### `trainable_variables`

97 | 98 | Sequence of variables owned by this module and its submodules. 99 | 100 | Note: this method uses reflection to find variables on the current instance and 101 | submodules. For performance reasons you may wish to cache the result of calling 102 | this method if you don't expect the return value to change. 103 | 104 | #### Returns: 105 | 106 | A sequence of variables for the current module (sorted by attribute name) 107 | followed by variables from all submodules recursively (breadth first). 108 | 109 |

### `variables`

110 | 111 | Sequence of variables owned by this module and its submodules. 112 | 113 | Note: this method uses reflection to find variables on the current instance and 114 | submodules. For performance reasons you may wish to cache the result of calling 115 | this method if you don't expect the return value to change. 116 | 117 | #### Returns: 118 | 119 | A sequence of variables for the current module (sorted by attribute name) 120 | followed by variables from all submodules recursively (breadth first). 121 | 122 | ## Methods 123 | 124 |

### `tokenize`

125 | 126 | ```python 127 | tokenize(input) 128 | ``` 129 | 130 | Splits tokens further into wordpiece tokens. 131 | 132 | ### Example: 133 | 134 | ```python 135 | >>> tokens = [["they're", "the", "greatest"]] 136 | >>> tokenizer = WordpieceTokenizer(vocab, token_out_type=tf.string) 137 | >>> tokenizer.tokenize(tokens) 138 | [[['they', "##'", '##re'], ['the'], ['great', '##est']]] 139 | ``` 140 | 141 | #### Args: 142 | 143 | * `input`: An N-dimensional `Tensor` or `RaggedTensor` of UTF-8 144 | strings. 145 | 146 | #### Returns: 147 | 148 | A `RaggedTensor` `tokens` where `tokens[i1...iN, j]` is the string contents, or 149 | ID in the vocab_lookup_table representing that string, of the `j`th token in 150 | `input[i1...iN]`. 151 | 152 |
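The example above assumes a `vocab` lookup table already exists. A minimal sketch of building such a table with `tf.lookup` and tokenizing with it — the toy vocabulary below is made up purely for illustration:

```python
import tensorflow as tf
import tensorflow_text as text

# Toy vocabulary; a real vocab would normally be loaded from a vocab file.
vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string,
    value_dtype=tf.int64)
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=1)

tokenizer = text.WordpieceTokenizer(vocab_table, token_out_type=tf.string)
tokens = tokenizer.tokenize([["they're", "the", "greatest"]])
# Expected, as in the example above:
# [[['they', "##'", '##re'], ['the'], ['great', '##est']]]
```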

### `tokenize_with_offsets`

153 | 154 | ```python 155 | tokenize_with_offsets(input) 156 | ``` 157 | 158 | Tokenizes utf-8 encoded tokens into subword pieces based off of a vocab. 159 | 160 | ### Example: 161 | 162 | ```python 163 | >>> tokens = [["they're", "the", "greatest"]] 164 | >>> tokenizer = WordpieceTokenizer(vocab, token_out_type=tf.string) 165 | >>> result = tokenizer.tokenize_with_offsets(tokens) 166 | >>> result[0].to_list() # subwords 167 | [[['they', "##'", '##re'], ['the'], ['great', '##est']]] 168 | >>> result[1].to_list() # offset starts 169 | [[[0, 4, 5], [0], [0, 5]]] 170 | >>> result[2].to_list() # offset limits 171 | [[[4, 5, 7], [3], [5, 8]]] 172 | ``` 173 | 174 | #### Args: 175 | 176 | * `input`: An N-dimensional `Tensor` or `RaggedTensor` of UTF-8 177 | strings. 178 | 179 | #### Returns: 180 | 181 | A tuple of `RaggedTensor`s `tokens`, `start_offsets`, and `limit_offsets`, where: 182 | 183 | * `tokens[i1...iN, j]` is the string contents, or ID in the 184 | vocab_lookup_table representing that string, of the `j`th token in 185 | `input[i1...iN]` 186 | * `start_offsets[i1...iN, j]` is the byte offset for the start of the 187 | `j`th token in `input[i1...iN]` 188 | * `limit_offsets[i1...iN, j]` is the byte offset for the end of the 189 | `j`th token in `input[i1...iN]` 190 |
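Because the offsets are byte positions within each input token, they can be used to slice the original strings back out. A small sketch, assuming eager execution and the `tokenizer` built as in the example above:

```python
subwords, starts, limits = tokenizer.tokenize_with_offsets(
    [["they're", "the", "greatest"]])

# Offsets for the pieces of the first token, "they're".
piece_starts = starts[0][0].numpy()   # e.g. [0, 4, 5]
piece_limits = limits[0][0].numpy()   # e.g. [4, 5, 7]

token = b"they're"
pieces = [token[s:e] for s, e in zip(piece_starts, piece_limits)]
# pieces == [b"they", b"'", b"re"] -- the original bytes, without "##" markers.
```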

### `with_name_scope`

191 | 192 | ```python 193 | with_name_scope( 194 | cls, 195 | method 196 | ) 197 | ``` 198 | 199 | Decorator to automatically enter the module name scope. 200 | 201 | ``` 202 | class MyModule(tf.Module): 203 | @tf.Module.with_name_scope 204 | def __call__(self, x): 205 | if not hasattr(self, 'w'): 206 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 207 | return tf.matmul(x, self.w) 208 | ``` 209 | 210 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 211 | included the module name: 212 | 213 | ``` 214 | mod = MyModule() 215 | mod(tf.ones([8, 32])) 216 | # ==> 217 | mod.w 218 | # ==> 219 | ``` 220 | 221 | #### Args: 222 | 223 | * `method`: The method to wrap. 224 | 225 | #### Returns: 226 | 227 | The original method wrapped such that it enters the module's name scope. 228 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/ngrams_op_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for ngram ops.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | from tensorflow.python.framework import constant_op 23 | from tensorflow.python.framework import errors 24 | from tensorflow.python.framework import test_util 25 | from tensorflow.python.ops.ragged import ragged_factory_ops 26 | from tensorflow.python.ops.ragged import ragged_test_util 27 | from tensorflow.python.platform import test 28 | from tensorflow_text.python.ops import ngrams_op 29 | 30 | 31 | @test_util.run_all_in_graph_and_eager_modes 32 | class NgramsOpTest(ragged_test_util.RaggedTensorTestCase): 33 | 34 | def testSumReduction(self): 35 | test_data = constant_op.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]) 36 | op = ngrams_op.ngrams( 37 | test_data, width=2, axis=1, reduction_type=ngrams_op.Reduction.SUM) 38 | expected_values = [[3.0, 5.0], [30.0, 50.0]] 39 | 40 | self.assertRaggedEqual(expected_values, op) 41 | 42 | def testRaggedSumReduction(self): 43 | test_data = ragged_factory_ops.constant([[1.0, 2.0, 3.0, 4.0], 44 | [10.0, 20.0, 30.0]]) 45 | op = ngrams_op.ngrams( 46 | test_data, width=2, axis=1, reduction_type=ngrams_op.Reduction.SUM) 47 | expected_values = [[3.0, 5.0, 7.0], [30.0, 50.0]] 48 | 49 | self.assertRaggedEqual(expected_values, op) 50 | 51 | def testRaggedSumReductionAxisZero(self): 52 | test_data = ragged_factory_ops.constant([[1.0, 2.0, 3.0, 4.0], 53 | [10.0, 20.0, 30.0, 40.0]]) 54 | op = ngrams_op.ngrams( 55 | test_data, width=2, axis=0, reduction_type=ngrams_op.Reduction.SUM) 56 | expected_values = [[11.0, 22.0, 33.0, 44.0]] 57 | 58 | self.assertRaggedEqual(expected_values, op) 59 | 60 | def testMeanReduction(self): 61 | test_data = constant_op.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]) 62 | op = ngrams_op.ngrams( 63 | 
test_data, width=2, axis=1, reduction_type=ngrams_op.Reduction.MEAN) 64 | expected_values = [[1.5, 2.5], [15.0, 25.0]] 65 | 66 | self.assertRaggedEqual(expected_values, op) 67 | 68 | def testRaggedMeanReduction(self): 69 | test_data = ragged_factory_ops.constant([[1.0, 2.0, 3.0, 4.0], 70 | [10.0, 20.0, 30.0]]) 71 | op = ngrams_op.ngrams( 72 | test_data, width=2, axis=-1, reduction_type=ngrams_op.Reduction.MEAN) 73 | expected_values = [[1.5, 2.5, 3.5], [15.0, 25.0]] 74 | 75 | self.assertRaggedEqual(expected_values, op) 76 | 77 | def testStringJoinReduction(self): 78 | test_data = constant_op.constant([["a", "b", "c"], ["dd", "ee", "ff"]]) 79 | op = ngrams_op.ngrams( 80 | test_data, 81 | width=2, 82 | axis=-1, 83 | reduction_type=ngrams_op.Reduction.STRING_JOIN, 84 | string_separator="|") 85 | expected_values = [["a|b", "b|c"], ["dd|ee", "ee|ff"]] 86 | 87 | self.assertRaggedEqual(expected_values, op) 88 | 89 | def testStringJoinReductionAxisZero(self): 90 | test_data = constant_op.constant(["a", "b", "c"]) 91 | op = ngrams_op.ngrams( 92 | test_data, 93 | width=2, 94 | axis=-1, # The -1 axis is the zero axis here. 95 | reduction_type=ngrams_op.Reduction.STRING_JOIN, 96 | string_separator="|") 97 | expected_values = ["a|b", "b|c"] 98 | 99 | self.assertRaggedEqual(expected_values, op) 100 | 101 | def testRaggedStringJoinReduction(self): 102 | test_data = ragged_factory_ops.constant([["a", "b", "c"], ["dd", "ee"]]) 103 | op = ngrams_op.ngrams( 104 | test_data, 105 | width=2, 106 | axis=-1, 107 | reduction_type=ngrams_op.Reduction.STRING_JOIN, 108 | string_separator="|") 109 | expected_values = [["a|b", "b|c"], ["dd|ee"]] 110 | 111 | self.assertRaggedEqual(expected_values, op) 112 | 113 | def testReductionWithNegativeAxis(self): 114 | test_data = constant_op.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]) 115 | op = ngrams_op.ngrams( 116 | test_data, width=2, axis=-1, reduction_type=ngrams_op.Reduction.SUM) 117 | expected_values = [[3.0, 5.0], [30.0, 50.0]] 118 | 119 | self.assertRaggedEqual(expected_values, op) 120 | 121 | def testReductionOnInnerAxis(self): 122 | test_data = constant_op.constant([[[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]], 123 | [[4.0, 5.0, 6.0], [40.0, 50.0, 60.0]]]) 124 | op = ngrams_op.ngrams( 125 | test_data, width=2, axis=-2, reduction_type=ngrams_op.Reduction.SUM) 126 | expected_values = [[[11.0, 22.0, 33.0]], [[44.0, 55.0, 66.0]]] 127 | 128 | self.assertRaggedEqual(expected_values, op) 129 | 130 | def testRaggedReductionOnInnerAxis(self): 131 | test_data = ragged_factory_ops.constant([[[1.0, 2.0, 3.0, 4.0], 132 | [10.0, 20.0, 30.0, 40.0]], 133 | [[100.0, 200.0], [300.0, 400.0]]]) 134 | op = ngrams_op.ngrams( 135 | test_data, width=2, axis=-2, reduction_type=ngrams_op.Reduction.SUM) 136 | expected_values = [[[11.0, 22.0, 33.0, 44.0]], [[400.0, 600.0]]] 137 | 138 | self.assertRaggedEqual(expected_values, op) 139 | 140 | def testReductionOnAxisWithInsufficientValuesReturnsEmptySet(self): 141 | test_data = constant_op.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]) 142 | op = ngrams_op.ngrams( 143 | test_data, width=4, axis=-1, reduction_type=ngrams_op.Reduction.SUM) 144 | expected_values = [[], []] 145 | 146 | self.assertRaggedEqual(expected_values, op) 147 | 148 | def testRaggedReductionOnAxisWithInsufficientValuesReturnsEmptySet(self): 149 | test_data = ragged_factory_ops.constant([[1.0, 2.0, 3.0], 150 | [10.0, 20.0, 30.0, 40.0]]) 151 | op = ngrams_op.ngrams( 152 | test_data, width=4, axis=1, reduction_type=ngrams_op.Reduction.SUM) 153 | expected_values = [[], [100.0]] 154 | 
155 | self.assertRaggedEqual(expected_values, op) 156 | 157 | def testStringJoinReductionFailsWithImproperAxis(self): 158 | with self.assertRaisesRegexp( 159 | errors.InvalidArgumentError, 160 | r".*requires that ngrams' 'axis' parameter be -1."): 161 | _ = ngrams_op.ngrams( 162 | data=[], 163 | width=2, 164 | axis=0, 165 | reduction_type=ngrams_op.Reduction.STRING_JOIN) 166 | 167 | def testUnspecifiedReductionTypeFails(self): 168 | with self.assertRaisesRegexp(errors.InvalidArgumentError, 169 | r"reduction_type must be specified."): 170 | _ = ngrams_op.ngrams(data=[], width=2, axis=0) 171 | 172 | def testBadReductionTypeFails(self): 173 | with self.assertRaisesRegexp(errors.InvalidArgumentError, 174 | r"reduction_type must be a Reduction."): 175 | _ = ngrams_op.ngrams(data=[], width=2, axis=0, reduction_type="SUM") 176 | 177 | 178 | if __name__ == "__main__": 179 | test.main() 180 | --------------------------------------------------------------------------------
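For orientation, a short sketch of the public `tensorflow_text` entry point these tests exercise, mirroring the sum-reduction and string-join cases above (the calls are inferred from the test code and hedged accordingly):

```python
import tensorflow as tf
import tensorflow_text as text

# Sum reduction over adjacent pairs, as in testSumReduction.
data = tf.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
sums = text.ngrams(data, width=2, axis=1, reduction_type=text.Reduction.SUM)
# => [[3.0, 5.0], [30.0, 50.0]]

# String-join reduction over the last axis, as in testStringJoinReduction.
words = tf.constant([["a", "b", "c"], ["dd", "ee", "ff"]])
bigrams = text.ngrams(words, width=2, axis=-1,
                      reduction_type=text.Reduction.STRING_JOIN,
                      string_separator="|")
# => [["a|b", "b|c"], ["dd|ee", "ee|ff"]]
```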