├── oss_scripts ├── pip_package │ ├── MANIFEST.in │ ├── BUILD │ ├── build_pip_package.sh │ └── setup.py └── configure.sh ├── third_party ├── tensorflow │ ├── BUILD │ ├── BUILD.tpl │ └── tf_configure.bzl └── icu │ ├── BUILD │ ├── BUILD.system │ ├── workspace.bzl │ └── BUILD.bazel ├── tensorflow_text ├── workspace.bzl ├── python │ ├── __init__.py │ ├── numpy │ │ ├── __init__.py │ │ └── viterbi_decode.py │ └── ops │ │ ├── coerce_to_valid_utf8_op_test.py │ │ ├── __init__.py │ │ ├── tokenization.py │ │ ├── string_ops.py │ │ ├── normalize_ops.py │ │ ├── ngrams_op.py │ │ ├── normalize_ops_test.py │ │ ├── create_feature_bitmask_op.py │ │ ├── sentence_breaking_ops.py │ │ ├── sliding_window_op.py │ │ ├── create_feature_bitmask_op_test.py │ │ └── ngrams_op_test.py ├── __init__.py └── core │ ├── ops │ ├── normalize_ops.cc │ ├── sentence_breaking_ops.cc │ ├── whitespace_tokenize_op.cc │ ├── unicode_script_tokenize_op.cc │ ├── wordpiece_op.cc │ └── constrained_sequence_op.cc │ └── kernels │ ├── wordpiece_tokenizer.h │ ├── whitespace_tokenize_kernel_test.cc │ ├── unicode_script_tokenize_kernel_test.cc │ ├── text_kernels_test_util.cc │ ├── sentence_breaking_utils.h │ ├── wordpiece_tokenizer.cc │ ├── text_kernels_test_util.h │ └── normalize_kernels.cc ├── docs └── api_docs │ └── python │ ├── text │ ├── Reduction.md │ ├── normalize_utf8.md │ ├── wordshape.md │ ├── case_fold_utf8.md │ ├── coerce_to_structurally_valid_utf8.md │ ├── gather_with_default.md │ ├── ngrams.md │ ├── pad_along_dimension.md │ ├── sentence_fragments.md │ ├── span_overlaps.md │ ├── Tokenizer.md │ ├── sliding_window.md │ ├── TokenizerWithOffsets.md │ ├── greedy_constrained_sequence.md │ ├── viterbi_constrained_sequence.md │ ├── WhitespaceTokenizer.md │ ├── span_alignment.md │ ├── UnicodeScriptTokenizer.md │ ├── _api_cache.json │ └── WordpieceTokenizer.md │ ├── index.md │ ├── _toc.yaml │ └── text.md ├── .bazelrc ├── CONTRIBUTING.md └── WORKSPACE /oss_scripts/pip_package/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tensorflow_text/ *.so 2 | -------------------------------------------------------------------------------- /third_party/tensorflow/BUILD: -------------------------------------------------------------------------------- 1 | # Needed for Bazel to treat this directory as a package 2 | -------------------------------------------------------------------------------- /third_party/icu/BUILD: -------------------------------------------------------------------------------- 1 | # This empty BUILD file is required to make Bazel treat this directory as a package. 2 | -------------------------------------------------------------------------------- /tensorflow_text/workspace.bzl: -------------------------------------------------------------------------------- 1 | """doc""" 2 | 3 | load("//third_party/icu:workspace.bzl", icu = "repo") 4 | 5 | def initialize_third_party_archives(): 6 | icu() 7 | -------------------------------------------------------------------------------- /oss_scripts/pip_package/BUILD: -------------------------------------------------------------------------------- 1 | # Tools for building the TF.Text pip package. 
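# A typical build flow, as a sketch (paths assume the standard Bazel output
# layout; the wheel lands in /tmp/tensorflow_text_pkg by default):
#   ./oss_scripts/configure.sh
#   bazel build oss_scripts/pip_package:build_pip_package
#   bazel-bin/oss_scripts/pip_package/build_pip_package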
2 | 3 | package(default_visibility = ["//visibility:private"]) 4 | 5 | licenses(["notice"]) # Apache 2.0 6 | 7 | sh_binary( 8 | name = "build_pip_package", 9 | srcs = ["build_pip_package.sh"], 10 | data = [ 11 | "LICENSE", 12 | "MANIFEST.in", 13 | "setup.py", 14 | "//tensorflow_text:tf-text", 15 | ], 16 | ) 17 | -------------------------------------------------------------------------------- /third_party/tensorflow/BUILD.tpl: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//visibility:public"]) 2 | 3 | cc_library( 4 | name = "tf_header_lib", 5 | hdrs = [":tf_header_include"], 6 | includes = ["include"], 7 | visibility = ["//visibility:public"], 8 | ) 9 | 10 | cc_library( 11 | name = "libtensorflow_framework", 12 | srcs = [":libtensorflow_framework.so.1"], 13 | #data = ["lib/libtensorflow_framework.so.1"], 14 | visibility = ["//visibility:public"], 15 | ) 16 | 17 | %{TF_HEADER_GENRULE} 18 | %{TF_SHARED_LIBRARY_GENRULE} 19 | -------------------------------------------------------------------------------- /third_party/icu/BUILD.system: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = ["//visibility:public"], 3 | ) 4 | 5 | licenses(["notice"]) # Apache 2.0 6 | 7 | filegroup( 8 | name = "icu4c/LICENSE", 9 | ) 10 | 11 | filegroup( 12 | name = "icu4j/main/shared/licenses/LICENSE", 13 | ) 14 | 15 | cc_library( 16 | name = "headers", 17 | ) 18 | 19 | cc_library( 20 | name = "common", 21 | deps = [ 22 | ":icuuc", 23 | ], 24 | ) 25 | 26 | cc_library( 27 | name = "icuuc", 28 | linkopts = ["-licuuc"], 29 | visibility = ["//visibility:private"], 30 | ) 31 | -------------------------------------------------------------------------------- /tensorflow_text/python/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Empty file required by setuptools.find_packages to recognize this as a package 17 | -------------------------------------------------------------------------------- /tensorflow_text/python/numpy/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Numpy-based code for text processing.""" 17 | 18 | from tensorflow_text.python.numpy import viterbi_decode 19 | -------------------------------------------------------------------------------- /third_party/icu/workspace.bzl: -------------------------------------------------------------------------------- 1 | """Loads a lightweight subset of the ICU library for Unicode processing.""" 2 | 3 | load("@org_tensorflow//third_party:repo.bzl", "third_party_http_archive") 4 | 5 | # Sanitize a dependency so that it works correctly from code that includes 6 | # TensorFlow as a submodule. 7 | def clean_dep(dep): 8 | return str(Label(dep)) 9 | 10 | def repo(): 11 | third_party_http_archive( 12 | name = "icu", 13 | strip_prefix = "icu-release-62-1", 14 | sha256 = "e15ffd84606323cbad5515bf9ecdf8061cc3bf80fb883b9e6aa162e485aa9761", 15 | urls = [ 16 | "http://mirror.tensorflow.org/github.com/unicode-org/icu/archive/release-62-1.tar.gz", 17 | "https://github.com/unicode-org/icu/archive/release-62-1.tar.gz", 18 | ], 19 | build_file = "//third_party/icu:BUILD.bazel", 20 | system_build_file = "//third_party/icu:BUILD.system", 21 | ) 22 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/Reduction.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 |
8 | 9 | # text.Reduction 10 | 11 | ## Class `Reduction` 12 | 13 | Type of reduction to be done by the ngram op. 14 | 15 | Defined in 16 | [`python/ops/ngrams_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/ngrams_op.py). 17 | 18 | 19 | 20 | The supported reductions are as follows: 21 | 22 | * `Reduction.SUM`: Add values in the window. 23 | * `Reduction.MEAN`: Average values in the window. 24 | * `Reduction.STRING_JOIN`: Join strings in the window. 25 | 26 | ## Class Members 27 | 28 |

* `MEAN`
29 | 30 | * `STRING_JOIN`
31 | 32 | * `SUM`
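
For orientation, a minimal sketch of passing one of these members to the ngram op (it assumes `import tensorflow as tf` and `import tensorflow_text as text`; the printed `RaggedTensor` form is abbreviated):

```python
>>> import tensorflow as tf
>>> import tensorflow_text as text
>>> data = tf.ragged.constant([[1.0, 2.0, 3.0], [4.0, 5.0]])
>>> # Sum each window of 2 adjacent values along the last axis.
>>> text.ngrams(data, width=2, axis=-1, reduction_type=text.Reduction.SUM)
<tf.RaggedTensor [[3.0, 5.0], [9.0]]>
```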

33 | 34 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/normalize_utf8.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.normalize_utf8 7 | 8 | Normalizes each UTF8 string in the input tensor using the specified rule. 9 | 10 | ``` python 11 | text.normalize_utf8( 12 | input, 13 | normalization_form='NFKC', 14 | name=None 15 | ) 16 | ``` 17 | 18 | Defined in 19 | [`python/ops/normalize_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/normalize_ops.py). 20 | 21 | 22 | 23 | See http://unicode.org/reports/tr15/ 24 | 25 | #### Args: 26 | 27 | * `input`: A `Tensor` or `RaggedTensor` of type string. (Must be 28 | UTF-8.) 29 | * `normalization_form`: One of the following string values ('NFC', 30 | 'NFKC', 'NFD', 'NFKD'). Default is 'NFKC'. 31 | * `name`: The name for this op (optional) 32 | 33 | #### Returns: 34 | 35 | A `Tensor` or `RaggedTensor` of type string, with normalized contents. 36 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/wordshape.md: -------------------------------------------------------------------------------- 1 |
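For the `text.normalize_utf8` op documented above, a minimal usage sketch (it assumes `import tensorflow_text as text` and eager execution; the printed tensor form is abbreviated):

```python
>>> import tensorflow_text as text
>>> # NFKC (the default) expands compatibility characters such as the
>>> # 'fi' ligature and fullwidth letters.
>>> text.normalize_utf8([u"ﬁne", u"ＡＢＣ"])
<tf.Tensor: ... numpy=array([b'fine', b'ABC'], dtype=object)>
```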
2 | 3 | 4 |
5 | 6 | # text.wordshape 7 | 8 | Determine wordshape features for each input string. 9 | 10 | ``` python 11 | text.wordshape( 12 | input_tensor, 13 | pattern, 14 | name=None 15 | ) 16 | ``` 17 | 18 | Defined in 19 | [`python/ops/wordshape_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/wordshape_ops.py). 20 | 21 | 22 | 23 | #### Args: 24 | 25 | * `input_tensor`: string `Tensor` with any shape. 26 | * `pattern`: A `~tftext.WordShape` or a list of WordShapes. 27 | * `name`: A name for the operation (optional). 28 | 29 | 30 | #### Returns: 31 | 32 | `[input_tensor.shape + pattern.shape]`: A tensor where 33 | `result[i1...iN, j]` is true if `input_tensor[i1...iN]` has the wordshape 34 | specified by `pattern[j]`. 35 | 36 | #### Raises: 37 | 38 | * `ValueError`: If `pattern` contains an unknown identifier. -------------------------------------------------------------------------------- /.bazelrc: -------------------------------------------------------------------------------- 1 | # TensorFlow Federated Bazel configuration. 2 | # 3 | # See https://docs.bazel.build/versions/master/user-manual.html#config for 4 | # details on the various configuration options. 5 | 6 | # Build with modular op registration support by default. 7 | build --define=framework_shared_object=true 8 | 9 | # Bazel workaround to compile gRPC with the new 'cares' package. 10 | build --define=grpc_no_ares=true 11 | 12 | # Build with optimization enabled. 13 | build --compilation_mode=opt 14 | 15 | # Processor native optimizations (depends on build host capabilities). 16 | build --copt=-march=native 17 | build --host_copt=-march=native 18 | build --copt=-O3 19 | build --copt=-Wno-sign-compare 20 | build --define with_default_optimizations=true 21 | 22 | # Disable Tensorflow extensions that are not needed for Tensorflow Federated. 23 | build --define=no_aws_support=true 24 | build --define=no_hdfs_support=true 25 | build --define=no_kafka_support=true 26 | build --define=no_ignite_support=true 27 | build --define=no_nccl_support=true 28 | 29 | # Misc configuration 30 | build:xla --define with_xla_support=true 31 | build:v2 --define=tf_api_version=2 32 | build --action_env TF_CONFIGURE_IOS="0" 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 
24 | 25 | ## Community Guidelines 26 | 27 | This project follows 28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/case_fold_utf8.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.case_fold_utf8 7 | 8 | Applies case folding to every UTF8 string in the input. 9 | 10 | ``` python 11 | text.case_fold_utf8( 12 | input, 13 | name=None 14 | ) 15 | ``` 16 | 17 | Defined in 18 | [`python/ops/normalize_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/normalize_ops.py). 19 | 20 | 21 | 22 | The input is a `Tensor` or `RaggedTensor` of any shape, and the resulting output 23 | has the same shape as the input. Note that NFKC normalization is implicitly 24 | applied to the strings. 25 | 26 | #### For example: 27 | 28 | ```python 29 | >>> case_fold_utf8(['The Quick-Brown', 30 | ... 'CAT jumped over', 31 | ... 'the lazy dog !! '] 32 | tf.Tensor(['The quick-brown' 'cat jumped over' 'the lazy dog !! '], 33 | shape=(3,), dtype=string) 34 | ``` 35 | 36 | #### Args: 37 | 38 | * `input`: A `Tensor` or `RaggedTensor` of type string. (Must be 39 | UTF-8.) 40 | * `name`: The name for this op (optional) 41 | 42 | #### Returns: 43 | 44 | A `Tensor` or `RaggedTensor` of type string, with case-folded contents. 45 | -------------------------------------------------------------------------------- /oss_scripts/configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | function write_to_bazelrc() { 18 | echo "$1" >> .bazelrc 19 | } 20 | 21 | function write_action_env_to_bazelrc() { 22 | write_to_bazelrc "build --action_env $1=\"$2\"" 23 | } 24 | 25 | if python -c "import tensorflow" &> /dev/null; then 26 | echo 'using installed tensorflow' 27 | else 28 | rm .bazelrc 29 | pip install tensorflow-2.0.0b0 30 | fi 31 | 32 | TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') ) 33 | TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') ) 34 | 35 | write_action_env_to_bazelrc "TF_HEADER_DIR" ${TF_CFLAGS:2} 36 | write_action_env_to_bazelrc "TF_SHARED_LIBRARY_DIR" ${TF_LFLAGS:2} 37 | -------------------------------------------------------------------------------- /tensorflow_text/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Various tensorflow ops related to text-processing.""" 17 | from tensorflow.python.util.all_util import remove_undocumented 18 | 19 | # pylint: disable=wildcard-import,g-import-not-at-top 20 | from tensorflow_text.python.ops import * 21 | 22 | _allowed_symbols = [ 23 | "case_fold_utf8", 24 | "coerce_to_structurally_valid_utf8", 25 | "gather_with_default", 26 | "greedy_constrained_sequence", 27 | "ngrams", 28 | "normalize_utf8", 29 | "pad_along_dimension", 30 | "Reduction", 31 | "sentence_fragments", 32 | "sliding_window", 33 | "span_alignment", 34 | "span_overlaps", 35 | "Tokenizer", 36 | "TokenizerWithOffsets", 37 | "UnicodeScriptTokenizer", 38 | "viterbi_constrained_sequence", 39 | "WhitespaceTokenizer", 40 | "wordshape", 41 | "WordShape", 42 | "WordpieceTokenizer", 43 | ] 44 | 45 | remove_undocumented(__name__, _allowed_symbols) 46 | -------------------------------------------------------------------------------- /oss_scripts/pip_package/build_pip_package.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Tool to build the TensorFlow Text pip package. 3 | # 4 | # Usage: 5 | # bazel build oss_scripts/pip_package:build_pip_package 6 | # bazel-bin/oss_scripts/build_pip_package 7 | # 8 | # Arguments: 9 | # output_dir: An output directory. Defaults to `/tmp/tensorflow_text_pkg`. 10 | 11 | set -e # fail and exit on any command erroring 12 | 13 | die() { 14 | echo >&2 "$@" 15 | exit 1 16 | } 17 | 18 | main() { 19 | local output_dir="$1" 20 | 21 | if [[ -z "${output_dir}" ]]; then 22 | output_dir="/tmp/tensorflow_text_pkg" 23 | fi 24 | mkdir -p ${output_dir} 25 | output_dir=$(readlink -f "${output_dir}") 26 | echo "=== Destination directory: ${output_dir}" 27 | 28 | if [[ ! -d "bazel-bin/tensorflow_text" ]]; then 29 | die "Could not find bazel-bin. Did you run from the root of the build tree?" 30 | fi 31 | 32 | local temp_dir="$(mktemp -d)" 33 | trap "rm -rf ${temp_dir}" EXIT 34 | echo "=== Using tmpdir ${temp_dir}" 35 | 36 | local runfiles="bazel-bin/oss_scripts/pip_package/build_pip_package.runfiles" 37 | cp -LR \ 38 | "${runfiles}/org_tensorflow_text/tensorflow_text" \ 39 | "${temp_dir}" 40 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/setup.py" \ 41 | "${temp_dir}" 42 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/MANIFEST.in" \ 43 | "${temp_dir}" 44 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/LICENSE" \ 45 | "${temp_dir}" 46 | 47 | pushd "${temp_dir}" > /dev/null 48 | 49 | # Build pip package 50 | python setup.py bdist_wheel --universal 51 | cp dist/*.whl "${output_dir}" 52 | } 53 | 54 | main "$@" 55 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/coerce_to_structurally_valid_utf8.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.coerce_to_structurally_valid_utf8 7 | 8 | Coerce UTF-8 input strings to structurally valid UTF-8. 9 | 10 | ``` python 11 | text.coerce_to_structurally_valid_utf8( 12 | input, 13 | replacement_char=_unichr(65533), 14 | name=None 15 | ) 16 | ``` 17 | 18 | Defined in 19 | [`python/ops/string_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/string_ops.py). 20 | 21 | 22 | 23 | Any bytes which cause the input string to be invalid UTF-8 are substituted 24 | with the provided replacement character codepoint (default 65533). Use a 25 | single byte replacement character codepoint to preserve alignment to the 26 | source input string. 27 | 28 | #### Args: 29 | 30 | * `input`: UTF-8 string tensor to coerce to valid UTF-8. 31 | * `replacement_char`: The replacement character to be used in place of 32 | any invalid byte in the input. Any valid Unicode character may be used. The 33 | default value is the default Unicode replacement character which is 0xFFFD 34 | (or U+65533). Note that passing a replacement character expressible in 1 35 | byte, such as ' ' or '?', will preserve string alignment to the source since 36 | individual invalid bytes will be replaced with a 1-byte replacement. 37 | (optional) 38 | * `name`: A name for the operation (optional). 39 | 40 | #### Returns: 41 | 42 | A tensor of type string with the same shape as the input. 43 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/gather_with_default.md: -------------------------------------------------------------------------------- 1 |
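For the `text.coerce_to_structurally_valid_utf8` op described above, a small sketch of the replacement behaviour (assuming `tensorflow_text` is imported as `text`; outputs shown as Python byte strings):

```python
>>> text.coerce_to_structurally_valid_utf8([b"abc\xfd"])        # 0xFD is not valid UTF-8
[b'abc\xef\xbf\xbd']                                            # U+FFFD replacement character
>>> text.coerce_to_structurally_valid_utf8([b"abc\xfd"], "?")   # 1-byte replacement keeps alignment
[b'abc?']
```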
2 | 3 | 4 |
5 | 6 | # text.gather_with_default 7 | 8 | Gather slices with `indices=-1` mapped to `default`. 9 | 10 | ``` python 11 | text.gather_with_default( 12 | params, 13 | indices, 14 | default, 15 | name=None, 16 | axis=0 17 | ) 18 | ``` 19 | 20 | Defined in 21 | [`python/ops/pointer_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pointer_ops.py). 22 | 23 | 24 | 25 | This operation is similar to `tf.gather()`, except that any value of `-1` 26 | in `indices` will be mapped to `default`. Example: 27 | 28 | ```python 29 | >>> gather_with_default(['a', 'b', 'c', 'd'], [2, 0, -1, 2, -1], '_').eval() 30 | array(['c', 'a', '_', 'c', '_'], dtype=object) 31 | ``` 32 | 33 | #### Args: 34 | 35 | * `params`: The `Tensor` from which to gather values. Must be at least 36 | rank `axis + 1`. 37 | * `indices`: The index `Tensor`. Must have dtype `int32` or `int64`, 38 | and values must be in the range `[-1, params.shape[axis])`. 39 | * `default`: The value to use when `indices` is `-1`. `default.shape` 40 | must be equal to `params.shape[axis + 1:]`. 41 | * `name`: A name for the operation (optional). 42 | * `axis`: The axis in `params` to gather `indices` from. Must be a 43 | scalar `int32` or `int64`. Supports negative indices. 44 | 45 | #### Returns: 46 | 47 | A `Tensor` with the same type as `param`, and with shape `params.shape[:axis] + 48 | indices.shape + params.shape[axis + 1:]`. 49 | -------------------------------------------------------------------------------- /docs/api_docs/python/index.md: -------------------------------------------------------------------------------- 1 | # All symbols in TensorFlow Text 2 | 3 | * text 4 | * text.Reduction 5 | * text.Tokenizer 6 | * text.TokenizerWithOffsets 7 | * text.UnicodeScriptTokenizer 8 | * text.WhitespaceTokenizer 9 | * text.WordShape 10 | * text.WordpieceTokenizer 11 | * text.case_fold_utf8 12 | * text.coerce_to_structurally_valid_utf8 13 | * text.gather_with_default 14 | * text.greedy_constrained_sequence 15 | * text.ngrams 16 | * text.normalize_utf8 17 | * text.pad_along_dimension 18 | * text.sentence_fragments 19 | * text.sliding_window 20 | * text.span_alignment 21 | * text.span_overlaps 22 | * text.viterbi_constrained_sequence 23 | * text.wordshape 24 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/normalize_ops.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
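// Op registrations for the UTF-8 normalization ops (CaseFoldUTF8 and
// NormalizeUTF8). Only the op interfaces and shape functions are declared
// here; the kernel implementations live in core/kernels/normalize_kernels.cc.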
14 | 15 | #include "tensorflow/core/framework/common_shape_fns.h" 16 | #include "tensorflow/core/framework/op.h" 17 | #include "tensorflow/core/framework/shape_inference.h" 18 | 19 | namespace tensorflow { 20 | namespace text { 21 | 22 | REGISTER_OP("CaseFoldUTF8") 23 | .Input("input: string") 24 | .Output("output: string") 25 | .SetShapeFn(::tensorflow::shape_inference::UnchangedShape) 26 | .Doc(R"doc( 27 | Applies case folding to every UTF8 string in input_tensor. The input is a dense 28 | tensor of any shape and the output has the same shape as the input. 29 | 30 | For example if: 31 | 32 | input = [ 'The Quick-Brown', 33 | 'CAT jumped over', 34 | 'the lazy dog !! '] 35 | 36 | output = [ 'The quick-brown', 37 | 'cat jumped over', 38 | 'the lazy dog !! '] 39 | )doc"); 40 | 41 | REGISTER_OP("NormalizeUTF8") 42 | .Input("input: string") 43 | .Attr("normalization_form: string") 44 | .Output("output: string") 45 | .SetShapeFn(::tensorflow::shape_inference::UnchangedShape) 46 | .Doc(R"doc( 47 | Normalizes each UTF8 string in the input tensor using 'normalization_form' 48 | rules. 49 | 50 | See http://unicode.org/reports/tr15/ 51 | )doc"); 52 | 53 | } // namespace text 54 | } // namespace tensorflow 55 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/sentence_breaking_ops.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
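// Registers the SentenceFragments op. Its shape function only records that
// every output is a rank-1 vector of unknown length; the fragment contents
// are computed at kernel execution time.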
14 | 15 | #include "tensorflow/core/framework/common_shape_fns.h" 16 | #include "tensorflow/core/framework/op.h" 17 | #include "tensorflow/core/framework/shape_inference.h" 18 | #include "tensorflow/core/lib/core/status.h" 19 | 20 | namespace tensorflow { 21 | namespace text { 22 | 23 | Status SentenceFragmentShapeFn( 24 | ::tensorflow::shape_inference::InferenceContext* c) { 25 | for (int i = 0; i < c->num_outputs(); ++i) { 26 | c->set_output(i, c->UnknownShapeOfRank(1)); 27 | } 28 | 29 | return Status::OK(); 30 | } 31 | 32 | REGISTER_OP("SentenceFragments") 33 | .Attr("input_encoding: string") 34 | .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'") 35 | .Attr("replacement_char: int = 65533") // 0xFFFD unicode replacement char 36 | .Attr("replace_control_characters: bool = false") 37 | .Input("row_lengths: int64") 38 | .Input("token_start: int64") 39 | .Input("token_end: int64") 40 | .Input("token_word: string") 41 | .Input("token_properties: int64") 42 | .Output("fragment_start: int64") 43 | .Output("fragment_end: int64") 44 | .Output("fragment_properties: int64") 45 | .Output("terminal_punc_token: int64") 46 | .Output("output_row_lengths: int64") 47 | .SetShapeFn(SentenceFragmentShapeFn); 48 | 49 | } // namespace text 50 | } // namespace tensorflow 51 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/coerce_to_valid_utf8_op_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # -*- coding: utf-8 -*- 17 | """Tests for Utf8Chars Op from string_ops.""" 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from tensorflow.python.platform import test 24 | from tensorflow_text.python.ops import string_ops 25 | 26 | 27 | class CoerceToUtf8Test(test.TestCase): 28 | 29 | def testCoercetoStructurallyValidOnValidInput(self): 30 | with self.test_session(): 31 | utf8 = string_ops.coerce_to_structurally_valid_utf8(["abc"]) 32 | self.assertAllEqual(utf8, ["abc"]) 33 | 34 | def testCoercetoStructurallyValidOnValidInputWithDefault(self): 35 | with self.test_session(): 36 | utf8 = string_ops.coerce_to_structurally_valid_utf8(["abc"], "?") 37 | self.assertAllEqual(utf8, ["abc"]) 38 | 39 | def testCoercetoStructurallyValidOnInvalidInput(self): 40 | with self.test_session(): 41 | utf8 = string_ops.coerce_to_structurally_valid_utf8([b"abc\xfd"]) 42 | self.assertAllEqual(utf8, ["abc�"]) 43 | 44 | def testCoercetoStructurallyValidOnInvalidInputWithDefault(self): 45 | with self.test_session(): 46 | utf8 = string_ops.coerce_to_structurally_valid_utf8([b"abc\xfd"], "?") 47 | self.assertAllEqual(utf8, ["abc?"]) 48 | 49 | 50 | if __name__ == "__main__": 51 | test.main() 52 | -------------------------------------------------------------------------------- /docs/api_docs/python/_toc.yaml: -------------------------------------------------------------------------------- 1 | # Automatically generated file; please do not edit 2 | toc: 3 | - title: text 4 | section: 5 | - title: Overview 6 | path: /text/api_docs/python/text 7 | - title: case_fold_utf8 8 | path: /text/api_docs/python/text/case_fold_utf8 9 | - title: coerce_to_structurally_valid_utf8 10 | path: /text/api_docs/python/text/coerce_to_structurally_valid_utf8 11 | - title: gather_with_default 12 | path: /text/api_docs/python/text/gather_with_default 13 | - title: greedy_constrained_sequence 14 | path: /text/api_docs/python/text/greedy_constrained_sequence 15 | - title: ngrams 16 | path: /text/api_docs/python/text/ngrams 17 | - title: normalize_utf8 18 | path: /text/api_docs/python/text/normalize_utf8 19 | - title: pad_along_dimension 20 | path: /text/api_docs/python/text/pad_along_dimension 21 | - title: Reduction 22 | path: /text/api_docs/python/text/Reduction 23 | - title: sentence_fragments 24 | path: /text/api_docs/python/text/sentence_fragments 25 | - title: sliding_window 26 | path: /text/api_docs/python/text/sliding_window 27 | - title: span_alignment 28 | path: /text/api_docs/python/text/span_alignment 29 | - title: span_overlaps 30 | path: /text/api_docs/python/text/span_overlaps 31 | - title: Tokenizer 32 | path: /text/api_docs/python/text/Tokenizer 33 | - title: TokenizerWithOffsets 34 | path: /text/api_docs/python/text/TokenizerWithOffsets 35 | - title: UnicodeScriptTokenizer 36 | path: /text/api_docs/python/text/UnicodeScriptTokenizer 37 | - title: viterbi_constrained_sequence 38 | path: /text/api_docs/python/text/viterbi_constrained_sequence 39 | - title: WhitespaceTokenizer 40 | path: /text/api_docs/python/text/WhitespaceTokenizer 41 | - title: WordpieceTokenizer 42 | path: /text/api_docs/python/text/WordpieceTokenizer 43 | - title: WordShape 44 | path: /text/api_docs/python/text/WordShape 45 | - title: wordshape 46 | path: /text/api_docs/python/text/wordshape 47 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/wordpiece_tokenizer.h: 
-------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ 16 | #define TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ 17 | 18 | #include 19 | #include "tensorflow/core/framework/lookup_interface.h" 20 | #include "tensorflow/core/lib/core/status.h" 21 | 22 | namespace tensorflow { 23 | namespace text { 24 | 25 | class WordpieceVocab { 26 | public: 27 | virtual ~WordpieceVocab() {} 28 | virtual Status Contains(const string& key, bool* value) = 0; 29 | }; 30 | 31 | class LookupTableVocab : public WordpieceVocab { 32 | public: 33 | LookupTableVocab(lookup::LookupInterface* table, OpKernelContext* ctx); 34 | 35 | virtual Status Contains(const string& key, bool* value); 36 | 37 | private: 38 | // not owned 39 | lookup::LookupInterface* table_; 40 | OpKernelContext* ctx_; 41 | Tensor default_value_; 42 | }; 43 | 44 | Status WordpieceTokenize(const string& token, const int64 max_bytes_per_token, 45 | const string& suffix_indicator, bool use_unknown_token, 46 | const string& unknown_token, 47 | LookupTableVocab* vocab_map, 48 | std::vector* subwords, 49 | std::vector* begin_offset, 50 | std::vector* end_offset, int* num_word_pieces); 51 | 52 | } // namespace text 53 | } // namespace tensorflow 54 | 55 | #endif // TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_ 56 | -------------------------------------------------------------------------------- /third_party/icu/BUILD.bazel: -------------------------------------------------------------------------------- 1 | package( 2 | default_visibility = ["//visibility:public"], 3 | ) 4 | 5 | licenses(["notice"]) # Apache 2.0 6 | 7 | exports_files([ 8 | "icu4c/LICENSE", 9 | "icu4j/main/shared/licenses/LICENSE", 10 | ]) 11 | 12 | cc_library( 13 | name = "headers", 14 | hdrs = glob(["icu4c/source/common/unicode/*.h"]), 15 | includes = [ 16 | "icu4c/source/common", 17 | ], 18 | deps = [ 19 | ], 20 | ) 21 | 22 | cc_library( 23 | name = "common", 24 | hdrs = glob(["icu4c/source/common/unicode/*.h"]), 25 | includes = [ 26 | "icu4c/source/common", 27 | ], 28 | deps = [ 29 | ":icuuc", 30 | ], 31 | ) 32 | 33 | cc_library( 34 | name = "icuuc", 35 | srcs = glob( 36 | [ 37 | "icu4c/source/common/*.c", 38 | "icu4c/source/common/*.cpp", 39 | "icu4c/source/stubdata/*.cpp", 40 | ], 41 | ), 42 | hdrs = glob([ 43 | "icu4c/source/common/*.h", 44 | ]), 45 | copts = [ 46 | "-DU_COMMON_IMPLEMENTATION", 47 | "-DU_HAVE_STD_ATOMICS", 48 | ] + select({ 49 | ":android": [ 50 | "-fdata-sections", 51 | "-DGOOGLE_VENDOR_SRC_BRANCH", 52 | "-DU_HAVE_NL_LANGINFO_CODESET=0", 53 | "-Wno-deprecated-declarations", 54 | ], 55 | ":apple": [ 56 | "-DGOOGLE_VENDOR_SRC_BRANCH", 57 | "-Wno-shorten-64-to-32", 58 | "-Wno-unused-variable", 59 | ], 60 | ":windows": [ 61 | "/utf-8", 62 | "/DLOCALE_ALLOW_NEUTRAL_NAMES=0", 63 | ], 64 | "//conditions:default": [], 65 | }), 66 | 
tags = ["requires-rtti"], 67 | visibility = [ 68 | "//visibility:private", 69 | ], 70 | deps = [ 71 | ":headers", 72 | ], 73 | ) 74 | 75 | config_setting( 76 | name = "android", 77 | values = {"crosstool_top": "//external:android/crosstool"}, 78 | ) 79 | 80 | config_setting( 81 | name = "apple", 82 | values = {"cpu": "darwin"}, 83 | ) 84 | 85 | config_setting( 86 | name = "windows", 87 | values = {"cpu": "x64_windows"}, 88 | ) 89 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/whitespace_tokenize_op.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "tensorflow/core/framework/common_shape_fns.h" 19 | #include "tensorflow/core/framework/op.h" 20 | #include "tensorflow/core/framework/shape_inference.h" 21 | 22 | namespace tensorflow { 23 | 24 | namespace shape_inference { 25 | class InferenceContext; 26 | } // namespace shape_inference 27 | 28 | namespace text { 29 | 30 | using shape_inference::InferenceContext; 31 | 32 | REGISTER_OP("WhitespaceTokenizeWithOffsets") 33 | .Input("input_values: int32") 34 | .Input("input_splits: Tsplits") 35 | .Output("output_values: int32") 36 | .Output("output_values_inner_splits: Tsplits") 37 | .Output("output_offset_starts: int64") 38 | .Output("output_offset_limits: int64") 39 | .Output("output_outer_splits: Tsplits") 40 | .Attr("Tsplits: {int32, int64} = DT_INT64") 41 | .SetShapeFn([](InferenceContext* c) { 42 | shape_inference::ShapeHandle unused; 43 | TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); 44 | TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused)); 45 | 46 | c->set_output(0, c->Vector(InferenceContext::kUnknownDim)); 47 | c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); 48 | c->set_output(2, c->Vector(InferenceContext::kUnknownDim)); 49 | c->set_output(3, c->Vector(InferenceContext::kUnknownDim)); 50 | c->set_output(4, c->Vector(InferenceContext::kUnknownDim)); 51 | return Status::OK(); 52 | }); 53 | 54 | } // namespace text 55 | } // namespace tensorflow 56 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/unicode_script_tokenize_op.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "tensorflow/core/framework/common_shape_fns.h" 19 | #include "tensorflow/core/framework/op.h" 20 | #include "tensorflow/core/framework/shape_inference.h" 21 | 22 | namespace tensorflow { 23 | 24 | namespace shape_inference { 25 | class InferenceContext; 26 | } // namespace shape_inference 27 | 28 | namespace text { 29 | 30 | using shape_inference::InferenceContext; 31 | 32 | REGISTER_OP("UnicodeScriptTokenizeWithOffsets") 33 | .Input("input_values: int32") 34 | .Input("input_splits: Tsplits") 35 | .Output("output_values: int32") 36 | .Output("output_values_inner_splits: Tsplits") 37 | .Output("output_offset_starts: int64") 38 | .Output("output_offset_limits: int64") 39 | .Output("output_outer_splits: Tsplits") 40 | .Attr("Tsplits: {int32, int64} = DT_INT64") 41 | .Attr("keep_whitespace: bool = false") 42 | .SetShapeFn([](InferenceContext* c) { 43 | shape_inference::ShapeHandle unused; 44 | TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused)); 45 | TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused)); 46 | 47 | c->set_output(0, c->Vector(InferenceContext::kUnknownDim)); 48 | c->set_output(1, c->Vector(InferenceContext::kUnknownDim)); 49 | c->set_output(2, c->Vector(InferenceContext::kUnknownDim)); 50 | c->set_output(3, c->Vector(InferenceContext::kUnknownDim)); 51 | c->set_output(4, c->Vector(InferenceContext::kUnknownDim)); 52 | return Status::OK(); 53 | }); 54 | 55 | } // namespace text 56 | } // namespace tensorflow 57 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/ngrams.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.ngrams 7 | 8 | Create a tensor of n-grams based on the input data `data`. 9 | 10 | ``` python 11 | text.ngrams( 12 | data, 13 | width, 14 | axis=-1, 15 | reduction_type=None, 16 | string_separator=' ', 17 | name=None 18 | ) 19 | ``` 20 | 21 | Defined in 22 | [`python/ops/ngrams_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/ngrams_op.py). 23 | 24 | 25 | 26 | Creates a tensor of n-grams based on `data`. The n-grams are of width `width` 27 | and are created along axis `axis`; the n-grams are created by combining 28 | windows of `width` adjacent elements from `data` using `reduction_type`. This 29 | op is intended to cover basic use cases; more complex combinations can be 30 | created using the sliding_window op. 31 | 32 | #### Args: 33 | 34 | * `data`: The data to reduce. 35 | * `width`: The width of the ngram window. If there is not sufficient 36 | data to fill out the ngram window, the resulting ngram will be empty. 37 | * `axis`: The axis to create ngrams along. Note that for string join 38 | reductions, only axis '-1' is supported; for other reductions, any positive 39 | or negative axis can be used. Should be a constant. 40 | * `reduction_type`: A member of the Reduction enum. Should be a 41 | constant. Currently supports: 42 | 43 | * `Reduction.SUM`: Add values in the window. 44 | * `Reduction.MEAN`: Average values in the window. 45 | * `Reduction.STRING_JOIN`: Join strings in the window. Note that axis must 46 | be -1 here. 47 | 48 | * `string_separator`: The separator string used for 49 | `Reduction.STRING_JOIN`. Ignored otherwise. Must be a string constant, not a 50 | Tensor. 51 | 52 | * `name`: The op name. 53 | 54 | #### Returns: 55 | 56 | A tensor of ngrams. 57 | 58 | #### Raises: 59 | 60 | * `InvalidArgumentError`: if `reduction_type` is either None or not a 61 | Reduction, or if `reduction_type` is STRING_JOIN and `axis` is not -1. 62 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Various tensorflow ops related to text-processing.""" 17 | 18 | from tensorflow_text.python.ops.create_feature_bitmask_op import create_feature_bitmask 19 | from tensorflow_text.python.ops.greedy_constrained_sequence_op import greedy_constrained_sequence 20 | from tensorflow_text.python.ops.ngrams_op import ngrams 21 | from tensorflow_text.python.ops.ngrams_op import Reduction 22 | from tensorflow_text.python.ops.normalize_ops import case_fold_utf8 23 | from tensorflow_text.python.ops.normalize_ops import normalize_utf8 24 | from tensorflow_text.python.ops.pad_along_dimension_op import pad_along_dimension 25 | from tensorflow_text.python.ops.pointer_ops import gather_with_default 26 | from tensorflow_text.python.ops.pointer_ops import span_alignment 27 | from tensorflow_text.python.ops.pointer_ops import span_overlaps 28 | from tensorflow_text.python.ops.sentence_breaking_ops import sentence_fragments 29 | from tensorflow_text.python.ops.sliding_window_op import sliding_window 30 | from tensorflow_text.python.ops.string_ops import coerce_to_structurally_valid_utf8 31 | from tensorflow_text.python.ops.tokenization import Tokenizer 32 | from tensorflow_text.python.ops.tokenization import TokenizerWithOffsets 33 | from tensorflow_text.python.ops.unicode_script_tokenizer import UnicodeScriptTokenizer 34 | from tensorflow_text.python.ops.viterbi_constrained_sequence_op import viterbi_constrained_sequence 35 | from tensorflow_text.python.ops.whitespace_tokenizer import WhitespaceTokenizer 36 | from tensorflow_text.python.ops.wordpiece_tokenizer import WordpieceTokenizer 37 | from tensorflow_text.python.ops.wordshape_ops import WordShape 38 | from tensorflow_text.python.ops.wordshape_ops import wordshape 39 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/pad_along_dimension.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.pad_along_dimension 7 | 8 | Add padding to the beginning and end of data in a specific dimension. 9 | 10 | ``` python 11 | text.pad_along_dimension( 12 | data, 13 | axis=-1, 14 | left_pad=None, 15 | right_pad=None, 16 | name=None 17 | ) 18 | ``` 19 | 20 | Defined in 21 | [`python/ops/pad_along_dimension_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pad_along_dimension_op.py). 22 | 23 | 24 | 25 | Returns a tensor constructed from `data`, where each row in dimension `axis` 26 | is replaced by the concatenation of the left padding followed by the row 27 | followed by the right padding. I.e., if `L=left_pad.shape[0]` and 28 | `R=right_pad.shape[0]`, then: 29 | 30 | ```python 31 | result[i1...iaxis, 0:L] = left_pad 32 | result[i1...iaxis, L:-R] = data[i0...iaxis] 33 | result[i1...iaxis, -R:] = right_pad 34 | ``` 35 | 36 | #### Args: 37 | 38 | * `data`: `[O1...ON, A, I1...IM]` A potentially ragged `K` 39 | dimensional tensor with outer dimensions of size `O1...ON`; axis dimension 40 | of size `A`; and inner dimensions of size `I1...IM`. I.e. `K = N + 1 + M`, 41 | where `N>=0` and `M>=0`. 42 | * `axis`: An integer constant specifying the axis along which padding 43 | is added. Negative axis values from `-K` to `-1` are supported. 44 | * `left_pad`: `[L, I1...IM]` An `M+1` dimensional tensor that 45 | should be prepended to each row along dimension `axis`; or `None` if no 46 | padding should be added to the left side. 47 | * `right_pad`: `[R, I1...IM]` An `M+1` dimensional tensor that 48 | should be appended to each row along dimension `axis`; or `None` if no 49 | padding should be added to the right side. 50 | * `name`: The name of this op. 51 | 52 | #### Returns: 53 | 54 | `[O1...ON, L + A + R, I1...IM]` A potentially ragged `K` dimensional 55 | tensor with outer dimensions of size `O1...ON`; padded axis dimension size 56 | `L+A+R`; and inner dimensions of size `I1...IM`. If `data` is a `RaggedTensor`, 57 | then the returned tensor is a `RaggedTensor` with the same `ragged_rank`. 58 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Base classes (abstract class) for all tokenizers.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import abc 23 | 24 | 25 | # TODO(broken): Have this extend Module when it becomes public 26 | class Tokenizer(): 27 | """Base class (abstract class) for all tokenizers.""" 28 | 29 | __metaclass__ = abc.ABCMeta 30 | 31 | @abc.abstractmethod 32 | def tokenize(self, input): # pylint: disable=redefined-builtin 33 | """Abstract function for tokenization. 
34 | 35 | Args: 36 | input: An N-dimensional UTF-8 string (or optionally integer) Tensor or 37 | RaggedTensor. 38 | 39 | Returns: 40 | An N+1-dimensional UTF-8 string or integer Tensor or RaggedTensor. 41 | """ 42 | pass 43 | 44 | 45 | class TokenizerWithOffsets(Tokenizer): 46 | """Base class (abstract class) for all tokenizers that return offsets.""" 47 | 48 | @abc.abstractmethod 49 | def tokenize_with_offsets(self, input): # pylint: disable=redefined-builtin 50 | """Abstract function for tokenization with offsets. 51 | 52 | Args: 53 | input: An N-dimensional UTF-8 string (or optionally integer) Tensor or 54 | RaggedTensor. 55 | 56 | Returns: 57 | A tuple (tokens, start_offsets, limit_offsets): 58 | * tokens is an N+1-dimensional UTF-8 string or integer Tensor or 59 | RaggedTensor. 60 | * start_offsets is an N+1-dimensional integer Tensor containing the 61 | starting indices of each token (byte indices for input strings). 62 | * limit_offsets is an N+1-dimensional integer Tensor containing the 63 | exclusive ending indices of each token (byte indices for input 64 | strings). 65 | """ 66 | pass 67 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- 1 | workspace(name = "org_tensorflow_text") 2 | 3 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") 4 | 5 | http_archive( 6 | name = "bazel_skylib", 7 | sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e", 8 | urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.8.0/bazel-skylib.0.8.0.tar.gz"], 9 | ) 10 | 11 | http_archive( 12 | name = "com_google_absl", 13 | sha256 = "0322e3a15fd119fcc8b03033e7011bb1beb7d6c4111f9e57272b7be78d56045a", 14 | strip_prefix = "abseil-cpp-2f76a9bf50046e396138cc8eeb3cdc17b7a5ac24", 15 | urls = [ 16 | "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/2f76a9bf50046e396138cc8eeb3cdc17b7a5ac24.tar.gz", 17 | "https://github.com/abseil/abseil-cpp/archive/2f76a9bf50046e396138cc8eeb3cdc17b7a5ac24.tar.gz", 18 | ], 19 | ) 20 | 21 | http_archive( 22 | name = "com_google_googletest", 23 | sha256 = "ff7a82736e158c077e76188232eac77913a15dac0b22508c390ab3f88e6d6d86", 24 | strip_prefix = "googletest-b6cd405286ed8635ece71c72f118e659f4ade3fb", 25 | urls = [ 26 | "http://mirror.tensorflow.org/github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip", 27 | "https://github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip", 28 | ], 29 | ) 30 | 31 | http_archive( 32 | name = "io_bazel_rules_closure", 33 | sha256 = "e0a111000aeed2051f29fcc7a3f83be3ad8c6c93c186e64beb1ad313f0c7f9f9", 34 | strip_prefix = "rules_closure-cf1e44edb908e9616030cc83d085989b8e6cd6df", 35 | urls = [ 36 | "http://mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/cf1e44edb908e9616030cc83d085989b8e6cd6df.tar.gz", 37 | "https://github.com/bazelbuild/rules_closure/archive/cf1e44edb908e9616030cc83d085989b8e6cd6df.tar.gz", # 2019-04-04 38 | ], 39 | 40 | ) 41 | 42 | http_archive( 43 | name = "org_tensorflow", 44 | strip_prefix = "tensorflow-2.0.0-beta0", 45 | sha256 = "9dd3b78fce445a8d01791aadda3cbb686b732d4df2d4f6563054f7d7a725fa68", 46 | urls = [ 47 | "https://github.com/tensorflow/tensorflow/archive/v2.0.0-beta0.zip" 48 | ], 49 | ) 50 | 51 | load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace") 52 | 53 | tf_workspace(tf_repo_name="@org_tensorflow") 54 | 55 | 
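# Generate the @local_config_tf repository from the locally installed
# TensorFlow package, using the TF_HEADER_DIR / TF_SHARED_LIBRARY_DIR values
# that oss_scripts/configure.sh writes into .bazelrc (the generated targets
# are templated from third_party/tensorflow/BUILD.tpl).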
load("//third_party/tensorflow:tf_configure.bzl", "tf_configure") 56 | 57 | tf_configure(name = "local_config_tf") 58 | 59 | load("//tensorflow_text:workspace.bzl", "initialize_third_party_archives") 60 | 61 | initialize_third_party_archives() 62 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/string_ops.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tensorflow operations for UTF8 strings.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import sys 23 | 24 | from tensorflow.python.ops import string_ops 25 | 26 | 27 | def _unichr(codepoint): 28 | if sys.version_info[0] == 2: 29 | return unichr(codepoint) 30 | else: 31 | return chr(codepoint) 32 | 33 | 34 | # pylint: disable=redefined-builtin 35 | def coerce_to_structurally_valid_utf8(input, 36 | replacement_char=_unichr(65533), 37 | name=None): 38 | """Coerce UTF-8 input strings to structurally valid UTF-8. 39 | 40 | Any bytes which cause the input string to be invalid UTF-8 are substituted 41 | with the provided replacement character codepoint (default 65533). Use a 42 | single byte replacement character codepoint to preserve alignment to the 43 | source input string. 44 | 45 | Args: 46 | input: UTF-8 string tensor to coerce to valid UTF-8. 47 | replacement_char: The replacement character to be used in place of any 48 | invalid byte in the input. Any valid Unicode character may be used. The 49 | default value is the default Unicode replacement character which is 50 | 0xFFFD (or U+65533). Note that passing a replacement character 51 | expressible in 1 byte, such as ' ' or '?', will preserve string 52 | alignment to the source since individual invalid bytes will be replaced 53 | with a 1-byte replacement. (optional) 54 | name: A name for the operation (optional). 55 | 56 | Returns: 57 | A tensor of type string with the same shape as the input. 58 | """ 59 | return string_ops.unicode_transcode( 60 | input, 61 | input_encoding='UTF-8', 62 | output_encoding='UTF-8', 63 | errors='replace', 64 | replacement_char=ord(replacement_char), 65 | name=name) 66 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/wordpiece_op.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "tensorflow/core/framework/op.h" 16 | #include "tensorflow/core/framework/shape_inference.h" 17 | 18 | namespace tensorflow { 19 | 20 | REGISTER_OP("WordpieceTokenizeWithOffsets") 21 | .Input("input_values: string") 22 | .Input("vocab_lookup_table: resource") 23 | .Attr("suffix_indicator: string") 24 | .Attr("max_bytes_per_word: int") 25 | .Attr("use_unknown_token: bool") 26 | .Attr("unknown_token: string") 27 | .Output("output_values: string") 28 | .Output("output_row_lengths: int64") 29 | .Output("start_values: int64") 30 | .Output("limit_values: int64") 31 | .Doc(R"doc( 32 | Tokenizes tokens into sub-word pieces based off of a vocabulary. 33 | 34 | `wordpiece_tokenize_with_offsets` returns the relative offsets. 35 | 36 | ### Example: 37 | tokens = ['don', '\'t', 'treadness'] 38 | wordpiece, start, end = wordpiece_tokenize_with_offset(tokens) 39 | wordpiece = [['don', '\'', 't'], ['tread', '##ness']] 40 | start = [[[0, 3, 4], [0, 5]]] 41 | end = [[[3, 4, 5], [5, 10]]] 42 | Args: 43 | tokens: [num_batch, (num_tokens)] a `RaggedTensor` of UTF-8 token 44 | strings 45 | vocab_lookup_table: A lookup table implementing the LookupInterface 46 | word_split_char: Character used to define prefixes in the vocab. 47 | return_ids: A bool indicating whether the op returns int64 ids or tokenized 48 | subword strings. 49 | 50 | Returns: 51 | A tuple of `RaggedTensor`s `subword`, `subword_offset_starts`, 52 | `subword_offset_limit` where: 53 | 54 | `subword`: [num_batch, (num_tokens), (num_subword_pieces)] is the 55 | wordpiece token string encoded in UTF-8. 56 | `subword_offset_starts`: [num_batch, (num_tokens), 57 | (num_subword_pieces)] is the word piece token's starting byte offset. 58 | `subword_offset_limit`: [num_batch, (num_tokens), 59 | (num_subword_pieces)] is the word piece token's ending byte offset. 60 | )doc"); 61 | 62 | } // namespace tensorflow 63 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/whitespace_tokenize_kernel_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
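// The test case below feeds two strings encoded as Unicode codepoints --
// "op t" (111, 112, 32, 116) and "uv" (117, 118), delimited by row splits
// {0, 4, 6} -- and checks the flattened tokens "op", "t", "uv" along with
// their inner/outer row splits and start/limit offsets.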
14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include "tensorflow/core/framework/fake_input.h" 20 | #include "tensorflow/core/framework/node_def_builder.h" 21 | #include "tensorflow/core/framework/tensor.h" 22 | #include "tensorflow/core/framework/tensor_shape.h" 23 | #include "tensorflow/core/kernels/ops_testutil.h" 24 | #include "tensorflow/core/lib/core/status.h" 25 | #include "tensorflow/core/lib/core/status_test_util.h" 26 | #include "tensorflow_text/core/kernels/text_kernels_test_util.h" 27 | 28 | namespace tensorflow { 29 | namespace text { 30 | 31 | using tensorflow::FakeInput; 32 | using tensorflow::NodeDefBuilder; 33 | using tensorflow::Status; 34 | using tensorflow::TensorShape; 35 | using tensorflow::text_kernels_test_util::VectorEq; 36 | 37 | class WhitespaceTokenizeWithOffsetsKernelTest 38 | : public tensorflow::OpsTestBase { 39 | public: 40 | void MakeOp() { 41 | TF_ASSERT_OK(NodeDefBuilder("tested_op", "WhitespaceTokenizeWithOffsets") 42 | .Input(FakeInput()) 43 | .Input(FakeInput()) 44 | .Finalize(node_def())); 45 | TF_ASSERT_OK(InitOp()); 46 | } 47 | }; 48 | 49 | TEST_F(WhitespaceTokenizeWithOffsetsKernelTest, Test) { 50 | MakeOp(); 51 | AddInputFromArray(TensorShape({6}), {111, 112, 32, 116, 117, 118}); 52 | AddInputFromArray(TensorShape({3}), {0, 4, 6}); 53 | TF_ASSERT_OK(RunOpKernel()); 54 | 55 | std::vector expected_values({111, 112, 116, 117, 118}); 56 | std::vector expected_values_inner_splits({0, 2, 3, 5}); 57 | std::vector expected_offset_starts({0, 3, 0}); 58 | std::vector expected_offset_limits({2, 4, 2}); 59 | std::vector output_outer_splits({0, 2, 3}); 60 | EXPECT_THAT(*GetOutput(0), VectorEq(expected_values)); 61 | EXPECT_THAT(*GetOutput(1), VectorEq(expected_values_inner_splits)); 62 | EXPECT_THAT(*GetOutput(2), VectorEq(expected_offset_starts)); 63 | EXPECT_THAT(*GetOutput(3), VectorEq(expected_offset_limits)); 64 | EXPECT_THAT(*GetOutput(4), VectorEq(output_outer_splits)); 65 | } 66 | 67 | } // namespace text 68 | } // namespace tensorflow 69 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/unicode_script_tokenize_kernel_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include "tensorflow/core/framework/fake_input.h" 20 | #include "tensorflow/core/framework/node_def_builder.h" 21 | #include "tensorflow/core/framework/tensor.h" 22 | #include "tensorflow/core/framework/tensor_shape.h" 23 | #include "tensorflow/core/kernels/ops_testutil.h" 24 | #include "tensorflow/core/lib/core/status.h" 25 | #include "tensorflow/core/lib/core/status_test_util.h" 26 | #include "tensorflow_text/core/kernels/text_kernels_test_util.h" 27 | 28 | namespace tensorflow { 29 | namespace text { 30 | 31 | using tensorflow::FakeInput; 32 | using tensorflow::NodeDefBuilder; 33 | using tensorflow::Status; 34 | using tensorflow::TensorShape; 35 | using tensorflow::text_kernels_test_util::VectorEq; 36 | 37 | class UnicodeScriptTokenizeWithOffsetsKernelTest 38 | : public tensorflow::OpsTestBase { 39 | public: 40 | void MakeOp() { 41 | TF_ASSERT_OK(NodeDefBuilder("tested_op", "UnicodeScriptTokenizeWithOffsets") 42 | .Input(FakeInput()) 43 | .Input(FakeInput()) 44 | .Finalize(node_def())); 45 | TF_ASSERT_OK(InitOp()); 46 | } 47 | }; 48 | 49 | TEST_F(UnicodeScriptTokenizeWithOffsetsKernelTest, Test) { 50 | MakeOp(); 51 | AddInputFromArray(TensorShape({6}), {111, 112, 32, 116, 117, 118}); 52 | AddInputFromArray(TensorShape({3}), {0, 4, 6}); 53 | TF_ASSERT_OK(RunOpKernel()); 54 | 55 | std::vector expected_values({111, 112, 116, 117, 118}); 56 | std::vector expected_values_inner_splits({0, 2, 3, 5}); 57 | std::vector expected_offset_starts({0, 3, 0}); 58 | std::vector expected_offset_limits({2, 4, 2}); 59 | std::vector output_outer_splits({0, 2, 3}); 60 | EXPECT_THAT(*GetOutput(0), VectorEq(expected_values)); 61 | EXPECT_THAT(*GetOutput(1), VectorEq(expected_values_inner_splits)); 62 | EXPECT_THAT(*GetOutput(2), VectorEq(expected_offset_starts)); 63 | EXPECT_THAT(*GetOutput(3), VectorEq(expected_offset_limits)); 64 | EXPECT_THAT(*GetOutput(4), VectorEq(output_outer_splits)); 65 | } 66 | 67 | } // namespace text 68 | } // namespace tensorflow 69 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/text_kernels_test_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "tensorflow_text/core/kernels/text_kernels_test_util.h" 16 | 17 | using ::testing::MakeMatcher; 18 | using ::testing::Matcher; 19 | using ::testing::MatchResultListener; 20 | 21 | namespace tensorflow { 22 | namespace text_kernels_test_util { 23 | 24 | bool TensorEqMatcher::MatchAndExplain( 25 | Tensor actual, ::testing::MatchResultListener* listener) const { 26 | string expect_values = expect_.SummarizeValue(expect_.NumElements()); 27 | string actual_values = actual.SummarizeValue(actual.NumElements()); 28 | if (expect_.dtype() != actual.dtype() || expect_.shape() != actual.shape() || 29 | expect_values != actual_values) { 30 | *listener << "\n dtype=" << DataTypeString(actual.dtype()); 31 | *listener << "\n shape=" << actual.shape().DebugString(); 32 | *listener << "\n values=" << actual_values; 33 | return false; 34 | } 35 | return true; 36 | } 37 | 38 | void TensorEqMatcher::DescribeTo(::std::ostream* gmock_os) const { 39 | *gmock_os << "dtype=" << DataTypeString(expect_.dtype()) 40 | << "\n shape=" << expect_.shape().DebugString() 41 | << "\n values=" 42 | << expect_.SummarizeValue(expect_.NumElements()); 43 | } 44 | 45 | void TensorEqMatcher::DescribeNegationTo(::std::ostream* gmock_os) const { 46 | *gmock_os << "is not equal to " << expect_.DebugString(); 47 | } 48 | 49 | bool TensorHasShapeMatcher::MatchAndExplain( 50 | Tensor actual, ::testing::MatchResultListener* listener) const { 51 | if (expect_ != actual.shape()) { 52 | *listener << "\n shape=" << actual.shape().DebugString(); 53 | return false; 54 | } 55 | return true; 56 | } 57 | 58 | void TensorHasShapeMatcher::DescribeTo(::std::ostream* gmock_os) const { 59 | *gmock_os << "shape=" << expect_.DebugString(); 60 | } 61 | 62 | void TensorHasShapeMatcher::DescribeNegationTo(::std::ostream* gmock_os) const { 63 | *gmock_os << "shape!=" << expect_.DebugString(); 64 | } 65 | 66 | Matcher TensorHasShape(const TensorShape& shape) { 67 | // MakeMatcher takes ownership of the TensorHasShapeMatcher. 68 | return MakeMatcher(new TensorHasShapeMatcher(shape)); 69 | } 70 | 71 | } // namespace text_kernels_test_util 72 | } // namespace tensorflow 73 | -------------------------------------------------------------------------------- /docs/api_docs/python/text.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # Module: text 7 | 8 | Various tensorflow ops related to text-processing. 9 | 10 | Defined in [`__init__.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/__init__.py). 11 | 12 | 13 | 14 | 15 | ## Classes 16 | 17 | [`class Reduction`](./text/Reduction.md): Type of reduction to be done by the ngram op. 18 | 19 | [`class Tokenizer`](./text/Tokenizer.md): Base class (abstract class) for all 20 | tokenizers. 21 | 22 | [`class TokenizerWithOffsets`](./text/TokenizerWithOffsets.md): Base class 23 | (abstract class) for all tokenizers that return offsets. 24 | 25 | [`class UnicodeScriptTokenizer`](./text/UnicodeScriptTokenizer.md): Tokenizes a 26 | tensor of UTF-8 strings on Unicode script boundaries. 27 | 28 | [`class WhitespaceTokenizer`](./text/WhitespaceTokenizer.md): Tokenizes a tensor 29 | of UTF-8 strings on whitespaces. 30 | 31 | [`class WordShape`](./text/WordShape.md): Values for the 'pattern' arg of the WordShape op. 32 | 33 | [`class WordpieceTokenizer`](./text/WordpieceTokenizer.md): Creates a wordpiece 34 | tokenizer. 35 | 36 | ## Functions 37 | 38 | [`case_fold_utf8(...)`](./text/case_fold_utf8.md): Applies case folding to every 39 | UTF8 string in the input. 40 | 41 | [`coerce_to_structurally_valid_utf8(...)`](./text/coerce_to_structurally_valid_utf8.md): Coerce UTF-8 input strings to structurally valid UTF-8. 42 | 43 | [`gather_with_default(...)`](./text/gather_with_default.md): Gather slices with `indices=-1` mapped to `default`. 44 | 45 | [`greedy_constrained_sequence(...)`](./text/greedy_constrained_sequence.md): Performs greedy constrained sequence on a batch of examples. 46 | 47 | [`ngrams(...)`](./text/ngrams.md): Create a tensor of n-grams based on the input data `data`. 48 | 49 | [`normalize_utf8(...)`](./text/normalize_utf8.md): Normalizes each UTF8 string in the input tensor using the specified rule. 50 | 51 | [`pad_along_dimension(...)`](./text/pad_along_dimension.md): Add padding to the beginning and end of data in a specific dimension. 52 | 53 | [`sentence_fragments(...)`](./text/sentence_fragments.md): Find the sentence fragments in a given text. 54 | 55 | [`sliding_window(...)`](./text/sliding_window.md): Builds a sliding window for `data` with a specified width. 56 | 57 | [`span_alignment(...)`](./text/span_alignment.md): Return an alignment from a set of source spans to a set of target spans. 58 | 59 | [`span_overlaps(...)`](./text/span_overlaps.md): Returns a boolean tensor indicating which source and target spans overlap. 60 | 61 | [`viterbi_constrained_sequence(...)`](./text/viterbi_constrained_sequence.md): Performs greedy constrained sequence on a batch of examples. 62 | 63 | [`wordshape(...)`](./text/wordshape.md): Determine wordshape features for each input string. 64 | 65 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/sentence_breaking_utils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ 16 | #define TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ 17 | 18 | #include 19 | #include "absl/strings/string_view.h" 20 | #include "icu4c/source/common/unicode/ucnv.h" 21 | #include "icu4c/source/common/unicode/ucnv_err.h" 22 | #include "icu4c/source/common/unicode/utypes.h" 23 | #include "tensorflow/core/lib/core/status.h" 24 | 25 | namespace tensorflow { 26 | namespace text { 27 | 28 | // A class of utils for identifying certain classes and properties of unicode 29 | // characters. 30 | class UnicodeUtil { 31 | public: 32 | // `converter` not owned. 33 | explicit UnicodeUtil(UConverter* converter) : converter_(converter) {} 34 | 35 | // Returns true iff a string is terminal punctuation. 36 | ::tensorflow::Status IsTerminalPunc(const absl::string_view& input, 37 | bool* result) const; 38 | 39 | // Returns true iff a string is close punctuation (close quote or close 40 | // paren). 41 | ::tensorflow::Status IsClosePunc(const absl::string_view& input, 42 | bool* result) const; 43 | 44 | // Returns true iff a string is an open paren. 45 | ::tensorflow::Status IsOpenParen(const absl::string_view& input, 46 | bool* result) const; 47 | 48 | // Returns true iff a string is a close paren. 49 | ::tensorflow::Status IsCloseParen(const absl::string_view& input, 50 | bool* result) const; 51 | 52 | // Returns true iff a word is made of punctuation characters only. 53 | ::tensorflow::Status IsPunctuationWord(const absl::string_view& input, 54 | bool* result) const; 55 | 56 | // Returns true iff a string is an ellipsis token ("..."). 57 | ::tensorflow::Status IsEllipsis(const absl::string_view& input, 58 | bool* result) const; 59 | 60 | private: 61 | ::tensorflow::Status GetOneUChar(const absl::string_view&, 62 | bool* has_more_than_one_char, 63 | UChar32* result) const; 64 | 65 | // not owned. mutable because UConverter contains some internal options and 66 | // buffer. 67 | mutable UConverter* converter_; 68 | }; 69 | 70 | } // namespace text 71 | } // namespace tensorflow 72 | 73 | #endif // TENSORFLOW_TEXT_CORE_KERNELS_SENTENCE_BREAKING_UTILS_H_ 74 | -------------------------------------------------------------------------------- /oss_scripts/pip_package/setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """TF.Text is a TensorFlow library of text related ops, modules, and subgraphs. 17 | 18 | TF.Text is a TensorFlow library of text related ops, modules, and subgraphs. The 19 | library can perform the preprocessing regularly required by text-based models, 20 | and includes other features useful for sequence modeling not provided by core 21 | TensorFlow. 
22 | 23 | See the README on GitHub for further documentation. 24 | http://github.com/tensorflow/text 25 | """ 26 | 27 | import os 28 | 29 | from setuptools import find_packages 30 | from setuptools import setup 31 | from setuptools.command.install import install 32 | from setuptools.dist import Distribution 33 | 34 | project_name = 'tensorflow-text' 35 | project_version = '1.0.0-beta0' 36 | 37 | 38 | class BinaryDistribution(Distribution): 39 | """This class is needed in order to create OS specific wheels.""" 40 | 41 | def is_pure(self): 42 | return False 43 | 44 | def has_ext_modules(self): 45 | return True 46 | 47 | 48 | class InstallPlatlib(install): 49 | """This is needed to mark the library as platlib compliant.""" 50 | 51 | def finalize_options(self): 52 | """For more info, see http://github.com/google/or-tools/issues/616 .""" 53 | install.finalize_options(self) 54 | self.install_lib = self.install_platlib 55 | self.install_libbase = self.install_lib 56 | self.install_lib = os.path.join(self.install_lib, self.extra_dirs) 57 | 58 | 59 | DOCLINES = __doc__.split('\n') 60 | 61 | setup( 62 | name=project_name, 63 | version=project_version.replace('-', ''), 64 | description=DOCLINES[0], 65 | long_description='\n'.join(DOCLINES[2:]), 66 | author='Google Inc.', 67 | author_email='packages@tensorflow.org', 68 | url='http://github.com/tensorflow/text', 69 | license='Apache 2.0', 70 | packages=find_packages(), 71 | include_package_data=True, 72 | zip_safe=False, 73 | cmdclass={'install': InstallPlatlib}, 74 | distclass=BinaryDistribution, 75 | install_requires=[ 76 | 'tensorflow==2.0.0b0', 77 | ], 78 | extras_require={ 79 | 'tests': [ 80 | 'absl-py', 81 | 'pytest', 82 | ], 83 | }, 84 | classifiers=[ 85 | 'Development Status :: 4 - Beta', 86 | 'Intended Audience :: Developers', 87 | 'Intended Audience :: Science/Research', 88 | 'License :: OSI Approved :: Apache Software License', 89 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 90 | ], 91 | keywords='tensorflow text machine learning', 92 | ) 93 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/normalize_ops.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | """Tensorflow lowercasing operation for UTF8 strings.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | from tensorflow.python.framework import dtypes 23 | from tensorflow.python.framework import ops 24 | from tensorflow.python.ops.ragged import ragged_tensor 25 | 26 | from tensorflow.python.framework import load_library 27 | from tensorflow.python.platform import resource_loader 28 | gen_normalize_ops = load_library.load_op_library(resource_loader.get_path_to_datafile('_normalize_ops.so')) 29 | 30 | 31 | # pylint: disable=redefined-builtin 32 | def case_fold_utf8(input, name=None): 33 | """Applies case folding to every UTF8 string in the input. 34 | 35 | The input is a `Tensor` or `RaggedTensor` of any shape, and the resulting 36 | output has the same shape as the input. Note that NFKC normalization is 37 | implicitly applied to the strings. 38 | 39 | For example: 40 | 41 | ```python 42 | >>> case_fold_utf8(['The Quick-Brown', 43 | ... 'CAT jumped over', 44 | ... 'the lazy dog !! '] 45 | tf.Tensor(['The quick-brown' 'cat jumped over' 'the lazy dog !! '], 46 | shape=(3,), dtype=string) 47 | ``` 48 | 49 | Args: 50 | input: A `Tensor` or `RaggedTensor` of type string. (Must be UTF-8.) 51 | name: The name for this op (optional) 52 | 53 | Returns: 54 | A `Tensor` or `RaggedTensor` of type string, with case-folded contents. 55 | """ 56 | with ops.name_scope(name, "CaseFoldUTF8", [input]): 57 | input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor( 58 | input, dtype=dtypes.string) 59 | if ragged_tensor.is_ragged(input_tensor): 60 | result = gen_normalize_ops.case_fold_utf8(input_tensor.flat_values) 61 | return input_tensor.with_flat_values(result) 62 | else: 63 | return gen_normalize_ops.case_fold_utf8(input_tensor) 64 | 65 | 66 | # pylint: disable=redefined-builtin) 67 | def normalize_utf8(input, normalization_form="NFKC", name=None): 68 | """Normalizes each UTF8 string in the input tensor using the specified rule. 69 | 70 | See http://unicode.org/reports/tr15/ 71 | 72 | Args: 73 | input: A `Tensor` or `RaggedTensor` of type string. (Must be UTF-8.) 74 | normalization_form: One of the following string values ('NFC', 'NFKC', 75 | 'NFD', 'NFKD'). Default is 'NFKC'. 76 | name: The name for this op (optional) 77 | 78 | Returns: 79 | A `Tensor` or `RaggedTensor` of type string, with normalized contents. 80 | """ 81 | with ops.name_scope(name, "NormalizeUTF8", [input]): 82 | input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor( 83 | input, dtype=dtypes.string) 84 | if ragged_tensor.is_ragged(input_tensor): 85 | result = gen_normalize_ops.normalize_utf8(input_tensor.flat_values, 86 | normalization_form) 87 | return input_tensor.with_flat_values(result) 88 | else: 89 | return gen_normalize_ops.normalize_utf8(input_tensor, normalization_form) 90 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/sentence_fragments.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.sentence_fragments 7 | 8 | Find the sentence fragments in a given text. 9 | 10 | ``` python 11 | text.sentence_fragments( 12 | token_word, 13 | token_starts, 14 | token_ends, 15 | token_properties, 16 | input_encoding='UTF-8', 17 | errors='replace', 18 | replacement_char=65533, 19 | replace_control_characters=False 20 | ) 21 | ``` 22 | 23 | Defined in 24 | [`python/ops/sentence_breaking_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/sentence_breaking_ops.py). 25 | 26 | 27 | 28 | A sentence fragment is a potential next sentence determined using 29 | deterministic heuristics based on punctuation, capitalization, and similar 30 | text attributes. 31 | 32 | #### Args: 33 | 34 | * `token_word`: A Tensor (w/ rank=2) or a RaggedTensor (w/ 35 | ragged_rank=1) containing the token strings. 36 | * `token_starts`: A Tensor (w/ rank=2) or a RaggedTensor (w/ 37 | ragged_rank=1) containing offsets where the token starts. 38 | * `token_ends`: A Tensor (w/ rank=2) or a RaggedTensor (w/ 39 | ragged_rank=1) containing offsets where the token ends. 40 | * `token_properties`: A Tensor (w/ rank=2) or a RaggedTensor (w/ 41 | ragged_rank=1) containing a bitmask. 42 | 43 | The values of the bitmask are: 0x01 (ILL_FORMED) - Text is ill-formed 44 | according to TextExtractor; typically applies to all tokens of a paragraph 45 | that is too short or lacks terminal punctuation. 0x40 (TITLE) 0x02 (HEADING) 46 | 0x04 (BOLD) 0x10 (UNDERLINED) 0x20 (LIST) 0x80 (EMOTICON) 0x100 (ACRONYM) - 47 | Token was identified by Lexer as an acronym. Lexer identifies period-, 48 | hyphen-, and space-separated acronyms: "U.S.", "U-S", and "U S". Lexer 49 | normalizes all three to "US", but the token word field normalizes only 50 | space-separated acronyms. 0x200 (HYPERLINK) - Indicates that the token (or 51 | part of the token) is a covered by at least one hyperlink. More information 52 | of the hyperlink is stored in the first token covered by the hyperlink. 53 | 54 | * `input_encoding`: String name for the unicode encoding that should be 55 | used to decode each string. 56 | 57 | * `errors`: Specifies the response when an input string can't be 58 | converted using the indicated encoding. One of: 59 | 60 | * `'strict'`: Raise an exception for any illegal substrings. 61 | * `'replace'`: Replace illegal substrings with `replacement_char`. 62 | * `'ignore'`: Skip illegal substrings. 63 | 64 | * `replacement_char`: The replacement codepoint to be used in place of 65 | invalid substrings in `input` when `errors='replace'`; and in place of C0 66 | control characters in `input` when `replace_control_characters=True`. 67 | 68 | * `replace_control_characters`: Whether to replace the C0 control 69 | characters `(U+0000 - U+001F)` with the `replacement_char`. 70 | 71 | #### Returns: 72 | 73 | A RaggedTensor of `fragment_start`, `fragment_end`, `fragment_properties` 74 | and `terminal_punc_token`. 75 | 76 | `fragment_properties` is an int32 bitmask whose values may contain: 77 | 1 = fragment ends with terminal punctuation 78 | 2 = fragment ends with multiple terminal punctuations (e.g. 79 | "She said what?!") 80 | 3 = Has close parenthesis (e.g. "Mushrooms (they're fungi).") 81 | 4 = Has sentential close parenthesis (e.g. 
"(Mushrooms are fungi!)" 82 | 83 | `terminal_punc_token` is a RaggedTensor containing the index of terminal 84 | punctuation token immediately following the last word in the fragment -- or 85 | index of the last word itself, if it's an acronym (since acronyms include the 86 | terminal punctuation). index of the terminal punctuation token. 87 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/span_overlaps.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.span_overlaps 7 | 8 | Returns a boolean tensor indicating which source and target spans overlap. 9 | 10 | ``` python 11 | text.span_overlaps( 12 | source_start, 13 | source_limit, 14 | target_start, 15 | target_limit, 16 | contains=False, 17 | contained_by=False, 18 | partial_overlap=False, 19 | name=None 20 | ) 21 | ``` 22 | 23 | Defined in 24 | [`python/ops/pointer_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pointer_ops.py). 25 | 26 | 27 | 28 | The source and target spans are specified using B+1 dimensional tensors, 29 | with `B>=0` batch dimensions followed by a final dimension that lists the 30 | span offsets for each span in the batch: 31 | 32 | * The `i`th source span in batch `b1...bB` starts at 33 | `source_start[b1...bB, i]` (inclusive), and extends to just before 34 | `source_limit[b1...bB, i]` (exclusive). 35 | * The `j`th target span in batch `b1...bB` starts at 36 | `target_start[b1...bB, j]` (inclusive), and extends to just before 37 | `target_limit[b1...bB, j]` (exclusive). 38 | 39 | `result[b1...bB, i, j]` is true if the `i`th source span overlaps with the 40 | `j`th target span in batch `b1...bB`, where a source span overlaps a target 41 | span if any of the following are true: 42 | 43 | * The spans are identical. 44 | * `contains` is true, and the source span contains the target span. 45 | * `contained_by` is true, and the source span is contained by the target 46 | span. 47 | * `partial_overlap` is true, and there is a non-zero overlap between the 48 | source span and the target span. 49 | 50 | #### Args: 51 | 52 | * `source_start`: A B+1 dimensional potentially ragged tensor with 53 | shape `[D1...DB, source_size]`: the start offset of each source span. 54 | * `source_limit`: A B+1 dimensional potentially ragged tensor with 55 | shape `[D1...DB, source_size]`: the limit offset of each source span. 56 | * `target_start`: A B+1 dimensional potentially ragged tensor with 57 | shape `[D1...DB, target_size]`: the start offset of each target span. 58 | * `target_limit`: A B+1 dimensional potentially ragged tensor with 59 | shape `[D1...DB, target_size]`: the limit offset of each target span. 60 | * `contains`: If true, then a source span is considered to overlap a 61 | target span when the source span contains the target span. 62 | * `contained_by`: If true, then a source span is considered to overlap 63 | a target span when the source span is contained by the target span. 64 | * `partial_overlap`: If true, then a source span is considered to 65 | overlap a target span when the source span partially overlaps the target 66 | span. 67 | * `name`: A name for the operation (optional). 68 | 69 | #### Returns: 70 | 71 | A B+2 dimensional potentially ragged boolean tensor with shape 72 | `[D1...DB, source_size, target_size]`. 73 | 74 | #### Raises: 75 | 76 | * `ValueError`: If the span tensors are incompatible. 
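A minimal call sketch (hypothetical span offsets; assumes the package is imported as `text`); the worked example in the next section illustrates the overlap semantics in more detail:

```python
import tensorflow_text as text

# Two source spans and two target spans over the same text, no batch dimensions.
source_start, source_limit = [0, 10], [5, 15]
target_start, target_limit = [0, 12], [5, 15]

# result[i, j] is True where source span i overlaps target span j.
result = text.span_overlaps(source_start, source_limit,
                            target_start, target_limit,
                            partial_overlap=True)
```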
77 | 78 | #### Example: 79 | Given the following source and target spans (with no batch dimensions): 80 | 81 | ```python 82 | # 0 5 10 15 20 25 30 35 40 83 | # |====|====|====|====|====|====|====|====| 84 | # Source: [-0-] [-1-] [2] [-3-][-4-][-5-] 85 | # Target: [-0-][-1-] [-2-] [3] [-4-][-5-] 86 | # |====|====|====|====|====|====|====|====| 87 | >>> source_start = [0, 10, 16, 20, 25, 30] 88 | >>> source_limit = [5, 15, 19, 25, 30, 35] 89 | >>> target_start = [0, 5, 15, 21, 27, 31] 90 | >>> target_limit = [5, 10, 20, 24, 32, 37] 91 | ``` 92 | 93 | `result[i, j]` will be true at the following locations: 94 | 95 | * `[0, 0]` (always) 96 | * `[2, 2]` (if contained_by=True or partial_overlap=True) 97 | * `[3, 3]` (if contains=True or partial_overlap=True) 98 | * `[4, 4]` (if partial_overlap=True) 99 | * `[5, 5]` (if partial_overlap=True) -------------------------------------------------------------------------------- /docs/api_docs/python/text/Tokenizer.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 | 14 | # text.Tokenizer 15 | 16 | ## Class `Tokenizer` 17 | 18 | Base class (abstract class) for all tokenizers. 19 | 20 | Defined in 21 | [`python/ops/tokenization.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/tokenization.py). 22 | 23 | 24 | 25 |

### `__init__`

26 | 27 | ```python 28 | __init__(name=None) 29 | ``` 30 | 31 | ## Properties 32 | 33 |

### `name`

34 | 35 | Returns the name of this module as passed or determined in the ctor. 36 | 37 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 38 | module names. 39 | 40 |

### `name_scope`

41 | 42 | Returns a `tf.name_scope` instance for this class. 43 | 44 |

### `submodules`

45 | 46 | Sequence of all sub-modules. 47 | 48 | Submodules are modules which are properties of this module, or found as 49 | properties of modules which are properties of this module (and so on). 50 | 51 | ``` 52 | a = tf.Module() 53 | b = tf.Module() 54 | c = tf.Module() 55 | a.b = b 56 | b.c = c 57 | assert list(a.submodules) == [b, c] 58 | assert list(b.submodules) == [c] 59 | assert list(c.submodules) == [] 60 | ``` 61 | 62 | #### Returns: 63 | 64 | A sequence of all submodules. 65 | 66 |

### `trainable_variables`

67 | 68 | Sequence of variables owned by this module and its submodules. 69 | 70 | Note: this method uses reflection to find variables on the current instance and 71 | submodules. For performance reasons you may wish to cache the result of calling 72 | this method if you don't expect the return value to change. 73 | 74 | #### Returns: 75 | 76 | A sequence of variables for the current module (sorted by attribute name) 77 | followed by variables from all submodules recursively (breadth first). 78 | 79 |

### `variables`

80 | 81 | Sequence of variables owned by this module and its submodules. 82 | 83 | Note: this method uses reflection to find variables on the current instance and 84 | submodules. For performance reasons you may wish to cache the result of calling 85 | this method if you don't expect the return value to change. 86 | 87 | #### Returns: 88 | 89 | A sequence of variables for the current module (sorted by attribute name) 90 | followed by variables from all submodules recursively (breadth first). 91 | 92 | ## Methods 93 | 94 |

### `tokenize`

95 | 96 | ```python 97 | tokenize(input) 98 | ``` 99 | 100 | Abstract function for tokenization. 101 | 102 | #### Args: 103 | 104 | * `input`: An N-dimensional UTF-8 string (or optionally integer) Tensor 105 | or RaggedTensor. 106 | 107 | #### Returns: 108 | 109 | An N+1-dimensional UTF-8 string or integer Tensor or RaggedTensor. 110 | 111 |
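For a concrete subclass the call looks like this (an illustrative sketch using the `WhitespaceTokenizer` documented elsewhere in these docs):

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
# Adds one ragged dimension of tokens, returned as a RaggedTensor.
tokens = tokenizer.tokenize(["the quick brown fox", "jumped over"])
```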

### `with_name_scope`

112 | 113 | ```python 114 | with_name_scope( 115 | cls, 116 | method 117 | ) 118 | ``` 119 | 120 | Decorator to automatically enter the module name scope. 121 | 122 | ``` 123 | class MyModule(tf.Module): 124 | @tf.Module.with_name_scope 125 | def __call__(self, x): 126 | if not hasattr(self, 'w'): 127 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 128 | return tf.matmul(x, self.w) 129 | ``` 130 | 131 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 132 | included the module name: 133 | 134 | ``` 135 | mod = MyModule() 136 | mod(tf.ones([8, 32])) 137 | # ==> 138 | mod.w 139 | # ==> 140 | ``` 141 | 142 | #### Args: 143 | 144 | * `method`: The method to wrap. 145 | 146 | #### Returns: 147 | 148 | The original method wrapped such that it enters the module's name scope. 149 | -------------------------------------------------------------------------------- /tensorflow_text/core/ops/constrained_sequence_op.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "tensorflow/core/framework/op.h" 16 | #include "tensorflow/core/framework/shape_inference.h" 17 | 18 | namespace tensorflow { 19 | 20 | REGISTER_OP("ConstrainedSequence") 21 | .Attr("Tin: {int32, int64}") 22 | .Attr("Tsplits: {int32, int64} = DT_INT64") 23 | .Attr("use_viterbi: bool") 24 | .Attr("use_log_space: bool") 25 | .Attr("use_start_and_end_states: bool") 26 | .Input("scores: float") 27 | .Input("sequence_lengths: Tin") 28 | .Input("allowed_transitions: bool") 29 | .Input("transition_weights: float") 30 | .Output("states: int32") 31 | .Output("states_splits: Tsplits") 32 | 33 | // TODO(b/122968457): Implement a shape function. 34 | .Doc(R"doc( 35 | Constrains a set of predictions based on a set of legal transitions and/or a 36 | set of transition weights, returning the legal sequence that maximizes the 37 | product of the state scores and the transition weights using the chained 38 | conditional random field algorithm. (In case of a tie, the state with a higher 39 | index will be chosen.) 40 | 41 | This op takes in a set of scores and outputs the most likely legal sequence 42 | for each batch element, where the most likely legal sequence is determined by 43 | the optional 'allowed_transitions' and 'transition_weights' tensors. 44 | 45 | The 'allowed_transition' tensor may be omitted; if it is, all sequence states 46 | will be allowed to transition to all other sequence states. If the tensor is 47 | provided it must be of the size [num_states+1][num_states+1]. 48 | 49 | allowed_transitions[i][j] is true if the transition from state i to state 50 | j is allowed for i and j in 0...(num_states). 51 | allowed_transitions[num_states][j] is true if the sequence is allowed to 52 | start from state j. 53 | allowed_transitions[i][num_states] is true if the sequence is allowed to 54 | end on state i. 
55 | allowed_transitions[num_states][num_states] is ignored. 56 | 57 | The 'transition_weights' tensor may be omitted; if it is, all transitions will 58 | be weighted with a value of 1.0. If the tensor is provided it must be of the 59 | size [num_states+1][num_states+1]. 60 | 61 | transition_weights[i][j] is the coefficient that a candidate transition score 62 | will be multiplied by if that transition is from state i to state j. 63 | transition_weights[num_states][j] is the coefficient that will be used 64 | if the transition starts with state j. 65 | transition_weights[i][num_states] is the coefficient that will be used 66 | if the final state in the sequence is state i. 67 | transition_weights[num_states][num_states] is ignored. 68 | 69 | This op outputs a RaggedTensor value and splits pair. 70 | 71 | scores: [batch_size, num_steps, |num_states|] A tensor of scores, where 72 | `scores[b, t, s]` is the predicted score for transitioning to state `s` 73 | at step `t` for batch `b`. The |num_states| dimension must correspond 74 | to the num_states attribute for this op. 75 | sequence_lengths: <{int32, int64}>[batch_size] A tensor containing the length 76 | of each sequence in the batch. 77 | allowed_transitions: [num_states+1, num_states+1] A boolean matrix of 78 | allowed transitions, or an empty matrix '[]' to allow all transitions. 79 | transition_weights: [num_states+1, num_states+1] A float matrix of score 80 | coefficients, or an empty matrix '[]' to weight all transitions equally. 81 | states: [batch_size, max_sequence_length] OR [total_num_states] 82 | A set of sequence outputs representing the most likely valid sequences 83 | for each batch. If `output_ragged_tensor` is false, this will be in 84 | [batch_size, max_sequence_length] form; if `output_ragged_tensor` is 85 | true, this will be a RaggedTensor data vector of shape 86 | [total_num_states]. 87 | states_splits: [batch_size+1] A RaggedTensor splits vector. If 88 | `output_ragged_tensor` is true, then the state sequence for input `i` 89 | is stored in `states[states_splits[i]:states_splits[i+1]]`. If 90 | `output_ragged_tensor` is false, this tensor will be empty and can be 91 | ignored. 92 | )doc"); 93 | 94 | } // namespace tensorflow 95 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/sliding_window.md: -------------------------------------------------------------------------------- 1 |
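The `ConstrainedSequence` op above is exposed in Python through `text.greedy_constrained_sequence` and `text.viterbi_constrained_sequence` (documented later in these docs). A minimal sketch with hypothetical scores and transition constraints:

```python
import numpy as np
import tensorflow_text as text

# One batch element, two steps, three candidate states (hypothetical values).
scores = np.array([[[10.0, 12.0, 7.0],
                    [4.0, 3.0, 11.0]]], dtype=np.float32)

# allowed[i][j]: may state i transition to state j? (no implicit start/end states here)
allowed = np.array([[True, True, False],
                    [True, False, True],
                    [False, True, True]])

states = text.greedy_constrained_sequence(
    scores, allowed_transitions=allowed,
    use_log_space=True, use_start_and_end_states=False)
```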
2 | 3 | 4 |
5 | 6 | # text.sliding_window 7 | 8 | Builds a sliding window for `data` with a specified width. 9 | 10 | ``` python 11 | text.sliding_window( 12 | data, 13 | width, 14 | axis=-1, 15 | name=None 16 | ) 17 | ``` 18 | 19 | Defined in 20 | [`python/ops/sliding_window_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/sliding_window_op.py). 21 | 22 | 23 | 24 | Returns a tensor constructed from `data`, where each element in 25 | dimension `axis` is a slice of `data` starting at the corresponding 26 | position, with the given width and step size. I.e.: 27 | 28 | * `result.shape.ndims = data.shape.ndims + 1` 29 | * `result[i1..iaxis, a] = data[i1..iaxis, a:a+width]` 30 | (where `0 <= a < data[i1...iaxis].shape[0] - (width - 1)`). 31 | 32 | Note that each result row (along dimension `axis`) has `width - 1` fewer items 33 | than the corresponding `data` row. If a `data` row has fewer than `width` 34 | items, then the corresponding `result` row will be empty. If you wish for 35 | the `result` rows to be the same size as the `data` rows, you can use 36 | `pad_along_dimension` to add `width - 1` padding elements before calling 37 | this op. 38 | 39 | #### Args: 40 | 41 | * `data`: ` [O1...ON, A, I1...IM]` A potentially ragged 42 | K-dimensional tensor with outer dimensions of size `O1...ON`; axis dimension 43 | of size `A`; and inner dimensions of size `I1...IM`. I.e. `K = N + 1 + M`, 44 | where `N>=0` and `M>=0`. 45 | 46 | * `width`: An integer constant specifying the width of the window. Must 47 | be greater than zero. 48 | 49 | * `axis`: An integer constant specifying the axis along which sliding 50 | window is computed. Negative axis values from `-K` to `-1` are supported. 51 | 52 | * `name`: The name for this op (optional) 53 | 54 | #### Returns: 55 | 56 | A `K+1` dimensional tensor with the same dtype as `data`, where: 57 | 58 | * `result[i1..iaxis, a]` = `data[i1..iaxis, a:a+width]` 59 | * `result.shape[:axis]` = `data.shape[:axis]` 60 | * `result.shape[axis]` = `data.shape[axis] - (width - 1)` 61 | * `result.shape[axis + 1]` = `width` 62 | * `result.shape[axis + 2:]` = `data.shape[axis + 1:]` 63 | 64 | #### Examples: 65 | 66 | Sliding window (width=3) across a sequence of tokens: 67 | 68 | ```python 69 | >>> # input: [sequence_length] 70 | >>> input = tf.constant(["one", "two", "three", "four", "five", "six"]) 71 | >>> # output: [sequence_length-2, 3] 72 | >>> output = sliding_window(data=input, width=3, axis=0) 73 | >>> print output.eval() 74 | [["one", "two", "three"], 75 | ["two", "three", "four"], 76 | ["three", "four", "five"], 77 | ["four", "five", "six"]] 78 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 79 | Shape: (6,) -> (4, 3) 80 | ``` 81 | 82 | Sliding window (width=2) across the inner dimension of a ragged matrix 83 | containing a batch of token sequences: 84 | 85 | ```python 86 | >>> # input: [num_sentences, (num_words)] 87 | >>> input = tf.ragged.constant( 88 | ... [['Up', 'high', 'in', 'the', 'air'], 89 | ... ['Down', 'under', 'water'], 90 | ... ['Away', 'to', 'outer', 'space']] 91 | >>> # output: [num_sentences, (num_word-1), 2] 92 | >>> output = sliding_window(input, width=2, axis=-1) 93 | >>> print output.eval() 94 | [[['Up', 'high'], ['high', 'in'], ['in', 'the'], ['the', 'air']], 95 | [['Down', 'under'], ['under', 'water']], 96 | [['Away', 'to'], ['to', 'outer'], ['outer', 'space']]] 97 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 98 | Shape: (3, ?) 
-> (3, ?, 2) 99 | ``` 100 | 101 | Sliding window across the second dimension of a 3-D tensor containing 102 | batches of sequences of embedding vectors: 103 | 104 | ```python 105 | >>> # input: [num_sequences, sequence_length, embedding_size] 106 | >>> input = tf.constant([ 107 | ... [[1, 1, 1], [2, 2, 1], [3, 3, 1], [4, 4, 1], [5, 5, 1]], 108 | ... [[1, 1, 2], [2, 2, 2], [3, 3, 2], [4, 4, 2], [5, 5, 2]]]) 109 | >>> # output: [num_sequences, sequence_length-1, 2, embedding_size] 110 | >>> output = sliding_window(data=input, width=2, axis=1) 111 | >>> print output.eval() 112 | [[[[1, 1, 1], [2, 2, 1]], 113 | [[2, 2, 1], [3, 3, 1]], 114 | [[3, 3, 1], [4, 4, 1]], 115 | [[4, 4, 1], [5, 5, 1]]], 116 | [[[1, 1, 2], [2, 2, 2]], 117 | [[2, 2, 2], [3, 3, 2]], 118 | [[3, 3, 2], [4, 4, 2]], 119 | [[4, 4, 2], [5, 5, 2]]]] 120 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 121 | Shape: (2, 5, 3) -> (2, 4, 2, 3) 122 | ``` 123 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/ngrams_op.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tensorflow ngram operations.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import enum 23 | 24 | from tensorflow.python.framework import errors 25 | from tensorflow.python.framework import ops 26 | from tensorflow.python.ops import math_ops 27 | from tensorflow.python.ops import string_ops 28 | from tensorflow.python.ops.ragged import ragged_functional_ops 29 | from tensorflow.python.ops.ragged import ragged_tensor 30 | from tensorflow_text.python.ops.sliding_window_op import sliding_window 31 | 32 | 33 | class Reduction(enum.Enum): 34 | """Type of reduction to be done by the ngram op. 35 | 36 | The supported reductions are as follows: 37 | 38 | * `Reduction.SUM`: Add values in the window. 39 | * `Reduction.MEAN`: Average values in the window. 40 | * `Reduction.STRING_JOIN`: Join strings in the window. 41 | """ 42 | 43 | SUM = 1 44 | MEAN = 2 45 | STRING_JOIN = 3 46 | 47 | 48 | def ngrams(data, 49 | width, 50 | axis=-1, 51 | reduction_type=None, 52 | string_separator=" ", 53 | name=None): 54 | """Create a tensor of n-grams based on the input data `data`. 55 | 56 | Creates a tensor of n-grams based on `data`. The n-grams are of width `width` 57 | and are created along axis `axis`; the n-grams are created by combining 58 | windows of `width` adjacent elements from `data` using `reduction_type`. This 59 | op is intended to cover basic use cases; more complex combinations can be 60 | created using the sliding_window op. 61 | 62 | Args: 63 | data: The data to reduce. 64 | width: The width of the ngram window. If there is not sufficient data to 65 | fill out the ngram window, the resulting ngram will be empty. 
66 | axis: The axis to create ngrams along. Note that for string join reductions, 67 | only axis '-1' is supported; for other reductions, any positive or 68 | negative axis can be used. Should be a constant. 69 | reduction_type: A member of the Reduction enum. Should be a constant. 70 | Currently supports: 71 | 72 | * `Reduction.SUM`: Add values in the window. 73 | * `Reduction.MEAN`: Average values in the window. 74 | * `Reduction.STRING_JOIN`: Join strings in the window. 75 | Note that axis must be -1 here. 76 | 77 | string_separator: The separator string used for `Reduction.STRING_JOIN`. 78 | Ignored otherwise. Must be a string constant, not a Tensor. 79 | name: The op name. 80 | 81 | Returns: 82 | A tensor of ngrams. 83 | 84 | Raises: 85 | InvalidArgumentError: if `reduction_type` is either None or not a Reduction, 86 | or if `reduction_type` is STRING_JOIN and `axis` is not -1. 87 | """ 88 | 89 | with ops.name_scope(name, "NGrams", [data, width]): 90 | if reduction_type is None: 91 | raise errors.InvalidArgumentError(None, None, 92 | "reduction_type must be specified.") 93 | 94 | if not isinstance(reduction_type, Reduction): 95 | raise errors.InvalidArgumentError(None, None, 96 | "reduction_type must be a Reduction.") 97 | 98 | # TODO(b/122967921): Lift this restriction after ragged_reduce_join is done. 99 | if reduction_type is Reduction.STRING_JOIN and axis != -1: 100 | raise errors.InvalidArgumentError( 101 | None, None, "%s requires that ngrams' 'axis' parameter be -1." % 102 | Reduction.STRING_JOIN.name) 103 | 104 | windowed_data = sliding_window(data, width, axis) 105 | 106 | if axis < 0: 107 | reduction_axis = axis 108 | else: 109 | reduction_axis = axis + 1 110 | 111 | # Ragged reduction ops work on both Tensor and RaggedTensor, so we can 112 | # use them here regardless of the type of tensor in 'windowed_data'. 113 | if reduction_type is Reduction.SUM: 114 | return math_ops.reduce_sum(windowed_data, reduction_axis) 115 | elif reduction_type is Reduction.MEAN: 116 | return math_ops.reduce_mean(windowed_data, reduction_axis) 117 | elif reduction_type is Reduction.STRING_JOIN: 118 | if isinstance(data, ragged_tensor.RaggedTensor): 119 | return ragged_functional_ops.map_flat_values( 120 | string_ops.reduce_join, 121 | windowed_data, 122 | axis=axis, 123 | separator=string_separator) 124 | else: 125 | return string_ops.reduce_join( 126 | windowed_data, axis=axis, separator=string_separator) 127 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/normalize_ops_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # coding=utf-8 17 | """Tests for normalization ops in tensorflow_text.""" 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | from tensorflow.python.framework import errors 24 | from tensorflow.python.framework import test_util 25 | from tensorflow.python.ops.ragged import ragged_factory_ops 26 | from tensorflow.python.ops.ragged import ragged_test_util 27 | from tensorflow.python.platform import test 28 | from tensorflow_text.python.ops import normalize_ops 29 | 30 | 31 | @test_util.run_all_in_graph_and_eager_modes 32 | class NormalizeOpsTest(ragged_test_util.RaggedTensorTestCase): 33 | 34 | def test_lowercase_one_string(self): 35 | txt = [ 36 | " TExt to loWERcase! ", 37 | ] 38 | expected = [ 39 | " text to lowercase! ", 40 | ] 41 | self.assertAllEqual(expected, normalize_ops.case_fold_utf8(txt)) 42 | 43 | def test_lowercase_text(self): 44 | txt = [ 45 | "Punctuation and digits: -*/+$#%@%$123456789#^$*%&", 46 | "Non-latin UTF8 chars: ΘͽʦȺЩ", 47 | "Accented chars: ĎÔPQRŔSŠoóôpqrŕsštťuúvwxyý", 48 | "Non-UTF8-letters: e.g. ◆, ♥, and the emoji symbol ( ͡° ͜ʖ ͡°)", 49 | "Folded: ßς", 50 | "" 51 | ] 52 | expected = [ 53 | "punctuation and digits: -*/+$#%@%$123456789#^$*%&", 54 | "non-latin utf8 chars: θͽʦⱥщ", 55 | "accented chars: ďôpqrŕsšoóôpqrŕsštťuúvwxyý", 56 | "non-utf8-letters: e.g. ◆, ♥, and the emoji symbol ( ͡° ͜ʖ ͡°)", 57 | "folded: ssσ", 58 | "" 59 | ] 60 | self.assertAllEqual(expected, normalize_ops.case_fold_utf8(txt)) 61 | 62 | def test_lowercase_one_string_ragged(self): 63 | txt = ragged_factory_ops.constant([[" TExt ", "to", " loWERcase! "], 64 | [" TExt to loWERcase! "]]) 65 | expected = [[" text ", "to", " lowercase! "], [" text to lowercase! 
"]] 66 | self.assertRaggedEqual(expected, normalize_ops.case_fold_utf8(txt)) 67 | 68 | def test_lowercase_empty_string(self): 69 | txt = [ 70 | "", 71 | ] 72 | expected = [ 73 | "", 74 | ] 75 | self.assertAllEqual(expected, normalize_ops.case_fold_utf8(txt)) 76 | 77 | def test_normalize_nfkc(self): 78 | txt = [ 79 | u"\u1e9b\u0323", 80 | ] 81 | expected = [ 82 | u"ṩ".encode("utf-8"), 83 | ] 84 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFKC")) 85 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfkc")) 86 | 87 | def test_normalize_nfkc_batch(self): 88 | txt = [ 89 | u"\u1e9b\u0323", 90 | u"\ufb01", 91 | ] 92 | expected = [ 93 | u"ṩ".encode("utf-8"), 94 | "fi", 95 | ] 96 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFKC")) 97 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfkc")) 98 | 99 | def test_normalize_nfkc_ragged(self): 100 | txt = ragged_factory_ops.constant([[[u"\u1e9b\u0323 \ufb01"], []], 101 | [[u"\u1e9b\u0323", u"\ufb01"]]]) 102 | expected = [[[u"ṩ fi".encode("utf-8")], []], [[u"ṩ".encode("utf-8"), "fi"]]] 103 | self.assertRaggedEqual(expected, normalize_ops.normalize_utf8(txt, "NFKC")) 104 | self.assertRaggedEqual(expected, normalize_ops.normalize_utf8(txt, "nfkc")) 105 | 106 | def test_normalize_nfc(self): 107 | txt = [ 108 | u"\u1e9b\u0323", 109 | ] 110 | expected = [ 111 | u"\u1e9b\u0323".encode("utf-8"), 112 | ] 113 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFC")) 114 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfc")) 115 | 116 | def test_normalize_nfd(self): 117 | txt = [u"\u1e9b\u0323"] 118 | expected = [ 119 | u"\u017f\u0323\u0307".encode("utf-8"), 120 | ] 121 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFD")) 122 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfd")) 123 | 124 | def test_normalize_nfkd(self): 125 | txt = [ 126 | u"\u1e9b\u0323", 127 | ] 128 | expected = [ 129 | u"\u0073\u0323\u0307".encode("utf-8"), 130 | ] 131 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFKD")) 132 | self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfkd")) 133 | 134 | def test_unknown_normalization_form(self): 135 | with self.assertRaises(errors.InvalidArgumentError): 136 | bomb = normalize_ops.normalize_utf8(["cant readme", "wont read me"], 137 | "cantfindme") 138 | self.evaluate(bomb) 139 | 140 | 141 | if __name__ == "__main__": 142 | test.main() 143 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/TokenizerWithOffsets.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | # text.TokenizerWithOffsets 16 | 17 | ## Class `TokenizerWithOffsets` 18 | 19 | Base class (abstract class) for all tokenizers that return offsets. 20 | 21 | Inherits From: [`Tokenizer`](../text/Tokenizer.md) 22 | 23 | Defined in 24 | [`python/ops/tokenization.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/tokenization.py). 25 | 26 | 27 | 28 |

### `__init__`

29 | 30 | ```python 31 | __init__(name=None) 32 | ``` 33 | 34 | ## Properties 35 | 36 |

### `name`

37 | 38 | Returns the name of this module as passed or determined in the ctor. 39 | 40 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 41 | module names. 42 | 43 |

### `name_scope`

44 | 45 | Returns a `tf.name_scope` instance for this class. 46 | 47 |

### `submodules`

48 | 49 | Sequence of all sub-modules. 50 | 51 | Submodules are modules which are properties of this module, or found as 52 | properties of modules which are properties of this module (and so on). 53 | 54 | ``` 55 | a = tf.Module() 56 | b = tf.Module() 57 | c = tf.Module() 58 | a.b = b 59 | b.c = c 60 | assert list(a.submodules) == [b, c] 61 | assert list(b.submodules) == [c] 62 | assert list(c.submodules) == [] 63 | ``` 64 | 65 | #### Returns: 66 | 67 | A sequence of all submodules. 68 | 69 |

### `trainable_variables`

70 | 71 | Sequence of variables owned by this module and its submodules. 72 | 73 | Note: this method uses reflection to find variables on the current instance and 74 | submodules. For performance reasons you may wish to cache the result of calling 75 | this method if you don't expect the return value to change. 76 | 77 | #### Returns: 78 | 79 | A sequence of variables for the current module (sorted by attribute name) 80 | followed by variables from all submodules recursively (breadth first). 81 | 82 |

### `variables`

83 | 84 | Sequence of variables owned by this module and its submodules. 85 | 86 | Note: this method uses reflection to find variables on the current instance and 87 | submodules. For performance reasons you may wish to cache the result of calling 88 | this method if you don't expect the return value to change. 89 | 90 | #### Returns: 91 | 92 | A sequence of variables for the current module (sorted by attribute name) 93 | followed by variables from all submodules recursively (breadth first). 94 | 95 | ## Methods 96 | 97 |

### `tokenize`

98 | 99 | ```python 100 | tokenize(input) 101 | ``` 102 | 103 | Abstract function for tokenization. 104 | 105 | #### Args: 106 | 107 | * `input`: An N-dimensional UTF-8 string (or optionally integer) Tensor 108 | or RaggedTensor. 109 | 110 | #### Returns: 111 | 112 | An N+1-dimensional UTF-8 string or integer Tensor or RaggedTensor. 113 | 114 |

### `tokenize_with_offsets`

115 | 116 | ```python 117 | tokenize_with_offsets(input) 118 | ``` 119 | 120 | Abstract function for tokenization with offsets. 121 | 122 | #### Args: 123 | 124 | * `input`: An N-dimensional UTF-8 string (or optionally integer) Tensor 125 | or RaggedTensor. 126 | 127 | #### Returns: 128 | 129 | A tuple (tokens, start_offsets, limit_offsets): * tokens is an N+1-dimensional 130 | UTF-8 string or integer Tensor or RaggedTensor. * start_offsets is an 131 | N+1-dimensional integer Tensor containing the starting indices of each token 132 | (byte indices for input strings). * limit_offsets is an N+1-dimensional integer 133 | Tensor containing the exclusive ending indices of each token (byte indices for 134 | input strings). 135 | 136 |
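For a concrete subclass the offsets variant looks like this (an illustrative sketch using the `WhitespaceTokenizer` documented elsewhere in these docs):

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
# starts/limits are byte offsets of each token into the original strings.
tokens, starts, limits = tokenizer.tokenize_with_offsets(["the quick brown fox"])
```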

### `with_name_scope`

137 | 138 | ```python 139 | with_name_scope( 140 | cls, 141 | method 142 | ) 143 | ``` 144 | 145 | Decorator to automatically enter the module name scope. 146 | 147 | ``` 148 | class MyModule(tf.Module): 149 | @tf.Module.with_name_scope 150 | def __call__(self, x): 151 | if not hasattr(self, 'w'): 152 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 153 | return tf.matmul(x, self.w) 154 | ``` 155 | 156 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 157 | included the module name: 158 | 159 | ``` 160 | mod = MyModule() 161 | mod(tf.ones([8, 32])) 162 | # ==> 163 | mod.w 164 | # ==> 165 | ``` 166 | 167 | #### Args: 168 | 169 | * `method`: The method to wrap. 170 | 171 | #### Returns: 172 | 173 | The original method wrapped such that it enters the module's name scope. 174 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/greedy_constrained_sequence.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.greedy_constrained_sequence 7 | 8 | Performs greedy constrained sequence on a batch of examples. 9 | 10 | ``` python 11 | text.greedy_constrained_sequence( 12 | scores, 13 | sequence_length=None, 14 | allowed_transitions=None, 15 | transition_weights=None, 16 | use_log_space=False, 17 | use_start_and_end_states=False, 18 | name=None 19 | ) 20 | ``` 21 | 22 | Defined in 23 | [`python/ops/greedy_constrained_sequence_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/greedy_constrained_sequence_op.py). 24 | 25 | 26 | 27 | Constrains a set of predictions based on a set of legal transitions 28 | and/or a set of transition weights, returning the legal sequence that 29 | maximizes the product or sum of the state scores and the transition weights 30 | at each step. If use_log_space is true, the sum is used; if false, the 31 | product is used. 32 | 33 | This op also takes a parameter 'use_start_and_end_states', which when true 34 | will add an implicit start and end state to each sequence. These implicit 35 | states allow the user to specify additional weights and permitted transitions 36 | to start and end a sequence (so, for instance, if you wanted to forbid your 37 | output from ending in a certain set of states you could do so). 38 | 39 | Inputs to this op can take one of three forms: a single Tensorflow tensor 40 | of scores with no sequence lengths, a Tensorflow tensor of scores along 41 | with a Tensorflow tensor of sequence lengths, or a RaggedTensor. If only the 42 | scores tensor is passed, this op will assume that the sequence lengths are 43 | equal to the size of the tensor (and so use all the data provided). If a 44 | scores tensor and sequence_lengths tensor is provided, the op will only 45 | use the data in the scores tensor as specified by the sequence_lengths tensor. 46 | Finally, if a RaggedTensor is provided, the sequence_lengths will be ignored 47 | and the variable length sequences in the RaggedTensor will be used. 48 | 49 | #### Args: 50 | 51 | * `scores`: ` [batch_size, num_steps, |num_states|]` A tensor 52 | of scores, where `scores[b, t, s]` is the predicted score for transitioning 53 | to state `s` at step `t` for batch `b`. The |num_states| dimension must 54 | correspond to the num_states attribute for this op. This input may be 55 | ragged; if it is ragged, the ragged tensor should have the same structure 56 | [b, t, s] and only axis 1 should be ragged. 57 | 58 | * `sequence_length`: `<{int32, int64}>[batch_size]` A rank-1 tensor 59 | representing the length of the output sequence. If None, and the 'scores' 60 | input is not ragged, sequence lengths will be assumed to be the length of 61 | the score tensor. 62 | 63 | * `allowed_transitions`: if use_start_and_end_states is TRUE: 64 | `[num_states+1, num_states+1]` if use_start_and_end_states is FALSE: 65 | `[num_states, num_states]` A rank-2 tensor representing allowed 66 | transitions. 67 | 68 | - allowed_transitions[i][j] is true if the transition from state i to 69 | state j is allowed for i and j in 0...(num_states). 70 | - allowed_transitions[num_states][num_states] is ignored. If 71 | use_start_and_end_states is TRUE: 72 | - allowed_transitions[num_states][j] is true if the sequence is allowed to 73 | start from state j. 74 | - allowed_transitions[i][num_states] is true if the sequence is allowed to 75 | end on state i. Default - An empty tensor. This allows all sequence 76 | states to transition to all other sequence states. 
77 | 78 | * `transition_weights`: if use_start_and_end_states is TRUE: 79 | `[num_states+1, num_states+1]` if use_start_and_end_states is 80 | FALSE: `[num_states, num_states]` A rank-2 tensor representing 81 | transition weights. 82 | 83 | - transition_weights[i][j] is the coefficient that a candidate transition 84 | score will be multiplied by if that transition is from state i to state 85 | j. 86 | - transition_weights[num_states][num_states] is ignored. If 87 | use_start_and_end_states is TRUE: 88 | - transition_weights[num_states][j] is the coefficient that will be used 89 | if the transition starts with state j. 90 | - transition_weights[i][num_states] is the coefficient that will be used 91 | if the final state in the sequence is state i. Default - An empty 92 | tensor. This assigns a weight of 1.0 to all transitions. 93 | 94 | * `use_log_space`: Whether to use log space for the calculation. If 95 | false, calculations will be done in exp-space. 96 | 97 | * `use_start_and_end_states`: If True, sequences will have an implicit 98 | start and end state added. 99 | 100 | * `name`: The name scope within which this op should be constructed. 101 | 102 | #### Returns: 103 | 104 | An [batch_size, (num_steps)] ragged tensor containing the appropriate 105 | sequence of transitions. If a sequence is impossible, the value of the 106 | RaggedTensor for that and all following transitions in that sequence shall be 107 | '-1'. 108 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/create_feature_bitmask_op.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Create feature bitmask op. 17 | 18 | Packs the innermost dimension of a boolean tensor into integer bitmask values. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | from tensorflow.python.framework import constant_op 26 | from tensorflow.python.framework import dtypes 27 | from tensorflow.python.framework import errors 28 | from tensorflow.python.framework import ops 29 | from tensorflow.python.ops import array_ops 30 | from tensorflow.python.ops import check_ops 31 | from tensorflow.python.ops import math_ops 32 | 33 | # The maximum number of bits that can be encoded by create_feature_bitmask 34 | # in each datatype. 35 | _max_bits = { 36 | dtypes.uint8: 8, 37 | dtypes.int8: 7, 38 | dtypes.uint16: 16, 39 | dtypes.int16: 15, 40 | dtypes.int32: 31, 41 | dtypes.int64: 63, 42 | } 43 | 44 | 45 | def create_feature_bitmask(tensor, dtype=dtypes.int32, name=None): 46 | """Packs the innermost dimension of a boolean tensor into integer values. 47 | 48 | `result[i1...iN]` is the integer formed by interpreting the booleans 49 | `tensor[i1...iN, 0:num_bits]` as individual bits, with big-endian order.
50 | E.g., if `tensor[i1...iN, 0:num_bits] = [True, False, False, True, False]`, 51 | then `result[i1...iN] = 0b10010 = 18`. The return tensor is of type `dtype`, 52 | if specified; if `dtype` is not set, `int32` will be used. 53 | 54 | If `num_bits` is too large to fit in `dtype`, then an exception is raised 55 | when this op is called (if `num_bits` is statically known) or when it is 56 | evaluated (if `num_bits` is not statically known). 57 | 58 | Args: 59 | tensor: `[D1...DN, num_bits]` The boolean tensor whose innermost 60 | dimension should be packed to form integer values. 61 | dtype: The datatype to output for this op (optional). 62 | name: The name for this op (optional). 63 | 64 | Returns: 65 | ` [D1...DN]` 66 | An integer tensor formed by interpreting the innermost dimension of 67 | `tensor` as individual bits. 68 | 69 | Raises: 70 | ValueError: If the data to be packed is too large for the chosen data 71 | type. 72 | ValueError: If the data to be packed is not boolean. 73 | InvalidArgumentError: If the input tensor is a list, or the dtype is not a 74 | supported integer type. 75 | 76 | Examples: 77 | ```python 78 | >>> assert create_feature_bitmask([True, False, False, True]) == 0b1001 79 | >>> create_feature_bitmask([[True, False], [False, True], [True, True]]) 80 | [0b10, 0b01, 0b11] 81 | ``` 82 | """ 83 | with ops.name_scope(name, 'CreateFeatureBitmask', [tensor]): 84 | if (isinstance(tensor, (list, tuple)) and tensor and 85 | isinstance(tensor[0], ops.Tensor)): 86 | raise errors.InvalidArgumentError( 87 | None, None, 88 | 'CreateFeatureBitmask does not support lists of tensors. Consider ' 89 | 'using tf.stack(list,-1) to create a single tensor before invoking ' 90 | 'this op.') 91 | 92 | tensor = ops.convert_to_tensor(tensor, dtypes.bool, 'tensor') 93 | 94 | if dtype not in _max_bits.keys(): 95 | raise errors.InvalidArgumentError( 96 | None, None, 'dtype must be one of: [%s], was %s' % 97 | (sorted(_max_bits), dtype.name)) 98 | 99 | integer_data = math_ops.cast(tensor, dtype=dtype) 100 | shape = tensor.shape 101 | if shape.ndims is not None and shape.dims[-1].value is not None: 102 | num_bits = shape.dims[-1].value 103 | if num_bits > 63: 104 | raise ValueError( 105 | 'data.shape[-1] must be less than 64, is %d.' % num_bits) 106 | elif num_bits > _max_bits[dtype]: 107 | raise ValueError( 108 | 'data.shape[-1] is too large for %s (was %d, cannot exceed %d); ' 109 | 'consider switching condense_boolean_tensor to a larger ' 110 | 'dtype.' % (dtype.name, num_bits, _max_bits[dtype])) 111 | bit_masks = constant_op.constant( 112 | [2**pos for pos in range(num_bits - 1, -1, -1)], dtype) 113 | else: 114 | bit_masks = constant_op.constant( 115 | [2**pos for pos in range(_max_bits[dtype] - 1, -1, -1)], dtype) 116 | num_bits = array_ops.shape(tensor)[-1] 117 | with ops.control_dependencies([ 118 | check_ops.assert_less_equal( 119 | num_bits, 120 | _max_bits[dtype], 121 | message='data.shape[-1] is too large for %s (cannot exceed %s)' % 122 | (dtype.name, _max_bits[dtype])) 123 | ]): 124 | # The second slice ("[:num_bits]") is a no-op unless num_bits==0. 125 | bit_masks = bit_masks[-num_bits:][:num_bits] 126 | return math_ops.reduce_sum(integer_data * bit_masks, axis=-1) 127 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/viterbi_constrained_sequence.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.viterbi_constrained_sequence 7 | 8 | Performs Viterbi constrained sequence decoding on a batch of examples. 9 | 10 | ``` python 11 | text.viterbi_constrained_sequence( 12 | scores, 13 | sequence_length=None, 14 | allowed_transitions=None, 15 | transition_weights=None, 16 | use_log_space=False, 17 | use_start_and_end_states=True, 18 | name=None 19 | ) 20 | ``` 21 | 22 | Defined in 23 | [`python/ops/viterbi_constrained_sequence_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/viterbi_constrained_sequence_op.py). 24 | 25 | 26 | 27 | Constrains a set of predictions based on a set of legal transitions 28 | and/or a set of transition weights, returning the legal sequence that 29 | maximizes the product of the state scores and the transition weights 30 | according to the Viterbi algorithm. If use_log_space is True, the Viterbi 31 | calculation will be performed in log space (with sums); if it is False, 32 | the Viterbi calculation will be performed in exp space (with normalized 33 | products). 34 | 35 | This op also takes a parameter 'use_start_and_end_states', which when true 36 | will add an implicit start and end state to each sequence. These implicit 37 | states allow the user to specify additional weights and permitted transitions 38 | to start and end a sequence (so, for instance, if you wanted to forbid your 39 | output from ending in a certain set of states you could do so). 40 | 41 | Inputs to this op can take one of three forms: a single Tensorflow tensor 42 | of scores with no sequence lengths, a Tensorflow tensor of scores along 43 | with a Tensorflow tensor of sequence lengths, or a RaggedTensor. If only the 44 | scores tensor is passed, this op will assume that the sequence lengths are 45 | equal to the size of the tensor (and so use all the data provided). If a 46 | scores tensor and sequence_lengths tensor is provided, the op will only 47 | use the data in the scores tensor as specified by the sequence_lengths tensor. 48 | Finally, if a RaggedTensor is provided, the sequence_lengths will be ignored 49 | and the variable length sequences in the RaggedTensor will be used. 50 | 51 | #### Args: 52 | 53 | * `scores`: ` [batch_size, num_steps, |num_states|]` A tensor 54 | of scores, where `scores[b, t, s]` is the predicted score for transitioning 55 | to state `s` at step `t` for batch `b`. The |num_states| dimension must 56 | correspond to the num_states attribute for this op. This input may be 57 | ragged; if it is ragged, the ragged tensor should have the same structure 58 | [b, t, s] and only axis 1 should be ragged. 59 | 60 | * `sequence_length`: `<{int32, int64}>[batch_size]` A rank-1 tensor 61 | representing the length of the output sequence. If None, and the 'scores' 62 | input is not ragged, sequence lengths will be assumed to be the length of 63 | the score tensor. 64 | 65 | * `allowed_transitions`: if use_start_and_end_states is TRUE: 66 | `[num_states+1, num_states+1]` if use_start_and_end_states is FALSE: 67 | `[num_states, num_states]` A rank-2 tensor representing allowed 68 | transitions. 69 | 70 | - allowed_transitions[i][j] is true if the transition from state i to 71 | state j is allowed for i and j in 0...(num_states). 72 | - allowed_transitions[num_states][num_states] is ignored. If 73 | use_start_and_end_states is TRUE: 74 | - allowed_transitions[num_states][j] is true if the sequence is allowed to 75 | start from state j. 76 | - allowed_transitions[i][num_states] is true if the sequence is allowed to 77 | end on state i.
Default - An empty tensor. This allows all sequence 78 | states to transition to all other sequence states. 79 | 80 | * `transition_weights`: if use_start_and_end_states is TRUE: 81 | `[num_states+1, num_states+1]` if use_start_and_end_states is 82 | FALSE: `[num_states, num_states]` A rank-2 tensor representing 83 | transition weights. 84 | 85 | - transition_weights[i][j] is the coefficient that a candidate transition 86 | score will be multiplied by if that transition is from state i to state 87 | j. 88 | - transition_weights[num_states][num_states] is ignored. If 89 | use_start_and_end_states is TRUE: 90 | - transition_weights[num_states][j] is the coefficient that will be used 91 | if the transition starts with state j. 92 | - transition_weights[i][num_states] is the coefficient that will be used 93 | if the final state in the sequence is state i. Default - An empty 94 | tensor. This assigns a weight of 1.0 to all transitions. 95 | 96 | * `use_log_space`: Whether to use log space for the calculation. If 97 | false, calculations will be done in exp-space. 98 | 99 | * `use_start_and_end_states`: If True, sequences will have an implicit 100 | start and end state added. 101 | 102 | * `name`: The name scope within which this op should be constructed. 103 | 104 | #### Returns: 105 | 106 | An [batch_size, (num_steps)] ragged tensor containing the appropriate 107 | sequence of transitions. If a sequence is impossible, the value of the 108 | RaggedTensor for that and all following transitions in that sequence shall be 109 | '-1'. 110 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/wordpiece_tokenizer.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | #include "tensorflow_text/core/kernels/wordpiece_tokenizer.h" 16 | 17 | #include "absl/strings/str_cat.h" 18 | #include "absl/strings/string_view.h" 19 | #include "icu4c/source/common/unicode/schriter.h" 20 | #include "icu4c/source/common/unicode/unistr.h" 21 | #include "icu4c/source/common/unicode/utf8.h" 22 | #include "tensorflow/core/framework/tensor_shape.h" 23 | #include "tensorflow/core/lib/core/errors.h" 24 | 25 | namespace tensorflow { 26 | namespace text { 27 | 28 | constexpr int64 kOutOfVocabValue = -1; 29 | 30 | LookupTableVocab::LookupTableVocab(lookup::LookupInterface* table, 31 | OpKernelContext* ctx) 32 | : table_(table), ctx_(ctx), default_value_(DT_INT64, TensorShape({1})) { 33 | default_value_.flat()(0) = kOutOfVocabValue; 34 | } 35 | 36 | Status LookupTableVocab::Contains(const string& key, bool* value) { 37 | if (value == nullptr) { 38 | return errors::InvalidArgument("Bad 'value' param."); 39 | } 40 | Tensor keys(DT_STRING, TensorShape({1})); 41 | keys.flat()(0) = key; 42 | Tensor values(DT_INT64, TensorShape({1})); 43 | TF_RETURN_IF_ERROR(table_->Find(ctx_, keys, &values, default_value_)); 44 | 45 | if (static_cast(values.flat()(0)) != kOutOfVocabValue) { 46 | *value = true; 47 | return Status::OK(); 48 | } 49 | *value = false; 50 | return Status::OK(); 51 | } 52 | 53 | Status WordpieceTokenize(const string& token, const int64 max_bytes_per_token, 54 | const string& suffix_indicator, bool use_unknown_token, 55 | const string& unknown_token, 56 | LookupTableVocab* vocab_map, 57 | std::vector* subwords, 58 | std::vector* begin_offset, 59 | std::vector* end_offset, int* num_word_pieces) { 60 | if (token.size() > max_bytes_per_token) { 61 | if (use_unknown_token) { 62 | subwords->emplace_back(unknown_token); 63 | end_offset->push_back(unknown_token.size()); 64 | } else { 65 | subwords->emplace_back(token); 66 | end_offset->push_back(token.size()); 67 | } 68 | begin_offset->push_back(0); 69 | *num_word_pieces = 1; 70 | return Status::OK(); 71 | } 72 | 73 | icu::UnicodeString token_unicode = icu::UnicodeString::fromUTF8(token); 74 | bool is_bad = false; 75 | int start = 0; 76 | int byte_offset_start = 0; 77 | std::vector sub_tokens; 78 | std::vector sub_tokens_begin_offset; 79 | std::vector sub_tokens_end_offset; 80 | while (start < token_unicode.length()) { 81 | string cur_substr; 82 | int end = token_unicode.length(); 83 | int num_subword_bytes = token.size() - byte_offset_start; 84 | icu::StringCharacterIterator backward_iter(token_unicode, start, end, 85 | start); 86 | backward_iter.last32(); 87 | 88 | while (num_subword_bytes > 0) { 89 | absl::string_view substr(token.data() + byte_offset_start, 90 | num_subword_bytes); 91 | string lookup_value; 92 | if (byte_offset_start > 0) { 93 | lookup_value = absl::StrCat(suffix_indicator, substr); 94 | } else { 95 | // absl::CopyToString 96 | lookup_value.assign(substr.begin(), substr.end()); 97 | } 98 | 99 | bool found_in_vocab; 100 | TF_RETURN_IF_ERROR(vocab_map->Contains(lookup_value, &found_in_vocab)); 101 | if (found_in_vocab) { 102 | cur_substr.swap(lookup_value); 103 | break; 104 | } 105 | --end; 106 | num_subword_bytes -= U8_LENGTH(backward_iter.current32()); 107 | backward_iter.previous32(); 108 | } 109 | if (cur_substr.empty()) { 110 | is_bad = true; 111 | break; 112 | } 113 | 114 | sub_tokens.emplace_back(cur_substr); 115 | sub_tokens_begin_offset.emplace_back(byte_offset_start); 116 | sub_tokens_end_offset.emplace_back(byte_offset_start + num_subword_bytes); 117 | start = end; 118 | byte_offset_start += 
num_subword_bytes; 119 | } 120 | if (is_bad) { 121 | if (use_unknown_token) { 122 | subwords->emplace_back(unknown_token); 123 | } else { 124 | subwords->emplace_back(token); 125 | } 126 | begin_offset->emplace_back(0); 127 | end_offset->emplace_back(token.size()); 128 | *num_word_pieces = 1; 129 | } else { 130 | subwords->insert(subwords->end(), sub_tokens.begin(), sub_tokens.end()); 131 | begin_offset->insert(begin_offset->end(), sub_tokens_begin_offset.begin(), 132 | sub_tokens_begin_offset.end()); 133 | end_offset->insert(end_offset->end(), sub_tokens_end_offset.begin(), 134 | sub_tokens_end_offset.end()); 135 | *num_word_pieces = sub_tokens.size(); 136 | } 137 | return Status::OK(); 138 | } 139 | 140 | } // namespace text 141 | } // namespace tensorflow 142 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/WhitespaceTokenizer.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | # text.WhitespaceTokenizer 16 | 17 | ## Class `WhitespaceTokenizer` 18 | 19 | Tokenizes a tensor of UTF-8 strings on whitespaces. 20 | 21 | Inherits From: [`TokenizerWithOffsets`](../text/TokenizerWithOffsets.md) 22 | 23 | Defined in 24 | [`python/ops/whitespace_tokenizer.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/whitespace_tokenizer.py). 25 | 26 | 27 | 28 |
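For orientation, a minimal usage sketch (assuming eager execution; the sample sentences are illustrative):

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
# Each input string becomes one row of tokens; the whitespace itself is dropped.
tokens = tokenizer.tokenize(['everything not saved will be lost.', 'Sad☹'])
# tokens ==> [['everything', 'not', 'saved', 'will', 'be', 'lost.'], ['Sad☹']]
```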

__init__

29 | 30 | ```python 31 | __init__(name=None) 32 | ``` 33 | 34 | ## Properties 35 | 36 |

name

37 | 38 | Returns the name of this module as passed or determined in the ctor. 39 | 40 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 41 | module names. 42 | 43 |

name_scope

44 | 45 | Returns a `tf.name_scope` instance for this class. 46 | 47 |

submodules

48 | 49 | Sequence of all sub-modules. 50 | 51 | Submodules are modules which are properties of this module, or found as 52 | properties of modules which are properties of this module (and so on). 53 | 54 | ``` 55 | a = tf.Module() 56 | b = tf.Module() 57 | c = tf.Module() 58 | a.b = b 59 | b.c = c 60 | assert list(a.submodules) == [b, c] 61 | assert list(b.submodules) == [c] 62 | assert list(c.submodules) == [] 63 | ``` 64 | 65 | #### Returns: 66 | 67 | A sequence of all submodules. 68 | 69 |

trainable_variables

70 | 71 | Sequence of variables owned by this module and its submodules. 72 | 73 | Note: this method uses reflection to find variables on the current instance and 74 | submodules. For performance reasons you may wish to cache the result of calling 75 | this method if you don't expect the return value to change. 76 | 77 | #### Returns: 78 | 79 | A sequence of variables for the current module (sorted by attribute name) 80 | followed by variables from all submodules recursively (breadth first). 81 | 82 |

variables

83 | 84 | Sequence of variables owned by this module and its submodules. 85 | 86 | Note: this method uses reflection to find variables on the current instance and 87 | submodules. For performance reasons you may wish to cache the result of calling 88 | this method if you don't expect the return value to change. 89 | 90 | #### Returns: 91 | 92 | A sequence of variables for the current module (sorted by attribute name) 93 | followed by variables from all submodules recursively (breadth first). 94 | 95 | ## Methods 96 | 97 |

tokenize

98 | 99 | ```python 100 | tokenize(input) 101 | ``` 102 | 103 | Tokenizes a tensor of UTF-8 strings on whitespaces. 104 | 105 | The strings are split when an ICU-defined whitespace character is encountered. These 106 | whitespace characters are dropped. 107 | 108 | #### Args: 109 | 110 | * `input`: A `RaggedTensor` or `Tensor` of UTF-8 strings with any 111 | shape. 112 | 113 | #### Returns: 114 | 115 | A RaggedTensor of tokenized text. The returned shape is the shape of the input 116 | tensor with an added ragged dimension for tokens of each string. 117 | 118 |
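A short sketch of the shape behaviour described above (input values are illustrative): a rank-2 string input produces a rank-3 ragged result, with the new innermost dimension holding the tokens of each string.

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
# Shape [2, 1] input -> ragged output of shape [2, 1, (num_tokens)].
tokens = tokenizer.tokenize([['a b c'], ['d e']])
# tokens ==> [[['a', 'b', 'c']], [['d', 'e']]]
```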

tokenize_with_offsets

119 | 120 | ```python 121 | tokenize_with_offsets(input) 122 | ``` 123 | 124 | Tokenizes a tensor of UTF-8 strings on whitespaces. 125 | 126 | The strings are split when an ICU-defined whitespace character is encountered. These 127 | whitespace characters are dropped. 128 | 129 | #### Args: 130 | 131 | * `input`: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape. 132 | 133 | #### Returns: 134 | 135 | A tuple of `RaggedTensor`s `tokens`, `start_offsets`, and `limit_offsets`, where: 136 | 137 | * `tokens`: A `RaggedTensor` of tokenized text. 138 | * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offset. 139 | * `limit_offsets`: A `RaggedTensor` of the tokens' ending byte offset. 140 | 141 |
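Because the offsets are byte positions into the original strings, they can be used to slice the tokens back out, as in this sketch (assuming eager execution; the sentence is illustrative):

```python
import tensorflow_text as text

tokenizer = text.WhitespaceTokenizer()
sentence = 'never odd or even'
tokens, starts, limits = tokenizer.tokenize_with_offsets([sentence])
encoded = sentence.encode('utf-8')
# Each (start, limit) pair is a byte span into the original string.
spans = [encoded[s:l] for s, l in zip(starts[0].numpy(), limits[0].numpy())]
# spans ==> [b'never', b'odd', b'or', b'even'], matching tokens[0]
```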

with_name_scope

142 | 143 | ```python 144 | with_name_scope( 145 | cls, 146 | method 147 | ) 148 | ``` 149 | 150 | Decorator to automatically enter the module name scope. 151 | 152 | ``` 153 | class MyModule(tf.Module): 154 | @tf.Module.with_name_scope 155 | def __call__(self, x): 156 | if not hasattr(self, 'w'): 157 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 158 | return tf.matmul(x, self.w) 159 | ``` 160 | 161 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 162 | included the module name: 163 | 164 | ``` 165 | mod = MyModule() 166 | mod(tf.ones([8, 32])) 167 | # ==> 168 | mod.w 169 | # ==> 170 | ``` 171 | 172 | #### Args: 173 | 174 | * `method`: The method to wrap. 175 | 176 | #### Returns: 177 | 178 | The original method wrapped such that it enters the module's name scope. 179 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/text_kernels_test_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // GMock matchers for testing text kernels: 16 | // TensorHasShapeAndValues({dim1, ..., dimN}, {v1, v2, ..., vN}); 17 | // VectorEq({v1, v2, ..., vN}); 18 | // MatrixEq({{v1_1, ..., v1_M}, ..., {vN_1, ..., vN_M}}); 19 | // TensorHasShape({dim1, ..., dimN}); 20 | 21 | #ifndef TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ 22 | #define TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ 23 | 24 | #include 25 | #include "tensorflow/core/framework/tensor.h" 26 | #include "tensorflow/core/framework/tensor_shape.h" 27 | #include "tensorflow/core/framework/tensor_testutil.h" 28 | 29 | namespace tensorflow { 30 | namespace text_kernels_test_util { 31 | 32 | // GMock MatcherInterface for testing tensor equality. 33 | class TensorEqMatcher : public ::testing::MatcherInterface { 34 | public: 35 | explicit TensorEqMatcher(const Tensor& expect) : expect_(expect) {} 36 | bool MatchAndExplain(Tensor actual, 37 | ::testing::MatchResultListener* listener) const override; 38 | void DescribeTo(::std::ostream* gmock_os) const override; 39 | void DescribeNegationTo(::std::ostream* gmock_os) const override; 40 | 41 | private: 42 | Tensor expect_; 43 | }; 44 | 45 | // GMock MatcherInterface for testing tensor shapes. 46 | class TensorHasShapeMatcher : public ::testing::MatcherInterface { 47 | public: 48 | explicit TensorHasShapeMatcher(const TensorShape& expect) : expect_(expect) {} 49 | bool MatchAndExplain(Tensor actual, 50 | ::testing::MatchResultListener* listener) const override; 51 | void DescribeTo(::std::ostream* gmock_os) const override; 52 | void DescribeNegationTo(::std::ostream* gmock_os) const override; 53 | 54 | private: 55 | TensorShape expect_; 56 | }; 57 | 58 | // Returns a gmock matcher that checks whether a given tensor has the specified 59 | // dtype, values, and shape. 
dtype is specified using the template parameter. 60 | // values are specified as a flattened vector. 61 | // Example: 62 | // EXPECT_THAT(*GetOutput(0), 63 | // TensorHasShapeAndValues({3, 2}, {1, 2, 3, 4, 5, 6}); 64 | template 65 | ::testing::Matcher TensorHasShapeAndValues( 66 | const TensorShape& shape, const std::vector& values) { 67 | Tensor expect = test::AsTensor(values, shape); 68 | // MakeMatcher takes ownership of the TensorEqMatcher. 69 | return ::testing::MakeMatcher(new TensorEqMatcher(expect)); 70 | } 71 | 72 | // Returns a gmock matcher that checks whether a given tensor is a 1-D tensor 73 | // with the specified dtype and values. dtype is specified using the template 74 | // parameter. 75 | // Example: 76 | // EXPECT_THAT(*GetOutput(0), 77 | // VectorEq({1, 2, 3, 4, 5, 6}); 78 | template 79 | ::testing::Matcher VectorEq(const std::vector& values) { 80 | int64 nvals = values.size(); 81 | Tensor expect = test::AsTensor(values, {nvals}); 82 | // MakeMatcher takes ownership of the TensorEqMatcher. 83 | return ::testing::MakeMatcher(new TensorEqMatcher(expect)); 84 | } 85 | 86 | // Returns a gmock matcher that checks whether a given tensor is a 2-D tensor 87 | // with the specified dtype and values. dtype is specified using the template 88 | // parameter. values are specified as a nested vector. All rows of the values 89 | // vector must have the same length. The values vector may not be empty, 90 | // since we can't infer the number of columns for an empty matrix; to test 91 | // empty matrices, use the more general TensorHasShapeAndValues() instead. 92 | // Example: 93 | // EXPECT_THAT(*GetOutput(0), 94 | // MatrixEq({{1, 2, 3}, {4, 5, 6}}); 95 | template 96 | ::testing::Matcher MatrixEq( 97 | const std::vector>& values) { 98 | int64 nrows = values.size(); 99 | CHECK_GT(nrows, 0) // Crash OK 100 | << "Invalid use of MatrixEq: to test empty matrices, use " 101 | << "TensorHasShapeAndValues{{0, ndims}, {}} instead."; 102 | int64 ncols = values[0].size(); 103 | std::vector flat; 104 | for (const auto& row : values) { 105 | CHECK_EQ(ncols, row.size()) // Crash OK 106 | << "Invalid use of MatrixEq: all rows must have equal length"; 107 | flat.insert(flat.end(), row.begin(), row.end()); 108 | } 109 | Tensor expect = test::AsTensor(flat, TensorShape({nrows, ncols})); 110 | // MakeMatcher takes ownership of the TensorEqMatcher. 111 | return ::testing::MakeMatcher(new TensorEqMatcher(expect)); 112 | } 113 | 114 | // Returns a gmock matcher that checks whether a given tensor has a specified 115 | // shape. 116 | // Example: 117 | // EXPECT_THAT(*GetOutput(0), TensorHasShape({2, 8}); 118 | ::testing::Matcher TensorHasShape(const TensorShape& shape); 119 | 120 | } // namespace text_kernels_test_util 121 | } // namespace tensorflow 122 | 123 | #endif // TENSORFLOW_TEXT_CORE_KERNELS_TEXT_KERNELS_TEST_UTIL_H_ 124 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/span_alignment.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |
5 | 6 | # text.span_alignment 7 | 8 | Return an alignment from a set of source spans to a set of target spans. 9 | 10 | ``` python 11 | text.span_alignment( 12 | source_start, 13 | source_limit, 14 | target_start, 15 | target_limit, 16 | contains=False, 17 | contained_by=False, 18 | partial_overlap=False, 19 | multivalent_result=False, 20 | name=None 21 | ) 22 | ``` 23 | 24 | Defined in 25 | [`python/ops/pointer_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pointer_ops.py). 26 | 27 | 28 | 29 | The source and target spans are specified using B+1 dimensional tensors, 30 | with `B>=0` batch dimensions followed by a final dimension that lists the 31 | span offsets for each span in the batch: 32 | 33 | * The `i`th source span in batch `b1...bB` starts at 34 | `source_start[b1...bB, i]` (inclusive), and extends to just before 35 | `source_limit[b1...bB, i]` (exclusive). 36 | * The `j`th target span in batch `b1...bB` starts at 37 | `target_start[b1...bB, j]` (inclusive), and extends to just before 38 | `target_limit[b1...bB, j]` (exclusive). 39 | 40 | `result[b1...bB, i]` contains the index (or indices) of the target span that 41 | overlaps with the `i`th source span in batch `b1...bB`. The 42 | `multivalent_result` parameter indicates whether the result should contain 43 | a single span that aligns with the source span, or all spans that align with 44 | the source span. 45 | 46 | * If `multivalent_result` is false (the default), then `result[b1...bB, i]=j` 47 | indicates that the `j`th target span overlaps with the `i`th source span 48 | in batch `b1...bB`. If no target spans overlap with the `i`th target span, 49 | then `result[b1...bB, i]=-1`. 50 | 51 | * If `multivalent_result` is true, then `result[b1...bB, i, n]=j` indicates 52 | that the `j`th target span is the `n`th span that overlaps with the `i`th 53 | source span in in batch `b1...bB`. 54 | 55 | For a definition of span overlap, see the docstring for `span_overlaps()`. 56 | 57 | #### Args: 58 | 59 | * `source_start`: A B+1 dimensional potentially ragged tensor with 60 | shape `[D1...DB, source_size]`: the start offset of each source span. 61 | * `source_limit`: A B+1 dimensional potentially ragged tensor with 62 | shape `[D1...DB, source_size]`: the limit offset of each source span. 63 | * `target_start`: A B+1 dimensional potentially ragged tensor with 64 | shape `[D1...DB, target_size]`: the start offset of each target span. 65 | * `target_limit`: A B+1 dimensional potentially ragged tensor with 66 | shape `[D1...DB, target_size]`: the limit offset of each target span. 67 | * `contains`: If true, then a source span is considered to overlap a 68 | target span when the source span contains the target span. 69 | * `contained_by`: If true, then a source span is considered to overlap 70 | a target span when the source span is contained by the target span. 71 | * `partial_overlap`: If true, then a source span is considered to 72 | overlap a target span when the source span partially overlaps the target 73 | span. 74 | * `multivalent_result`: Whether the result should contain a single 75 | target span index (if `multivalent_result=False`) or a list of target span 76 | indices (if `multivalent_result=True`) for each source span. 77 | * `name`: A name for the operation (optional). 78 | 79 | #### Returns: 80 | 81 | An int64 tensor with values in the range: `-1 <= result < target_size`. 
If 82 | `multivalent_result=False`, then the returned tensor has shape `[source_size]`, 83 | where `source_size` is the length of the `source_start` and `source_limit` input 84 | tensors. If `multivalent_result=True`, then the returned tensor has shape 85 | `[source_size, (num_aligned_target_spans)]. 86 | 87 | #### Examples: 88 | 89 | Given the following source and target spans (with no batch dimensions): 90 | 91 | ```python 92 | >>> # 0 5 10 15 20 25 30 35 40 45 50 55 60 93 | >>> # |====|====|====|====|====|====|====|====|====|====|====|====| 94 | >>> # Source: [-0-] [-1-] [2] [3] [4][-5-][-6-][-7-][-8-][-9-] 95 | >>> # Target: [-0-][-1-] [-2-][-3-][-4-] [5] [6] [7] [-8-][-9-][10] 96 | >>> # |====|====|====|====|====|====|====|====|====|====|====|====| 97 | >>> source_start=[0, 10, 16, 20, 27, 30, 35, 40, 45, 50] 98 | >>> source_limit=[5, 15, 19, 23, 30, 35, 40, 45, 50, 55] 99 | >>> target_start=[0, 5, 15, 20, 25, 31, 35, 42, 47, 52, 57] 100 | >>> target_limit=[5, 10, 20, 25, 30, 34, 38, 45, 52, 57, 61] 101 | 102 | >>> span_alignment_lists(source_starts, source_limits, 103 | target_starts, target_limits) 104 | [0, -1, -1, -1, -1, -1, -1, -1, -1, -1] 105 | >>> span_alignment_lists(source_starts, source_limits, 106 | ... target_starts, target_limits, 107 | ... multivalent_result=True) 108 | [[0], [], [], [], [], [], [], [], [], []] 109 | 110 | >>> span_alignment_lists(source_starts, source_limits, 111 | ... target_starts, target_limits, 112 | ... contains=True) 113 | [ 0, -1, -1, -1, -1, 5, 6, 7, -1, -1] 114 | 115 | >>> span_alignment_lists(source_starts, source_limits, 116 | ... target_starts, target_limits, 117 | ... partial_overlap=True, 118 | ... multivalent_result=True) 119 | [[0], [], [2], [3], [4], [5], [6], [7], [8], [8, 9]] -------------------------------------------------------------------------------- /tensorflow_text/python/ops/sentence_breaking_ops.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Break sentence ops.""" 17 | 18 | from tensorflow.python.ops.ragged import ragged_tensor 19 | 20 | from tensorflow.python.framework import load_library 21 | from tensorflow.python.platform import resource_loader 22 | gen_sentence_breaking_ops = load_library.load_op_library(resource_loader.get_path_to_datafile('_sentence_breaking_ops.so')) 23 | 24 | 25 | def sentence_fragments(token_word, 26 | token_starts, 27 | token_ends, 28 | token_properties, 29 | input_encoding='UTF-8', 30 | errors='replace', 31 | replacement_char=0xFFFD, 32 | replace_control_characters=False): 33 | """Find the sentence fragments in a given text. 34 | 35 | A sentence fragment is a potential next sentence determined using 36 | deterministic heuristics based on punctuation, capitalization, and similar 37 | text attributes. 
38 | 39 | Args: 40 | token_word: A Tensor (w/ rank=2) or a RaggedTensor (w/ ragged_rank=1) 41 | containing the token strings. 42 | token_starts: A Tensor (w/ rank=2) or a RaggedTensor (w/ ragged_rank=1) 43 | containing offsets where the token starts. 44 | token_ends: A Tensor (w/ rank=2) or a RaggedTensor (w/ ragged_rank=1) 45 | containing offsets where the token ends. 46 | token_properties: A Tensor (w/ rank=2) or a RaggedTensor (w/ ragged_rank=1) 47 | containing a bitmask. 48 | 49 | The values of the bitmask are: 50 | 0x01 (ILL_FORMED) - Text is ill-formed according to TextExtractor; 51 | typically applies to all tokens of a paragraph that is too short or 52 | lacks terminal punctuation. 0x40 (TITLE) 53 | 0x02 (HEADING) 54 | 0x04 (BOLD) 55 | 0x10 (UNDERLINED) 56 | 0x20 (LIST) 57 | 0x80 (EMOTICON) 58 | 0x100 (ACRONYM) - Token was identified by Lexer as an acronym. Lexer 59 | identifies period-, hyphen-, and space-separated acronyms: "U.S.", 60 | "U-S", and "U S". Lexer normalizes all three to "US", but the token 61 | word field normalizes only space-separated acronyms. 62 | 0x200 (HYPERLINK) - Indicates that the token (or part of the token) is a 63 | covered by at least one hyperlink. More information of the hyperlink 64 | is stored in the first token covered by the hyperlink. 65 | input_encoding: String name for the unicode encoding that should be used to 66 | decode each string. 67 | errors: Specifies the response when an input string can't be converted 68 | using the indicated encoding. One of: 69 | * `'strict'`: Raise an exception for any illegal substrings. 70 | * `'replace'`: Replace illegal substrings with `replacement_char`. 71 | * `'ignore'`: Skip illegal substrings. 72 | replacement_char: The replacement codepoint to be used in place of invalid 73 | substrings in `input` when `errors='replace'`; and in place of C0 control 74 | characters in `input` when `replace_control_characters=True`. 75 | replace_control_characters: Whether to replace the C0 control characters 76 | `(U+0000 - U+001F)` with the `replacement_char`. 77 | Returns: 78 | A RaggedTensor of `fragment_start`, `fragment_end`, `fragment_properties` 79 | and `terminal_punc_token`. 80 | 81 | `fragment_properties` is an int32 bitmask whose values may contain: 82 | 1 = fragment ends with terminal punctuation 83 | 2 = fragment ends with multiple terminal punctuations (e.g. 84 | "She said what?!") 85 | 3 = Has close parenthesis (e.g. "Mushrooms (they're fungi).") 86 | 4 = Has sentential close parenthesis (e.g. "(Mushrooms are fungi!)" 87 | 88 | `terminal_punc_token` is a RaggedTensor containing the index of terminal 89 | punctuation token immediately following the last word in the fragment 90 | -- or index of the last word itself, if it's an acronym (since acronyms 91 | include the terminal punctuation). index of the terminal punctuation 92 | token. 
93 | """ 94 | if not isinstance(token_starts, ragged_tensor.RaggedTensor): 95 | token_starts = ragged_tensor.RaggedTensor.from_tensor(token_starts) 96 | if not isinstance(token_ends, ragged_tensor.RaggedTensor): 97 | token_ends = ragged_tensor.RaggedTensor.from_tensor(token_ends) 98 | if not isinstance(token_word, ragged_tensor.RaggedTensor): 99 | token_word = ragged_tensor.RaggedTensor.from_tensor(token_word) 100 | if not isinstance(token_properties, ragged_tensor.RaggedTensor): 101 | token_properties = ragged_tensor.RaggedTensor.from_tensor(token_properties) 102 | 103 | fragment = gen_sentence_breaking_ops.sentence_fragments( 104 | errors=errors, 105 | replacement_char=replacement_char, 106 | replace_control_characters=replace_control_characters, 107 | input_encoding=input_encoding, 108 | row_lengths=token_starts.row_lengths(), 109 | token_start=token_starts.flat_values, 110 | token_end=token_ends.flat_values, 111 | token_word=token_word.flat_values, 112 | token_properties=token_properties.flat_values) 113 | start, end, properties, terminal_punc_token, row_lengths = fragment 114 | return tuple( 115 | ragged_tensor.RaggedTensor.from_row_lengths(value, row_lengths) 116 | for value in [start, end, properties, terminal_punc_token]) 117 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/UnicodeScriptTokenizer.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | # text.UnicodeScriptTokenizer 16 | 17 | ## Class `UnicodeScriptTokenizer` 18 | 19 | Tokenizes a tensor of UTF-8 strings on Unicode script boundaries. 20 | 21 | Inherits From: [`TokenizerWithOffsets`](../text/TokenizerWithOffsets.md) 22 | 23 | Defined in 24 | [`python/ops/unicode_script_tokenizer.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/unicode_script_tokenizer.py). 25 | 26 | 27 | 28 |
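A minimal usage sketch (the mixed-script input is illustrative): tokens are split wherever the Unicode script changes, and whitespace is dropped.

```python
import tensorflow_text as text

tokenizer = text.UnicodeScriptTokenizer()
# Latin text, Common-script punctuation, and Han characters are separated.
tokens = tokenizer.tokenize(['Hello, 世界!'])
# tokens ==> roughly [['Hello', ',', '世界', '!']]
```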

__init__

29 | 30 | ```python 31 | __init__(name=None) 32 | ``` 33 | 34 | ## Properties 35 | 36 |

name

37 | 38 | Returns the name of this module as passed or determined in the ctor. 39 | 40 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 41 | module names. 42 | 43 |

name_scope

44 | 45 | Returns a `tf.name_scope` instance for this class. 46 | 47 |

submodules

48 | 49 | Sequence of all sub-modules. 50 | 51 | Submodules are modules which are properties of this module, or found as 52 | properties of modules which are properties of this module (and so on). 53 | 54 | ``` 55 | a = tf.Module() 56 | b = tf.Module() 57 | c = tf.Module() 58 | a.b = b 59 | b.c = c 60 | assert list(a.submodules) == [b, c] 61 | assert list(b.submodules) == [c] 62 | assert list(c.submodules) == [] 63 | ``` 64 | 65 | #### Returns: 66 | 67 | A sequence of all submodules. 68 | 69 |

trainable_variables

70 | 71 | Sequence of variables owned by this module and its submodules. 72 | 73 | Note: this method uses reflection to find variables on the current instance and 74 | submodules. For performance reasons you may wish to cache the result of calling 75 | this method if you don't expect the return value to change. 76 | 77 | #### Returns: 78 | 79 | A sequence of variables for the current module (sorted by attribute name) 80 | followed by variables from all submodules recursively (breadth first). 81 | 82 |

variables

83 | 84 | Sequence of variables owned by this module and its submodules. 85 | 86 | Note: this method uses reflection to find variables on the current instance and 87 | submodules. For performance reasons you may wish to cache the result of calling 88 | this method if you don't expect the return value to change. 89 | 90 | #### Returns: 91 | 92 | A sequence of variables for the current module (sorted by attribute name) 93 | followed by variables from all submodules recursively (breadth first). 94 | 95 | ## Methods 96 | 97 |

tokenize

98 | 99 | ```python 100 | tokenize(input) 101 | ``` 102 | 103 | Tokenizes a tensor of UTF-8 strings on Unicode script boundaries. 104 | 105 | The strings are split when a change in the Unicode script is detected between 106 | sequential tokens. The script codes used correspond to International Components 107 | for Unicode (ICU) UScriptCode values. See: 108 | http://icu-project.org/apiref/icu4c/uscript_8h.html 109 | 110 | ICU-defined whitespace characters are dropped. 111 | 112 | #### Args: 113 | 114 | * `input`: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape. 115 | 116 | #### Returns: 117 | 118 | A RaggedTensor of tokenized text. The returned shape is the shape of the input 119 | tensor with an added ragged dimension for tokens of each string. 120 | 121 |

tokenize_with_offsets

122 | 123 | ```python 124 | tokenize_with_offsets(input) 125 | ``` 126 | 127 | Tokenizes a tensor of UTF-8 strings on Unicode script boundaries. 128 | 129 | The strings are split when a change in the Unicode script is detected between 130 | sequential tokens. The script codes used correspond to International Components 131 | for Unicode (ICU) UScriptCode values. See: 132 | http://icu-project.org/apiref/icu4c/uscript_8h.html 133 | 134 | ICU-defined whitespace characters are dropped. 135 | 136 | #### Args: 137 | 138 | * `input`: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape. 139 | 140 | #### Returns: 141 | 142 | A tuple of `RaggedTensor`s `tokens`, `start_offsets`, and `limit_offsets`, where: 143 | 144 | * `tokens`: A `RaggedTensor` of tokenized text. 145 | * `start_offsets`: A `RaggedTensor` of the tokens' starting byte offset. 146 | * `limit_offsets`: A `RaggedTensor` of the tokens' ending byte offset. 147 | 148 |
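As with the other tokenizers, the returned offsets are byte (not character) positions, which matters for multi-byte UTF-8 characters; a short sketch with an assumed input:

```python
import tensorflow_text as text

tokenizer = text.UnicodeScriptTokenizer()
tokens, starts, limits = tokenizer.tokenize_with_offsets(['I ♥ tokens'])
# '♥' occupies 3 bytes in UTF-8, so the offsets below count bytes.
# tokens ==> roughly [['I', '♥', 'tokens']]
# starts ==> [[0, 2, 6]]
# limits ==> [[1, 5, 12]]
```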

with_name_scope

149 | 150 | ```python 151 | with_name_scope( 152 | cls, 153 | method 154 | ) 155 | ``` 156 | 157 | Decorator to automatically enter the module name scope. 158 | 159 | ``` 160 | class MyModule(tf.Module): 161 | @tf.Module.with_name_scope 162 | def __call__(self, x): 163 | if not hasattr(self, 'w'): 164 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 165 | return tf.matmul(x, self.w) 166 | ``` 167 | 168 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 169 | included the module name: 170 | 171 | ``` 172 | mod = MyModule() 173 | mod(tf.ones([8, 32])) 174 | # ==> 175 | mod.w 176 | # ==> 177 | ``` 178 | 179 | #### Args: 180 | 181 | * `method`: The method to wrap. 182 | 183 | #### Returns: 184 | 185 | The original method wrapped such that it enters the module's name scope. 186 | -------------------------------------------------------------------------------- /tensorflow_text/core/kernels/normalize_kernels.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 TF.Text Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | 18 | #include "absl/strings/ascii.h" 19 | #include "absl/strings/str_cat.h" 20 | #include "icu4c/source/common/unicode/errorcode.h" 21 | #include "icu4c/source/common/unicode/normalizer2.h" 22 | #include "icu4c/source/common/unicode/utypes.h" 23 | #include "tensorflow/core/framework/op_kernel.h" 24 | 25 | namespace tensorflow { 26 | namespace text { 27 | 28 | class CaseFoldUTF8Op : public tensorflow::OpKernel { 29 | public: 30 | explicit CaseFoldUTF8Op(tensorflow::OpKernelConstruction* context) 31 | : tensorflow::OpKernel(context) {} 32 | 33 | void Compute(tensorflow::OpKernelContext* context) override { 34 | const tensorflow::Tensor* input_tensor; 35 | OP_REQUIRES_OK(context, context->input("input", &input_tensor)); 36 | const auto& input_vec = input_tensor->flat(); 37 | 38 | // TODO(gregbillock): support forwarding 39 | tensorflow::Tensor* output_tensor; 40 | OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), 41 | &output_tensor)); 42 | auto output_vec = output_tensor->flat(); 43 | 44 | icu::ErrorCode icu_error; 45 | const icu::Normalizer2* nfkc_cf = icu::Normalizer2::getNFKCCasefoldInstance( 46 | icu_error); 47 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 48 | absl::StrCat(icu_error.errorName(), 49 | ": Could not retrieve ICU NFKC_CaseFold normalizer"))); 50 | 51 | for (int64 i = 0; i < input_vec.size(); ++i) { 52 | string output_text; 53 | icu::StringByteSink byte_sink(&output_text); 54 | nfkc_cf->normalizeUTF8(0, input_vec(i), byte_sink, nullptr, icu_error); 55 | OP_REQUIRES(context, !U_FAILURE(icu_error), errors::Internal( 56 | "Could not normalize input string: " + input_vec(i))); 57 | output_vec(i) = output_text; 58 | } 59 | } 60 | }; 61 | 62 | REGISTER_KERNEL_BUILDER(Name("CaseFoldUTF8").Device(tensorflow::DEVICE_CPU), 63 | CaseFoldUTF8Op); 64 | 
65 | namespace { 66 | 67 | string GetNormalizationForm(OpKernelConstruction* context) { 68 | string normalization_form; 69 | ([=](string* c) -> void { 70 | OP_REQUIRES_OK(context, context->GetAttr("normalization_form", c)); 71 | })(&normalization_form); 72 | return absl::AsciiStrToUpper(normalization_form); 73 | } 74 | 75 | } // namespace 76 | 77 | class NormalizeUTF8Op : public tensorflow::OpKernel { 78 | public: 79 | explicit NormalizeUTF8Op(tensorflow::OpKernelConstruction* context) 80 | : tensorflow::OpKernel(context), 81 | normalization_form_(GetNormalizationForm(context)) {} 82 | 83 | void Compute(tensorflow::OpKernelContext* context) override { 84 | const tensorflow::Tensor* input_tensor; 85 | OP_REQUIRES_OK(context, context->input("input", &input_tensor)); 86 | const auto& input_vec = input_tensor->flat(); 87 | 88 | tensorflow::Tensor* output_tensor; 89 | OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), 90 | &output_tensor)); 91 | auto output_vec = output_tensor->flat(); 92 | 93 | icu::ErrorCode icu_error; 94 | const icu::Normalizer2* normalizer = nullptr; 95 | if (normalization_form_ == "NFKC") { 96 | normalizer = icu::Normalizer2::getNFKCInstance(icu_error); 97 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 98 | absl::StrCat(icu_error.errorName(), 99 | ": Could not retrieve ICU NFKC normalizer"))); 100 | } else if (normalization_form_ == "NFC") { 101 | normalizer = icu::Normalizer2::getNFCInstance(icu_error); 102 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 103 | absl::StrCat(icu_error.errorName(), 104 | ": Could not retrieve ICU NFC normalizer"))); 105 | } else if (normalization_form_ == "NFD") { 106 | normalizer = icu::Normalizer2::getNFDInstance(icu_error); 107 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 108 | absl::StrCat(icu_error.errorName(), 109 | ": Could not retrieve ICU NFD normalizer"))); 110 | } else if (normalization_form_ == "NFKD") { 111 | normalizer = icu::Normalizer2::getNFKDInstance(icu_error); 112 | OP_REQUIRES(context, icu_error.isSuccess(), errors::Internal( 113 | absl::StrCat(icu_error.errorName(), 114 | ": Could not retrieve ICU NFKd normalizer"))); 115 | } else { 116 | OP_REQUIRES( 117 | context, false, 118 | errors::InvalidArgument(absl::StrCat( 119 | "Unknown normalization form requrested: ", normalization_form_))); 120 | } 121 | 122 | for (int64 i = 0; i < input_vec.size(); ++i) { 123 | string output_text; 124 | icu::StringByteSink byte_sink(&output_text); 125 | normalizer->normalizeUTF8(0, input_vec(i), byte_sink, nullptr, icu_error); 126 | OP_REQUIRES(context, !U_FAILURE(icu_error), errors::Internal( 127 | absl::StrCat(icu_error.errorName(), 128 | ": Could not normalize input string: ", input_vec(i)))); 129 | output_vec(i) = output_text; 130 | } 131 | } 132 | 133 | private: 134 | string normalization_form_; 135 | }; 136 | 137 | REGISTER_KERNEL_BUILDER(Name("NormalizeUTF8").Device(tensorflow::DEVICE_CPU), 138 | NormalizeUTF8Op); 139 | 140 | } // namespace text 141 | } // namespace tensorflow 142 | -------------------------------------------------------------------------------- /tensorflow_text/python/numpy/viterbi_decode.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Helper functions for decoding Viterbi sequences outside of Tensorflow. 17 | 18 | viterbi_decode provides known-tested snippets for Viterbi decoding in log and 19 | standard space for use outside of a Tensorflow graph. 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import numpy as np 27 | 28 | 29 | def decode(score, 30 | transition_params=None, 31 | allowed_transitions=None, 32 | use_log_space=True, 33 | use_start_and_end_states=False): 34 | """Decode the highest scoring sequence of tags. 35 | 36 | This function uses numpy instead of Tensorflow ops, and so cannot be used 37 | inside a Tensorflow graph or function. 38 | 39 | Args: 40 | score: A [seq_len, num_tags] matrix of unary potentials. 41 | transition_params: A [num_tags, num_tags] matrix of binary potentials. 42 | allowed_transitions: A [num_tags, num_tags] matrix where FALSE indicates 43 | a transition that cannot be taken. 44 | use_log_space: Whether to perform the Viterbi calculation in logarithmic 45 | space. 46 | use_start_and_end_states: If True, add an implicit 'start' and 'end' state 47 | to the start and end of the given sequence. If this is True, 48 | transition_params should contain an extra row and column, representing 49 | potentials for starting/ending a sequence with a given state. These values 50 | should occupy the outermost row and column of the transition_params 51 | matrix. 52 | 53 | Returns: 54 | viterbi: A [seq_len] list of integers containing the highest scoring tag 55 | indices. 56 | viterbi_score: A float containing the score for the Viterbi sequence. 57 | """ 58 | if transition_params is None: 59 | num_tags = score.shape[-1] 60 | if use_log_space: 61 | transition_params = np.zeros((num_tags, num_tags)) 62 | else: 63 | transition_params = np.ones((num_tags, num_tags)) 64 | 65 | if allowed_transitions is not None: 66 | if use_log_space: 67 | transition_mask = np.where(allowed_transitions, 1, -float("inf")) 68 | else: 69 | transition_mask = np.where(allowed_transitions, 1, 0.0) 70 | 71 | transition_params = transition_params * transition_mask 72 | 73 | if use_log_space: 74 | return _decode_in_log_space(score, transition_params, 75 | use_start_and_end_states) 76 | else: 77 | return _decode_in_exp_space(score, transition_params, 78 | use_start_and_end_states) 79 | 80 | 81 | def _decode_in_log_space(score, transition_params, use_start_and_end_states): 82 | """Perform Viterbi decoding in log space.""" 83 | trellis = np.zeros_like(score) 84 | backpointers = np.zeros_like(score, dtype=np.int32) 85 | 86 | if use_start_and_end_states: 87 | start_potentials = transition_params[-1, :-1] 88 | end_potentials = transition_params[:-1, -1] 89 | transition_potentials = transition_params[:-1, :-1] 90 | else: 91 | transition_potentials = transition_params 92 | 93 | # Calculate the start value. 94 | if use_start_and_end_states: 95 | trellis[0] = score[0] + start_potentials 96 | else: 97 | trellis[0] = score[0] 98 | 99 | # Calculate intermediate values.
100 | for t in range(1, score.shape[0]): 101 | v = np.expand_dims(trellis[t - 1], 1) + transition_potentials 102 | trellis[t] = score[t] + np.max(v, 0) 103 | backpointers[t] = np.argmax(v, 0) 104 | 105 | # If we are using explicit start and end states, change the final scores 106 | # based on the final state's potentials. 107 | if use_start_and_end_states: 108 | final_scores = trellis[-1] + end_potentials 109 | else: 110 | final_scores = trellis[-1] 111 | 112 | viterbi = [np.argmax(final_scores)] 113 | for bp in reversed(backpointers[1:]): 114 | viterbi.append(bp[viterbi[-1]]) 115 | viterbi.reverse() 116 | 117 | viterbi_score = np.max(final_scores) 118 | 119 | return viterbi, viterbi_score 120 | 121 | 122 | def _decode_in_exp_space(score, transition_params, use_start_and_end_states): 123 | """Perform Viterbi decoding in exp space.""" 124 | if np.any(transition_params < 0): 125 | raise ValueError("Transition params must be non-negative in exp space.") 126 | trellis = np.zeros_like(score) 127 | backpointers = np.zeros_like(score, dtype=np.int32) 128 | max_scores = np.zeros(score.shape[0]) 129 | 130 | if use_start_and_end_states: 131 | start_potentials = transition_params[-1, :-1] 132 | end_potentials = transition_params[:-1, -1] 133 | transition_potentials = transition_params[:-1, :-1] 134 | else: 135 | transition_potentials = transition_params 136 | 137 | # Calculate the start value. 138 | if use_start_and_end_states: 139 | trellis[0] = score[0] * start_potentials 140 | else: 141 | trellis[0] = score[0] 142 | 143 | max_scores[0] = np.max(trellis[0]) 144 | trellis[0] = trellis[0] / max_scores[0] 145 | 146 | # Calculate intermediate values. 147 | for t in range(1, score.shape[0]): 148 | v = np.expand_dims(trellis[t - 1], 1) * transition_potentials 149 | trellis[t] = score[t] * np.max(v, 0) 150 | backpointers[t] = np.argmax(v, 0) 151 | max_scores[t] = np.max(trellis[t]) 152 | trellis[t] = trellis[t] / max_scores[t] 153 | 154 | # If we are using explicit start and end states, change the final scores 155 | # based on the final state's potentials. 
156 | if use_start_and_end_states: 157 | final_scores = trellis[-1] * end_potentials 158 | else: 159 | final_scores = trellis[-1] 160 | 161 | viterbi = [np.argmax(final_scores)] 162 | for bp in reversed(backpointers[1:]): 163 | viterbi.append(bp[viterbi[-1]]) 164 | viterbi.reverse() 165 | 166 | viterbi_score = np.max(final_scores) * np.prod(max_scores) 167 | return viterbi, viterbi_score 168 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/_api_cache.json: -------------------------------------------------------------------------------- 1 | { 2 | "duplicate_of": { 3 | "text.TokenizerWithOffsets.name": "text.Tokenizer.name", 4 | "text.TokenizerWithOffsets.name_scope": "text.Tokenizer.name_scope", 5 | "text.TokenizerWithOffsets.submodules": "text.Tokenizer.submodules", 6 | "text.TokenizerWithOffsets.trainable_variables": "text.Tokenizer.trainable_variables", 7 | "text.TokenizerWithOffsets.variables": "text.Tokenizer.variables", 8 | "text.UnicodeScriptTokenizer.name": "text.Tokenizer.name", 9 | "text.UnicodeScriptTokenizer.name_scope": "text.Tokenizer.name_scope", 10 | "text.UnicodeScriptTokenizer.submodules": "text.Tokenizer.submodules", 11 | "text.UnicodeScriptTokenizer.trainable_variables": "text.Tokenizer.trainable_variables", 12 | "text.UnicodeScriptTokenizer.variables": "text.Tokenizer.variables", 13 | "text.WhitespaceTokenizer.name": "text.Tokenizer.name", 14 | "text.WhitespaceTokenizer.name_scope": "text.Tokenizer.name_scope", 15 | "text.WhitespaceTokenizer.submodules": "text.Tokenizer.submodules", 16 | "text.WhitespaceTokenizer.trainable_variables": "text.Tokenizer.trainable_variables", 17 | "text.WhitespaceTokenizer.variables": "text.Tokenizer.variables", 18 | "text.WordpieceTokenizer.name": "text.Tokenizer.name", 19 | "text.WordpieceTokenizer.name_scope": "text.Tokenizer.name_scope", 20 | "text.WordpieceTokenizer.submodules": "text.Tokenizer.submodules", 21 | "text.WordpieceTokenizer.trainable_variables": "text.Tokenizer.trainable_variables", 22 | "text.WordpieceTokenizer.variables": "text.Tokenizer.variables" 23 | }, 24 | "is_fragment": { 25 | "text": false, 26 | "text.Reduction": false, 27 | "text.Reduction.MEAN": true, 28 | "text.Reduction.STRING_JOIN": true, 29 | "text.Reduction.SUM": true, 30 | "text.Tokenizer": false, 31 | "text.Tokenizer.__init__": true, 32 | "text.Tokenizer.name": true, 33 | "text.Tokenizer.name_scope": true, 34 | "text.Tokenizer.submodules": true, 35 | "text.Tokenizer.tokenize": true, 36 | "text.Tokenizer.trainable_variables": true, 37 | "text.Tokenizer.variables": true, 38 | "text.Tokenizer.with_name_scope": true, 39 | "text.TokenizerWithOffsets": false, 40 | "text.TokenizerWithOffsets.__init__": true, 41 | "text.TokenizerWithOffsets.name": true, 42 | "text.TokenizerWithOffsets.name_scope": true, 43 | "text.TokenizerWithOffsets.submodules": true, 44 | "text.TokenizerWithOffsets.tokenize": true, 45 | "text.TokenizerWithOffsets.tokenize_with_offsets": true, 46 | "text.TokenizerWithOffsets.trainable_variables": true, 47 | "text.TokenizerWithOffsets.variables": true, 48 | "text.TokenizerWithOffsets.with_name_scope": true, 49 | "text.UnicodeScriptTokenizer": false, 50 | "text.UnicodeScriptTokenizer.__init__": true, 51 | "text.UnicodeScriptTokenizer.name": true, 52 | "text.UnicodeScriptTokenizer.name_scope": true, 53 | "text.UnicodeScriptTokenizer.submodules": true, 54 | "text.UnicodeScriptTokenizer.tokenize": true, 55 | "text.UnicodeScriptTokenizer.tokenize_with_offsets": true, 56 | 
"text.UnicodeScriptTokenizer.trainable_variables": true, 57 | "text.UnicodeScriptTokenizer.variables": true, 58 | "text.UnicodeScriptTokenizer.with_name_scope": true, 59 | "text.WhitespaceTokenizer": false, 60 | "text.WhitespaceTokenizer.__init__": true, 61 | "text.WhitespaceTokenizer.name": true, 62 | "text.WhitespaceTokenizer.name_scope": true, 63 | "text.WhitespaceTokenizer.submodules": true, 64 | "text.WhitespaceTokenizer.tokenize": true, 65 | "text.WhitespaceTokenizer.tokenize_with_offsets": true, 66 | "text.WhitespaceTokenizer.trainable_variables": true, 67 | "text.WhitespaceTokenizer.variables": true, 68 | "text.WhitespaceTokenizer.with_name_scope": true, 69 | "text.WordShape": false, 70 | "text.WordShape.BEGINS_WITH_OPEN_QUOTE": true, 71 | "text.WordShape.BEGINS_WITH_PUNCT_OR_SYMBOL": true, 72 | "text.WordShape.ENDS_WITH_CLOSE_QUOTE": true, 73 | "text.WordShape.ENDS_WITH_ELLIPSIS": true, 74 | "text.WordShape.ENDS_WITH_EMOTICON": true, 75 | "text.WordShape.ENDS_WITH_MULTIPLE_SENTENCE_TERMINAL": true, 76 | "text.WordShape.ENDS_WITH_MULTIPLE_TERMINAL_PUNCT": true, 77 | "text.WordShape.ENDS_WITH_PUNCT_OR_SYMBOL": true, 78 | "text.WordShape.ENDS_WITH_SENTENCE_TERMINAL": true, 79 | "text.WordShape.ENDS_WITH_TERMINAL_PUNCT": true, 80 | "text.WordShape.HAS_CURRENCY_SYMBOL": true, 81 | "text.WordShape.HAS_EMOJI": true, 82 | "text.WordShape.HAS_MATH_SYMBOL": true, 83 | "text.WordShape.HAS_MIXED_CASE": true, 84 | "text.WordShape.HAS_NON_LETTER": true, 85 | "text.WordShape.HAS_NO_DIGITS": true, 86 | "text.WordShape.HAS_NO_PUNCT_OR_SYMBOL": true, 87 | "text.WordShape.HAS_NO_QUOTES": true, 88 | "text.WordShape.HAS_ONLY_DIGITS": true, 89 | "text.WordShape.HAS_PUNCTUATION_DASH": true, 90 | "text.WordShape.HAS_QUOTE": true, 91 | "text.WordShape.HAS_SOME_DIGITS": true, 92 | "text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL": true, 93 | "text.WordShape.HAS_TITLE_CASE": true, 94 | "text.WordShape.IS_ACRONYM_WITH_PERIODS": true, 95 | "text.WordShape.IS_EMOTICON": true, 96 | "text.WordShape.IS_LOWERCASE": true, 97 | "text.WordShape.IS_MIXED_CASE_LETTERS": true, 98 | "text.WordShape.IS_NUMERIC_VALUE": true, 99 | "text.WordShape.IS_PUNCT_OR_SYMBOL": true, 100 | "text.WordShape.IS_UPPERCASE": true, 101 | "text.WordShape.IS_WHITESPACE": true, 102 | "text.WordpieceTokenizer": false, 103 | "text.WordpieceTokenizer.__init__": true, 104 | "text.WordpieceTokenizer.name": true, 105 | "text.WordpieceTokenizer.name_scope": true, 106 | "text.WordpieceTokenizer.submodules": true, 107 | "text.WordpieceTokenizer.tokenize": true, 108 | "text.WordpieceTokenizer.tokenize_with_offsets": true, 109 | "text.WordpieceTokenizer.trainable_variables": true, 110 | "text.WordpieceTokenizer.variables": true, 111 | "text.WordpieceTokenizer.with_name_scope": true, 112 | "text.case_fold_utf8": false, 113 | "text.coerce_to_structurally_valid_utf8": false, 114 | "text.gather_with_default": false, 115 | "text.greedy_constrained_sequence": false, 116 | "text.ngrams": false, 117 | "text.normalize_utf8": false, 118 | "text.pad_along_dimension": false, 119 | "text.sentence_fragments": false, 120 | "text.sliding_window": false, 121 | "text.span_alignment": false, 122 | "text.span_overlaps": false, 123 | "text.viterbi_constrained_sequence": false, 124 | "text.wordshape": false 125 | }, 126 | "py_module_names": [ 127 | "text" 128 | ] 129 | } -------------------------------------------------------------------------------- /tensorflow_text/python/ops/sliding_window_op.py: -------------------------------------------------------------------------------- 1 | 
# coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Sliding window op. 17 | 18 | Returns a sliding window of data with a specified width. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import print_function 23 | 24 | from tensorflow.python.framework import errors 25 | from tensorflow.python.framework import ops 26 | from tensorflow.python.ops import array_ops 27 | from tensorflow.python.ops.ragged import ragged_tensor 28 | 29 | 30 | def sliding_window(data, width, axis=-1, name=None): 31 | """Builds a sliding window for `data` with a specified width. 32 | 33 | Returns a tensor constructed from `data`, where each element in 34 | dimension `axis` is a slice of `data` starting at the corresponding 35 | position, with the given width and step size. I.e.: 36 | 37 | * `result.shape.ndims = data.shape.ndims + 1` 38 | * `result[i1..iaxis, a] = data[i1..iaxis, a:a+width]` 39 | (where `0 <= a < data[i1...iaxis].shape[0] - (width - 1)`). 40 | 41 | Note that each result row (along dimension `axis`) has `width - 1` fewer items 42 | than the corresponding `data` row. If a `data` row has fewer than `width` 43 | items, then the corresponding `result` row will be empty. If you wish for 44 | the `result` rows to be the same size as the `data` rows, you can use 45 | `pad_along_dimension` to add `width - 1` padding elements before calling 46 | this op. 47 | 48 | Args: 49 | data: ` [O1...ON, A, I1...IM]` 50 | A potentially ragged K-dimensional tensor with outer dimensions of size 51 | `O1...ON`; axis dimension of size `A`; and inner dimensions of size 52 | `I1...IM`. I.e. `K = N + 1 + M`, where `N>=0` and `M>=0`. 53 | 54 | width: An integer constant specifying the width of the window. Must be 55 | greater than zero. 56 | 57 | axis: An integer constant specifying the axis along which sliding window 58 | is computed. Negative axis values from `-K` to `-1` are supported. 
59 | 60 | name: The name for this op (optional) 61 | 62 | Returns: 63 | A `K+1` dimensional tensor with the same dtype as `data`, where: 64 | 65 | * `result[i1..iaxis, a]` = `data[i1..iaxis, a:a+width]` 66 | * `result.shape[:axis]` = `data.shape[:axis]` 67 | * `result.shape[axis]` = `data.shape[axis] - (width - 1)` 68 | * `result.shape[axis + 1]` = `width` 69 | * `result.shape[axis + 2:]` = `data.shape[axis + 1:]` 70 | 71 | #### Examples: 72 | 73 | Sliding window (width=3) across a sequence of tokens: 74 | 75 | ```python 76 | >>> # input: [sequence_length] 77 | >>> input = tf.constant(["one", "two", "three", "four", "five", "six"]) 78 | >>> # output: [sequence_length-2, 3] 79 | >>> output = sliding_window(data=input, width=3, axis=0) 80 | >>> print output.eval() 81 | [["one", "two", "three"], 82 | ["two", "three", "four"], 83 | ["three", "four", "five"], 84 | ["four", "five", "six"]] 85 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 86 | Shape: (6,) -> (4, 3) 87 | ``` 88 | 89 | Sliding window (width=2) across the inner dimension of a ragged matrix 90 | containing a batch of token sequences: 91 | 92 | ```python 93 | >>> # input: [num_sentences, (num_words)] 94 | >>> input = tf.ragged.constant( 95 | ... [['Up', 'high', 'in', 'the', 'air'], 96 | ... ['Down', 'under', 'water'], 97 | ... ['Away', 'to', 'outer', 'space']]) 98 | >>> # output: [num_sentences, (num_words-1), 2] 99 | >>> output = sliding_window(input, width=2, axis=-1) 100 | >>> print output.eval() 101 | [[['Up', 'high'], ['high', 'in'], ['in', 'the'], ['the', 'air']], 102 | [['Down', 'under'], ['under', 'water']], 103 | [['Away', 'to'], ['to', 'outer'], ['outer', 'space']]] 104 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 105 | Shape: (3, ?) -> (3, ?, 2) 106 | ``` 107 | 108 | Sliding window across the second dimension of a 3-D tensor containing 109 | batches of sequences of embedding vectors: 110 | 111 | ```python 112 | >>> # input: [num_sequences, sequence_length, embedding_size] 113 | >>> input = tf.constant([ 114 | ... [[1, 1, 1], [2, 2, 1], [3, 3, 1], [4, 4, 1], [5, 5, 1]], 115 | ...
[[1, 1, 2], [2, 2, 2], [3, 3, 2], [4, 4, 2], [5, 5, 2]]]) 116 | >>> # output: [num_sequences, sequence_length-1, 2, embedding_size] 117 | >>> output = sliding_window(data=input, width=2, axis=1) 118 | >>> print output.eval() 119 | [[[[1, 1, 1], [2, 2, 1]], 120 | [[2, 2, 1], [3, 3, 1]], 121 | [[3, 3, 1], [4, 4, 1]], 122 | [[4, 4, 1], [5, 5, 1]]], 123 | [[[1, 1, 2], [2, 2, 2]], 124 | [[2, 2, 2], [3, 3, 2]], 125 | [[3, 3, 2], [4, 4, 2]], 126 | [[4, 4, 2], [5, 5, 2]]]] 127 | >>> print("Shape: %s -> %s" % (input.shape, output.shape)) 128 | Shape: (2, 5, 3) -> (2, 4, 2, 3) 129 | ``` 130 | """ 131 | with ops.name_scope(name, "SlidingWindow", [data, axis]): 132 | data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name="data") 133 | 134 | if not isinstance(axis, int): 135 | raise TypeError("axis must be an int") 136 | 137 | if not isinstance(width, int): 138 | raise TypeError("width must be an int") 139 | 140 | if data.shape.ndims is not None and (axis < -data.shape.ndims or 141 | axis >= data.shape.ndims): 142 | raise errors.InvalidArgumentError( 143 | None, None, "axis must be between -k <= axis <= -1 OR 0 <= axis < k") 144 | 145 | if width <= 0: 146 | raise errors.InvalidArgumentError( 147 | None, None, "width must be an integer greater than 0") 148 | 149 | slices = [] 150 | for start in range(width): 151 | stop = None if start - width + 1 == 0 else start - width + 1 152 | if axis >= 0: 153 | idx = [slice(None)] * axis + [slice(start, stop)] 154 | else: 155 | idx = [Ellipsis, slice(start, stop)] + [slice(None)] * (-axis - 1) 156 | slices.append(data[idx]) 157 | 158 | # Stack the slices. 159 | stack_axis = axis + 1 if axis >= 0 else axis 160 | return array_ops.stack(slices, stack_axis) 161 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/create_feature_bitmask_op_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Tests for create_feature_bitmask_op.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | from tensorflow.python.framework import constant_op 23 | from tensorflow.python.framework import dtypes 24 | from tensorflow.python.framework import errors 25 | from tensorflow.python.framework import test_util 26 | from tensorflow.python.ops import array_ops 27 | from tensorflow.python.platform import test 28 | from tensorflow_text.python.ops import create_feature_bitmask_op 29 | 30 | 31 | @test_util.run_all_in_graph_and_eager_modes 32 | class CreateFeatureBitmaskOpTest(test_util.TensorFlowTestCase): 33 | 34 | def test_docstring_example1(self): 35 | data = [True, False, False, True] 36 | result = create_feature_bitmask_op.create_feature_bitmask(data) 37 | self.assertAllEqual(result, 0b1001) 38 | 39 | def test_docstring_example2(self): 40 | data = [[True, False], [False, True], [True, True]] 41 | result = create_feature_bitmask_op.create_feature_bitmask(data) 42 | expected_result = constant_op.constant([0b10, 0b01, 0b11]) 43 | self.assertAllEqual(result, expected_result) 44 | 45 | def test_feature_bitmask_single_dim_single_tensor(self): 46 | """Test that the op can reduce a single-dimension tensor to a constant.""" 47 | data = constant_op.constant([True, False]) 48 | result = create_feature_bitmask_op.create_feature_bitmask(data) 49 | 50 | expected_result = constant_op.constant(2) 51 | self.assertAllEqual(expected_result, result) 52 | 53 | def test_feature_bitmask_multiple_tensors_stack(self): 54 | """Test that the op can reduce a stacked list of tensors.""" 55 | data_1 = constant_op.constant([True, False]) 56 | data_2 = constant_op.constant([False, True]) 57 | stack_data = array_ops.stack([data_1, data_2], -1) 58 | 59 | expected_result = constant_op.constant([2, 1]) 60 | result = create_feature_bitmask_op.create_feature_bitmask(stack_data) 61 | self.assertAllEqual(expected_result, result) 62 | 63 | def test_feature_bitmask_multi_dim_single_tensor(self): 64 | """Test that the op can reduce a multi-dimension tensor.""" 65 | data = constant_op.constant([[True, True, False], [True, False, False]]) 66 | result = create_feature_bitmask_op.create_feature_bitmask(data) 67 | 68 | expected_result = constant_op.constant([6, 4]) 69 | self.assertAllEqual(expected_result, result) 70 | 71 | def test_feature_bitmask_3_dim_single_tensor(self): 72 | """Test that the op can reduce a 3-dimension tensor.""" 73 | data = constant_op.constant([[[True, True, False], [True, False, False]], 74 | [[False, False, True], [True, False, True]]]) 75 | result = create_feature_bitmask_op.create_feature_bitmask(data) 76 | 77 | expected_result = constant_op.constant([[6, 4], [1, 5]]) 78 | self.assertAllEqual(expected_result, result) 79 | 80 | def test_feature_bitmask_multiple_tensors_multi_dim_stack(self): 81 | """Test that the op can reduce a stacked list of multi-dim tensors.""" 82 | data_1 = constant_op.constant([[True, False], [False, True]]) 83 | data_2 = constant_op.constant([[False, True], [True, True]]) 84 | stack_data = array_ops.stack([data_1, data_2], -1) 85 | 86 | expected_result = constant_op.constant([[2, 1], [1, 3]]) 87 | result = create_feature_bitmask_op.create_feature_bitmask(stack_data) 88 | self.assertAllEqual(expected_result, result) 89 | 90 | def test_supports_tensors_with_unknown_shape(self): 91 | """Test that the op handles tensors with unknown shape.""" 92 | data = array_ops.placeholder_with_default( 93 | 
constant_op.constant([[[True, True, False], [True, False, False]], 94 | [[False, False, True], [True, False, True]]]), 95 | shape=None) 96 | result = create_feature_bitmask_op.create_feature_bitmask(data) 97 | 98 | expected_result = constant_op.constant([[6, 4], [1, 5]]) 99 | 100 | self.assertAllEqual(expected_result, result) 101 | 102 | def test_feature_bitmask_multiple_tensors_error(self): 103 | """Test that the op errors when presented with a single tensor.""" 104 | data_1 = constant_op.constant([True, False]) 105 | data_2 = constant_op.constant([True, True]) 106 | list_data = [data_1, data_2] 107 | error_message = 'CreateFeatureBitmask does not support lists of tensors.*' 108 | 109 | with self.assertRaisesRegexp(errors.InvalidArgumentError, error_message): 110 | _ = create_feature_bitmask_op.create_feature_bitmask(list_data) 111 | 112 | def test_unsupported_dtype_type(self): 113 | data = constant_op.constant([True, False]) 114 | bad_dtype = dtypes.uint32 115 | error_message = 'dtype must be one of: .*, was %s' % bad_dtype.name 116 | 117 | with self.assertRaisesRegexp(errors.InvalidArgumentError, error_message): 118 | _ = create_feature_bitmask_op.create_feature_bitmask( 119 | data, dtype=bad_dtype) 120 | 121 | def test_unsupported_input_type(self): 122 | data = constant_op.constant([1.0, 0.0]) 123 | error_message = ('Tensor conversion requested dtype bool for Tensor' 124 | ' with dtype float32: .*') 125 | 126 | with self.assertRaisesRegexp(ValueError, error_message): 127 | _ = create_feature_bitmask_op.create_feature_bitmask(data) 128 | 129 | def test_larger_than_max_shape(self): 130 | data = array_ops.fill([2, 64], False) 131 | error_message = r'data.shape\[-1\] must be less than 64, is 64.' 132 | 133 | with self.assertRaisesRegexp(ValueError, error_message): 134 | _ = create_feature_bitmask_op.create_feature_bitmask(data) 135 | 136 | def test_larger_than_dtype_shape(self): 137 | data = array_ops.fill([2, 9], False) 138 | error_message = (r'data.shape\[-1\] is too large for %s \(was 9, cannot ' 139 | r'exceed 8\).*') % dtypes.uint8.name 140 | 141 | with self.assertRaisesRegexp(ValueError, error_message): 142 | _ = create_feature_bitmask_op.create_feature_bitmask( 143 | data, dtype=dtypes.uint8) 144 | 145 | def test_larger_than_dtype_shape_at_runtime(self): 146 | data = array_ops.placeholder_with_default( 147 | array_ops.fill([2, 9], False), shape=None) 148 | error_message = (r'.*data.shape\[-1\] is too large for %s.*' % 149 | dtypes.uint8.name) 150 | 151 | with self.assertRaisesRegexp((errors.InvalidArgumentError, ValueError), 152 | error_message): 153 | self.evaluate( 154 | create_feature_bitmask_op.create_feature_bitmask( 155 | data, dtype=dtypes.uint8)) 156 | 157 | 158 | if __name__ == '__main__': 159 | test.main() 160 | -------------------------------------------------------------------------------- /third_party/tensorflow/tf_configure.bzl: -------------------------------------------------------------------------------- 1 | """Setup TensorFlow as external dependency. 2 | 3 | This is used for the generation of the dynamic libraries used for custom ops. 
4 | See: http://github.com/tensorflow/custom-op 5 | """ 6 | 7 | _TF_HEADER_DIR = "TF_HEADER_DIR" 8 | _TF_SHARED_LIBRARY_DIR = "TF_SHARED_LIBRARY_DIR" 9 | 10 | def _tpl(repository_ctx, tpl, substitutions = {}, out = None): 11 | if not out: 12 | out = tpl 13 | repository_ctx.template( 14 | out, 15 | Label("//third_party/tensorflow:%s.tpl" % tpl), 16 | substitutions, 17 | ) 18 | 19 | def _fail(msg): 20 | """Output failure message when auto configuration fails.""" 21 | red = "\033[0;31m" 22 | no_color = "\033[0m" 23 | fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg)) 24 | 25 | def _is_windows(repository_ctx): 26 | """Returns true if the host operating system is windows.""" 27 | os_name = repository_ctx.os.name.lower() 28 | if os_name.find("windows") != -1: 29 | return True 30 | return False 31 | 32 | def _execute( 33 | repository_ctx, 34 | cmdline, 35 | error_msg = None, 36 | error_details = None, 37 | empty_stdout_fine = False): 38 | """Executes an arbitrary shell command. 39 | 40 | Helper for executes an arbitrary shell command. 41 | 42 | Args: 43 | repository_ctx: the repository_ctx object. 44 | cmdline: list of strings, the command to execute. 45 | error_msg: string, a summary of the error if the command fails. 46 | error_details: string, details about the error or steps to fix it. 47 | empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise 48 | it's an error. 49 | 50 | Returns: 51 | The result of repository_ctx.execute(cmdline). 52 | """ 53 | result = repository_ctx.execute(cmdline) 54 | if result.stderr or not (empty_stdout_fine or result.stdout): 55 | _fail("\n".join([ 56 | error_msg.strip() if error_msg else "Repository command failed", 57 | result.stderr.strip(), 58 | error_details if error_details else "", 59 | ])) 60 | return result 61 | 62 | def _read_dir(repository_ctx, src_dir): 63 | """Returns a string with all files in a directory. 64 | 65 | Finds all files inside a directory, traversing subfolders and following 66 | symlinks. The returned string contains the full path of all files 67 | separated by line breaks. 68 | 69 | Args: 70 | repository_ctx: the repository_ctx object. 71 | src_dir: directory to find files from. 72 | 73 | Returns: 74 | A string of all files inside the given dir. 75 | """ 76 | if _is_windows(repository_ctx): 77 | src_dir = src_dir.replace("/", "\\") 78 | find_result = _execute( 79 | repository_ctx, 80 | ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], 81 | empty_stdout_fine = True, 82 | ) 83 | 84 | # src_files will be used in genrule.outs where the paths must 85 | # use forward slashes. 86 | result = find_result.stdout.replace("\\", "/") 87 | else: 88 | find_result = _execute( 89 | repository_ctx, 90 | ["find", src_dir, "-follow", "-type", "f"], 91 | empty_stdout_fine = True, 92 | ) 93 | result = find_result.stdout 94 | return result 95 | 96 | def _genrule(genrule_name, command, outs): 97 | """Returns a string with a genrule. 98 | 99 | Genrule executes the given command and produces the given outputs. 100 | 101 | Args: 102 | genrule_name: A unique name for genrule target. 103 | command: The command to run. 104 | outs: A list of files generated by this rule. 105 | 106 | Returns: 107 | A genrule target. 
108 | """ 109 | return ( 110 | "genrule(\n" + 111 | ' name = "' + 112 | genrule_name + '",\n' + 113 | " outs = [\n" + 114 | outs + 115 | "\n ],\n" + 116 | ' cmd = """\n' + 117 | command + 118 | '\n """,\n' + 119 | ")\n" 120 | ) 121 | 122 | def _norm_path(path): 123 | """Returns a path with '/' and remove the trailing slash.""" 124 | path = path.replace("\\", "/") 125 | if path[-1] == "/": 126 | path = path[:-1] 127 | return path 128 | 129 | def _symlink_genrule_for_dir( 130 | repository_ctx, 131 | src_dir, 132 | dest_dir, 133 | genrule_name, 134 | src_files = [], 135 | dest_files = []): 136 | """Returns a genrule to symlink(or copy if on Windows) a set of files. 137 | 138 | If src_dir is passed, files will be read from the given directory; otherwise 139 | we assume files are in src_files and dest_files. 140 | 141 | Args: 142 | repository_ctx: the repository_ctx object. 143 | src_dir: source directory. 144 | dest_dir: directory to create symlink in. 145 | genrule_name: genrule name. 146 | src_files: list of source files instead of src_dir. 147 | dest_files: list of corresonding destination files. 148 | 149 | Returns: 150 | genrule target that creates the symlinks. 151 | """ 152 | if src_dir != None: 153 | src_dir = _norm_path(src_dir) 154 | dest_dir = _norm_path(dest_dir) 155 | files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines())) 156 | 157 | # Create a list with the src_dir stripped to use for outputs. 158 | dest_files = files.replace(src_dir, "").splitlines() 159 | src_files = files.splitlines() 160 | command = [] 161 | outs = [] 162 | for i in range(len(dest_files)): 163 | if dest_files[i] != "": 164 | # If we have only one file to link we do not want to use the dest_dir, as 165 | # $(@D) will include the full path to the file. 166 | dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i] 167 | 168 | # Copy the headers to create a sandboxable setup. 169 | cmd = "cp -f" 170 | command.append(cmd + ' "%s" "%s"' % (src_files[i], dest)) 171 | outs.append(' "' + dest_dir + dest_files[i] + '",') 172 | genrule = _genrule( 173 | genrule_name, 174 | " && ".join(command), 175 | "\n".join(outs), 176 | ) 177 | return genrule 178 | 179 | def _tf_pip_impl(repository_ctx): 180 | tf_header_dir = repository_ctx.os.environ[_TF_HEADER_DIR] 181 | tf_header_rule = _symlink_genrule_for_dir( 182 | repository_ctx, 183 | tf_header_dir, 184 | "include", 185 | "tf_header_include", 186 | ) 187 | 188 | tf_shared_library_dir = repository_ctx.os.environ[_TF_SHARED_LIBRARY_DIR] 189 | tf_shared_library_path = "%s/libtensorflow_framework.so.1" % tf_shared_library_dir 190 | tf_shared_library_rule = _symlink_genrule_for_dir( 191 | repository_ctx, 192 | None, 193 | "", 194 | "libtensorflow_framework.so.1", 195 | [tf_shared_library_path], 196 | ["libtensorflow_framework.so.1"], 197 | ) 198 | 199 | _tpl(repository_ctx, "BUILD", { 200 | "%{TF_HEADER_GENRULE}": tf_header_rule, 201 | "%{TF_SHARED_LIBRARY_GENRULE}": tf_shared_library_rule, 202 | }) 203 | 204 | tf_configure = repository_rule( 205 | implementation = _tf_pip_impl, 206 | environ = [ 207 | _TF_HEADER_DIR, 208 | _TF_SHARED_LIBRARY_DIR, 209 | ], 210 | ) 211 | -------------------------------------------------------------------------------- /docs/api_docs/python/text/WordpieceTokenizer.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | # text.WordpieceTokenizer 16 | 17 | ## Class `WordpieceTokenizer` 18 | 19 | Creates a wordpiece tokenizer. 20 | 21 | Inherits From: [`TokenizerWithOffsets`](../text/TokenizerWithOffsets.md) 22 | 23 | Defined in 24 | [`python/ops/wordpiece_tokenizer.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/wordpiece_tokenizer.py). 25 | 26 | 27 | 28 | It tokenizes utf-8 encoded tokens into subword pieces based off of a vocab. 29 | 30 |

### `__init__`

31 | 32 | ```python 33 | __init__( 34 | vocab_lookup_table, 35 | suffix_indicator='##', 36 | max_bytes_per_word=100, 37 | token_out_type=dtypes.int64, 38 | unknown_token='[UNK]' 39 | ) 40 | ``` 41 | 42 | Initializes the WordpieceTokenizer. 43 | 44 | #### Args: 45 | 46 | * `vocab_lookup_table`: A lookup table implementing the LookupInterface 47 | containing the vocabulary of subwords. 48 | * `suffix_indicator`: (optional) The characters prepended to a 49 | wordpiece to indicate that it is a suffix to another subword. Default is 50 | '##'. 51 | * `max_bytes_per_word`: (optional) Max size of input token. Default 52 | is 100. 53 | * `token_out_type`: (optional) The type of the token to return. This 54 | can be `tf.int64` IDs, or `tf.string` subwords. The default is `tf.int64`. 55 | * `unknown_token`: (optional) The value to use when an unknown token is 56 | found. Default is "[UNK]". If this is set to a string, and `token_out_type` 57 | is `tf.int64`, the `vocab_lookup_table` is used to convert the 58 | `unknown_token` to an integer. If this is set to `None`, out-of-vocabulary 59 | tokens are left as is. 60 | 61 | ## Properties 62 | 63 |

### `name`

64 | 65 | Returns the name of this module as passed or determined in the ctor. 66 | 67 | NOTE: This is not the same as the `self.name_scope.name` which includes parent 68 | module names. 69 | 70 |

### `name_scope`

71 | 72 | Returns a `tf.name_scope` instance for this class. 73 | 74 |
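Both properties come from `tf.Module`. A brief, hedged sketch of inspecting them follows; the default names printed here are illustrative only (they depend on the implementation), and `vocab_table` is assumed to be built as in the `tokenize` example further below:

```python
import tensorflow as tf
import tensorflow_text as text

# Assumes a `vocab_table` lookup table built elsewhere (see the tokenize
# example later in this page).
tokenizer = text.WordpieceTokenizer(vocab_table, token_out_type=tf.string)

print(tokenizer.name)  # The module's own name, e.g. 'wordpiece_tokenizer'.
with tokenizer.name_scope:
  # Ops created inside this block are prefixed with the tokenizer's name scope.
  token_ids = tf.constant([1, 2, 3], name="token_ids")
```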

### `submodules`

75 | 76 | Sequence of all sub-modules. 77 | 78 | Submodules are modules which are properties of this module, or found as 79 | properties of modules which are properties of this module (and so on). 80 | 81 | ``` 82 | a = tf.Module() 83 | b = tf.Module() 84 | c = tf.Module() 85 | a.b = b 86 | b.c = c 87 | assert list(a.submodules) == [b, c] 88 | assert list(b.submodules) == [c] 89 | assert list(c.submodules) == [] 90 | ``` 91 | 92 | #### Returns: 93 | 94 | A sequence of all submodules. 95 | 96 |

### `trainable_variables`

97 | 98 | Sequence of variables owned by this module and its submodules. 99 | 100 | Note: this method uses reflection to find variables on the current instance and 101 | submodules. For performance reasons you may wish to cache the result of calling 102 | this method if you don't expect the return value to change. 103 | 104 | #### Returns: 105 | 106 | A sequence of variables for the current module (sorted by attribute name) 107 | followed by variables from all submodules recursively (breadth first). 108 | 109 |

### `variables`

110 | 111 | Sequence of variables owned by this module and its submodules. 112 | 113 | Note: this method uses reflection to find variables on the current instance and 114 | submodules. For performance reasons you may wish to cache the result of calling 115 | this method if you don't expect the return value to change. 116 | 117 | #### Returns: 118 | 119 | A sequence of variables for the current module (sorted by attribute name) 120 | followed by variables from all submodules recursively (breadth first). 121 | 122 | ## Methods 123 | 124 |

### `tokenize`

125 | 126 | ```python 127 | tokenize(input) 128 | ``` 129 | 130 | Splits tokens further into wordpiece tokens. 131 | 132 | ### Example: 133 | 134 | ```python 135 | >>> tokens = [["they're", "the", "greatest"]] 136 | >>> tokenizer = WordpieceTokenizer(vocab, token_out_type=tf.string) 137 | >>> tokenizer.tokenize(tokens) 138 | [[['they', "##'", '##re'], ['the'], ['great', '##est']]] 139 | ``` 140 | 141 | #### Args: 142 | 143 | * `input`: An N-dimensional `Tensor` or `RaggedTensor` of UTF-8 144 | strings. 145 | 146 | #### Returns: 147 | 148 | A `RaggedTensor` `tokens` where `tokens[i1...iN, j]` is the string contents, or 149 | ID in the vocab_lookup_table representing that string, of the `j`th token in 150 | `input[i1...iN]`. 151 | 152 |
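The example above assumes a `vocab` lookup table already exists. A minimal sketch of building such a table with `tf.lookup` and tokenizing with it — the toy vocabulary below is made up purely for illustration:

```python
import tensorflow as tf
import tensorflow_text as text

# Toy vocabulary; a real vocab would normally be loaded from a vocab file.
vocab = ["they", "##'", "##re", "the", "great", "##est", "[UNK]"]
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string,
    value_dtype=tf.int64)
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=1)

tokenizer = text.WordpieceTokenizer(vocab_table, token_out_type=tf.string)
tokens = tokenizer.tokenize([["they're", "the", "greatest"]])
# Expected, as in the example above:
# [[['they', "##'", '##re'], ['the'], ['great', '##est']]]
```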

### `tokenize_with_offsets`

153 | 154 | ```python 155 | tokenize_with_offsets(input) 156 | ``` 157 | 158 | Tokenizes utf-8 encoded tokens into subword pieces based off of a vocab. 159 | 160 | ### Example: 161 | 162 | ```python 163 | >>> tokens = [["they're", "the", "greatest"]] 164 | >>> tokenizer = WordpieceTokenizer(vocab, token_out_type=tf.string) 165 | >>> result = tokenizer.tokenize_with_offsets(tokens) 166 | >>> result[0].to_list() # subwords 167 | [[['they', "##'", '##re'], ['the'], ['great', '##est']]] 168 | >>> result[1].to_list() # offset starts 169 | [[[0, 4, 5], [0], [0, 5]]] 170 | >>> result[2].to_list() # offset limits 171 | [[[4, 5, 7], [3], [5, 8]]] 172 | ``` 173 | 174 | #### Args: 175 | 176 | * `input`: An N-dimensional `Tensor` or `RaggedTensor` of UTF-8 177 | strings. 178 | 179 | #### Returns: 180 | 181 | A tuple of `RaggedTensor`s `tokens`, `start_offsets`, and `limit_offsets`, where: 182 | 183 | * `tokens[i1...iN, j]` is the string contents, or ID in the 184 | vocab_lookup_table representing that string, of the `j`th token in 185 | `input[i1...iN]` 186 | * `start_offsets[i1...iN, j]` is the byte offset for the start of the 187 | `j`th token in `input[i1...iN]` 188 | * `limit_offsets[i1...iN, j]` is the byte offset for the end of the 189 | `j`th token in `input[i1...iN]` 190 |
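Because the offsets are byte positions within each input token, they can be used to slice the original strings back out. A small sketch, assuming eager execution and the `tokenizer` built as in the example above:

```python
subwords, starts, limits = tokenizer.tokenize_with_offsets(
    [["they're", "the", "greatest"]])

# Offsets for the pieces of the first token, "they're".
piece_starts = starts[0][0].numpy()   # e.g. [0, 4, 5]
piece_limits = limits[0][0].numpy()   # e.g. [4, 5, 7]

token = b"they're"
pieces = [token[s:e] for s, e in zip(piece_starts, piece_limits)]
# pieces == [b"they", b"'", b"re"] -- the original bytes, without "##" markers.
```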

### `with_name_scope`

191 | 192 | ```python 193 | with_name_scope( 194 | cls, 195 | method 196 | ) 197 | ``` 198 | 199 | Decorator to automatically enter the module name scope. 200 | 201 | ``` 202 | class MyModule(tf.Module): 203 | @tf.Module.with_name_scope 204 | def __call__(self, x): 205 | if not hasattr(self, 'w'): 206 | self.w = tf.Variable(tf.random.normal([x.shape[1], 64])) 207 | return tf.matmul(x, self.w) 208 | ``` 209 | 210 | Using the above module would produce `tf.Variable`s and `tf.Tensor`s whose names 211 | included the module name: 212 | 213 | ``` 214 | mod = MyModule() 215 | mod(tf.ones([8, 32])) 216 | # ==> 217 | mod.w 218 | # ==> 219 | ``` 220 | 221 | #### Args: 222 | 223 | * `method`: The method to wrap. 224 | 225 | #### Returns: 226 | 227 | The original method wrapped such that it enters the module's name scope. 228 | -------------------------------------------------------------------------------- /tensorflow_text/python/ops/ngrams_op_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 TF.Text Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for ngram ops.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | from tensorflow.python.framework import constant_op 23 | from tensorflow.python.framework import errors 24 | from tensorflow.python.framework import test_util 25 | from tensorflow.python.ops.ragged import ragged_factory_ops 26 | from tensorflow.python.ops.ragged import ragged_test_util 27 | from tensorflow.python.platform import test 28 | from tensorflow_text.python.ops import ngrams_op 29 | 30 | 31 | @test_util.run_all_in_graph_and_eager_modes 32 | class NgramsOpTest(ragged_test_util.RaggedTensorTestCase): 33 | 34 | def testSumReduction(self): 35 | test_data = constant_op.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]) 36 | op = ngrams_op.ngrams( 37 | test_data, width=2, axis=1, reduction_type=ngrams_op.Reduction.SUM) 38 | expected_values = [[3.0, 5.0], [30.0, 50.0]] 39 | 40 | self.assertRaggedEqual(expected_values, op) 41 | 42 | def testRaggedSumReduction(self): 43 | test_data = ragged_factory_ops.constant([[1.0, 2.0, 3.0, 4.0], 44 | [10.0, 20.0, 30.0]]) 45 | op = ngrams_op.ngrams( 46 | test_data, width=2, axis=1, reduction_type=ngrams_op.Reduction.SUM) 47 | expected_values = [[3.0, 5.0, 7.0], [30.0, 50.0]] 48 | 49 | self.assertRaggedEqual(expected_values, op) 50 | 51 | def testRaggedSumReductionAxisZero(self): 52 | test_data = ragged_factory_ops.constant([[1.0, 2.0, 3.0, 4.0], 53 | [10.0, 20.0, 30.0, 40.0]]) 54 | op = ngrams_op.ngrams( 55 | test_data, width=2, axis=0, reduction_type=ngrams_op.Reduction.SUM) 56 | expected_values = [[11.0, 22.0, 33.0, 44.0]] 57 | 58 | self.assertRaggedEqual(expected_values, op) 59 | 60 | def testMeanReduction(self): 61 | test_data = constant_op.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]) 62 | op = ngrams_op.ngrams( 63 | 
test_data, width=2, axis=1, reduction_type=ngrams_op.Reduction.MEAN) 64 | expected_values = [[1.5, 2.5], [15.0, 25.0]] 65 | 66 | self.assertRaggedEqual(expected_values, op) 67 | 68 | def testRaggedMeanReduction(self): 69 | test_data = ragged_factory_ops.constant([[1.0, 2.0, 3.0, 4.0], 70 | [10.0, 20.0, 30.0]]) 71 | op = ngrams_op.ngrams( 72 | test_data, width=2, axis=-1, reduction_type=ngrams_op.Reduction.MEAN) 73 | expected_values = [[1.5, 2.5, 3.5], [15.0, 25.0]] 74 | 75 | self.assertRaggedEqual(expected_values, op) 76 | 77 | def testStringJoinReduction(self): 78 | test_data = constant_op.constant([["a", "b", "c"], ["dd", "ee", "ff"]]) 79 | op = ngrams_op.ngrams( 80 | test_data, 81 | width=2, 82 | axis=-1, 83 | reduction_type=ngrams_op.Reduction.STRING_JOIN, 84 | string_separator="|") 85 | expected_values = [["a|b", "b|c"], ["dd|ee", "ee|ff"]] 86 | 87 | self.assertRaggedEqual(expected_values, op) 88 | 89 | def testStringJoinReductionAxisZero(self): 90 | test_data = constant_op.constant(["a", "b", "c"]) 91 | op = ngrams_op.ngrams( 92 | test_data, 93 | width=2, 94 | axis=-1, # The -1 axis is the zero axis here. 95 | reduction_type=ngrams_op.Reduction.STRING_JOIN, 96 | string_separator="|") 97 | expected_values = ["a|b", "b|c"] 98 | 99 | self.assertRaggedEqual(expected_values, op) 100 | 101 | def testRaggedStringJoinReduction(self): 102 | test_data = ragged_factory_ops.constant([["a", "b", "c"], ["dd", "ee"]]) 103 | op = ngrams_op.ngrams( 104 | test_data, 105 | width=2, 106 | axis=-1, 107 | reduction_type=ngrams_op.Reduction.STRING_JOIN, 108 | string_separator="|") 109 | expected_values = [["a|b", "b|c"], ["dd|ee"]] 110 | 111 | self.assertRaggedEqual(expected_values, op) 112 | 113 | def testReductionWithNegativeAxis(self): 114 | test_data = constant_op.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]) 115 | op = ngrams_op.ngrams( 116 | test_data, width=2, axis=-1, reduction_type=ngrams_op.Reduction.SUM) 117 | expected_values = [[3.0, 5.0], [30.0, 50.0]] 118 | 119 | self.assertRaggedEqual(expected_values, op) 120 | 121 | def testReductionOnInnerAxis(self): 122 | test_data = constant_op.constant([[[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]], 123 | [[4.0, 5.0, 6.0], [40.0, 50.0, 60.0]]]) 124 | op = ngrams_op.ngrams( 125 | test_data, width=2, axis=-2, reduction_type=ngrams_op.Reduction.SUM) 126 | expected_values = [[[11.0, 22.0, 33.0]], [[44.0, 55.0, 66.0]]] 127 | 128 | self.assertRaggedEqual(expected_values, op) 129 | 130 | def testRaggedReductionOnInnerAxis(self): 131 | test_data = ragged_factory_ops.constant([[[1.0, 2.0, 3.0, 4.0], 132 | [10.0, 20.0, 30.0, 40.0]], 133 | [[100.0, 200.0], [300.0, 400.0]]]) 134 | op = ngrams_op.ngrams( 135 | test_data, width=2, axis=-2, reduction_type=ngrams_op.Reduction.SUM) 136 | expected_values = [[[11.0, 22.0, 33.0, 44.0]], [[400.0, 600.0]]] 137 | 138 | self.assertRaggedEqual(expected_values, op) 139 | 140 | def testReductionOnAxisWithInsufficientValuesReturnsEmptySet(self): 141 | test_data = constant_op.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]) 142 | op = ngrams_op.ngrams( 143 | test_data, width=4, axis=-1, reduction_type=ngrams_op.Reduction.SUM) 144 | expected_values = [[], []] 145 | 146 | self.assertRaggedEqual(expected_values, op) 147 | 148 | def testRaggedReductionOnAxisWithInsufficientValuesReturnsEmptySet(self): 149 | test_data = ragged_factory_ops.constant([[1.0, 2.0, 3.0], 150 | [10.0, 20.0, 30.0, 40.0]]) 151 | op = ngrams_op.ngrams( 152 | test_data, width=4, axis=1, reduction_type=ngrams_op.Reduction.SUM) 153 | expected_values = [[], [100.0]] 154 | 
155 | self.assertRaggedEqual(expected_values, op) 156 | 157 | def testStringJoinReductionFailsWithImproperAxis(self): 158 | with self.assertRaisesRegexp( 159 | errors.InvalidArgumentError, 160 | r".*requires that ngrams' 'axis' parameter be -1."): 161 | _ = ngrams_op.ngrams( 162 | data=[], 163 | width=2, 164 | axis=0, 165 | reduction_type=ngrams_op.Reduction.STRING_JOIN) 166 | 167 | def testUnspecifiedReductionTypeFails(self): 168 | with self.assertRaisesRegexp(errors.InvalidArgumentError, 169 | r"reduction_type must be specified."): 170 | _ = ngrams_op.ngrams(data=[], width=2, axis=0) 171 | 172 | def testBadReductionTypeFails(self): 173 | with self.assertRaisesRegexp(errors.InvalidArgumentError, 174 | r"reduction_type must be a Reduction."): 175 | _ = ngrams_op.ngrams(data=[], width=2, axis=0, reduction_type="SUM") 176 | 177 | 178 | if __name__ == "__main__": 179 | test.main() 180 | --------------------------------------------------------------------------------
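For orientation, a short sketch of the public `tensorflow_text` entry point these tests exercise, mirroring the sum-reduction and string-join cases above (the calls are inferred from the test code and hedged accordingly):

```python
import tensorflow as tf
import tensorflow_text as text

# Sum reduction over adjacent pairs, as in testSumReduction.
data = tf.constant([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
sums = text.ngrams(data, width=2, axis=1, reduction_type=text.Reduction.SUM)
# => [[3.0, 5.0], [30.0, 50.0]]

# String-join reduction over the last axis, as in testStringJoinReduction.
words = tf.constant([["a", "b", "c"], ["dd", "ee", "ff"]])
bigrams = text.ngrams(words, width=2, axis=-1,
                      reduction_type=text.Reduction.STRING_JOIN,
                      string_separator="|")
# => [["a|b", "b|c"], ["dd|ee", "ee|ff"]]
```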