├── oss_scripts
│ ├── pip_package
│ │ ├── MANIFEST.in
│ │ ├── BUILD
│ │ ├── build_pip_package.sh
│ │ └── setup.py
│ └── configure.sh
├── third_party
│ ├── tensorflow
│ │ ├── BUILD
│ │ ├── BUILD.tpl
│ │ └── tf_configure.bzl
│ └── icu
│   ├── BUILD
│   ├── BUILD.system
│   ├── workspace.bzl
│   └── BUILD.bazel
├── tensorflow_text
│ ├── workspace.bzl
│ ├── python
│ │ ├── __init__.py
│ │ ├── numpy
│ │ │ ├── __init__.py
│ │ │ └── viterbi_decode.py
│ │ └── ops
│ │   ├── coerce_to_valid_utf8_op_test.py
│ │   ├── __init__.py
│ │   ├── tokenization.py
│ │   ├── string_ops.py
│ │   ├── normalize_ops.py
│ │   ├── ngrams_op.py
│ │   ├── normalize_ops_test.py
│ │   ├── create_feature_bitmask_op.py
│ │   ├── sentence_breaking_ops.py
│ │   ├── sliding_window_op.py
│ │   ├── create_feature_bitmask_op_test.py
│ │   └── ngrams_op_test.py
│ ├── __init__.py
│ └── core
│   ├── ops
│   │ ├── normalize_ops.cc
│   │ ├── sentence_breaking_ops.cc
│   │ ├── whitespace_tokenize_op.cc
│   │ ├── unicode_script_tokenize_op.cc
│   │ ├── wordpiece_op.cc
│   │ └── constrained_sequence_op.cc
│   └── kernels
│     ├── wordpiece_tokenizer.h
│     ├── whitespace_tokenize_kernel_test.cc
│     ├── unicode_script_tokenize_kernel_test.cc
│     ├── text_kernels_test_util.cc
│     ├── sentence_breaking_utils.h
│     ├── wordpiece_tokenizer.cc
│     ├── text_kernels_test_util.h
│     └── normalize_kernels.cc
├── docs
│ └── api_docs
│   └── python
│     ├── text
│     │ ├── Reduction.md
│     │ ├── normalize_utf8.md
│     │ ├── wordshape.md
│     │ ├── case_fold_utf8.md
│     │ ├── coerce_to_structurally_valid_utf8.md
│     │ ├── gather_with_default.md
│     │ ├── ngrams.md
│     │ ├── pad_along_dimension.md
│     │ ├── sentence_fragments.md
│     │ ├── span_overlaps.md
│     │ ├── Tokenizer.md
│     │ ├── sliding_window.md
│     │ ├── TokenizerWithOffsets.md
│     │ ├── greedy_constrained_sequence.md
│     │ ├── viterbi_constrained_sequence.md
│     │ ├── WhitespaceTokenizer.md
│     │ ├── span_alignment.md
│     │ ├── UnicodeScriptTokenizer.md
│     │ ├── _api_cache.json
│     │ └── WordpieceTokenizer.md
│     ├── index.md
│     ├── _toc.yaml
│     └── text.md
├── .bazelrc
├── CONTRIBUTING.md
└── WORKSPACE
/oss_scripts/pip_package/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include tensorflow_text/ *.so
2 |
--------------------------------------------------------------------------------
/third_party/tensorflow/BUILD:
--------------------------------------------------------------------------------
1 | # Needed for Bazel to treat this directory as a package
2 |
--------------------------------------------------------------------------------
/third_party/icu/BUILD:
--------------------------------------------------------------------------------
1 | # This empty BUILD file is required to make Bazel treat this directory as a package.
2 |
--------------------------------------------------------------------------------
/tensorflow_text/workspace.bzl:
--------------------------------------------------------------------------------
1 | """doc"""
2 |
3 | load("//third_party/icu:workspace.bzl", icu = "repo")
4 |
5 | def initialize_third_party_archives():
6 | icu()
7 |
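8 | # Illustrative usage from a WORKSPACE file (a sketch; this repository's
9 | # actual WORKSPACE is not shown here):
10 | #
11 | #   load("//tensorflow_text:workspace.bzl", "initialize_third_party_archives")
12 | #   initialize_third_party_archives()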
--------------------------------------------------------------------------------
/oss_scripts/pip_package/BUILD:
--------------------------------------------------------------------------------
1 | # Tools for building the TF.Text pip package.
2 |
3 | package(default_visibility = ["//visibility:private"])
4 |
5 | licenses(["notice"]) # Apache 2.0
6 |
7 | sh_binary(
8 | name = "build_pip_package",
9 | srcs = ["build_pip_package.sh"],
10 | data = [
11 | "LICENSE",
12 | "MANIFEST.in",
13 | "setup.py",
14 | "//tensorflow_text:tf-text",
15 | ],
16 | )
17 |
--------------------------------------------------------------------------------
/third_party/tensorflow/BUILD.tpl:
--------------------------------------------------------------------------------
1 | package(default_visibility = ["//visibility:public"])
2 |
3 | cc_library(
4 | name = "tf_header_lib",
5 | hdrs = [":tf_header_include"],
6 | includes = ["include"],
7 | visibility = ["//visibility:public"],
8 | )
9 |
10 | cc_library(
11 | name = "libtensorflow_framework",
12 | srcs = [":libtensorflow_framework.so.1"],
13 | #data = ["lib/libtensorflow_framework.so.1"],
14 | visibility = ["//visibility:public"],
15 | )
16 |
17 | %{TF_HEADER_GENRULE}
18 | %{TF_SHARED_LIBRARY_GENRULE}
19 |
--------------------------------------------------------------------------------
/third_party/icu/BUILD.system:
--------------------------------------------------------------------------------
1 | package(
2 | default_visibility = ["//visibility:public"],
3 | )
4 |
5 | licenses(["notice"]) # Apache 2.0
6 |
7 | filegroup(
8 | name = "icu4c/LICENSE",
9 | )
10 |
11 | filegroup(
12 | name = "icu4j/main/shared/licenses/LICENSE",
13 | )
14 |
15 | cc_library(
16 | name = "headers",
17 | )
18 |
19 | cc_library(
20 | name = "common",
21 | deps = [
22 | ":icuuc",
23 | ],
24 | )
25 |
26 | cc_library(
27 | name = "icuuc",
28 | linkopts = ["-licuuc"],
29 | visibility = ["//visibility:private"],
30 | )
31 |
--------------------------------------------------------------------------------
/tensorflow_text/python/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019 TF.Text Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Empty file required by setuptools.find_packages to recognize this as a package
17 |
--------------------------------------------------------------------------------
/tensorflow_text/python/numpy/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019 TF.Text Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Numpy-based code for text processing."""
17 |
18 | from tensorflow_text.python.numpy import viterbi_decode
19 |
--------------------------------------------------------------------------------
/third_party/icu/workspace.bzl:
--------------------------------------------------------------------------------
1 | """Loads a lightweight subset of the ICU library for Unicode processing."""
2 |
3 | load("@org_tensorflow//third_party:repo.bzl", "third_party_http_archive")
4 |
5 | # Sanitize a dependency so that it works correctly from code that includes
6 | # TensorFlow as a submodule.
7 | def clean_dep(dep):
8 | return str(Label(dep))
9 |
10 | def repo():
11 | third_party_http_archive(
12 | name = "icu",
13 | strip_prefix = "icu-release-62-1",
14 | sha256 = "e15ffd84606323cbad5515bf9ecdf8061cc3bf80fb883b9e6aa162e485aa9761",
15 | urls = [
16 | "http://mirror.tensorflow.org/github.com/unicode-org/icu/archive/release-62-1.tar.gz",
17 | "https://github.com/unicode-org/icu/archive/release-62-1.tar.gz",
18 | ],
19 | build_file = "//third_party/icu:BUILD.bazel",
20 | system_build_file = "//third_party/icu:BUILD.system",
21 | )
22 |
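23 | # This rule is invoked via initialize_third_party_archives() in
24 | # tensorflow_text/workspace.bzl.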
--------------------------------------------------------------------------------
/docs/api_docs/python/text/Reduction.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | # text.Reduction
10 |
11 | ## Class `Reduction`
12 |
13 | Type of reduction to be done by the ngram op.
14 |
15 | Defined in
16 | [`python/ops/ngrams_op.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/ngrams_op.py).
17 |
18 |
19 |
20 | The supported reductions are as follows:
21 |
22 | * `Reduction.SUM`: Add values in the window.
23 | * `Reduction.MEAN`: Average values in the window.
24 | * `Reduction.STRING_JOIN`: Join strings in the window.
25 |
26 | ## Class Members
27 |
28 | * `MEAN`
29 |
30 | * `STRING_JOIN`
31 |
32 | * `SUM`
33 |
34 |
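35 | #### Example:
36 |
37 | A minimal sketch: the `reduction_type` keyword and the output formatting are
38 | assumed here (see `ngrams.md` for the exact signature); the values follow the
39 | `SUM` definition above.
40 |
41 | ```python
42 | >>> data = tf.ragged.constant([[1.0, 2.0, 3.0], [4.0, 5.0]])
43 | >>> text.ngrams(data, width=2, reduction_type=text.Reduction.SUM)
44 | <tf.RaggedTensor [[3.0, 5.0], [9.0]]>
45 | ```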
--------------------------------------------------------------------------------
/docs/api_docs/python/text/normalize_utf8.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # text.normalize_utf8
7 |
8 | Normalizes each UTF8 string in the input tensor using the specified rule.
9 |
10 | ``` python
11 | text.normalize_utf8(
12 | input,
13 | normalization_form='NFKC',
14 | name=None
15 | )
16 | ```
17 |
18 | Defined in
19 | [`python/ops/normalize_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/normalize_ops.py).
20 |
21 |
22 |
23 | See http://unicode.org/reports/tr15/
24 |
25 | #### Args:
26 |
27 | * `input`: A `Tensor` or `RaggedTensor` of type string. (Must be
28 | UTF-8.)
29 | * `normalization_form`: One of the following string values ('NFC',
30 | 'NFKC', 'NFD', 'NFKD'). Default is 'NFKC'.
31 | * `name`: The name for this op (optional)
32 |
33 | #### Returns:
34 |
35 | A `Tensor` or `RaggedTensor` of type string, with normalized contents.
36 |
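37 | #### Example:
38 |
39 | A minimal usage sketch. The expected value follows from NFKC mapping the
40 | ligature 'ﬁ' (U+FB01) to 'fi'; exact output formatting may differ.
41 |
42 | ```python
43 | >>> text.normalize_utf8([u'\ufb01le'], 'NFKC')
44 | <tf.Tensor: ... numpy=array([b'file'], dtype=object)>
45 | ```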
--------------------------------------------------------------------------------
/docs/api_docs/python/text/wordshape.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # text.wordshape
7 |
8 | Determine wordshape features for each input string.
9 |
10 | ``` python
11 | text.wordshape(
12 | input_tensor,
13 | pattern,
14 | name=None
15 | )
16 | ```
17 |
18 | Defined in
19 | [`python/ops/wordshape_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/wordshape_ops.py).
20 |
21 |
22 |
23 | #### Args:
24 |
25 | * `input_tensor`: string `Tensor` with any shape.
26 | * `pattern`: A `~tftext.WordShape` or a list of WordShapes.
27 | * `name`: A name for the operation (optional).
28 |
29 |
30 | #### Returns:
31 |
32 | `[input_tensor.shape + pattern.shape]`: A tensor where
33 | `result[i1...iN, j]` is true if `input_tensor[i1...iN]` has the wordshape
34 | specified by `pattern[j]`.
35 |
36 | #### Raises:
37 |
38 | * `ValueError`: If `pattern` contains an unknown identifier.
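39 |
40 | #### Example:
41 |
42 | A usage sketch. `HAS_SOME_DIGITS` is shown as a plausible `WordShape` member
43 | name for illustration only; see `wordshape_ops.py` for the actual members.
44 |
45 | ```python
46 | >>> text.wordshape(['cat', 'c4t'], text.WordShape.HAS_SOME_DIGITS)
47 | <tf.Tensor: ... numpy=array([False,  True])>
48 | ```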
--------------------------------------------------------------------------------
/.bazelrc:
--------------------------------------------------------------------------------
1 | # TensorFlow Text Bazel configuration.
2 | #
3 | # See https://docs.bazel.build/versions/master/user-manual.html#config for
4 | # details on the various configuration options.
5 |
6 | # Build with modular op registration support by default.
7 | build --define=framework_shared_object=true
8 |
9 | # Bazel workaround to compile gRPC with the new 'cares' package.
10 | build --define=grpc_no_ares=true
11 |
12 | # Build with optimization enabled.
13 | build --compilation_mode=opt
14 |
15 | # Processor native optimizations (depends on build host capabilities).
16 | build --copt=-march=native
17 | build --host_copt=-march=native
18 | build --copt=-O3
19 | build --copt=-Wno-sign-compare
20 | build --define with_default_optimizations=true
21 |
22 | # Disable TensorFlow extensions that are not needed for TensorFlow Text.
23 | build --define=no_aws_support=true
24 | build --define=no_hdfs_support=true
25 | build --define=no_kafka_support=true
26 | build --define=no_ignite_support=true
27 | build --define=no_nccl_support=true
28 |
29 | # Misc configuration
30 | build:xla --define with_xla_support=true
31 | build:v2 --define=tf_api_version=2
32 | build --action_env TF_CONFIGURE_IOS="0"
33 |
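34 | # The named configs above (xla, v2) are opt-in per invocation, e.g.
35 | # (illustrative target pattern):
36 | #   bazel build --config=xla //tensorflow_text/...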
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
25 | ## Community Guidelines
26 |
27 | This project follows
28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
29 |
--------------------------------------------------------------------------------
/docs/api_docs/python/text/case_fold_utf8.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # text.case_fold_utf8
7 |
8 | Applies case folding to every UTF8 string in the input.
9 |
10 | ``` python
11 | text.case_fold_utf8(
12 | input,
13 | name=None
14 | )
15 | ```
16 |
17 | Defined in
18 | [`python/ops/normalize_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/normalize_ops.py).
19 |
20 |
21 |
22 | The input is a `Tensor` or `RaggedTensor` of any shape, and the resulting output
23 | has the same shape as the input. Note that NFKC normalization is implicitly
24 | applied to the strings.
25 |
26 | #### For example:
27 |
28 | ```python
29 | >>> case_fold_utf8(['The Quick-Brown',
30 | ...                 'CAT jumped over',
31 | ...                 'the lazy dog !! '])
32 | tf.Tensor(['the quick-brown' 'cat jumped over' 'the lazy dog !! '],
33 | shape=(3,), dtype=string)
34 | ```
35 |
36 | #### Args:
37 |
38 | * `input`: A `Tensor` or `RaggedTensor` of type string. (Must be
39 | UTF-8.)
40 | * `name`: The name for this op (optional)
41 |
42 | #### Returns:
43 |
44 | A `Tensor` or `RaggedTensor` of type string, with case-folded contents.
45 |
--------------------------------------------------------------------------------
/oss_scripts/configure.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ==============================================================================
16 |
17 | function write_to_bazelrc() {
18 | echo "$1" >> .bazelrc
19 | }
20 |
21 | function write_action_env_to_bazelrc() {
22 | write_to_bazelrc "build --action_env $1=\"$2\""
23 | }
24 |
25 | if python -c "import tensorflow" &> /dev/null; then
26 | echo 'using installed tensorflow'
27 | else
28 | rm .bazelrc
29 | pip install tensorflow==2.0.0b0
30 | fi
31 |
32 | TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
33 | TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
34 |
35 | # Note: ${ARRAY:2} takes a substring of the first array element, stripping
36 | # the leading "-I"/"-L" to leave the bare directory path.
37 | write_action_env_to_bazelrc "TF_HEADER_DIR" ${TF_CFLAGS:2}
38 | write_action_env_to_bazelrc "TF_SHARED_LIBRARY_DIR" ${TF_LFLAGS:2}
39 |
--------------------------------------------------------------------------------
/tensorflow_text/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019 TF.Text Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Various tensorflow ops related to text-processing."""
17 | from tensorflow.python.util.all_util import remove_undocumented
18 |
19 | # pylint: disable=wildcard-import,g-import-not-at-top
20 | from tensorflow_text.python.ops import *
21 |
22 | _allowed_symbols = [
23 | "case_fold_utf8",
24 | "coerce_to_structurally_valid_utf8",
25 | "gather_with_default",
26 | "greedy_constrained_sequence",
27 | "ngrams",
28 | "normalize_utf8",
29 | "pad_along_dimension",
30 | "Reduction",
31 | "sentence_fragments",
32 | "sliding_window",
33 | "span_alignment",
34 | "span_overlaps",
35 | "Tokenizer",
36 | "TokenizerWithOffsets",
37 | "UnicodeScriptTokenizer",
38 | "viterbi_constrained_sequence",
39 | "WhitespaceTokenizer",
40 | "wordshape",
41 | "WordShape",
42 | "WordpieceTokenizer",
43 | ]
44 |
45 | remove_undocumented(__name__, _allowed_symbols)
46 |
--------------------------------------------------------------------------------
/oss_scripts/pip_package/build_pip_package.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Tool to build the TensorFlow Text pip package.
3 | #
4 | # Usage:
5 | # bazel build oss_scripts/pip_package:build_pip_package
6 | #   bazel-bin/oss_scripts/pip_package/build_pip_package
7 | #
8 | # Arguments:
9 | # output_dir: An output directory. Defaults to `/tmp/tensorflow_text_pkg`.
10 |
11 | set -e # fail and exit on any command erroring
12 |
13 | die() {
14 | echo >&2 "$@"
15 | exit 1
16 | }
17 |
18 | main() {
19 | local output_dir="$1"
20 |
21 | if [[ -z "${output_dir}" ]]; then
22 | output_dir="/tmp/tensorflow_text_pkg"
23 | fi
24 | mkdir -p "${output_dir}"
25 | output_dir=$(readlink -f "${output_dir}")
26 | echo "=== Destination directory: ${output_dir}"
27 |
28 | if [[ ! -d "bazel-bin/tensorflow_text" ]]; then
29 | die "Could not find bazel-bin. Did you run from the root of the build tree?"
30 | fi
31 |
32 | local temp_dir="$(mktemp -d)"
33 | trap "rm -rf ${temp_dir}" EXIT
34 | echo "=== Using tmpdir ${temp_dir}"
35 |
36 | local runfiles="bazel-bin/oss_scripts/pip_package/build_pip_package.runfiles"
37 | cp -LR \
38 | "${runfiles}/org_tensorflow_text/tensorflow_text" \
39 | "${temp_dir}"
40 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/setup.py" \
41 | "${temp_dir}"
42 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/MANIFEST.in" \
43 | "${temp_dir}"
44 | cp "${runfiles}/org_tensorflow_text/oss_scripts/pip_package/LICENSE" \
45 | "${temp_dir}"
46 |
47 | pushd "${temp_dir}" > /dev/null
48 |
49 | # Build pip package
50 | python setup.py bdist_wheel --universal
51 | cp dist/*.whl "${output_dir}"
52 | }
53 |
54 | main "$@"
55 |
--------------------------------------------------------------------------------
/docs/api_docs/python/text/coerce_to_structurally_valid_utf8.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # text.coerce_to_structurally_valid_utf8
7 |
8 | Coerce UTF-8 input strings to structurally valid UTF-8.
9 |
10 | ``` python
11 | text.coerce_to_structurally_valid_utf8(
12 | input,
13 | replacement_char=_unichr(65533),
14 | name=None
15 | )
16 | ```
17 |
18 | Defined in
19 | [`python/ops/string_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/string_ops.py).
20 |
21 |
22 |
23 | Any bytes which cause the input string to be invalid UTF-8 are substituted
24 | with the provided replacement character codepoint (default 65533). Use a
25 | single byte replacement character codepoint to preserve alignment to the
26 | source input string.
27 |
28 | #### Args:
29 |
30 | * `input`: UTF-8 string tensor to coerce to valid UTF-8.
31 | * `replacement_char`: The replacement character to be used in place of
32 | any invalid byte in the input. Any valid Unicode character may be used. The
33 | default value is the default Unicode replacement character, which is U+FFFD
34 | (decimal 65533). Note that passing a replacement character expressible in 1
35 | byte, such as ' ' or '?', will preserve string alignment to the source since
36 | individual invalid bytes will be replaced with a 1-byte replacement.
37 | (optional)
38 | * `name`: A name for the operation (optional).
39 |
40 | #### Returns:
41 |
42 | A tensor of type string with the same shape as the input.
43 |
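44 | #### Example:
45 |
46 | A usage sketch mirroring the behavior exercised in
47 | `coerce_to_valid_utf8_op_test.py`; exact output formatting may differ.
48 |
49 | ```python
50 | >>> text.coerce_to_structurally_valid_utf8([b'abc\xfd'], '?')
51 | <tf.Tensor: ... numpy=array([b'abc?'], dtype=object)>
52 | ```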
--------------------------------------------------------------------------------
/docs/api_docs/python/text/gather_with_default.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # text.gather_with_default
7 |
8 | Gather slices with `indices=-1` mapped to `default`.
9 |
10 | ``` python
11 | text.gather_with_default(
12 | params,
13 | indices,
14 | default,
15 | name=None,
16 | axis=0
17 | )
18 | ```
19 |
20 | Defined in
21 | [`python/ops/pointer_ops.py`](https://github.com/tensorflow/text/tree/master/tensorflow_text/python/ops/pointer_ops.py).
22 |
23 |
24 |
25 | This operation is similar to `tf.gather()`, except that any value of `-1`
26 | in `indices` will be mapped to `default`. Example:
27 |
28 | ```python
29 | >>> gather_with_default(['a', 'b', 'c', 'd'], [2, 0, -1, 2, -1], '_').eval()
30 | array(['c', 'a', '_', 'c', '_'], dtype=object)
31 | ```
32 |
33 | #### Args:
34 |
35 | * `params`: The `Tensor` from which to gather values. Must be at least
36 | rank `axis + 1`.
37 | * `indices`: The index `Tensor`. Must have dtype `int32` or `int64`,
38 | and values must be in the range `[-1, params.shape[axis])`.
39 | * `default`: The value to use when `indices` is `-1`. `default.shape`
40 | must be equal to `params.shape[axis + 1:]`.
41 | * `name`: A name for the operation (optional).
42 | * `axis`: The axis in `params` to gather `indices` from. Must be a
43 | scalar `int32` or `int64`. Supports negative indices.
44 |
45 | #### Returns:
46 |
47 | A `Tensor` with the same type as `params`, and with shape `params.shape[:axis] +
48 | indices.shape + params.shape[axis + 1:]`.
49 |
--------------------------------------------------------------------------------
/docs/api_docs/python/index.md:
--------------------------------------------------------------------------------
1 | # All symbols in TensorFlow Text
2 |
3 | * text
4 | * text.Reduction
5 | * text.Tokenizer
6 | * text.TokenizerWithOffsets
7 | * text.UnicodeScriptTokenizer
8 | * text.WhitespaceTokenizer
9 | * text.WordShape
10 | * text.WordpieceTokenizer
11 | * text.case_fold_utf8
12 | * text.coerce_to_structurally_valid_utf8
13 | * text.gather_with_default
14 | * text.greedy_constrained_sequence
15 | * text.ngrams
16 | * text.normalize_utf8
17 | * text.pad_along_dimension
18 | * text.sentence_fragments
19 | * text.sliding_window
20 | * text.span_alignment
21 | * text.span_overlaps
22 | * text.viterbi_constrained_sequence
23 | * text.wordshape
24 |
--------------------------------------------------------------------------------
/tensorflow_text/core/ops/normalize_ops.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 TF.Text Authors.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "tensorflow/core/framework/common_shape_fns.h"
16 | #include "tensorflow/core/framework/op.h"
17 | #include "tensorflow/core/framework/shape_inference.h"
18 |
19 | namespace tensorflow {
20 | namespace text {
21 |
22 | REGISTER_OP("CaseFoldUTF8")
23 | .Input("input: string")
24 | .Output("output: string")
25 | .SetShapeFn(::tensorflow::shape_inference::UnchangedShape)
26 | .Doc(R"doc(
27 | Applies case folding to every UTF8 string in input_tensor. The input is a dense
28 | tensor of any shape and the output has the same shape as the input.
29 |
30 | For example if:
31 |
32 | input = [ 'The Quick-Brown',
33 | 'CAT jumped over',
34 | 'the lazy dog !! ']
35 |
36 | output = [ 'the quick-brown',
37 | 'cat jumped over',
38 | 'the lazy dog !! ']
39 | )doc");
40 |
41 | REGISTER_OP("NormalizeUTF8")
42 | .Input("input: string")
43 | .Attr("normalization_form: string")
44 | .Output("output: string")
45 | .SetShapeFn(::tensorflow::shape_inference::UnchangedShape)
46 | .Doc(R"doc(
47 | Normalizes each UTF8 string in the input tensor using 'normalization_form'
48 | rules.
49 |
50 | See http://unicode.org/reports/tr15/
51 | )doc");
52 |
53 | } // namespace text
54 | } // namespace tensorflow
55 |
--------------------------------------------------------------------------------
/tensorflow_text/core/ops/sentence_breaking_ops.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 TF.Text Authors.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "tensorflow/core/framework/common_shape_fns.h"
16 | #include "tensorflow/core/framework/op.h"
17 | #include "tensorflow/core/framework/shape_inference.h"
18 | #include "tensorflow/core/lib/core/status.h"
19 |
20 | namespace tensorflow {
21 | namespace text {
22 |
23 | Status SentenceFragmentShapeFn(
24 | ::tensorflow::shape_inference::InferenceContext* c) {
25 | for (int i = 0; i < c->num_outputs(); ++i) {
26 | c->set_output(i, c->UnknownShapeOfRank(1));
27 | }
28 |
29 | return Status::OK();
30 | }
31 |
32 | REGISTER_OP("SentenceFragments")
33 | .Attr("input_encoding: string")
34 | .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
35 | .Attr("replacement_char: int = 65533") // 0xFFFD unicode replacement char
36 | .Attr("replace_control_characters: bool = false")
37 | .Input("row_lengths: int64")
38 | .Input("token_start: int64")
39 | .Input("token_end: int64")
40 | .Input("token_word: string")
41 | .Input("token_properties: int64")
42 | .Output("fragment_start: int64")
43 | .Output("fragment_end: int64")
44 | .Output("fragment_properties: int64")
45 | .Output("terminal_punc_token: int64")
46 | .Output("output_row_lengths: int64")
47 | .SetShapeFn(SentenceFragmentShapeFn);
48 |
49 | } // namespace text
50 | } // namespace tensorflow
51 |
--------------------------------------------------------------------------------
/tensorflow_text/python/ops/coerce_to_valid_utf8_op_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2019 TF.Text Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | """Tests for the coerce_to_structurally_valid_utf8 op from string_ops."""
18 |
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 |
23 | from tensorflow.python.platform import test
24 | from tensorflow_text.python.ops import string_ops
25 |
26 |
27 | class CoerceToUtf8Test(test.TestCase):
28 |
29 | def testCoercetoStructurallyValidOnValidInput(self):
30 | with self.test_session():
31 | utf8 = string_ops.coerce_to_structurally_valid_utf8(["abc"])
32 | self.assertAllEqual(utf8, ["abc"])
33 |
34 | def testCoercetoStructurallyValidOnValidInputWithDefault(self):
35 | with self.test_session():
36 | utf8 = string_ops.coerce_to_structurally_valid_utf8(["abc"], "?")
37 | self.assertAllEqual(utf8, ["abc"])
38 |
39 | def testCoercetoStructurallyValidOnInvalidInput(self):
40 | with self.test_session():
41 | utf8 = string_ops.coerce_to_structurally_valid_utf8([b"abc\xfd"])
42 | self.assertAllEqual(utf8, ["abc�"])
43 |
44 | def testCoercetoStructurallyValidOnInvalidInputWithDefault(self):
45 | with self.test_session():
46 | utf8 = string_ops.coerce_to_structurally_valid_utf8([b"abc\xfd"], "?")
47 | self.assertAllEqual(utf8, ["abc?"])
48 |
49 |
50 | if __name__ == "__main__":
51 | test.main()
52 |
--------------------------------------------------------------------------------
/docs/api_docs/python/_toc.yaml:
--------------------------------------------------------------------------------
1 | # Automatically generated file; please do not edit
2 | toc:
3 | - title: text
4 | section:
5 | - title: Overview
6 | path: /text/api_docs/python/text
7 | - title: case_fold_utf8
8 | path: /text/api_docs/python/text/case_fold_utf8
9 | - title: coerce_to_structurally_valid_utf8
10 | path: /text/api_docs/python/text/coerce_to_structurally_valid_utf8
11 | - title: gather_with_default
12 | path: /text/api_docs/python/text/gather_with_default
13 | - title: greedy_constrained_sequence
14 | path: /text/api_docs/python/text/greedy_constrained_sequence
15 | - title: ngrams
16 | path: /text/api_docs/python/text/ngrams
17 | - title: normalize_utf8
18 | path: /text/api_docs/python/text/normalize_utf8
19 | - title: pad_along_dimension
20 | path: /text/api_docs/python/text/pad_along_dimension
21 | - title: Reduction
22 | path: /text/api_docs/python/text/Reduction
23 | - title: sentence_fragments
24 | path: /text/api_docs/python/text/sentence_fragments
25 | - title: sliding_window
26 | path: /text/api_docs/python/text/sliding_window
27 | - title: span_alignment
28 | path: /text/api_docs/python/text/span_alignment
29 | - title: span_overlaps
30 | path: /text/api_docs/python/text/span_overlaps
31 | - title: Tokenizer
32 | path: /text/api_docs/python/text/Tokenizer
33 | - title: TokenizerWithOffsets
34 | path: /text/api_docs/python/text/TokenizerWithOffsets
35 | - title: UnicodeScriptTokenizer
36 | path: /text/api_docs/python/text/UnicodeScriptTokenizer
37 | - title: viterbi_constrained_sequence
38 | path: /text/api_docs/python/text/viterbi_constrained_sequence
39 | - title: WhitespaceTokenizer
40 | path: /text/api_docs/python/text/WhitespaceTokenizer
41 | - title: WordpieceTokenizer
42 | path: /text/api_docs/python/text/WordpieceTokenizer
43 | - title: WordShape
44 | path: /text/api_docs/python/text/WordShape
45 | - title: wordshape
46 | path: /text/api_docs/python/text/wordshape
47 |
--------------------------------------------------------------------------------
/tensorflow_text/core/kernels/wordpiece_tokenizer.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 TF.Text Authors.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_
16 | #define TENSORFLOW_TEXT_CORE_KERNELS_WORDPIECE_TOKENIZER_H_
17 |
18 | #include