├── .classpath
├── .gitignore
├── .project
├── .settings
│   ├── org.eclipse.core.resources.prefs
│   ├── org.eclipse.jdt.core.prefs
│   ├── org.eclipse.m2e.core.prefs
│   ├── org.eclipse.wst.common.component
│   ├── org.eclipse.wst.common.project.facet.core.xml
│   └── org.eclipse.wst.validation.prefs
├── README.md
├── pom.xml
└── src
    └── main
        ├── java
        │   └── com
        │       └── ankit
        │           └── bert
        │               ├── tokenizer
        │               │   └── Tokenizer.java
        │               ├── tokenizerimpl
        │               │   ├── BasicTokenizer.java
        │               │   ├── BertTokenizer.java
        │               │   └── WordpieceTokenizer.java
        │               └── utils
        │                   └── TokenizerUtils.java
        └── resources
            └── vocab.txt
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 | .idea
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | 	<name>aiserver</name>
4 | 	<comment></comment>
5 | 	<projects>
6 | 	</projects>
7 | 	<buildSpec>
8 | 		<buildCommand>
9 | 			<name>org.eclipse.wst.common.project.facet.core.builder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.jdt.core.javabuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 		<buildCommand>
19 | 			<name>org.eclipse.wst.validation.validationbuilder</name>
20 | 			<arguments>
21 | 			</arguments>
22 | 		</buildCommand>
23 | 		<buildCommand>
24 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
25 | 			<arguments>
26 | 			</arguments>
27 | 		</buildCommand>
28 | 	</buildSpec>
29 | 	<natures>
30 | 		<nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
31 | 		<nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature>
32 | 		<nature>org.eclipse.jdt.core.javanature</nature>
33 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
34 | 		<nature>org.eclipse.wst.common.project.facet.core.nature</nature>
35 | 	</natures>
36 | </projectDescription>
37 | 
--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/main/java=UTF-8
3 | encoding//src/main/resources=UTF-8
4 | encoding//src/test/java=UTF-8
5 | encoding//src/test/resources=UTF-8
6 | encoding/=UTF-8
7 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
12 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
13 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
14 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
15 | org.eclipse.jdt.core.compiler.release=disabled
16 | org.eclipse.jdt.core.compiler.source=1.8
17 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.wst.common.component:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.wst.common.project.facet.core.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.wst.validation.prefs:
--------------------------------------------------------------------------------
1 | disabled=06target
2 | eclipse.preferences.version=1
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Bert Tokenizer
2 |
3 | This repository contains a Java implementation of the BERT tokenizer. The implementation is adapted from the Hugging Face Transformers library.
4 |
5 | https://huggingface.co/transformers/main_classes/tokenizer.html
6 |
7 | ## Usage
8 |
9 | To get tokens from text:
10 | ```java
11 | String text = "Text to tokenize";
12 | BertTokenizer tokenizer = new BertTokenizer();
13 | List<String> tokens = tokenizer.tokenize(text);
14 | ```
15 |
16 | To get token ids using the BERT vocab:
17 | 
18 | ```java
19 | List<Integer> token_ids = tokenizer.convert_tokens_to_ids(tokens);
20 | ```
21 |
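22 | For reference, a minimal end-to-end sketch (this assumes the default constructor, which loads the bundled `vocab.txt` from the classpath under `src/main/resources`; the `Example` class name is just for illustration):
23 | 
24 | ```java
25 | import java.util.List;
26 | 
27 | import com.ankit.bert.tokenizerimpl.BertTokenizer;
28 | 
29 | public class Example {
30 |     public static void main(String[] args) {
31 |         BertTokenizer tokenizer = new BertTokenizer();
32 |         List<String> tokens = tokenizer.tokenize("Text to tokenize");
33 |         List<Integer> token_ids = tokenizer.convert_tokens_to_ids(tokens);
34 |         String joined = tokenizer.convert_tokens_to_string(tokens);
35 | 
36 |         System.out.println(tokens);      // wordpiece tokens
37 |         System.out.println(token_ids);   // ids looked up in vocab.txt
38 |         System.out.println(joined);      // tokens joined back, "##" prefixes removed
39 |         System.out.println(tokenizer.vocab_size());
40 |     }
41 | }
42 | ```
43 | 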
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 | com.ankit
6 | berttokenizer
7 | 0.0.1
8 | BertTokenizer
9 | Java Implementation of Hugging face Bert Tokenizer
10 |
11 |
12 | UTF-8
13 | 4.12
14 | 2.13.2
15 | 2.25.1
16 |
17 |
18 |
19 |
20 | maven-compiler-plugin
21 | 3.3
22 |
23 | 1.8
24 | 1.8
25 |
26 |
27 |
28 | maven-assembly-plugin
29 |
30 |
31 | jar-with-dependencies
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 | org.projectlombok
40 | lombok
41 | 1.18.12
42 | provided
43 |
44 |
45 | org.apache.logging.log4j
46 | log4j-core
47 | ${log4j.version}
48 |
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/tokenizer/Tokenizer.java:
--------------------------------------------------------------------------------
1 | package com.ankit.bert.tokenizer;
2 |
3 | import java.util.List;
4 |
5 | public interface Tokenizer {
6 |
7 | 	public List<String> tokenize(String text);
8 |
9 | }
10 |
--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/tokenizerimpl/BasicTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.ankit.bert.tokenizerimpl;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import com.ankit.bert.tokenizer.Tokenizer;
7 | import com.ankit.bert.utils.TokenizerUtils;
8 |
9 | public class BasicTokenizer implements Tokenizer {
10 | 	private boolean do_lower_case = true;
11 | 	private List<String> never_split = new ArrayList<>();
12 | 	private boolean tokenize_chinese_chars = true;
13 | 
14 | 	public BasicTokenizer(boolean do_lower_case, List<String> never_split, boolean tokenize_chinese_chars) {
15 | 		this.do_lower_case = do_lower_case;
16 | 		this.never_split = (never_split == null) ? new ArrayList<>() : never_split;
17 | 		this.tokenize_chinese_chars = tokenize_chinese_chars;
18 | 	}
19 | 
20 | 	public BasicTokenizer() {
21 | 	}
22 | 
23 | 	@Override
24 | 	public List<String> tokenize(String text) {
25 | 		text = TokenizerUtils.clean_text(text);
26 | 		if (tokenize_chinese_chars) {
27 | 			text = TokenizerUtils.tokenize_chinese_chars(text);
28 | 		}
29 | 		List<String> orig_tokens = TokenizerUtils.whitespace_tokenize(text);
30 | 
31 | 		List<String> split_tokens = new ArrayList<>();
32 | 		for (String token : orig_tokens) {
33 | 			if (do_lower_case && !never_split.contains(token)) {
34 | 				token = TokenizerUtils.run_strip_accents(token.toLowerCase());
35 | 			}
36 | 			split_tokens.addAll(TokenizerUtils.run_split_on_punc(token, never_split));
37 | 		}
38 | 		return TokenizerUtils.whitespace_tokenize(String.join(" ", split_tokens));
39 | 	}
40 | 
41 | }
42 | 
--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/tokenizerimpl/BertTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.ankit.bert.tokenizerimpl;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.util.ArrayList;
6 | import java.util.HashMap;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.stream.Collectors;
10 |
11 | import com.ankit.bert.tokenizer.Tokenizer;
12 | import com.ankit.bert.utils.TokenizerUtils;
13 |
14 | import lombok.extern.log4j.Log4j2;
15 |
16 | /**
17 | * Constructs a BERT tokenizer. Based on WordPiece.
18 | *
19 |  * This is a Java port of the Hugging Face Transformers
20 |  * :class:`~transformers.BertTokenizer`. Users can refer to that class for
21 |  * more information regarding the methods.
22 | *
23 | * Args:
24 | *
25 | * vocab_file (:obj:`string`): File containing the vocabulary.
26 | *
27 | * do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to
28 | * lowercase the input when tokenizing.
29 | *
30 | * do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether
31 | * to do basic tokenization before WordPiece.
32 | *
33 |  * never_split (:obj:`List[string]`, `optional`, defaults to an empty list): List of
34 |  * tokens which will never be split during tokenization. Only has an effect when
35 |  * :obj:`do_basic_tokenize=True`
36 | *
37 | * unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): The unknown
38 | * token. A token that is not in the vocabulary cannot be converted to an ID and
39 | * is set to be this token instead.
40 | *
41 | * sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): The separator
42 | * token, which is used when building a sequence from multiple sequences, e.g.
43 | * two sequences for sequence classification or for a text and a question for
44 | * question answering. It is also used as the last token of a sequence built
45 | * with special tokens.
46 | *
47 | * pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): The token used
48 | * for padding, for example when batching sequences of different lengths.
49 | *
50 | * cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): The classifier
51 | * token which is used when doing sequence classification (classification of the
52 | * whole sequence instead of per-token classification). It is the first token of
53 | * the sequence when built with special tokens.
54 | *
55 | * mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): The token used
56 | * for masking values. This is the token used when training this model with
57 | * masked language modeling. This is the token which the model will try to
58 | * predict.
59 | *
60 | * tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
61 | * Whether to tokenize Chinese characters. This should likely be deactivated for
62 | * Japanese: see: https://github.com/huggingface/transformers/issues/328
63 | */
64 |
65 | @Log4j2
66 | public class BertTokenizer implements Tokenizer {
67 |
68 | private String vocab_file = "vocab.txt";
69 | 	private Map<String, Integer> token_id_map;
70 | 	private Map<Integer, String> id_token_map;
71 | private boolean do_lower_case = true;
72 | private boolean do_basic_tokenize = true;
73 | 	private List<String> never_split = new ArrayList<>();
74 | private String unk_token = "[UNK]";
75 | private String sep_token = "[SEP]";
76 | private String pad_token = "[PAD]";
77 | private String cls_token = "[CLS]";
78 | private String mask_token = "[MASK]";
79 | private boolean tokenize_chinese_chars = true;
80 | private BasicTokenizer basic_tokenizer;
81 | private WordpieceTokenizer wordpiece_tokenizer;
82 |
83 | private static final int MAX_LEN = 512;
84 |
85 | 	public BertTokenizer(String vocab_file, boolean do_lower_case, boolean do_basic_tokenize, List<String> never_split,
86 | String unk_token, String sep_token, String pad_token, String cls_token, String mask_token,
87 | boolean tokenize_chinese_chars) {
88 | this.vocab_file = vocab_file;
89 | this.do_lower_case = do_lower_case;
90 | this.do_basic_tokenize = do_basic_tokenize;
91 | this.never_split = never_split;
92 | this.unk_token = unk_token;
93 | this.sep_token = sep_token;
94 | this.pad_token = pad_token;
95 | this.cls_token = cls_token;
96 | this.mask_token = mask_token;
97 | this.tokenize_chinese_chars = tokenize_chinese_chars;
98 | init();
99 | }
100 |
101 | public BertTokenizer() {
102 | init();
103 | }
104 |
105 | private void init() {
106 | try {
107 | this.token_id_map = load_vocab(vocab_file);
108 | } catch (IOException e) {
109 | log.error("Unable to load vocab due to: ", e);
110 | }
111 | 		this.id_token_map = new HashMap<>();
112 | 		for (Map.Entry<String, Integer> entry : token_id_map.entrySet()) {
113 | 			this.id_token_map.put(entry.getValue(), entry.getKey());
114 | 		}
115 |
116 | if (do_basic_tokenize) {
117 | this.basic_tokenizer = new BasicTokenizer(do_lower_case, never_split, tokenize_chinese_chars);
118 | }
119 | this.wordpiece_tokenizer = new WordpieceTokenizer(token_id_map, unk_token);
120 | }
121 |
122 | private Map load_vocab(String vocab_file_name) throws IOException {
123 | ClassLoader classloader = Thread.currentThread().getContextClassLoader();
124 | 		InputStream file = classloader.getResourceAsStream(vocab_file_name);
125 | return TokenizerUtils.generateTokenIdMap(file);
126 | }
127 |
128 | 	/**
129 | 	 * Tokenizes a piece of text into word pieces using the loaded vocabulary.
130 | 	 *
131 | 	 * When `do_basic_tokenize` is true, the text is first split into tokens by
132 | 	 * `BasicTokenizer` and each token is then split into word pieces by
133 | 	 * `WordpieceTokenizer`; otherwise the whole text is passed directly to
134 | 	 * `WordpieceTokenizer`.
135 | 	 *
136 | 	 * For example: input = "unaffable" output = ["un", "##aff", "##able"]
137 | 	 *
138 | 	 * Args: text: The text to tokenize.
139 | 	 *
140 | 	 * Returns: A list of wordpiece tokens.
141 | 	 */
142 | @Override
143 | 	public List<String> tokenize(String text) {
144 | 		List<String> split_tokens = new ArrayList<>();
145 | if (do_basic_tokenize) {
146 | for (String token : basic_tokenizer.tokenize(text)) {
147 | for (String sub_token : wordpiece_tokenizer.tokenize(token)) {
148 | split_tokens.add(sub_token);
149 | }
150 | }
151 | } else {
152 | split_tokens = wordpiece_tokenizer.tokenize(text);
153 | }
154 | return split_tokens;
155 | }
156 |
157 | 	public String convert_tokens_to_string(List<String> tokens) {
158 | // Converts a sequence of tokens (string) in a single string.
159 | return tokens.stream().map(s -> s.replace("##", "")).collect(Collectors.joining(" "));
160 | }
161 |
162 | 	public List<Integer> convert_tokens_to_ids(List<String> tokens) {
163 | 		List<Integer> output = new ArrayList<>();
164 | for (String s : tokens) {
165 | output.add(token_id_map.get(s));
166 | }
167 | return output;
168 | }
169 |
170 | public int vocab_size() {
171 | return token_id_map.size();
172 | }
173 | }
174 |
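175 | /*
176 |  * Illustrative usage of the constructor parameters documented in the class
177 |  * Javadoc above (a sketch only; the values shown are simply the defaults, and
178 |  * the vocab file is assumed to be on the classpath):
179 |  *
180 |  *   BertTokenizer tokenizer = new BertTokenizer("vocab.txt", true, true,
181 |  *           new ArrayList<>(), "[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", true);
182 |  *   List<String> tokens = tokenizer.tokenize("Text to tokenize");
183 |  *   List<Integer> ids = tokenizer.convert_tokens_to_ids(tokens);
184 |  */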
--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/tokenizerimpl/WordpieceTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.ankit.bert.tokenizerimpl;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 | import com.ankit.bert.tokenizer.Tokenizer;
8 | import com.ankit.bert.utils.TokenizerUtils;
9 |
10 | public class WordpieceTokenizer implements Tokenizer {
11 | 	private Map<String, Integer> vocab;
12 | private String unk_token;
13 | private int max_input_chars_per_word;
14 |
15 | 	public WordpieceTokenizer(Map<String, Integer> vocab, String unk_token, int max_input_chars_per_word) {
16 | this.vocab = vocab;
17 | this.unk_token = unk_token;
18 | this.max_input_chars_per_word = max_input_chars_per_word;
19 | }
20 |
21 | 	public WordpieceTokenizer(Map<String, Integer> vocab, String unk_token) {
22 | this.vocab = vocab;
23 | this.unk_token = unk_token;
24 | this.max_input_chars_per_word = 100;
25 | }
26 |
27 | @Override
28 | 	public List<String> tokenize(String text) {
29 | /**
30 | * Tokenizes a piece of text into its word pieces.
31 | *
32 | * This uses a greedy longest-match-first algorithm to perform tokenization
33 | * using the given vocabulary.
34 | *
35 | * For example: input = "unaffable" output = ["un", "##aff", "##able"]
36 | *
37 | * Args: text: A single token or whitespace separated tokens. This should have
38 | * already been passed through `BasicTokenizer`.
39 | *
40 | * Returns: A list of wordpiece tokens.
41 | *
42 | */
43 |
44 | 		List<String> output_tokens = new ArrayList<>();
45 | for (String token : TokenizerUtils.whitespace_tokenize(text)) {
46 | if (token.length() > max_input_chars_per_word) {
47 | output_tokens.add(unk_token);
48 | continue;
49 | }
50 | boolean is_bad = false;
51 | int start = 0;
52 |
53 | 			List<String> sub_tokens = new ArrayList<>();
54 | while (start < token.length()) {
55 | int end = token.length();
56 | String cur_substr = "";
57 | while (start < end) {
58 | String substr = token.substring(start, end);
59 | if (start > 0) {
60 | substr = "##" + substr;
61 | }
62 | if (vocab.containsKey(substr)) {
63 | cur_substr = substr;
64 | break;
65 | }
66 | end -= 1;
67 | }
68 | 				if (cur_substr.isEmpty()) {
69 | is_bad = true;
70 | break;
71 | }
72 | sub_tokens.add(cur_substr);
73 | start = end;
74 | }
75 | if (is_bad) {
76 | output_tokens.add(unk_token);
77 | } else {
78 | output_tokens.addAll(sub_tokens);
79 | }
80 | }
81 | return output_tokens;
82 | }
83 | }
84 |
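85 | /*
86 |  * Worked trace of the greedy longest-match-first loop above, assuming the
87 |  * vocabulary contains "un", "##aff" and "##able" (the docstring example):
88 |  *
89 |  *   token = "unaffable"
90 |  *   start=0: "unaffable" ... "una" not in vocab, "un" is     -> add "un",     start=2
91 |  *   start=2: "##affable" ... "##affa" not in vocab, "##aff" is -> add "##aff", start=5
92 |  *   start=5: "##able" is in vocab                            -> add "##able", start=9
93 |  *   start == token.length(), is_bad == false                 -> output ["un", "##aff", "##able"]
94 |  */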
--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/utils/TokenizerUtils.java:
--------------------------------------------------------------------------------
1 | package com.ankit.bert.utils;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.IOException;
5 | import java.io.InputStream;
6 | import java.io.InputStreamReader;
7 | import java.text.Normalizer;
8 | import java.text.Normalizer.Form;
9 | import java.util.ArrayList;
10 | import java.util.Arrays;
11 | import java.util.HashMap;
12 | import java.util.List;
13 | import java.util.Map;
14 |
15 | public class TokenizerUtils {
16 |
17 | public static String clean_text(String text) {
18 | 		// Performs invalid character removal and whitespace cleanup on text.
19 |
20 | StringBuilder output = new StringBuilder();
21 | for (int i = 0; i < text.length(); i++) {
22 | Character c = text.charAt(i);
23 | int cp = (int) c;
24 | if (cp == 0 || cp == 0xFFFD || _is_control(c)) {
25 | continue;
26 | }
27 | if (_is_whitespace(c)) {
28 | output.append(" ");
29 | } else {
30 | output.append(c);
31 | }
32 | }
33 | return output.toString();
34 | }
35 |
36 | public static String tokenize_chinese_chars(String text) {
37 | // Adds whitespace around any CJK character.
38 | StringBuilder output = new StringBuilder();
39 | for (int i = 0; i < text.length(); i++) {
40 | Character c = text.charAt(i);
41 | int cp = (int) c;
42 | if (_is_chinese_char(cp)) {
43 | output.append(" ");
44 | output.append(c);
45 | output.append(" ");
46 | } else {
47 | output.append(c);
48 | }
49 | }
50 | return output.toString();
51 | }
52 |
53 | 	public static List<String> whitespace_tokenize(String text) {
54 | // Runs basic whitespace cleaning and splitting on a piece of text.
55 | 		text = text.trim();
56 | 		if (!text.isEmpty()) {
57 | 			return Arrays.asList(text.split("\\s+"));
58 | 		}
59 | 		return new ArrayList<>();
60 |
61 | }
62 |
63 | public static String run_strip_accents(String token) {
64 | token = Normalizer.normalize(token, Form.NFD);
65 | StringBuilder output = new StringBuilder();
66 | for (int i = 0; i < token.length(); i++) {
67 | Character c = token.charAt(i);
68 | if (Character.NON_SPACING_MARK != Character.getType(c)) {
69 | output.append(c);
70 | }
71 | }
72 | return output.toString();
73 | }
74 |
75 | 	public static List<String> run_split_on_punc(String token, List<String> never_split) {
76 | // Splits punctuation on a piece of text.
77 | 		List<String> output = new ArrayList<>();
78 | if (never_split != null && never_split.contains(token)) {
79 | output.add(token);
80 | return output;
81 | }
82 |
83 | boolean start_new_word = true;
84 | StringBuilder str = new StringBuilder();
85 | for (int i = 0; i < token.length(); i++) {
86 | Character c = token.charAt(i);
87 | if (_is_punctuation(c)) {
88 | if (str.length() > 0) {
89 | output.add(str.toString());
90 | str.setLength(0);
91 | }
92 | output.add(c.toString());
93 | start_new_word = true;
94 | } else {
95 | if (start_new_word && str.length() > 0) {
96 | output.add(str.toString());
97 | str.setLength(0);
98 | }
99 | start_new_word = false;
100 | str.append(c);
101 | }
102 | }
103 | if (str.length() > 0) {
104 | output.add(str.toString());
105 | }
106 | return output;
107 | }
108 |
109 | 	public static Map<String, Integer> generateTokenIdMap(InputStream file) throws IOException {
110 | 		Map<String, Integer> token_id_map = new HashMap<>();
111 | if (file == null)
112 | return token_id_map;
113 |
114 | try (BufferedReader br = new BufferedReader(new InputStreamReader(file))) {
115 |
116 | String line;
117 | int index = 0;
118 | while ((line = br.readLine()) != null) {
119 | token_id_map.put(line, index);
120 | index += 1;
121 | }
122 | }
123 | return token_id_map;
124 | }
125 |
126 | private static boolean _is_punctuation(char c) {
127 | // Checks whether `chars` is a punctuation character.
128 | int cp = (int) c;
129 | // We treat all non-letter/number ASCII as punctuation.
130 | // Characters such as "^", "$", and "`" are not in the Unicode
131 | // Punctuation class but we treat them as punctuation anyways, for
132 | // consistency.
133 | if ((cp >= 33 && cp <= 47) || (cp >= 58 && cp <= 64) || (cp >= 91 && cp <= 96) || (cp >= 123 && cp <= 126)) {
134 | return true;
135 | }
136 | int charType = Character.getType(c);
137 | if (Character.CONNECTOR_PUNCTUATION == charType || Character.DASH_PUNCTUATION == charType
138 | || Character.END_PUNCTUATION == charType || Character.FINAL_QUOTE_PUNCTUATION == charType
139 | || Character.INITIAL_QUOTE_PUNCTUATION == charType || Character.OTHER_PUNCTUATION == charType
140 | || Character.START_PUNCTUATION == charType) {
141 | return true;
142 | }
143 | return false;
144 | }
145 |
146 | private static boolean _is_whitespace(char c) {
147 | // Checks whether `chars` is a whitespace character.
148 | 		// \t, \n, and \r are technically control characters but we treat them
149 | // as whitespace since they are generally considered as such.
150 | if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
151 | return true;
152 | }
153 |
154 | int charType = Character.getType(c);
155 | if (Character.SPACE_SEPARATOR == charType) {
156 | return true;
157 | }
158 | return false;
159 | }
160 |
161 | private static boolean _is_control(char c) {
162 | // Checks whether `chars` is a control character.
163 | // These are technically control characters but we count them as whitespace
164 | // characters.
165 | if (c == '\t' || c == '\n' || c == '\r') {
166 | return false;
167 | }
168 |
169 | int charType = Character.getType(c);
170 | if (Character.CONTROL == charType || Character.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR == charType
171 | || Character.FORMAT == charType || Character.PRIVATE_USE == charType || Character.SURROGATE == charType
172 | || Character.UNASSIGNED == charType) {
173 | return true;
174 | }
175 | return false;
176 | }
177 |
178 | private static boolean _is_chinese_char(int cp) {
179 | 		// Checks whether CP is the codepoint of a CJK character.
180 | // This defines a "chinese character" as anything in the CJK Unicode block:
181 | // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
182 | //
183 | // Note that the CJK Unicode block is NOT all Japanese and Korean characters,
184 | // despite its name. The modern Korean Hangul alphabet is a different block,
185 | // as is Japanese Hiragana and Katakana. Those alphabets are used to write
186 | // space-separated words, so they are not treated specially and handled
187 | 		// like all of the other languages.
188 | if ((cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0x3400 && cp <= 0x4DBF) || (cp >= 0x20000 && cp <= 0x2A6DF)
189 | || (cp >= 0x2A700 && cp <= 0x2B73F) || (cp >= 0x2B740 && cp <= 0x2B81F)
190 | || (cp >= 0x2B820 && cp <= 0x2CEAF) || (cp >= 0xF900 && cp <= 0xFAFF)
191 | || (cp >= 0x2F800 && cp <= 0x2FA1F)) {
192 | return true;
193 | }
194 |
195 | return false;
196 | }
197 | }
198 |
--------------------------------------------------------------------------------