├── .classpath
├── .gitignore
├── .project
├── .settings
│   ├── org.eclipse.core.resources.prefs
│   ├── org.eclipse.jdt.core.prefs
│   ├── org.eclipse.m2e.core.prefs
│   ├── org.eclipse.wst.common.component
│   ├── org.eclipse.wst.common.project.facet.core.xml
│   └── org.eclipse.wst.validation.prefs
├── README.md
├── pom.xml
└── src
    └── main
        ├── java
        │   └── com
        │       └── ankit
        │           └── bert
        │               ├── tokenizer
        │               │   └── Tokenizer.java
        │               ├── tokenizerimpl
        │               │   ├── BasicTokenizer.java
        │               │   ├── BertTokenizer.java
        │               │   └── WordpieceTokenizer.java
        │               └── utils
        │                   └── TokenizerUtils.java
        └── resources
            └── vocab.txt

--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/target/
.idea

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
    <name>aiserver</name>
    <comment></comment>
    <projects>
    </projects>
    <buildSpec>
        <buildCommand>
            <name>org.eclipse.wst.common.project.facet.core.builder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.jdt.core.javabuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.wst.validation.validationbuilder</name>
            <arguments>
            </arguments>
        </buildCommand>
        <buildCommand>
            <name>org.eclipse.m2e.core.maven2Builder</name>
            <arguments>
            </arguments>
        </buildCommand>
    </buildSpec>
    <natures>
        <nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
        <nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature>
        <nature>org.eclipse.jdt.core.javanature</nature>
        <nature>org.eclipse.m2e.core.maven2Nature</nature>
        <nature>org.eclipse.wst.common.project.facet.core.nature</nature>
    </natures>
</projectDescription>

--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding//src/test/resources=UTF-8
encoding/<project>=UTF-8

--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.debug.lineNumber=generate
org.eclipse.jdt.core.compiler.debug.localVariable=generate
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
org.eclipse.jdt.core.compiler.release=disabled
org.eclipse.jdt.core.compiler.source=1.8

--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1

--------------------------------------------------------------------------------
/.settings/org.eclipse.wst.common.component:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.settings/org.eclipse.wst.common.project.facet.core.xml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.settings/org.eclipse.wst.validation.prefs:
--------------------------------------------------------------------------------
disabled=06target
eclipse.preferences.version=1

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Bert Tokenizer

This repository contains a Java implementation of the BERT tokenizer. The implementation follows the tokenizer in the Hugging Face Transformers library:

https://huggingface.co/transformers/main_classes/tokenizer.html

## Usage

To get tokens from text:

```
String text = "Text to tokenize";
BertTokenizer tokenizer = new BertTokenizer();
List<String> tokens = tokenizer.tokenize(text);
```

To get token ids using the BERT vocab:

```
List<Integer> token_ids = tokenizer.convert_tokens_to_ids(tokens);
```
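`BertTokenizer` (see `BertTokenizer.java` below) also exposes two small helpers the snippet above can be extended with: `convert_tokens_to_string`, which strips the WordPiece `##` prefixes and joins the tokens with spaces, and `vocab_size`. A minimal follow-on example:

```
// Strips "##" prefixes and joins tokens with single spaces.
String joined = tokenizer.convert_tokens_to_string(tokens);

// Number of entries loaded from vocab.txt.
int vocab_size = tokenizer.vocab_size();
```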
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.ankit</groupId>
    <artifactId>berttokenizer</artifactId>
    <version>0.0.1</version>
    <name>BertTokenizer</name>
    <description>Java Implementation of Hugging Face Bert Tokenizer</description>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        4.12
        <log4j.version>2.13.2</log4j.version>
        2.25.1
    </properties>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.12</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>${log4j.version}</version>
        </dependency>
    </dependencies>
</project>

--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/tokenizer/Tokenizer.java:
--------------------------------------------------------------------------------
package com.ankit.bert.tokenizer;

import java.util.List;

public interface Tokenizer {

    public List<String> tokenize(String text);

}

--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/tokenizerimpl/BasicTokenizer.java:
--------------------------------------------------------------------------------
package com.ankit.bert.tokenizerimpl;

import java.util.ArrayList;
import java.util.List;

import com.ankit.bert.tokenizer.Tokenizer;
import com.ankit.bert.utils.TokenizerUtils;

public class BasicTokenizer implements Tokenizer {
    private boolean do_lower_case = true;
    private List<String> never_split = new ArrayList<String>();
    private boolean tokenize_chinese_chars = true;

    public BasicTokenizer(boolean do_lower_case, List<String> never_split, boolean tokenize_chinese_chars) {
        this.do_lower_case = do_lower_case;
        if (never_split == null) {
            never_split = new ArrayList<String>();
        }
        this.never_split = never_split;
        this.tokenize_chinese_chars = tokenize_chinese_chars;
    }

    public BasicTokenizer() {
    }

    @Override
    public List<String> tokenize(String text) {
        text = TokenizerUtils.clean_text(text);
        if (tokenize_chinese_chars) {
            text = TokenizerUtils.tokenize_chinese_chars(text);
        }
        List<String> orig_tokens = TokenizerUtils.whitespace_tokenize(text);

        List<String> split_tokens = new ArrayList<String>();
        for (String token : orig_tokens) {
            // Lower-casing and accent stripping only apply when enabled and the
            // token is not protected by never_split; splitting on punctuation
            // always happens.
            if (do_lower_case && !never_split.contains(token)) {
                token = TokenizerUtils.run_strip_accents(token.toLowerCase());
            }
            split_tokens.addAll(TokenizerUtils.run_split_on_punc(token, never_split));
        }
        return TokenizerUtils.whitespace_tokenize(String.join(" ", split_tokens));
    }

}
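As a quick illustration of what `BasicTokenizer` produces before WordPiece runs, here is a hypothetical example using the default configuration (lower-casing on, empty `never_split`); the expected output follows directly from the cleaning, accent-stripping, and punctuation-splitting steps above:

```
BasicTokenizer basic = new BasicTokenizer();

// Lower-cases, strips the accent from "é", and splits off punctuation:
// "Héllo, World!" -> ["hello", ",", "world", "!"]
List<String> tokens = basic.tokenize("Héllo, World!");
```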
--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/tokenizerimpl/BertTokenizer.java:
--------------------------------------------------------------------------------
package com.ankit.bert.tokenizerimpl;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import com.ankit.bert.tokenizer.Tokenizer;
import com.ankit.bert.utils.TokenizerUtils;

import lombok.extern.log4j.Log4j2;

/**
 * Constructs a BERT tokenizer. Based on WordPiece.
 *
 * This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer`, which
 * contains most of the methods. Users should refer to the superclass for more
 * information regarding those methods.
 *
 * Args:
 *
 * vocab_file (:obj:`string`): File containing the vocabulary.
 *
 * do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to
 * lowercase the input when tokenizing.
 *
 * do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether
 * to do basic tokenization before WordPiece.
 *
 * never_split (:obj:`List[string]`, `optional`): List of tokens which will
 * never be split during tokenization. Only has an effect when
 * :obj:`do_basic_tokenize=True`.
 *
 * unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): The unknown
 * token. A token that is not in the vocabulary cannot be converted to an ID and
 * is set to be this token instead.
 *
 * sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): The separator
 * token, which is used when building a sequence from multiple sequences, e.g.
 * two sequences for sequence classification, or a text and a question for
 * question answering. It is also used as the last token of a sequence built
 * with special tokens.
 *
 * pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): The token used
 * for padding, for example when batching sequences of different lengths.
 *
 * cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): The classifier
 * token which is used when doing sequence classification (classification of the
 * whole sequence instead of per-token classification). It is the first token of
 * the sequence when built with special tokens.
 *
 * mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): The token used
 * for masking values. This is the token used when training this model with
 * masked language modeling. This is the token which the model will try to
 * predict.
 *
 * tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
 * Whether to tokenize Chinese characters. This should likely be deactivated for
 * Japanese; see: https://github.com/huggingface/transformers/issues/328
 */
@Log4j2
public class BertTokenizer implements Tokenizer {

    private String vocab_file = "vocab.txt";
    private Map<String, Integer> token_id_map;
    private Map<Integer, String> id_token_map;
    private boolean do_lower_case = true;
    private boolean do_basic_tokenize = true;
    private List<String> never_split = new ArrayList<String>();
    private String unk_token = "[UNK]";
    private String sep_token = "[SEP]";
    private String pad_token = "[PAD]";
    private String cls_token = "[CLS]";
    private String mask_token = "[MASK]";
    private boolean tokenize_chinese_chars = true;
    private BasicTokenizer basic_tokenizer;
    private WordpieceTokenizer wordpiece_tokenizer;

    private static final int MAX_LEN = 512;

    public BertTokenizer(String vocab_file, boolean do_lower_case, boolean do_basic_tokenize,
            List<String> never_split, String unk_token, String sep_token, String pad_token, String cls_token,
            String mask_token, boolean tokenize_chinese_chars) {
        this.vocab_file = vocab_file;
        this.do_lower_case = do_lower_case;
        this.do_basic_tokenize = do_basic_tokenize;
        this.never_split = never_split;
        this.unk_token = unk_token;
        this.sep_token = sep_token;
        this.pad_token = pad_token;
        this.cls_token = cls_token;
        this.mask_token = mask_token;
        this.tokenize_chinese_chars = tokenize_chinese_chars;
        init();
    }

    public BertTokenizer() {
        init();
    }

    private void init() {
        try {
            this.token_id_map = load_vocab(vocab_file);
        } catch (IOException e) {
            log.error("Unable to load vocab due to: ", e);
        }
        this.id_token_map = new HashMap<Integer, String>();
        for (String key : token_id_map.keySet()) {
            this.id_token_map.put(token_id_map.get(key), key);
        }

        if (do_basic_tokenize) {
            this.basic_tokenizer = new BasicTokenizer(do_lower_case, never_split, tokenize_chinese_chars);
        }
        this.wordpiece_tokenizer = new WordpieceTokenizer(token_id_map, unk_token);
    }

    private Map<String, Integer> load_vocab(String vocab_file_name) throws IOException {
        ClassLoader classloader = Thread.currentThread().getContextClassLoader();
        InputStream file = classloader.getResourceAsStream(vocab_file_name);
        return TokenizerUtils.generateTokenIdMap(file);
    }

    /**
     * Tokenizes a piece of text into its word pieces.
     *
     * This uses a greedy longest-match-first algorithm to perform tokenization
     * using the given vocabulary.
     *
     * For example: input = "unaffable", output = ["un", "##aff", "##able"]
     *
     * Args: text: A single token or whitespace-separated tokens. This should have
     * already been passed through `BasicTokenizer`.
     *
     * Returns: A list of wordpiece tokens.
     */
    @Override
    public List<String> tokenize(String text) {
        List<String> split_tokens = new ArrayList<String>();
        if (do_basic_tokenize) {
            for (String token : basic_tokenizer.tokenize(text)) {
                for (String sub_token : wordpiece_tokenizer.tokenize(token)) {
                    split_tokens.add(sub_token);
                }
            }
        } else {
            split_tokens = wordpiece_tokenizer.tokenize(text);
        }
        return split_tokens;
    }

    public String convert_tokens_to_string(List<String> tokens) {
        // Converts a sequence of tokens (string) into a single string.
        return tokens.stream().map(s -> s.replace("##", "")).collect(Collectors.joining(" "));
    }

    public List<Integer> convert_tokens_to_ids(List<String> tokens) {
        List<Integer> output = new ArrayList<Integer>();
        for (String s : tokens) {
            output.add(token_id_map.get(s));
        }
        return output;
    }

    public int vocab_size() {
        return token_id_map.size();
    }
}
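`tokenize` returns only the word pieces; the class defines the `[CLS]`/`[SEP]` strings but never adds them for you, and it has no helper for building model inputs. A minimal caller-side sketch (hypothetical code, not part of this repository) that wraps a single sequence with the special tokens, assuming they appear in the bundled `vocab.txt` as in standard BERT vocabularies:

```
BertTokenizer tokenizer = new BertTokenizer();

List<String> tokens = new ArrayList<>();
tokens.add("[CLS]");                                   // classifier token, first position
tokens.addAll(tokenizer.tokenize("Text to tokenize"));
tokens.add("[SEP]");                                   // separator token, last position

// [CLS] and [SEP] are ordinary vocab.txt entries, so the normal lookup applies.
List<Integer> input_ids = tokenizer.convert_tokens_to_ids(tokens);
```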
--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/tokenizerimpl/WordpieceTokenizer.java:
--------------------------------------------------------------------------------
package com.ankit.bert.tokenizerimpl;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import com.ankit.bert.tokenizer.Tokenizer;
import com.ankit.bert.utils.TokenizerUtils;

public class WordpieceTokenizer implements Tokenizer {
    private Map<String, Integer> vocab;
    private String unk_token;
    private int max_input_chars_per_word;

    public WordpieceTokenizer(Map<String, Integer> vocab, String unk_token, int max_input_chars_per_word) {
        this.vocab = vocab;
        this.unk_token = unk_token;
        this.max_input_chars_per_word = max_input_chars_per_word;
    }

    public WordpieceTokenizer(Map<String, Integer> vocab, String unk_token) {
        this.vocab = vocab;
        this.unk_token = unk_token;
        this.max_input_chars_per_word = 100;
    }

    @Override
    public List<String> tokenize(String text) {
        /**
         * Tokenizes a piece of text into its word pieces.
         *
         * This uses a greedy longest-match-first algorithm to perform tokenization
         * using the given vocabulary.
         *
         * For example: input = "unaffable", output = ["un", "##aff", "##able"]
         *
         * Args: text: A single token or whitespace-separated tokens. This should have
         * already been passed through `BasicTokenizer`.
         *
         * Returns: A list of wordpiece tokens.
         */
        List<String> output_tokens = new ArrayList<String>();
        for (String token : TokenizerUtils.whitespace_tokenize(text)) {
            if (token.length() > max_input_chars_per_word) {
                output_tokens.add(unk_token);
                continue;
            }
            boolean is_bad = false;
            int start = 0;

            List<String> sub_tokens = new ArrayList<String>();
            while (start < token.length()) {
                int end = token.length();
                String cur_substr = "";
                while (start < end) {
                    String substr = token.substring(start, end);
                    if (start > 0) {
                        substr = "##" + substr;
                    }
                    if (vocab.containsKey(substr)) {
                        cur_substr = substr;
                        break;
                    }
                    end -= 1;
                }
                if (cur_substr.isEmpty()) {
                    // No vocab entry matched any prefix of the remaining characters.
                    is_bad = true;
                    break;
                }
                sub_tokens.add(cur_substr);
                start = end;
            }
            if (is_bad) {
                output_tokens.add(unk_token);
            } else {
                output_tokens.addAll(sub_tokens);
            }
        }
        return output_tokens;
    }
}
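The greedy longest-match-first behaviour is easiest to see with a toy vocabulary. The map below is made up purely for illustration (real usage passes the map loaded from `vocab.txt`); given these entries, the tokenizer prefers the single piece `##affable` over `##aff` + `##able`:

```
Map<String, Integer> vocab = new HashMap<>();
vocab.put("un", 0);
vocab.put("##aff", 1);
vocab.put("##able", 2);
vocab.put("##affable", 3);

WordpieceTokenizer wp = new WordpieceTokenizer(vocab, "[UNK]");

// Longest match wins at each step: ["un", "##affable"]
List<String> pieces = wp.tokenize("unaffable");

// A word with no matching prefix in the vocab collapses to the unknown token: ["[UNK]"]
List<String> unknown = wp.tokenize("xyz");
```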
--------------------------------------------------------------------------------
/src/main/java/com/ankit/bert/utils/TokenizerUtils.java:
--------------------------------------------------------------------------------
package com.ankit.bert.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.Normalizer;
import java.text.Normalizer.Form;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TokenizerUtils {

    public static String clean_text(String text) {
        // Performs invalid character removal and whitespace cleanup on text.
        StringBuilder output = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            Character c = text.charAt(i);
            int cp = (int) c;
            if (cp == 0 || cp == 0xFFFD || _is_control(c)) {
                continue;
            }
            if (_is_whitespace(c)) {
                output.append(" ");
            } else {
                output.append(c);
            }
        }
        return output.toString();
    }

    public static String tokenize_chinese_chars(String text) {
        // Adds whitespace around any CJK character.
        StringBuilder output = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            Character c = text.charAt(i);
            int cp = (int) c;
            if (_is_chinese_char(cp)) {
                output.append(" ");
                output.append(c);
                output.append(" ");
            } else {
                output.append(c);
            }
        }
        return output.toString();
    }

    public static List<String> whitespace_tokenize(String text) {
        // Runs basic whitespace cleaning and splitting on a piece of text.
        text = text.trim();
        if (!text.isEmpty()) {
            return Arrays.asList(text.split("\\s+"));
        }
        return new ArrayList<String>();
    }

    public static String run_strip_accents(String token) {
        // Strips accents (non-spacing combining marks) from a piece of text.
        token = Normalizer.normalize(token, Form.NFD);
        StringBuilder output = new StringBuilder();
        for (int i = 0; i < token.length(); i++) {
            Character c = token.charAt(i);
            if (Character.NON_SPACING_MARK != Character.getType(c)) {
                output.append(c);
            }
        }
        return output.toString();
    }

    public static List<String> run_split_on_punc(String token, List<String> never_split) {
        // Splits punctuation on a piece of text.
        List<String> output = new ArrayList<String>();
        if (never_split != null && never_split.contains(token)) {
            output.add(token);
            return output;
        }

        boolean start_new_word = true;
        StringBuilder str = new StringBuilder();
        for (int i = 0; i < token.length(); i++) {
            Character c = token.charAt(i);
            if (_is_punctuation(c)) {
                if (str.length() > 0) {
                    output.add(str.toString());
                    str.setLength(0);
                }
                output.add(c.toString());
                start_new_word = true;
            } else {
                if (start_new_word && str.length() > 0) {
                    output.add(str.toString());
                    str.setLength(0);
                }
                start_new_word = false;
                str.append(c);
            }
        }
        if (str.length() > 0) {
            output.add(str.toString());
        }
        return output;
    }

    public static Map<String, Integer> generateTokenIdMap(InputStream file) throws IOException {
        // Reads a vocabulary file with one token per line; the line index becomes the token id.
        HashMap<String, Integer> token_id_map = new HashMap<String, Integer>();
        if (file == null) {
            return token_id_map;
        }

        try (BufferedReader br = new BufferedReader(new InputStreamReader(file))) {
            String line;
            int index = 0;
            while ((line = br.readLine()) != null) {
                token_id_map.put(line, index);
                index += 1;
            }
        }
        return token_id_map;
    }

    private static boolean _is_punctuation(char c) {
        // Checks whether `c` is a punctuation character.
        int cp = (int) c;
        // We treat all non-letter/number ASCII as punctuation.
        // Characters such as "^", "$", and "`" are not in the Unicode
        // Punctuation class but we treat them as punctuation anyway, for
        // consistency.
        if ((cp >= 33 && cp <= 47) || (cp >= 58 && cp <= 64) || (cp >= 91 && cp <= 96) || (cp >= 123 && cp <= 126)) {
            return true;
        }
        int charType = Character.getType(c);
        if (Character.CONNECTOR_PUNCTUATION == charType || Character.DASH_PUNCTUATION == charType
                || Character.END_PUNCTUATION == charType || Character.FINAL_QUOTE_PUNCTUATION == charType
                || Character.INITIAL_QUOTE_PUNCTUATION == charType || Character.OTHER_PUNCTUATION == charType
                || Character.START_PUNCTUATION == charType) {
            return true;
        }
        return false;
    }

    private static boolean _is_whitespace(char c) {
        // Checks whether `c` is a whitespace character.
        // \t, \n, and \r are technically control characters but we treat them
        // as whitespace since they are generally considered as such.
        if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
            return true;
        }

        int charType = Character.getType(c);
        if (Character.SPACE_SEPARATOR == charType) {
            return true;
        }
        return false;
    }

    private static boolean _is_control(char c) {
        // Checks whether `c` is a control character.
        // \t, \n, and \r are technically control characters but we count them as
        // whitespace characters.
        if (c == '\t' || c == '\n' || c == '\r') {
            return false;
        }

        int charType = Character.getType(c);
        if (Character.CONTROL == charType || Character.DIRECTIONALITY_COMMON_NUMBER_SEPARATOR == charType
                || Character.FORMAT == charType || Character.PRIVATE_USE == charType || Character.SURROGATE == charType
                || Character.UNASSIGNED == charType) {
            return true;
        }
        return false;
    }

    private static boolean _is_chinese_char(int cp) {
        // Checks whether cp is the codepoint of a CJK character.
        // This defines a "chinese character" as anything in the CJK Unicode block:
        // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        //
        // Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        // despite its name. The modern Korean Hangul alphabet is a different block,
        // as are Japanese Hiragana and Katakana. Those alphabets are used to write
        // space-separated words, so they are not treated specially and are handled
        // like all of the other languages.
        if ((cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0x3400 && cp <= 0x4DBF) || (cp >= 0x20000 && cp <= 0x2A6DF)
                || (cp >= 0x2A700 && cp <= 0x2B73F) || (cp >= 0x2B740 && cp <= 0x2B81F)
                || (cp >= 0x2B820 && cp <= 0x2CEAF) || (cp >= 0xF900 && cp <= 0xFAFF)
                || (cp >= 0x2F800 && cp <= 0x2FA1F)) {
            return true;
        }

        return false;
    }
}

--------------------------------------------------------------------------------
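A few of the `TokenizerUtils` helpers in action (hypothetical snippet; the expected values follow directly from the methods above):

```
// clean_text maps tabs/newlines to spaces and drops control characters: "Hello world "
String cleaned = TokenizerUtils.clean_text("Hello\tworld\n");

// whitespace_tokenize trims and splits on runs of whitespace: ["a", "b"]
List<String> words = TokenizerUtils.whitespace_tokenize("  a  b ");

// run_split_on_punc isolates punctuation unless the token is in never_split: ["can", "'", "t"]
List<String> parts = TokenizerUtils.run_split_on_punc("can't", new ArrayList<String>());
```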