├── .gitignore ├── CMakeLists.txt ├── README.md ├── src ├── tokenizer.cpp ├── tokenizer.h └── unilib │ ├── unicode.cpp │ ├── unicode.h │ ├── uninorms.cpp │ ├── uninorms.h │ ├── unistrip.cpp │ ├── unistrip.h │ ├── utf16.cpp │ ├── utf16.h │ ├── utf8.cpp │ ├── utf8.h │ ├── version.cpp │ └── version.h └── test └── main.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | /cmake-build-debug/ 2 | .idea 3 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0 FATAL_ERROR) 2 | project(Tokenizer) 3 | message(STATUS "start running cmake...") 4 | find_package(Boost 1.75.0 COMPONENTS system filesystem REQUIRED) 5 | set(CMAKE_PREFIX_PATH "/home/wuyunzhao/nptextforcpp/libtorch") 6 | set(Torch_DIR "/home/wuyunzhao/nptextforcpp/libtorch/share/cmake/Torch") 7 | find_package(Torch REQUIRED) 8 | if (Boost_FOUND) 9 | 10 | message(STATUS "Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}") 11 | message(STATUS "Boost_LIBRARIES: ${Boost_LIBRARIES}") 12 | message(STATUS "Boost_VERSION: ${Boost_VERSION}") 13 | 14 | include_directories(${Boost_INCLUDE_DIRS}) 15 | 16 | endif () 17 | 18 | add_executable(${PROJECT_NAME} test/main.cpp src/tokenizer.cpp src/unilib/unicode.cpp src/unilib/uninorms.cpp) 19 | 20 | if(Boost_FOUND) 21 | 22 | target_link_libraries(Tokenizer ${Boost_LIBRARIES}) 23 | 24 | endif() 25 | target_link_libraries(Tokenizer ${TORCH_LIBRARIES}) 26 | target_include_directories(${PROJECT_NAME} PUBLIC "src/unilib") 27 | target_include_directories(${PROJECT_NAME} PUBLIC "src") 28 | 29 | set_property(TARGET ${PROJECT_NAME} PROPERTY CXX_STANDARD 14) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Huggingface Transformers Tokenizer in C++ 2 | 3 | A 
tokenizer is in charge of preparing the inputs for a model. 4 | 5 | The tokenizer can tokenize Chinese-English bilingual in Linux. 6 | 7 | This project mainly solves some Chinese character encoding problems. 8 | 9 | Requirements 10 | 11 | - [x] Boost 12 | 13 | ### C++ unicode support 14 | - http://github.com/ufal/unilib -------------------------------------------------------------------------------- /src/tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "unicode.h" 8 | #include "uninorms.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "tokenizer.h" 15 | 16 | using namespace std; 17 | using namespace ufal::unilib; 18 | using namespace boost; 19 | using namespace spirit::qi; 20 | 21 | 22 | map categories = { 23 | {"Lu", unicode::Lu}, 24 | {"Ll", unicode::Ll}, 25 | {"Lt", unicode::Lt}, 26 | {"Lm", unicode::Lm}, 27 | {"Lo", unicode::Lo}, 28 | {"Mn", unicode::Mn}, 29 | {"Mc", unicode::Mc}, 30 | {"Me", unicode::Me}, 31 | {"Nd", unicode::Nd}, 32 | {"Nl", unicode::Nl}, 33 | {"No", unicode::No}, 34 | {"Pc", unicode::Pc}, 35 | {"Pd", unicode::Pd}, 36 | {"Ps", unicode::Ps}, 37 | {"Pe", unicode::Pe}, 38 | {"Pi", unicode::Pi}, 39 | {"Pf", unicode::Pf}, 40 | {"Po", unicode::Po}, 41 | {"Sm", unicode::Sm}, 42 | {"Sc", unicode::Sc}, 43 | {"Sk", unicode::Sk}, 44 | {"So", unicode::So}, 45 | {"Zs", unicode::Zs}, 46 | {"Zl", unicode::Zl}, 47 | {"Zp", unicode::Zp}, 48 | {"Cc", unicode::Cc}, 49 | {"Cf", unicode::Cf}, 50 | {"Cs", unicode::Cs}, 51 | {"Co", unicode::Co}, 52 | {"Cn", unicode::Cn}, 53 | }; 54 | 55 | map categories_rev; 56 | 57 | std::string ltrim(std::string str) 58 | { 59 | return regex_replace(str, regex("^\\s+"), std::string("")); 60 | } 61 | 62 | std::string rtrim(std::string str) 63 | { 64 | return regex_replace(str, regex("\\s+$"), std::string("")); 65 | } 66 | 67 | std::string trim(std::string 
str)
68 | {
69 |     return ltrim(rtrim(str));
70 | }
71 |
72 | vector split(const std::string &str, char delimiter)
73 | {
74 |     vector internal;
75 |     std::stringstream ss(str); // Turn the std::string into a stream.
76 |     std::string tok;
77 |
78 |     while (getline(ss, tok, delimiter))
79 |     {
80 |         internal.push_back(tok);
81 |     }
82 |     return internal;
83 | }
84 |
85 | map read_vocab(const char *filename)
86 | {
87 |     map vocab;
88 |     int index = 0;
89 |     unsigned int line_count = 1;
90 |     ifstream fs8(filename);
91 |     if (!fs8.is_open())
92 |     {
93 |         cout << "Could not open " << filename << endl;
94 |         return vocab;
95 |     }
96 |     std::string line;
97 |     // Read all the lines in the file
98 |     while (getline(fs8, line))
99 |     {
100 |         // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
101 |         // std::string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
102 |         vocab.insert(pair(std::string(line.begin(), line.end()), index));
103 |         index++;
104 |         line_count++;
105 |     }
106 |     return vocab;
107 | }
108 |
109 | vector whitespace_tokenize(std::string text)
110 | {
111 |     vector result;
112 |     char delimeter = ' ';
113 |     text = trim(text);
114 |     if (text == "")
115 |     {
116 |         return result;
117 |     }
118 |     result = split(text, delimeter);
119 |     return result;
120 | }
121 |
122 | bool _is_whitespace(char letter)
123 | {
124 |     if (letter == ' ' or letter == '\t' or letter == '\n' or letter == '\r')
125 |         return true;
126 |     long int cat = unicode::category(int(letter));
127 |     if (cat == categories["Zs"])
128 |         return true;
129 |     return false;
130 | }
131 | // Returns true when `letter` is an ASCII control byte (Unicode category C*); \t, \n and \r count as whitespace, not control.
132 | bool _is_control(char letter)
133 | {
134 |     if (letter == '\t' or letter == '\n' or letter == '\r')
135 |         return false;
136 |     // Bytes >= 0x80 are pieces of multi-byte UTF-8 sequences, never control characters on their own.
137 |     if (static_cast<unsigned char>(letter) >= 0x80)
138 |         return false;
139 |     // Test the category bitmask directly: categories_rev is not populated anywhere in the code visible here, so the old name lookup could never match.
140 |     return (unicode::category(static_cast<unsigned char>(letter)) & unicode::C) != 0;
141 | }
142 |
143 | bool _is_punctuation(char letter)
144 | {
145 |     int cp = int(letter);
146
| if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 147 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)) 148 | return true; 149 | unicode::category_t cat = unicode::category(int(letter)); 150 | std::string cat_ = categories_rev[cat]; 151 | if (cat_[0] == 'P') 152 | return true; 153 | return false; 154 | } 155 | 156 | std::string BasicTokenizer::_clean_text(std::string text) 157 | { 158 | std::string output; 159 | int len = 0; 160 | char *char_array = new char[text.length() + 1]; 161 | strcpy(char_array, text.c_str()); 162 | while (char_array[len] != '\0') 163 | { 164 | int cp = int(char_array[len]); 165 | if (cp == 0 or cp == 0xfffd or _is_control(char_array[len])) 166 | continue; 167 | if (_is_whitespace(char_array[len])) 168 | output = output + " "; 169 | else 170 | output = output + char_array[len]; 171 | ++len; 172 | } 173 | return output; 174 | } 175 | 176 | vector BasicTokenizer::_run_split_on_punc(std::string text) 177 | { 178 | // vector never_split = {"[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"}; 179 | if (find(never_split_.begin(), never_split_.end(), text) != never_split_.end()) 180 | { 181 | vector temp = {text}; 182 | return temp; 183 | } 184 | int len_char_array = text.length(); 185 | char *char_array = new char[text.length() + 1]; 186 | strcpy(char_array, text.c_str()); 187 | int i = 0; 188 | bool start_new_word = true; 189 | vector> output; 190 | while (i < len_char_array) 191 | { 192 | char letter = char_array[i]; 193 | if (_is_punctuation(letter)) 194 | { 195 | vector temp = {letter}; 196 | output.push_back(temp); 197 | start_new_word = true; 198 | } 199 | else 200 | { 201 | if (start_new_word) 202 | { 203 | vector temp_2; 204 | output.push_back(temp_2); 205 | } 206 | start_new_word = false; 207 | output.back().push_back(letter); 208 | } 209 | i += 1; 210 | } 211 | vector final_output; 212 | vector>::iterator ptr; 213 | for (ptr = output.begin(); ptr < output.end(); ptr++) 214 | { 215 | vector out = *ptr; 216 | std::string word = 
""; 217 | vector::iterator itr; 218 | for (itr = out.begin(); itr < out.end(); itr++) 219 | { 220 | word = word + *itr; 221 | } 222 | final_output.push_back(word); 223 | } 224 | return final_output; 225 | } 226 | 227 | std::string BasicTokenizer::_run_strip_accents(std::string text) 228 | { 229 | wstring_convert, char32_t> conv; 230 | auto temp = conv.from_bytes(text); 231 | auto nfd = [](u32string str) 232 | { 233 | uninorms::nfd(str); 234 | return str; 235 | }; 236 | auto text_ = nfd(temp); 237 | std::string output; 238 | int i = 0; 239 | int len_char_array = text_.length() + 1; 240 | char *char_array = new char[text_.length() + 1]; 241 | int j; 242 | for (j = 0; j < len_char_array; j++) 243 | { 244 | char_array[j] = text_[j]; 245 | } 246 | while (i < len_char_array) 247 | { 248 | long int cat = unicode::category(int(char_array[i])); 249 | if (cat == categories["Mn"]) 250 | { 251 | i++; 252 | continue; 253 | } 254 | // if (_is_punctuation(char_array[i])) 255 | // { 256 | // i++; 257 | // continue; 258 | // } 259 | output = output + char_array[i]; 260 | i++; 261 | } 262 | return output; 263 | } 264 | 265 | std::string BasicTokenizer::utf8chr(int cp) 266 | { 267 | char c[5] = {0x00, 0x00, 0x00, 0x00, 0x00}; 268 | if (cp <= 0x7F) { c[0] = cp; } 269 | else if (cp <= 0x7FF) 270 | { 271 | c[0] = (cp >> 6) + 192; 272 | c[1] = (cp & 63) + 128; 273 | } 274 | else if (0xd800 <= cp && cp <= 0xdfff) {} //invalid block of utf8 275 | else if (cp <= 0xFFFF) 276 | { 277 | c[0] = (cp >> 12) + 224; 278 | c[1] = ((cp >> 6) & 63) + 128; 279 | c[2] = (cp & 63) + 128; 280 | } 281 | else if (cp <= 0x10FFFF) 282 | { 283 | c[0] = (cp >> 18) + 240; 284 | c[1] = ((cp >> 12) & 63) + 128; 285 | c[2] = ((cp >> 6) & 63) + 128; 286 | c[3] = (cp & 63) + 128; 287 | } 288 | return std::string(c); 289 | } 290 | 291 | std::string BasicTokenizer:: 292 | _tokenize_chinese_chars(std::string text) 293 | { 294 | auto &&utf8_text = text; 295 | u8_to_u32_iterator 296 | tbegin(utf8_text.begin()), 
tend(utf8_text.end()); 297 | vector result; 298 | parse(tbegin, tend, *standard_wide::char_, result); 299 | std::string output; 300 | for (auto &&code_point : result) 301 | { 302 | int cp = code_point; 303 | if (_is_chinese_char(cp)) 304 | { 305 | output += " "; 306 | output.append(utf8chr(code_point)); 307 | output += " "; 308 | } 309 | else 310 | { 311 | output.append(utf8chr(code_point)); 312 | } 313 | // ++len; 314 | } 315 | 316 | return output; 317 | } 318 | 319 | bool BasicTokenizer::_is_chinese_char(int cp) 320 | { 321 | if ( 322 | (cp >= 0x4E00 && cp <= 0x9FFF) 323 | || (cp >= 0x3400 && cp <= 0x4DBF) 324 | || (cp >= 0x20000 && cp <= 0x2A6DF) 325 | || (cp >= 0x2A700 && cp <= 0x2B73F) 326 | || (cp >= 0x2B740 && cp <= 0x2B81F) 327 | || (cp >= 0x2B820 && cp <= 0x2CEAF) 328 | || (cp >= 0xF900 && cp <= 0xFAFF) 329 | || (cp >= 0x2F800 && cp <= 0x2FA1F) || cp == 0x3002 || cp == 0xFF1F || cp == 0xFF01 || cp == 0xFF0C || 330 | cp == 0x3001 || cp == 0xFF1B || cp == 0xFF1A || cp == 0x300C || cp == 0x300D || cp == 0x300E || 331 | cp == 0x300F || cp == 0x2018 || cp == 0x2019 || cp == 0x201C || cp == 0x201D || cp == 0xFF08 || 332 | cp == 0xFF09 || cp == 0x3014 || cp == 0x3015 || cp == 0x3010 || cp == 0x3011 || cp == 0x2014 || 333 | cp == 0x2026 || cp == 0x2013 || cp == 0xFF0E || cp == 0x300A || cp == 0x300B || cp == 0x3008 || cp == 0x3009 334 | ) 335 | return true; 336 | else 337 | return false; 338 | } 339 | 340 | 341 | vector BasicTokenizer::tokenize(std::string text) 342 | { 343 | // text = _clean_text(text); 344 | text = _tokenize_chinese_chars(text); 345 | vector orig_tokens = whitespace_tokenize(text); 346 | vector split_tokens; 347 | vector::iterator itr; 348 | for (itr = orig_tokens.begin(); itr < orig_tokens.end(); itr++) 349 | { 350 | std::string temp = *itr; 351 | if (do_lower_case_ and not bool(find(never_split_.begin(), never_split_.end(), *itr) != never_split_.end())) 352 | { 353 | transform(temp.begin(), temp.end(), temp.begin(), [](unsigned char c) { 
return std::tolower(c); }); 354 | temp = _run_strip_accents(temp); 355 | } 356 | vector split = _run_split_on_punc(temp); 357 | split_tokens.insert(split_tokens.end(), split.begin(), split.end()); 358 | } 359 | std::string temp_text; 360 | vector::iterator ptr; 361 | for (ptr = split_tokens.begin(); ptr < split_tokens.end(); ptr++) 362 | { 363 | temp_text = temp_text + " " + *ptr; 364 | } 365 | return whitespace_tokenize(temp_text); 366 | } 367 | 368 | void BasicTokenizer::truncate_sequences( 369 | vector &tokens_A, vector &tokens_B, const char *truncation_strategy = "longest_first", 370 | int max_seq_length = 509) 371 | { 372 | int length = tokens_A.size() + tokens_B.size(); 373 | if (strcmp(truncation_strategy, "longest_first") == 0) 374 | { 375 | while (length > max_seq_length) 376 | { 377 | if (tokens_A.empty() || tokens_A.size() > tokens_B.size()) 378 | { 379 | tokens_A.pop_back(); 380 | } 381 | else 382 | { 383 | tokens_B.pop_back(); 384 | } 385 | --length; 386 | } 387 | } 388 | else if (strcmp(truncation_strategy, "only_first") == 0) 389 | { 390 | while (length > max_seq_length && !tokens_A.empty()) 391 | { 392 | tokens_A.pop_back(); 393 | --length; 394 | } 395 | } 396 | else if (strcmp(truncation_strategy, "only_second") == 0) 397 | { 398 | while (length > max_seq_length && !tokens_B.empty()) 399 | { 400 | tokens_B.pop_back(); 401 | --length; 402 | } 403 | } 404 | else if (strcmp(truncation_strategy, "do_not_truncate") == 0) 405 | { 406 | assert((length < max_seq_length)); 407 | } 408 | else 409 | { 410 | cerr << "invalid truncation strategy. 
skipping trancation" << endl; 411 | } 412 | } 413 | 414 | void WordpieceTokenizer::add_vocab(map vocab) 415 | { 416 | vocab_ = vocab; 417 | unk_token_ = "[UNK]"; 418 | max_input_chars_per_word_ = 100; 419 | } 420 | 421 | vector WordpieceTokenizer::tokenize(std::string text) 422 | { 423 | vector output_tokens; 424 | vector whitespace_tokens = whitespace_tokenize(text); 425 | vector::iterator ptr; 426 | for (ptr = whitespace_tokens.begin(); ptr < whitespace_tokens.end(); ptr++) 427 | { 428 | // cout<<*ptr<<"\n"; 429 | std::string token = *ptr; 430 | int len_char_array = token.length(); 431 | // cout << len_char_array < max_input_chars_per_word_) 435 | { 436 | output_tokens.push_back(unk_token_); 437 | continue; 438 | } 439 | // cout< sub_tokens; 443 | while (start < len_char_array) 444 | { 445 | int end = len_char_array; 446 | std::string cur_substr = ""; 447 | while (start < end) 448 | { 449 | std::string substr; 450 | for (int c = start; c < end; c++) 451 | substr = substr + char_array[c]; 452 | if (start > 0) 453 | substr = "##" + substr; 454 | if (vocab_.count(substr) == 1) 455 | { 456 | cur_substr = substr; 457 | break; 458 | } 459 | end = end - 1; 460 | } 461 | if (cur_substr == "") 462 | { 463 | is_bad = true; 464 | break; 465 | } 466 | sub_tokens.push_back(cur_substr); 467 | start = end; 468 | } 469 | if (is_bad) 470 | output_tokens.push_back(unk_token_); 471 | else 472 | { 473 | output_tokens.insert(output_tokens.end(), sub_tokens.begin(), sub_tokens.end()); 474 | } 475 | } 476 | return output_tokens; 477 | } 478 | 479 | 480 | void BertTokenizer::add_vocab(const char *vocab_file) 481 | { 482 | vocab = read_vocab(vocab_file); 483 | for (map::iterator i = vocab.begin(); i != vocab.end(); ++i) 484 | ids_to_tokens[i->second] = i->first; 485 | do_basic_tokenize_ = true; 486 | do_lower_case_ = false; 487 | wordpiece_tokenizer.add_vocab(vocab); 488 | maxlen_ = 512; 489 | } 490 | 491 | vector BertTokenizer::tokenize(std::string text) 492 | { 493 | vector 
split_tokens; 494 | if (do_basic_tokenize_) 495 | { 496 | vector temp_tokens = basic_tokenizer.tokenize(text); 497 | vector::iterator ptr; 498 | for (ptr = temp_tokens.begin(); ptr < temp_tokens.end(); ptr++) 499 | { 500 | vector subtokens = wordpiece_tokenizer.tokenize(*ptr); 501 | split_tokens.insert(split_tokens.end(), subtokens.begin(), subtokens.end()); 502 | } 503 | } 504 | else 505 | { 506 | split_tokens = wordpiece_tokenizer.tokenize(text); 507 | } 508 | return split_tokens; 509 | } 510 | 511 | vector BertTokenizer::convert_tokens_to_ids(vector tokens) 512 | { 513 | vector ids; 514 | vector::iterator ptr; 515 | for (ptr = tokens.begin(); ptr < tokens.end(); ptr++) 516 | { 517 | ids.push_back(float(vocab[*ptr])); 518 | } 519 | if (ids.size() > maxlen_) 520 | cout << "Token indices sequence length is longer than the specified maximum"; 521 | return ids; 522 | } 523 | 524 | void 525 | BertTokenizer::encode(std::string textA, std::string textB, vector &input_ids, vector &input_mask, 526 | vector &segment_ids, int max_seq_length, const char *truncation_strategy) 527 | { 528 | BasicTokenizer basictokenizer; 529 | vector tokens_A; 530 | vector words = basictokenizer.tokenize(textA); 531 | vector token; 532 | vector::iterator itr; 533 | for (itr = words.begin(); itr < words.end(); itr++) 534 | { 535 | token = this->tokenize(*itr); 536 | tokens_A.insert(tokens_A.end(), token.begin(), token.end()); 537 | } 538 | if (textB == "") 539 | { 540 | if (tokens_A.size() > max_seq_length - 2) 541 | { 542 | tokens_A.assign(tokens_A.begin(), tokens_A.begin() + max_seq_length - 2); 543 | } 544 | // insert "[CLS}" 545 | tokens_A.insert(tokens_A.begin(), "[CLS]"); 546 | // insert "[SEP]" 547 | tokens_A.push_back("[SEP]"); 548 | for (int i = 0; i < tokens_A.size(); i++) 549 | { 550 | segment_ids.push_back(0.0); 551 | input_mask.push_back(1.0); 552 | } 553 | input_ids = this->convert_tokens_to_ids(tokens_A); 554 | while (input_ids.size() < max_seq_length) 555 | { 556 | 
input_ids.push_back(0.0); 557 | input_mask.push_back(0.0); 558 | segment_ids.push_back(0.0); 559 | } 560 | } 561 | else 562 | { 563 | vector tokens_B; 564 | words = basictokenizer.tokenize(textB); 565 | for (itr = words.begin(); itr < words.end(); itr++) 566 | { 567 | token = this->tokenize(*itr); 568 | tokens_B.insert(tokens_B.end(), token.begin(), token.end()); 569 | } 570 | basictokenizer.truncate_sequences(tokens_A, tokens_B, truncation_strategy, max_seq_length - 3); 571 | // insert "[CLS}" 572 | tokens_A.insert(tokens_A.begin(), "[CLS]"); 573 | // insert "[SEP]" 574 | tokens_A.push_back("[SEP]"); 575 | for (int i = 0; i < tokens_A.size(); i++) 576 | { 577 | segment_ids.push_back(0.0); 578 | input_mask.push_back(1.0); 579 | } 580 | // insert "[SEP]" 581 | tokens_B.push_back("[SEP]"); 582 | for (int i = 0; i < tokens_B.size(); i++) 583 | { 584 | segment_ids.push_back(0.0); 585 | input_mask.push_back(1.0); 586 | } 587 | tokens_A.insert(tokens_A.end(), tokens_B.begin(), tokens_B.end()); 588 | // Padding 589 | input_ids = this->convert_tokens_to_ids(tokens_A); 590 | while (input_ids.size() < max_seq_length) 591 | { 592 | input_ids.push_back(0.0); 593 | input_mask.push_back(0.0); 594 | segment_ids.push_back(0.0); 595 | } 596 | } 597 | for (auto &&token:tokens_A) 598 | { 599 | cout << token << " "; 600 | } 601 | cout << endl; 602 | } 603 | 604 | -------------------------------------------------------------------------------- /src/tokenizer.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace std; 5 | 6 | vector whitespace_tokenize(string text); 7 | 8 | map read_vocab(const char *filename); 9 | 10 | class BasicTokenizer 11 | { 12 | public: 13 | bool do_lower_case_; 14 | vector never_split_; 15 | 16 | BasicTokenizer(bool do_lower_case = false, 17 | vector never_split = {"[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"}) 18 | { 19 | do_lower_case_ = do_lower_case; 20 | never_split_ = 
never_split; 21 | } 22 | 23 | string _clean_text(string text); 24 | 25 | vector _run_split_on_punc(string text); 26 | 27 | string _run_strip_accents(string text); 28 | 29 | string _tokenize_chinese_chars(string text); 30 | 31 | string utf8chr(int cp); 32 | 33 | bool _is_chinese_char(int cp); 34 | 35 | vector tokenize(string text); 36 | 37 | void truncate_sequences( 38 | vector &textA, vector &textB, const char *truncation_strategy, int max_seq_length); 39 | }; 40 | 41 | class WordpieceTokenizer 42 | { 43 | public: 44 | map vocab_; 45 | string unk_token_; 46 | int max_input_chars_per_word_; 47 | 48 | WordpieceTokenizer() {}; 49 | 50 | WordpieceTokenizer(map vocab, string unk_token = "[UNK]", int max_input_chars_per_word = 100) 51 | { 52 | vocab_ = vocab; 53 | unk_token_ = unk_token; 54 | max_input_chars_per_word_ = max_input_chars_per_word; 55 | } 56 | 57 | void add_vocab(map vocab); 58 | 59 | vector tokenize(string text); 60 | }; 61 | 62 | 63 | class BertTokenizer 64 | { 65 | public: 66 | map vocab; 67 | map ids_to_tokens; 68 | bool do_lower_case_; 69 | bool do_basic_tokenize_; 70 | int maxlen_; 71 | BasicTokenizer basic_tokenizer; 72 | WordpieceTokenizer wordpiece_tokenizer; 73 | 74 | BertTokenizer() {}; 75 | 76 | BertTokenizer(const char *vocab_file, bool do_lower_case = false, int max_len = 512, bool do_basic_tokenize = true, 77 | vector never_split = {"[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"}) 78 | { 79 | vocab = read_vocab(vocab_file); 80 | for (map::iterator i = vocab.begin(); i != vocab.end(); ++i) 81 | ids_to_tokens[i->second] = i->first; 82 | do_basic_tokenize_ = do_basic_tokenize; 83 | do_lower_case_ = do_lower_case; 84 | wordpiece_tokenizer.add_vocab(vocab); 85 | maxlen_ = max_len; 86 | } 87 | 88 | void add_vocab(const char *vocab_file); 89 | 90 | vector tokenize(string text); 91 | 92 | vector convert_tokens_to_ids(vector tokens); 93 | 94 | void 95 | encode(string textA, string textB, vector &input_ids, vector &input_mask, vector &segment_ids, 96 | 
int max_seq_length = 512, const char *truncation_strategy = "longest_first"); 97 | }; 98 | -------------------------------------------------------------------------------- /src/unilib/unicode.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #pragma once 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | namespace ufal { 20 | namespace unilib { 21 | 22 | class unicode { 23 | enum : uint8_t { 24 | _Lu = 1, _Ll = 2, _Lt = 3, _Lm = 4, _Lo = 5, 25 | _Mn = 6, _Mc = 7, _Me = 8, 26 | _Nd = 9, _Nl = 10, _No = 11, 27 | _Pc = 12, _Pd = 13, _Ps = 14, _Pe = 15, _Pi = 16, _Pf = 17, _Po = 18, 28 | _Sm = 19, _Sc = 20, _Sk = 21, _So = 22, 29 | _Zs = 23, _Zl = 24, _Zp = 25, 30 | _Cc = 26, _Cf = 27, _Cs = 28, _Co = 29, _Cn = 30 31 | }; 32 | 33 | public: 34 | typedef uint32_t category_t; 35 | enum : category_t { 36 | Lu = 1 << _Lu, Ll = 1 << _Ll, Lt = 1 << _Lt, Lut = Lu | Lt, LC = Lu | Ll | Lt, 37 | Lm = 1 << _Lm, Lo = 1 << _Lo, L = Lu | Ll | Lt | Lm | Lo, 38 | Mn = 1 << _Mn, Mc = 1 << _Mc, Me = 1 << _Me, M = Mn | Mc | Me, 39 | Nd = 1 << _Nd, Nl = 1 << _Nl, No = 1 << _No, N = Nd | Nl | No, 40 | Pc = 1 << _Pc, Pd = 1 << _Pd, Ps = 1 << _Ps, Pe = 1 << _Pe, Pi = 1 << _Pi, 41 | Pf = 1 << _Pf, Po = 1 << _Po, P = Pc | Pd | Ps | Pe | Pi | Pf | Po, 42 | Sm = 1 << _Sm, Sc = 1 << _Sc, Sk = 1 << _Sk, So = 1 << _So, S = Sm | Sc | Sk | So, 43 | Zs = 1 << _Zs, Zl = 1 << _Zl, Zp = 1 << _Zp, Z = Zs | Zl | Zp, 44 | Cc = 1 << _Cc, Cf = 1 << _Cf, Cs = 1 << _Cs, Co = 1 
<< _Co, Cn = 1 << _Cn, C = Cc | Cf | Cs | Co | Cn 45 | }; 46 | 47 | static inline category_t category(char32_t chr); 48 | 49 | static inline char32_t lowercase(char32_t chr); 50 | static inline char32_t uppercase(char32_t chr); 51 | static inline char32_t titlecase(char32_t chr); 52 | 53 | private: 54 | static const char32_t CHARS = 0x110000; 55 | static const int32_t DEFAULT_CAT = Cn; 56 | 57 | static const uint8_t category_index[CHARS >> 8]; 58 | static const uint8_t category_block[][256]; 59 | static const uint8_t othercase_index[CHARS >> 8]; 60 | static const char32_t othercase_block[][256]; 61 | 62 | enum othercase_type { LOWER_ONLY = 1, UPPERTITLE_ONLY = 2, LOWER_THEN_UPPER = 3, UPPER_THEN_TITLE = 4, TITLE_THEN_LOWER = 5 }; 63 | }; 64 | 65 | unicode::category_t unicode::category(char32_t chr) { 66 | return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; 67 | } 68 | 69 | char32_t unicode::lowercase(char32_t chr) { 70 | if (chr < CHARS) { 71 | char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; 72 | if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8; 73 | if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8; 74 | if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; 75 | } 76 | return chr; 77 | } 78 | 79 | char32_t unicode::uppercase(char32_t chr) { 80 | if (chr < CHARS) { 81 | char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; 82 | if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; 83 | if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8; 84 | if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; 85 | } 86 | return chr; 87 | } 88 | 89 | char32_t 
unicode::titlecase(char32_t chr) { 90 | if (chr < CHARS) { 91 | char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; 92 | if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; 93 | if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase >> 8; 94 | if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; 95 | } 96 | return chr; 97 | } 98 | 99 | } // namespace unilib 100 | } // namespace ufal 101 | -------------------------------------------------------------------------------- /src/unilib/uninorms.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #pragma once 14 | 15 | #include 16 | #include 17 | 18 | namespace ufal { 19 | namespace unilib { 20 | 21 | class uninorms { 22 | public: 23 | static void nfc(std::u32string& str); 24 | static void nfd(std::u32string& str); 25 | static void nfkc(std::u32string& str); 26 | static void nfkd(std::u32string& str); 27 | 28 | private: 29 | static void compose(std::u32string& str); 30 | static void decompose(std::u32string& str, bool kanonical); 31 | 32 | static const char32_t CHARS = 0x110000; 33 | 34 | struct Hangul { 35 | // Hangul decomposition and composition 36 | static const char32_t SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 37 | static const char32_t LCount = 19, VCount = 21, TCount = 28, NCount = VCount * TCount, SCount = LCount * NCount; 38 | }; 39 | 40 | static const uint8_t ccc_index[CHARS >> 8]; 41 | static const uint8_t ccc_block[][256]; 42 | 43 | static const uint8_t composition_index[CHARS >> 8]; 44 | static const uint16_t composition_block[][257]; 45 | static const char32_t composition_data[]; 46 | 47 | static const uint8_t decomposition_index[CHARS >> 8]; 48 | static const uint16_t decomposition_block[][257]; 49 | static const char32_t decomposition_data[]; 50 | }; 51 | 52 | } // namespace unilib 53 | } // namespace ufal 54 | -------------------------------------------------------------------------------- /src/unilib/unistrip.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #include "unistrip.h" 14 | 15 | namespace ufal { 16 | namespace unilib { 17 | 18 | const char32_t unistrip::CHARS; 19 | 20 | const uint8_t unistrip::combining_mark_index[unistrip::CHARS >> 8] = { 21 | 0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,0,15,0,0,0,16,17,18,19,20,21,22,0,0,23,0,0,0,0,0,0,0,0,0,0,0,24,25,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0,28,29,30,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,33,0,0,34,35,36,0,0,0,0,0,0,37,0,0,0,0,0,38,39,40,41,42,43,44,45,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,46,47,0,0,0,48,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50,51,0,0,0,0,0,0,0,52,0,0,0,0,0,0,0,0,0,0,0,0,0,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 22 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 23 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 24 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 25 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 26 | }; 27 | 28 | const uint8_t unistrip::combining_mark_block[][32] = { 29 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 30 | {255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 31 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 32 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,254,255,255,255,255,191,182,0,0,0,0,0,0,0}, 33 | {0,0,255,7,0,0,0,0,0,248,255,255,0,0,1,0,0,0,0,0,0,0,0,0,0,0,192,159,159,61,0,0}, 34 | {0,0,2,0,0,0,255,255,255,7,0,0,0,0,0,0,0,0,0,0,192,255,1,0,0,0,0,0,0,248,15,0}, 35 | {0,0,192,251,239,62,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248,255,255,255}, 36 | {15,0,0,0,0,0,0,220,255,255,254,0,12,0,0,0,14,0,0,0,0,0,0,208,159,57,128,0,12,0,0,0}, 37 | {14,0,0,0,0,0,0,208,135,57,2,0,0,0,35,0,14,0,0,0,0,0,0,208,191,59,0,0,12,0,0,0}, 38 | {14,0,0,0,0,0,0,208,159,57,192,0,12,0,0,0,4,0,0,0,0,0,0,192,199,61,128,0,0,0,0,0}, 39 | 
{15,0,0,0,0,0,0,192,223,61,96,0,12,0,0,0,14,0,0,0,0,0,0,208,223,61,96,0,12,0,0,0}, 40 | {14,0,0,0,0,0,0,192,223,61,128,0,12,0,0,0,12,0,0,0,0,0,0,0,0,132,95,255,0,0,12,0}, 41 | {0,0,0,0,0,0,242,7,128,127,0,0,0,0,0,0,0,0,0,0,0,0,242,27,0,63,0,0,0,0,0,0}, 42 | {0,0,0,3,0,0,160,194,0,0,0,0,0,0,254,255,223,224,255,254,255,255,255,31,64,0,0,0,0,0,0,0}, 43 | {0,0,0,0,0,248,255,127,0,0,192,195,157,63,30,0,252,191,0,60,0,0,0,0,0,0,0,0,0,0,0,0}, 44 | {0,0,0,0,0,0,0,0,0,0,0,224,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 45 | {0,0,28,0,0,0,28,0,0,0,12,0,0,0,12,0,0,0,0,0,0,0,240,255,255,255,15,32,0,0,0,0}, 46 | {0,56,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0}, 47 | {0,0,0,0,255,15,255,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 48 | {0,0,128,15,0,0,0,0,0,0,224,127,255,255,255,159,0,0,0,0,0,0,255,127,0,0,0,0,0,0,0,0}, 49 | {31,0,0,0,0,0,240,255,31,0,0,0,0,248,15,0,7,0,0,0,254,63,0,0,0,0,0,0,192,255,15,0}, 50 | {0,0,0,0,240,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,247,255,255,33,28,3}, 51 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,63,240}, 52 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,1,0}, 53 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,3,0}, 54 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255}, 55 | {0,0,0,0,0,252,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0}, 56 | {0,0,0,0,0,0,0,0,0,0,0,0,0,128,247,63,0,0,0,192,0,0,0,0,0,0,0,0,0,0,3,0}, 57 | {68,8,0,0,248,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,240,255,31,0,0,0,255,255,3,0}, 58 | {0,0,0,0,192,63,0,0,128,255,15,0,0,0,0,0,15,0,0,0,0,0,248,255,1,0,0,0,32,0,0,0}, 59 | {0,0,0,0,0,254,127,0,8,48,0,0,0,0,0,56,0,0,0,0,0,0,157,193,2,0,0,0,0,248,96,0}, 60 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248,55,0,0}, 61 | {0,0,0,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 62 | {255,255,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 63 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32}, 64 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0}, 65 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,192,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 66 | {110,240,0,0,0,0,0,135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0}, 67 | {7,0,0,0,0,0,0,255,127,0,0,0,0,0,0,128,7,0,0,0,0,0,255,7,0,0,0,0,0,0,0,0}, 68 | {7,0,0,0,128,255,31,0,0,0,0,0,0,0,8,0,7,0,0,0,0,0,248,255,1,28,0,0,0,0,0,0}, 69 | {0,0,0,0,0,240,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,255,7,0,0}, 70 | {15,0,0,0,0,0,0,208,159,57,128,0,204,31,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 71 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,15,0,0,0,0,0,0,0}, 72 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,63,255,1,0,0,48,0,0,0,0}, 73 | {0,0,0,0,0,0,255,255,1,0,0,0,0,0,0,0,0,0,0,0,0,248,255,0,0,0,0,0,0,0,0,0}, 74 | {0,0,0,224,255,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 75 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,0}, 76 | {0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 77 | {0,0,0,0,0,0,0,0,0,0,254,255,255,255,255,127,0,128,7,0,0,0,0,0,0,0,0,0,0,0,0,0}, 78 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,0,0,0,0,0,0,0,0,0}, 79 | {0,0,0,0,0,0,0,0,0,0,0,0,224,227,7,248,231,15,0,0,0,60,0,0,0,0,0,0,0,0,0,0}, 80 | {0,0,0,0,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 81 | {255,255,255,255,255,255,127,248,255,255,255,255,255,31,32,0,16,0,0,248,254,255,0,0,0,0,0,0,0,0,0,0}, 82 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0}, 83 | {255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0} 84 | }; 85 | 86 | const uint8_t unistrip::stripped_index[unistrip::CHARS >> 8] = { 87 | 
0,1,2,3,4,5,6,5,5,7,8,9,5,5,5,10,11,5,5,5,5,5,5,5,5,5,5,12,5,5,13,14,5,15,16,5,5,5,5,5,5,5,17,5,5,5,5,5,18,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,19,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,20,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,21,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, 
88 | 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
,5,5, 89 | 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
,5,5,5,5,5, 90 | 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
,5,5,5,5,5,5,5,5, 91 | 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 92 | }; 93 | 94 | const uint16_t unistrip::stripped_block[][256] = { 95 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,2,3,3,3,3,4,4,4,4,0,5,6,6,6,6,6,0,0,7,7,7,7,8,0,0,9,9,9,9,9,9,0,10,11,11,11,11,12,12,12,12,0,13,14,14,14,14,14,0,0,15,15,15,15,16,0,16}, 96 | {1,9,1,9,1,9,2,10,2,10,2,10,2,10,17,18,0,0,3,11,3,11,3,11,3,11,3,11,19,20,19,20,19,20,19,20,21,22,0,0,4,12,4,12,4,12,4,12,4,0,0,0,23,24,25,26,0,27,28,27,28,27,28,0,0,0,0,5,13,5,13,5,13,0,0,0,6,14,6,14,6,14,0,0,29,30,29,30,29,30,31,32,31,32,31,32,31,32,33,34,33,34,0,0,7,15,7,15,7,15,7,15,7,15,7,15,35,36,8,16,8,37,38,37,38,37,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,9,4,12,6,14,7,15,7,15,7,15,7,15,7,15,0,1,9,1,9,39,40,0,0,19,20,25,26,6,14,6,14,41,42,24,0,0,0,19,20,0,0,5,13,1,9,39,40,43,44}, 97 | 
{1,9,1,9,3,11,3,11,4,12,4,12,6,14,6,14,29,30,29,30,7,15,7,15,31,32,33,34,0,0,21,22,0,0,0,0,0,0,1,9,3,11,6,14,6,14,6,14,6,14,8,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 98 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,45,46,0,47,48,49,0,50,0,51,52,53,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49,51,54,55,56,53,57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,53,57,58,57,59,0,0,0,0,60,60,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 99 | {61,61,0,62,0,0,0,63,0,0,0,0,64,65,66,0,0,0,0,0,0,0,0,0,0,65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,67,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,68,0,69,0,0,0,70,0,0,0,0,71,67,72,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,73,74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,75,76,0,0,0,0,0,0,0,0,0,0,0,0,0,77,78,77,78,0,0,61,68,0,0,79,80,75,76,81,82,0,0,65,67,65,67,83,84,0,0,85,86,87,88,66,72,66,72,66,72,89,90,0,0,91,92,0,0,0,0,0,0}, 100 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 101 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,93,93,94,93,95,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96,0,97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,98,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 102 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,100,0,0,101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,102,103,104,105,106,107,108,109,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,110,111,0,112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 103 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,113,0,0,114,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,115,116,117,0,0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 104 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,119,120,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 105 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,122,0,0,0,0,0,0,0,0,0,123,0,0,0,0,124,0,0,0,0,125,0,0,0,0,126,0,0,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 106 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,128,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 107 | {0,0,0,0,0,0,129,0,130,0,131,0,132,0,133,0,0,0,134,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 108 | {1,9,135,136,135,136,135,136,2,10,17,18,17,18,17,18,17,18,17,18,3,11,3,11,3,11,3,11,3,11,137,138,19,20,21,22,21,22,21,22,21,22,21,22,4,12,4,12,25,26,25,26,25,26,27,28,27,28,27,28,27,28,139,140,139,140,139,140,5,13,5,13,5,13,5,13,6,14,6,14,6,14,6,14,141,142,141,142,29,30,29,30,29,30,29,30,31,32,31,32,31,32,31,32,31,32,33,34,33,34,33,34,33,34,7,15,7,15,7,15,7,15,7,15,143,144,143,144,35,36,35,36,35,36,35,36,35,36,145,146,145,146,8,16,37,38,37,38,37,38,22,34,36,16,0,147,0,0,0,0,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9,3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11,4,12,4,12,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14,7,15,7,15,7,15,7,15,7,15,7,15,7,15,8,16,8,16,8,16,8,16,0,0,0,0,0,0}, 109 | 
{54,54,54,54,54,54,54,54,46,46,46,46,46,46,46,46,55,55,55,55,55,55,0,0,47,47,47,47,47,47,0,0,56,56,56,56,56,56,56,56,48,48,48,48,48,48,48,48,53,53,53,53,53,53,53,53,49,49,49,49,49,49,49,49,58,58,58,58,58,58,0,0,50,50,50,50,50,50,0,0,57,57,57,57,57,57,57,57,0,51,0,51,0,51,0,51,59,59,59,59,59,59,59,59,52,52,52,52,52,52,52,52,54,54,55,55,56,56,53,53,58,58,57,57,59,59,0,0,54,54,54,54,54,54,54,54,46,46,46,46,46,46,46,46,56,56,56,56,56,56,56,56,48,48,48,48,48,48,48,48,59,59,59,59,59,59,59,59,52,52,52,52,52,52,52,52,54,54,54,54,54,0,54,54,46,46,46,46,46,0,0,0,0,45,56,56,56,0,56,56,47,47,48,48,48,148,148,148,53,53,53,53,0,0,53,53,49,49,49,49,0,149,149,149,57,57,57,57,150,150,57,57,51,51,51,51,151,45,45,0,0,0,59,59,59,0,59,59,50,50,52,52,52,0,0,0}, 110 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,152,153,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,154,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,155,156,157,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 111 | {0,0,0,0,158,0,0,0,0,159,0,0,160,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,161,0,162,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,163,0,0,164,0,0,165,0,166,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,167,0,168,0,0,0,0,0,0,0,0,0,0,169,170,171,172,173,0,0,174,175,0,0,176,177,0,0,0,0,0,0,178,179,0,0,180,181,0,0,182,183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,184,185,186,187,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,188,189,190,191,0,0,0,0,0,0,192,193,194,195,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 112 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,196,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 113 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,197,0,198,0,199,0,200,0,201,0,202,0,203,0,204,0,205,0,206,0,207,0,208,0,0,209,0,210,0,211,0,0,0,0,0,0,212,212,0,213,213,0,214,214,0,215,215,0,216,216,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,217,0,0,0,0,0,0,0,0,0,218,0,0,0,0,0,0,0,0,0,0,0,0,0,219,0,220,0,221,0,222,0,223,0,224,0,225,0,226,0,227,0,228,0,229,0,230,0,0,231,0,232,0,233,0,0,0,0,0,0,234,234,0,235,235,0,236,236,0,237,237,0,238,238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,239,0,0,240,241,242,243,0,0,0,244,0}, 114 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,245,0,246,0,0,0,0,0,0,0,0,0,0,247,247,247,247,248,248,248,249,250,251,252,253,254,0,255,245,256,257,258,0,259,0,260,261,0,262,263,0,264,265,266,247,267,253,249,257,263,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 115 | 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,268,0,269,0,0,0,0,0,0,0,0,0,0,0,0,0,0,270,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, 116 | {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,271,272,272,272,272,272,272,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,273,274,273,274,273,274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} 117 | }; 118 | 119 | const char32_t unistrip::stripped_data[] = { 120 | 
0,65,67,69,73,78,79,85,89,97,99,101,105,110,111,117,121,68,100,71,103,72,104,74,106,75,107,76,108,82,114,83,115,84,116,87,119,90,122,198,230,439,658,216,248,168,913,917,919,921,927,933,937,953,945,949,951,965,959,969,978,1045,1043,1030,1050,1048,1059,1080,1077,1075,1110,1082,1091,1140,1141,1046,1078,1040,1072,1240,1241,1047,1079,1054,1086,1256,1257,1069,1101,1063,1095,1067,1099,1575,1608,1610,1749,1729,1746,2344,2352,2355,2325,2326,2327,2332,2337,2338,2347,2351,2465,2466,2479,2610,2616,2582,2583,2588,2603,2849,2850,2962,3906,3916,3921,3926,3931,3904,4133,6917,6919,6921,6923,6925,6929,66,98,70,102,77,109,80,112,86,118,88,120,383,8127,8190,961,929,8592,8594,8596,8656,8660,8658,8707,8712,8715,8739,8741,8764,8771,8773,8776,61,8801,8781,60,62,8804,8805,8818,8819,8822,8823,8826,8827,8834,8835,8838,8839,8866,8872,8873,8875,8828,8829,8849,8850,8882,8883,8884,8885,10973,12363,12365,12367,12369,12371,12373,12375,12377,12379,12381,12383,12385,12388,12390,12392,12399,12402,12405,12408,12411,12358,12445,12459,12461,12463,12465,12467,12469,12471,12473,12475,12477,12479,12481,12484,12486,12488,12495,12498,12501,12504,12507,12454,12527,12528,12529,12530,12541,1497,1522,1513,1488,1489,1490,1491,1492,1493,1494,1496,1498,1499,1500,1502,1504,1505,1507,1508,1510,1511,1512,1514,69785,69787,69797,119127,119128,119225,119226 121 | }; 122 | 123 | } // namespace unilib 124 | } // namespace ufal 125 | -------------------------------------------------------------------------------- /src/unilib/unistrip.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. 
// If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// UniLib version: 3.1.2-devel
// Unicode version: 8.0.0

#pragma once

#include <cstdint>

namespace ufal {
namespace unilib {

// Table-driven lookups keyed by Unicode code point. The tables themselves are
// only declared here; their definitions live in the matching source file.
class unistrip {
 public:
  // Returns true when chr is below CHARS and its bit is set in the
  // combining-mark bitmap; any value >= CHARS yields false.
  static inline bool is_combining_mark(char32_t chr);

  // Returns the stripped_data entry recorded for chr, or chr itself when
  // chr >= CHARS or the table holds no entry (index 0) for it.
  static inline char32_t strip_combining_marks(char32_t chr);

 private:
  // One past the largest code point covered by the tables.
  static const char32_t CHARS = 0x110000;

  // Two-level bitmap: the index selects a 32-byte (256-bit) block per group of
  // 256 code points; each bit of the block corresponds to one code point.
  static const uint8_t combining_mark_index[CHARS >> 8];
  static const uint8_t combining_mark_block[][32];

  // Two-level map: the index selects a 256-entry block per group of 256 code
  // points; each entry is an offset into stripped_data, where 0 means
  // "no replacement".
  static const uint8_t stripped_index[CHARS >> 8];
  static const uint16_t stripped_block[][256];
  static const char32_t stripped_data[];
};

bool unistrip::is_combining_mark(char32_t chr) {
  if (chr >= CHARS) return false;

  // chr >> 8 picks the block, bits 3..7 pick the byte, bits 0..2 pick the bit.
  const uint8_t bits = combining_mark_block[combining_mark_index[chr >> 8]][(chr >> 3) & 0x1F];
  return (bits & (uint8_t(1) << (chr & 0x07))) != 0;
}

char32_t unistrip::strip_combining_marks(char32_t chr) {
  if (chr >= CHARS) return chr;

  const uint16_t index = stripped_block[stripped_index[chr >> 8]][chr & 0xFF];
  return index ? stripped_data[index] : chr;
}

} // namespace unilib
} // namespace ufal

// --------------------------------------------------------------------------------
// /src/unilib/utf16.cpp:
// --------------------------------------------------------------------------------
// This file is part of UniLib .
//
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
// Mathematics and Physics, Charles University in Prague, Czech Republic.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #include "utf16.h" 14 | 15 | namespace ufal { 16 | namespace unilib { 17 | 18 | bool utf16::valid(const char16_t* str) { 19 | for (; *str; str++) 20 | if (*str >= 0xD800 && *str < 0xDC00) { 21 | str++; if (*str < 0xDC00 || *str >= 0xE000) return false; 22 | } else if (*str >= 0xDC00 && *str < 0xE000) return false; 23 | 24 | return true; 25 | } 26 | 27 | bool utf16::valid(const char16_t* str, size_t len) { 28 | for (; len; str++, len--) 29 | if (*str >= 0xD800 && *str < 0xDC00) { 30 | str++; if (!--len || *str < 0xDC00 || *str >= 0xE000) return false; 31 | } else if (*str >= 0xDC00 && *str < 0xE000) return false; 32 | 33 | return true; 34 | } 35 | 36 | void utf16::decode(const char16_t* str, std::u32string& decoded) { 37 | decoded.clear(); 38 | 39 | for (char32_t chr; (chr = decode(str)); ) 40 | decoded.push_back(chr); 41 | } 42 | 43 | void utf16::decode(const char16_t* str, size_t len, std::u32string& decoded) { 44 | decoded.clear(); 45 | 46 | while (len) 47 | decoded.push_back(decode(str, len)); 48 | } 49 | 50 | void utf16::encode(const std::u32string& str, std::u16string& encoded) { 51 | encoded.clear(); 52 | 53 | for (auto&& chr : str) 54 | append(encoded, chr); 55 | } 56 | 57 | const char16_t utf16::REPLACEMENT_CHAR; 58 | 59 | } // namespace unilib 60 | } // namespace ufal 61 | -------------------------------------------------------------------------------- /src/unilib/utf16.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
//
// UniLib version: 3.1.2-devel
// Unicode version: 8.0.0

#pragma once

#include <cstddef>
#include <cstdint>
#include <iterator>
#include <string>

namespace ufal {
namespace unilib {

// UTF-16 validation, decoding and encoding helpers. Malformed input is never
// reported as an error by the decoders; it is decoded as REPLACEMENT_CHAR.
class utf16 {
 public:
  // Validation. The std::u16string overload forwards str.c_str(), so only the
  // part before the first NUL unit is examined.
  static bool valid(const char16_t* str);
  static bool valid(const char16_t* str, size_t len);
  static inline bool valid(const std::u16string& str);

  // Decode one code point and advance str (and decrease len) past the units
  // that were consumed. The NUL-terminated form returns 0 at the terminator;
  // the length-based form returns 0 when len is 0.
  static inline char32_t decode(const char16_t*& str);
  static inline char32_t decode(const char16_t*& str, size_t& len);

  // Decode the first code point without modifying the caller's pointer.
  static inline char32_t first(const char16_t* str);
  static inline char32_t first(const char16_t* str, size_t len);
  static inline char32_t first(const std::u16string& str);

  // Decode a whole string; decoded is cleared first.
  static void decode(const char16_t* str, std::u32string& decoded);
  static void decode(const char16_t* str, size_t len, std::u32string& decoded);
  static inline void decode(const std::u16string& str, std::u32string& decoded);

  // Range over the code points of a NUL-terminated string (for range-for).
  class string_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline string_decoder(const char16_t* str);
    const char16_t* str;
    friend class utf16;
  };
  static inline string_decoder decoder(const char16_t* str);
  static inline string_decoder decoder(const std::u16string& str);

  // Range over the code points of a buffer of len code units.
  class buffer_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline buffer_decoder(const char16_t* str, size_t len);
    const char16_t* str;
    size_t len;
    friend class utf16;
  };
  static inline buffer_decoder decoder(const char16_t* str, size_t len);

  // Append the UTF-16 encoding of chr. The raw-pointer form writes one or two
  // units and advances str; the caller must provide the space.
  static inline void append(char16_t*& str, char32_t chr);
  static inline void append(std::u16string& str, char32_t chr);
  static void encode(const std::u32string& str, std::u16string& encoded);

  // Decode str, apply f to every code point and store the re-encoded result
  // in result (which is cleared first).
  template <class F> static void map(F f, const char16_t* str, std::u16string& result);
  template <class F> static void map(F f, const char16_t* str, size_t len, std::u16string& result);
  template <class F> static void map(F f, const std::u16string& str, std::u16string& result);

 private:
  static const char16_t REPLACEMENT_CHAR = '?';
};

bool utf16::valid(const std::u16string& str) {
  return valid(str.c_str());
}

char32_t utf16::decode(const char16_t*& str) {
  // Anything outside the surrogate range is a complete code point.
  if (*str < 0xD800 || *str >= 0xE000) return *str++;

  // A low surrogate with no preceding high surrogate: skip it.
  if (*str >= 0xDC00) {
    ++str;
    return REPLACEMENT_CHAR;
  }

  char32_t res = 0x10000 + ((*str++ - 0xD800) << 10);

  // High surrogate not followed by a low one: the offending unit is left in
  // place so that the next call decodes it on its own.
  if (*str < 0xDC00 || *str >= 0xE000) return REPLACEMENT_CHAR;
  return res + (*str++ - 0xDC00);
}

char32_t utf16::decode(const char16_t*& str, size_t& len) {
  if (!len) return 0;
  --len;
  if (*str < 0xD800 || *str >= 0xE000) return *str++;

  // Lone low surrogate, or a high surrogate that is the last unit of the buffer.
  if (!len || *str >= 0xDC00) {
    ++str;
    return REPLACEMENT_CHAR;
  }

  char32_t res = 0x10000 + ((*str++ - 0xD800) << 10);
  if (*str < 0xDC00 || *str >= 0xE000) return REPLACEMENT_CHAR;
  --len;
  return res + (*str++ - 0xDC00);
}

char32_t utf16::first(const char16_t* str) {
  return decode(str);
}

char32_t utf16::first(const char16_t* str, size_t len) {
  return decode(str, len);
}

char32_t utf16::first(const std::u16string& str) {
  return first(str.c_str());
}

void utf16::decode(const std::u16string& str, std::u32string& decoded) {
  decode(str.c_str(), decoded);
}

// Input iterator over a NUL-terminated string. The end state is represented by
// a null `next` pointer, which is also what equality compares.
class utf16::string_decoder::iterator {
 public:
  // Same member types the former std::iterator<std::input_iterator_tag,
  // char32_t> base supplied; that base is deprecated since C++17.
  typedef std::input_iterator_tag iterator_category;
  typedef char32_t value_type;
  typedef std::ptrdiff_t difference_type;
  typedef char32_t* pointer;
  typedef char32_t& reference;

  iterator(const char16_t* str) : codepoint(0), next(str) { operator++(); }
  iterator(const iterator& it) : codepoint(it.codepoint), next(it.next) {}
  iterator& operator++() {
    if (next) {
      codepoint = decode(next);
      if (!codepoint) next = nullptr;  // reached the terminator
    }
    return *this;
  }
  iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
  bool operator==(const iterator& other) const { return next == other.next; }
  bool operator!=(const iterator& other) const { return next != other.next; }
  const char32_t& operator*() const { return codepoint; }
 private:
  char32_t codepoint;
  const char16_t* next;
};

utf16::string_decoder::string_decoder(const char16_t* str) : str(str) {}

utf16::string_decoder::iterator utf16::string_decoder::begin() {
  return iterator(str);
}

utf16::string_decoder::iterator utf16::string_decoder::end() {
  return iterator(nullptr);
}

utf16::string_decoder utf16::decoder(const char16_t* str) {
  return string_decoder(str);
}

utf16::string_decoder utf16::decoder(const std::u16string& str) {
  return string_decoder(str.c_str());
}

// Input iterator over a length-delimited buffer. As above, a null `next`
// pointer marks the end; it is set once the remaining length reaches zero.
class utf16::buffer_decoder::iterator {
 public:
  typedef std::input_iterator_tag iterator_category;
  typedef char32_t value_type;
  typedef std::ptrdiff_t difference_type;
  typedef char32_t* pointer;
  typedef char32_t& reference;

  iterator(const char16_t* str, size_t len) : codepoint(0), next(str), len(len) { operator++(); }
  iterator(const iterator& it) : codepoint(it.codepoint), next(it.next), len(it.len) {}
  iterator& operator++() {
    if (!len) next = nullptr;
    if (next) codepoint = decode(next, len);
    return *this;
  }
  iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
  bool operator==(const iterator& other) const { return next == other.next; }
  bool operator!=(const iterator& other) const { return next != other.next; }
  const char32_t& operator*() const { return codepoint; }
 private:
  char32_t codepoint;
  const char16_t* next;
  size_t len;
};

utf16::buffer_decoder::buffer_decoder(const char16_t* str, size_t len) : str(str), len(len) {}

utf16::buffer_decoder::iterator utf16::buffer_decoder::begin() {
  return iterator(str, len);
}

utf16::buffer_decoder::iterator utf16::buffer_decoder::end() {
  return iterator(nullptr, 0);
}

utf16::buffer_decoder utf16::decoder(const char16_t* str, size_t len) {
  return buffer_decoder(str, len);
}

void utf16::append(char16_t*& str, char32_t chr) {
  if (chr <= 0xFFFF) {
    *str++ = static_cast<char16_t>(chr);
  } else if (chr <= 0x10FFFF) {
    // Split into a high/low surrogate pair.
    *str++ = static_cast<char16_t>(0xD800 + ((chr - 0x10000) >> 10));
    *str++ = static_cast<char16_t>(0xDC00 + ((chr - 0x10000) & 0x3FF));
  } else {
    *str++ = REPLACEMENT_CHAR;
  }
}

void utf16::append(std::u16string& str, char32_t chr) {
  if (chr <= 0xFFFF) {
    str += static_cast<char16_t>(chr);
  } else if (chr <= 0x10FFFF) {
    str += static_cast<char16_t>(0xD800 + ((chr - 0x10000) >> 10));
    str += static_cast<char16_t>(0xDC00 + ((chr - 0x10000) & 0x3FF));
  } else {
    str += REPLACEMENT_CHAR;
  }
}

template <class F> void utf16::map(F f, const char16_t* str, std::u16string& result) {
  result.clear();

  for (char32_t chr; (chr = decode(str)); )
    append(result, f(chr));
}

template <class F> void utf16::map(F f, const char16_t* str, size_t len, std::u16string& result) {
  result.clear();

  while (len)
    append(result, f(decode(str, len)));
}

template <class F> void utf16::map(F f, const std::u16string& str, std::u16string& result) {
  map(f, str.c_str(), result);
}

} // namespace unilib
} // namespace ufal

// --------------------------------------------------------------------------------
// /src/unilib/utf8.cpp:
// --------------------------------------------------------------------------------
// This file is part of UniLib .
//
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of
// Mathematics and Physics, Charles University in Prague, Czech Republic.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #include "utf8.h" 14 | 15 | namespace ufal { 16 | namespace unilib { 17 | 18 | bool utf8::valid(const char* str) { 19 | for (; *str; str++) 20 | if (((unsigned char)*str) >= 0x80) { 21 | if (((unsigned char)*str) < 0xC0) return false; 22 | else if (((unsigned char)*str) < 0xE0) { 23 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 24 | } else if (((unsigned char)*str) < 0xF0) { 25 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 26 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 27 | } else if (((unsigned char)*str) < 0xF8) { 28 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 29 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 30 | str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 31 | } else return false; 32 | } 33 | return true; 34 | } 35 | 36 | bool utf8::valid(const char* str, size_t len) { 37 | for (; len > 0; str++, len--) 38 | if (((unsigned char)*str) >= 0x80) { 39 | if (((unsigned char)*str) < 0xC0) return false; 40 | else if (((unsigned char)*str) < 0xE0) { 41 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 42 | } else if (((unsigned char)*str) < 0xF0) { 43 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 44 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 45 | } else if (((unsigned char)*str) < 0xF8) { 46 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 47 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; 48 | str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned 
char)*str) >= 0xC0) return false; 49 | } else return false; 50 | } 51 | return true; 52 | } 53 | 54 | void utf8::decode(const char* str, std::u32string& decoded) { 55 | decoded.clear(); 56 | 57 | for (char32_t chr; (chr = decode(str)); ) 58 | decoded.push_back(chr); 59 | } 60 | 61 | void utf8::decode(const char* str, size_t len, std::u32string& decoded) { 62 | decoded.clear(); 63 | 64 | while (len) 65 | decoded.push_back(decode(str, len)); 66 | } 67 | 68 | void utf8::encode(const std::u32string& str, std::string& encoded) { 69 | encoded.clear(); 70 | 71 | for (auto&& chr : str) 72 | append(encoded, chr); 73 | } 74 | 75 | const char utf8::REPLACEMENT_CHAR; 76 | 77 | } // namespace unilib 78 | } // namespace ufal 79 | -------------------------------------------------------------------------------- /src/unilib/utf8.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
//
// UniLib version: 3.1.2-devel
// Unicode version: 8.0.0

#pragma once

#include <cstddef>
#include <cstdint>
#include <iterator>
#include <string>

namespace ufal {
namespace unilib {

// UTF-8 validation, decoding and encoding helpers. Malformed input is never
// reported as an error by the decoders; it is decoded as REPLACEMENT_CHAR.
class utf8 {
 public:
  // Validation. The std::string overload forwards str.c_str(), so only the
  // part before the first NUL byte is examined.
  static bool valid(const char* str);
  static bool valid(const char* str, size_t len);
  static inline bool valid(const std::string& str);

  // Decode one code point and advance str (and decrease len) past the bytes
  // that were consumed. The NUL-terminated form returns 0 at the terminator;
  // the length-based form returns 0 when len is 0.
  static inline char32_t decode(const char*& str);
  static inline char32_t decode(const char*& str, size_t& len);

  // Decode the first code point without modifying the caller's pointer.
  static inline char32_t first(const char* str);
  static inline char32_t first(const char* str, size_t len);
  static inline char32_t first(const std::string& str);

  // Decode a whole string; decoded is cleared first.
  static void decode(const char* str, std::u32string& decoded);
  static void decode(const char* str, size_t len, std::u32string& decoded);
  static inline void decode(const std::string& str, std::u32string& decoded);

  // Range over the code points of a NUL-terminated string (for range-for).
  class string_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline string_decoder(const char* str);
    const char* str;
    friend class utf8;
  };
  static inline string_decoder decoder(const char* str);
  static inline string_decoder decoder(const std::string& str);

  // Range over the code points of a buffer of len bytes.
  class buffer_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline buffer_decoder(const char* str, size_t len);
    const char* str;
    size_t len;
    friend class utf8;
  };
  static inline buffer_decoder decoder(const char* str, size_t len);

  // Append the UTF-8 encoding of chr. The raw-pointer form writes one to four
  // bytes and advances str; the caller must provide the space.
  static inline void append(char*& str, char32_t chr);
  static inline void append(std::string& str, char32_t chr);
  static void encode(const std::u32string& str, std::string& encoded);

  // Decode str, apply f to every code point and store the re-encoded result
  // in result (which is cleared first).
  template <class F> static void map(F f, const char* str, std::string& result);
  template <class F> static void map(F f, const char* str, size_t len, std::string& result);
  template <class F> static void map(F f, const std::string& str, std::string& result);

 private:
  static const char REPLACEMENT_CHAR = '?';
};

bool utf8::valid(const std::string& str) {
  return valid(str.c_str());
}

char32_t utf8::decode(const char*& str) {
  const unsigned char lead = static_cast<unsigned char>(*str);

  // Single-byte (ASCII) code point, including the terminating NUL.
  if (lead < 0x80) {
    ++str;
    return lead;
  }

  // A stray continuation byte, or a lead byte announcing more than 4 bytes.
  if (lead < 0xC0 || lead >= 0xF8) {
    ++str;
    return REPLACEMENT_CHAR;
  }

  // Sequence length and the payload bits carried by the lead byte.
  int trailing;
  char32_t res;
  if (lead < 0xE0) { trailing = 1; res = lead & 0x1F; }
  else if (lead < 0xF0) { trailing = 2; res = lead & 0x0F; }
  else { trailing = 3; res = lead & 0x07; }
  ++str;

  for (; trailing > 0; --trailing) {
    const unsigned char next = static_cast<unsigned char>(*str);
    // The offending byte is left in place so that the next call decodes it on
    // its own; this also keeps the terminating NUL from being skipped.
    if (next < 0x80 || next >= 0xC0) return REPLACEMENT_CHAR;
    res = (res << 6) | (next & 0x3F);
    ++str;
  }
  return res;
}

char32_t utf8::decode(const char*& str, size_t& len) {
  if (!len) return 0;
  --len;

  const unsigned char lead = static_cast<unsigned char>(*str);
  if (lead < 0x80) {
    ++str;
    return lead;
  }
  if (lead < 0xC0 || lead >= 0xF8) {
    ++str;
    return REPLACEMENT_CHAR;
  }

  int trailing;
  char32_t res;
  if (lead < 0xE0) { trailing = 1; res = lead & 0x1F; }
  else if (lead < 0xF0) { trailing = 2; res = lead & 0x0F; }
  else { trailing = 3; res = lead & 0x07; }
  ++str;

  for (; trailing > 0; --trailing) {
    // The sequence is cut off by the end of the buffer: do not read further.
    if (!len) return REPLACEMENT_CHAR;
    const unsigned char next = static_cast<unsigned char>(*str);
    if (next < 0x80 || next >= 0xC0) return REPLACEMENT_CHAR;
    res = (res << 6) | (next & 0x3F);
    ++str;
    --len;
  }
  return res;
}

char32_t utf8::first(const char* str) {
  return decode(str);
}

char32_t utf8::first(const char* str, size_t len) {
  return decode(str, len);
}

char32_t utf8::first(const std::string& str) {
  return first(str.c_str());
}

void utf8::decode(const std::string& str, std::u32string& decoded) {
  decode(str.c_str(), decoded);
}

// Input iterator over a NUL-terminated string. The end state is represented by
// a null `next` pointer, which is also what equality compares.
class utf8::string_decoder::iterator {
 public:
  // Same member types the former std::iterator<std::input_iterator_tag,
  // char32_t> base supplied; that base is deprecated since C++17.
  typedef std::input_iterator_tag iterator_category;
  typedef char32_t value_type;
  typedef std::ptrdiff_t difference_type;
  typedef char32_t* pointer;
  typedef char32_t& reference;

  iterator(const char* str) : codepoint(0), next(str) { operator++(); }
  iterator(const iterator& it) : codepoint(it.codepoint), next(it.next) {}
  iterator& operator++() {
    if (next) {
      codepoint = decode(next);
      if (!codepoint) next = nullptr;  // reached the terminator
    }
    return *this;
  }
  iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
  bool operator==(const iterator& other) const { return next == other.next; }
  bool operator!=(const iterator& other) const { return next != other.next; }
  const char32_t& operator*() const { return codepoint; }
 private:
  char32_t codepoint;
  const char* next;
};

utf8::string_decoder::string_decoder(const char* str) : str(str) {}

utf8::string_decoder::iterator utf8::string_decoder::begin() {
  return iterator(str);
}

utf8::string_decoder::iterator utf8::string_decoder::end() {
  return iterator(nullptr);
}

utf8::string_decoder utf8::decoder(const char* str) {
  return string_decoder(str);
}

utf8::string_decoder utf8::decoder(const std::string& str) {
  return string_decoder(str.c_str());
}

// Input iterator over a length-delimited buffer. As above, a null `next`
// pointer marks the end; it is set once the remaining length reaches zero.
class utf8::buffer_decoder::iterator {
 public:
  typedef std::input_iterator_tag iterator_category;
  typedef char32_t value_type;
  typedef std::ptrdiff_t difference_type;
  typedef char32_t* pointer;
  typedef char32_t& reference;

  iterator(const char* str, size_t len) : codepoint(0), next(str), len(len) { operator++(); }
  iterator(const iterator& it) : codepoint(it.codepoint), next(it.next), len(it.len) {}
  iterator& operator++() {
    if (!len) next = nullptr;
    if (next) codepoint = decode(next, len);
    return *this;
  }
  iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; }
  bool operator==(const iterator& other) const { return next == other.next; }
  bool operator!=(const iterator& other) const { return next != other.next; }
  const char32_t& operator*() const { return codepoint; }
 private:
  char32_t codepoint;
  const char* next;
  size_t len;
};

utf8::buffer_decoder::buffer_decoder(const char* str, size_t len) : str(str), len(len) {}

utf8::buffer_decoder::iterator utf8::buffer_decoder::begin() {
  return iterator(str, len);
}

utf8::buffer_decoder::iterator utf8::buffer_decoder::end() {
  return iterator(nullptr, 0);
}

utf8::buffer_decoder utf8::decoder(const char* str, size_t len) {
  return buffer_decoder(str, len);
}

void utf8::append(char*& str, char32_t chr) {
  if (chr < 0x80) {
    *str++ = static_cast<char>(chr);
  } else if (chr < 0x800) {
    *str++ = static_cast<char>(0xC0 + (chr >> 6));
    *str++ = static_cast<char>(0x80 + (chr & 0x3F));
  } else if (chr < 0x10000) {
    *str++ = static_cast<char>(0xE0 + (chr >> 12));
    *str++ = static_cast<char>(0x80 + ((chr >> 6) & 0x3F));
    *str++ = static_cast<char>(0x80 + (chr & 0x3F));
  } else if (chr < 0x200000) {
    *str++ = static_cast<char>(0xF0 + (chr >> 18));
    *str++ = static_cast<char>(0x80 + ((chr >> 12) & 0x3F));
    *str++ = static_cast<char>(0x80 + ((chr >> 6) & 0x3F));
    *str++ = static_cast<char>(0x80 + (chr & 0x3F));
  } else {
    *str++ = REPLACEMENT_CHAR;
  }
}

void utf8::append(std::string& str, char32_t chr) {
  if (chr < 0x80) {
    str += static_cast<char>(chr);
  } else if (chr < 0x800) {
    str += static_cast<char>(0xC0 + (chr >> 6));
    str += static_cast<char>(0x80 + (chr & 0x3F));
  } else if (chr < 0x10000) {
    str += static_cast<char>(0xE0 + (chr >> 12));
    str += static_cast<char>(0x80 + ((chr >> 6) & 0x3F));
    str += static_cast<char>(0x80 + (chr & 0x3F));
  } else if (chr < 0x200000) {
    str += static_cast<char>(0xF0 + (chr >> 18));
    str += static_cast<char>(0x80 + ((chr >> 12) & 0x3F));
    str += static_cast<char>(0x80 + ((chr >> 6) & 0x3F));
    str += static_cast<char>(0x80 + (chr & 0x3F));
  } else {
    str += REPLACEMENT_CHAR;
  }
}

template <class F> void utf8::map(F f, const char* str, std::string& result) {
  result.clear();

  for (char32_t chr; (chr = decode(str)); )
    append(result, f(chr));
}

template <class F> void utf8::map(F f, const char* str, size_t len, std::string& result) {
  result.clear();

  while (len)
    append(result, f(decode(str, len)));
}

template <class F> void utf8::map(F f, const std::string& str, std::string& result) {
  map(f, str.c_str(), result);
}

} // namespace unilib
} // namespace ufal

-------------------------------------------------------------------------------- /src/unilib/version.cpp: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #include "version.h" 14 | 15 | namespace ufal { 16 | namespace unilib { 17 | 18 | // Returns current version. 19 | version version::current() { 20 | return {3, 1, 2, "devel"}; 21 | } 22 | 23 | } // namespace unilib 24 | } // namespace ufal 25 | -------------------------------------------------------------------------------- /src/unilib/version.h: -------------------------------------------------------------------------------- 1 | // This file is part of UniLib . 2 | // 3 | // Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of 4 | // Mathematics and Physics, Charles University in Prague, Czech Republic. 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public 7 | // License, v. 2.0. If a copy of the MPL was not distributed with this 8 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 | // 10 | // UniLib version: 3.1.2-devel 11 | // Unicode version: 8.0.0 12 | 13 | #pragma once 14 | 15 | #include 16 | 17 | namespace ufal { 18 | namespace unilib { 19 | 20 | struct version { 21 | unsigned major; 22 | unsigned minor; 23 | unsigned patch; 24 | std::string prerelease; 25 | 26 | // Returns current version. 
27 | static version current(); 28 | }; 29 | 30 | } // namespace unilib 31 | } // namespace ufal 32 | -------------------------------------------------------------------------------- /test/main.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Eric on 2021/1/5. 3 | // 4 | 5 | #include // One-stop header. 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "tokenizer.h" 11 | 12 | using namespace std; 13 | 14 | BertTokenizer tokenizer; 15 | BasicTokenizer basictokenizer; 16 | 17 | int max_seq_length = 512; 18 | 19 | string join(const char *a, const char *b) 20 | { 21 | string message; 22 | message.reserve(strlen(a) + 1 + strlen(b)); 23 | message = a; 24 | message += "/"; 25 | message += b; 26 | return message; 27 | } 28 | 29 | template 30 | void printVector(vector &v) 31 | { 32 | 33 | for (typename vector::iterator it = v.begin(); it != v.end(); it++) 34 | { 35 | cout << *it << " "; 36 | } 37 | cout << endl; 38 | } 39 | 40 | 41 | int main(int argc, const char *argv[]) 42 | { 43 | torch::jit::script::Module bert; 44 | const char *model = argv[1]; 45 | bert = torch::jit::load(join(model, "traced_bert.pt")); 46 | bert.eval(); 47 | string textA; 48 | string textB; 49 | tokenizer.add_vocab(join(model, "vocab.txt").c_str()); 50 | while (true) 51 | { 52 | cout << "\n" << "Input A -> "; 53 | getline(cin, textA); 54 | cout << "\n" << "Input B -> "; 55 | getline(cin, textB); 56 | vector input_ids; 57 | vector input_mask; 58 | vector segment_ids; 59 | vector input_ids2; 60 | vector input_mask2; 61 | vector segment_ids2; 62 | vector> input_ids_list; 63 | vector> input_mask_list; 64 | vector> segment_ids_list; 65 | const char *truncation_strategy = "only_first"; 66 | tokenizer.encode(textA, textB, input_ids, input_mask, segment_ids, max_seq_length, truncation_strategy); 67 | printVector(input_ids); 68 | printVector(input_mask); 69 | printVector(segment_ids); 70 | std::vector inputs; 71 | 
inputs.push_back(torch::from_blob(input_ids.data(), {1, max_seq_length}).to(torch::kLong)); 72 | inputs.push_back(torch::from_blob(input_mask.data(), {1, max_seq_length}).to(torch::kLong)); 73 | inputs.push_back(torch::from_blob(segment_ids.data(), {1, max_seq_length}).to(torch::kLong)); 74 | 75 | at::Tensor output = bert.forward(inputs).toTensor(); 76 | std::cout << output << '\n'; 77 | } 78 | 79 | } 80 | --------------------------------------------------------------------------------