├── .bazelrc ├── .gitignore ├── BUILD ├── WORKSPACE ├── bazel ├── BUILD ├── foreign_cc.patch ├── icu │ ├── BUILD │ └── icu.patch └── pybind11 │ └── BUILD ├── binding.cpp ├── clip_tokenizer.cpp ├── clip_tokenizer.h ├── test.cpp ├── unit_test.py └── vocab.txt /.bazelrc: -------------------------------------------------------------------------------- 1 | build --action_env=BAZEL_CXXOPTS="-std=c++17" 2 | build --cxxopt="-std=c++17" 3 | build --copt="-fPIC" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bazel-*/** 2 | /bazel-* 3 | /.vscode/** -------------------------------------------------------------------------------- /BUILD: -------------------------------------------------------------------------------- 1 | cc_library( 2 | name="clip", 3 | srcs=["clip_tokenizer.cpp"], 4 | hdrs=["clip_tokenizer.h"], 5 | deps=["@icu"], 6 | ) 7 | 8 | 9 | cc_binary( 10 | name="test", 11 | srcs=["test.cpp"], 12 | deps=[ 13 | ":clip" 14 | ], 15 | ) 16 | 17 | load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") 18 | 19 | pybind_extension( 20 | name = "clip_tokenizer_py", 21 | srcs = ["binding.cpp"], 22 | deps = [ 23 | ":clip" 24 | ], 25 | ) 26 | 27 | py_library( 28 | name = "clip_tokenizer_py", 29 | srcs = ["unit_test.py"], 30 | data = [ 31 | ":clip_tokenizer_py.so" 32 | ], 33 | ) 34 | 35 | py_binary( 36 | name = "unit_test", 37 | srcs = ["unit_test.py"], 38 | deps = [ 39 | ":clip_tokenizer_py" 40 | ], 41 | data = [ 42 | ":vocab.txt" 43 | ], 44 | ) -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- 1 | load("@bazel_tools//tools/build_defs/repo:git.bzl", "new_git_repository") 2 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") 3 | 4 | http_archive( 5 | name = "rules_foreign_cc", 6 | patches = ["//bazel:foreign_cc.patch"], 7 | sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51", 8 | strip_prefix = "rules_foreign_cc-0.9.0", 9 | url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz", 10 | ) 11 | 12 | load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies") 13 | 14 | rules_foreign_cc_dependencies() 15 | 16 | new_git_repository( 17 | name = "icu", 18 | build_file = "//bazel/icu:BUILD", 19 | remote = "https://github.com/unicode-org/icu.git", 20 | patches = ["//bazel/icu:icu.patch"], 21 | tag = "release-71-1", 22 | ) 23 | 24 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") 25 | 26 | new_git_repository( 27 | name = "pybind11_bazel", 28 | remote = "https://github.com/pybind/pybind11_bazel.git", 29 | tag = "v2.11.1" 30 | ) 31 | 32 | new_git_repository( 33 | name = "pybind11", 34 | build_file = "@pybind11_bazel//:pybind11.BUILD", 35 | remote = "https://github.com/pybind/pybind11.git", 36 | tag = "v2.11.1" 37 | ) 38 | 39 | load("@pybind11_bazel//:python_configure.bzl", "python_configure") 40 | python_configure(name = "local_config_python") 41 | -------------------------------------------------------------------------------- /bazel/BUILD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozanarmagan/clip_tokenizer_cpp/0ca1e2e2e7418108725eaa7fb93e029516ae63fa/bazel/BUILD -------------------------------------------------------------------------------- /bazel/foreign_cc.patch: -------------------------------------------------------------------------------- 1 | --- foreign_cc/private/configure_script.bzl 2 | +++ foreign_cc/private/configure_script.bzl 3 | @@ -70,7 +70,7 @@ 4 | ).lstrip()) 5 | 6 | script.append("##mkdirs## $$BUILD_TMPDIR$$/$$INSTALL_PREFIX$$") 7 | - script.append("{env_vars} {prefix}\"{configure}\" --prefix=$$BUILD_TMPDIR$$/$$INSTALL_PREFIX$$ {user_options}".format( 8 | + script.append("{env_vars} {prefix}\"{configure}\" {user_options} --prefix=$$BUILD_TMPDIR$$/$$INSTALL_PREFIX$$".format( 9 | env_vars = get_make_env_vars(workspace_name, tools, flags, env_vars, deps, inputs), 10 | prefix = configure_prefix, 11 | configure = configure_path, -------------------------------------------------------------------------------- /bazel/icu/BUILD: -------------------------------------------------------------------------------- 1 | load("@rules_foreign_cc//foreign_cc:defs.bzl", "configure_make") 2 | 3 | package(default_visibility = ["//visibility:public"]) 4 | 5 | filegroup( 6 | name = "all_srcs", 7 | srcs = glob(["**"]), 8 | visibility = ["//visibility:public"], 9 | ) 10 | 11 | configure_make( 12 | name = "icu", 13 | args = ["-j8"], 14 | configure_command = "icu4c/source/runConfigureICU", 15 | tags = ["no-sandbox"], 16 | configure_options = select({ 17 | "@platforms//os:linux": ["Linux"], 18 | "@platforms//os:macos": ["MacOSX"], 19 | }) + [ 20 | "--enable-static", 21 | "--disable-shared", 22 | ], 23 | lib_source = ":all_srcs", 24 | out_static_libs = [ 25 | "libicui18n.a", 26 | "libicutu.a", 27 | "libicuuc.a", 28 | "libicudata.a", 29 | "libicuio.a", 30 | ], 31 | ) 32 | -------------------------------------------------------------------------------- /bazel/icu/icu.patch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | --- icu4c/source/common/BUILD.bazel 5 | +++ /dev/null 6 | @@ -1,1213 +0,0 @@ 7 | -# © 2021 and later: Unicode, Inc. and others. 8 | -# License & terms of use: http://www.unicode.org/copyright.html 9 | - 10 | -# This file defines Bazel targets for a subset of ICU4C "common" library header and source files. 11 | -# The configuration of dependencies among targets is strongly assisted by the 12 | -# file in depstest that maintains such information, at 13 | -# icu4c/source/test/depstest/dependencies.txt . 14 | - 15 | -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") 16 | - 17 | -package( 18 | - default_visibility = ["//visibility:public"], 19 | -) 20 | - 21 | -# When compiling code in the `common` dir, the constant 22 | -# `U_COMMON_IMPLEMENTATION` needs to be defined. See 23 | -# https://unicode-org.github.io/icu/userguide/howtouseicu#c-with-your-own-build-system . 24 | - 25 | -# If linker errors occur, then this may be a sign that the dependencies were 26 | -# not specified correctly. Use dependencies.txt in depstest for assistance. See 27 | -# https://stackoverflow.com/q/66111709/2077918 . 28 | - 29 | -cc_library( 30 | - name = "headers", 31 | - hdrs = glob([ 32 | - "unicode/*.h", # public 33 | - "*.h", # internal 34 | - ], 35 | - # Instead of using these checked-in files, our Bazel build process 36 | - # regenerates them and then uses the new versions. 37 | - # Same list of .h files as in icu4c/source/data/unidata/clean.sh. 38 | - exclude = ["norm2_nfc_data.h", "propname_data.h", "*_props_data.h"], 39 | - ), 40 | - # We need to add includes in order to preserve existing source files' 41 | - # include directives that use traditional paths, not paths relative to 42 | - # Bazel workspace: 43 | - # https://stackoverflow.com/a/65635893/2077918 44 | - includes = ["."], 45 | - local_defines = [ 46 | - "U_COMMON_IMPLEMENTATION", 47 | - ], 48 | -) 49 | - 50 | -cc_library( 51 | - name = "platform", 52 | - srcs = [ 53 | - "cmemory.cpp", 54 | - "uobject.cpp", 55 | - "cstring.cpp", 56 | - "cwchar.cpp", 57 | - "uinvchar.cpp", 58 | - "charstr.cpp", 59 | - "unistr.cpp", 60 | - "appendable.cpp", 61 | - "stringpiece.cpp", 62 | - "ustrtrns.cpp", 63 | - "ustring.cpp", 64 | - "ustrfmt.cpp", 65 | - "utf_impl.cpp", 66 | - "putil.cpp", 67 | - "ucln_cmn.cpp", 68 | - "udataswp.cpp", 69 | - "umath.cpp", 70 | - "umutex.cpp", 71 | - "sharedobject.cpp", 72 | - "utrace.cpp", 73 | - ], 74 | - deps = [ 75 | - ":headers", 76 | - # omit other deps b/c they are sys symbols 77 | - ], 78 | - local_defines = [ 79 | - "U_COMMON_IMPLEMENTATION", 80 | - ], 81 | - linkopts = ["-ldl"], 82 | -) 83 | - 84 | -cc_library( 85 | - name = "utrie", 86 | - srcs = ["utrie.cpp"], 87 | - deps = [":platform"], 88 | - local_defines = [ 89 | - "U_COMMON_IMPLEMENTATION", 90 | - ], 91 | -) 92 | - 93 | -cc_library( 94 | - name = "utrie2", 95 | - srcs = ["utrie2.cpp"], 96 | - deps = [":platform"], 97 | - local_defines = [ 98 | - "U_COMMON_IMPLEMENTATION", 99 | - ], 100 | -) 101 | - 102 | -cc_library( 103 | - name = "utrie2_builder", 104 | - srcs = ["utrie2_builder.cpp"], 105 | - deps = [ 106 | - ":utrie", 107 | - ":utrie2", 108 | - ":platform", 109 | - ], 110 | - local_defines = [ 111 | - "U_COMMON_IMPLEMENTATION", 112 | - ], 113 | -) 114 | - 115 | -cc_library( 116 | - name = "ucptrie", 117 | - srcs = ["ucptrie.cpp"], 118 | - deps = [":platform"], 119 | - local_defines = [ 120 | - "U_COMMON_IMPLEMENTATION", 121 | - ], 122 | -) 123 | - 124 | -cc_library( 125 | - name = "umutablecptrie", 126 | - srcs = ["umutablecptrie.cpp"], 127 | - deps = [":ucptrie"], 128 | - local_defines = [ 129 | - "U_COMMON_IMPLEMENTATION", 130 | - ], 131 | -) 132 | - 133 | -cc_library( 134 | - name = "bytestrie", 135 | - srcs = ["bytestrie.cpp"], 136 | - deps = [":platform"], 137 | - local_defines = [ 138 | - "U_COMMON_IMPLEMENTATION", 139 | - ], 140 | -) 141 | - 142 | -cc_library( 143 | - name = "bytestriebuilder", 144 | - srcs = ["bytestriebuilder.cpp"], 145 | - deps = [ 146 | - ":bytestrie", 147 | - ":stringtriebuilder", 148 | - ":sort", 149 | - ], 150 | - local_defines = [ 151 | - "U_COMMON_IMPLEMENTATION", 152 | - ], 153 | -) 154 | - 155 | -cc_library( 156 | - name = "stringtriebuilder", 157 | - srcs = ["stringtriebuilder.cpp"], 158 | - deps = [ 159 | - ":uhash", 160 | - ], 161 | - local_defines = [ 162 | - "U_COMMON_IMPLEMENTATION", 163 | - ], 164 | -) 165 | - 166 | -cc_library( 167 | - name = "uhash", 168 | - hdrs = [ 169 | - "uhash.h", 170 | - ], 171 | - srcs = [ 172 | - "uhash.cpp", 173 | - ], 174 | - deps = [ 175 | - ":headers", 176 | - ], 177 | - local_defines = [ 178 | - "U_COMMON_IMPLEMENTATION", 179 | - ], 180 | -) 181 | - 182 | -cc_library( 183 | - name = "errorcode", 184 | - hdrs = [ 185 | - ], 186 | - srcs = [ 187 | - "errorcode.cpp", 188 | - ], 189 | - includes = ["."], 190 | - deps = [ 191 | - ":platform", 192 | - ":utypes", 193 | - ], 194 | - local_defines = [ 195 | - "U_COMMON_IMPLEMENTATION", 196 | - ], 197 | -) 198 | - 199 | -cc_library( 200 | - name = "utypes", 201 | - srcs = [ 202 | - "utypes.cpp", 203 | - ], 204 | - includes = ["."], 205 | - deps = [ 206 | - ":headers", 207 | - ], 208 | - local_defines = [ 209 | - "U_COMMON_IMPLEMENTATION", 210 | - ], 211 | -) 212 | - 213 | -cc_library( 214 | - name = "uniset", 215 | - srcs = [ 216 | - "uniset.cpp", 217 | - "unifilt.cpp", 218 | - "unisetspan.cpp", 219 | - "bmpset.cpp", 220 | - "util.cpp", 221 | - "unifunct.cpp", 222 | - "usetiter.cpp", 223 | - ], 224 | - includes = ["."], 225 | - deps = [ 226 | - ":patternprops", 227 | - ":uvector", 228 | - ":headers", 229 | - ], 230 | - local_defines = [ 231 | - "U_COMMON_IMPLEMENTATION", 232 | - ], 233 | -) 234 | - 235 | -cc_library( 236 | - name = "patternprops", 237 | - srcs = [ 238 | - "patternprops.cpp", 239 | - ], 240 | - includes = ["."], 241 | - deps = [ 242 | - ":headers", 243 | - ], 244 | - local_defines = [ 245 | - "U_COMMON_IMPLEMENTATION", 246 | - ], 247 | -) 248 | - 249 | -cc_library( 250 | - name = "propsvec", 251 | - srcs = [ 252 | - "propsvec.cpp", 253 | - ], 254 | - includes = ["."], 255 | - deps = [ 256 | - ":sort", 257 | - ":utrie2_builder", 258 | - ":headers", 259 | - ], 260 | - local_defines = [ 261 | - "U_COMMON_IMPLEMENTATION", 262 | - ], 263 | -) 264 | - 265 | -cc_library( 266 | - name = "propname", 267 | - srcs = [ 268 | - "propname.cpp", 269 | - "propname_data.h", 270 | - ], 271 | - includes = ["."], 272 | - deps = [ 273 | - ":bytestrie", 274 | - ":headers", 275 | - ], 276 | - local_defines = [ 277 | - "U_COMMON_IMPLEMENTATION", 278 | - ], 279 | -) 280 | - 281 | -# Note: The cc_library target names "uvector32" and "uvector64" match the 282 | -# dependencies.txt group names, but the filenames are "uvectr32.*"/"uvectr64.*". 283 | -cc_library( 284 | - name = "uvector32", 285 | - srcs = [ 286 | - "uvectr32.cpp", 287 | - ], 288 | - includes = ["."], 289 | - deps = [ 290 | - ":headers", 291 | - ":platform", 292 | - ], 293 | - local_defines = [ 294 | - "U_COMMON_IMPLEMENTATION", 295 | - ], 296 | -) 297 | - 298 | -cc_library( 299 | - name = "uvector64", 300 | - srcs = [ 301 | - "uvectr64.cpp", 302 | - ], 303 | - includes = ["."], 304 | - deps = [ 305 | - ":headers", 306 | - ":platform", 307 | - ], 308 | - local_defines = [ 309 | - "U_COMMON_IMPLEMENTATION", 310 | - ], 311 | -) 312 | - 313 | -cc_library( 314 | - name = "sort", 315 | - srcs = [ 316 | - "uarrsort.cpp", 317 | - ], 318 | - includes = ["."], 319 | - deps = [ 320 | - ":headers", 321 | - ], 322 | - local_defines = [ 323 | - "U_COMMON_IMPLEMENTATION", 324 | - ], 325 | -) 326 | - 327 | -cc_library( 328 | - name = "uvector", 329 | - srcs = [ 330 | - "uvector.cpp", 331 | - ], 332 | - includes = ["."], 333 | - deps = [ 334 | - ":platform", 335 | - ":sort", 336 | - ], 337 | - local_defines = [ 338 | - "U_COMMON_IMPLEMENTATION", 339 | - ], 340 | -) 341 | - 342 | -cc_library( 343 | - name = "breakiterator", 344 | - srcs = [ 345 | - "brkiter.cpp", 346 | - "brkeng.cpp", 347 | - "dictbe.cpp", 348 | - "dictionarydata.cpp", 349 | - "filteredbrk.cpp", 350 | - "lstmbe.cpp", 351 | - "rbbi.cpp", 352 | - "rbbi_cache.cpp", 353 | - "rbbidata.cpp", 354 | - "rbbinode.cpp", 355 | - "rbbirb.cpp", 356 | - "rbbiscan.cpp", 357 | - "rbbisetb.cpp", 358 | - "rbbistbl.cpp", 359 | - "rbbitblb.cpp", 360 | - "ubrk.cpp", 361 | - ], 362 | - includes = ["."], 363 | - deps = [ 364 | - ":bytestrie", 365 | - ":headers", 366 | - ":normlzr", 367 | - ":resourcebundle", 368 | - ":schriter", 369 | - ":service_registration", 370 | - ":ucharstrie", 371 | - ":ucharstriebuilder", 372 | - ":uhash", 373 | - ":uniset_core", 374 | - ":uniset_props", 375 | - ":ustack", 376 | - ":utext", 377 | - ":utrie2_builder", 378 | - ":uvector32", 379 | - ], 380 | - local_defines = [ 381 | - "U_COMMON_IMPLEMENTATION", 382 | - ], 383 | -) 384 | - 385 | -cc_library( 386 | - name = "bytesinkutil", 387 | - srcs = [ 388 | - "bytesinkutil.cpp", 389 | - ], 390 | - includes = ["."], 391 | - deps = [ 392 | - ":headers", 393 | - ":bytestream", 394 | - ":edits", 395 | - ], 396 | - local_defines = [ 397 | - "U_COMMON_IMPLEMENTATION", 398 | - ], 399 | -) 400 | - 401 | -cc_library( 402 | - name = "bytestream", 403 | - srcs = [ 404 | - "bytestream.cpp", 405 | - ], 406 | - includes = ["."], 407 | - deps = [ 408 | - ":headers", 409 | - ":platform", 410 | - ], 411 | - local_defines = [ 412 | - "U_COMMON_IMPLEMENTATION", 413 | - ], 414 | -) 415 | - 416 | -cc_library( 417 | - name = "canonical_iterator", 418 | - srcs = [ 419 | - "caniter.cpp", 420 | - ], 421 | - deps = [ 422 | - ":normalizer2", 423 | - ":usetiter", 424 | - ], 425 | - local_defines = [ 426 | - "U_COMMON_IMPLEMENTATION", 427 | - ], 428 | -) 429 | - 430 | -cc_library( 431 | - name = "characterproperties", 432 | - srcs = [ 433 | - "characterproperties.cpp", 434 | - ], 435 | - includes = ["."], 436 | - deps = [ 437 | - ":headers", 438 | - ":emojiprops", 439 | - ":ucptrie", 440 | - ":umutablecptrie", 441 | - ":uniset_core", 442 | - ":uprops", 443 | - ], 444 | - local_defines = [ 445 | - "U_COMMON_IMPLEMENTATION", 446 | - ], 447 | -) 448 | - 449 | -cc_library( 450 | - name = "chariter", 451 | - srcs = [ 452 | - "chariter.cpp", 453 | - ], 454 | - includes = ["."], 455 | - deps = [ 456 | - ":headers", 457 | - ":platform", 458 | - ], 459 | - local_defines = [ 460 | - "U_COMMON_IMPLEMENTATION", 461 | - ], 462 | -) 463 | - 464 | -cc_library( 465 | - name = "edits", 466 | - srcs = [ 467 | - "edits.cpp", 468 | - ], 469 | - includes = ["."], 470 | - deps = [ 471 | - ":headers", 472 | - ":icu_utility", 473 | - ":platform", 474 | - ], 475 | - local_defines = [ 476 | - "U_COMMON_IMPLEMENTATION", 477 | - ], 478 | -) 479 | - 480 | -cc_library( 481 | - name = "filterednormalizer2", 482 | - srcs = [ 483 | - "filterednormalizer2.cpp", 484 | - ], 485 | - includes = ["."], 486 | - deps = [ 487 | - ":headers", 488 | - ":normalizer2", 489 | - ], 490 | - local_defines = [ 491 | - "U_COMMON_IMPLEMENTATION", 492 | - ], 493 | -) 494 | - 495 | -cc_library( 496 | - name = "hashtable", 497 | - srcs = [ 498 | - "uhash_us.cpp", 499 | - ], 500 | - includes = ["."], 501 | - deps = [ 502 | - ":headers", 503 | - ":uhash", 504 | - ], 505 | - local_defines = [ 506 | - "U_COMMON_IMPLEMENTATION", 507 | - ], 508 | -) 509 | - 510 | -cc_library( 511 | - name = "icu_utility", 512 | - srcs = [ 513 | - "util.cpp", 514 | - ], 515 | - includes = ["."], 516 | - deps = [ 517 | - ":headers", 518 | - ":patternprops", 519 | - ":platform", 520 | - ], 521 | - local_defines = [ 522 | - "U_COMMON_IMPLEMENTATION", 523 | - ], 524 | -) 525 | - 526 | -cc_library( 527 | - name = "loadednormalizer2", 528 | - srcs = [ 529 | - "loadednormalizer2impl.cpp", 530 | - ], 531 | - includes = ["."], 532 | - deps = [ 533 | - ":headers", 534 | - ":normalizer2", 535 | - ], 536 | - local_defines = [ 537 | - "U_COMMON_IMPLEMENTATION", 538 | - ], 539 | -) 540 | - 541 | -cc_library( 542 | - name = "locale_display_names", 543 | - srcs = [ 544 | - "locdispnames.cpp", 545 | - ], 546 | - includes = ["."], 547 | - deps = [ 548 | - ":headers", 549 | - ":locresdata", 550 | - ], 551 | - local_defines = [ 552 | - "U_COMMON_IMPLEMENTATION", 553 | - ], 554 | -) 555 | - 556 | -cc_library( 557 | - name = "locresdata", 558 | - srcs = [ 559 | - "locresdata.cpp", 560 | - ], 561 | - includes = ["."], 562 | - deps = [ 563 | - ":headers", 564 | - ":resourcebundle", 565 | - ], 566 | - local_defines = [ 567 | - "U_COMMON_IMPLEMENTATION", 568 | - ], 569 | -) 570 | - 571 | -cc_library( 572 | - name = "normlzr", 573 | - srcs = [ 574 | - "normlzr.cpp", 575 | - ], 576 | - includes = ["."], 577 | - deps = [ 578 | - ":filterednormalizer2", 579 | - ":headers", 580 | - ":schriter", 581 | - ":uniset_props", 582 | - ], 583 | - local_defines = [ 584 | - "U_COMMON_IMPLEMENTATION", 585 | - ], 586 | -) 587 | - 588 | -cc_library( 589 | - name = "parsepos", 590 | - srcs = [ 591 | - "parsepos.cpp", 592 | - ], 593 | - includes = ["."], 594 | - deps = [ 595 | - ":headers", 596 | - ":platform", 597 | - ], 598 | - local_defines = [ 599 | - "U_COMMON_IMPLEMENTATION", 600 | - ], 601 | -) 602 | - 603 | -cc_library( 604 | - name = "resourcebundle", 605 | - srcs = [ 606 | - "localebuilder.cpp", 607 | - "locavailable.cpp", 608 | - "locbased.cpp", 609 | - "locid.cpp", 610 | - "loclikely.cpp", 611 | - "locmap.cpp", 612 | - "resbund.cpp", 613 | - "resource.cpp", 614 | - "uloc.cpp", 615 | - "uloc_tag.cpp", 616 | - "uloc_keytype.cpp", 617 | - "uresbund.cpp", 618 | - "uresdata.cpp", 619 | - "wintz.cpp", 620 | - ], 621 | - includes = ["."], 622 | - deps = [ 623 | - ":bytesinkutil", 624 | - ":errorcode", 625 | - ":headers", 626 | - ":propname", 627 | - ":sort", 628 | - ":stringenumeration", 629 | - ":ucol_swp", 630 | - ":udata", 631 | - ":uhash", 632 | - ":uscript_props", 633 | - ":uvector", 634 | - ], 635 | - local_defines = [ 636 | - "U_COMMON_IMPLEMENTATION", 637 | - ], 638 | -) 639 | - 640 | -cc_library( 641 | - name = "schriter", 642 | - srcs = [ 643 | - "schriter.cpp", 644 | - "uchriter.cpp", 645 | - ], 646 | - includes = ["."], 647 | - deps = [ 648 | - ":chariter", 649 | - ":headers", 650 | - ], 651 | - local_defines = [ 652 | - "U_COMMON_IMPLEMENTATION", 653 | - ], 654 | -) 655 | - 656 | -cc_library( 657 | - name = "service_registration", 658 | - srcs = [ 659 | - "locutil.cpp", 660 | - "serv.cpp", 661 | - "servlk.cpp", 662 | - "servlkf.cpp", 663 | - "servls.cpp", 664 | - "servnotf.cpp", 665 | - "servrbf.cpp", 666 | - "servslkf.cpp", 667 | - ], 668 | - includes = ["."], 669 | - deps = [ 670 | - ":hashtable", 671 | - ":headers", 672 | - ":locale_display_names", 673 | - ":resourcebundle", 674 | - ":uvector", 675 | - ], 676 | - local_defines = [ 677 | - "U_COMMON_IMPLEMENTATION", 678 | - ], 679 | -) 680 | - 681 | -cc_library( 682 | - name = "stringenumeration", 683 | - srcs = [ 684 | - "uenum.cpp", 685 | - "ustrenum.cpp", 686 | - ], 687 | - includes = ["."], 688 | - deps = [ 689 | - ":headers", 690 | - ":platform", 691 | - ], 692 | - local_defines = [ 693 | - "U_COMMON_IMPLEMENTATION", 694 | - ], 695 | -) 696 | - 697 | -cc_library( 698 | - name = "ubidi_props", 699 | - srcs = [ 700 | - "ubidi_props.cpp", 701 | - "ubidi_props_data.h", 702 | - ], 703 | - includes = ["."], 704 | - deps = [ 705 | - ":headers", 706 | - ":utrie2", 707 | - ], 708 | - local_defines = [ 709 | - "U_COMMON_IMPLEMENTATION", 710 | - ], 711 | -) 712 | - 713 | -cc_library( 714 | - name = "ucase", 715 | - srcs = [ 716 | - "ucase.cpp", 717 | - "ucase_props_data.h", 718 | - ], 719 | - includes = ["."], 720 | - deps = [ 721 | - ":headers", 722 | - ":utrie2", 723 | - ], 724 | - local_defines = [ 725 | - "U_COMMON_IMPLEMENTATION", 726 | - ], 727 | -) 728 | - 729 | -cc_library( 730 | - name = "uchar", 731 | - srcs = [ 732 | - "uchar.cpp", 733 | - "uchar_props_data.h", 734 | - ], 735 | - includes = ["."], 736 | - deps = [ 737 | - ":headers", 738 | - ":utrie2", 739 | - ], 740 | - local_defines = [ 741 | - "U_COMMON_IMPLEMENTATION", 742 | - ], 743 | -) 744 | - 745 | -cc_library( 746 | - name = "emojiprops", 747 | - srcs = [ 748 | - "emojiprops.cpp", 749 | - "emojiprops.h", 750 | - ], 751 | - includes = ["."], 752 | - deps = [ 753 | - ":headers", 754 | - ":ucharstrie", 755 | - ":ucharstrieiterator", 756 | - ":ucptrie", 757 | - ":udata", 758 | - ], 759 | - local_defines = [ 760 | - "U_COMMON_IMPLEMENTATION", 761 | - ], 762 | -) 763 | - 764 | -cc_library( 765 | - name = "ucharstrie", 766 | - srcs = [ 767 | - "ucharstrie.cpp", 768 | - ], 769 | - includes = ["."], 770 | - deps = [ 771 | - ":headers", 772 | - ":platform", 773 | - ], 774 | - local_defines = [ 775 | - "U_COMMON_IMPLEMENTATION", 776 | - ], 777 | -) 778 | - 779 | -cc_library( 780 | - name = "ucharstriebuilder", 781 | - srcs = [ 782 | - "ucharstriebuilder.cpp", 783 | - ], 784 | - includes = ["."], 785 | - deps = [ 786 | - ":headers", 787 | - ":sort", 788 | - ":stringtriebuilder", 789 | - ":ucharstrie", 790 | - ], 791 | - local_defines = [ 792 | - "U_COMMON_IMPLEMENTATION", 793 | - ], 794 | -) 795 | - 796 | -cc_library( 797 | - name = "ucharstrieiterator", 798 | - srcs = [ 799 | - "ucharstrieiterator.cpp", 800 | - ], 801 | - includes = ["."], 802 | - deps = [ 803 | - ":headers", 804 | - ":ucharstrie", 805 | - ":uvector32", 806 | - ], 807 | - local_defines = [ 808 | - "U_COMMON_IMPLEMENTATION", 809 | - ], 810 | -) 811 | - 812 | -cc_library( 813 | - name = "ucol_swp", 814 | - srcs = [ 815 | - "ucol_swp.cpp", 816 | - ], 817 | - includes = ["."], 818 | - deps = [ 819 | - ":headers", 820 | - ":utrie_swap", 821 | - ], 822 | - local_defines = [ 823 | - "U_COMMON_IMPLEMENTATION", 824 | - ], 825 | -) 826 | - 827 | -cc_library( 828 | - name = "udata", 829 | - srcs = [ 830 | - "restrace.cpp", 831 | - "ucmndata.cpp", 832 | - "udata.cpp", 833 | - "udatamem.cpp", 834 | - "umapfile.cpp", 835 | - ], 836 | - includes = ["."], 837 | - deps = [ 838 | - ":headers", 839 | - ":icu_utility", 840 | - ":platform", 841 | - ":uhash", 842 | - "//icu4c/source/stubdata", 843 | - ], 844 | - local_defines = [ 845 | - "U_COMMON_IMPLEMENTATION", 846 | - ], 847 | -) 848 | - 849 | -cc_library( 850 | - name = "uiter", 851 | - srcs = [ 852 | - "uiter.cpp", 853 | - ], 854 | - includes = ["."], 855 | - deps = [ 856 | - ":headers", 857 | - ":platform", 858 | - ], 859 | - local_defines = [ 860 | - "U_COMMON_IMPLEMENTATION", 861 | - ], 862 | -) 863 | - 864 | -cc_library( 865 | - name = "ulist", 866 | - srcs = [ 867 | - "ulist.cpp", 868 | - ], 869 | - includes = ["."], 870 | - deps = [ 871 | - ":headers", 872 | - ":platform", 873 | - ], 874 | - local_defines = [ 875 | - "U_COMMON_IMPLEMENTATION", 876 | - ], 877 | -) 878 | - 879 | -cc_library( 880 | - name = "unames", 881 | - srcs = [ 882 | - "unames.cpp", 883 | - ], 884 | - includes = ["."], 885 | - deps = [ 886 | - ":headers", 887 | - ":uchar", 888 | - ":udata", 889 | - ], 890 | - local_defines = [ 891 | - "U_COMMON_IMPLEMENTATION", 892 | - ], 893 | -) 894 | - 895 | -cc_library( 896 | - name = "unifiedcache", 897 | - srcs = [ 898 | - "unifiedcache.cpp", 899 | - ], 900 | - includes = ["."], 901 | - deps = [ 902 | - ":headers", 903 | - ":platform", 904 | - ":uhash", 905 | - ], 906 | - local_defines = [ 907 | - "U_COMMON_IMPLEMENTATION", 908 | - ], 909 | -) 910 | - 911 | -cc_library( 912 | - name = "uniset_core", 913 | - srcs = [ 914 | - "bmpset.cpp", 915 | - "unifilt.cpp", 916 | - "unifunct.cpp", 917 | - "uniset.cpp", 918 | - "unisetspan.cpp", 919 | - ], 920 | - includes = ["."], 921 | - deps = [ 922 | - ":headers", 923 | - ":icu_utility", 924 | - ":patternprops", 925 | - ":uvector", 926 | - ], 927 | - local_defines = [ 928 | - "U_COMMON_IMPLEMENTATION", 929 | - ], 930 | -) 931 | - 932 | -cc_library( 933 | - name = "uniset_closure", 934 | - srcs = [ 935 | - "uniset_closure.cpp", 936 | - ], 937 | - includes = ["."], 938 | - deps = [ 939 | - ":headers", 940 | - ":uniset_core", 941 | - ":unistr_case_locale", 942 | - ":unistr_titlecase_brkiter", 943 | - ], 944 | - local_defines = [ 945 | - "U_COMMON_IMPLEMENTATION", 946 | - ], 947 | -) 948 | - 949 | -cc_library( 950 | - name = "uniset_props", 951 | - srcs = [ 952 | - "uniset_props.cpp", 953 | - "ruleiter.cpp", 954 | - ], 955 | - includes = ["."], 956 | - deps = [ 957 | - ":characterproperties", 958 | - ":headers", 959 | - ":parsepos", 960 | - ":propname", 961 | - ":resourcebundle", 962 | - ":unames", 963 | - ":uniset_core", 964 | - ":unistr_case", 965 | - ":uprops", 966 | - ], 967 | - local_defines = [ 968 | - "U_COMMON_IMPLEMENTATION", 969 | - ], 970 | -) 971 | - 972 | -cc_library( 973 | - name = "unistr_case", 974 | - srcs = [ 975 | - "unistr_case.cpp", 976 | - ], 977 | - includes = ["."], 978 | - deps = [ 979 | - ":headers", 980 | - ":ustring_case", 981 | - ], 982 | - local_defines = [ 983 | - "U_COMMON_IMPLEMENTATION", 984 | - ], 985 | -) 986 | - 987 | -cc_library( 988 | - name = "unistr_case_locale", 989 | - srcs = [ 990 | - "unistr_case_locale.cpp", 991 | - ], 992 | - includes = ["."], 993 | - deps = [ 994 | - ":headers", 995 | - ":unistr_case", 996 | - ":ustring_case_locale", 997 | - ], 998 | - local_defines = [ 999 | - "U_COMMON_IMPLEMENTATION", 1000 | - ], 1001 | -) 1002 | - 1003 | -cc_library( 1004 | - name = "unistr_titlecase_brkiter", 1005 | - srcs = [ 1006 | - "unistr_titlecase_brkiter.cpp", 1007 | - ], 1008 | - includes = ["."], 1009 | - deps = [ 1010 | - ":headers", 1011 | - ":ustr_titlecase_brkiter", 1012 | - ], 1013 | - local_defines = [ 1014 | - "U_COMMON_IMPLEMENTATION", 1015 | - ], 1016 | -) 1017 | - 1018 | -cc_library( 1019 | - name = "uprops", 1020 | - srcs = [ 1021 | - "uprops.cpp", 1022 | - ], 1023 | - includes = ["."], 1024 | - deps = [ 1025 | - ":headers", 1026 | - ":emojiprops", 1027 | - ":loadednormalizer2", 1028 | - ":normalizer2", 1029 | - ":ubidi_props", 1030 | - ":ucase", 1031 | - ":uchar", 1032 | - ":unistr_case", 1033 | - ":ustring_case", 1034 | - ], 1035 | - local_defines = [ 1036 | - "U_COMMON_IMPLEMENTATION", 1037 | - ], 1038 | -) 1039 | - 1040 | -cc_library( 1041 | - name = "uscript_props", 1042 | - srcs = [ 1043 | - "uscript_props.cpp", 1044 | - ], 1045 | - includes = ["."], 1046 | - deps = [ 1047 | - ":headers", 1048 | - ":platform", 1049 | - ], 1050 | - local_defines = [ 1051 | - "U_COMMON_IMPLEMENTATION", 1052 | - ], 1053 | -) 1054 | - 1055 | -cc_library( 1056 | - name = "uset", 1057 | - srcs = [ 1058 | - "uset.cpp", 1059 | - ], 1060 | - includes = ["."], 1061 | - deps = [ 1062 | - ":headers", 1063 | - ":platform", 1064 | - ":uniset_core", 1065 | - ], 1066 | - local_defines = [ 1067 | - "U_COMMON_IMPLEMENTATION", 1068 | - ], 1069 | -) 1070 | - 1071 | -cc_library( 1072 | - name = "uset_props", 1073 | - srcs = [ 1074 | - "uset_props.cpp", 1075 | - ], 1076 | - includes = ["."], 1077 | - deps = [ 1078 | - ":headers", 1079 | - ":uniset_closure", 1080 | - ":uniset_core", 1081 | - ":uniset_props", 1082 | - ], 1083 | - local_defines = [ 1084 | - "U_COMMON_IMPLEMENTATION", 1085 | - ], 1086 | -) 1087 | - 1088 | -cc_library( 1089 | - name = "usetiter", 1090 | - srcs = [ 1091 | - "usetiter.cpp", 1092 | - ], 1093 | - includes = ["."], 1094 | - deps = [ 1095 | - ":headers", 1096 | - ":platform", 1097 | - ":uniset_core", 1098 | - ], 1099 | - local_defines = [ 1100 | - "U_COMMON_IMPLEMENTATION", 1101 | - ], 1102 | -) 1103 | - 1104 | -cc_library( 1105 | - name = "ustack", 1106 | - srcs = [ 1107 | - "ustack.cpp", 1108 | - ], 1109 | - includes = ["."], 1110 | - deps = [ 1111 | - ":headers", 1112 | - ":uvector", 1113 | - ], 1114 | - local_defines = [ 1115 | - "U_COMMON_IMPLEMENTATION", 1116 | - ], 1117 | -) 1118 | - 1119 | -cc_library( 1120 | - name = "ustr_titlecase_brkiter", 1121 | - srcs = [ 1122 | - "ustr_titlecase_brkiter.cpp", 1123 | - ], 1124 | - includes = ["."], 1125 | - deps = [ 1126 | - ":breakiterator", 1127 | - ":headers", 1128 | - ":ucase", 1129 | - ":ustring_case_locale", 1130 | - ], 1131 | - local_defines = [ 1132 | - "U_COMMON_IMPLEMENTATION", 1133 | - ], 1134 | -) 1135 | - 1136 | -cc_library( 1137 | - name = "ustring_case", 1138 | - srcs = [ 1139 | - "ustrcase.cpp", 1140 | - ], 1141 | - includes = ["."], 1142 | - deps = [ 1143 | - ":headers", 1144 | - ":ucase", 1145 | - ":uchar", 1146 | - ":edits", 1147 | - ], 1148 | - local_defines = [ 1149 | - "U_COMMON_IMPLEMENTATION", 1150 | - ], 1151 | -) 1152 | - 1153 | -cc_library( 1154 | - name = "ustring_case_locale", 1155 | - srcs = [ 1156 | - "ustrcase_locale.cpp", 1157 | - ], 1158 | - includes = ["."], 1159 | - deps = [ 1160 | - ":headers", 1161 | - ":resourcebundle", 1162 | - ":ustring_case", 1163 | - ], 1164 | - local_defines = [ 1165 | - "U_COMMON_IMPLEMENTATION", 1166 | - ], 1167 | -) 1168 | - 1169 | -cc_library( 1170 | - name = "utext", 1171 | - srcs = [ 1172 | - "utext.cpp", 1173 | - ], 1174 | - includes = ["."], 1175 | - deps = [ 1176 | - ":headers", 1177 | - ":ucase", 1178 | - ], 1179 | - local_defines = [ 1180 | - "U_COMMON_IMPLEMENTATION", 1181 | - ], 1182 | -) 1183 | - 1184 | -cc_library( 1185 | - name = "utrie_swap", 1186 | - srcs = [ 1187 | - "utrie_swap.cpp", 1188 | - ], 1189 | - includes = ["."], 1190 | - deps = [ 1191 | - ":headers", 1192 | - ":udata", 1193 | - ], 1194 | - local_defines = [ 1195 | - "U_COMMON_IMPLEMENTATION", 1196 | - ], 1197 | -) 1198 | - 1199 | -# This target depends on a header file that contains NFC/NFD normalization data. 1200 | -# This header file is generated by a script (generate.sh) that invokes the gennorm2 binary. 1201 | -# See the Unicode update change log (changes.txt). 1202 | -cc_library( 1203 | - name = "normalizer2", 1204 | - srcs = [ 1205 | - "norm2_nfc_data.h", # generated by gennorm2 1206 | - "normalizer2.cpp", 1207 | - "normalizer2impl.cpp", 1208 | - ], 1209 | - includes = ["."], 1210 | - hdrs = [ 1211 | - "normalizer2impl.h", 1212 | - ], 1213 | - deps = [ 1214 | - ":headers", 1215 | - ], 1216 | - local_defines = [ 1217 | - "U_COMMON_IMPLEMENTATION", 1218 | - ], 1219 | -) 1220 | 1221 | 1222 | 1223 | --- icu4c/source/data/unidata/norm2/BUILD.bazel 1224 | +++ /dev/null 1225 | @@ -1,13 +0,0 @@ 1226 | -# © 2021 and later: Unicode, Inc. and others. 1227 | -# License & terms of use: http://www.unicode.org/copyright.html 1228 | - 1229 | -# This Bazel build file is needed to declare targets for the files used as 1230 | -# inputs to binary executables that are a part of other Bazel genrule targets. 1231 | - 1232 | -package( 1233 | - default_visibility = ["//visibility:public"], 1234 | -) 1235 | - 1236 | -exports_files([ 1237 | - "nfc.txt", "nfkc.txt", "nfkc_cf.txt", "uts46.txt", 1238 | -]) 1239 | 1240 | 1241 | 1242 | --- icu4c/source/i18n/BUILD.bazel 1243 | +++ /dev/null 1244 | @@ -1,130 +0,0 @@ 1245 | -# © 2021 and later: Unicode, Inc. and others. 1246 | -# License & terms of use: http://www.unicode.org/copyright.html 1247 | - 1248 | -# This file defines Bazel targets for a subset of the ICU4C "i18n" library header and source files. 1249 | -# The configuration of dependencies among targets is strongly assisted by the 1250 | -# file in depstest that maintains such information, at 1251 | -# icu4c/source/test/depstest/dependencies.txt . 1252 | - 1253 | -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") 1254 | - 1255 | -package( 1256 | - default_visibility = ["//visibility:public"], 1257 | -) 1258 | - 1259 | -# When compiling code in the `common` dir, the constant 1260 | -# `U_I18n_IMPLEMENTATION` needs to be defined. See 1261 | -# https://unicode-org.github.io/icu/userguide/howtouseicu#c-with-your-own-build-system . 1262 | - 1263 | -# If linker errors occur, then this may be a sign that the dependencies were 1264 | -# not specified correctly. Use dependencies.txt in depstest for assistance. See 1265 | -# https://stackoverflow.com/q/66111709/2077918 . 1266 | - 1267 | -cc_library( 1268 | - name = "headers", 1269 | - hdrs = glob([ 1270 | - "unicode/*.h", # public 1271 | - "*.h", # internal 1272 | - ]), 1273 | - # We need to add includes in order to preserve existing source files' 1274 | - # include directives that use traditional paths, not paths relative to 1275 | - # Bazel workspace: 1276 | - # https://stackoverflow.com/a/65635893/2077918 1277 | - includes = ["."], 1278 | - local_defines = [ 1279 | - "U_I18N_IMPLEMENTATION", 1280 | - ], 1281 | -) 1282 | - 1283 | -cc_library( 1284 | - name = "collation", 1285 | - srcs = [ 1286 | - "bocsu.cpp", 1287 | - "coleitr.cpp", 1288 | - "coll.cpp", 1289 | - "collation.cpp", 1290 | - "collationcompare.cpp", 1291 | - "collationdata.cpp", 1292 | - "collationdatareader.cpp", 1293 | - "collationdatawriter.cpp", 1294 | - "collationfastlatin.cpp", 1295 | - # collationfcd.cpp is generated by genuca; 1296 | - # probably hard to build genuca without depending on the old version. 1297 | - "collationfcd.cpp", 1298 | - "collationiterator.cpp", 1299 | - "collationkeys.cpp", 1300 | - "collationroot.cpp", 1301 | - "collationrootelements.cpp", 1302 | - "collationsets.cpp", 1303 | - "collationsettings.cpp", 1304 | - "collationtailoring.cpp", 1305 | - "rulebasedcollator.cpp", 1306 | - "sortkey.cpp", 1307 | - "ucol.cpp", 1308 | - "ucol_res.cpp", 1309 | - "ucol_sit.cpp", 1310 | - "ucoleitr.cpp", 1311 | - "uitercollationiterator.cpp", 1312 | - "utf16collationiterator.cpp", 1313 | - "utf8collationiterator.cpp", 1314 | - ], 1315 | - includes = ["."], 1316 | - deps = [ 1317 | - ":headers", 1318 | - ":uclean_i18n", 1319 | - "//icu4c/source/common:bytestream", 1320 | - "//icu4c/source/common:normalizer2", 1321 | - "//icu4c/source/common:platform", 1322 | - "//icu4c/source/common:propname", 1323 | - "//icu4c/source/common:resourcebundle", 1324 | - "//icu4c/source/common:service_registration", 1325 | - "//icu4c/source/common:ucharstrieiterator", 1326 | - "//icu4c/source/common:uiter", 1327 | - "//icu4c/source/common:ulist", 1328 | - "//icu4c/source/common:unifiedcache", 1329 | - "//icu4c/source/common:uset", 1330 | - "//icu4c/source/common:usetiter", 1331 | - "//icu4c/source/common:utrie2", 1332 | - "//icu4c/source/common:uvector32", 1333 | - "//icu4c/source/common:uvector64", 1334 | - ], 1335 | - local_defines = [ 1336 | - "U_I18N_IMPLEMENTATION", 1337 | - ], 1338 | -) 1339 | - 1340 | -cc_library( 1341 | - name = "collation_builder", 1342 | - srcs = [ 1343 | - "collationbuilder.cpp", 1344 | - "collationdatabuilder.cpp", 1345 | - "collationfastlatinbuilder.cpp", 1346 | - "collationruleparser.cpp", 1347 | - "collationweights.cpp", 1348 | - ], 1349 | - includes = ["."], 1350 | - deps = [ 1351 | - ":collation", 1352 | - "//icu4c/source/common:canonical_iterator", 1353 | - "//icu4c/source/common:ucharstriebuilder", 1354 | - "//icu4c/source/common:uset_props" 1355 | - ], 1356 | - local_defines = [ 1357 | - "U_I18N_IMPLEMENTATION", 1358 | - ], 1359 | -) 1360 | - 1361 | -cc_library( 1362 | - name = "uclean_i18n", 1363 | - srcs = [ 1364 | - "ucln_in.cpp", 1365 | - ], 1366 | - hdrs = ["ucln_in.h"], 1367 | - includes = ["."], 1368 | - deps = [ 1369 | - "//icu4c/source/common:platform", 1370 | - ], 1371 | - local_defines = [ 1372 | - "U_I18N_IMPLEMENTATION", 1373 | - ], 1374 | -) 1375 | 1376 | 1377 | --- icu4c/source/icudefs.mk.in 1378 | +++ icu4c/source/icudefs.mk.in 1379 | @@ -116,8 +116,8 @@ ENABLE_RELEASE = @ENABLE_RELEASE@ 1380 | EXEEXT = @EXEEXT@ 1381 | CC = @CC@ 1382 | CXX = @CXX@ 1383 | -AR = @AR@ 1384 | -ARFLAGS = @ARFLAGS@ r 1385 | +AR = ar 1386 | +ARFLAGS = r 1387 | RANLIB = @RANLIB@ 1388 | COMPILE_LINK_ENVVAR = @COMPILE_LINK_ENVVAR@ 1389 | UCLN_NO_AUTO_CLEANUP = @UCLN_NO_AUTO_CLEANUP@ 1390 | 1391 | 1392 | 1393 | --- icu4c/source/stubdata/BUILD.bazel 1394 | +++ /dev/null 1395 | @@ -1,23 +0,0 @@ 1396 | -# © 2021 and later: Unicode, Inc. and others. 1397 | -# License & terms of use: http://www.unicode.org/copyright.html 1398 | - 1399 | -# This file defines Bazel targets for the ICU4C "stubdata" library header and source files. 1400 | - 1401 | -load("@rules_cc//cc:defs.bzl", "cc_library") 1402 | - 1403 | -package( 1404 | - default_visibility = ["//visibility:public"], 1405 | -) 1406 | - 1407 | -# When compiling code in the `common` dir, the constant 1408 | -# `U_COMMON_IMPLEMENTATION` needs to be defined. See 1409 | -# https://unicode-org.github.io/icu/userguide/howtouseicu#c-with-your-own-build-system . 1410 | - 1411 | -cc_library( 1412 | - name = "stubdata", 1413 | - srcs = ["stubdata.cpp"], 1414 | - deps = ["//icu4c/source/common:headers"], 1415 | - local_defines = [ 1416 | - "U_COMMON_IMPLEMENTATION", 1417 | - ], 1418 | -) 1419 | 1420 | 1421 | 1422 | --- icu4c/source/tools/gennorm2/BUILD.bazel 1423 | +++ /dev/null 1424 | @@ -1,39 +0,0 @@ 1425 | -# © 2021 and later: Unicode, Inc. and others. 1426 | -# License & terms of use: http://www.unicode.org/copyright.html 1427 | - 1428 | -# This Bazel build file defines a target for the gennorm2 binary that generates 1429 | -# headers needed for bootstrapping the ICU4C build process in a way that 1430 | -# integrates the normalization data. 1431 | - 1432 | -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") 1433 | - 1434 | -package( 1435 | - default_visibility = ["//visibility:public"], 1436 | -) 1437 | - 1438 | -cc_binary( 1439 | - name = "gennorm2", 1440 | - srcs = glob([ 1441 | - "*.c", 1442 | - "*.cpp", 1443 | - "*.h", # cannot have hdrs section in cc_binary 1444 | - ]), 1445 | - deps = [ 1446 | - "//icu4c/source/common:uhash", 1447 | - "//icu4c/source/common:umutablecptrie", 1448 | - "//icu4c/source/common:ucptrie", 1449 | - "//icu4c/source/common:errorcode", 1450 | - "//icu4c/source/common:uniset", 1451 | - "//icu4c/source/common:uvector32", 1452 | - 1453 | - "//icu4c/source/common:platform", 1454 | - "//icu4c/source/common:headers", 1455 | - 1456 | - "//icu4c/source/tools/toolutil:toolutil", 1457 | - "//icu4c/source/tools/toolutil:unewdata", 1458 | - "//icu4c/source/tools/toolutil:writesrc", 1459 | - "//icu4c/source/tools/toolutil:uoptions", 1460 | - "//icu4c/source/tools/toolutil:uparse", 1461 | - ], 1462 | - linkopts = ["-pthread"], 1463 | -) 1464 | 1465 | 1466 | 1467 | --- icu4c/source/tools/toolutil/BUILD.bazel 1468 | +++ /dev/null 1469 | @@ -1,126 +0,0 @@ 1470 | -# © 2021 and later: Unicode, Inc. and others. 1471 | -# License & terms of use: http://www.unicode.org/copyright.html 1472 | - 1473 | -# This Bazel build file defines targets that are dependencies for building 1474 | -# the gennorm2 and genprops binaries. 1475 | - 1476 | -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") 1477 | - 1478 | -package( 1479 | - default_visibility = ["//visibility:public"], 1480 | -) 1481 | - 1482 | -cc_library( 1483 | - name = "toolutil", 1484 | - includes = ["."], 1485 | - hdrs = ["toolutil.h"], 1486 | - srcs = ["toolutil.cpp"], 1487 | - local_defines = [ 1488 | - "U_TOOLUTIL_IMPLEMENTATION", 1489 | - ], 1490 | - deps = ["//icu4c/source/common:platform"], 1491 | -) 1492 | - 1493 | -cc_library( 1494 | - name = "unewdata", 1495 | - includes = ["."], 1496 | - hdrs = ["unewdata.h"], 1497 | - srcs = ["unewdata.cpp"], 1498 | - local_defines = [ 1499 | - "U_TOOLUTIL_IMPLEMENTATION", 1500 | - ], 1501 | - deps = [ 1502 | - ":filestrm", 1503 | - "//icu4c/source/common:platform", 1504 | - ], 1505 | -) 1506 | - 1507 | -cc_library( 1508 | - name = "uoptions", 1509 | - includes = ["."], 1510 | - hdrs = ["uoptions.h"], 1511 | - srcs = ["uoptions.cpp"], 1512 | - local_defines = [ 1513 | - "U_TOOLUTIL_IMPLEMENTATION", 1514 | - ], 1515 | - deps = ["//icu4c/source/common:platform"], 1516 | -) 1517 | - 1518 | -cc_library( 1519 | - name = "writesrc", 1520 | - includes = ["."], 1521 | - hdrs = ["writesrc.h"], 1522 | - srcs = ["writesrc.cpp"], 1523 | - local_defines = [ 1524 | - "U_TOOLUTIL_IMPLEMENTATION", 1525 | - ], 1526 | - deps = [ 1527 | - "//icu4c/source/common:bytestream", 1528 | - "//icu4c/source/common:platform", 1529 | - "//icu4c/source/common:uniset_core", 1530 | - ], 1531 | -) 1532 | - 1533 | -cc_library( 1534 | - name = "uparse", 1535 | - includes = ["."], 1536 | - hdrs = ["uparse.h"], 1537 | - srcs = ["uparse.cpp"], 1538 | - local_defines = [ 1539 | - "U_TOOLUTIL_IMPLEMENTATION", 1540 | - ], 1541 | - deps = [ 1542 | - ":filestrm", 1543 | - "//icu4c/source/common:platform", 1544 | - ], 1545 | -) 1546 | - 1547 | -cc_library( 1548 | - name = "filestrm", 1549 | - includes = ["."], 1550 | - hdrs = ["filestrm.h"], 1551 | - srcs = ["filestrm.cpp"], 1552 | - local_defines = [ 1553 | - "U_TOOLUTIL_IMPLEMENTATION", 1554 | - ], 1555 | - deps = ["//icu4c/source/common:platform"], 1556 | -) 1557 | - 1558 | -cc_library( 1559 | - name = "ppucd", 1560 | - includes = ["."], 1561 | - hdrs = ["ppucd.h"], 1562 | - srcs = ["ppucd.cpp"], 1563 | - local_defines = [ 1564 | - "U_TOOLUTIL_IMPLEMENTATION", 1565 | - ], 1566 | - deps = [ 1567 | - ":uparse", 1568 | - "//icu4c/source/common:platform", 1569 | - ], 1570 | -) 1571 | - 1572 | -cc_library( 1573 | - name = "denseranges", 1574 | - includes = ["."], 1575 | - hdrs = ["denseranges.h"], 1576 | - srcs = ["denseranges.cpp"], 1577 | - local_defines = [ 1578 | - "U_TOOLUTIL_IMPLEMENTATION", 1579 | - ], 1580 | - deps = ["//icu4c/source/common:platform"], 1581 | -) 1582 | - 1583 | -cc_library( 1584 | - name = "collationinfo", 1585 | - includes = ["."], 1586 | - hdrs = ["collationinfo.h"], 1587 | - srcs = ["collationinfo.cpp"], 1588 | - local_defines = [ 1589 | - "U_TOOLUTIL_IMPLEMENTATION", 1590 | - ], 1591 | - deps = [ 1592 | - "//icu4c/source/common:platform", 1593 | - "//icu4c/source/i18n:headers", 1594 | - ], 1595 | -) 1596 | -------------------------------------------------------------------------------- /bazel/pybind11/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = ["//visibility:public"]) 2 | 3 | cc_library( 4 | name = "pybind11", 5 | hdrs = glob( 6 | include = [ 7 | "include/pybind11/*.h", 8 | "include/pybind11/detail/*.h", 9 | ], 10 | exclude = [ 11 | "include/pybind11/common.h", 12 | "include/pybind11/eigen.h", 13 | ], 14 | ), 15 | copts = [ 16 | "-fexceptions", 17 | "-Wno-undefined-inline", 18 | "-Wno-pragma-once-outside-header", 19 | ], 20 | includes = ["include"] 21 | ) -------------------------------------------------------------------------------- /binding.cpp: -------------------------------------------------------------------------------- 1 | #include "clip_tokenizer.h" 2 | #include 3 | #include 4 | #include 5 | 6 | PYBIND11_MODULE(clip_tokenizer_py, m) { 7 | pybind11::class_(m, "CLIPTokenizer") 8 | .def(pybind11::init()) 9 | .def("tokenize", &CLIPTokenizer::tokenize); 10 | 11 | pybind11::class_(m, "TokenizerResult") 12 | .def(pybind11::init<>()) 13 | .def_readwrite("tokens", &TokenizerResult::tokens) 14 | .def_readwrite("attention_mask", &TokenizerResult::attention_mask) 15 | .def("__repr__", [](const TokenizerResult &a) { 16 | std::string tokens; 17 | for (auto& token : a.tokens[0]) { 18 | tokens += std::to_string(token) + ", "; 19 | } 20 | tokens.pop_back(); 21 | tokens.pop_back(); 22 | std::string attention_mask; 23 | for (auto& mask : a.attention_mask[0]) { 24 | attention_mask += std::to_string(mask) + ", "; 25 | } 26 | attention_mask.pop_back(); 27 | attention_mask.pop_back(); 28 | return ""; 29 | }); 30 | } -------------------------------------------------------------------------------- /clip_tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "clip_tokenizer.h" 3 | 4 | 5 | bool is_chinese_char(UChar32 ch) { 6 | return (ch >= 0x4E00 && ch <= 0x9FFF) || 7 | (ch >= 0x3400 && ch <= 0x4DBF) || 8 | (ch >= 0x20000 && ch <= 0x2A6DF) || 9 | (ch >= 0x2A700 && ch <= 0x2B73F) || 10 | (ch >= 0x2B740 && ch <= 0x2B81F) || 11 | (ch >= 0x2B820 && ch <= 0x2CEAF) || 12 | (ch >= 0xF900 && ch <= 0xFAFF) || 13 | (ch >= 0x2F800 && ch <= 0x2FA1F); 14 | } 15 | 16 | icu::UnicodeString tokenize_chinese(const icu::UnicodeString& text) { 17 | icu::UnicodeString result; 18 | for (int32_t i = 0; i < text.length(); ++i) { 19 | UChar32 ch = text.char32At(i); 20 | if (is_chinese_char(ch)) { 21 | result += " "; 22 | result += ch; 23 | result += " "; 24 | } else { 25 | result += ch; 26 | } 27 | } 28 | return result; 29 | } 30 | 31 | std::vector get_bytes_to_unicode_vec() { 32 | std::vector result; 33 | for (int i = 0; i < 256; ++i) { 34 | if((i < 33) || (i > 126 && i < 161) || (i == 173)) 35 | continue; 36 | result.push_back(i); 37 | } 38 | 39 | // copy range to range2 40 | std::vector range2 = result; 41 | 42 | int n = 0; 43 | for (int b = 0; b < 256; ++b) { 44 | if (std::find(result.begin(), result.end(), b) == result.end()) { 45 | result.push_back(256 + n); 46 | ++n; 47 | } 48 | } 49 | 50 | return result; 51 | } 52 | 53 | std::unordered_map CLIPTokenizer::bytes_to_unicode() { 54 | std::unordered_map byteToUnicode; 55 | 56 | std::vector range, range2; 57 | 58 | for (int i = 0; i < 256; ++i) { 59 | if((i < 33) || (i > 126 && i < 161) || (i == 173)) 60 | continue; 61 | range.push_back(i); 62 | } 63 | 64 | // copy range to range2 65 | range2 = range; 66 | 67 | int n = 0; 68 | for (int b = 0; b < 256; ++b) { 69 | if (std::find(range.begin(), range.end(), b) == range.end()) { 70 | range.push_back(b); 71 | range2.push_back(256 + n); 72 | ++n; 73 | } 74 | } 75 | 76 | for (size_t i = 0; i < range.size(); ++i) { 77 | byteToUnicode[range[i]] = UChar32(range2[i]); 78 | } 79 | 80 | return byteToUnicode; 81 | } 82 | 83 | 84 | std::set> CLIPTokenizer::get_pairs(const std::vector& word) { 85 | std::set> pairs; 86 | if (word.size() <= 1) { 87 | return pairs; // No pairs if the word has one or zero characters 88 | } 89 | 90 | icu::UnicodeString prev_char = word[0]; 91 | for (size_t i = 1; i < word.size(); ++i) { 92 | pairs.insert(std::make_pair(prev_char, word[i])); 93 | prev_char = word[i]; 94 | } 95 | 96 | return pairs; 97 | } 98 | 99 | std::vector> CLIPTokenizer::get_merges(const std::string& file_path) { 100 | std::ifstream file(file_path); 101 | if (!file.is_open()) { 102 | std::cout << "Error: Could not open file " << file_path << std::endl; 103 | return {}; 104 | } 105 | std::string line; 106 | std::vector merges; 107 | 108 | while (std::getline(file, line)) { 109 | merges.push_back(line); 110 | } 111 | 112 | std::vector> result; 113 | for (size_t i = 1; i <= 49152 - 256 - 2; ++i) { 114 | std::istringstream merge_stream(merges[i]); 115 | std::string first, second; 116 | merge_stream >> first >> second; 117 | result.emplace_back(first, second); 118 | } 119 | 120 | return result; 121 | } 122 | 123 | icu::UnicodeString CLIPTokenizer::whitespace_clean(const icu::UnicodeString& text) { 124 | // Remove consecutive whitespace characters and replace with a single space 125 | icu::UnicodeString result; 126 | 127 | for (int32_t i = 0; i < text.length(); ++i) { 128 | UChar32 ch = text.char32At(i); 129 | if (u_isWhitespace(ch)) { 130 | if (result.length() == 0 || result.char32At(result.length() - 1) == ' ') { 131 | continue; 132 | } else { 133 | result += ' '; 134 | } 135 | } else { 136 | result += ch; 137 | } 138 | } 139 | 140 | return result; 141 | } 142 | 143 | 144 | CLIPTokenizer::CLIPTokenizer(const std::string& vocab_file) { 145 | matcher = std::make_unique("<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", 0, status); 146 | byte_encoder = bytes_to_unicode(); 147 | for (auto& it : byte_encoder) { 148 | byte_decoder[it.second] = it.first; 149 | } 150 | auto merges = get_merges(vocab_file); 151 | auto bytes_to_unicode_vec = get_bytes_to_unicode_vec(); 152 | vocab.reserve(bytes_to_unicode_vec.size() + merges.size() + 2); 153 | for(const auto& v : get_bytes_to_unicode_vec()) { 154 | vocab.push_back(std::move(icu::UnicodeString(v))); 155 | } 156 | 157 | for(const auto& val : vocab) { 158 | vocab.push_back(val + ""); 159 | } 160 | 161 | for(const auto& [k,v] : merges) { 162 | vocab.push_back(icu::UnicodeString::fromUTF8(k + v)); 163 | } 164 | 165 | 166 | vocab.push_back("<|startoftext|>"); 167 | vocab.push_back("<|endoftext|>"); 168 | for(size_t i = 0; i < vocab.size(); i++) { 169 | encoder[vocab[i]] = i; 170 | decoder[i] = vocab[i]; 171 | } 172 | 173 | for(size_t i = 0; i < merges.size(); ++i) { 174 | bpe_ranks[icu::UnicodeString::fromUTF8(std::get<0>(merges[i]) + std::get<1>(merges[i]))] = i; 175 | } 176 | 177 | cache["<|startoftext|>"] = {"<|startoftext|>"}; 178 | cache["<|endoftext|>"] = {"<|endoftext|>"}; 179 | } 180 | 181 | std::vector CLIPTokenizer::bpe(const icu::UnicodeString& text) { 182 | if (cache.find(text) != cache.end()) { 183 | return cache[text]; 184 | } 185 | 186 | std::vector word; 187 | for (int32_t i = 0; i < text.length() - 1; ++i) { 188 | word.push_back(text.tempSubString(i, 1)); 189 | } 190 | word.push_back(text.tempSubString(text.length() - 1) + ""); 191 | auto pairs = get_pairs(word); 192 | if (pairs.empty()) { 193 | return {text + ""}; 194 | } 195 | 196 | while(1) { 197 | auto bigram = std::min_element(pairs.begin(), pairs.end(), [&](const std::pair& l, const std::pair& r) { 198 | int rank1 = INT32_MAX; 199 | if (bpe_ranks.find(l.first + l.second) != bpe_ranks.end()) { 200 | rank1 = bpe_ranks[l.first + l.second]; 201 | } 202 | int rank2 = INT32_MAX; 203 | if (bpe_ranks.find(r.first + r.second) != bpe_ranks.end()) { 204 | rank2 = bpe_ranks[r.first + r.second]; 205 | } 206 | return rank1 < rank2; 207 | }); 208 | 209 | if (bpe_ranks.find(bigram->first + bigram->second) == bpe_ranks.end()) { 210 | break; 211 | } 212 | 213 | auto first = bigram->first; 214 | auto second = bigram->second; 215 | std::vector new_word; 216 | 217 | size_t i = 0; 218 | 219 | while (i < word.size()) { 220 | size_t j = i; 221 | while (j < word.size()) { 222 | if (word[j] == first) { 223 | break; 224 | } 225 | ++j; 226 | } 227 | for (size_t k = i; k < j; ++k) { 228 | new_word.push_back(word[k]); 229 | } 230 | 231 | if (j == word.size()) { 232 | break; 233 | } else { 234 | i = j; 235 | } 236 | 237 | if(word[i] == first && i < word.size() - 1 && word[i + 1] == second) { 238 | new_word.push_back(first + second); 239 | i += 2; 240 | } else { 241 | new_word.push_back(word[i]); 242 | ++i; 243 | } 244 | } 245 | 246 | word = new_word; 247 | if (word.size() == 1) { 248 | break; 249 | } else { 250 | pairs = get_pairs(word); 251 | } 252 | } 253 | 254 | cache[text] = word; 255 | return word; 256 | } 257 | 258 | std::vector CLIPTokenizer::encode(icu::UnicodeString unicode_text) { 259 | std::vector bpe_tokens; 260 | unicode_text = whitespace_clean(unicode_text); 261 | icu::Transliterator* strip_accents = icu::Transliterator::createInstance("NFD; [:Mn:] Remove", UTRANS_FORWARD, status); 262 | strip_accents->transliterate(unicode_text); 263 | if (status != U_ZERO_ERROR) { 264 | if(status == U_STRING_NOT_TERMINATED_WARNING) { 265 | status = U_ZERO_ERROR; 266 | } else { 267 | std::cout << "Error: " << u_errorName(status) << std::endl; 268 | bpe_tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]); 269 | return bpe_tokens; 270 | } 271 | } 272 | unicode_text = tokenize_chinese(unicode_text); 273 | unicode_text.toLower(); 274 | unicode_text.trim(); 275 | icu::UnicodeString word; 276 | size_t start = 0; 277 | matcher->reset(unicode_text); 278 | while (matcher->find()) { 279 | word = matcher->group(status); 280 | if (status != U_ZERO_ERROR) { 281 | if(status == U_STRING_NOT_TERMINATED_WARNING) { 282 | status = U_ZERO_ERROR; 283 | } else { 284 | std::cout << "Error: " << u_errorName(status) << std::endl; 285 | bpe_tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]); 286 | return bpe_tokens; 287 | } 288 | } 289 | std::string word_str; 290 | word.toUTF8String(word_str); 291 | if (word.length() == 0) { 292 | continue; 293 | } 294 | icu::UnicodeString encoder_result; 295 | 296 | for (int32_t j = 0; j < word_str.length(); ++j) { 297 | encoder_result += byte_encoder[(int)(word_str[j] & 0xff)]; 298 | } 299 | auto bpe_res = bpe(encoder_result); 300 | for (auto& token : bpe_res) {; 301 | if(bpe_tokens.size() >= MAX_LEN - 1) { 302 | break; 303 | } else if (encoder.find(token) == encoder.end()) { 304 | bpe_tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]); 305 | } else { 306 | bpe_tokens.push_back(encoder[token]); 307 | } 308 | } 309 | } 310 | 311 | if (bpe_tokens.back() != encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]) { 312 | bpe_tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]); 313 | } 314 | 315 | delete strip_accents; 316 | 317 | return bpe_tokens; 318 | } 319 | 320 | TokenizerResult CLIPTokenizer::tokenize(const std::vector& texts) { 321 | std::vector> result; 322 | for (const auto& text : texts) { 323 | icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8("<|startoftext|> "); 324 | unicode_text += icu::UnicodeString::fromUTF8(text); 325 | unicode_text += icu::UnicodeString::fromUTF8(" <|endoftext|>"); 326 | std::vector tokens = encode(unicode_text); 327 | result.push_back(tokens); 328 | } 329 | 330 | TokenizerResult tokenizer_result; 331 | 332 | size_t max_len = 0; 333 | for (const auto& tokens : result) { 334 | if (tokens.size() > max_len) { 335 | max_len = tokens.size(); 336 | } 337 | } 338 | 339 | std::vector> attention_mask; 340 | std::vector> input_ids; 341 | for (auto& tokens : result) { 342 | std::vector mask; 343 | for (size_t i = 0; i < tokens.size(); ++i) { 344 | mask.push_back(1); 345 | } 346 | while (mask.size() < max_len) { 347 | mask.push_back(0); 348 | tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]); 349 | } 350 | input_ids.push_back(tokens); 351 | attention_mask.push_back(mask); 352 | } 353 | 354 | tokenizer_result.attention_mask = attention_mask; 355 | tokenizer_result.tokens = input_ids; 356 | 357 | return tokenizer_result; 358 | } -------------------------------------------------------------------------------- /clip_tokenizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | namespace std 22 | { 23 | template<> 24 | class hash { 25 | public: 26 | size_t operator()(const icu::UnicodeString &s) const 27 | { 28 | return (size_t) s.hashCode(); 29 | } 30 | }; 31 | }; 32 | 33 | 34 | 35 | struct TokenizerResult { 36 | std::vector> tokens; 37 | std::vector> attention_mask; 38 | }; 39 | 40 | 41 | class CLIPTokenizer { 42 | private: 43 | std::unordered_map bytes_to_unicode(); 44 | std::set> get_pairs(const std::vector& word); 45 | std::vector> get_merges(const std::string& file_path); 46 | icu::UnicodeString whitespace_clean(const icu::UnicodeString& text); 47 | std::vector bpe(const icu::UnicodeString& text); 48 | std::unordered_map byte_encoder; 49 | std::unordered_map byte_decoder; 50 | std::vector vocab; 51 | std::unordered_map> cache; 52 | std::unique_ptr matcher; 53 | std::unordered_map> bpe_ranks; 54 | std::unordered_map> encoder; 55 | std::unordered_map> decoder; 56 | UErrorCode status = U_ZERO_ERROR; 57 | std::vector encode(icu::UnicodeString text); 58 | size_t MAX_LEN = 77; 59 | public: 60 | CLIPTokenizer(const std::string& vocab_file); 61 | TokenizerResult tokenize(const std::vector& texts); 62 | }; 63 | 64 | 65 | -------------------------------------------------------------------------------- /test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "clip_tokenizer.h" 3 | 4 | 5 | 6 | 7 | int main() { 8 | 9 | CLIPTokenizer tokenizer("vocab.txt"); 10 | TokenizerResult result = tokenizer.tokenize({"hello world"}); 11 | 12 | std::cout << "Tokens: " << std::endl; 13 | for (auto& token : result.tokens[0]) { 14 | std::cout << token << " "; 15 | } 16 | std::cout << std::endl; 17 | 18 | std::cout << "Attention mask: " << std::endl; 19 | for (auto& mask : result.attention_mask[0]) { 20 | std::cout << mask << " "; 21 | } 22 | return 0; 23 | } -------------------------------------------------------------------------------- /unit_test.py: -------------------------------------------------------------------------------- 1 | import clip_tokenizer_py 2 | from transformers import CLIPTokenizer 3 | 4 | tokenizer = clip_tokenizer_py.CLIPTokenizer("vocab.txt") 5 | hf_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") 6 | 7 | res = tokenizer.tokenize(["hello world"]) 8 | hf_res = hf_tokenizer("hello world", return_tensors="pt") 9 | 10 | assert res.tokens == hf_res.input_ids.tolist() 11 | assert res.attention_mask == hf_res.attention_mask.tolist() 12 | 13 | 14 | # test russian 15 | res = tokenizer.tokenize(["Привет мир"]) 16 | hf_res = hf_tokenizer("Привет мир", return_tensors="pt") 17 | print(res.tokens) 18 | print(hf_res.input_ids) 19 | 20 | assert res.tokens == hf_res.input_ids.tolist() 21 | assert res.attention_mask == hf_res.attention_mask.tolist() 22 | 23 | ## test chinese 24 | res = tokenizer.tokenize(["你好世界"]) 25 | hf_res = hf_tokenizer("你好世界", return_tensors="pt") 26 | 27 | assert res.tokens == hf_res.input_ids.tolist() 28 | assert res.attention_mask == hf_res.attention_mask.tolist() 29 | 30 | ## test japanese 31 | res = tokenizer.tokenize(["こんにちは世界"]) 32 | hf_res = hf_tokenizer("こんにちは世界", return_tensors="pt") 33 | 34 | assert res.tokens == hf_res.input_ids.tolist() 35 | assert res.attention_mask == hf_res.attention_mask.tolist() 36 | 37 | 38 | # test hindi 39 | res = tokenizer.tokenize(["नमस्ते दुनिया"]) 40 | hf_res = hf_tokenizer("नमस्ते दुनिया", return_tensors="pt") 41 | print(res.tokens) 42 | print(hf_res.input_ids) 43 | 44 | assert res.tokens == hf_res.input_ids.tolist() 45 | assert res.attention_mask == hf_res.attention_mask.tolist() 46 | 47 | ## test arabic 48 | res = tokenizer.tokenize(["مرحبا بالعالم"]) 49 | hf_res = hf_tokenizer("مرحبا بالعالم", return_tensors="pt") 50 | 51 | assert res.tokens == hf_res.input_ids.tolist() 52 | assert res.attention_mask == hf_res.attention_mask.tolist() 53 | 54 | ## test korean 55 | res = tokenizer.tokenize(["안녕하세요"]) 56 | hf_res = hf_tokenizer("안녕하세요", return_tensors="pt") 57 | 58 | assert res.tokens == hf_res.input_ids.tolist() 59 | assert res.attention_mask == hf_res.attention_mask.tolist() 60 | 61 | ## test spanish 62 | res = tokenizer.tokenize(["Hola Mundo"]) 63 | hf_res = hf_tokenizer("Hola Mundo", return_tensors="pt") 64 | print(res.tokens) 65 | print(hf_res.input_ids) 66 | 67 | assert res.tokens == hf_res.input_ids.tolist() 68 | assert res.attention_mask == hf_res.attention_mask.tolist() 69 | 70 | ## test turkish 71 | res = tokenizer.tokenize(["Merhaba Dünya"]) 72 | hf_res = hf_tokenizer("Merhaba Dünya", return_tensors="pt") 73 | 74 | assert res.tokens == hf_res.input_ids.tolist() 75 | assert res.attention_mask == hf_res.attention_mask.tolist() --------------------------------------------------------------------------------