├── .bazelrc
├── .gitignore
├── BUILD
├── WORKSPACE
├── bazel
│   ├── BUILD
│   ├── foreign_cc.patch
│   ├── icu
│   │   ├── BUILD
│   │   └── icu.patch
│   └── pybind11
│       └── BUILD
├── binding.cpp
├── clip_tokenizer.cpp
├── clip_tokenizer.h
├── test.cpp
├── unit_test.py
└── vocab.txt


/.bazelrc:
--------------------------------------------------------------------------------
1 | build --action_env=BAZEL_CXXOPTS="-std=c++17"
2 | build --cxxopt="-std=c++17"
3 | build --copt="-fPIC"


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /bazel-*/**
2 | /bazel-*
3 | /.vscode/**


--------------------------------------------------------------------------------
/BUILD:
--------------------------------------------------------------------------------
 1 | load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
 2 | 
 3 | # C++ tokenizer library; depends on the ICU build exposed as @icu.
 4 | cc_library(
 5 |     name = "clip",
 6 |     srcs = ["clip_tokenizer.cpp"],
 7 |     hdrs = ["clip_tokenizer.h"],
 8 |     deps = ["@icu"],
 9 | )
10 | 
11 | # Standalone executable built from test.cpp and linked against :clip.
12 | cc_binary(
13 |     name = "test",
14 |     srcs = ["test.cpp"],
15 |     deps = [":clip"],
16 | )
17 | 
18 | # pybind11 extension built from binding.cpp; its shared object is referenced
19 | # below as :clip_tokenizer_py.so.
20 | pybind_extension(
21 |     name = "clip_tokenizer_py",
22 |     srcs = ["binding.cpp"],
23 |     deps = [":clip"],
24 | )
25 | 
26 | # Python target that carries the compiled extension as runtime data.
27 | py_library(
28 |     name = "clip_tokenizer_py",
29 |     srcs = ["unit_test.py"],
30 |     data = [":clip_tokenizer_py.so"],
31 | )
32 | 
33 | # Runs unit_test.py with the extension and vocab.txt available at runtime.
34 | py_binary(
35 |     name = "unit_test",
36 |     srcs = ["unit_test.py"],
37 |     data = [":vocab.txt"],
38 |     deps = [":clip_tokenizer_py"],
39 | )


--------------------------------------------------------------------------------
/WORKSPACE:
--------------------------------------------------------------------------------
 1 | load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
 2 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 3 | 
 4 | # rules_foreign_cc provides configure_make, which //bazel/icu:BUILD uses to
 5 | # build ICU. //bazel:foreign_cc.patch rewrites the generated configure call so
 6 | # that the user-supplied options come before --prefix.
 7 | http_archive(
 8 |     name = "rules_foreign_cc",
 9 |     patches = ["//bazel:foreign_cc.patch"],
10 |     sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51",
11 |     strip_prefix = "rules_foreign_cc-0.9.0",
12 |     url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz",
13 | )
14 | 
15 | load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
16 | 
17 | rules_foreign_cc_dependencies()
18 | 
19 | # ICU sources at tag release-71-1, built with the overlay BUILD file kept in
20 | # //bazel/icu; icu.patch is applied to the checkout.
21 | new_git_repository(
22 |     name = "icu",
23 |     build_file = "//bazel/icu:BUILD",
24 |     patches = ["//bazel/icu:icu.patch"],
25 |     remote = "https://github.com/unicode-org/icu.git",
26 |     tag = "release-71-1",
27 | )
28 | 
29 | # pybind11_bazel's root package is loaded from directly (build_defs.bzl,
30 | # python_configure.bzl), so it already ships its own BUILD files and is
31 | # fetched with git_repository. new_git_repository is for repositories that
32 | # are given a build_file/build_file_content, and none is supplied here.
33 | git_repository(
34 |     name = "pybind11_bazel",
35 |     remote = "https://github.com/pybind/pybind11_bazel.git",
36 |     tag = "v2.11.1",
37 | )
38 | 
39 | # pybind11 sources, built with the BUILD file shipped in pybind11_bazel.
40 | new_git_repository(
41 |     name = "pybind11",
42 |     build_file = "@pybind11_bazel//:pybind11.BUILD",
43 |     remote = "https://github.com/pybind/pybind11.git",
44 |     tag = "v2.11.1",
45 | )
46 | 
47 | # Creates the @local_config_python repository via pybind11_bazel's
48 | # python_configure rule.
49 | load("@pybind11_bazel//:python_configure.bzl", "python_configure")
50 | 
51 | python_configure(name = "local_config_python")
52 | 


--------------------------------------------------------------------------------
/bazel/BUILD:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozanarmagan/clip_tokenizer_cpp/0ca1e2e2e7418108725eaa7fb93e029516ae63fa/bazel/BUILD


--------------------------------------------------------------------------------
/bazel/foreign_cc.patch:
--------------------------------------------------------------------------------
 1 | --- foreign_cc/private/configure_script.bzl
 2 | +++ foreign_cc/private/configure_script.bzl
 3 | @@ -70,7 +70,7 @@
 4 |          ).lstrip())
 5 |  
 6 |      script.append("##mkdirs## $$BUILD_TMPDIR$$/$$INSTALL_PREFIX$$")
 7 | -    script.append("{env_vars} {prefix}\"{configure}\" --prefix=$$BUILD_TMPDIR$$/$$INSTALL_PREFIX$$ {user_options}".format(
 8 | +    script.append("{env_vars} {prefix}\"{configure}\" {user_options} --prefix=$$BUILD_TMPDIR$$/$$INSTALL_PREFIX$$".format(
 9 |          env_vars = get_make_env_vars(workspace_name, tools, flags, env_vars, deps, inputs),
10 |          prefix = configure_prefix,
11 |          configure = configure_path,


--------------------------------------------------------------------------------
/bazel/icu/BUILD:
--------------------------------------------------------------------------------
 1 | load("@rules_foreign_cc//foreign_cc:defs.bzl", "configure_make")
 2 | 
 3 | package(default_visibility = ["//visibility:public"])
 4 | 
 5 | # Every file in the external ICU repository; used as the source tree for the
 6 | # configure_make target below.
 7 | filegroup(
 8 |     name = "all_srcs",
 9 |     srcs = glob(["**"]),
10 |     visibility = ["//visibility:public"],
11 | )
12 | 
13 | # Builds ICU4C as static libraries by running ICU's own runConfigureICU script
14 | # followed by make.
15 | configure_make(
16 |     name = "icu",
17 |     # NOTE(review): per the rules_foreign_cc docs these are passed to make;
18 |     # the job count is fixed at 8 regardless of the host.
19 |     args = ["-j8"],
20 |     configure_command = "icu4c/source/runConfigureICU",
21 |     # The platform name comes first, followed by the static-only switches.
22 |     # Only Linux and macOS are mapped; the select() has no default branch.
23 |     configure_options = select({
24 |         "@platforms//os:linux": ["Linux"],
25 |         "@platforms//os:macos": ["MacOSX"],
26 |     }) + [
27 |         "--enable-static",
28 |         "--disable-shared",
29 |     ],
30 |     lib_source = ":all_srcs",
31 |     # Listed in dependency order (dependents before their dependencies) so
32 |     # that order-sensitive static linkers can resolve every symbol:
33 |     # icuio and icutu use icui18n, which uses icuuc, which uses icudata.
34 |     out_static_libs = [
35 |         "libicuio.a",
36 |         "libicutu.a",
37 |         "libicui18n.a",
38 |         "libicuuc.a",
39 |         "libicudata.a",
40 |     ],
41 |     tags = ["no-sandbox"],
42 | )
43 | 


--------------------------------------------------------------------------------
/bazel/icu/icu.patch:
--------------------------------------------------------------------------------
   1 | 
   2 | 
   3 | 
   4 | --- icu4c/source/common/BUILD.bazel
   5 | +++ /dev/null
   6 | @@ -1,1213 +0,0 @@
   7 | -# © 2021 and later: Unicode, Inc. and others.
   8 | -# License & terms of use: http://www.unicode.org/copyright.html
   9 | -
  10 | -# This file defines Bazel targets for a subset of ICU4C "common" library header and source files.
  11 | -# The configuration of dependencies among targets is strongly assisted by the
  12 | -# file in depstest that maintains such information, at
  13 | -# icu4c/source/test/depstest/dependencies.txt .
  14 | -
  15 | -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
  16 | -
  17 | -package(
  18 | -    default_visibility = ["//visibility:public"],
  19 | -)
  20 | -
  21 | -# When compiling code in the `common` dir, the constant
  22 | -# `U_COMMON_IMPLEMENTATION` needs to be defined. See 
  23 | -# https://unicode-org.github.io/icu/userguide/howtouseicu#c-with-your-own-build-system .
  24 | -
  25 | -# If linker errors occur, then this may be a sign that the dependencies were 
  26 | -# not specified correctly. Use dependencies.txt in depstest for assistance. See
  27 | -# https://stackoverflow.com/q/66111709/2077918 .
  28 | -
  29 | -cc_library(
  30 | -    name = "headers",
  31 | -    hdrs = glob([
  32 | -        "unicode/*.h", # public
  33 | -        "*.h",         # internal
  34 | -        ],
  35 | -        # Instead of using these checked-in files, our Bazel build process
  36 | -        # regenerates them and then uses the new versions.
  37 | -        # Same list of .h files as in icu4c/source/data/unidata/clean.sh.
  38 | -        exclude = ["norm2_nfc_data.h", "propname_data.h", "*_props_data.h"],
  39 | -    ),
  40 | -    # We need to add includes in order to preserve existing source files'
  41 | -    # include directives that use traditional paths, not paths relative to
  42 | -    # Bazel workspace:
  43 | -    # https://stackoverflow.com/a/65635893/2077918
  44 | -    includes = ["."],
  45 | -    local_defines = [
  46 | -        "U_COMMON_IMPLEMENTATION",
  47 | -    ],
  48 | -)
  49 | -
  50 | -cc_library(
  51 | -    name = "platform",
  52 | -    srcs = [
  53 | -        "cmemory.cpp", 
  54 | -        "uobject.cpp",
  55 | -        "cstring.cpp",
  56 | -        "cwchar.cpp",
  57 | -        "uinvchar.cpp",
  58 | -        "charstr.cpp",
  59 | -        "unistr.cpp",
  60 | -        "appendable.cpp",
  61 | -        "stringpiece.cpp",
  62 | -        "ustrtrns.cpp",
  63 | -        "ustring.cpp",  
  64 | -        "ustrfmt.cpp",  
  65 | -        "utf_impl.cpp",
  66 | -        "putil.cpp",
  67 | -        "ucln_cmn.cpp",  
  68 | -        "udataswp.cpp",  
  69 | -        "umath.cpp",
  70 | -        "umutex.cpp",
  71 | -        "sharedobject.cpp",
  72 | -        "utrace.cpp",
  73 | -    ],
  74 | -    deps = [
  75 | -        ":headers",
  76 | -        # omit other deps b/c they are sys symbols
  77 | -    ],
  78 | -    local_defines = [
  79 | -        "U_COMMON_IMPLEMENTATION",
  80 | -    ],
  81 | -    linkopts = ["-ldl"],
  82 | -)
  83 | -
  84 | -cc_library(
  85 | -    name = "utrie",
  86 | -    srcs = ["utrie.cpp"],
  87 | -    deps = [":platform"],
  88 | -    local_defines = [
  89 | -        "U_COMMON_IMPLEMENTATION",
  90 | -    ],
  91 | -)
  92 | -
  93 | -cc_library(
  94 | -    name = "utrie2",
  95 | -    srcs = ["utrie2.cpp"],
  96 | -    deps = [":platform"],
  97 | -    local_defines = [
  98 | -        "U_COMMON_IMPLEMENTATION",
  99 | -    ],
 100 | -)
 101 | -
 102 | -cc_library(
 103 | -    name = "utrie2_builder",
 104 | -    srcs = ["utrie2_builder.cpp"],
 105 | -    deps = [
 106 | -        ":utrie",
 107 | -        ":utrie2",
 108 | -        ":platform",
 109 | -    ],
 110 | -    local_defines = [
 111 | -        "U_COMMON_IMPLEMENTATION",
 112 | -    ],
 113 | -)
 114 | -
 115 | -cc_library(
 116 | -    name = "ucptrie",
 117 | -    srcs = ["ucptrie.cpp"],
 118 | -    deps = [":platform"],
 119 | -    local_defines = [
 120 | -        "U_COMMON_IMPLEMENTATION",
 121 | -    ],
 122 | -)
 123 | -
 124 | -cc_library(
 125 | -    name = "umutablecptrie",
 126 | -    srcs = ["umutablecptrie.cpp"],
 127 | -    deps = [":ucptrie"],
 128 | -    local_defines = [
 129 | -        "U_COMMON_IMPLEMENTATION",
 130 | -    ],
 131 | -)
 132 | -
 133 | -cc_library(
 134 | -    name = "bytestrie",
 135 | -    srcs = ["bytestrie.cpp"],
 136 | -    deps = [":platform"],
 137 | -    local_defines = [
 138 | -        "U_COMMON_IMPLEMENTATION",
 139 | -    ],
 140 | -)
 141 | -
 142 | -cc_library(
 143 | -    name = "bytestriebuilder",
 144 | -    srcs = ["bytestriebuilder.cpp"],
 145 | -    deps = [
 146 | -        ":bytestrie",
 147 | -        ":stringtriebuilder",
 148 | -        ":sort",
 149 | -    ],
 150 | -    local_defines = [
 151 | -        "U_COMMON_IMPLEMENTATION",
 152 | -    ],
 153 | -)
 154 | -
 155 | -cc_library(
 156 | -    name = "stringtriebuilder",
 157 | -    srcs = ["stringtriebuilder.cpp"],
 158 | -    deps = [
 159 | -        ":uhash",
 160 | -    ],
 161 | -    local_defines = [
 162 | -        "U_COMMON_IMPLEMENTATION",
 163 | -    ],
 164 | -)
 165 | -
 166 | -cc_library(
 167 | -    name = "uhash",
 168 | -    hdrs = [
 169 | -        "uhash.h",
 170 | -    ],
 171 | -    srcs = [
 172 | -        "uhash.cpp",
 173 | -    ],
 174 | -    deps = [
 175 | -        ":headers",
 176 | -    ],
 177 | -    local_defines = [
 178 | -        "U_COMMON_IMPLEMENTATION",
 179 | -    ],
 180 | -)
 181 | -
 182 | -cc_library(
 183 | -    name = "errorcode",
 184 | -    hdrs = [
 185 | -    ],
 186 | -    srcs = [
 187 | -        "errorcode.cpp",
 188 | -    ],
 189 | -    includes = ["."],
 190 | -    deps = [
 191 | -        ":platform",
 192 | -        ":utypes",
 193 | -    ],
 194 | -    local_defines = [
 195 | -        "U_COMMON_IMPLEMENTATION",
 196 | -    ],
 197 | -)
 198 | -
 199 | -cc_library(
 200 | -    name = "utypes",
 201 | -    srcs = [
 202 | -        "utypes.cpp",
 203 | -    ],
 204 | -    includes = ["."],
 205 | -    deps = [
 206 | -        ":headers",
 207 | -    ],
 208 | -    local_defines = [
 209 | -        "U_COMMON_IMPLEMENTATION",
 210 | -    ],
 211 | -)
 212 | -
 213 | -cc_library(
 214 | -    name = "uniset",
 215 | -    srcs = [
 216 | -        "uniset.cpp",
 217 | -        "unifilt.cpp",
 218 | -        "unisetspan.cpp",
 219 | -        "bmpset.cpp",
 220 | -        "util.cpp",
 221 | -        "unifunct.cpp",
 222 | -        "usetiter.cpp",
 223 | -    ],
 224 | -    includes = ["."],
 225 | -    deps = [
 226 | -        ":patternprops",
 227 | -        ":uvector",
 228 | -        ":headers",
 229 | -    ],
 230 | -    local_defines = [
 231 | -        "U_COMMON_IMPLEMENTATION",
 232 | -    ],
 233 | -)
 234 | -
 235 | -cc_library(
 236 | -    name = "patternprops",
 237 | -    srcs = [
 238 | -        "patternprops.cpp",
 239 | -    ],
 240 | -    includes = ["."],
 241 | -    deps = [
 242 | -        ":headers",
 243 | -    ],
 244 | -    local_defines = [
 245 | -        "U_COMMON_IMPLEMENTATION",
 246 | -    ],
 247 | -)
 248 | -
 249 | -cc_library(
 250 | -    name = "propsvec",
 251 | -    srcs = [
 252 | -        "propsvec.cpp",
 253 | -    ],
 254 | -    includes = ["."],
 255 | -    deps = [
 256 | -        ":sort",
 257 | -        ":utrie2_builder",
 258 | -        ":headers",
 259 | -    ],
 260 | -    local_defines = [
 261 | -        "U_COMMON_IMPLEMENTATION",
 262 | -    ],
 263 | -)
 264 | -
 265 | -cc_library(
 266 | -    name = "propname",
 267 | -    srcs = [
 268 | -        "propname.cpp",
 269 | -        "propname_data.h",
 270 | -    ],
 271 | -    includes = ["."],
 272 | -    deps = [
 273 | -        ":bytestrie",
 274 | -        ":headers",
 275 | -    ],
 276 | -    local_defines = [
 277 | -        "U_COMMON_IMPLEMENTATION",
 278 | -    ],
 279 | -)
 280 | -
 281 | -# Note: The cc_library target names "uvector32" and "uvector64" match the
 282 | -# dependencies.txt group names, but the filenames are "uvectr32.*"/"uvectr64.*".
 283 | -cc_library(
 284 | -    name = "uvector32",
 285 | -    srcs = [
 286 | -        "uvectr32.cpp",
 287 | -    ],
 288 | -    includes = ["."],
 289 | -    deps = [
 290 | -        ":headers",
 291 | -        ":platform",
 292 | -    ],
 293 | -    local_defines = [
 294 | -        "U_COMMON_IMPLEMENTATION",
 295 | -    ],
 296 | -)
 297 | -
 298 | -cc_library(
 299 | -    name = "uvector64",
 300 | -    srcs = [
 301 | -        "uvectr64.cpp",
 302 | -    ],
 303 | -    includes = ["."],
 304 | -    deps = [
 305 | -        ":headers",
 306 | -        ":platform",
 307 | -    ],
 308 | -    local_defines = [
 309 | -        "U_COMMON_IMPLEMENTATION",
 310 | -    ],
 311 | -)
 312 | -
 313 | -cc_library(
 314 | -    name = "sort",
 315 | -    srcs = [
 316 | -        "uarrsort.cpp",
 317 | -    ],
 318 | -    includes = ["."],
 319 | -    deps = [
 320 | -        ":headers",
 321 | -    ],
 322 | -    local_defines = [
 323 | -        "U_COMMON_IMPLEMENTATION",
 324 | -    ],
 325 | -)
 326 | -
 327 | -cc_library(
 328 | -    name = "uvector",
 329 | -    srcs = [
 330 | -        "uvector.cpp",
 331 | -    ],
 332 | -    includes = ["."],
 333 | -    deps = [
 334 | -        ":platform",
 335 | -        ":sort",
 336 | -    ],
 337 | -    local_defines = [
 338 | -        "U_COMMON_IMPLEMENTATION",
 339 | -    ],
 340 | -)
 341 | -
 342 | -cc_library(
 343 | -    name = "breakiterator",
 344 | -    srcs = [
 345 | -        "brkiter.cpp",
 346 | -        "brkeng.cpp",
 347 | -        "dictbe.cpp",
 348 | -        "dictionarydata.cpp",
 349 | -        "filteredbrk.cpp",
 350 | -        "lstmbe.cpp",
 351 | -        "rbbi.cpp",
 352 | -        "rbbi_cache.cpp",
 353 | -        "rbbidata.cpp",
 354 | -        "rbbinode.cpp",
 355 | -        "rbbirb.cpp",
 356 | -        "rbbiscan.cpp",
 357 | -        "rbbisetb.cpp",
 358 | -        "rbbistbl.cpp",
 359 | -        "rbbitblb.cpp",
 360 | -        "ubrk.cpp",
 361 | -    ],
 362 | -    includes = ["."],
 363 | -    deps = [
 364 | -        ":bytestrie",
 365 | -        ":headers",
 366 | -        ":normlzr",
 367 | -        ":resourcebundle",
 368 | -        ":schriter",
 369 | -        ":service_registration",
 370 | -        ":ucharstrie",
 371 | -        ":ucharstriebuilder",
 372 | -        ":uhash",
 373 | -        ":uniset_core",
 374 | -        ":uniset_props",
 375 | -        ":ustack",
 376 | -        ":utext",
 377 | -        ":utrie2_builder",
 378 | -        ":uvector32",
 379 | -    ],
 380 | -    local_defines = [
 381 | -        "U_COMMON_IMPLEMENTATION",
 382 | -    ],
 383 | -)
 384 | -
 385 | -cc_library(
 386 | -    name = "bytesinkutil",
 387 | -    srcs = [
 388 | -        "bytesinkutil.cpp",
 389 | -    ],
 390 | -    includes = ["."],
 391 | -    deps = [
 392 | -        ":headers",
 393 | -        ":bytestream",
 394 | -        ":edits",
 395 | -    ],
 396 | -    local_defines = [
 397 | -        "U_COMMON_IMPLEMENTATION",
 398 | -    ],
 399 | -)
 400 | -
 401 | -cc_library(
 402 | -    name = "bytestream",
 403 | -    srcs = [
 404 | -        "bytestream.cpp",
 405 | -    ],
 406 | -    includes = ["."],
 407 | -    deps = [
 408 | -        ":headers",
 409 | -        ":platform",
 410 | -    ],
 411 | -    local_defines = [
 412 | -        "U_COMMON_IMPLEMENTATION",
 413 | -    ],
 414 | -)
 415 | -
 416 | -cc_library(
 417 | -    name = "canonical_iterator",
 418 | -    srcs = [
 419 | -        "caniter.cpp",
 420 | -    ],
 421 | -    deps = [
 422 | -        ":normalizer2",
 423 | -        ":usetiter",
 424 | -    ],
 425 | -    local_defines = [
 426 | -        "U_COMMON_IMPLEMENTATION",
 427 | -    ],
 428 | -)
 429 | -
 430 | -cc_library(
 431 | -    name = "characterproperties",
 432 | -    srcs = [
 433 | -        "characterproperties.cpp",
 434 | -    ],
 435 | -    includes = ["."],
 436 | -    deps = [
 437 | -        ":headers",
 438 | -        ":emojiprops",
 439 | -        ":ucptrie",
 440 | -        ":umutablecptrie",
 441 | -        ":uniset_core",
 442 | -        ":uprops",
 443 | -    ],
 444 | -    local_defines = [
 445 | -        "U_COMMON_IMPLEMENTATION",
 446 | -    ],
 447 | -)
 448 | -
 449 | -cc_library(
 450 | -    name = "chariter",
 451 | -    srcs = [
 452 | -        "chariter.cpp",
 453 | -    ],
 454 | -    includes = ["."],
 455 | -    deps = [
 456 | -        ":headers",
 457 | -        ":platform",
 458 | -    ],
 459 | -    local_defines = [
 460 | -        "U_COMMON_IMPLEMENTATION",
 461 | -    ],
 462 | -)
 463 | -
 464 | -cc_library(
 465 | -    name = "edits",
 466 | -    srcs = [
 467 | -        "edits.cpp",
 468 | -    ],
 469 | -    includes = ["."],
 470 | -    deps = [
 471 | -        ":headers",
 472 | -        ":icu_utility",
 473 | -        ":platform",
 474 | -    ],
 475 | -    local_defines = [
 476 | -        "U_COMMON_IMPLEMENTATION",
 477 | -    ],
 478 | -)
 479 | -
 480 | -cc_library(
 481 | -    name = "filterednormalizer2",
 482 | -    srcs = [
 483 | -        "filterednormalizer2.cpp",
 484 | -    ],
 485 | -    includes = ["."],
 486 | -    deps = [
 487 | -        ":headers",
 488 | -        ":normalizer2",
 489 | -    ],
 490 | -    local_defines = [
 491 | -        "U_COMMON_IMPLEMENTATION",
 492 | -    ],
 493 | -)
 494 | -
 495 | -cc_library(
 496 | -    name = "hashtable",
 497 | -    srcs = [
 498 | -        "uhash_us.cpp",
 499 | -    ],
 500 | -    includes = ["."],
 501 | -    deps = [
 502 | -        ":headers",
 503 | -        ":uhash",
 504 | -    ],
 505 | -    local_defines = [
 506 | -        "U_COMMON_IMPLEMENTATION",
 507 | -    ],
 508 | -)
 509 | -
 510 | -cc_library(
 511 | -    name = "icu_utility",
 512 | -    srcs = [
 513 | -        "util.cpp",
 514 | -    ],
 515 | -    includes = ["."],
 516 | -    deps = [
 517 | -        ":headers",
 518 | -        ":patternprops",
 519 | -        ":platform",
 520 | -    ],
 521 | -    local_defines = [
 522 | -        "U_COMMON_IMPLEMENTATION",
 523 | -    ],
 524 | -)
 525 | -
 526 | -cc_library(
 527 | -    name = "loadednormalizer2",
 528 | -    srcs = [
 529 | -        "loadednormalizer2impl.cpp",
 530 | -    ],
 531 | -    includes = ["."],
 532 | -    deps = [
 533 | -        ":headers",
 534 | -        ":normalizer2",
 535 | -    ],
 536 | -    local_defines = [
 537 | -        "U_COMMON_IMPLEMENTATION",
 538 | -    ],
 539 | -)
 540 | -
 541 | -cc_library(
 542 | -    name = "locale_display_names",
 543 | -    srcs = [
 544 | -        "locdispnames.cpp",
 545 | -    ],
 546 | -    includes = ["."],
 547 | -    deps = [
 548 | -        ":headers",
 549 | -        ":locresdata",
 550 | -    ],
 551 | -    local_defines = [
 552 | -        "U_COMMON_IMPLEMENTATION",
 553 | -    ],
 554 | -)
 555 | -
 556 | -cc_library(
 557 | -    name = "locresdata",
 558 | -    srcs = [
 559 | -        "locresdata.cpp",
 560 | -    ],
 561 | -    includes = ["."],
 562 | -    deps = [
 563 | -        ":headers",
 564 | -        ":resourcebundle",
 565 | -    ],
 566 | -    local_defines = [
 567 | -        "U_COMMON_IMPLEMENTATION",
 568 | -    ],
 569 | -)
 570 | -
 571 | -cc_library(
 572 | -    name = "normlzr",
 573 | -    srcs = [
 574 | -        "normlzr.cpp",
 575 | -    ],
 576 | -    includes = ["."],
 577 | -    deps = [
 578 | -        ":filterednormalizer2",
 579 | -        ":headers",
 580 | -        ":schriter",
 581 | -        ":uniset_props",
 582 | -    ],
 583 | -    local_defines = [
 584 | -        "U_COMMON_IMPLEMENTATION",
 585 | -    ],
 586 | -)
 587 | -
 588 | -cc_library(
 589 | -    name = "parsepos",
 590 | -    srcs = [
 591 | -        "parsepos.cpp",
 592 | -    ],
 593 | -    includes = ["."],
 594 | -    deps = [
 595 | -        ":headers",
 596 | -        ":platform",
 597 | -    ],
 598 | -    local_defines = [
 599 | -        "U_COMMON_IMPLEMENTATION",
 600 | -    ],
 601 | -)
 602 | -
 603 | -cc_library(
 604 | -    name = "resourcebundle",
 605 | -    srcs = [
 606 | -        "localebuilder.cpp",
 607 | -        "locavailable.cpp",
 608 | -        "locbased.cpp",
 609 | -        "locid.cpp",
 610 | -        "loclikely.cpp",
 611 | -        "locmap.cpp",
 612 | -        "resbund.cpp",
 613 | -        "resource.cpp",
 614 | -        "uloc.cpp",
 615 | -        "uloc_tag.cpp",
 616 | -        "uloc_keytype.cpp",
 617 | -        "uresbund.cpp",
 618 | -        "uresdata.cpp",
 619 | -        "wintz.cpp",
 620 | -    ],
 621 | -    includes = ["."],
 622 | -    deps = [
 623 | -        ":bytesinkutil",
 624 | -        ":errorcode",
 625 | -        ":headers",
 626 | -        ":propname",
 627 | -        ":sort",
 628 | -        ":stringenumeration",
 629 | -        ":ucol_swp",
 630 | -        ":udata",
 631 | -        ":uhash",
 632 | -        ":uscript_props",
 633 | -        ":uvector",
 634 | -    ],
 635 | -    local_defines = [
 636 | -        "U_COMMON_IMPLEMENTATION",
 637 | -    ],
 638 | -)
 639 | -
 640 | -cc_library(
 641 | -    name = "schriter",
 642 | -    srcs = [
 643 | -        "schriter.cpp",
 644 | -        "uchriter.cpp",
 645 | -    ],
 646 | -    includes = ["."],
 647 | -    deps = [
 648 | -        ":chariter",
 649 | -        ":headers",
 650 | -    ],
 651 | -    local_defines = [
 652 | -        "U_COMMON_IMPLEMENTATION",
 653 | -    ],
 654 | -)
 655 | -
 656 | -cc_library(
 657 | -    name = "service_registration",
 658 | -    srcs = [
 659 | -        "locutil.cpp",
 660 | -        "serv.cpp",
 661 | -        "servlk.cpp",
 662 | -        "servlkf.cpp",
 663 | -        "servls.cpp",
 664 | -        "servnotf.cpp",
 665 | -        "servrbf.cpp",
 666 | -        "servslkf.cpp",
 667 | -    ],
 668 | -    includes = ["."],
 669 | -    deps = [
 670 | -        ":hashtable",
 671 | -        ":headers",
 672 | -        ":locale_display_names",
 673 | -        ":resourcebundle",
 674 | -        ":uvector",
 675 | -    ],
 676 | -    local_defines = [
 677 | -        "U_COMMON_IMPLEMENTATION",
 678 | -    ],
 679 | -)
 680 | -
 681 | -cc_library(
 682 | -    name = "stringenumeration",
 683 | -    srcs = [
 684 | -        "uenum.cpp",
 685 | -        "ustrenum.cpp",
 686 | -    ],
 687 | -    includes = ["."],
 688 | -    deps = [
 689 | -        ":headers",
 690 | -        ":platform",
 691 | -    ],
 692 | -    local_defines = [
 693 | -        "U_COMMON_IMPLEMENTATION",
 694 | -    ],
 695 | -)
 696 | -
 697 | -cc_library(
 698 | -    name = "ubidi_props",
 699 | -    srcs = [
 700 | -        "ubidi_props.cpp",
 701 | -        "ubidi_props_data.h",
 702 | -    ],
 703 | -    includes = ["."],
 704 | -    deps = [
 705 | -        ":headers",
 706 | -        ":utrie2",
 707 | -    ],
 708 | -    local_defines = [
 709 | -        "U_COMMON_IMPLEMENTATION",
 710 | -    ],
 711 | -)
 712 | -
 713 | -cc_library(
 714 | -    name = "ucase",
 715 | -    srcs = [
 716 | -        "ucase.cpp",
 717 | -        "ucase_props_data.h",
 718 | -    ],
 719 | -    includes = ["."],
 720 | -    deps = [
 721 | -        ":headers",
 722 | -        ":utrie2",
 723 | -    ],
 724 | -    local_defines = [
 725 | -        "U_COMMON_IMPLEMENTATION",
 726 | -    ],
 727 | -)
 728 | -
 729 | -cc_library(
 730 | -    name = "uchar",
 731 | -    srcs = [
 732 | -        "uchar.cpp",
 733 | -        "uchar_props_data.h",
 734 | -    ],
 735 | -    includes = ["."],
 736 | -    deps = [
 737 | -        ":headers",
 738 | -        ":utrie2",
 739 | -    ],
 740 | -    local_defines = [
 741 | -        "U_COMMON_IMPLEMENTATION",
 742 | -    ],
 743 | -)
 744 | -
 745 | -cc_library(
 746 | -    name = "emojiprops",
 747 | -    srcs = [
 748 | -        "emojiprops.cpp",
 749 | -        "emojiprops.h",
 750 | -    ],
 751 | -    includes = ["."],
 752 | -    deps = [
 753 | -        ":headers",
 754 | -        ":ucharstrie",
 755 | -        ":ucharstrieiterator",
 756 | -        ":ucptrie",
 757 | -        ":udata",
 758 | -    ],
 759 | -    local_defines = [
 760 | -        "U_COMMON_IMPLEMENTATION",
 761 | -    ],
 762 | -)
 763 | -
 764 | -cc_library(
 765 | -    name = "ucharstrie",
 766 | -    srcs = [
 767 | -        "ucharstrie.cpp",
 768 | -    ],
 769 | -    includes = ["."],
 770 | -    deps = [
 771 | -        ":headers",
 772 | -        ":platform",
 773 | -    ],
 774 | -    local_defines = [
 775 | -        "U_COMMON_IMPLEMENTATION",
 776 | -    ],
 777 | -)
 778 | -
 779 | -cc_library(
 780 | -    name = "ucharstriebuilder",
 781 | -    srcs = [
 782 | -        "ucharstriebuilder.cpp",
 783 | -    ],
 784 | -    includes = ["."],
 785 | -    deps = [
 786 | -        ":headers",
 787 | -        ":sort",
 788 | -        ":stringtriebuilder",
 789 | -        ":ucharstrie",
 790 | -    ],
 791 | -    local_defines = [
 792 | -        "U_COMMON_IMPLEMENTATION",
 793 | -    ],
 794 | -)
 795 | -
 796 | -cc_library(
 797 | -    name = "ucharstrieiterator",
 798 | -    srcs = [
 799 | -        "ucharstrieiterator.cpp",
 800 | -    ],
 801 | -    includes = ["."],
 802 | -    deps = [
 803 | -        ":headers",
 804 | -        ":ucharstrie",
 805 | -        ":uvector32",
 806 | -    ],
 807 | -    local_defines = [
 808 | -        "U_COMMON_IMPLEMENTATION",
 809 | -    ],
 810 | -)
 811 | -
 812 | -cc_library(
 813 | -    name = "ucol_swp",
 814 | -    srcs = [
 815 | -        "ucol_swp.cpp",
 816 | -    ],
 817 | -    includes = ["."],
 818 | -    deps = [
 819 | -        ":headers",
 820 | -        ":utrie_swap",
 821 | -    ],
 822 | -    local_defines = [
 823 | -        "U_COMMON_IMPLEMENTATION",
 824 | -    ],
 825 | -)
 826 | -
 827 | -cc_library(
 828 | -    name = "udata",
 829 | -    srcs = [
 830 | -        "restrace.cpp",
 831 | -        "ucmndata.cpp",
 832 | -        "udata.cpp",
 833 | -        "udatamem.cpp",
 834 | -        "umapfile.cpp",
 835 | -    ],
 836 | -    includes = ["."],
 837 | -    deps = [
 838 | -        ":headers",
 839 | -        ":icu_utility",
 840 | -        ":platform",
 841 | -        ":uhash",
 842 | -        "//icu4c/source/stubdata",
 843 | -    ],
 844 | -    local_defines = [
 845 | -        "U_COMMON_IMPLEMENTATION",
 846 | -    ],
 847 | -)
 848 | -
 849 | -cc_library(
 850 | -    name = "uiter",
 851 | -    srcs = [
 852 | -        "uiter.cpp",
 853 | -    ],
 854 | -    includes = ["."],
 855 | -    deps = [
 856 | -        ":headers",
 857 | -        ":platform",
 858 | -    ],
 859 | -    local_defines = [
 860 | -        "U_COMMON_IMPLEMENTATION",
 861 | -    ],
 862 | -)
 863 | -
 864 | -cc_library(
 865 | -    name = "ulist",
 866 | -    srcs = [
 867 | -        "ulist.cpp",
 868 | -    ],
 869 | -    includes = ["."],
 870 | -    deps = [
 871 | -        ":headers",
 872 | -        ":platform",
 873 | -    ],
 874 | -    local_defines = [
 875 | -        "U_COMMON_IMPLEMENTATION",
 876 | -    ],
 877 | -)
 878 | -
 879 | -cc_library(
 880 | -    name = "unames",
 881 | -    srcs = [
 882 | -        "unames.cpp",
 883 | -    ],
 884 | -    includes = ["."],
 885 | -    deps = [
 886 | -        ":headers",
 887 | -        ":uchar",
 888 | -        ":udata",
 889 | -    ],
 890 | -    local_defines = [
 891 | -        "U_COMMON_IMPLEMENTATION",
 892 | -    ],
 893 | -)
 894 | -
 895 | -cc_library(
 896 | -    name = "unifiedcache",
 897 | -    srcs = [
 898 | -        "unifiedcache.cpp",
 899 | -    ],
 900 | -    includes = ["."],
 901 | -    deps = [
 902 | -        ":headers",
 903 | -        ":platform",
 904 | -        ":uhash",
 905 | -    ],
 906 | -    local_defines = [
 907 | -        "U_COMMON_IMPLEMENTATION",
 908 | -    ],
 909 | -)
 910 | -
 911 | -cc_library(
 912 | -    name = "uniset_core",
 913 | -    srcs = [
 914 | -        "bmpset.cpp",
 915 | -        "unifilt.cpp",
 916 | -        "unifunct.cpp",
 917 | -        "uniset.cpp",
 918 | -        "unisetspan.cpp",
 919 | -    ],
 920 | -    includes = ["."],
 921 | -    deps = [
 922 | -        ":headers",
 923 | -        ":icu_utility",
 924 | -        ":patternprops",
 925 | -        ":uvector",
 926 | -    ],
 927 | -    local_defines = [
 928 | -        "U_COMMON_IMPLEMENTATION",
 929 | -    ],
 930 | -)
 931 | -
 932 | -cc_library(
 933 | -    name = "uniset_closure",
 934 | -    srcs = [
 935 | -        "uniset_closure.cpp",
 936 | -    ],
 937 | -    includes = ["."],
 938 | -    deps = [
 939 | -        ":headers",
 940 | -        ":uniset_core",
 941 | -        ":unistr_case_locale",
 942 | -        ":unistr_titlecase_brkiter",
 943 | -    ],
 944 | -    local_defines = [
 945 | -        "U_COMMON_IMPLEMENTATION",
 946 | -    ],
 947 | -)
 948 | -
 949 | -cc_library(
 950 | -    name = "uniset_props",
 951 | -    srcs = [
 952 | -        "uniset_props.cpp",
 953 | -        "ruleiter.cpp",
 954 | -    ],
 955 | -    includes = ["."],
 956 | -    deps = [
 957 | -        ":characterproperties",
 958 | -        ":headers",
 959 | -        ":parsepos",
 960 | -        ":propname",
 961 | -        ":resourcebundle",
 962 | -        ":unames",
 963 | -        ":uniset_core",
 964 | -        ":unistr_case",
 965 | -        ":uprops",
 966 | -    ],
 967 | -    local_defines = [
 968 | -        "U_COMMON_IMPLEMENTATION",
 969 | -    ],
 970 | -)
 971 | -
 972 | -cc_library(
 973 | -    name = "unistr_case",
 974 | -    srcs = [
 975 | -        "unistr_case.cpp",
 976 | -    ],
 977 | -    includes = ["."],
 978 | -    deps = [
 979 | -        ":headers",
 980 | -        ":ustring_case",
 981 | -    ],
 982 | -    local_defines = [
 983 | -        "U_COMMON_IMPLEMENTATION",
 984 | -    ],
 985 | -)
 986 | -
 987 | -cc_library(
 988 | -    name = "unistr_case_locale",
 989 | -    srcs = [
 990 | -        "unistr_case_locale.cpp",
 991 | -    ],
 992 | -    includes = ["."],
 993 | -    deps = [
 994 | -        ":headers",
 995 | -        ":unistr_case",
 996 | -        ":ustring_case_locale",
 997 | -    ],
 998 | -    local_defines = [
 999 | -        "U_COMMON_IMPLEMENTATION",
1000 | -    ],
1001 | -)
1002 | -
1003 | -cc_library(
1004 | -    name = "unistr_titlecase_brkiter",
1005 | -    srcs = [
1006 | -        "unistr_titlecase_brkiter.cpp",
1007 | -    ],
1008 | -    includes = ["."],
1009 | -    deps = [
1010 | -        ":headers",
1011 | -        ":ustr_titlecase_brkiter",
1012 | -    ],
1013 | -    local_defines = [
1014 | -        "U_COMMON_IMPLEMENTATION",
1015 | -    ],
1016 | -)
1017 | -
1018 | -cc_library(
1019 | -    name = "uprops",
1020 | -    srcs = [
1021 | -        "uprops.cpp",
1022 | -    ],
1023 | -    includes = ["."],
1024 | -    deps = [
1025 | -        ":headers",
1026 | -        ":emojiprops",
1027 | -        ":loadednormalizer2",
1028 | -        ":normalizer2",
1029 | -        ":ubidi_props",
1030 | -        ":ucase",
1031 | -        ":uchar",
1032 | -        ":unistr_case",
1033 | -        ":ustring_case",
1034 | -    ],
1035 | -    local_defines = [
1036 | -        "U_COMMON_IMPLEMENTATION",
1037 | -    ],
1038 | -)
1039 | -
1040 | -cc_library(
1041 | -    name = "uscript_props",
1042 | -    srcs = [
1043 | -        "uscript_props.cpp",
1044 | -    ],
1045 | -    includes = ["."],
1046 | -    deps = [
1047 | -        ":headers",
1048 | -        ":platform",
1049 | -    ],
1050 | -    local_defines = [
1051 | -        "U_COMMON_IMPLEMENTATION",
1052 | -    ],
1053 | -)
1054 | -
1055 | -cc_library(
1056 | -    name = "uset",
1057 | -    srcs = [
1058 | -        "uset.cpp",
1059 | -    ],
1060 | -    includes = ["."],
1061 | -    deps = [
1062 | -        ":headers",
1063 | -        ":platform",
1064 | -        ":uniset_core",
1065 | -    ],
1066 | -    local_defines = [
1067 | -        "U_COMMON_IMPLEMENTATION",
1068 | -    ],
1069 | -)
1070 | -
1071 | -cc_library(
1072 | -    name = "uset_props",
1073 | -    srcs = [
1074 | -        "uset_props.cpp",
1075 | -    ],
1076 | -    includes = ["."],
1077 | -    deps = [
1078 | -        ":headers",
1079 | -        ":uniset_closure",
1080 | -        ":uniset_core",
1081 | -        ":uniset_props",
1082 | -    ],
1083 | -    local_defines = [
1084 | -        "U_COMMON_IMPLEMENTATION",
1085 | -    ],
1086 | -)
1087 | -
1088 | -cc_library(
1089 | -    name = "usetiter",
1090 | -    srcs = [
1091 | -        "usetiter.cpp",
1092 | -    ],
1093 | -    includes = ["."],
1094 | -    deps = [
1095 | -        ":headers",
1096 | -        ":platform",
1097 | -        ":uniset_core",
1098 | -    ],
1099 | -    local_defines = [
1100 | -        "U_COMMON_IMPLEMENTATION",
1101 | -    ],
1102 | -)
1103 | -
1104 | -cc_library(
1105 | -    name = "ustack",
1106 | -    srcs = [
1107 | -        "ustack.cpp",
1108 | -    ],
1109 | -    includes = ["."],
1110 | -    deps = [
1111 | -        ":headers",
1112 | -        ":uvector",
1113 | -    ],
1114 | -    local_defines = [
1115 | -        "U_COMMON_IMPLEMENTATION",
1116 | -    ],
1117 | -)
1118 | -
1119 | -cc_library(
1120 | -    name = "ustr_titlecase_brkiter",
1121 | -    srcs = [
1122 | -        "ustr_titlecase_brkiter.cpp",
1123 | -    ],
1124 | -    includes = ["."],
1125 | -    deps = [
1126 | -        ":breakiterator",
1127 | -        ":headers",
1128 | -        ":ucase",
1129 | -        ":ustring_case_locale",
1130 | -    ],
1131 | -    local_defines = [
1132 | -        "U_COMMON_IMPLEMENTATION",
1133 | -    ],
1134 | -)
1135 | -
1136 | -cc_library(
1137 | -    name = "ustring_case",
1138 | -    srcs = [
1139 | -        "ustrcase.cpp",
1140 | -    ],
1141 | -    includes = ["."],
1142 | -    deps = [
1143 | -        ":headers",
1144 | -        ":ucase",
1145 | -        ":uchar",
1146 | -        ":edits",
1147 | -    ],
1148 | -    local_defines = [
1149 | -        "U_COMMON_IMPLEMENTATION",
1150 | -    ],
1151 | -)
1152 | -
1153 | -cc_library(
1154 | -    name = "ustring_case_locale",
1155 | -    srcs = [
1156 | -        "ustrcase_locale.cpp",
1157 | -    ],
1158 | -    includes = ["."],
1159 | -    deps = [
1160 | -        ":headers",
1161 | -        ":resourcebundle",
1162 | -        ":ustring_case",
1163 | -    ],
1164 | -    local_defines = [
1165 | -        "U_COMMON_IMPLEMENTATION",
1166 | -    ],
1167 | -)
1168 | -
1169 | -cc_library(
1170 | -    name = "utext",
1171 | -    srcs = [
1172 | -        "utext.cpp",
1173 | -    ],
1174 | -    includes = ["."],
1175 | -    deps = [
1176 | -        ":headers",
1177 | -        ":ucase",
1178 | -    ],
1179 | -    local_defines = [
1180 | -        "U_COMMON_IMPLEMENTATION",
1181 | -    ],
1182 | -)
1183 | -
1184 | -cc_library(
1185 | -    name = "utrie_swap",
1186 | -    srcs = [
1187 | -        "utrie_swap.cpp",
1188 | -    ],
1189 | -    includes = ["."],
1190 | -    deps = [
1191 | -        ":headers",
1192 | -        ":udata",
1193 | -    ],
1194 | -    local_defines = [
1195 | -        "U_COMMON_IMPLEMENTATION",
1196 | -    ],
1197 | -)
1198 | -
1199 | -# This target depends on a header file that contains NFC/NFD normalization data.
1200 | -# This header file is generated by a script (generate.sh) that invokes the gennorm2 binary.
1201 | -# See the Unicode update change log (changes.txt).
1202 | -cc_library(
1203 | -    name = "normalizer2",
1204 | -    srcs = [
1205 | -        "norm2_nfc_data.h",  # generated by gennorm2
1206 | -        "normalizer2.cpp",
1207 | -        "normalizer2impl.cpp",
1208 | -    ],
1209 | -    includes = ["."],
1210 | -    hdrs = [
1211 | -        "normalizer2impl.h",
1212 | -    ],
1213 | -    deps = [
1214 | -        ":headers",
1215 | -    ],
1216 | -    local_defines = [
1217 | -        "U_COMMON_IMPLEMENTATION",  
1218 | -    ],
1219 | -)
1220 | 
1221 | 
1222 | 
1223 | --- icu4c/source/data/unidata/norm2/BUILD.bazel
1224 | +++ /dev/null
1225 | @@ -1,13 +0,0 @@
1226 | -# © 2021 and later: Unicode, Inc. and others.
1227 | -# License & terms of use: http://www.unicode.org/copyright.html
1228 | -
1229 | -# This Bazel build file is needed to declare targets for the files used as
1230 | -# inputs to binary executables that are a part of other Bazel genrule targets.
1231 | -
1232 | -package(
1233 | -    default_visibility = ["//visibility:public"],
1234 | -)
1235 | -
1236 | -exports_files([
1237 | -    "nfc.txt", "nfkc.txt", "nfkc_cf.txt", "uts46.txt",
1238 | -])
1239 | 
1240 | 
1241 | 
1242 | --- icu4c/source/i18n/BUILD.bazel
1243 | +++ /dev/null
1244 | @@ -1,130 +0,0 @@
1245 | -# © 2021 and later: Unicode, Inc. and others.
1246 | -# License & terms of use: http://www.unicode.org/copyright.html
1247 | -
1248 | -# This file defines Bazel targets for a subset of the ICU4C "i18n" library header and source files.
1249 | -# The configuration of dependencies among targets is strongly assisted by the
1250 | -# file in depstest that maintains such information, at
1251 | -# icu4c/source/test/depstest/dependencies.txt .
1252 | -
1253 | -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
1254 | -
1255 | -package(
1256 | -    default_visibility = ["//visibility:public"],
1257 | -)
1258 | -
1259 | -# When compiling code in the `common` dir, the constant
1260 | -# `U_I18n_IMPLEMENTATION` needs to be defined. See 
1261 | -# https://unicode-org.github.io/icu/userguide/howtouseicu#c-with-your-own-build-system .
1262 | -
1263 | -# If linker errors occur, then this may be a sign that the dependencies were 
1264 | -# not specified correctly. Use dependencies.txt in depstest for assistance. See
1265 | -# https://stackoverflow.com/q/66111709/2077918 .
1266 | -
1267 | -cc_library(
1268 | -    name = "headers",
1269 | -    hdrs = glob([
1270 | -        "unicode/*.h", # public
1271 | -        "*.h",         # internal
1272 | -    ]),
1273 | -    # We need to add includes in order to preserve existing source files'
1274 | -    # include directives that use traditional paths, not paths relative to
1275 | -    # Bazel workspace:
1276 | -    # https://stackoverflow.com/a/65635893/2077918
1277 | -    includes = ["."],
1278 | -    local_defines = [
1279 | -        "U_I18N_IMPLEMENTATION",
1280 | -    ],
1281 | -)
1282 | -
1283 | -cc_library(
1284 | -    name = "collation",
1285 | -    srcs = [
1286 | -        "bocsu.cpp",
1287 | -        "coleitr.cpp",
1288 | -        "coll.cpp",
1289 | -        "collation.cpp",
1290 | -        "collationcompare.cpp",
1291 | -        "collationdata.cpp",
1292 | -        "collationdatareader.cpp",
1293 | -        "collationdatawriter.cpp",
1294 | -        "collationfastlatin.cpp",
1295 | -        # collationfcd.cpp is generated by genuca;
1296 | -        # probably hard to build genuca without depending on the old version.
1297 | -        "collationfcd.cpp",
1298 | -        "collationiterator.cpp",
1299 | -        "collationkeys.cpp",
1300 | -        "collationroot.cpp",
1301 | -        "collationrootelements.cpp",
1302 | -        "collationsets.cpp",
1303 | -        "collationsettings.cpp",
1304 | -        "collationtailoring.cpp",
1305 | -        "rulebasedcollator.cpp",
1306 | -        "sortkey.cpp",
1307 | -        "ucol.cpp",
1308 | -        "ucol_res.cpp",
1309 | -        "ucol_sit.cpp",
1310 | -        "ucoleitr.cpp",
1311 | -        "uitercollationiterator.cpp",
1312 | -        "utf16collationiterator.cpp",
1313 | -        "utf8collationiterator.cpp",
1314 | -    ],
1315 | -    includes = ["."],
1316 | -    deps = [
1317 | -        ":headers",
1318 | -        ":uclean_i18n",
1319 | -        "//icu4c/source/common:bytestream",
1320 | -        "//icu4c/source/common:normalizer2",
1321 | -        "//icu4c/source/common:platform",
1322 | -        "//icu4c/source/common:propname",
1323 | -        "//icu4c/source/common:resourcebundle",
1324 | -        "//icu4c/source/common:service_registration",
1325 | -        "//icu4c/source/common:ucharstrieiterator",
1326 | -        "//icu4c/source/common:uiter",
1327 | -        "//icu4c/source/common:ulist",
1328 | -        "//icu4c/source/common:unifiedcache",
1329 | -        "//icu4c/source/common:uset",
1330 | -        "//icu4c/source/common:usetiter",
1331 | -        "//icu4c/source/common:utrie2",
1332 | -        "//icu4c/source/common:uvector32",
1333 | -        "//icu4c/source/common:uvector64",
1334 | -    ],
1335 | -    local_defines = [
1336 | -        "U_I18N_IMPLEMENTATION",
1337 | -    ],
1338 | -)
1339 | -
1340 | -cc_library(
1341 | -    name = "collation_builder",
1342 | -    srcs = [
1343 | -        "collationbuilder.cpp",
1344 | -        "collationdatabuilder.cpp",
1345 | -        "collationfastlatinbuilder.cpp",
1346 | -        "collationruleparser.cpp",
1347 | -        "collationweights.cpp",
1348 | -    ],
1349 | -    includes = ["."],
1350 | -    deps = [
1351 | -        ":collation",
1352 | -        "//icu4c/source/common:canonical_iterator",
1353 | -        "//icu4c/source/common:ucharstriebuilder",
1354 | -        "//icu4c/source/common:uset_props"
1355 | -    ],
1356 | -    local_defines = [
1357 | -        "U_I18N_IMPLEMENTATION",
1358 | -    ],
1359 | -)
1360 | -
1361 | -cc_library(
1362 | -    name = "uclean_i18n",
1363 | -    srcs = [
1364 | -        "ucln_in.cpp",
1365 | -    ],
1366 | -    hdrs = ["ucln_in.h"],
1367 | -    includes = ["."],
1368 | -    deps = [
1369 | -        "//icu4c/source/common:platform",
1370 | -    ],
1371 | -    local_defines = [
1372 | -        "U_I18N_IMPLEMENTATION",
1373 | -    ],
1374 | -)
1375 | 
1376 | 
1377 | --- icu4c/source/icudefs.mk.in
1378 | +++ icu4c/source/icudefs.mk.in
1379 | @@ -116,8 +116,8 @@ ENABLE_RELEASE = @ENABLE_RELEASE@
1380 |  EXEEXT = @EXEEXT@
1381 |  CC = @CC@
1382 |  CXX = @CXX@
1383 | -AR = @AR@
1384 | -ARFLAGS = @ARFLAGS@ r
1385 | +AR = ar
1386 | +ARFLAGS = r
1387 |  RANLIB = @RANLIB@
1388 |  COMPILE_LINK_ENVVAR = @COMPILE_LINK_ENVVAR@
1389 |  UCLN_NO_AUTO_CLEANUP = @UCLN_NO_AUTO_CLEANUP@
1390 | 
1391 | 
1392 | 
1393 | --- icu4c/source/stubdata/BUILD.bazel
1394 | +++ /dev/null
1395 | @@ -1,23 +0,0 @@
1396 | -# © 2021 and later: Unicode, Inc. and others.
1397 | -# License & terms of use: http://www.unicode.org/copyright.html
1398 | -
1399 | -# This file defines Bazel targets for the ICU4C "stubdata" library header and source files.
1400 | -
1401 | -load("@rules_cc//cc:defs.bzl", "cc_library")
1402 | -
1403 | -package(
1404 | -    default_visibility = ["//visibility:public"],
1405 | -)
1406 | -
1407 | -# When compiling code in the `common` dir, the constant
1408 | -# `U_COMMON_IMPLEMENTATION` needs to be defined. See 
1409 | -# https://unicode-org.github.io/icu/userguide/howtouseicu#c-with-your-own-build-system .
1410 | -
1411 | -cc_library(
1412 | -    name = "stubdata",
1413 | -    srcs = ["stubdata.cpp"],
1414 | -    deps = ["//icu4c/source/common:headers"],
1415 | -    local_defines = [
1416 | -        "U_COMMON_IMPLEMENTATION",
1417 | -    ],
1418 | -)
1419 | 
1420 | 
1421 | 
1422 | --- icu4c/source/tools/gennorm2/BUILD.bazel
1423 | +++ /dev/null
1424 | @@ -1,39 +0,0 @@
1425 | -# © 2021 and later: Unicode, Inc. and others.
1426 | -# License & terms of use: http://www.unicode.org/copyright.html
1427 | -
1428 | -# This Bazel build file defines a target for the gennorm2 binary that generates
1429 | -# headers needed for bootstrapping the ICU4C build process in a way that
1430 | -# integrates the normalization data.
1431 | -
1432 | -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
1433 | -
1434 | -package(
1435 | -    default_visibility = ["//visibility:public"],
1436 | -)
1437 | -
1438 | -cc_binary(
1439 | -    name = "gennorm2",
1440 | -    srcs = glob([
1441 | -        "*.c",
1442 | -        "*.cpp",
1443 | -        "*.h",   # cannot have hdrs section in cc_binary
1444 | -    ]),
1445 | -    deps = [
1446 | -        "//icu4c/source/common:uhash",
1447 | -        "//icu4c/source/common:umutablecptrie",
1448 | -        "//icu4c/source/common:ucptrie",
1449 | -        "//icu4c/source/common:errorcode",
1450 | -        "//icu4c/source/common:uniset",
1451 | -        "//icu4c/source/common:uvector32",
1452 | -
1453 | -        "//icu4c/source/common:platform",
1454 | -        "//icu4c/source/common:headers",
1455 | -        
1456 | -        "//icu4c/source/tools/toolutil:toolutil",
1457 | -        "//icu4c/source/tools/toolutil:unewdata",
1458 | -        "//icu4c/source/tools/toolutil:writesrc",
1459 | -        "//icu4c/source/tools/toolutil:uoptions",
1460 | -        "//icu4c/source/tools/toolutil:uparse",
1461 | -    ],
1462 | -    linkopts = ["-pthread"],
1463 | -)
1464 | 
1465 | 
1466 | 
1467 | --- icu4c/source/tools/toolutil/BUILD.bazel
1468 | +++ /dev/null
1469 | @@ -1,126 +0,0 @@
1470 | -# © 2021 and later: Unicode, Inc. and others.
1471 | -# License & terms of use: http://www.unicode.org/copyright.html
1472 | -
1473 | -# This Bazel build file defines targets that are dependencies for building
1474 | -# the gennorm2 and genprops binaries.
1475 | -
1476 | -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
1477 | -
1478 | -package(
1479 | -    default_visibility = ["//visibility:public"],
1480 | -)
1481 | -
1482 | -cc_library(
1483 | -    name = "toolutil",
1484 | -    includes = ["."],
1485 | -    hdrs = ["toolutil.h"],
1486 | -    srcs = ["toolutil.cpp"],
1487 | -    local_defines = [
1488 | -        "U_TOOLUTIL_IMPLEMENTATION",
1489 | -    ],
1490 | -    deps = ["//icu4c/source/common:platform"],
1491 | -)
1492 | -
1493 | -cc_library(
1494 | -    name = "unewdata",
1495 | -    includes = ["."],
1496 | -    hdrs = ["unewdata.h"],
1497 | -    srcs = ["unewdata.cpp"],
1498 | -    local_defines = [
1499 | -        "U_TOOLUTIL_IMPLEMENTATION",
1500 | -    ],
1501 | -    deps = [
1502 | -        ":filestrm",
1503 | -        "//icu4c/source/common:platform",
1504 | -    ],
1505 | -)
1506 | -
1507 | -cc_library(
1508 | -    name = "uoptions",
1509 | -    includes = ["."],
1510 | -    hdrs = ["uoptions.h"],
1511 | -    srcs = ["uoptions.cpp"],
1512 | -    local_defines = [
1513 | -        "U_TOOLUTIL_IMPLEMENTATION",
1514 | -    ],
1515 | -    deps = ["//icu4c/source/common:platform"],
1516 | -)
1517 | -
1518 | -cc_library(
1519 | -    name = "writesrc",
1520 | -    includes = ["."],
1521 | -    hdrs = ["writesrc.h"],
1522 | -    srcs = ["writesrc.cpp"],
1523 | -    local_defines = [
1524 | -        "U_TOOLUTIL_IMPLEMENTATION",
1525 | -    ],
1526 | -    deps = [
1527 | -        "//icu4c/source/common:bytestream",
1528 | -        "//icu4c/source/common:platform",
1529 | -        "//icu4c/source/common:uniset_core",
1530 | -    ],
1531 | -)
1532 | -
1533 | -cc_library(
1534 | -    name = "uparse",
1535 | -    includes = ["."],
1536 | -    hdrs = ["uparse.h"],
1537 | -    srcs = ["uparse.cpp"],
1538 | -    local_defines = [
1539 | -        "U_TOOLUTIL_IMPLEMENTATION",
1540 | -    ],
1541 | -    deps = [
1542 | -        ":filestrm",
1543 | -        "//icu4c/source/common:platform",
1544 | -        ],
1545 | -)
1546 | -
1547 | -cc_library(
1548 | -    name = "filestrm",
1549 | -    includes = ["."],
1550 | -    hdrs = ["filestrm.h"],
1551 | -    srcs = ["filestrm.cpp"],
1552 | -    local_defines = [
1553 | -        "U_TOOLUTIL_IMPLEMENTATION",
1554 | -    ],
1555 | -    deps = ["//icu4c/source/common:platform"],
1556 | -)
1557 | -
1558 | -cc_library(
1559 | -    name = "ppucd",
1560 | -    includes = ["."],
1561 | -    hdrs = ["ppucd.h"],
1562 | -    srcs = ["ppucd.cpp"],
1563 | -    local_defines = [
1564 | -        "U_TOOLUTIL_IMPLEMENTATION",
1565 | -    ],
1566 | -    deps = [
1567 | -        ":uparse",
1568 | -        "//icu4c/source/common:platform",
1569 | -    ],
1570 | -)
1571 | -
1572 | -cc_library(
1573 | -    name = "denseranges",
1574 | -    includes = ["."],
1575 | -    hdrs = ["denseranges.h"],
1576 | -    srcs = ["denseranges.cpp"],
1577 | -    local_defines = [
1578 | -        "U_TOOLUTIL_IMPLEMENTATION",
1579 | -    ],
1580 | -    deps = ["//icu4c/source/common:platform"],
1581 | -)
1582 | -
1583 | -cc_library(
1584 | -    name = "collationinfo",
1585 | -    includes = ["."],
1586 | -    hdrs = ["collationinfo.h"],
1587 | -    srcs = ["collationinfo.cpp"],
1588 | -    local_defines = [
1589 | -        "U_TOOLUTIL_IMPLEMENTATION",
1590 | -    ],
1591 | -    deps = [
1592 | -        "//icu4c/source/common:platform",
1593 | -        "//icu4c/source/i18n:headers",
1594 | -    ],
1595 | -)
1596 | 


--------------------------------------------------------------------------------
/bazel/pybind11/BUILD:
--------------------------------------------------------------------------------
 1 | package(default_visibility = ["//visibility:public"])
 2 | 
# Header-only target: exports the pybind11 headers found directly under
# include/pybind11/ and include/pybind11/detail/. No srcs are compiled here.
cc_library(
  name = "pybind11",
  hdrs = glob(
    include = [
      "include/pybind11/*.h",
      "include/pybind11/detail/*.h",
    ],
    # These two headers are matched by the patterns above but are kept out of
    # the exported set.
    # NOTE(review): presumably common.h is a legacy shim and eigen.h needs
    # Eigen, which this workspace does not provide; confirm.
    exclude = [
      "include/pybind11/common.h",
      "include/pybind11/eigen.h",
    ],
  ),
  # NOTE(review): this target lists no srcs; confirm these flags are still
  # needed here rather than on the targets that compile against pybind11.
  copts = [
    "-fexceptions",
    "-Wno-undefined-inline",
    "-Wno-pragma-once-outside-header",
  ],
  # Adds the include/ directory to the include search path.
  includes = ["include"]  
)


--------------------------------------------------------------------------------
/binding.cpp:
--------------------------------------------------------------------------------
 1 | #include "clip_tokenizer.h"
 2 | #include <pybind11/pybind11.h>
 3 | #include <pybind11/stl.h>
 4 | #include <pybind11/complex.h>
 5 | 
 6 | PYBIND11_MODULE(clip_tokenizer_py, m) {
 7 |     pybind11::class_<CLIPTokenizer>(m, "CLIPTokenizer")
 8 |         .def(pybind11::init<const std::string&>())
 9 |         .def("tokenize", &CLIPTokenizer::tokenize);
10 |     
11 |     pybind11::class_<TokenizerResult>(m, "TokenizerResult")
12 |         .def(pybind11::init<>())
13 |         .def_readwrite("tokens", &TokenizerResult::tokens)
14 |         .def_readwrite("attention_mask", &TokenizerResult::attention_mask)
15 |         .def("__repr__", [](const TokenizerResult &a) {
16 |             std::string tokens;
17 |             for (auto& token : a.tokens[0]) {
18 |                 tokens += std::to_string(token) + ", ";
19 |             }
20 |             tokens.pop_back();
21 |             tokens.pop_back();
22 |             std::string attention_mask;
23 |             for (auto& mask : a.attention_mask[0]) {
24 |                 attention_mask += std::to_string(mask) + ", ";
25 |             }
26 |             attention_mask.pop_back();
27 |             attention_mask.pop_back();
28 |             return "<clip_tokenizer_py.TokenizerResult tokens=[" + tokens + "] attention_mask=[" + attention_mask + "]>";
29 |         });
30 | }   


--------------------------------------------------------------------------------
/clip_tokenizer.cpp:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include "clip_tokenizer.h"
  3 | 
  4 | 
  5 | bool is_chinese_char(UChar32 ch) {
  6 |     return (ch >= 0x4E00 && ch <= 0x9FFF)   || 
  7 |            (ch >= 0x3400 && ch <= 0x4DBF)   || 
  8 |            (ch >= 0x20000 && ch <= 0x2A6DF) || 
  9 |            (ch >= 0x2A700 && ch <= 0x2B73F) || 
 10 |            (ch >= 0x2B740 && ch <= 0x2B81F) || 
 11 |            (ch >= 0x2B820 && ch <= 0x2CEAF) || 
 12 |            (ch >= 0xF900 && ch <= 0xFAFF)   || 
 13 |            (ch >= 0x2F800 && ch <= 0x2FA1F);
 14 | }
 15 | 
 16 | icu::UnicodeString tokenize_chinese(const icu::UnicodeString& text) {
 17 |     icu::UnicodeString result;
 18 |     for (int32_t i = 0; i < text.length(); ++i) {
 19 |         UChar32 ch = text.char32At(i);
 20 |         if (is_chinese_char(ch)) {
 21 |             result += " ";
 22 |             result += ch;
 23 |             result += " ";
 24 |         } else {
 25 |             result += ch;
 26 |         }
 27 |     }
 28 |     return result;
 29 | }
 30 | 
 31 | std::vector<int> get_bytes_to_unicode_vec() {
 32 |     std::vector<int> result;
 33 |     for (int i = 0; i < 256; ++i) {
 34 |         if((i < 33) || (i > 126 && i < 161) || (i == 173))
 35 |             continue;
 36 |         result.push_back(i);
 37 |     }
 38 | 
 39 |     // copy range to range2
 40 |     std::vector<int> range2 = result;
 41 |     
 42 |     int n = 0;
 43 |     for (int b = 0; b < 256; ++b) {
 44 |         if (std::find(result.begin(), result.end(), b) == result.end()) {
 45 |             result.push_back(256 + n);
 46 |             ++n;
 47 |         }
 48 |     }
 49 | 
 50 |     return result;
 51 | }
 52 | 
 53 | std::unordered_map<int, UChar32> CLIPTokenizer::bytes_to_unicode() {
 54 |     std::unordered_map<int, UChar32> byteToUnicode;
 55 | 
 56 |     std::vector<int> range, range2;
 57 | 
 58 |     for (int i = 0; i < 256; ++i) {
 59 |         if((i < 33) || (i > 126 && i < 161) || (i == 173))
 60 |             continue;
 61 |         range.push_back(i);
 62 |     }
 63 | 
 64 |     // copy range to range2
 65 |     range2 = range;
 66 |     
 67 |     int n = 0;
 68 |     for (int b = 0; b < 256; ++b) {
 69 |         if (std::find(range.begin(), range.end(), b) == range.end()) {
 70 |             range.push_back(b);
 71 |             range2.push_back(256 + n);
 72 |             ++n;
 73 |         }
 74 |     }
 75 | 
 76 |     for (size_t i = 0; i < range.size(); ++i) {
 77 |         byteToUnicode[range[i]] = UChar32(range2[i]);
 78 |     }
 79 | 
 80 |     return byteToUnicode;
 81 | }
 82 | 
 83 | 
 84 | std::set<std::pair<icu::UnicodeString, icu::UnicodeString>> CLIPTokenizer::get_pairs(const std::vector<icu::UnicodeString>& word) {
 85 |     std::set<std::pair<icu::UnicodeString, icu::UnicodeString>> pairs;
 86 |     if (word.size() <= 1) {
 87 |         return pairs;  // No pairs if the word has one or zero characters
 88 |     }
 89 | 
 90 |     icu::UnicodeString prev_char = word[0];
 91 |     for (size_t i = 1; i < word.size(); ++i) {
 92 |         pairs.insert(std::make_pair(prev_char, word[i]));
 93 |         prev_char = word[i];
 94 |     }
 95 | 
 96 |     return pairs;
 97 | }
 98 | 
 99 | std::vector<std::tuple<std::string, std::string>> CLIPTokenizer::get_merges(const std::string& file_path) {
100 |     std::ifstream file(file_path);
101 |     if (!file.is_open()) {
102 |         std::cout << "Error: Could not open file " << file_path << std::endl;
103 |         return {};
104 |     }
105 |     std::string line;
106 |     std::vector<std::string> merges;
107 | 
108 |     while (std::getline(file, line)) {
109 |         merges.push_back(line);
110 |     }
111 | 
112 |     std::vector<std::tuple<std::string, std::string>> result;
113 |     for (size_t i = 1; i <= 49152 - 256 - 2; ++i) {
114 |         std::istringstream merge_stream(merges[i]);
115 |         std::string first, second;
116 |         merge_stream >> first >> second;
117 |         result.emplace_back(first, second);
118 |     }
119 | 
120 |     return result;
121 | }
122 | 
123 | icu::UnicodeString CLIPTokenizer::whitespace_clean(const icu::UnicodeString& text) {
124 |     // Remove consecutive whitespace characters and replace with a single space
125 |     icu::UnicodeString result;
126 | 
127 |     for (int32_t i = 0; i < text.length(); ++i) {
128 |         UChar32 ch = text.char32At(i);
129 |         if (u_isWhitespace(ch)) {
130 |             if (result.length() == 0 || result.char32At(result.length() - 1) == ' ') {
131 |                 continue;
132 |             } else {
133 |                 result += ' ';
134 |             }
135 |         } else {
136 |             result += ch;
137 |         }
138 |     }
139 | 
140 |     return result;
141 | }
142 | 
143 | 
144 | CLIPTokenizer::CLIPTokenizer(const std::string& vocab_file) {
145 |     matcher = std::make_unique<icu::RegexMatcher>("<\\|startoftext\\|>|<\\|endoftext\\|>|'s|'t|'re|'ve|'m|'ll|'d|[\\p{L}]+|[\\p{N}]|[^\\s\\p{L}\\p{N}]+", 0, status);
146 |     byte_encoder = bytes_to_unicode();
147 |     for (auto& it : byte_encoder) {
148 |         byte_decoder[it.second] = it.first;
149 |     }
150 |     auto merges = get_merges(vocab_file);
151 |     auto bytes_to_unicode_vec = get_bytes_to_unicode_vec();
152 |     vocab.reserve(bytes_to_unicode_vec.size() + merges.size() + 2);
153 |     for(const auto& v : get_bytes_to_unicode_vec()) {
154 |         vocab.push_back(std::move(icu::UnicodeString(v)));
155 |     }
156 | 
157 |     for(const auto& val : vocab) {
158 |         vocab.push_back(val + "</w>");
159 |     }
160 | 
161 |     for(const auto& [k,v] : merges) {
162 |         vocab.push_back(icu::UnicodeString::fromUTF8(k + v));
163 |     }
164 | 
165 | 
166 |     vocab.push_back("<|startoftext|>");
167 |     vocab.push_back("<|endoftext|>");
168 |     for(size_t i = 0; i < vocab.size(); i++) {
169 |         encoder[vocab[i]] = i;
170 |         decoder[i] = vocab[i];
171 |     }
172 | 
173 |     for(size_t i = 0; i < merges.size(); ++i) {
174 |         bpe_ranks[icu::UnicodeString::fromUTF8(std::get<0>(merges[i]) + std::get<1>(merges[i]))] = i;
175 |     }
176 | 
177 |     cache["<|startoftext|>"] = {"<|startoftext|>"};
178 |     cache["<|endoftext|>"] = {"<|endoftext|>"};
179 | }
180 | 
181 | std::vector<icu::UnicodeString> CLIPTokenizer::bpe(const icu::UnicodeString& text) {
182 |     if (cache.find(text) != cache.end()) {
183 |         return cache[text];
184 |     }
185 | 
186 |     std::vector<icu::UnicodeString> word;
187 |     for (int32_t i = 0; i < text.length() - 1; ++i) {
188 |         word.push_back(text.tempSubString(i, 1));
189 |     }
190 |     word.push_back(text.tempSubString(text.length() - 1) + "</w>");
191 |     auto pairs = get_pairs(word);
192 |     if (pairs.empty()) {
193 |         return {text + "</w>"};
194 |     }
195 | 
196 |     while(1) {
197 |         auto bigram = std::min_element(pairs.begin(), pairs.end(), [&](const std::pair<icu::UnicodeString, icu::UnicodeString>& l, const std::pair<icu::UnicodeString, icu::UnicodeString>& r) {
198 |             int rank1 = INT32_MAX;
199 |             if (bpe_ranks.find(l.first + l.second) != bpe_ranks.end()) {
200 |                 rank1 = bpe_ranks[l.first + l.second];
201 |             }
202 |             int rank2 = INT32_MAX;
203 |             if (bpe_ranks.find(r.first + r.second) != bpe_ranks.end()) {
204 |                 rank2 = bpe_ranks[r.first + r.second];
205 |             }
206 |             return rank1 < rank2;
207 |         });
208 | 
209 |         if (bpe_ranks.find(bigram->first + bigram->second) == bpe_ranks.end()) {
210 |             break;
211 |         }
212 | 
213 |         auto first = bigram->first;
214 |         auto second = bigram->second;
215 |         std::vector<icu::UnicodeString> new_word;
216 | 
217 |         size_t i = 0;
218 | 
219 |         while (i < word.size()) {
220 |             size_t j = i;
221 |             while (j < word.size()) {
222 |                 if (word[j] == first) {
223 |                     break;
224 |                 }
225 |                 ++j;
226 |             }
227 |             for (size_t k = i; k < j; ++k) {
228 |                 new_word.push_back(word[k]);
229 |             }
230 | 
231 |             if (j == word.size()) {
232 |                 break;
233 |             } else {
234 |                 i = j;
235 |             }
236 | 
237 |             if(word[i] == first && i < word.size() - 1 && word[i + 1] == second) {
238 |                 new_word.push_back(first + second);
239 |                 i += 2;
240 |             } else {
241 |                 new_word.push_back(word[i]);
242 |                 ++i;
243 |             }
244 |         }
245 | 
246 |         word = new_word;
247 |         if (word.size() == 1) {
248 |             break;
249 |         } else {
250 |             pairs = get_pairs(word);
251 |         }
252 |     }
253 | 
254 |     cache[text] = word;
255 |     return word;
256 | }
257 | 
258 | std::vector<int> CLIPTokenizer::encode(icu::UnicodeString unicode_text) {
259 |     std::vector<int> bpe_tokens;
260 |     unicode_text = whitespace_clean(unicode_text);
261 |     icu::Transliterator* strip_accents = icu::Transliterator::createInstance("NFD; [:Mn:] Remove", UTRANS_FORWARD, status);
262 |     strip_accents->transliterate(unicode_text);
263 |     if (status != U_ZERO_ERROR) {
264 |         if(status == U_STRING_NOT_TERMINATED_WARNING) {
265 |             status = U_ZERO_ERROR;
266 |         } else {
267 |             std::cout << "Error: " << u_errorName(status) << std::endl;
268 |             bpe_tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]);
269 |             return bpe_tokens;
270 |         }
271 |     }
272 |     unicode_text = tokenize_chinese(unicode_text);
273 |     unicode_text.toLower();
274 |     unicode_text.trim();
275 |     icu::UnicodeString word;
276 |     size_t start = 0;
277 |     matcher->reset(unicode_text);
278 |     while (matcher->find()) {
279 |         word = matcher->group(status);
280 |         if (status != U_ZERO_ERROR) {
281 |             if(status == U_STRING_NOT_TERMINATED_WARNING) {
282 |                 status = U_ZERO_ERROR;
283 |             } else {
284 |                 std::cout << "Error: " << u_errorName(status) << std::endl;
285 |                 bpe_tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]);
286 |                 return bpe_tokens;
287 |             }
288 |         }
289 |         std::string word_str;
290 |         word.toUTF8String(word_str);
291 |         if (word.length() == 0) {
292 |             continue;
293 |         }
294 |         icu::UnicodeString encoder_result;
295 | 
296 |         for (int32_t j = 0; j < word_str.length(); ++j) {
297 |             encoder_result += byte_encoder[(int)(word_str[j] & 0xff)];
298 |         }
299 |         auto bpe_res = bpe(encoder_result);
300 |         for (auto& token : bpe_res) {;
301 |             if(bpe_tokens.size() >= MAX_LEN - 1) {
302 |                 break;
303 |             } else if (encoder.find(token) == encoder.end()) {
304 |                 bpe_tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]);
305 |             } else {
306 |                 bpe_tokens.push_back(encoder[token]);
307 |             }
308 |         }
309 |     }
310 | 
311 |     if (bpe_tokens.back() != encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]) {
312 |         bpe_tokens.push_back(encoder[icu::UnicodeString::fromUTF8("<|endoftext|>")]);
313 |     }
314 | 
315 |     delete strip_accents;
316 | 
317 |     return bpe_tokens;
318 | }
319 | 
320 | // Tokenizes a batch of texts into equally long rows of token ids plus attention masks.
321 | //
322 | // Each text is wrapped as "<|startoftext|> text <|endoftext|>" and handed to encode().
323 | // Rows shorter than the longest row of the batch are padded with the <|endoftext|> id;
324 | // attention_mask holds 1 for every id returned by encode() and 0 for every padded slot.
325 | //
326 | // @param texts UTF-8 input strings; an empty batch yields an empty result.
327 | // @return one token row and one mask row per input text, in input order.
328 | TokenizerResult CLIPTokenizer::tokenize(const std::vector<std::string>& texts) {
329 |     TokenizerResult tokenizer_result;
330 |     std::vector<std::vector<int>>& input_ids = tokenizer_result.tokens;
331 |     std::vector<std::vector<int>>& attention_mask = tokenizer_result.attention_mask;
332 |     input_ids.reserve(texts.size());
333 |     attention_mask.reserve(texts.size());
334 |
335 |     // Encode straight into the result so that no row has to be copied afterwards.
336 |     size_t max_len = 0;
337 |     for (const auto& text : texts) {
338 |         icu::UnicodeString unicode_text = icu::UnicodeString::fromUTF8("<|startoftext|> ");
339 |         unicode_text += icu::UnicodeString::fromUTF8(text);
340 |         unicode_text += icu::UnicodeString::fromUTF8(" <|endoftext|>");
341 |         input_ids.push_back(encode(unicode_text));
342 |         if (input_ids.back().size() > max_len) {
343 |             max_len = input_ids.back().size();
344 |         }
345 |     }
346 |
347 |     // Resolve the padding id once instead of once per padded slot. find() is used so
348 |     // that a vocabulary without the token is left unmodified; 0 is the value
349 |     // operator[] would have produced for a missing key.
350 |     const auto pad_entry = encoder.find(icu::UnicodeString::fromUTF8("<|endoftext|>"));
351 |     const int pad_id = (pad_entry == encoder.end()) ? 0 : pad_entry->second;
352 |
353 |     for (auto& tokens : input_ids) {
354 |         // Build the mask first, while tokens.size() is still the unpadded length.
355 |         attention_mask.emplace_back(tokens.size(), 1);
356 |         attention_mask.back().resize(max_len, 0);
357 |         tokens.resize(max_len, pad_id);
358 |     }
359 |
360 |     return tokenizer_result;
361 | }


--------------------------------------------------------------------------------
/clip_tokenizer.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <set>
 3 | #include <unordered_map>
 4 | #include <string>
 5 | #include <vector>
 6 | #include <fstream>
 7 | #include <sstream>
 8 | #include <regex>
 9 | #include <locale>
10 | #include <codecvt>
11 | #include <memory>
12 | #include <unicode/unistr.h>
13 | #include <unicode/ustream.h>
14 | #include <unicode/locid.h>
15 | #include <unicode/uchar.h>
16 | #include <unicode/regex.h>
17 | #include <unicode/normalizer2.h>
18 | #include <unicode/translit.h>
19 | #include <unicode/ustream.h>
20 | 
21 | // Hash support so that icu::UnicodeString can be the key of std::unordered_map.
22 | namespace std {
23 |     template <>
24 |     struct hash<icu::UnicodeString> {
25 |         // Delegates to ICU's own hashCode() and converts the result to size_t.
26 |         size_t operator()(const icu::UnicodeString& key) const {
27 |             const auto code = key.hashCode();
28 |             return static_cast<size_t>(code);
29 |         }
30 |     };
31 | }  // namespace std
32 | 
33 | 
34 | 
35 | struct TokenizerResult {  // Batched output of CLIPTokenizer::tokenize(): one row per input text.
36 |     std::vector<std::vector<int>> tokens;  // token ids; shorter rows are padded to the longest row of the batch
37 |     std::vector<std::vector<int>> attention_mask;  // same shape as tokens: 1 for an encoded id, 0 for padding
38 | };
39 | 
40 | 
41 | // BPE tokenizer for CLIP text input, built on ICU strings and regular expressions.
42 | // Only the constructor and tokenize() are public; everything else is internal.
43 | class CLIPTokenizer {
44 |     private:
45 |         std::unordered_map<int, UChar32> bytes_to_unicode();
46 |         std::set<std::pair<icu::UnicodeString, icu::UnicodeString>> get_pairs(const std::vector<icu::UnicodeString>& word);
47 |         std::vector<std::tuple<std::string, std::string>> get_merges(const std::string& file_path);
48 |         icu::UnicodeString whitespace_clean(const icu::UnicodeString& text);
49 |         // Splits one byte-encoded word into BPE tokens; encode() calls it for every regex match.
50 |         std::vector<icu::UnicodeString> bpe(const icu::UnicodeString& text);
51 |         // Byte value (0-255) -> code point; encode() maps each UTF-8 byte of a word through it.
52 |         std::unordered_map<int, UChar32> byte_encoder;
53 |         std::unordered_map<UChar32, int> byte_decoder;
54 |         std::vector<icu::UnicodeString> vocab;
55 |         std::unordered_map<icu::UnicodeString, std::vector<icu::UnicodeString>> cache;
56 |         // Word-splitting regex; encode() resets it on the input text and walks the matches.
57 |         std::unique_ptr<icu::RegexMatcher> matcher;
58 |         std::unordered_map<icu::UnicodeString, int> bpe_ranks;
59 |         // Token string -> token id.
60 |         std::unordered_map<icu::UnicodeString, int> encoder;
61 |         // Int-keyed map, so it uses the default std::hash<int>. A UnicodeString hasher would
62 |         // only work here through an implicit int -> UnicodeString conversion on every lookup.
63 |         // NOTE(review): presumably id -> token string, the inverse of `encoder` - confirm.
64 |         std::unordered_map<int, icu::UnicodeString> decoder;
65 |         // ICU error code checked by encode() after each matcher->group() call.
66 |         UErrorCode status = U_ZERO_ERROR;
67 |         std::vector<int> encode(icu::UnicodeString text);
68 |         // encode() stops adding word tokens at MAX_LEN - 1 ids, then closes with <|endoftext|>.
69 |         size_t MAX_LEN = 77;
70 |     public:
71 |         CLIPTokenizer(const std::string& vocab_file);
72 |         // Encodes every text and pads all rows to the length of the longest one.
73 |         TokenizerResult tokenize(const std::vector<std::string>& texts);
74 | };


--------------------------------------------------------------------------------
/test.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include "clip_tokenizer.h"
 3 | 
 4 | 
 5 | 
 6 | 
 7 | // Smoke test: tokenizes a single sentence with vocab.txt and prints the ids and the mask.
 8 | int main() {
 9 |     CLIPTokenizer tokenizer("vocab.txt");
10 |     const TokenizerResult result = tokenizer.tokenize({"hello world"});
11 |
12 |     // Writes every value of a row followed by a single space.
13 |     const auto print_row = [](const std::vector<int>& row) {
14 |         for (const int value : row) {
15 |             std::cout << value << " ";
16 |         }
17 |     };
18 |
19 |     std::cout << "Tokens: " << std::endl;
20 |     print_row(result.tokens[0]);
21 |     std::cout << std::endl;
22 |
23 |     std::cout << "Attention mask: " << std::endl;
24 |     print_row(result.attention_mask[0]);
25 |     return 0;
26 | }


--------------------------------------------------------------------------------
/unit_test.py:
--------------------------------------------------------------------------------
 1 | import clip_tokenizer_py
 2 | from transformers import CLIPTokenizer
 3 | 
 4 | tokenizer = clip_tokenizer_py.CLIPTokenizer("vocab.txt")
 5 | hf_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 6 | 
 7 | res = tokenizer.tokenize(["hello world"])
 8 | hf_res = hf_tokenizer("hello world", return_tensors="pt")
 9 | 
10 | assert res.tokens == hf_res.input_ids.tolist()
11 | assert res.attention_mask == hf_res.attention_mask.tolist()
12 | 
13 | 
14 | # test russian
15 | res = tokenizer.tokenize(["Привет мир"])
16 | hf_res = hf_tokenizer("Привет мир", return_tensors="pt")
17 | print(res.tokens)
18 | print(hf_res.input_ids)
19 | 
20 | assert res.tokens == hf_res.input_ids.tolist()
21 | assert res.attention_mask == hf_res.attention_mask.tolist()
22 | 
23 | ## test chinese
24 | res = tokenizer.tokenize(["你好世界"])
25 | hf_res = hf_tokenizer("你好世界", return_tensors="pt")
26 | 
27 | assert res.tokens == hf_res.input_ids.tolist()
28 | assert res.attention_mask == hf_res.attention_mask.tolist()
29 | 
30 | ## test japanese
31 | res = tokenizer.tokenize(["こんにちは世界"])
32 | hf_res = hf_tokenizer("こんにちは世界", return_tensors="pt")
33 | 
34 | assert res.tokens == hf_res.input_ids.tolist()
35 | assert res.attention_mask == hf_res.attention_mask.tolist()
36 | 
37 | 
38 | # test hindi
39 | res = tokenizer.tokenize(["नमस्ते दुनिया"])
40 | hf_res = hf_tokenizer("नमस्ते दुनिया", return_tensors="pt")
41 | print(res.tokens)
42 | print(hf_res.input_ids)
43 | 
44 | assert res.tokens == hf_res.input_ids.tolist()
45 | assert res.attention_mask == hf_res.attention_mask.tolist()
46 | 
47 | ## test arabic
48 | res = tokenizer.tokenize(["مرحبا بالعالم"])
49 | hf_res = hf_tokenizer("مرحبا بالعالم", return_tensors="pt")
50 | 
51 | assert res.tokens == hf_res.input_ids.tolist()
52 | assert res.attention_mask == hf_res.attention_mask.tolist()
53 | 
54 | ## test korean
55 | res = tokenizer.tokenize(["안녕하세요"])
56 | hf_res = hf_tokenizer("안녕하세요", return_tensors="pt")
57 | 
58 | assert res.tokens == hf_res.input_ids.tolist()
59 | assert res.attention_mask == hf_res.attention_mask.tolist()
60 | 
61 | ## test spanish
62 | res = tokenizer.tokenize(["Hola Mundo"])
63 | hf_res = hf_tokenizer("Hola Mundo", return_tensors="pt")
64 | print(res.tokens)
65 | print(hf_res.input_ids)
66 | 
67 | assert res.tokens == hf_res.input_ids.tolist()
68 | assert res.attention_mask == hf_res.attention_mask.tolist()
69 | 
70 | ## test turkish
71 | res = tokenizer.tokenize(["Merhaba Dünya"])
72 | hf_res = hf_tokenizer("Merhaba Dünya", return_tensors="pt")
73 | 
74 | assert res.tokens == hf_res.input_ids.tolist()
75 | assert res.attention_mask == hf_res.attention_mask.tolist()


--------------------------------------------------------------------------------