├── .editorconfig ├── .github ├── FUNDING.yml └── workflows │ └── CI.yml ├── .gitignore ├── .travis.yml ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── benches ├── cedarwood_benchmark.rs ├── cpp │ └── bench_cedar.cc └── macro-benchmark │ ├── .gitignore │ ├── Cargo.toml │ ├── dict.txt │ └── src │ └── main.rs ├── rustfmt.toml └── src └── lib.rs /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | indent_size = 4 9 | indent_style = space 10 | insert_final_newline = true 11 | trim_trailing_whitespace = true 12 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: MnO2 2 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | name: CI 4 | 5 | jobs: 6 | check: 7 | name: Check 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: actions-rs/toolchain@v1 12 | with: 13 | profile: minimal 14 | toolchain: stable 15 | override: true 16 | - uses: actions-rs/cargo@v1 17 | with: 18 | command: check 19 | args: --all-features 20 | 21 | test: 22 | name: Test Suite 23 | runs-on: ${{ matrix.os }} 24 | strategy: 25 | matrix: 26 | os: [ubuntu-latest, macos-latest, windows-latest] 27 | steps: 28 | - uses: actions/checkout@v2 29 | - uses: actions-rs/toolchain@v1 30 | with: 31 | profile: minimal 32 | toolchain: stable 33 | override: true 34 | - name: Check build with default features 35 | uses: actions-rs/cargo@v1 36 | with: 37 | command: build 38 | - name: Test 39 | uses: actions-rs/cargo@v1 40 | with: 41 | command: test 42 | args: --all-features --all --benches 43 | 44 | codecov: 45 | name: Code 
Coverage 46 | runs-on: ubuntu-latest 47 | steps: 48 | - uses: actions/checkout@v2 49 | - uses: actions-rs/toolchain@v1 50 | with: 51 | profile: minimal 52 | toolchain: stable 53 | override: true 54 | - name: Run cargo-tarpaulin 55 | uses: actions-rs/tarpaulin@v0.1 56 | with: 57 | args: '--all-features' 58 | - name: Upload to codecov.io 59 | uses: codecov/codecov-action@v1 60 | with: 61 | token: ${{secrets.CODECOV_TOKEN}} 62 | - name: Archive code coverage results 63 | uses: actions/upload-artifact@v1 64 | with: 65 | name: code-coverage-report 66 | path: cobertura.xml 67 | 68 | fmt: 69 | name: Rustfmt 70 | runs-on: ubuntu-latest 71 | steps: 72 | - uses: actions/checkout@v2 73 | - uses: actions-rs/toolchain@v1 74 | with: 75 | profile: minimal 76 | toolchain: stable 77 | override: true 78 | - run: rustup component add rustfmt 79 | - uses: actions-rs/cargo@v1 80 | with: 81 | command: fmt 82 | args: --all -- --check 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | .vscode 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | language: rust 4 | addons: 5 | apt: 6 | packages: 7 | - libssl-dev 8 | - pkg-config 9 | - cmake 10 | - zlib1g-dev 11 | rust: 12 | - stable 13 | - beta 14 | 15 | before_script: 16 | - rustup component add rustfmt 17 | - | 18 | if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_RUST_VERSION" == "stable" ]]; then 19 | rustup component add clippy 20 | fi 21 | 22 | script: 23 | - cargo fmt --all -- --check 24 | - cargo build 25 | - cargo test 26 | - cargo test --features reduced-trie 27 | 28 | after_success: 29 | - | 30 | if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_RUST_VERSION" == "stable" ]]; then 31 | bash <(curl 
https://raw.githubusercontent.com/xd009642/tarpaulin/master/travis-install.sh) 32 | cargo tarpaulin --out Xml 33 | bash <(curl -s https://codecov.io/bash) 34 | fi 35 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "anes" 7 | version = "0.1.6" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 10 | 11 | [[package]] 12 | name = "atty" 13 | version = "0.2.14" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 16 | dependencies = [ 17 | "hermit-abi", 18 | "libc", 19 | "winapi", 20 | ] 21 | 22 | [[package]] 23 | name = "autocfg" 24 | version = "1.1.0" 25 | source = "registry+https://github.com/rust-lang/crates.io-index" 26 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 27 | 28 | [[package]] 29 | name = "bitflags" 30 | version = "1.3.2" 31 | source = "registry+https://github.com/rust-lang/crates.io-index" 32 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 33 | 34 | [[package]] 35 | name = "bumpalo" 36 | version = "3.11.0" 37 | source = "registry+https://github.com/rust-lang/crates.io-index" 38 | checksum = "c1ad822118d20d2c234f427000d5acc36eabe1e29a348c89b63dd60b13f28e5d" 39 | 40 | [[package]] 41 | name = "cast" 42 | version = "0.3.0" 43 | source = "registry+https://github.com/rust-lang/crates.io-index" 44 | checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 45 | 46 | [[package]] 47 | name = "cedarwood" 48 | version = "0.4.6" 49 | dependencies = [ 50 | "criterion", 51 | "rand", 52 | "smallvec", 53 | ] 54 | 55 | 
[[package]] 56 | name = "cfg-if" 57 | version = "1.0.0" 58 | source = "registry+https://github.com/rust-lang/crates.io-index" 59 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 60 | 61 | [[package]] 62 | name = "ciborium" 63 | version = "0.2.0" 64 | source = "registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" 66 | dependencies = [ 67 | "ciborium-io", 68 | "ciborium-ll", 69 | "serde", 70 | ] 71 | 72 | [[package]] 73 | name = "ciborium-io" 74 | version = "0.2.0" 75 | source = "registry+https://github.com/rust-lang/crates.io-index" 76 | checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" 77 | 78 | [[package]] 79 | name = "ciborium-ll" 80 | version = "0.2.0" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" 83 | dependencies = [ 84 | "ciborium-io", 85 | "half", 86 | ] 87 | 88 | [[package]] 89 | name = "clap" 90 | version = "3.2.22" 91 | source = "registry+https://github.com/rust-lang/crates.io-index" 92 | checksum = "86447ad904c7fb335a790c9d7fe3d0d971dc523b8ccd1561a520de9a85302750" 93 | dependencies = [ 94 | "bitflags", 95 | "clap_lex", 96 | "indexmap", 97 | "textwrap", 98 | ] 99 | 100 | [[package]] 101 | name = "clap_lex" 102 | version = "0.2.4" 103 | source = "registry+https://github.com/rust-lang/crates.io-index" 104 | checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" 105 | dependencies = [ 106 | "os_str_bytes", 107 | ] 108 | 109 | [[package]] 110 | name = "criterion" 111 | version = "0.4.0" 112 | source = "registry+https://github.com/rust-lang/crates.io-index" 113 | checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" 114 | dependencies = [ 115 | "anes", 116 | "atty", 117 | "cast", 118 | "ciborium", 119 | "clap", 120 | "criterion-plot", 121 | "itertools", 122 | 
"lazy_static", 123 | "num-traits", 124 | "oorandom", 125 | "plotters", 126 | "rayon", 127 | "regex", 128 | "serde", 129 | "serde_derive", 130 | "serde_json", 131 | "tinytemplate", 132 | "walkdir", 133 | ] 134 | 135 | [[package]] 136 | name = "criterion-plot" 137 | version = "0.5.0" 138 | source = "registry+https://github.com/rust-lang/crates.io-index" 139 | checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" 140 | dependencies = [ 141 | "cast", 142 | "itertools", 143 | ] 144 | 145 | [[package]] 146 | name = "crossbeam-channel" 147 | version = "0.5.6" 148 | source = "registry+https://github.com/rust-lang/crates.io-index" 149 | checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" 150 | dependencies = [ 151 | "cfg-if", 152 | "crossbeam-utils", 153 | ] 154 | 155 | [[package]] 156 | name = "crossbeam-deque" 157 | version = "0.8.2" 158 | source = "registry+https://github.com/rust-lang/crates.io-index" 159 | checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" 160 | dependencies = [ 161 | "cfg-if", 162 | "crossbeam-epoch", 163 | "crossbeam-utils", 164 | ] 165 | 166 | [[package]] 167 | name = "crossbeam-epoch" 168 | version = "0.9.10" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1" 171 | dependencies = [ 172 | "autocfg", 173 | "cfg-if", 174 | "crossbeam-utils", 175 | "memoffset", 176 | "once_cell", 177 | "scopeguard", 178 | ] 179 | 180 | [[package]] 181 | name = "crossbeam-utils" 182 | version = "0.8.11" 183 | source = "registry+https://github.com/rust-lang/crates.io-index" 184 | checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" 185 | dependencies = [ 186 | "cfg-if", 187 | "once_cell", 188 | ] 189 | 190 | [[package]] 191 | name = "either" 192 | version = "1.8.0" 193 | source = "registry+https://github.com/rust-lang/crates.io-index" 194 | checksum = 
"90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" 195 | 196 | [[package]] 197 | name = "getrandom" 198 | version = "0.2.7" 199 | source = "registry+https://github.com/rust-lang/crates.io-index" 200 | checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6" 201 | dependencies = [ 202 | "cfg-if", 203 | "libc", 204 | "wasi", 205 | ] 206 | 207 | [[package]] 208 | name = "half" 209 | version = "1.8.2" 210 | source = "registry+https://github.com/rust-lang/crates.io-index" 211 | checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" 212 | 213 | [[package]] 214 | name = "hashbrown" 215 | version = "0.12.3" 216 | source = "registry+https://github.com/rust-lang/crates.io-index" 217 | checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" 218 | 219 | [[package]] 220 | name = "hermit-abi" 221 | version = "0.1.19" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 224 | dependencies = [ 225 | "libc", 226 | ] 227 | 228 | [[package]] 229 | name = "indexmap" 230 | version = "1.9.1" 231 | source = "registry+https://github.com/rust-lang/crates.io-index" 232 | checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" 233 | dependencies = [ 234 | "autocfg", 235 | "hashbrown", 236 | ] 237 | 238 | [[package]] 239 | name = "itertools" 240 | version = "0.10.5" 241 | source = "registry+https://github.com/rust-lang/crates.io-index" 242 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 243 | dependencies = [ 244 | "either", 245 | ] 246 | 247 | [[package]] 248 | name = "itoa" 249 | version = "1.0.3" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754" 252 | 253 | [[package]] 254 | name = "js-sys" 255 | version = "0.3.60" 256 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 257 | checksum = "49409df3e3bf0856b916e2ceaca09ee28e6871cf7d9ce97a692cacfdb2a25a47" 258 | dependencies = [ 259 | "wasm-bindgen", 260 | ] 261 | 262 | [[package]] 263 | name = "lazy_static" 264 | version = "1.4.0" 265 | source = "registry+https://github.com/rust-lang/crates.io-index" 266 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 267 | 268 | [[package]] 269 | name = "libc" 270 | version = "0.2.133" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "c0f80d65747a3e43d1596c7c5492d95d5edddaabd45a7fcdb02b95f644164966" 273 | 274 | [[package]] 275 | name = "log" 276 | version = "0.4.17" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" 279 | dependencies = [ 280 | "cfg-if", 281 | ] 282 | 283 | [[package]] 284 | name = "memoffset" 285 | version = "0.6.5" 286 | source = "registry+https://github.com/rust-lang/crates.io-index" 287 | checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" 288 | dependencies = [ 289 | "autocfg", 290 | ] 291 | 292 | [[package]] 293 | name = "num-traits" 294 | version = "0.2.15" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" 297 | dependencies = [ 298 | "autocfg", 299 | ] 300 | 301 | [[package]] 302 | name = "num_cpus" 303 | version = "1.13.1" 304 | source = "registry+https://github.com/rust-lang/crates.io-index" 305 | checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" 306 | dependencies = [ 307 | "hermit-abi", 308 | "libc", 309 | ] 310 | 311 | [[package]] 312 | name = "once_cell" 313 | version = "1.15.0" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" 316 | 
317 | [[package]] 318 | name = "oorandom" 319 | version = "11.1.3" 320 | source = "registry+https://github.com/rust-lang/crates.io-index" 321 | checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" 322 | 323 | [[package]] 324 | name = "os_str_bytes" 325 | version = "6.3.0" 326 | source = "registry+https://github.com/rust-lang/crates.io-index" 327 | checksum = "9ff7415e9ae3fff1225851df9e0d9e4e5479f947619774677a63572e55e80eff" 328 | 329 | [[package]] 330 | name = "plotters" 331 | version = "0.3.4" 332 | source = "registry+https://github.com/rust-lang/crates.io-index" 333 | checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" 334 | dependencies = [ 335 | "num-traits", 336 | "plotters-backend", 337 | "plotters-svg", 338 | "wasm-bindgen", 339 | "web-sys", 340 | ] 341 | 342 | [[package]] 343 | name = "plotters-backend" 344 | version = "0.3.4" 345 | source = "registry+https://github.com/rust-lang/crates.io-index" 346 | checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" 347 | 348 | [[package]] 349 | name = "plotters-svg" 350 | version = "0.3.3" 351 | source = "registry+https://github.com/rust-lang/crates.io-index" 352 | checksum = "f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" 353 | dependencies = [ 354 | "plotters-backend", 355 | ] 356 | 357 | [[package]] 358 | name = "ppv-lite86" 359 | version = "0.2.16" 360 | source = "registry+https://github.com/rust-lang/crates.io-index" 361 | checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" 362 | 363 | [[package]] 364 | name = "proc-macro2" 365 | version = "1.0.44" 366 | source = "registry+https://github.com/rust-lang/crates.io-index" 367 | checksum = "7bd7356a8122b6c4a24a82b278680c73357984ca2fc79a0f9fa6dea7dced7c58" 368 | dependencies = [ 369 | "unicode-ident", 370 | ] 371 | 372 | [[package]] 373 | name = "quote" 374 | version = "1.0.21" 375 | source = "registry+https://github.com/rust-lang/crates.io-index" 
376 | checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179" 377 | dependencies = [ 378 | "proc-macro2", 379 | ] 380 | 381 | [[package]] 382 | name = "rand" 383 | version = "0.8.5" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 386 | dependencies = [ 387 | "libc", 388 | "rand_chacha", 389 | "rand_core", 390 | ] 391 | 392 | [[package]] 393 | name = "rand_chacha" 394 | version = "0.3.1" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 397 | dependencies = [ 398 | "ppv-lite86", 399 | "rand_core", 400 | ] 401 | 402 | [[package]] 403 | name = "rand_core" 404 | version = "0.6.4" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 407 | dependencies = [ 408 | "getrandom", 409 | ] 410 | 411 | [[package]] 412 | name = "rayon" 413 | version = "1.5.3" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" 416 | dependencies = [ 417 | "autocfg", 418 | "crossbeam-deque", 419 | "either", 420 | "rayon-core", 421 | ] 422 | 423 | [[package]] 424 | name = "rayon-core" 425 | version = "1.9.3" 426 | source = "registry+https://github.com/rust-lang/crates.io-index" 427 | checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" 428 | dependencies = [ 429 | "crossbeam-channel", 430 | "crossbeam-deque", 431 | "crossbeam-utils", 432 | "num_cpus", 433 | ] 434 | 435 | [[package]] 436 | name = "regex" 437 | version = "1.6.0" 438 | source = "registry+https://github.com/rust-lang/crates.io-index" 439 | checksum = "4c4eb3267174b8c6c2f654116623910a0fef09c4753f8dd83db29c48a0df988b" 440 | dependencies = [ 441 | "regex-syntax", 442 | ] 443 | 444 | 
[[package]] 445 | name = "regex-syntax" 446 | version = "0.6.27" 447 | source = "registry+https://github.com/rust-lang/crates.io-index" 448 | checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" 449 | 450 | [[package]] 451 | name = "ryu" 452 | version = "1.0.11" 453 | source = "registry+https://github.com/rust-lang/crates.io-index" 454 | checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" 455 | 456 | [[package]] 457 | name = "same-file" 458 | version = "1.0.6" 459 | source = "registry+https://github.com/rust-lang/crates.io-index" 460 | checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 461 | dependencies = [ 462 | "winapi-util", 463 | ] 464 | 465 | [[package]] 466 | name = "scopeguard" 467 | version = "1.1.0" 468 | source = "registry+https://github.com/rust-lang/crates.io-index" 469 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 470 | 471 | [[package]] 472 | name = "serde" 473 | version = "1.0.145" 474 | source = "registry+https://github.com/rust-lang/crates.io-index" 475 | checksum = "728eb6351430bccb993660dfffc5a72f91ccc1295abaa8ce19b27ebe4f75568b" 476 | dependencies = [ 477 | "serde_derive", 478 | ] 479 | 480 | [[package]] 481 | name = "serde_derive" 482 | version = "1.0.145" 483 | source = "registry+https://github.com/rust-lang/crates.io-index" 484 | checksum = "81fa1584d3d1bcacd84c277a0dfe21f5b0f6accf4a23d04d4c6d61f1af522b4c" 485 | dependencies = [ 486 | "proc-macro2", 487 | "quote", 488 | "syn", 489 | ] 490 | 491 | [[package]] 492 | name = "serde_json" 493 | version = "1.0.85" 494 | source = "registry+https://github.com/rust-lang/crates.io-index" 495 | checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44" 496 | dependencies = [ 497 | "itoa", 498 | "ryu", 499 | "serde", 500 | ] 501 | 502 | [[package]] 503 | name = "smallvec" 504 | version = "1.9.0" 505 | source = "registry+https://github.com/rust-lang/crates.io-index" 506 | 
checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" 507 | 508 | [[package]] 509 | name = "syn" 510 | version = "1.0.100" 511 | source = "registry+https://github.com/rust-lang/crates.io-index" 512 | checksum = "52205623b1b0f064a4e71182c3b18ae902267282930c6d5462c91b859668426e" 513 | dependencies = [ 514 | "proc-macro2", 515 | "quote", 516 | "unicode-ident", 517 | ] 518 | 519 | [[package]] 520 | name = "textwrap" 521 | version = "0.15.1" 522 | source = "registry+https://github.com/rust-lang/crates.io-index" 523 | checksum = "949517c0cf1bf4ee812e2e07e08ab448e3ae0d23472aee8a06c985f0c8815b16" 524 | 525 | [[package]] 526 | name = "tinytemplate" 527 | version = "1.2.1" 528 | source = "registry+https://github.com/rust-lang/crates.io-index" 529 | checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 530 | dependencies = [ 531 | "serde", 532 | "serde_json", 533 | ] 534 | 535 | [[package]] 536 | name = "unicode-ident" 537 | version = "1.0.4" 538 | source = "registry+https://github.com/rust-lang/crates.io-index" 539 | checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd" 540 | 541 | [[package]] 542 | name = "walkdir" 543 | version = "2.3.2" 544 | source = "registry+https://github.com/rust-lang/crates.io-index" 545 | checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" 546 | dependencies = [ 547 | "same-file", 548 | "winapi", 549 | "winapi-util", 550 | ] 551 | 552 | [[package]] 553 | name = "wasi" 554 | version = "0.11.0+wasi-snapshot-preview1" 555 | source = "registry+https://github.com/rust-lang/crates.io-index" 556 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 557 | 558 | [[package]] 559 | name = "wasm-bindgen" 560 | version = "0.2.83" 561 | source = "registry+https://github.com/rust-lang/crates.io-index" 562 | checksum = "eaf9f5aceeec8be17c128b2e93e031fb8a4d469bb9c4ae2d7dc1888b26887268" 563 | dependencies = [ 564 | "cfg-if", 565 | 
"wasm-bindgen-macro", 566 | ] 567 | 568 | [[package]] 569 | name = "wasm-bindgen-backend" 570 | version = "0.2.83" 571 | source = "registry+https://github.com/rust-lang/crates.io-index" 572 | checksum = "4c8ffb332579b0557b52d268b91feab8df3615f265d5270fec2a8c95b17c1142" 573 | dependencies = [ 574 | "bumpalo", 575 | "log", 576 | "once_cell", 577 | "proc-macro2", 578 | "quote", 579 | "syn", 580 | "wasm-bindgen-shared", 581 | ] 582 | 583 | [[package]] 584 | name = "wasm-bindgen-macro" 585 | version = "0.2.83" 586 | source = "registry+https://github.com/rust-lang/crates.io-index" 587 | checksum = "052be0f94026e6cbc75cdefc9bae13fd6052cdcaf532fa6c45e7ae33a1e6c810" 588 | dependencies = [ 589 | "quote", 590 | "wasm-bindgen-macro-support", 591 | ] 592 | 593 | [[package]] 594 | name = "wasm-bindgen-macro-support" 595 | version = "0.2.83" 596 | source = "registry+https://github.com/rust-lang/crates.io-index" 597 | checksum = "07bc0c051dc5f23e307b13285f9d75df86bfdf816c5721e573dec1f9b8aa193c" 598 | dependencies = [ 599 | "proc-macro2", 600 | "quote", 601 | "syn", 602 | "wasm-bindgen-backend", 603 | "wasm-bindgen-shared", 604 | ] 605 | 606 | [[package]] 607 | name = "wasm-bindgen-shared" 608 | version = "0.2.83" 609 | source = "registry+https://github.com/rust-lang/crates.io-index" 610 | checksum = "1c38c045535d93ec4f0b4defec448e4291638ee608530863b1e2ba115d4fff7f" 611 | 612 | [[package]] 613 | name = "web-sys" 614 | version = "0.3.60" 615 | source = "registry+https://github.com/rust-lang/crates.io-index" 616 | checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" 617 | dependencies = [ 618 | "js-sys", 619 | "wasm-bindgen", 620 | ] 621 | 622 | [[package]] 623 | name = "winapi" 624 | version = "0.3.9" 625 | source = "registry+https://github.com/rust-lang/crates.io-index" 626 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 627 | dependencies = [ 628 | "winapi-i686-pc-windows-gnu", 629 | "winapi-x86_64-pc-windows-gnu", 630 | ] 
631 | 632 | [[package]] 633 | name = "winapi-i686-pc-windows-gnu" 634 | version = "0.4.0" 635 | source = "registry+https://github.com/rust-lang/crates.io-index" 636 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 637 | 638 | [[package]] 639 | name = "winapi-util" 640 | version = "0.1.5" 641 | source = "registry+https://github.com/rust-lang/crates.io-index" 642 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 643 | dependencies = [ 644 | "winapi", 645 | ] 646 | 647 | [[package]] 648 | name = "winapi-x86_64-pc-windows-gnu" 649 | version = "0.4.0" 650 | source = "registry+https://github.com/rust-lang/crates.io-index" 651 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 652 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cedarwood" 3 | description = "efficiently-updatable double-array trie in Rust (ported from cedar)" 4 | repository = "https://github.com/MnO2/cedarwood" 5 | version = "0.4.6" 6 | license = "BSD-2-Clause" 7 | authors = ["Paul Meng "] 8 | readme = "README.md" 9 | keywords = ["trie", "cedar", "string", "search", "text"] 10 | categories = ["data-structures", "text-processing"] 11 | edition = "2018" 12 | exclude = ["/benches/**", "/.travis.yml"] 13 | 14 | [badges] 15 | travis-ci = { repository = "MnO2/cedarwood" } 16 | codecov = { repository = "MnO2/cedarwood" } 17 | 18 | [features] 19 | default = [] 20 | reduced-trie = [] 21 | 22 | [dev-dependencies] 23 | criterion = "0.4.0" 24 | rand = "0.8.4" 25 | 26 | [[bench]] 27 | name = "cedarwood_benchmark" 28 | harness = false 29 | path = "./benches/cedarwood_benchmark.rs" 30 | required-features = [] 31 | 32 | [package.metadata.docs.rs] 33 | all-features = true 34 | 35 | [dependencies] 36 | smallvec = { version = "1.6.1", features = ["union"] } 37 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2014, Naoki Yoshinaga, Paul Meng 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the 14 | distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cedarwood 2 | 3 | Efficiently-updatable double-array trie in Rust (ported from cedar). 
This library is covered by a reasonable number of randomized tests, but it is still considered beta because it has not yet been tested in a high-pressure production environment. Please let me know if you have real-world use cases that help demonstrate its stability. 4 | 5 | [![Build Status](https://travis-ci.com/MnO2/cedarwood.svg?branch=master)](https://travis-ci.org/MnO2/cedarwood) 6 | [![codecov](https://codecov.io/gh/MnO2/cedarwood/branch/master/graph/badge.svg)](https://codecov.io/gh/MnO2/cedarwood) 7 | [![Crates.io](https://img.shields.io/crates/v/cedarwood.svg)](https://crates.io/crates/cedarwood) 8 | [![docs.rs](https://docs.rs/cedarwood/badge.svg)](https://docs.rs/cedarwood/) 9 | 10 | ## Installation 11 | 12 | Add it to your `Cargo.toml`: 13 | 14 | ```toml 15 | [dependencies] 16 | cedarwood = "0.4" 17 | ``` 18 | 19 | then you are good to go. If you are using Rust 2015, you also have to add `extern crate cedarwood;` to your crate root. 20 | 21 | ## Example 22 | 23 | ```rust 24 | let dict = vec![ 25 | "a", 26 | "ab", 27 | "abc", 28 | "アルゴリズム", 29 | "データ", 30 | "構造", 31 | "网", 32 | "网球", 33 | "网球拍", 34 | "中", 35 | "中华", 36 | "中华人民", 37 | "中华人民共和国", 38 | ]; 39 | let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect(); 40 | let mut cedar = Cedar::new(); 41 | cedar.build(&key_values); 42 | 43 | let result: Vec<i32> = cedar.common_prefix_search("abcdefg").iter().map(|x| x.0).collect(); 44 | assert_eq!(vec![0, 1, 2], result); 45 | 46 | let result: Vec<i32> = cedar 47 | .common_prefix_search("网球拍卖会") 48 | .iter() 49 | .map(|x| x.0) 50 | .collect(); 51 | assert_eq!(vec![6, 7, 8], result); 52 | 53 | let result: Vec<i32> = cedar 54 | .common_prefix_search("中华人民共和国") 55 | .iter() 56 | .map(|x| x.0) 57 | .collect(); 58 | assert_eq!(vec![9, 10, 11, 12], result); 59 | 60 | let result: Vec<i32> = cedar 61 | .common_prefix_search("データ構造とアルゴリズム") 62 | .iter() 63 | .map(|x| x.0) 64 | .collect(); 65 | assert_eq!(vec![4], result); 66 | ``` 67 | 68 | ## To run benchmark tests 69 | 70 | 
```bash 71 | cargo bench 72 | ``` 73 | 74 | ## License 75 | 76 | This work is released under the BSD-2 license, following the original license of C++ cedar. A copy of the license is provided in the LICENSE file. 77 | 78 | ## Reference 79 | 80 | * [cedar - C++ implementation of efficiently-updatable double-array trie](http://www.tkl.iis.u-tokyo.ac.jp/~ynaga/cedar/) 81 | 82 | 83 | -------------------------------------------------------------------------------- /benches/cedarwood_benchmark.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | extern crate cedarwood; 4 | 5 | use cedarwood::Cedar; 6 | use criterion::Criterion; 7 | 8 | fn build_cedar() -> Cedar { 9 | let dict = vec![ 10 | "a", 11 | "ab", 12 | "abc", 13 | "アルゴリズム", 14 | "データ", 15 | "構造", 16 | "网", 17 | "网球", 18 | "网球拍", 19 | "中", 20 | "中华", 21 | "中华人民", 22 | "中华人民共和国", 23 | ]; 24 | let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect(); 25 | let mut cedar = Cedar::new(); 26 | cedar.build(&key_values); 27 | cedar 28 | } 29 | 30 | fn bench_cedar_build() { 31 | let _cedar = build_cedar(); 32 | } 33 | 34 | fn bench_exact_match_search() { 35 | let cedar = build_cedar(); 36 | let _ret = cedar.exact_match_search("中华人民"); 37 | } 38 | 39 | fn bench_common_prefix_search() { 40 | let cedar = build_cedar(); 41 | let _ret = cedar.common_prefix_search("中华人民"); 42 | } 43 | 44 | fn bench_common_prefix_predict() { 45 | let cedar = build_cedar(); 46 | let _ret = cedar.common_prefix_predict("中"); 47 | } 48 | 49 | fn criterion_benchmark(c: &mut Criterion) { 50 | c.bench_function("cedar build", |b| b.iter(bench_cedar_build)); 51 | c.bench_function("cedar exact_match_search", |b| b.iter(bench_exact_match_search)); 52 | c.bench_function("cedar common_prefix_search", |b| b.iter(bench_common_prefix_search)); 53 | c.bench_function("cedar common_prefix_predict", |b| b.iter(bench_common_prefix_predict)); 54 
| } 55 | 56 | criterion_group!(benches, criterion_benchmark); 57 | criterion_main!(benches); 58 | -------------------------------------------------------------------------------- /benches/cpp/bench_cedar.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include // for ternary search tree 7 | #include 8 | #include 9 | #include 10 | 11 | // static const 12 | static const size_t BUFFER_SIZE = 1 << 16; 13 | 14 | #define KEY_SEP '\n' 15 | inline char *find_sep(char *p) 16 | { 17 | while (*p != '\n') 18 | ++p; 19 | *p = '\0'; 20 | return p; 21 | } 22 | 23 | typedef cedar::da cedar_t; 24 | 25 | template 26 | inline T *create() { return new T(); } 27 | 28 | template 29 | inline void destroy(T *t) { delete t; } 30 | 31 | size_t read_data(const char *file, char *&data) 32 | { 33 | int fd = ::open(file, O_RDONLY); 34 | if (fd < 0) 35 | { 36 | std::fprintf(stderr, "no such file: %s\n", file); 37 | std::exit(1); 38 | } 39 | size_t size = static_cast(::lseek(fd, 0L, SEEK_END)); 40 | data = new char[size]; 41 | ::lseek(fd, 0L, SEEK_SET); 42 | ::read(fd, data, size); 43 | ::close(fd); 44 | return size; 45 | } 46 | 47 | void insert_key(cedar_t *t, const char *key, size_t len, int n) 48 | { 49 | t->update(key, len) = n; 50 | } 51 | 52 | bool lookup_key(cedar_t *t, const char *key, size_t len) 53 | { 54 | return t->exactMatchSearch(key, len) >= 0; 55 | } 56 | 57 | template 58 | void insert(T *t, int fd, int &n) 59 | { 60 | char data[BUFFER_SIZE]; 61 | char *start(data), *end(data), *tail(data + BUFFER_SIZE - 1), *tail_(data); 62 | while ((tail_ = end + ::read(fd, end, tail - end)) != end) 63 | { 64 | for (*tail_ = KEY_SEP; (end = find_sep(end)) != tail_; start = ++end) 65 | insert_key(t, start, end - start, ++n); 66 | std::memmove(data, start, tail_ - start); 67 | end = data + (tail_ - start); 68 | start = data; 69 | } 70 | } 71 | 72 | // lookup 73 | template 74 | void lookup(T 
*t, char *data, size_t size, int &n_, int &n) 75 | { 76 | for (char *start(data), *end(data), *tail(data + size); 77 | end != tail; start = ++end) 78 | { 79 | end = find_sep(end); 80 | if (lookup_key(t, start, end - start)) 81 | ++n_; 82 | ++n; 83 | } 84 | } 85 | 86 | template 87 | void bench(const char *keys, const char *queries, const char *label) 88 | { 89 | std::fprintf(stderr, "---- %-25s --------------------------\n", label); 90 | T *t = create(); 91 | struct timeval st, et; 92 | { 93 | int fd = ::open(keys, O_RDONLY); 94 | if (fd < 0) 95 | { 96 | std::fprintf(stderr, "no such file: %s\n", keys); 97 | std::exit(1); 98 | } 99 | // build trie 100 | int n = 0; 101 | ::gettimeofday(&st, NULL); 102 | insert(t, fd, n); 103 | ::gettimeofday(&et, NULL); 104 | double elapsed = (et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6; 105 | std::fprintf(stderr, "%-20s %.2f sec (%.2f nsec per key)\n", 106 | "Time to insert:", elapsed, elapsed * 1e9 / n); 107 | std::fprintf(stderr, "%-20s %d\n\n", "Words:", n); 108 | ::close(fd); 109 | } 110 | if (std::strcmp(queries, "-") != 0) 111 | { 112 | // load data 113 | char *data = 0; 114 | const size_t size = read_data(queries, data); 115 | // search 116 | int n(0), n_(0); 117 | ::gettimeofday(&st, NULL); 118 | lookup(t, data, size, n_, n); 119 | ::gettimeofday(&et, NULL); 120 | double elapsed = (et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6; 121 | std::fprintf(stderr, "%-20s %.2f sec (%.2f nsec per key)\n", 122 | "Time to search:", elapsed, elapsed * 1e9 / n); 123 | std::fprintf(stderr, "%-20s %d\n", "Words:", n); 124 | std::fprintf(stderr, "%-20s %d\n", "Found:", n_); 125 | delete[] data; 126 | } 127 | destroy(t); 128 | } 129 | 130 | int main(int argc, char **argv) 131 | { 132 | if (argc < 3) 133 | { 134 | std::fprintf(stderr, "Usage: %s keys queries\n", argv[0]); 135 | std::exit(1); 136 | } 137 | bench(argv[1], argv[2], "cedar"); 138 | } 139 | 
-------------------------------------------------------------------------------- /benches/macro-benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | **/*.rs.bk 3 | .vscode 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /benches/macro-benchmark/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "macro-benchmark" 3 | version = "0.1.0" 4 | authors = ["Paul Meng "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | cedarwood = { path = "../../" } 9 | -------------------------------------------------------------------------------- /benches/macro-benchmark/src/main.rs: -------------------------------------------------------------------------------- 1 | use cedarwood::Cedar; 2 | use std::fs::File; 3 | use std::io::{self, BufRead, BufReader}; 4 | use std::env; 5 | use std::time; 6 | 7 | struct IndexBuilder {} 8 | 9 | impl IndexBuilder { 10 | pub fn new() -> Self { 11 | IndexBuilder {} 12 | } 13 | 14 | // Require the dictionary to be sorted in lexicographical order 15 | pub fn build(&mut self, dict: &mut R) -> io::Result { 16 | let mut buf = String::new(); 17 | let mut records: Vec<(String, usize, String)> = Vec::new(); 18 | 19 | while dict.read_line(&mut buf)? 
> 0 { 20 | { 21 | let parts: Vec<&str> = buf.trim().split_whitespace().collect(); 22 | if parts.is_empty() { 23 | continue; 24 | } 25 | 26 | let word = parts[0]; 27 | let freq = parts.get(1).map(|x| x.parse::().unwrap()).unwrap_or(0); 28 | let tag = parts.get(2).cloned().unwrap_or(""); 29 | 30 | records.push((String::from(word), freq, String::from(tag))); 31 | } 32 | buf.clear(); 33 | } 34 | 35 | let dict: Vec<&str> = records.iter().map(|n| n.0.as_ref()).collect(); 36 | let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect(); 37 | 38 | let now = time::Instant::now(); 39 | let mut cedar = Cedar::new(); 40 | cedar.build(&key_values); 41 | println!("{} ms", now.elapsed().as_millis()); 42 | 43 | Ok(cedar) 44 | } 45 | } 46 | 47 | pub fn query(dict: &mut R, cedar: &Cedar) -> io::Result<()> { 48 | let mut buf = String::new(); 49 | let mut records: Vec<(String, usize, String)> = Vec::new(); 50 | 51 | while dict.read_line(&mut buf)? > 0 { 52 | { 53 | let parts: Vec<&str> = buf.trim().split_whitespace().collect(); 54 | if parts.is_empty() { 55 | continue; 56 | } 57 | 58 | let word = parts[0]; 59 | let freq = parts.get(1).map(|x| x.parse::().unwrap()).unwrap_or(0); 60 | let tag = parts.get(2).cloned().unwrap_or(""); 61 | 62 | records.push((String::from(word), freq, String::from(tag))); 63 | } 64 | buf.clear(); 65 | } 66 | 67 | let dict: Vec<&str> = records.iter().map(|n| n.0.as_ref()).collect(); 68 | let keys: Vec<&str> = dict.into_iter().enumerate().map(|(_, s)| s).collect(); 69 | 70 | let now = time::Instant::now(); 71 | for k in keys { 72 | cedar.exact_match_search(k); 73 | } 74 | println!("{} ms", now.elapsed().as_millis()); 75 | 76 | Ok(()) 77 | } 78 | 79 | fn main() -> io::Result<()> { 80 | let args: Vec = env::args().collect(); 81 | 82 | if args.len() < 2 { 83 | eprintln!("bench "); 84 | std::process::exit(1); 85 | } 86 | 87 | let f = File::open(&args[1])?; 88 | let mut buf = BufReader::new(f); 89 | let cedar = 
IndexBuilder::new().build(&mut buf).unwrap(); 90 | 91 | let f = File::open(&args[2])?; 92 | let mut buf = BufReader::new(f); 93 | query(&mut buf, &cedar)?; 94 | 95 | Ok(()) 96 | } 97 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 120 2 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Efficiently-updatable double-array trie in Rust (ported from cedar). 2 | //! 3 | //! Add it to your `Cargo.toml`: 4 | //! 5 | //! ```toml 6 | //! [dependencies] 7 | //! cedarwood = "0.4" 8 | //! ``` 9 | //! 10 | //! then you are good to go. If you are using Rust 2015 you have to `extern crate cedarwood` to your crate root as well. 11 | //! 12 | //! ## Example 13 | //! 14 | //! ```rust 15 | //! use cedarwood::Cedar; 16 | //! 17 | //! let dict = vec![ 18 | //! "a", 19 | //! "ab", 20 | //! "abc", 21 | //! "アルゴリズム", 22 | //! "データ", 23 | //! "構造", 24 | //! "网", 25 | //! "网球", 26 | //! "网球拍", 27 | //! "中", 28 | //! "中华", 29 | //! "中华人民", 30 | //! "中华人民共和国", 31 | //! ]; 32 | //! let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect(); 33 | //! let mut cedar = Cedar::new(); 34 | //! cedar.build(&key_values); 35 | //! 36 | //! let result: Vec = cedar.common_prefix_search("abcdefg").unwrap().iter().map(|x| x.0).collect(); 37 | //! assert_eq!(vec![0, 1, 2], result); 38 | //! 39 | //! let result: Vec = cedar 40 | //! .common_prefix_search("网球拍卖会") 41 | //! .unwrap() 42 | //! .iter() 43 | //! .map(|x| x.0) 44 | //! .collect(); 45 | //! assert_eq!(vec![6, 7, 8], result); 46 | //! 47 | //! let result: Vec = cedar 48 | //! .common_prefix_search("中华人民共和国") 49 | //! .unwrap() 50 | //! .iter() 51 | //! .map(|x| x.0) 52 | //! .collect(); 53 | //! 
assert_eq!(vec![9, 10, 11, 12], result); 54 | //! 55 | //! let result: Vec = cedar 56 | //! .common_prefix_search("データ構造とアルゴリズム") 57 | //! .unwrap() 58 | //! .iter() 59 | //! .map(|x| x.0) 60 | //! .collect(); 61 | //! assert_eq!(vec![4], result); 62 | //! ``` 63 | 64 | use smallvec::SmallVec; 65 | use std::fmt; 66 | 67 | /// NInfo stores the information about the trie 68 | #[derive(Debug, Default, Clone)] 69 | struct NInfo { 70 | sibling: u8, // the index of right sibling, it is 0 if it doesn't have a sibling. 71 | child: u8, // the index of the first child 72 | } 73 | 74 | /// Node contains the array of `base` and `check` as specified in the paper: "An efficient implementation of trie structures" 75 | /// https://dl.acm.org/citation.cfm?id=146691 76 | #[derive(Debug, Default, Clone)] 77 | struct Node { 78 | base_: i32, // if it is a negative value, then it stores the value of previous index that is free. 79 | check: i32, // if it is a negative value, then it stores the value of next index that is free. 80 | } 81 | 82 | impl Node { 83 | #[inline] 84 | fn base(&self) -> i32 { 85 | #[cfg(feature = "reduced-trie")] 86 | return -(self.base_ + 1); 87 | #[cfg(not(feature = "reduced-trie"))] 88 | return self.base_; 89 | } 90 | } 91 | 92 | /// Block stores the linked-list pointers and the stats info for blocks. 93 | #[derive(Debug, Clone)] 94 | struct Block { 95 | prev: i32, // previous block's index, 3 bytes width 96 | next: i32, // next block's index, 3 bytes width 97 | num: i16, // the number of slots that is free, the range is 0-256 98 | reject: i16, // a heuristic number to make the search for free space faster, it is the minimum number of iteration in each trie node it has to try before we can conclude that we can reject this block. If the number of kids for the block we are looking for is less than this number then this block is worthy of searching. 99 | trial: i32, // the number of times this block has been probed by `find_places` for the free block. 
100 | e_head: i32, // the index of the first empty elemenet in this block 101 | } 102 | 103 | impl Block { 104 | pub fn new() -> Self { 105 | Block { 106 | prev: 0, 107 | next: 0, 108 | num: 256, // each of block has 256 free slots at the beginning 109 | reject: 257, // initially every block need to be fully iterated through so that we can reject it to be unusable. 110 | trial: 0, 111 | e_head: 0, 112 | } 113 | } 114 | } 115 | 116 | /// Blocks are marked as either of three categories, so that we can quickly decide if we can 117 | /// allocate it for use or not. 118 | enum BlockType { 119 | Open, // The block has spaces more than 1. 120 | Closed, // The block is only left with one free slot 121 | Full, // The block's slots are fully used. 122 | } 123 | 124 | /// `Cedar` holds all of the information about double array trie. 125 | #[derive(Clone)] 126 | pub struct Cedar { 127 | array: Vec, // storing the `base` and `check` info from the original paper. 128 | n_infos: Vec, 129 | blocks: Vec, 130 | reject: Vec, 131 | blocks_head_full: i32, // the index of the first 'Full' block, 0 means no 'Full' block 132 | blocks_head_closed: i32, // the index of the first 'Closed' block, 0 means no ' Closed' block 133 | blocks_head_open: i32, // the index of the first 'Open' block, 0 means no 'Open' block 134 | capacity: usize, 135 | size: usize, 136 | ordered: bool, 137 | max_trial: i32, // the parameter for cedar, it could be tuned for more, but the default is 1. 
138 | } 139 | 140 | impl fmt::Debug for Cedar { 141 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 142 | write!(f, "Cedar(size={}, ordered={})", self.size, self.ordered) 143 | } 144 | } 145 | 146 | #[allow(dead_code)] 147 | const CEDAR_VALUE_LIMIT: i32 = std::i32::MAX - 1; 148 | const CEDAR_NO_VALUE: i32 = -1; 149 | 150 | /// Iterator for `common_prefix_search` 151 | #[derive(Clone)] 152 | pub struct PrefixIter<'a> { 153 | cedar: &'a Cedar, 154 | key: &'a [u8], 155 | from: usize, 156 | i: usize, 157 | } 158 | 159 | impl<'a> Iterator for PrefixIter<'a> { 160 | type Item = (i32, usize); 161 | 162 | fn size_hint(&self) -> (usize, Option) { 163 | (0, Some(self.key.len())) 164 | } 165 | 166 | fn next(&mut self) -> Option { 167 | while self.i < self.key.len() { 168 | if let Some(value) = self.cedar.find(&self.key[self.i..=self.i], &mut self.from) { 169 | if value == CEDAR_NO_VALUE { 170 | self.i += 1; 171 | continue; 172 | } else { 173 | let result = Some((value, self.i)); 174 | self.i += 1; 175 | return result; 176 | } 177 | } else { 178 | break; 179 | } 180 | } 181 | 182 | None 183 | } 184 | } 185 | 186 | /// Iterator for `common_prefix_predict` 187 | #[derive(Clone)] 188 | pub struct PrefixPredictIter<'a> { 189 | cedar: &'a Cedar, 190 | key: &'a [u8], 191 | from: usize, 192 | p: usize, 193 | root: usize, 194 | value: Option, 195 | } 196 | 197 | impl<'a> PrefixPredictIter<'a> { 198 | fn next_until_none(&mut self) -> Option<(i32, usize)> { 199 | #[allow(clippy::never_loop)] 200 | while let Some(value) = self.value { 201 | let result = (value, self.p); 202 | 203 | let (v_, from_, p_) = self.cedar.next(self.from, self.p, self.root); 204 | self.from = from_; 205 | self.p = p_; 206 | self.value = v_; 207 | 208 | return Some(result); 209 | } 210 | 211 | None 212 | } 213 | } 214 | 215 | impl<'a> Iterator for PrefixPredictIter<'a> { 216 | type Item = (i32, usize); 217 | 218 | fn next(&mut self) -> Option { 219 | if self.from == 0 && self.p == 0 { 220 | // To locate 
the prefix's position first, if it doesn't exist then that means we 221 | // don't have do anything. `from` would serve as the cursor. 222 | if self.cedar.find(self.key, &mut self.from).is_some() { 223 | self.root = self.from; 224 | 225 | let (v_, from_, p_) = self.cedar.begin(self.from, self.p); 226 | self.from = from_; 227 | self.p = p_; 228 | self.value = v_; 229 | 230 | self.next_until_none() 231 | } else { 232 | None 233 | } 234 | } else { 235 | self.next_until_none() 236 | } 237 | } 238 | } 239 | 240 | #[allow(clippy::cast_lossless)] 241 | impl Cedar { 242 | /// Initialize the Cedar for further use. 243 | #[allow(clippy::new_without_default)] 244 | pub fn new() -> Self { 245 | let mut array: Vec = Vec::with_capacity(256); 246 | let n_infos: Vec = (0..256).map(|_| Default::default()).collect(); 247 | let mut blocks: Vec = vec![Block::new(); 1]; 248 | let reject: Vec = (0..=256).map(|i| i + 1).collect(); 249 | 250 | #[cfg(feature = "reduced-trie")] 251 | array.push(Node { base_: -1, check: -1 }); 252 | #[cfg(not(feature = "reduced-trie"))] 253 | array.push(Node { base_: 0, check: -1 }); 254 | 255 | for i in 1..256 { 256 | // make `base_` point to the previous element, and make `check` point to the next element 257 | array.push(Node { 258 | base_: -(i - 1), 259 | check: -(i + 1), 260 | }) 261 | } 262 | 263 | // make them link as a cyclic doubly-linked list 264 | array[1].base_ = -255; 265 | array[255].check = -1; 266 | 267 | blocks[0].e_head = 1; 268 | 269 | Cedar { 270 | array, 271 | n_infos, 272 | blocks, 273 | reject, 274 | blocks_head_full: 0, 275 | blocks_head_closed: 0, 276 | blocks_head_open: 0, 277 | capacity: 256, 278 | size: 256, 279 | ordered: true, 280 | max_trial: 1, 281 | } 282 | } 283 | 284 | /// Build the double array trie from the given key value pairs 285 | #[allow(dead_code)] 286 | pub fn build(&mut self, key_values: &[(&str, i32)]) { 287 | for (key, value) in key_values { 288 | self.update(key, *value); 289 | } 290 | } 291 | 292 | /// Update 
the key for the value, it is public interface that works on &str 293 | pub fn update(&mut self, key: &str, value: i32) { 294 | let from = 0; 295 | let pos = 0; 296 | self.update_(key.as_bytes(), value, from, pos); 297 | } 298 | 299 | // Update the key for the value, it is internal interface that works on &[u8] and cursor. 300 | fn update_(&mut self, key: &[u8], value: i32, mut from: usize, mut pos: usize) -> i32 { 301 | if from == 0 && key.is_empty() { 302 | panic!("failed to insert zero-length key"); 303 | } 304 | 305 | while pos < key.len() { 306 | #[cfg(feature = "reduced-trie")] 307 | { 308 | let val_ = self.array[from].base_; 309 | if val_ >= 0 && val_ != CEDAR_VALUE_LIMIT { 310 | let to = self.follow(from, 0); 311 | self.array[to as usize].base_ = val_; 312 | } 313 | } 314 | 315 | from = self.follow(from, key[pos]) as usize; 316 | pos += 1; 317 | } 318 | 319 | #[cfg(feature = "reduced-trie")] 320 | let to = if self.array[from].base_ >= 0 { 321 | from as i32 322 | } else { 323 | self.follow(from, 0) 324 | }; 325 | 326 | #[cfg(feature = "reduced-trie")] 327 | { 328 | if self.array[to as usize].base_ == CEDAR_VALUE_LIMIT { 329 | self.array[to as usize].base_ = 0; 330 | } 331 | } 332 | 333 | #[cfg(not(feature = "reduced-trie"))] 334 | let to = self.follow(from, 0); 335 | 336 | self.array[to as usize].base_ = value; 337 | self.array[to as usize].base_ 338 | } 339 | 340 | // To move in the trie by following the `label`, and insert the node if the node is not there, 341 | // it is used by the `update` to populate the trie. 
342 | #[inline] 343 | fn follow(&mut self, from: usize, label: u8) -> i32 { 344 | let base = self.array[from].base(); 345 | 346 | #[allow(unused_assignments)] 347 | let mut to = 0; 348 | 349 | // the node is not there 350 | if base < 0 || self.array[(base ^ (label as i32)) as usize].check < 0 { 351 | // allocate a e node 352 | to = self.pop_e_node(base, label, from as i32); 353 | let branch: i32 = to ^ (label as i32); 354 | 355 | // maintain the info in ninfo 356 | self.push_sibling(from, branch, label, base >= 0); 357 | } else { 358 | // the node is already there and the ownership is not `from`, therefore a conflict. 359 | to = base ^ (label as i32); 360 | if self.array[to as usize].check != (from as i32) { 361 | // call `resolve` to relocate. 362 | to = self.resolve(from, base, label); 363 | } 364 | } 365 | 366 | to 367 | } 368 | 369 | // Find key from double array trie, with `from` as the cursor to traverse the nodes. 370 | fn find(&self, key: &[u8], from: &mut usize) -> Option { 371 | #[allow(unused_assignments)] 372 | let mut to: usize = 0; 373 | let mut pos = 0; 374 | 375 | // recursively matching the key. 376 | while pos < key.len() { 377 | #[cfg(feature = "reduced-trie")] 378 | { 379 | if self.array[*from].base_ >= 0 { 380 | break; 381 | } 382 | } 383 | 384 | to = (self.array[*from].base() ^ (key[pos] as i32)) as usize; 385 | if self.array[to as usize].check != (*from as i32) { 386 | return None; 387 | } 388 | 389 | *from = to; 390 | pos += 1; 391 | } 392 | 393 | #[cfg(feature = "reduced-trie")] 394 | { 395 | if self.array[*from].base_ >= 0 { 396 | if pos == key.len() { 397 | return Some(self.array[*from].base_); 398 | } else { 399 | return None; 400 | } 401 | } 402 | } 403 | 404 | // return the value of the node if `check` is correctly marked fpr the ownership, otherwise 405 | // it means no value is stored. 
406 | let n = &self.array[(self.array[*from].base()) as usize]; 407 | if n.check != (*from as i32) { 408 | Some(CEDAR_NO_VALUE) 409 | } else { 410 | Some(n.base_) 411 | } 412 | } 413 | 414 | /// Delete the key from the trie, the public interface that works on &str 415 | pub fn erase(&mut self, key: &str) { 416 | self.erase_(key.as_bytes()) 417 | } 418 | 419 | // Delete the key from the trie, the internal interface that works on &[u8] 420 | fn erase_(&mut self, key: &[u8]) { 421 | let mut from = 0; 422 | 423 | // move the cursor to the right place and use erase__ to delete it. 424 | if let Some(v) = self.find(key, &mut from) { 425 | if v != CEDAR_NO_VALUE { 426 | self.erase__(from); 427 | } 428 | } 429 | } 430 | 431 | fn erase__(&mut self, mut from: usize) { 432 | #[cfg(feature = "reduced-trie")] 433 | let mut e: i32 = if self.array[from].base_ >= 0 { 434 | from as i32 435 | } else { 436 | self.array[from].base() 437 | }; 438 | 439 | #[cfg(feature = "reduced-trie")] 440 | { 441 | from = self.array[e as usize].check as usize; 442 | } 443 | 444 | #[cfg(not(feature = "reduced-trie"))] 445 | let mut e = self.array[from].base(); 446 | 447 | #[allow(unused_assignments)] 448 | let mut has_sibling = false; 449 | loop { 450 | let n = self.array[from].clone(); 451 | has_sibling = self.n_infos[(n.base() ^ (self.n_infos[from].child as i32)) as usize].sibling != 0; 452 | 453 | // if the node has siblings, then remove `e` from the sibling. 454 | if has_sibling { 455 | self.pop_sibling(from as i32, n.base(), (n.base() ^ e) as u8); 456 | } 457 | 458 | // maintain the data structures. 459 | self.push_e_node(e); 460 | e = from as i32; 461 | 462 | // traverse to the parent. 463 | from = self.array[from].check as usize; 464 | 465 | // if it has sibling then this layer has more than one nodes, then we are done. 466 | if has_sibling { 467 | break; 468 | } 469 | } 470 | } 471 | 472 | /// To check if `key` is in the dictionary. 
473 | pub fn exact_match_search(&self, key: &str) -> Option<(i32, usize, usize)> { 474 | let key = key.as_bytes(); 475 | let mut from = 0; 476 | 477 | if let Some(value) = self.find(key, &mut from) { 478 | if value == CEDAR_NO_VALUE { 479 | return None; 480 | } 481 | 482 | Some((value, key.len(), from)) 483 | } else { 484 | None 485 | } 486 | } 487 | 488 | /// To return an iterator to iterate through the common prefix in the dictionary with the `key` passed in. 489 | pub fn common_prefix_iter<'a>(&'a self, key: &'a str) -> PrefixIter<'a> { 490 | let key = key.as_bytes(); 491 | 492 | PrefixIter { 493 | cedar: self, 494 | key, 495 | from: 0, 496 | i: 0, 497 | } 498 | } 499 | 500 | /// To return the collection of the common prefix in the dictionary with the `key` passed in. 501 | pub fn common_prefix_search(&self, key: &str) -> Option> { 502 | self.common_prefix_iter(key).map(Some).collect() 503 | } 504 | 505 | /// To return an iterator to iterate through the list of words in the dictionary that has `key` as their prefix. 506 | pub fn common_prefix_predict_iter<'a>(&'a self, key: &'a str) -> PrefixPredictIter<'a> { 507 | let key = key.as_bytes(); 508 | 509 | PrefixPredictIter { 510 | cedar: self, 511 | key, 512 | from: 0, 513 | p: 0, 514 | root: 0, 515 | value: None, 516 | } 517 | } 518 | 519 | /// To return the list of words in the dictionary that has `key` as their prefix. 520 | pub fn common_prefix_predict(&self, key: &str) -> Option> { 521 | self.common_prefix_predict_iter(key).map(Some).collect() 522 | } 523 | 524 | // To get the cursor of the first leaf node starting by `from` 525 | fn begin(&self, mut from: usize, mut p: usize) -> (Option, usize, usize) { 526 | let base = self.array[from].base(); 527 | let mut c = self.n_infos[from].child; 528 | 529 | if from == 0 { 530 | c = self.n_infos[(base ^ (c as i32)) as usize].sibling; 531 | 532 | // if no sibling couldn be found from the virtual root, then we are done. 
533 | if c == 0 { 534 | return (None, from, p); 535 | } 536 | } 537 | 538 | // recursively traversing down to look for the first leaf. 539 | while c != 0 { 540 | from = (self.array[from].base() ^ (c as i32)) as usize; 541 | c = self.n_infos[from].child; 542 | p += 1; 543 | } 544 | 545 | #[cfg(feature = "reduced-trie")] 546 | { 547 | if self.array[from].base_ >= 0 { 548 | return (Some(self.array[from].base_), from, p); 549 | } 550 | } 551 | 552 | // To return the value of the leaf. 553 | let v = self.array[(self.array[from].base() ^ (c as i32)) as usize].base_; 554 | (Some(v), from, p) 555 | } 556 | 557 | // To move the cursor from one leaf to the next for the common_prefix_predict. 558 | fn next(&self, mut from: usize, mut p: usize, root: usize) -> (Option, usize, usize) { 559 | #[allow(unused_assignments)] 560 | let mut c: u8 = 0; 561 | 562 | #[cfg(feature = "reduced-trie")] 563 | { 564 | if self.array[from].base_ < 0 { 565 | c = self.n_infos[(self.array[from].base()) as usize].sibling; 566 | } 567 | } 568 | #[cfg(not(feature = "reduced-trie"))] 569 | { 570 | c = self.n_infos[(self.array[from].base()) as usize].sibling; 571 | } 572 | 573 | // traversing up until there is a sibling or it has reached the root. 574 | while c == 0 && from != root { 575 | c = self.n_infos[from as usize].sibling; 576 | from = self.array[from as usize].check as usize; 577 | 578 | p -= 1; 579 | } 580 | 581 | if c != 0 { 582 | // it has a sibling so we leverage on `begin` to traverse the subtree down again. 583 | from = (self.array[from].base() ^ (c as i32)) as usize; 584 | let (v_, from_, p_) = self.begin(from, p + 1); 585 | (v_, from_, p_) 586 | } else { 587 | // no more work since we couldn't find anything. 588 | (None, from, p) 589 | } 590 | } 591 | 592 | // pop a block at idx from the linked-list of type `from`, specially handled if it is the last 593 | // one in the linked-list. 
594 | fn pop_block(&mut self, idx: i32, from: BlockType, last: bool) { 595 | let head: &mut i32 = match from { 596 | BlockType::Open => &mut self.blocks_head_open, 597 | BlockType::Closed => &mut self.blocks_head_closed, 598 | BlockType::Full => &mut self.blocks_head_full, 599 | }; 600 | 601 | if last { 602 | *head = 0; 603 | } else { 604 | let b = self.blocks[idx as usize].clone(); 605 | self.blocks[b.prev as usize].next = b.next; 606 | self.blocks[b.next as usize].prev = b.prev; 607 | 608 | if idx == *head { 609 | *head = b.next; 610 | } 611 | } 612 | } 613 | 614 | // return the block at idx to the linked-list of `to`, specially handled if the linked-list is 615 | // empty 616 | fn push_block(&mut self, idx: i32, to: BlockType, empty: bool) { 617 | let head: &mut i32 = match to { 618 | BlockType::Open => &mut self.blocks_head_open, 619 | BlockType::Closed => &mut self.blocks_head_closed, 620 | BlockType::Full => &mut self.blocks_head_full, 621 | }; 622 | 623 | if empty { 624 | self.blocks[idx as usize].next = idx; 625 | self.blocks[idx as usize].prev = idx; 626 | *head = idx; 627 | } else { 628 | self.blocks[idx as usize].prev = self.blocks[*head as usize].prev; 629 | self.blocks[idx as usize].next = *head; 630 | 631 | let t = self.blocks[*head as usize].prev; 632 | self.blocks[t as usize].next = idx; 633 | self.blocks[*head as usize].prev = idx; 634 | *head = idx; 635 | } 636 | } 637 | 638 | /// Reallocate more spaces so that we have more free blocks. 
639 | fn add_block(&mut self) -> i32 { 640 | if self.size == self.capacity { 641 | self.capacity += self.capacity; 642 | 643 | self.array.resize(self.capacity, Default::default()); 644 | self.n_infos.resize(self.capacity, Default::default()); 645 | self.blocks.resize(self.capacity >> 8, Block::new()); 646 | } 647 | 648 | self.blocks[self.size >> 8].e_head = self.size as i32; 649 | 650 | // make it a doubley linked list 651 | self.array[self.size] = Node { 652 | base_: -((self.size as i32) + 255), 653 | check: -((self.size as i32) + 1), 654 | }; 655 | 656 | for i in (self.size + 1)..(self.size + 255) { 657 | self.array[i] = Node { 658 | base_: -(i as i32 - 1), 659 | check: -(i as i32 + 1), 660 | }; 661 | } 662 | 663 | self.array[self.size + 255] = Node { 664 | base_: -((self.size as i32) + 254), 665 | check: -(self.size as i32), 666 | }; 667 | 668 | let is_empty = self.blocks_head_open == 0; 669 | let idx = (self.size >> 8) as i32; 670 | debug_assert!(self.blocks[idx as usize].num > 1); 671 | self.push_block(idx, BlockType::Open, is_empty); 672 | 673 | self.size += 256; 674 | 675 | ((self.size >> 8) - 1) as i32 676 | } 677 | 678 | // transfer the block at idx from the linked-list of `from` to the linked-list of `to`, 679 | // specially handle the case where the destination linked-list is empty. 680 | fn transfer_block(&mut self, idx: i32, from: BlockType, to: BlockType, to_block_empty: bool) { 681 | let is_last = idx == self.blocks[idx as usize].next; //it's the last one if the next points to itself 682 | let is_empty = to_block_empty && (self.blocks[idx as usize].num != 0); 683 | 684 | self.pop_block(idx, from, is_last); 685 | self.push_block(idx, to, is_empty); 686 | } 687 | 688 | /// Mark an edge `e` as used in a trie node. 
689 | fn pop_e_node(&mut self, base: i32, label: u8, from: i32) -> i32 { 690 | let e: i32 = if base < 0 { 691 | self.find_place() 692 | } else { 693 | base ^ (label as i32) 694 | }; 695 | 696 | let idx = e >> 8; 697 | let n = self.array[e as usize].clone(); 698 | 699 | self.blocks[idx as usize].num -= 1; 700 | // move the block at idx to the correct linked-list depending the free slots it still have. 701 | if self.blocks[idx as usize].num == 0 { 702 | if idx != 0 { 703 | self.transfer_block(idx, BlockType::Closed, BlockType::Full, self.blocks_head_full == 0); 704 | } 705 | } else { 706 | self.array[(-n.base_) as usize].check = n.check; 707 | self.array[(-n.check) as usize].base_ = n.base_; 708 | 709 | if e == self.blocks[idx as usize].e_head { 710 | self.blocks[idx as usize].e_head = -n.check; 711 | } 712 | 713 | if idx != 0 && self.blocks[idx as usize].num == 1 && self.blocks[idx as usize].trial != self.max_trial { 714 | self.transfer_block(idx, BlockType::Open, BlockType::Closed, self.blocks_head_closed == 0); 715 | } 716 | } 717 | 718 | #[cfg(feature = "reduced-trie")] 719 | { 720 | self.array[e as usize].base_ = CEDAR_VALUE_LIMIT; 721 | self.array[e as usize].check = from; 722 | if base < 0 { 723 | self.array[from as usize].base_ = -(e ^ (label as i32)) - 1; 724 | } 725 | } 726 | 727 | #[cfg(not(feature = "reduced-trie"))] 728 | { 729 | if label != 0 { 730 | self.array[e as usize].base_ = -1; 731 | } else { 732 | self.array[e as usize].base_ = 0; 733 | } 734 | self.array[e as usize].check = from; 735 | if base < 0 { 736 | self.array[from as usize].base_ = e ^ (label as i32); 737 | } 738 | } 739 | 740 | e 741 | } 742 | 743 | /// Mark an edge `e` as free in a trie node. 
744 | fn push_e_node(&mut self, e: i32) { 745 | let idx = e >> 8; 746 | self.blocks[idx as usize].num += 1; 747 | 748 | if self.blocks[idx as usize].num == 1 { 749 | self.blocks[idx as usize].e_head = e; 750 | self.array[e as usize] = Node { base_: -e, check: -e }; 751 | 752 | if idx != 0 { 753 | // Move the block from 'Full' to 'Closed' since it has one free slot now. 754 | self.transfer_block(idx, BlockType::Full, BlockType::Closed, self.blocks_head_closed == 0); 755 | } 756 | } else { 757 | let prev = self.blocks[idx as usize].e_head; 758 | 759 | let next = -self.array[prev as usize].check; 760 | 761 | // Insert to the edge immediately after the e_head 762 | self.array[e as usize] = Node { 763 | base_: -prev, 764 | check: -next, 765 | }; 766 | 767 | self.array[prev as usize].check = -e; 768 | self.array[next as usize].base_ = -e; 769 | 770 | // Move the block from 'Closed' to 'Open' since it has more than one free slot now. 771 | if self.blocks[idx as usize].num == 2 || self.blocks[idx as usize].trial == self.max_trial { 772 | debug_assert!(self.blocks[idx as usize].num > 1); 773 | if idx != 0 { 774 | self.transfer_block(idx, BlockType::Closed, BlockType::Open, self.blocks_head_open == 0); 775 | } 776 | } 777 | 778 | // Reset the trial stats 779 | self.blocks[idx as usize].trial = 0; 780 | } 781 | 782 | if self.blocks[idx as usize].reject < self.reject[self.blocks[idx as usize].num as usize] { 783 | self.blocks[idx as usize].reject = self.reject[self.blocks[idx as usize].num as usize]; 784 | } 785 | 786 | self.n_infos[e as usize] = Default::default(); 787 | } 788 | 789 | // push the `label` into the sibling chain 790 | fn push_sibling(&mut self, from: usize, base: i32, label: u8, has_child: bool) { 791 | let keep_order: bool = if self.ordered { 792 | label > self.n_infos[from].child 793 | } else { 794 | self.n_infos[from].child == 0 795 | }; 796 | 797 | let sibling: u8; 798 | { 799 | let mut c: &mut u8 = &mut self.n_infos[from as usize].child; 800 | if has_child 
&& keep_order { 801 | loop { 802 | let code = *c as i32; 803 | c = &mut self.n_infos[(base ^ code) as usize].sibling; 804 | 805 | if !(self.ordered && (*c != 0) && (*c < label)) { 806 | break; 807 | } 808 | } 809 | } 810 | sibling = *c; 811 | 812 | *c = label; 813 | } 814 | 815 | self.n_infos[(base ^ (label as i32)) as usize].sibling = sibling; 816 | } 817 | 818 | // remove the `label` from the sibling chain. 819 | #[allow(dead_code)] 820 | fn pop_sibling(&mut self, from: i32, base: i32, label: u8) { 821 | let mut c: *mut u8 = &mut self.n_infos[from as usize].child; 822 | unsafe { 823 | while *c != label { 824 | let code = *c as i32; 825 | c = &mut self.n_infos[(base ^ code) as usize].sibling; 826 | } 827 | 828 | let code = label as i32; 829 | *c = self.n_infos[(base ^ code) as usize].sibling; 830 | } 831 | } 832 | 833 | // Loop through the siblings to see which one reached the end first, which means it is the one 834 | // with smaller in children size, and we should try ti relocate the smaller one. 835 | fn consult(&self, base_n: i32, base_p: i32, mut c_n: u8, mut c_p: u8) -> bool { 836 | loop { 837 | c_n = self.n_infos[(base_n ^ (c_n as i32)) as usize].sibling; 838 | c_p = self.n_infos[(base_p ^ (c_p as i32)) as usize].sibling; 839 | 840 | if !(c_n != 0 && c_p != 0) { 841 | break; 842 | } 843 | } 844 | 845 | c_p != 0 846 | } 847 | 848 | // Collect the list of the children, and push the label as well if it is not terminal node. 
849 | fn set_child(&self, base: i32, mut c: u8, label: u8, not_terminal: bool) -> SmallVec<[u8; 256]> { 850 | let mut child: SmallVec<[u8; 256]> = SmallVec::new(); 851 | 852 | if c == 0 { 853 | child.push(c); 854 | c = self.n_infos[(base ^ (c as i32)) as usize].sibling; 855 | } 856 | 857 | if self.ordered { 858 | while c != 0 && c <= label { 859 | child.push(c); 860 | c = self.n_infos[(base ^ (c as i32)) as usize].sibling; 861 | } 862 | } 863 | 864 | if not_terminal { 865 | child.push(label); 866 | } 867 | 868 | while c != 0 { 869 | child.push(c); 870 | c = self.n_infos[(base ^ (c as i32)) as usize].sibling; 871 | } 872 | 873 | child 874 | } 875 | 876 | // For the case where only one free slot is needed 877 | fn find_place(&mut self) -> i32 { 878 | if self.blocks_head_closed != 0 { 879 | return self.blocks[self.blocks_head_closed as usize].e_head; 880 | } 881 | 882 | if self.blocks_head_open != 0 { 883 | return self.blocks[self.blocks_head_open as usize].e_head; 884 | } 885 | 886 | // the block is not enough, resize it and allocate it. 887 | self.add_block() << 8 888 | } 889 | 890 | // For the case where multiple free slots are needed. 891 | fn find_places(&mut self, child: &[u8]) -> i32 { 892 | let mut idx = self.blocks_head_open; 893 | 894 | // we still have available 'Open' blocks. 895 | if idx != 0 { 896 | debug_assert!(self.blocks[idx as usize].num > 1); 897 | let bz = self.blocks[self.blocks_head_open as usize].prev; 898 | let nc = child.len() as i16; 899 | 900 | loop { 901 | // only proceed if the free slots are more than the number of children. Also, we 902 | // save the minimal number of attempts to fail in the `reject`, it only worths to 903 | // try out this block if the number of children is less than that number. 
904 | if self.blocks[idx as usize].num >= nc && nc < self.blocks[idx as usize].reject { 905 | let mut e = self.blocks[idx as usize].e_head; 906 | loop { 907 | let base = e ^ (child[0] as i32); 908 | 909 | let mut i = 1; 910 | // iterate through the children to see if they are available: (check < 0) 911 | while self.array[(base ^ (child[i] as i32)) as usize].check < 0 { 912 | if i == child.len() - 1 { 913 | // we have found the available block. 914 | self.blocks[idx as usize].e_head = e; 915 | return e; 916 | } 917 | i += 1; 918 | } 919 | 920 | // we save the next free block's information in `check` 921 | e = -self.array[e as usize].check; 922 | if e == self.blocks[idx as usize].e_head { 923 | break; 924 | } 925 | } 926 | } 927 | 928 | // we broke out of the loop, that means we failed. We save the information in 929 | // `reject` for future pruning. 930 | self.blocks[idx as usize].reject = nc; 931 | if self.blocks[idx as usize].reject < self.reject[self.blocks[idx as usize].num as usize] { 932 | // put this stats into the global array of information as well. 933 | self.reject[self.blocks[idx as usize].num as usize] = self.blocks[idx as usize].reject; 934 | } 935 | 936 | let idx_ = self.blocks[idx as usize].next; 937 | 938 | self.blocks[idx as usize].trial += 1; 939 | 940 | // move this block to the 'Closed' block list since it has reached the max_trial 941 | if self.blocks[idx as usize].trial == self.max_trial { 942 | self.transfer_block(idx, BlockType::Open, BlockType::Closed, self.blocks_head_closed == 0); 943 | } 944 | 945 | // we have finsihed one round of this cyclic doubly-linked-list. 946 | if idx == bz { 947 | break; 948 | } 949 | 950 | // going to the next in this linked list group 951 | idx = idx_; 952 | } 953 | } 954 | 955 | self.add_block() << 8 956 | } 957 | 958 | // resolve the conflict by moving one of the the nodes to a free block. 
fn resolve(&mut self, mut from_n: usize, base_n: i32, label_n: u8) -> i32 {
    // `to_pn` is the slot the new label of `from_n` wants; it is currently occupied by a
    // child of another parent. One of the two sibling groups is moved to free slots, and the
    // index that ends up available for the new label is returned.
    let to_pn = base_n ^ i32::from(label_n);

    // the `base` and `from` for the conflicting one.
    let from_p = self.array[to_pn as usize].check;
    let base_p = self.array[from_p as usize].base();

    // whether to replace siblings of newly added
    let flag = self.consult(
        base_n,
        base_p,
        self.n_infos[from_n].child,
        self.n_infos[from_p as usize].child,
    );

    // collect the list of children for the block that we are going to relocate.
    let children = if flag {
        self.set_child(base_n, self.n_infos[from_n].child, label_n, true)
    } else {
        self.set_child(base_p, self.n_infos[from_p as usize].child, 255, false)
    };

    // decide which algorithm to allocate free block depending on the number of children we
    // have.
    let free_slot = if children.len() == 1 {
        self.find_place()
    } else {
        self.find_places(&children)
    };

    // The new base is chosen so that the first child lands exactly on the slot found above.
    let base = free_slot ^ i32::from(children[0]);

    // `from` / `base_` describe the parent whose children are being moved.
    let (from, base_) = if flag {
        (from_n as i32, base_n)
    } else {
        (from_p, base_p)
    };

    // The new label sorts first, so it becomes the parent's first child.
    if flag && children[0] == label_n {
        self.n_infos[from as usize].child = label_n;
    }

    #[cfg(feature = "reduced-trie")]
    {
        self.array[from as usize].base_ = -base - 1;
    }

    #[cfg(not(feature = "reduced-trie"))]
    {
        self.array[from as usize].base_ = base;
    }

    // the actual work for relocating the children
    for (i, &label) in children.iter().enumerate() {
        // `to` is the new slot of this child, `to_` the slot it used to occupy.
        let to = self.pop_e_node(base, label, from);
        let to_ = base_ ^ i32::from(label);

        // Rebuild the sibling chain in the new location; the last child terminates it.
        self.n_infos[to as usize].sibling = children.get(i + 1).copied().unwrap_or(0);

        // The newcomer has nothing to copy from its old position.
        if flag && to_ == to_pn {
            continue;
        }

        self.array[to as usize].base_ = self.array[to_ as usize].base_;

        #[cfg(feature = "reduced-trie")]
        let condition = self.array[to as usize].base_ < 0 && label != 0;
        #[cfg(not(feature = "reduced-trie"))]
        let condition = self.array[to as usize].base_ > 0 && label != 0;

        if condition {
            // The moved node has children of its own: carry over its first child and
            // re-point the `check` of every grandchild to the new parent position.
            let mut c = self.n_infos[to_ as usize].child;

            self.n_infos[to as usize].child = c;

            loop {
                let idx = (self.array[to as usize].base() ^ i32::from(c)) as usize;
                self.array[idx].check = to;
                c = self.n_infos[idx].sibling;

                if c == 0 {
                    break;
                }
            }
        }

        // The node we are inserting under was itself relocated: follow it.
        if !flag && to_ == (from_n as i32) {
            from_n = to as usize;
        }

        // clean up the space that was moved away from.
        if !flag && to_ == to_pn {
            // The vacated slot is immediately reused for the new label of `from_n`.
            self.push_sibling(from_n, to_pn ^ i32::from(label_n), label_n, true);
            self.n_infos[to_ as usize].child = 0;

            #[cfg(feature = "reduced-trie")]
            {
                self.array[to_ as usize].base_ = CEDAR_VALUE_LIMIT;
            }

            #[cfg(not(feature = "reduced-trie"))]
            {
                if label_n != 0 {
                    self.array[to_ as usize].base_ = -1;
                } else {
                    self.array[to_ as usize].base_ = 0;
                }
            }

            self.array[to_ as usize].check = from_n as i32;
        } else {
            // Any other vacated slot goes back to the free list.
            self.push_e_node(to_);
        }
    }

    // return the position that is free now.
    if flag {
        base ^ i32::from(label_n)
    } else {
        to_pn
    }
}
// Closes the enclosing `impl` block that was opened earlier in the file.
}

#[cfg(test)]
mod tests {
    use super::*;
    use rand::distributions::Alphanumeric;
    use rand::{thread_rng, Rng};
    use std::iter;

    #[test]
    fn test_insert_and_delete() {
        let dict = vec!["a"];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        let result = cedar.exact_match_search("ab").map(|x| x.0);
        assert_eq!(None, result);

        cedar.update("ab", 1);
        let result = cedar.exact_match_search("ab").map(|x| x.0);
        assert_eq!(Some(1), result);

        cedar.erase("ab");
        let result = cedar.exact_match_search("ab").map(|x| x.0);
        assert_eq!(None, result);

        cedar.update("abc", 2);
        let result = cedar.exact_match_search("abc").map(|x| x.0);
        assert_eq!(Some(2), result);

        cedar.erase("abc");
        let result = cedar.exact_match_search("abc").map(|x| x.0);
        assert_eq!(None, result);
    }

    #[test]
    fn test_common_prefix_search() {
        let dict = vec![
            "a",
            "ab",
            "abc",
            "アルゴリズム",
            "データ",
            "構造",
            "网",
            "网球",
            "网球拍",
            "中",
            "中华",
            "中华人民",
            "中华人民共和国",
        ];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        let result: Vec<i32> = cedar
            .common_prefix_search("abcdefg")
            .unwrap()
            .iter()
            .map(|x| x.0)
            .collect();
        assert_eq!(vec![0, 1, 2], result);

        let result: Vec<i32> = cedar
            .common_prefix_search("网球拍卖会")
            .unwrap()
            .iter()
            .map(|x| x.0)
            .collect();
        assert_eq!(vec![6, 7, 8], result);

        let result: Vec<i32> = cedar
            .common_prefix_search("中华人民共和国")
            .unwrap()
            .iter()
            .map(|x| x.0)
            .collect();
        assert_eq!(vec![9, 10, 11, 12], result);

        let result: Vec<i32> = cedar
            .common_prefix_search("データ構造とアルゴリズム")
            .unwrap()
            .iter()
            .map(|x| x.0)
            .collect();
        assert_eq!(vec![4], result);
    }

    #[test]
    fn test_common_prefix_iter() {
        let dict = vec![
            "a",
            "ab",
            "abc",
            "アルゴリズム",
            "データ",
            "構造",
            "网",
            "网球",
            "网球拍",
            "中",
            "中华",
            "中华人民",
            "中华人民共和国",
        ];

        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        let result: Vec<i32> = cedar.common_prefix_iter("abcdefg").map(|x| x.0).collect();
        assert_eq!(vec![0, 1, 2], result);

        let result: Vec<i32> = cedar.common_prefix_iter("网球拍卖会").map(|x| x.0).collect();
        assert_eq!(vec![6, 7, 8], result);

        let result: Vec<i32> = cedar.common_prefix_iter("中华人民共和国").map(|x| x.0).collect();
        assert_eq!(vec![9, 10, 11, 12], result);

        let result: Vec<i32> = cedar
            .common_prefix_iter("データ構造とアルゴリズム")
            .map(|x| x.0)
            .collect();
        assert_eq!(vec![4], result);
    }

    #[test]
    fn test_common_prefix_predict() {
        let dict = vec!["a", "ab", "abc"];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        let result: Vec<i32> = cedar.common_prefix_predict("a").unwrap().iter().map(|x| x.0).collect();
        assert_eq!(vec![0, 1, 2], result);
    }

    #[test]
    fn test_exact_match_search() {
        let dict = vec!["a", "ab", "abc"];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        let result = cedar.exact_match_search("abc").map(|x| x.0);
        assert_eq!(Some(2), result);
    }

    #[test]
    fn test_unicode_han_sip() {
        let dict = vec!["讥䶯䶰", "讥䶯䶰䶱䶲", "讥䶯䶰䶱䶲䶳䶴䶵𦡦"];

        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        let result: Vec<i32> = cedar.common_prefix_iter("讥䶯䶰䶱䶲䶳䶴䶵𦡦").map(|x| x.0).collect();
        assert_eq!(vec![0, 1, 2], result);
    }

    #[test]
    fn test_unicode_grapheme_cluster() {
        let dict = vec!["a", "abc", "abcde\u{0301}"];

        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        let result: Vec<i32> = cedar
            .common_prefix_iter("abcde\u{0301}\u{1100}\u{1161}\u{AC00}")
            .map(|x| x.0)
            .collect();
        assert_eq!(vec![0, 1, 2], result);
    }

    #[test]
    fn test_erase() {
        let dict = vec!["a", "ab", "abc"];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        cedar.erase("abc");
        assert!(cedar.exact_match_search("abc").is_none());
        assert!(cedar.exact_match_search("ab").is_some());
        assert!(cedar.exact_match_search("a").is_some());

        cedar.erase("ab");
        assert!(cedar.exact_match_search("ab").is_none());
        assert!(cedar.exact_match_search("a").is_some());

        cedar.erase("a");
        assert!(cedar.exact_match_search("a").is_none());
    }

    #[test]
    fn test_erase_on_internal_key() {
        let mut cedar = Cedar::new();

        cedar.update("aa", 0);
        assert!(cedar.exact_match_search("aa").is_some());
        cedar.update("ab", 1);
        assert!(cedar.exact_match_search("ab").is_some());

        cedar.erase("a");
        assert!(cedar.exact_match_search("a").is_none());
        cedar.erase("aa");
        assert!(cedar.exact_match_search("aa").is_none());
        cedar.erase("ab");
        assert!(cedar.exact_match_search("ab").is_none());
    }

    #[test]
    fn test_update() {
        let dict = vec!["a", "ab", "abc"];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        cedar.update("abcd", 3);

        assert!(cedar.exact_match_search("a").is_some());
        assert!(cedar.exact_match_search("ab").is_some());
        assert!(cedar.exact_match_search("abc").is_some());
        assert!(cedar.exact_match_search("abcd").is_some());
        assert!(cedar.exact_match_search("abcde").is_none());

        let dict = vec!["a", "ab", "abc"];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);
        cedar.update("bachelor", 1);
        cedar.update("jar", 2);
        cedar.update("badge", 3);
        cedar.update("baby", 4);

        assert!(cedar.exact_match_search("bachelor").is_some());
        assert!(cedar.exact_match_search("jar").is_some());
        assert!(cedar.exact_match_search("badge").is_some());
        assert!(cedar.exact_match_search("baby").is_some());
        assert!(cedar.exact_match_search("abcde").is_none());

        let dict = vec!["a", "ab", "abc"];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);
        cedar.update("中", 1);
        cedar.update("中华", 2);
        cedar.update("中华人民", 3);
        cedar.update("中华人民共和国", 4);

        assert!(cedar.exact_match_search("中").is_some());
        assert!(cedar.exact_match_search("中华").is_some());
        assert!(cedar.exact_match_search("中华人民").is_some());
        assert!(cedar.exact_match_search("中华人民共和国").is_some());
    }

    #[test]
    fn test_quickcheck_like() {
        let mut rng = thread_rng();
        let mut dict: Vec<String> = Vec::with_capacity(1000);
        for _ in 0..1000 {
            let chars: Vec<u8> = iter::repeat(()).map(|()| rng.sample(Alphanumeric)).take(30).collect();
            let s = String::from_utf8(chars).unwrap();
            dict.push(s);
        }

        let key_values: Vec<(&str, i32)> = dict.iter().enumerate().map(|(k, s)| (s.as_ref(), k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        for (k, s) in dict.iter().enumerate() {
            assert_eq!(cedar.exact_match_search(s).map(|x| x.0), Some(k as i32));
        }
    }

    #[test]
    fn test_quickcheck_like_with_deep_trie() {
        let mut rng = thread_rng();
        let mut dict: Vec<String> = Vec::with_capacity(1000);
        let mut s = String::new();
        for _ in 0..1000 {
            let c: char = rng.sample(Alphanumeric) as char;
            s.push(c);
            dict.push(s.clone());
        }

        let key_values: Vec<(&str, i32)> = dict.iter().enumerate().map(|(k, s)| (s.as_ref(), k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        for (k, s) in dict.iter().enumerate() {
            assert_eq!(cedar.exact_match_search(s).map(|x| x.0), Some(k as i32));
        }
    }

    #[test]
    fn test_mass_erase() {
        let mut rng = thread_rng();
        let mut dict: Vec<String> = Vec::with_capacity(1000);
        for _ in 0..1000 {
            let chars: Vec<u8> = iter::repeat(()).map(|()| rng.sample(Alphanumeric)).take(30).collect();
            let s = String::from_utf8(chars).unwrap();

            dict.push(s);
        }

        let key_values: Vec<(&str, i32)> = dict.iter().enumerate().map(|(k, s)| (s.as_ref(), k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        for s in dict.iter() {
            cedar.erase(s);
            assert!(cedar.exact_match_search(s).is_none());
        }
    }

    #[test]
    fn test_duplication() {
        let dict = vec!["些许端", "些須", "些须", "亜", "亝", "亞", "亞", "亞丁", "亞丁港"];
        let key_values: Vec<(&str, i32)> = dict.into_iter().enumerate().map(|(k, s)| (s, k as i32)).collect();
        let mut cedar = Cedar::new();
        cedar.build(&key_values);

        assert_eq!(cedar.exact_match_search("亞").map(|t| t.0), Some(6));
        assert_eq!(cedar.exact_match_search("亞丁港").map(|t| t.0), Some(8));
        assert_eq!(cedar.exact_match_search("亝").map(|t| t.0), Some(4));
        assert_eq!(cedar.exact_match_search("些須").map(|t| t.0), Some(1));
    }
}
// --------------------------------------------------------------------------------