├── .editorconfig ├── .gitignore ├── .travis.yml ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── darts_benchmark.rs ├── examples └── build-dict │ ├── .gitignore │ ├── Cargo.lock │ ├── Cargo.toml │ ├── dict.txt │ └── src │ └── main.rs ├── priv ├── dict.big.bincode ├── dict.txt.big └── weicheng.txt ├── rustfmt.toml └── src ├── lib.rs └── searcher.rs /.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | indent_size = 4 9 | indent_style = space 10 | insert_final_newline = true 11 | trim_trailing_whitespace = true 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | **/*.rs.bk 3 | .vscode 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | dist: trusty 3 | language: rust 4 | addons: 5 | apt: 6 | packages: 7 | - libssl-dev 8 | - pkg-config 9 | - cmake 10 | - zlib1g-dev 11 | rust: 12 | - stable 13 | - beta 14 | - nightly 15 | 16 | before_script: 17 | - rustup component add rustfmt 18 | - | 19 | if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_RUST_VERSION" == "stable" ]]; then 20 | rustup component add clippy 21 | fi 22 | 23 | script: 24 | - cargo fmt --all -- --check 25 | - cargo build 26 | - cargo build --features searcher 27 | - cargo build --features serialization 28 | - cargo build --all-features 29 | - cargo test --all-features 30 | - | 31 | if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_RUST_VERSION" == "stable" ]]; then 32 | cargo clippy --all-features 33 | fi 34 | 35 | after_success: 36 | - | 37 | if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_RUST_VERSION" == "stable" ]]; then 38 | bash <(curl 
https://raw.githubusercontent.com/xd009642/tarpaulin/master/travis-install.sh) 39 | cargo tarpaulin --all-features --out Xml 40 | bash <(curl -s https://codecov.io/bash) 41 | fi 42 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | [[package]] 4 | name = "arrayvec" 5 | version = "0.4.10" 6 | source = "registry+https://github.com/rust-lang/crates.io-index" 7 | dependencies = [ 8 | "nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)", 9 | ] 10 | 11 | [[package]] 12 | name = "atty" 13 | version = "0.2.11" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | dependencies = [ 16 | "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", 17 | "termion 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)", 18 | "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", 19 | ] 20 | 21 | [[package]] 22 | name = "autocfg" 23 | version = "0.1.4" 24 | source = "registry+https://github.com/rust-lang/crates.io-index" 25 | 26 | [[package]] 27 | name = "bincode" 28 | version = "1.1.4" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | dependencies = [ 31 | "autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", 32 | "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", 33 | "serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", 34 | ] 35 | 36 | [[package]] 37 | name = "bitflags" 38 | version = "1.1.0" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 | 41 | [[package]] 42 | name = "byteorder" 43 | version = "1.3.2" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | 46 | [[package]] 47 | name = "cast" 48 | version = "0.2.2" 49 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 50 | 51 | [[package]] 52 | name = "cfg-if" 53 | version = "0.1.9" 54 | source = "registry+https://github.com/rust-lang/crates.io-index" 55 | 56 | [[package]] 57 | name = "clap" 58 | version = "2.33.0" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | dependencies = [ 61 | "bitflags 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 62 | "textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", 63 | "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", 64 | ] 65 | 66 | [[package]] 67 | name = "cloudabi" 68 | version = "0.0.3" 69 | source = "registry+https://github.com/rust-lang/crates.io-index" 70 | dependencies = [ 71 | "bitflags 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 72 | ] 73 | 74 | [[package]] 75 | name = "criterion" 76 | version = "0.2.11" 77 | source = "registry+https://github.com/rust-lang/crates.io-index" 78 | dependencies = [ 79 | "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", 80 | "cast 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", 81 | "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", 82 | "criterion-plot 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", 83 | "csv 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", 84 | "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", 85 | "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", 86 | "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", 87 | "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", 88 | "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", 89 | "rand_os 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", 90 | "rand_xoshiro 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 91 | "rayon 1.1.0 
(registry+https://github.com/rust-lang/crates.io-index)", 92 | "rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)", 93 | "serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", 94 | "serde_derive 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", 95 | "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", 96 | "tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", 97 | "walkdir 2.2.8 (registry+https://github.com/rust-lang/crates.io-index)", 98 | ] 99 | 100 | [[package]] 101 | name = "criterion-plot" 102 | version = "0.3.1" 103 | source = "registry+https://github.com/rust-lang/crates.io-index" 104 | dependencies = [ 105 | "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", 106 | "cast 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", 107 | "itertools 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", 108 | ] 109 | 110 | [[package]] 111 | name = "crossbeam-deque" 112 | version = "0.6.3" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | dependencies = [ 115 | "crossbeam-epoch 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)", 116 | "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", 117 | ] 118 | 119 | [[package]] 120 | name = "crossbeam-epoch" 121 | version = "0.7.1" 122 | source = "registry+https://github.com/rust-lang/crates.io-index" 123 | dependencies = [ 124 | "arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)", 125 | "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", 126 | "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", 127 | "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", 128 | "memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", 129 | "scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", 130 | ] 131 
| 132 | [[package]] 133 | name = "crossbeam-queue" 134 | version = "0.1.2" 135 | source = "registry+https://github.com/rust-lang/crates.io-index" 136 | dependencies = [ 137 | "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", 138 | ] 139 | 140 | [[package]] 141 | name = "crossbeam-utils" 142 | version = "0.6.5" 143 | source = "registry+https://github.com/rust-lang/crates.io-index" 144 | dependencies = [ 145 | "cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)", 146 | "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", 147 | ] 148 | 149 | [[package]] 150 | name = "csv" 151 | version = "1.0.7" 152 | source = "registry+https://github.com/rust-lang/crates.io-index" 153 | dependencies = [ 154 | "csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", 155 | "itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", 156 | "ryu 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", 157 | "serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", 158 | ] 159 | 160 | [[package]] 161 | name = "csv-core" 162 | version = "0.1.5" 163 | source = "registry+https://github.com/rust-lang/crates.io-index" 164 | dependencies = [ 165 | "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", 166 | ] 167 | 168 | [[package]] 169 | name = "darts" 170 | version = "0.1.0" 171 | dependencies = [ 172 | "bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)", 173 | "criterion 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", 174 | "hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", 175 | "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", 176 | "serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", 177 | ] 178 | 179 | [[package]] 180 | name = "either" 181 | version = "1.5.2" 182 | source = "registry+https://github.com/rust-lang/crates.io-index" 183 | 184 | 
[[package]] 185 | name = "fuchsia-cprng" 186 | version = "0.1.1" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | 189 | [[package]] 190 | name = "hashbrown" 191 | version = "0.5.0" 192 | source = "registry+https://github.com/rust-lang/crates.io-index" 193 | 194 | [[package]] 195 | name = "itertools" 196 | version = "0.8.0" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | dependencies = [ 199 | "either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)", 200 | ] 201 | 202 | [[package]] 203 | name = "itoa" 204 | version = "0.4.4" 205 | source = "registry+https://github.com/rust-lang/crates.io-index" 206 | 207 | [[package]] 208 | name = "lazy_static" 209 | version = "1.3.0" 210 | source = "registry+https://github.com/rust-lang/crates.io-index" 211 | 212 | [[package]] 213 | name = "libc" 214 | version = "0.2.58" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | 217 | [[package]] 218 | name = "memchr" 219 | version = "2.2.0" 220 | source = "registry+https://github.com/rust-lang/crates.io-index" 221 | dependencies = [ 222 | "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", 223 | ] 224 | 225 | [[package]] 226 | name = "memoffset" 227 | version = "0.2.1" 228 | source = "registry+https://github.com/rust-lang/crates.io-index" 229 | 230 | [[package]] 231 | name = "nodrop" 232 | version = "0.1.13" 233 | source = "registry+https://github.com/rust-lang/crates.io-index" 234 | 235 | [[package]] 236 | name = "num-traits" 237 | version = "0.2.8" 238 | source = "registry+https://github.com/rust-lang/crates.io-index" 239 | dependencies = [ 240 | "autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", 241 | ] 242 | 243 | [[package]] 244 | name = "num_cpus" 245 | version = "1.10.1" 246 | source = "registry+https://github.com/rust-lang/crates.io-index" 247 | dependencies = [ 248 | "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", 249 | ] 
250 | 251 | [[package]] 252 | name = "numtoa" 253 | version = "0.1.0" 254 | source = "registry+https://github.com/rust-lang/crates.io-index" 255 | 256 | [[package]] 257 | name = "proc-macro2" 258 | version = "0.4.30" 259 | source = "registry+https://github.com/rust-lang/crates.io-index" 260 | dependencies = [ 261 | "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 262 | ] 263 | 264 | [[package]] 265 | name = "quote" 266 | version = "0.6.12" 267 | source = "registry+https://github.com/rust-lang/crates.io-index" 268 | dependencies = [ 269 | "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", 270 | ] 271 | 272 | [[package]] 273 | name = "rand_core" 274 | version = "0.3.1" 275 | source = "registry+https://github.com/rust-lang/crates.io-index" 276 | dependencies = [ 277 | "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 278 | ] 279 | 280 | [[package]] 281 | name = "rand_core" 282 | version = "0.4.0" 283 | source = "registry+https://github.com/rust-lang/crates.io-index" 284 | 285 | [[package]] 286 | name = "rand_os" 287 | version = "0.1.3" 288 | source = "registry+https://github.com/rust-lang/crates.io-index" 289 | dependencies = [ 290 | "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)", 291 | "fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", 292 | "libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)", 293 | "rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 294 | "rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 295 | "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", 296 | ] 297 | 298 | [[package]] 299 | name = "rand_xoshiro" 300 | version = "0.1.0" 301 | source = "registry+https://github.com/rust-lang/crates.io-index" 302 | dependencies = [ 303 | "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", 304 | "rand_core 0.3.1 
(registry+https://github.com/rust-lang/crates.io-index)", 305 | ] 306 | 307 | [[package]] 308 | name = "rayon" 309 | version = "1.1.0" 310 | source = "registry+https://github.com/rust-lang/crates.io-index" 311 | dependencies = [ 312 | "crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", 313 | "either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)", 314 | "rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)", 315 | ] 316 | 317 | [[package]] 318 | name = "rayon-core" 319 | version = "1.5.0" 320 | source = "registry+https://github.com/rust-lang/crates.io-index" 321 | dependencies = [ 322 | "crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", 323 | "crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", 324 | "crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", 325 | "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", 326 | "num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)", 327 | ] 328 | 329 | [[package]] 330 | name = "rdrand" 331 | version = "0.4.0" 332 | source = "registry+https://github.com/rust-lang/crates.io-index" 333 | dependencies = [ 334 | "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", 335 | ] 336 | 337 | [[package]] 338 | name = "redox_syscall" 339 | version = "0.1.54" 340 | source = "registry+https://github.com/rust-lang/crates.io-index" 341 | 342 | [[package]] 343 | name = "redox_termios" 344 | version = "0.1.1" 345 | source = "registry+https://github.com/rust-lang/crates.io-index" 346 | dependencies = [ 347 | "redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)", 348 | ] 349 | 350 | [[package]] 351 | name = "ryu" 352 | version = "0.2.8" 353 | source = "registry+https://github.com/rust-lang/crates.io-index" 354 | 355 | [[package]] 356 | name = "same-file" 357 | version = "1.0.4" 358 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 359 | dependencies = [ 360 | "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", 361 | ] 362 | 363 | [[package]] 364 | name = "scopeguard" 365 | version = "0.3.3" 366 | source = "registry+https://github.com/rust-lang/crates.io-index" 367 | 368 | [[package]] 369 | name = "serde" 370 | version = "1.0.92" 371 | source = "registry+https://github.com/rust-lang/crates.io-index" 372 | dependencies = [ 373 | "serde_derive 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", 374 | ] 375 | 376 | [[package]] 377 | name = "serde_derive" 378 | version = "1.0.92" 379 | source = "registry+https://github.com/rust-lang/crates.io-index" 380 | dependencies = [ 381 | "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", 382 | "quote 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)", 383 | "syn 0.15.36 (registry+https://github.com/rust-lang/crates.io-index)", 384 | ] 385 | 386 | [[package]] 387 | name = "serde_json" 388 | version = "1.0.39" 389 | source = "registry+https://github.com/rust-lang/crates.io-index" 390 | dependencies = [ 391 | "itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", 392 | "ryu 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", 393 | "serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", 394 | ] 395 | 396 | [[package]] 397 | name = "syn" 398 | version = "0.15.36" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | dependencies = [ 401 | "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", 402 | "quote 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)", 403 | "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 404 | ] 405 | 406 | [[package]] 407 | name = "termion" 408 | version = "1.5.3" 409 | source = "registry+https://github.com/rust-lang/crates.io-index" 410 | dependencies = [ 411 | "libc 0.2.58 
(registry+https://github.com/rust-lang/crates.io-index)", 412 | "numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", 413 | "redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)", 414 | "redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", 415 | ] 416 | 417 | [[package]] 418 | name = "textwrap" 419 | version = "0.11.0" 420 | source = "registry+https://github.com/rust-lang/crates.io-index" 421 | dependencies = [ 422 | "unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", 423 | ] 424 | 425 | [[package]] 426 | name = "tinytemplate" 427 | version = "1.0.2" 428 | source = "registry+https://github.com/rust-lang/crates.io-index" 429 | dependencies = [ 430 | "serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)", 431 | "serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)", 432 | ] 433 | 434 | [[package]] 435 | name = "unicode-width" 436 | version = "0.1.5" 437 | source = "registry+https://github.com/rust-lang/crates.io-index" 438 | 439 | [[package]] 440 | name = "unicode-xid" 441 | version = "0.1.0" 442 | source = "registry+https://github.com/rust-lang/crates.io-index" 443 | 444 | [[package]] 445 | name = "walkdir" 446 | version = "2.2.8" 447 | source = "registry+https://github.com/rust-lang/crates.io-index" 448 | dependencies = [ 449 | "same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", 450 | "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", 451 | "winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", 452 | ] 453 | 454 | [[package]] 455 | name = "winapi" 456 | version = "0.3.7" 457 | source = "registry+https://github.com/rust-lang/crates.io-index" 458 | dependencies = [ 459 | "winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 460 | "winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)", 461 | ] 462 | 
463 | [[package]] 464 | name = "winapi-i686-pc-windows-gnu" 465 | version = "0.4.0" 466 | source = "registry+https://github.com/rust-lang/crates.io-index" 467 | 468 | [[package]] 469 | name = "winapi-util" 470 | version = "0.1.2" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | dependencies = [ 473 | "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", 474 | ] 475 | 476 | [[package]] 477 | name = "winapi-x86_64-pc-windows-gnu" 478 | version = "0.4.0" 479 | source = "registry+https://github.com/rust-lang/crates.io-index" 480 | 481 | [metadata] 482 | "checksum arrayvec 0.4.10 (registry+https://github.com/rust-lang/crates.io-index)" = "92c7fb76bc8826a8b33b4ee5bb07a247a81e76764ab4d55e8f73e3a4d8808c71" 483 | "checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" 484 | "checksum autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "0e49efa51329a5fd37e7c79db4621af617cd4e3e5bc224939808d076077077bf" 485 | "checksum bincode 1.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "9f04a5e50dc80b3d5d35320889053637d15011aed5e66b66b37ae798c65da6f7" 486 | "checksum bitflags 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3d155346769a6855b86399e9bc3814ab343cd3d62c7e985113d46a0ec3c281fd" 487 | "checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5" 488 | "checksum cast 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "926013f2860c46252efceabb19f4a6b308197505082c609025aa6706c011d427" 489 | "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" 490 | "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" 
491 | "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" 492 | "checksum criterion 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "0363053954f3e679645fc443321ca128b7b950a6fe288cf5f9335cc22ee58394" 493 | "checksum criterion-plot 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "76f9212ddf2f4a9eb2d401635190600656a1f88a932ef53d06e7fa4c7e02fb8e" 494 | "checksum crossbeam-deque 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "05e44b8cf3e1a625844d1750e1f7820da46044ff6d28f4d43e455ba3e5bb2c13" 495 | "checksum crossbeam-epoch 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)" = "04c9e3102cc2d69cd681412141b390abd55a362afc1540965dad0ad4d34280b4" 496 | "checksum crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" 497 | "checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c" 498 | "checksum csv 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9044e25afb0924b5a5fc5511689b0918629e85d68ea591e5e87fbf1e85ea1b3b" 499 | "checksum csv-core 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fa5cdef62f37e6ffe7d1f07a381bc0db32b7a3ff1cac0de56cb0d81e71f53d65" 500 | "checksum either 1.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5527cfe0d098f36e3f8839852688e63c8fff1c90b2b405aef730615f9a7bcf7b" 501 | "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" 502 | "checksum hashbrown 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e1de41fb8dba9714efd92241565cdff73f78508c95697dd56787d3cba27e2353" 503 | "checksum itertools 0.8.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "5b8467d9c1cebe26feb08c640139247fac215782d35371ade9a2136ed6085358" 504 | "checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f" 505 | "checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14" 506 | "checksum libc 0.2.58 (registry+https://github.com/rust-lang/crates.io-index)" = "6281b86796ba5e4366000be6e9e18bf35580adf9e63fbe2294aadb587613a319" 507 | "checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" 508 | "checksum memoffset 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0f9dc261e2b62d7a622bf416ea3c5245cdd5d9a7fcc428c0d06804dfce1775b3" 509 | "checksum nodrop 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "2f9667ddcc6cc8a43afc9b7917599d7216aa09c463919ea32c59ed6cac8bc945" 510 | "checksum num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "6ba9a427cfca2be13aa6f6403b0b7e7368fe982bfa16fccc450ce74c46cd9b32" 511 | "checksum num_cpus 1.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "bcef43580c035376c0705c42792c294b66974abbfd2789b511784023f71f3273" 512 | "checksum numtoa 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef" 513 | "checksum proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)" = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" 514 | "checksum quote 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)" = "faf4799c5d274f3868a4aae320a0a182cbd2baee377b378f080e16a23e9d80db" 515 | "checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" 516 | "checksum rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d0e7a549d590831370895ab7ba4ea0c1b6b011d106b5ff2da6eee112615e6dc0" 517 | "checksum rand_os 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" 518 | "checksum rand_xoshiro 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "03b418169fb9c46533f326efd6eed2576699c44ca92d3052a066214a8d828929" 519 | "checksum rayon 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a4b0186e22767d5b9738a05eab7c6ac90b15db17e5b5f9bd87976dd7d89a10a4" 520 | "checksum rayon-core 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ebbe0df8435ac0c397d467b6cad6d25543d06e8a019ef3f6af3c384597515bd2" 521 | "checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" 522 | "checksum redox_syscall 0.1.54 (registry+https://github.com/rust-lang/crates.io-index)" = "12229c14a0f65c4f1cb046a3b52047cdd9da1f4b30f8a39c5063c8bae515e252" 523 | "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" 524 | "checksum ryu 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "b96a9549dc8d48f2c283938303c4b5a77aa29bfbc5b54b084fb1630408899a8f" 525 | "checksum same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8f20c4be53a8a1ff4c1f1b2bd14570d2f634628709752f0702ecdd2b3f9a5267" 526 | "checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27" 527 | "checksum serde 1.0.92 (registry+https://github.com/rust-lang/crates.io-index)" = "32746bf0f26eab52f06af0d0aa1984f641341d06d8d673c693871da2d188c9be" 528 | "checksum serde_derive 1.0.92 
(registry+https://github.com/rust-lang/crates.io-index)" = "46a3223d0c9ba936b61c0d2e3e559e3217dbfb8d65d06d26e8b3c25de38bae3e" 529 | "checksum serde_json 1.0.39 (registry+https://github.com/rust-lang/crates.io-index)" = "5a23aa71d4a4d43fdbfaac00eff68ba8a06a51759a89ac3304323e800c4dd40d" 530 | "checksum syn 0.15.36 (registry+https://github.com/rust-lang/crates.io-index)" = "8b4f551a91e2e3848aeef8751d0d4eec9489b6474c720fd4c55958d8d31a430c" 531 | "checksum termion 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6a8fb22f7cde82c8220e5aeacb3258ed7ce996142c77cba193f203515e26c330" 532 | "checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" 533 | "checksum tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4574b75faccaacddb9b284faecdf0b544b80b6b294f3d062d325c5726a209c20" 534 | "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" 535 | "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" 536 | "checksum walkdir 2.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "c7904a7e2bb3cdf0cf5e783f44204a85a37a93151738fa349f06680f59a98b45" 537 | "checksum winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "f10e386af2b13e47c89e7236a7a14a086791a2b88ebad6df9bf42040195cf770" 538 | "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 539 | "checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9" 540 | "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 541 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "darts" 3 | description = "A double array trie, A Forward Maximum Matching Searcher." 4 | repository = "https://github.com/andelf/rust-darts" 5 | documentation = "https://docs.rs/darts" 6 | version = "0.1.0" 7 | license = "MIT" 8 | authors = ["andelf ", 9 | "Paul Meng ", 10 | "messense "] 11 | readme = "README.md" 12 | keywords = ["trie", "darts", "string", "search", "text"] 13 | categories = ["data-structures", "text-processing"] 14 | edition = "2018" 15 | exclude = ["/benches/**", "/priv/**", "/.travis.yml"] 16 | 17 | [badges] 18 | travis-ci = { repository = "andelf/rust-darts" } 19 | codecov = { repository = "andelf/rust-darts" } 20 | 21 | [dependencies] 22 | bincode = { version = "1.1", optional = true } 23 | serde = { version = "1.0", features = ["derive"], optional = true } 24 | 25 | [dev-dependencies] 26 | criterion = "0.2" 27 | lazy_static = "1.3" 28 | hashbrown = "0.5.0" 29 | 30 | [[bench]] 31 | name = "darts_benchmark" 32 | harness = false 33 | required-features = ["searcher", "serialization"] 34 | 35 | [features] 36 | default = [] 37 | searcher = [] 38 | serialization = ["bincode", "serde"] 39 | 40 | [package.metadata.docs.rs] 41 | all-features = true 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 andelf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rust-darts: Double-Array Trie Rust implementation. 2 | 3 | This library is in an alpha state; PRs are welcome. An optional Forward Maximum Matching searcher is available behind the `searcher` feature. 4 | 5 | [![Build Status](https://travis-ci.org/andelf/rust-darts.svg?branch=master)](https://travis-ci.org/andelf/rust-darts) 6 | [![codecov](https://codecov.io/gh/andelf/rust-darts/branch/master/graph/badge.svg)](https://codecov.io/gh/andelf/rust-darts) 7 | [![Crates.io](https://img.shields.io/crates/v/darts.svg)](https://crates.io/crates/darts) 8 | [![docs.rs](https://docs.rs/darts/badge.svg)](https://docs.rs/darts/) 9 | 10 | ## Installation 11 | 12 | Add it to your `Cargo.toml`: 13 | 14 | ```toml 15 | [dependencies] 16 | darts = "0.1" 17 | ``` 18 | 19 | Then you are good to go. If you are using Rust 2015, you also have to add `extern crate darts;` to your crate root. 
20 | 21 | ## Example 22 | 23 | ```rust 24 | use std::fs::File; 25 | use darts::DoubleArrayTrie; 26 | 27 | fn main() { 28 | let mut f = File::open("./priv/dict.big.bincode").unwrap(); 29 | let da = DoubleArrayTrie::load(&mut f).unwrap(); 30 | let string = "中华人民共和国"; 31 | let prefixes = da.common_prefix_search(string).map(|matches| { 32 | matches 33 | .into_iter() 34 | .map(|(end_idx, _)| { 35 | &string[..end_idx] 36 | }) 37 | .collect() 38 | }).unwrap_or(vec![]); 39 | assert_eq!(vec!["中", "中华", "中华人民", "中华人民共和国"], prefixes); 40 | } 41 | ``` 42 | 43 | ```rust 44 | use std::fs::File; 45 | use darts::DoubleArrayTrie; 46 | 47 | fn main() { 48 | let mut f = File::open("./priv/dict.big.bincode").unwrap(); 49 | let da = DoubleArrayTrie::load(&mut f).unwrap(); 50 | assert!(da.exact_match_search("东湖高新技术开发区").is_some()); 51 | } 52 | ``` 53 | 54 | ## Enabling Additional Features 55 | 56 | * `searcher` feature enables the forward maximum matching searcher 57 | * `serialization` feature enables saving and loading serialized `DoubleArrayTrie` data 58 | 59 | ```toml 60 | [dependencies] 61 | darts = { version = "0.1", features = ["searcher", "serialization"] } 62 | ``` 63 | 64 | ## To Rebuild Dictionary 65 | 66 | ```bash 67 | # It would take minutes, be patient. 68 | time cargo test --all-features -- --nocapture --ignored test_dat_basic 69 | ``` 70 | 71 | ## To run benchmark tests 72 | ```bash 73 | cargo bench --all-features 74 | ``` 75 | 76 | ## License 77 | 78 | This work is released under the MIT license. A copy of the license is provided in the LICENSE file. 
79 | 80 | ## Reference 81 | 82 | - [hankcs/HanLP](https://github.com/hankcs/HanLP) 83 | - [Aho Corasick自动机结合DoubleArrayTrie极速多模式匹配](http://www.hankcs.com/program/algorithm/aho-corasick-double-array-trie.html) 84 | - [DoubleArrayTrie和AhoCorasickDoubleArrayTrie的实用性对比](http://www.hankcs.com/program/algorithm/double-array-trie-vs-aho-corasick-double-array-trie.html) 85 | - [Darts: Double-Array Trie System](http://chasen.org/~taku/software/darts/) 86 | - [An Implementation of Double-Array Trie](https://linux.thai.net/~thep/datrie/datrie.html) 87 | -------------------------------------------------------------------------------- /benches/darts_benchmark.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | extern crate darts; 4 | extern crate hashbrown; 5 | extern crate lazy_static; 6 | 7 | use darts::searcher; 8 | 9 | use criterion::Criterion; 10 | use darts::DoubleArrayTrie; 11 | use hashbrown::HashMap; 12 | use lazy_static::lazy_static; 13 | use std::fs::{read_to_string, File}; 14 | use std::io::{BufRead, BufReader}; 15 | 16 | lazy_static! 
{ 17 | static ref DA: DoubleArrayTrie = { 18 | let mut f = File::open("./priv/dict.big.bincode").unwrap(); 19 | DoubleArrayTrie::load(&mut f).unwrap() 20 | }; 21 | static ref HASHMAP: HashMap = { 22 | let mut dict = HashMap::new(); 23 | let file = File::open("./priv/dict.txt.big").unwrap(); 24 | let mut rdr = BufReader::with_capacity(8 * (1 << 10), file); 25 | 26 | let mut buf = String::new(); 27 | while rdr.read_line(&mut buf).unwrap() > 0 { 28 | { 29 | let parts: Vec<&str> = buf.trim().split_whitespace().collect(); 30 | if parts.is_empty() { 31 | // Skip empty lines 32 | continue; 33 | } 34 | 35 | let word = parts[0]; 36 | let freq = parts.get(1).map(|x| x.parse::().unwrap()).unwrap(); 37 | 38 | dict.insert(String::from(word), freq); 39 | } 40 | buf.clear(); 41 | } 42 | 43 | dict 44 | }; 45 | } 46 | 47 | fn bench_dat_prefix_search() { 48 | DA.common_prefix_search("东湖高新技术开发区").unwrap(); 49 | } 50 | 51 | fn bench_hashbrown_prefix_search() { 52 | let sentence: &str = "东湖高新技术开发区"; 53 | let char_indices: Vec = sentence.char_indices().map(|x| x.0).collect(); 54 | 55 | let word_count = char_indices.len(); 56 | for (k, &byte_start) in char_indices.iter().enumerate() { 57 | let mut i = k; 58 | let wfrag = if k + 1 < char_indices.len() { 59 | &sentence[byte_start..char_indices[k + 1]] 60 | } else { 61 | &sentence[byte_start..] 
62 | }; 63 | 64 | while i < word_count { 65 | if HASHMAP.contains_key(wfrag) { 66 | //do nothing 67 | } 68 | 69 | i += 1; 70 | } 71 | } 72 | } 73 | 74 | fn bench_dat_match_found() { 75 | DA.exact_match_search( 76 | "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 77 | ); 78 | } 79 | 80 | fn bench_hashbrown_match_found() { 81 | HASHMAP.contains_key( 82 | "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 83 | ); 84 | } 85 | 86 | fn bench_dat_match_not_found_slow_fail() { 87 | DA.exact_match_search("东湖高新技术开发区abcdef"); 88 | } 89 | 90 | fn bench_hashbrown_match_not_found_slow_fail() { 91 | HASHMAP.contains_key("东湖高新技术开发区abcdef"); 92 | } 93 | 94 | fn bench_dat_match_not_found_fast_fail() { 95 | DA.exact_match_search("abcdef东湖高新技术开发区"); 96 | } 97 | 98 | fn bench_hashbrown_match_not_found_fast_fail() { 99 | HASHMAP.contains_key("abcdef东湖高新技术开发区"); 100 | } 101 | 102 | fn bench_dat_searcher() { 103 | let text: String = read_to_string("./priv/weicheng.txt").unwrap(); 104 | 105 | let mut searcher = DA.search(&text); 106 | loop { 107 | let step = searcher.next(); 108 | if step == searcher::SearchStep::Done { 109 | break; 110 | } 111 | } 112 | } 113 | 114 | fn touch_static_variables() { 115 | DA.exact_match_search(" "); 116 | HASHMAP.contains_key(" "); 117 | } 118 | 119 | fn criterion_benchmark(c: &mut Criterion) { 120 | touch_static_variables(); 121 | 122 | c.bench_function("dat prefix search", |b| b.iter(|| bench_dat_prefix_search())); 123 | 124 | c.bench_function("hashbrown prefix search", |b| { 125 | b.iter(|| bench_hashbrown_prefix_search()) 126 | }); 127 | 128 | c.bench_function("dat match found", |b| b.iter(|| bench_dat_match_found())); 129 | 130 | c.bench_function("hashbrown match found", |b| b.iter(|| bench_hashbrown_match_found())); 131 | 132 | c.bench_function("dat match not found fast fail", |b| { 133 | b.iter(|| bench_dat_match_not_found_fast_fail()) 134 | }); 135 | 136 | c.bench_function("hashbrown match not found fast fail", |b| { 137 | b.iter(|| 
bench_hashbrown_match_not_found_fast_fail()) 138 | }); 139 | 140 | c.bench_function("dat match not found slow fail", |b| { 141 | b.iter(|| bench_dat_match_not_found_slow_fail()) 142 | }); 143 | 144 | c.bench_function("hashbrown match not found slow fail", |b| { 145 | b.iter(|| bench_hashbrown_match_not_found_slow_fail()) 146 | }); 147 | 148 | c.bench_function("dat searcher", |b| b.iter(|| bench_dat_searcher())); 149 | } 150 | 151 | criterion_group!(benches, criterion_benchmark); 152 | criterion_main!(benches); 153 | -------------------------------------------------------------------------------- /examples/build-dict/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | **/*.rs.bk 3 | .vscode 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /examples/build-dict/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | [[package]] 4 | name = "build-dict" 5 | version = "0.1.0" 6 | dependencies = [ 7 | "darts 0.1.0", 8 | ] 9 | 10 | [[package]] 11 | name = "darts" 12 | version = "0.1.0" 13 | 14 | -------------------------------------------------------------------------------- /examples/build-dict/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "build-dict" 3 | version = "0.1.0" 4 | authors = ["Paul Meng "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | darts = { path = "../../" } 9 | -------------------------------------------------------------------------------- /examples/build-dict/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{self, BufRead, BufReader}; 3 | use darts::{DoubleArrayTrie, DoubleArrayTrieBuilder}; 4 | 5 | struct IndexBuilder {} 6 | 7 | impl IndexBuilder { 8 | pub fn new() -> Self { 9 | IndexBuilder {} 10 | } 11 | 12 | // Require the dictionary to be sorted in lexicographical order 13 | pub fn build(&mut self, dict: &mut R) -> io::Result { 14 | let mut buf = String::new(); 15 | let mut records: Vec<(String, usize, String)> = Vec::new(); 16 | let mut prev_word = String::new(); 17 | 18 | while dict.read_line(&mut buf)? > 0 { 19 | { 20 | let parts: Vec<&str> = buf.trim().split_whitespace().collect(); 21 | if parts.is_empty() { 22 | continue; 23 | } 24 | 25 | let word = parts[0]; 26 | let freq = parts.get(1).map(|x| x.parse::().unwrap()).unwrap_or(0); 27 | let tag = parts.get(2).cloned().unwrap_or(""); 28 | 29 | assert!( 30 | (&*prev_word < word), 31 | "the dictionary has to be sorted in lexicographical order." 
32 | ); 33 | prev_word = word.to_string(); 34 | 35 | records.push((String::from(word), freq, String::from(tag))); 36 | } 37 | buf.clear(); 38 | } 39 | 40 | let strs: Vec<&str> = records.iter().map(|n| n.0.as_ref()).collect(); 41 | let da = DoubleArrayTrieBuilder::new().build(&strs); 42 | 43 | Ok(da) 44 | } 45 | } 46 | 47 | fn main() { 48 | let f = File::open("./dict.txt").unwrap(); 49 | let mut buf = BufReader::new(f); 50 | let _ = IndexBuilder::new().build(&mut buf).unwrap(); 51 | } 52 | -------------------------------------------------------------------------------- /priv/dict.big.bincode: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andelf/rust-darts/b1a6813f52beb8f062fadfed8a2009b9fa9f6ae2/priv/dict.big.bincode -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 120 2 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Double Array Trie in Rust 2 | //! 3 | //! ## Installation 4 | //! 5 | //! Add it to your `Cargo.toml`: 6 | //! 7 | //! ```toml 8 | //! [dependencies] 9 | //! darts = "0.1" 10 | //! ``` 11 | //! 12 | //! Then you are good to go. If you are using Rust 2015 you have to ``extern crate darts`` to your crate root as well. 13 | //! 14 | //! ## Example 15 | //! 16 | //! ```rust 17 | //! use std::fs::File; 18 | //! use darts::DoubleArrayTrie; 19 | //! 20 | //! fn main() { 21 | //! let mut f = File::open("./priv/dict.big.bincode").unwrap(); 22 | //! let da = DoubleArrayTrie::load(&mut f).unwrap(); 23 | //! let string = "中华人民共和国"; 24 | //! let prefixes = da.common_prefix_search(string).map(|matches| { 25 | //! matches 26 | //! .into_iter() 27 | //! .map(|(end_idx, v)| { 28 | //! &string[..end_idx] 29 | //! }) 30 | //! 
.collect() 31 | //! }).unwrap_or(vec![]); 32 | //! assert_eq!(vec!["中", "中华", "中华人民", "中华人民共和国"], prefixes); 33 | //! } 34 | //! ``` 35 | //! 36 | //! ```rust 37 | //! use std::fs::File; 38 | //! use darts::DoubleArrayTrie; 39 | //! 40 | //! fn main() { 41 | //! let mut f = File::open("./priv/dict.big.bincode").unwrap(); 42 | //! let da = DoubleArrayTrie::load(&mut f).unwrap(); 43 | //! assert!(da.exact_match_search("东湖高新技术开发区").is_some()); 44 | //! } 45 | //! ``` 46 | //! 47 | //! ## Enabling Additional Features 48 | //! 49 | //! * `searcher` feature enables searcher for maximum forward matcher 50 | //! * `serialization` feature enables saving and loading serialized `DoubleArrayTrie` data 51 | //! 52 | //! ```toml 53 | //! [dependencies] 54 | //! darts = { version = "0.1", features = ["searcher", "serialization"] } 55 | //! ``` 56 | //! 57 | #[cfg(feature = "serialization")] 58 | extern crate bincode; 59 | #[cfg(feature = "serialization")] 60 | extern crate serde; 61 | 62 | #[cfg(feature = "searcher")] 63 | pub mod searcher; 64 | 65 | use std::cmp; 66 | use std::error; 67 | use std::fmt; 68 | use std::io; 69 | #[cfg(feature = "serialization")] 70 | use std::io::prelude::*; 71 | use std::iter; 72 | use std::result; 73 | use std::str; 74 | use std::vec; 75 | 76 | #[cfg(feature = "serialization")] 77 | use serde::{Deserialize, Serialize}; 78 | 79 | /// The error type which is used in this crate. 
80 | #[derive(Debug)] 81 | pub enum DartsError { 82 | #[cfg(feature = "serialization")] 83 | Serialize(Box), 84 | Io(io::Error), 85 | } 86 | 87 | impl fmt::Display for DartsError { 88 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 89 | write!(f, "rust-darts error") 90 | } 91 | } 92 | 93 | impl error::Error for DartsError { 94 | fn description(&self) -> &str { 95 | match *self { 96 | #[cfg(feature = "serialization")] 97 | DartsError::Serialize(ref err) => err.description(), 98 | DartsError::Io(ref err) => err.description(), 99 | } 100 | } 101 | } 102 | 103 | /// The result type which is used in this crate. 104 | pub type Result = result::Result; 105 | 106 | impl From for DartsError { 107 | fn from(err: io::Error) -> Self { 108 | DartsError::Io(err) 109 | } 110 | } 111 | 112 | #[cfg(feature = "serialization")] 113 | impl From> for DartsError { 114 | fn from(err: Box) -> Self { 115 | DartsError::Serialize(err) 116 | } 117 | } 118 | 119 | struct Node { 120 | code: usize, 121 | depth: usize, 122 | left: usize, 123 | right: usize, 124 | } 125 | 126 | /// Build a Double Arrary Trie from a series of strings. 127 | pub struct DoubleArrayTrieBuilder<'a> { 128 | check: Vec, 129 | base: Vec, 130 | used: Vec, 131 | 132 | size: usize, 133 | alloc_size: usize, 134 | keys: Vec, vec::IntoIter>>, // String::chars() iterator 135 | next_check_pos: usize, 136 | 137 | progress: usize, 138 | progress_func: Option ()>>, 139 | } 140 | 141 | #[allow(clippy::new_without_default)] 142 | impl<'a> DoubleArrayTrieBuilder<'a> { 143 | pub fn new() -> DoubleArrayTrieBuilder<'a> { 144 | DoubleArrayTrieBuilder { 145 | check: vec![], 146 | base: vec![], 147 | used: vec![], 148 | size: 0, 149 | alloc_size: 0, 150 | keys: vec![], 151 | next_check_pos: 0, 152 | progress: 0, 153 | progress_func: None, 154 | } 155 | } 156 | 157 | /// Set callback to inspect trie building progress. 
158 | pub fn progress(mut self, func: F) -> DoubleArrayTrieBuilder<'a> 159 | where 160 | F: 'static + Fn(usize, usize) -> (), 161 | { 162 | self.progress_func = Some(Box::new(func)); 163 | self 164 | } 165 | 166 | /// Start the building process from root layer, and recursively calling `fetch` and `insert` to 167 | /// construct the arrays 168 | pub fn build(mut self, keys: &'a [&str]) -> DoubleArrayTrie { 169 | // using the unicode scalar len is correct since that's our DARTS unit here 170 | let longest_word_len = keys.iter().map(|s| s.chars().count()).max().unwrap_or(0); 171 | 172 | // it should be at least the range of unicode scalar size since we are offseting by `code` 173 | self.resize(std::char::MAX as usize); 174 | 175 | self.keys = keys.iter().map(|s| s.chars().chain(vec!['\u{0}'])).collect(); 176 | 177 | self.base[0] = 1; 178 | self.next_check_pos = 0; 179 | 180 | let root_node = Node { 181 | code: 0, 182 | left: 0, 183 | right: keys.len(), 184 | depth: 0, 185 | }; 186 | 187 | let mut siblings = Vec::with_capacity(keys.len() / 100); 188 | self.fetch(&root_node, &mut siblings); 189 | self.insert(&siblings); 190 | 191 | // shrink size, free the unnecessary memory 192 | let last_used_pos = self 193 | .used 194 | .iter() 195 | .enumerate() 196 | .rev() 197 | .find(|&(_, &k)| k) 198 | .map_or(self.alloc_size, |t| t.0 + std::char::MAX as usize); 199 | self.resize(last_used_pos); 200 | 201 | let DoubleArrayTrieBuilder { check, base, .. 
} = self; 202 | DoubleArrayTrie { 203 | check, 204 | base, 205 | longest_word_len, 206 | } 207 | } 208 | 209 | /// Resize all of the arrays we need 210 | fn resize(&mut self, new_len: usize) { 211 | self.check.resize(new_len, 0); 212 | self.base.resize(new_len, 0); 213 | self.used.resize(new_len, false); 214 | 215 | self.alloc_size = new_len; 216 | } 217 | 218 | /// To collect the children of `parent` node, by iterating through the same offset of the 219 | /// `keys`, and save the result into `siblings`, returning the number of siblings it collects. 220 | fn fetch(&mut self, parent: &Node, siblings: &mut Vec) -> usize { 221 | let mut prev = 0; 222 | 223 | // iterate over the same offset of the `keys` 224 | for i in parent.left..parent.right { 225 | let c = self.keys[i].next(); 226 | 227 | if c.is_none() { 228 | continue; 229 | } 230 | 231 | let curr = c.map_or(0, |c| { 232 | if c != '\u{0}' { 233 | c as usize + 1 // since we use \u{0} to indicate the termination of the string, every code has to be offset by 1 234 | } else { 235 | 0 // \u{0} as the termination of the string 236 | } 237 | }); 238 | 239 | assert!(prev <= curr, "keys must be sorted!"); 240 | 241 | // we found the adjacent characters in the same offset are different, that means we 242 | // should add one more sibling in the trie. 243 | if curr != prev || siblings.is_empty() { 244 | let tmp_node = Node { 245 | code: curr, 246 | depth: parent.depth + 1, 247 | left: i, 248 | right: 0, 249 | }; 250 | if let Some(n) = siblings.last_mut() { 251 | n.right = i; 252 | } 253 | siblings.push(tmp_node); 254 | } 255 | 256 | prev = curr; 257 | } 258 | 259 | if let Some(n) = siblings.last_mut() { 260 | n.right = parent.right; 261 | } 262 | siblings.len() 263 | } 264 | 265 | /// Insert the nodes in the `siblings` into `check` and `base`, returning the index where the 266 | /// `siblings` is inserted. 
// (continues the `impl<'a> DoubleArrayTrieBuilder<'a>` block; the doc comment above belongs to `insert`)
    fn insert(&mut self, siblings: &[Node]) -> usize {
        assert!(!siblings.is_empty());

        let first_code = siblings[0].code;
        // `fetch` asserts ascending codes, so the last sibling carries the largest one
        let last_code = siblings[siblings.len() - 1].code;
        let key_size = self.keys.len();

        let mut begin: usize;
        let mut pos = cmp::max(first_code + 1, self.next_check_pos) - 1;
        let mut last_free = 0;
        let mut nonzero_num = 0; // the number of slots in check that already been taken
        let mut seen_free = false; // whether we already ran into a slot with "check[pos] == 0"

        if self.alloc_size <= pos {
            self.resize(pos + 1);
        }

        'outer: loop {
            pos += 1;

            if self.alloc_size <= pos {
                self.resize(pos + 1);
            }

            // iterate through the slot that already has an owner
            if self.check[pos] > 0 {
                nonzero_num += 1;
                continue;
            } else if self.check[pos] < 0 {
                // a skip link left by an earlier search: jump ahead
                pos = (-self.check[pos] - 1) as usize;
                continue;
            } else if !seen_free {
                self.next_check_pos = pos; // remember the slot so the next time we call `insert` we could save some time for searching
                last_free = pos;
                seen_free = true;
            }

            // derive the `begin` in reverse, substract the code from `pos`
            begin = pos - first_code;

            // Make sure every `begin + code` slot exists. The proportional growth alone may fall
            // short of the slot needed for the largest code, so never grow to less than that.
            let required = begin + last_code + 1;
            if self.alloc_size < required {
                let grown = self.alloc_size * cmp::max(105, (key_size * 100) / (self.progress + 1)) / 100;
                self.resize(cmp::max(grown, required));
            }

            // then we check if the `begin` is already taken
            if self.used[begin] {
                if last_free < pos {
                    self.check[last_free] = -(pos as i32);
                }

                continue;
            }

            // check if any of the slots where we should put the code are taken.
            for n in siblings.iter() {
                if self.check[begin + n.code] > 0 {
                    if last_free < pos {
                        self.check[last_free] = -(pos as i32);
                    }

                    continue 'outer;
                }
            }

            // all are available, break out the loop.
            break;
        }

        // heuristic search, if the places we have iterated over where 95% of them are taken, then
        // we just jump start from `pos` in the next cycle
        if nonzero_num as f32 / (pos as f32 - self.next_check_pos as f32 + 1.0) >= 0.95 {
            self.next_check_pos = pos;
        }

        self.used[begin] = true;
        self.size = cmp::max(self.size, begin + last_code + 1);

        // mark the ownership of these cells
        for n in siblings.iter() {
            self.check[begin + n.code] = begin as i32;
        }

        // recursively call `fetch` and `insert` for this level of the nodes
        for sibling in siblings.iter() {
            let heuristic_capacity = (sibling.right - sibling.left) / (100 / cmp::min(sibling.depth * 10, 100));
            let mut new_siblings = Vec::with_capacity(heuristic_capacity);

            // a string without any children, then it means we reach a leaf node.
            if self.fetch(sibling, &mut new_siblings) == 0 {
                // mark it as negative number to signal it is a leaf.
                self.base[begin + sibling.code] = -(sibling.left as i32) - 1;

                self.progress += 1;
                if let Some(f) = self.progress_func.as_ref() {
                    f(self.progress, key_size);
                }
            } else {
                let h = self.insert(&new_siblings);

                // save the insertion index into `base`
                self.base[begin + sibling.code] = h as i32;
            }
        }

        begin
    }
}

/// Iterator over every prefix of a key that is itself stored in the trie.
///
/// Yields `(end_byte_index, value)` pairs, shortest prefix first.
pub struct PrefixIter<'a> {
    key_len: usize,
    da: &'a DoubleArrayTrie,
    char_indices: str::CharIndices<'a>,
    b: i32,
    n: i32,
    p: usize,
    reach_leaf: bool,
    longest_word_len: usize,
}

impl<'a> PrefixIter<'a> {
    /// Value stored on the terminator cell of the current state, i.e. `Some` when the characters
    /// consumed so far spell a complete key. Cells outside the arrays count as "no key".
    fn terminal_value(&mut self) -> Option<usize> {
        if self.b < 0 {
            return None;
        }

        self.p = self.b as usize;
        self.n = *self.da.base.get(self.p)?;
        let owner = *self.da.check.get(self.p)? as i32;

        if owner == self.b && self.n < 0 {
            // same value as `-n - 1`, written so that it cannot overflow
            Some((-(self.n + 1)) as usize)
        } else {
            None
        }
    }

    /// Follows the transition labelled `c`. Returns `false` and leaves the state untouched when
    /// the trie has no such transition (including when the target cell is out of range).
    fn advance(&mut self, c: char) -> bool {
        if self.b < 0 {
            return false;
        }

        self.p = self.b as usize + c as usize + 1;
        match (self.da.check.get(self.p), self.da.base.get(self.p)) {
            (Some(&owner), Some(&next)) if owner as i32 == self.b => {
                self.b = next;
                true
            }
            _ => false,
        }
    }
}

impl<'a> Iterator for PrefixIter<'a> {
    type Item = (usize, usize);

    fn size_hint(&self) -> (usize, Option<usize>) {
        (0, Some(self.longest_word_len))
    }

    fn next(&mut self) -> Option<Self::Item> {
        if self.reach_leaf {
            return None;
        }

        while let Some((i, c)) = self.char_indices.next() {
            // Does the prefix ending right before `c` form a key? Checked before moving on.
            let terminal = self.terminal_value();
            let advanced = self.advance(c);

            if !advanced {
                // dead end: nothing longer can match, so stop for good
                self.reach_leaf = true;
            }

            match terminal {
                Some(value) => return Some((i, value)),
                None if !advanced => return None,
                None => {}
            }
        }

        // the whole key has been consumed; it may itself be a stored key
        self.reach_leaf = true;
        let key_len = self.key_len;
        self.terminal_value().map(|value| (key_len, value))
    }
}

/// A Double Array Trie.
#[derive(Debug)]
#[cfg_attr(feature = "serialization", derive(Serialize, Deserialize))]
pub struct DoubleArrayTrie {
    base: Vec<i32>, // use negative to indicate ends
    check: Vec<i32>,
    longest_word_len: usize,
}

impl DoubleArrayTrie {
    /// Match whole string.
    ///
    /// Returns the value stored for `key`, or `None` when `key` is not in the trie. Lookups that
    /// would leave the arrays are treated as "not found" instead of panicking.
    pub fn exact_match_search(&self, key: &str) -> Option<usize> {
        let mut b = *self.base.first()?;

        for c in key.chars() {
            // a negative base marks a terminator cell, which has no children
            if b < 0 {
                return None;
            }

            // codes are offset by 1 because 0 is reserved for the end-of-key marker
            let p = b as usize + c as usize + 1;
            if *self.check.get(p)? != b {
                return None;
            }
            b = *self.base.get(p)?;
        }

        if b < 0 {
            return None;
        }

        // the terminator cell sits at offset 0 of the final state
        let p = b as usize;
        let n = *self.base.get(p)?;

        if *self.check.get(p)? == b && n < 0 {
            // same value as `-n - 1`, written so that it cannot overflow
            Some((-(n + 1)) as usize)
        } else {
            None
        }
    }

    /// Iterate through all of the matched prefixes. Returning an iterator.
    ///
    /// # Panics
    ///
    /// Panics if the `base` array is empty.
    pub fn common_prefix_iter<'a>(&'a self, key: &'a str) -> PrefixIter<'a> {
        PrefixIter {
            key_len: key.len(),
            da: self,
            char_indices: key.char_indices(),
            b: self.base[0],
            p: 0,
            n: 0,
            reach_leaf: false,
            longest_word_len: self.longest_word_len,
        }
    }

    /// Find all matched prefixes. Returns [(end_index, value)].
485 | pub fn common_prefix_search(&self, key: &str) -> Option> { 486 | self.common_prefix_iter(key).map(Some).collect() 487 | } 488 | 489 | pub fn delete(&mut self, key: &str) { 490 | let mut b = self.base[0]; 491 | let mut p: usize; 492 | 493 | for c in key.chars() { 494 | p = (b + c as i32 + 1) as usize; 495 | 496 | if b == self.check[p] as i32 { 497 | b = self.base[p]; 498 | } else { 499 | return; 500 | } 501 | } 502 | 503 | p = b as usize; 504 | let n = self.base[p]; 505 | 506 | if b == self.check[p] as i32 && n < 0 { 507 | self.check[p] = 0; 508 | self.base[p] = 0; 509 | } 510 | } 511 | 512 | pub fn insert(&mut self, key: &str, word_id: i32) { 513 | let mut b = self.base[0]; 514 | let mut p: usize; 515 | 516 | let mut iter = key.chars().peekable(); 517 | while let Some(c) = iter.next() { 518 | p = (b + c as i32 + 1) as usize; 519 | 520 | if b == self.check[p] as i32 { 521 | if iter.peek().is_some() { 522 | b = self.base[p]; 523 | } else { 524 | let new_base = self.base[p] as usize; 525 | self.base[new_base] = -word_id - 1; 526 | self.check[new_base] = new_base as i32; 527 | } 528 | } else if self.check[p] <= 0 { 529 | // it's a free slot 530 | if let Some(&next_c) = iter.peek() { 531 | let mut siblings: Vec = vec![(next_c as usize) + 1]; 532 | self.base[p] = self.look_for_free_slot(p, &mut siblings) as i32; 533 | self.check[p] = b as i32; 534 | b = self.base[p]; 535 | } else { 536 | let mut siblings: Vec = vec![1]; 537 | self.base[p] = self.look_for_free_slot(p, &mut siblings) as i32; 538 | self.check[p] = b as i32; 539 | 540 | let new_base = self.base[p] as usize; 541 | self.base[new_base] = -word_id - 1; 542 | self.check[new_base] = new_base as i32; 543 | } 544 | } else { 545 | let mut siblings: Vec = vec![]; 546 | self.fetch(b as usize, &mut siblings); 547 | 548 | // it's a conflict, we need to move the node 549 | let new_base = self.look_for_free_slot(b as usize, &siblings); 550 | 551 | // TODO: compare the size and choose the smaller one to move 552 | 
self.relocate(b as usize, new_base, &siblings); 553 | 554 | if let Some(&next_c) = iter.peek() { 555 | let mut siblings: Vec = vec![(next_c as usize) + 1]; 556 | self.base[p] = self.look_for_free_slot(p, &mut siblings) as i32; 557 | self.check[p] = b as i32; 558 | b = self.base[p]; 559 | } else { 560 | let mut siblings: Vec = vec![1]; 561 | self.base[p] = self.look_for_free_slot(p, &mut siblings) as i32; 562 | self.check[p] = b as i32; 563 | 564 | let new_base = self.base[p] as usize; 565 | self.base[new_base] = -word_id - 1; 566 | self.check[new_base] = new_base as i32; 567 | } 568 | } 569 | } 570 | } 571 | 572 | /// Resize all of the arrays we need 573 | fn resize(&mut self, new_len: usize) { 574 | self.check.resize(new_len, 0); 575 | self.base.resize(new_len, 0); 576 | } 577 | 578 | fn fetch(&mut self, s: usize, siblings: &mut Vec) { 579 | let upper_bound = std::cmp::min(std::char::MAX as usize, self.check.len() - (self.base[s] as usize) - 1); 580 | for c in 1..=upper_bound { 581 | if (self.check[(self.base[s] as usize) + c] as usize) == s { 582 | siblings.push(c); 583 | } 584 | } 585 | } 586 | 587 | fn look_for_free_slot(&mut self, s: usize, siblings: &[usize]) -> usize { 588 | let mut begin: usize; 589 | let mut pos = s + siblings[0]; 590 | let mut last_free = 0; 591 | let mut first = 0; // the flag to mark if we have run into the first time for the condition of "check[pos] == 0" 592 | 593 | 'outer: loop { 594 | pos += 1; 595 | 596 | if self.base.len() <= pos { 597 | self.resize(pos + 1); 598 | } 599 | 600 | // iterate through the slot that already has an owner 601 | if self.check[pos] > 0 { 602 | continue; 603 | } else if self.check[pos] < 0 { 604 | pos = (-self.check[pos] - 1) as usize; 605 | continue; 606 | } else if first == 0 { 607 | last_free = pos; 608 | first = 1; 609 | } 610 | 611 | // derive the `begin` in reverse, substract the code from `pos` 612 | begin = pos - siblings[0]; 613 | 614 | // check if any of the slots where we should put the code are 
taken. 615 | for n in siblings.iter() { 616 | if self.check[begin + n] > 0 { 617 | if last_free < pos { 618 | self.check[last_free] = -(pos as i32); 619 | } 620 | 621 | continue 'outer; 622 | } 623 | } 624 | 625 | // all are available, break out the loop. 626 | return pos; 627 | } 628 | } 629 | 630 | fn relocate(&mut self, s: usize, new_base: usize, siblings: &Vec) { 631 | for c in siblings.iter() { 632 | if (self.check[(self.base[s] as usize) + c] as usize) == s { 633 | self.check[new_base + c] = s as i32; 634 | self.base[new_base + c] = self.base[(self.base[s] as usize) + c]; 635 | 636 | let n = (self.base[s] as usize) + c; 637 | let mut new_siblings = vec![]; 638 | self.fetch(n, &mut new_siblings); 639 | 640 | for d in new_siblings.iter() { 641 | if (self.check[(self.base[n] as usize) + d] as usize) == n { 642 | self.check[(self.base[n] as usize) + d] = (new_base + c) as i32; 643 | } 644 | } 645 | 646 | self.check[(self.base[s] as usize) + c] = 0; 647 | } 648 | } 649 | 650 | self.base[s] = new_base as i32; 651 | } 652 | 653 | /// Save DAT to an output stream. 654 | #[cfg(feature = "serialization")] 655 | pub fn save(&self, w: &mut W) -> Result<()> { 656 | let encoded: Vec = bincode::serialize(self)?; 657 | w.write_all(&encoded).map_err(From::from) 658 | } 659 | 660 | /// Load DAT from input stream. 661 | #[cfg(feature = "serialization")] 662 | pub fn load(r: &mut R) -> Result { 663 | let mut buf = Vec::new(); 664 | r.read_to_end(&mut buf)?; 665 | Ok(bincode::deserialize(&buf)?) 
// NOTE(review): the two closing braces below end an item that begins outside this chunk.
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(feature = "serialization")]
    use std::{fs::File, io::BufReader};

    /// Builds the big dictionary from `./priv/dict.txt.big` and writes it to
    /// `./priv/dict.big.bincode`. Ignored by default; run it explicitly to
    /// regenerate the fixture used by the other serialization tests.
    #[cfg(feature = "serialization")]
    #[test]
    #[ignore]
    fn test_dat_basic() {
        let f = File::open("./priv/dict.txt.big").unwrap();

        let mut keys: Vec<String> = BufReader::new(f).lines().map(|s| s.unwrap()).collect();

        // sort the keys in lexicographical order so that we don't need to relocate the `base`
        // and `check`
        keys.sort();

        // each line starts with the word itself, followed by space-separated fields
        let strs: Vec<&str> = keys.iter().map(|n| n.split(' ').next().unwrap()).collect();

        let da = DoubleArrayTrieBuilder::new()
            .progress(|current, total| print!("\r{}% {}/{}", current * 100 / total, current, total))
            .build(&strs);

        println!("\nDone!");

        // Check both steps: creating the file *and* writing the trie. Previously a failed
        // `save` was silently discarded.
        let mut out = File::create("./priv/dict.big.bincode").expect("create ./priv/dict.big.bincode");
        da.save(&mut out).expect("write ok!");
    }

    /// `common_prefix_search` returns every dictionary word that is a prefix of the input.
    #[cfg(feature = "serialization")]
    #[test]
    fn test_dat_prefix_search() {
        let mut f = File::open("./priv/dict.big.bincode").unwrap();
        let da = DoubleArrayTrie::load(&mut f).unwrap();

        let input1 = "中华人民共和国";
        let result1: Vec<&str> = da
            .common_prefix_search(input1)
            .unwrap()
            .iter()
            .map(|&(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(result1, vec!["中", "中华", "中华人民", "中华人民共和国"]);

        let input2 = "网球拍卖会";
        let result2: Vec<&str> = da
            .common_prefix_search(input2)
            .unwrap()
            .iter()
            .map(|&(end_idx, _)| &input2[..end_idx])
            .collect();
        assert_eq!(result2, vec!["网", "网球", "网球拍"]);
    }

    /// Same expectations as the prefix search test, through the lazy iterator API.
    // Needs `File` and `DoubleArrayTrie::load`, so it must be gated like the other
    // dictionary-backed tests; otherwise the test module does not compile without the
    // `serialization` feature.
    #[cfg(feature = "serialization")]
    #[test]
    fn test_dat_prefix_iter() {
        let mut f = File::open("./priv/dict.big.bincode").unwrap();
        let da = DoubleArrayTrie::load(&mut f).unwrap();

        let input1 = "中华人民共和国";
        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(result1, vec!["中", "中华", "中华人民", "中华人民共和国"]);

        let input2 = "网球拍卖会";
        let result2: Vec<&str> = da
            .common_prefix_iter(input2)
            .map(|(end_idx, _)| &input2[..end_idx])
            .collect();
        assert_eq!(result2, vec!["网", "网球", "网球拍"]);
    }

    /// A word present in the big dictionary is found by exact match.
    #[cfg(feature = "serialization")]
    #[test]
    fn test_dat_exact_match_search() {
        let mut f = File::open("./priv/dict.big.bincode").unwrap();
        let da = DoubleArrayTrie::load(&mut f).unwrap();
        assert!(da.exact_match_search("东湖高新技术开发区").is_some());
    }

    #[test]
    fn test_dat_builder() {
        let strs: Vec<&str> = vec!["a", "ab", "abc"];
        let da = DoubleArrayTrieBuilder::new().build(&strs);
        assert!(da.exact_match_search("abc").is_some());
    }

    #[test]
    fn test_dat_delete() {
        let strs: Vec<&str> = vec!["a", "ab", "abc"];
        let mut da = DoubleArrayTrieBuilder::new().build(&strs);
        assert!(da.exact_match_search("abc").is_some());

        // deleting a key must leave its prefixes intact
        da.delete("abc");
        assert!(da.exact_match_search("abc").is_none());
        assert!(da.exact_match_search("ab").is_some());
        assert!(da.exact_match_search("a").is_some());

        da.delete("ab");
        assert!(da.exact_match_search("ab").is_none());
        assert!(da.exact_match_search("a").is_some());

        da.delete("a");
        assert!(da.exact_match_search("a").is_none());

        let strs: Vec<&str> = vec!["中", "中华", "中华人民", "中华人民共和国"];
        let mut da = DoubleArrayTrieBuilder::new().build(&strs);
        let input1 = "中华人民共和国";

        // deleting a key in the middle of a chain must keep the longer keys reachable
        da.delete("中华人民");

        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(result1, vec!["中", "中华", "中华人民共和国"]);

        da.delete("中华");

        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(result1, vec!["中", "中华人民共和国"]);

        da.delete("中华人民共和国");
        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(result1, vec!["中"]);
    }

    #[test]
    fn test_dat_insert() {
        let strs: Vec<&str> = vec!["a", "ab", "abc"];
        let mut da = DoubleArrayTrieBuilder::new().build(&strs);
        da.insert("abcd", 3);

        assert!(da.exact_match_search("a").is_some());
        assert!(da.exact_match_search("ab").is_some());
        assert!(da.exact_match_search("abc").is_some());
        assert!(da.exact_match_search("abcd").is_some());
        assert!(da.exact_match_search("abcde").is_none());

        // The example from the paper: An Efficient Implementation of Trie Structures
        let strs: Vec<&str> = vec!["a"];
        let mut da = DoubleArrayTrieBuilder::new().build(&strs);
        da.insert("bachelor", 1);
        da.insert("jar", 2);
        da.insert("badge", 3);
        da.insert("baby", 4);

        assert_eq!(da.exact_match_search("bachelor"), Some(1));
        assert_eq!(da.exact_match_search("jar"), Some(2));
        assert_eq!(da.exact_match_search("badge"), Some(3));
        assert_eq!(da.exact_match_search("baby"), Some(4));
        assert_eq!(da.exact_match_search("abcde"), None);

        let strs: Vec<&str> = vec!["天"];
        let mut da = DoubleArrayTrieBuilder::new().build(&strs);
        da.insert("中", 1);
        da.insert("中华", 2);
        da.insert("中华人民", 3);
        da.insert("中华人民共和国", 4);

        assert_eq!(da.exact_match_search("中"), Some(1));
        assert_eq!(da.exact_match_search("中华"), Some(2));
        assert_eq!(da.exact_match_search("中华人民"), Some(3));
        assert_eq!(da.exact_match_search("中华人民共和国"), Some(4));

        let input1 = "中华人民共和国";
        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(result1, vec!["中", "中华", "中华人民", "中华人民共和国"]);
    }

    #[test]
    fn test_dat_unicode_han_sip() {
        let strs: Vec<&str> = vec!["讥䶯䶰", "讥䶯䶰䶱䶲", "讥䶯䶰䶱䶲䶳䶴䶵𦡦"];
        let da = DoubleArrayTrieBuilder::new().build(&strs);

        let input1 = "讥䶯䶰䶱䶲䶳䶴䶵𦡦";
        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(
            result1,
            vec!["讥䶯䶰", "讥䶯䶰䶱䶲", "讥䶯䶰䶱䶲䶳䶴䶵𦡦"]
        );
    }

    #[test]
    fn test_dat_unicode_grapheme_cluster() {
        let strs: Vec<&str> = vec!["a", "abc", "abcde\u{0301}"];
        let da = DoubleArrayTrieBuilder::new().build(&strs);

        let input1 = "abcde\u{0301}\u{1100}\u{1161}\u{AC00}";
        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(result1, vec!["a", "abc", "abcde\u{0301}"]);
    }

    #[test]
    fn test_dat_unicode_japanese() {
        let strs: Vec<&str> = vec!["アルゴリズム", "データ", "構造"];
        let da = DoubleArrayTrieBuilder::new().build(&strs);

        let input1 = "データ構造とアルゴリズム";
        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(result1, vec!["データ"]);
    }

    #[test]
    fn test_dat_unicode_arabic() {
        // how does the unicode work for Arabic: http://zevoid.blogspot.com/2017/10/blog-post_19.html
        let strs: Vec<&str> = vec!["أَبْجَدِيَّة", "عَرَبِيَّة"];
        let da = DoubleArrayTrieBuilder::new().build(&strs);

        let input1 = "أَبْجَدِيَّة عَرَبِيَّة";
        let result1: Vec<&str> = da
            .common_prefix_iter(input1)
            .map(|(end_idx, _)| &input1[..end_idx])
            .collect();
        assert_eq!(
            result1,
            vec!["أ\u{064e}ب\u{0652}ج\u{064e}د\u{0650}ي\u{064e}\u{0651}ة"]
        );
    }

    #[test]
    fn test_dat_insert_and_delete() {
        let strs: Vec<&str> = vec!["a", "ab", "abc"];
        let mut da = DoubleArrayTrieBuilder::new().build(&strs);
        assert!(da.exact_match_search("abc").is_some());

        // a deleted key can be re-inserted with a new value
        da.delete("abc");
        assert!(da.exact_match_search("abc").is_none());

        da.insert("abc", 2);
        assert_eq!(da.exact_match_search("abc"), Some(2));

        da.delete("ab");
        assert!(da.exact_match_search("ab").is_none());

        da.insert("ab", 1);
        assert_eq!(da.exact_match_search("ab"), Some(1));

        da.delete("a");
        assert!(da.exact_match_search("a").is_none());

        da.insert("a", 0);
        assert_eq!(da.exact_match_search("a"), Some(0));
    }
}

// --------------------------------------------------------------------------------
// /src/searcher.rs:
// --------------------------------------------------------------------------------
use crate::DoubleArrayTrie;

impl DoubleArrayTrie {
    /// Run Forward Maximum Matching Method on a string. Returns a Searcher iterator.
    ///
    /// The returned iterator starts at byte offset 0 of `haystack` and borrows both the
    /// trie and the haystack.
    pub fn search<'a, 'b>(&'b self, haystack: &'a str) -> DoubleArrayTrieSearcher<'a, 'b> {
        DoubleArrayTrieSearcher {
            haystack,
            dat: self,
            start_pos: 0,
        }
    }
}

/// One step produced by [`DoubleArrayTrieSearcher`].
///
/// Both variants carry `(start, end)` byte offsets into the haystack, usable as
/// `&haystack[start..end]`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum SearchStep {
    /// The span `start..end` is a word found in the trie.
    Match(usize, usize),
    /// The span `start..end` could not be matched against the trie.
    Reject(usize, usize),
}

/// A searcher for all words in Double Array Trie, using Forward Maximum Matching Method.
21 | #[allow(dead_code)] 22 | pub struct DoubleArrayTrieSearcher<'a, 'b> { 23 | haystack: &'a str, 24 | dat: &'b DoubleArrayTrie, 25 | start_pos: usize, 26 | } 27 | 28 | impl<'a, 'b> Iterator for DoubleArrayTrieSearcher<'a, 'b> { 29 | type Item = SearchStep; 30 | 31 | fn next(&mut self) -> Option { 32 | let base = &self.dat.base; 33 | let check = &self.dat.check; 34 | 35 | let mut b = base[0]; 36 | let mut n; 37 | let mut p: usize; 38 | 39 | let start_pos = self.start_pos; 40 | 41 | let mut next_pos = 0; 42 | let mut result = None; 43 | 44 | if start_pos >= self.haystack.len() { 45 | return None; 46 | } 47 | 48 | for (i, c) in self.haystack[start_pos..].char_indices() { 49 | p = b as usize; 50 | n = base[p]; 51 | 52 | if b == check[p] as i32 && n < 0 { 53 | next_pos = start_pos + i; 54 | result = Some(SearchStep::Match(start_pos, start_pos + i)); 55 | } 56 | 57 | p = b as usize + c as usize + 1; 58 | if b == check[p] as i32 { 59 | b = base[p]; 60 | } else if result.is_some() { 61 | // last item is the maximum matching 62 | self.start_pos = next_pos; 63 | return result; 64 | } else { 65 | self.start_pos = start_pos + i + c.len_utf8(); 66 | return Some(SearchStep::Reject(start_pos, self.start_pos)); 67 | } 68 | } 69 | 70 | p = b as usize; 71 | n = base[p]; 72 | 73 | // full match from start to end 74 | self.start_pos = self.haystack.len(); 75 | if b == check[p] as i32 && n < 0 { 76 | Some(SearchStep::Match(start_pos, self.start_pos)) 77 | } else { 78 | Some(SearchStep::Reject(start_pos, self.start_pos)) 79 | } 80 | } 81 | } 82 | 83 | #[cfg(test)] 84 | mod tests { 85 | use super::*; 86 | use std::fs::File; 87 | 88 | fn search_step_to_str(step: &SearchStep, haystack: &str) -> String { 89 | match *step { 90 | SearchStep::Match(start, end) => format!("{}/n", &haystack[start..end]), 91 | SearchStep::Reject(start, end) => format!("{}/x", &haystack[start..end]), 92 | } 93 | } 94 | 95 | #[test] 96 | fn test_dat_searcher() { 97 | let mut f = 
File::open("./priv/dict.big.bincode").unwrap(); 98 | let da = DoubleArrayTrie::load(&mut f).unwrap(); 99 | 100 | let text = "江西鄱阳湖干枯,中国最大淡水湖变成大草原"; 101 | let segmented = da 102 | .search(&text) 103 | .map(|s| search_step_to_str(&s, text)) 104 | .collect::>() 105 | .join(" "); 106 | assert_eq!( 107 | segmented, 108 | "江西/n 鄱阳湖/n 干枯/n ,/x 中国/n 最大/n 淡水湖/n 变成/n 大/n 草原/n" 109 | ); 110 | } 111 | } 112 | --------------------------------------------------------------------------------