├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── proptest-regressions ├── engines │ ├── dfa │ │ └── test.txt │ ├── hybrid │ │ └── test.txt │ └── pikevm │ │ └── tests.txt ├── literal │ └── tests.txt └── util │ └── tests.txt ├── rustfmt.toml ├── src ├── cursor.rs ├── engines.rs ├── engines │ ├── dfa.rs │ ├── dfa │ │ ├── accel.rs │ │ ├── search.rs │ │ └── test.rs │ ├── hybrid.rs │ ├── hybrid │ │ ├── search.rs │ │ └── test.rs │ ├── meta │ │ ├── error.rs │ │ ├── literal.rs │ │ ├── mod.rs │ │ ├── regex.rs │ │ ├── strategy.rs │ │ └── wrappers.rs │ ├── pikevm.rs │ └── pikevm │ │ ├── error.rs │ │ └── tests.rs ├── input.rs ├── lib.rs ├── literal.rs ├── literal │ └── tests.rs ├── test_rope.rs ├── tests.rs ├── util.rs └── util │ ├── empty.rs │ ├── iter.rs │ ├── prefilter.rs │ ├── primitives.rs │ ├── sparse_set.rs │ ├── tests.rs │ └── utf8.rs └── test_cases └── syntax.rs /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anyhow" 16 | version = "1.0.89" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" 19 | 20 | [[package]] 21 | name = "autocfg" 22 | version = "1.3.0" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" 25 | 26 | [[package]] 27 | name = "bit-set" 28 | version = "0.5.3" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" 31 | dependencies = [ 32 | "bit-vec", 33 | ] 34 | 35 | [[package]] 36 | name = "bit-vec" 37 | version = "0.6.3" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" 40 | 41 | [[package]] 42 | name = "bitflags" 43 | version = "2.6.0" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" 46 | 47 | [[package]] 48 | name = "bstr" 49 | version = "1.10.0" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" 52 | dependencies = [ 53 | "memchr", 54 | "serde", 55 | ] 56 | 57 | [[package]] 58 | name = "byteorder" 59 | version = "1.5.0" 60 | source = "registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 62 | 63 | [[package]] 64 | name = "cfg-if" 65 | version = "1.0.0" 66 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 67 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 68 | 69 | [[package]] 70 | name = "equivalent" 71 | version = "1.0.1" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" 74 | 75 | [[package]] 76 | name = "errno" 77 | version = "0.3.9" 78 | source = "registry+https://github.com/rust-lang/crates.io-index" 79 | checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" 80 | dependencies = [ 81 | "libc", 82 | "windows-sys 0.52.0", 83 | ] 84 | 85 | [[package]] 86 | name = "fastrand" 87 | version = "2.1.1" 88 | source = "registry+https://github.com/rust-lang/crates.io-index" 89 | checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" 90 | 91 | [[package]] 92 | name = "fnv" 93 | version = "1.0.7" 94 | source = "registry+https://github.com/rust-lang/crates.io-index" 95 | checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" 96 | 97 | [[package]] 98 | name = "getrandom" 99 | version = "0.2.15" 100 | source = "registry+https://github.com/rust-lang/crates.io-index" 101 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 102 | dependencies = [ 103 | "cfg-if", 104 | "libc", 105 | "wasi", 106 | ] 107 | 108 | [[package]] 109 | name = "hashbrown" 110 | version = "0.14.5" 111 | source = "registry+https://github.com/rust-lang/crates.io-index" 112 | checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" 113 | 114 | [[package]] 115 | name = "indexmap" 116 | version = "2.5.0" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" 119 | dependencies = [ 120 | "equivalent", 121 | "hashbrown", 122 | ] 123 | 124 | [[package]] 125 | name = "lazy_static" 126 | version = "1.5.0" 127 
| source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 129 | 130 | [[package]] 131 | name = "libc" 132 | version = "0.2.158" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" 135 | 136 | [[package]] 137 | name = "libm" 138 | version = "0.2.8" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" 141 | 142 | [[package]] 143 | name = "linux-raw-sys" 144 | version = "0.4.14" 145 | source = "registry+https://github.com/rust-lang/crates.io-index" 146 | checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" 147 | 148 | [[package]] 149 | name = "log" 150 | version = "0.4.22" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" 153 | 154 | [[package]] 155 | name = "memchr" 156 | version = "2.7.4" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 159 | 160 | [[package]] 161 | name = "num-traits" 162 | version = "0.2.19" 163 | source = "registry+https://github.com/rust-lang/crates.io-index" 164 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 165 | dependencies = [ 166 | "autocfg", 167 | "libm", 168 | ] 169 | 170 | [[package]] 171 | name = "once_cell" 172 | version = "1.19.0" 173 | source = "registry+https://github.com/rust-lang/crates.io-index" 174 | checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" 175 | 176 | [[package]] 177 | name = "ppv-lite86" 178 | version = "0.2.20" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = 
"77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" 181 | dependencies = [ 182 | "zerocopy", 183 | ] 184 | 185 | [[package]] 186 | name = "proc-macro2" 187 | version = "1.0.86" 188 | source = "registry+https://github.com/rust-lang/crates.io-index" 189 | checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" 190 | dependencies = [ 191 | "unicode-ident", 192 | ] 193 | 194 | [[package]] 195 | name = "proptest" 196 | version = "1.5.0" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" 199 | dependencies = [ 200 | "bit-set", 201 | "bit-vec", 202 | "bitflags", 203 | "lazy_static", 204 | "num-traits", 205 | "rand", 206 | "rand_chacha", 207 | "rand_xorshift", 208 | "regex-syntax", 209 | "rusty-fork", 210 | "tempfile", 211 | "unarray", 212 | ] 213 | 214 | [[package]] 215 | name = "quick-error" 216 | version = "1.2.3" 217 | source = "registry+https://github.com/rust-lang/crates.io-index" 218 | checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" 219 | 220 | [[package]] 221 | name = "quote" 222 | version = "1.0.37" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 225 | dependencies = [ 226 | "proc-macro2", 227 | ] 228 | 229 | [[package]] 230 | name = "rand" 231 | version = "0.8.5" 232 | source = "registry+https://github.com/rust-lang/crates.io-index" 233 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 234 | dependencies = [ 235 | "libc", 236 | "rand_chacha", 237 | "rand_core", 238 | ] 239 | 240 | [[package]] 241 | name = "rand_chacha" 242 | version = "0.3.1" 243 | source = "registry+https://github.com/rust-lang/crates.io-index" 244 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 245 | dependencies = [ 246 | "ppv-lite86", 247 | "rand_core", 248 | 
] 249 | 250 | [[package]] 251 | name = "rand_core" 252 | version = "0.6.4" 253 | source = "registry+https://github.com/rust-lang/crates.io-index" 254 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 255 | dependencies = [ 256 | "getrandom", 257 | ] 258 | 259 | [[package]] 260 | name = "rand_xorshift" 261 | version = "0.3.0" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" 264 | dependencies = [ 265 | "rand_core", 266 | ] 267 | 268 | [[package]] 269 | name = "regex-automata" 270 | version = "0.4.7" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" 273 | dependencies = [ 274 | "aho-corasick", 275 | "memchr", 276 | "regex-syntax", 277 | ] 278 | 279 | [[package]] 280 | name = "regex-cursor" 281 | version = "0.1.5" 282 | dependencies = [ 283 | "anyhow", 284 | "log", 285 | "memchr", 286 | "proptest", 287 | "regex-automata", 288 | "regex-syntax", 289 | "regex-test", 290 | "ropey", 291 | ] 292 | 293 | [[package]] 294 | name = "regex-syntax" 295 | version = "0.8.4" 296 | source = "registry+https://github.com/rust-lang/crates.io-index" 297 | checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" 298 | 299 | [[package]] 300 | name = "regex-test" 301 | version = "0.1.1" 302 | source = "registry+https://github.com/rust-lang/crates.io-index" 303 | checksum = "da40f0939bc4c598b4326abdbb363a8987aa43d0526e5624aefcf3ed90344e62" 304 | dependencies = [ 305 | "anyhow", 306 | "bstr", 307 | "serde", 308 | "toml", 309 | ] 310 | 311 | [[package]] 312 | name = "ropey" 313 | version = "1.6.1" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "93411e420bcd1a75ddd1dc3caf18c23155eda2c090631a85af21ba19e97093b5" 316 | dependencies = [ 317 | "smallvec", 318 | "str_indices", 319 | ] 320 | 321 | 
[[package]] 322 | name = "rustix" 323 | version = "0.38.37" 324 | source = "registry+https://github.com/rust-lang/crates.io-index" 325 | checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" 326 | dependencies = [ 327 | "bitflags", 328 | "errno", 329 | "libc", 330 | "linux-raw-sys", 331 | "windows-sys 0.52.0", 332 | ] 333 | 334 | [[package]] 335 | name = "rusty-fork" 336 | version = "0.3.0" 337 | source = "registry+https://github.com/rust-lang/crates.io-index" 338 | checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" 339 | dependencies = [ 340 | "fnv", 341 | "quick-error", 342 | "tempfile", 343 | "wait-timeout", 344 | ] 345 | 346 | [[package]] 347 | name = "serde" 348 | version = "1.0.210" 349 | source = "registry+https://github.com/rust-lang/crates.io-index" 350 | checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" 351 | dependencies = [ 352 | "serde_derive", 353 | ] 354 | 355 | [[package]] 356 | name = "serde_derive" 357 | version = "1.0.210" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" 360 | dependencies = [ 361 | "proc-macro2", 362 | "quote", 363 | "syn", 364 | ] 365 | 366 | [[package]] 367 | name = "serde_spanned" 368 | version = "0.6.7" 369 | source = "registry+https://github.com/rust-lang/crates.io-index" 370 | checksum = "eb5b1b31579f3811bf615c144393417496f152e12ac8b7663bf664f4a815306d" 371 | dependencies = [ 372 | "serde", 373 | ] 374 | 375 | [[package]] 376 | name = "smallvec" 377 | version = "1.13.2" 378 | source = "registry+https://github.com/rust-lang/crates.io-index" 379 | checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" 380 | 381 | [[package]] 382 | name = "str_indices" 383 | version = "0.4.3" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = 
"e9557cb6521e8d009c51a8666f09356f4b817ba9ba0981a305bd86aee47bd35c" 386 | 387 | [[package]] 388 | name = "syn" 389 | version = "2.0.77" 390 | source = "registry+https://github.com/rust-lang/crates.io-index" 391 | checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" 392 | dependencies = [ 393 | "proc-macro2", 394 | "quote", 395 | "unicode-ident", 396 | ] 397 | 398 | [[package]] 399 | name = "tempfile" 400 | version = "3.12.0" 401 | source = "registry+https://github.com/rust-lang/crates.io-index" 402 | checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" 403 | dependencies = [ 404 | "cfg-if", 405 | "fastrand", 406 | "once_cell", 407 | "rustix", 408 | "windows-sys 0.59.0", 409 | ] 410 | 411 | [[package]] 412 | name = "toml" 413 | version = "0.8.19" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" 416 | dependencies = [ 417 | "serde", 418 | "serde_spanned", 419 | "toml_datetime", 420 | "toml_edit", 421 | ] 422 | 423 | [[package]] 424 | name = "toml_datetime" 425 | version = "0.6.8" 426 | source = "registry+https://github.com/rust-lang/crates.io-index" 427 | checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" 428 | dependencies = [ 429 | "serde", 430 | ] 431 | 432 | [[package]] 433 | name = "toml_edit" 434 | version = "0.22.21" 435 | source = "registry+https://github.com/rust-lang/crates.io-index" 436 | checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf" 437 | dependencies = [ 438 | "indexmap", 439 | "serde", 440 | "serde_spanned", 441 | "toml_datetime", 442 | "winnow", 443 | ] 444 | 445 | [[package]] 446 | name = "unarray" 447 | version = "0.1.4" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" 450 | 451 | [[package]] 452 | name = "unicode-ident" 453 | version = 
"1.0.13" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" 456 | 457 | [[package]] 458 | name = "wait-timeout" 459 | version = "0.2.0" 460 | source = "registry+https://github.com/rust-lang/crates.io-index" 461 | checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" 462 | dependencies = [ 463 | "libc", 464 | ] 465 | 466 | [[package]] 467 | name = "wasi" 468 | version = "0.11.0+wasi-snapshot-preview1" 469 | source = "registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 471 | 472 | [[package]] 473 | name = "windows-sys" 474 | version = "0.52.0" 475 | source = "registry+https://github.com/rust-lang/crates.io-index" 476 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 477 | dependencies = [ 478 | "windows-targets", 479 | ] 480 | 481 | [[package]] 482 | name = "windows-sys" 483 | version = "0.59.0" 484 | source = "registry+https://github.com/rust-lang/crates.io-index" 485 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 486 | dependencies = [ 487 | "windows-targets", 488 | ] 489 | 490 | [[package]] 491 | name = "windows-targets" 492 | version = "0.52.6" 493 | source = "registry+https://github.com/rust-lang/crates.io-index" 494 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 495 | dependencies = [ 496 | "windows_aarch64_gnullvm", 497 | "windows_aarch64_msvc", 498 | "windows_i686_gnu", 499 | "windows_i686_gnullvm", 500 | "windows_i686_msvc", 501 | "windows_x86_64_gnu", 502 | "windows_x86_64_gnullvm", 503 | "windows_x86_64_msvc", 504 | ] 505 | 506 | [[package]] 507 | name = "windows_aarch64_gnullvm" 508 | version = "0.52.6" 509 | source = "registry+https://github.com/rust-lang/crates.io-index" 510 | checksum = 
"32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 511 | 512 | [[package]] 513 | name = "windows_aarch64_msvc" 514 | version = "0.52.6" 515 | source = "registry+https://github.com/rust-lang/crates.io-index" 516 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 517 | 518 | [[package]] 519 | name = "windows_i686_gnu" 520 | version = "0.52.6" 521 | source = "registry+https://github.com/rust-lang/crates.io-index" 522 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 523 | 524 | [[package]] 525 | name = "windows_i686_gnullvm" 526 | version = "0.52.6" 527 | source = "registry+https://github.com/rust-lang/crates.io-index" 528 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 529 | 530 | [[package]] 531 | name = "windows_i686_msvc" 532 | version = "0.52.6" 533 | source = "registry+https://github.com/rust-lang/crates.io-index" 534 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 535 | 536 | [[package]] 537 | name = "windows_x86_64_gnu" 538 | version = "0.52.6" 539 | source = "registry+https://github.com/rust-lang/crates.io-index" 540 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 541 | 542 | [[package]] 543 | name = "windows_x86_64_gnullvm" 544 | version = "0.52.6" 545 | source = "registry+https://github.com/rust-lang/crates.io-index" 546 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 547 | 548 | [[package]] 549 | name = "windows_x86_64_msvc" 550 | version = "0.52.6" 551 | source = "registry+https://github.com/rust-lang/crates.io-index" 552 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 553 | 554 | [[package]] 555 | name = "winnow" 556 | version = "0.6.18" 557 | source = "registry+https://github.com/rust-lang/crates.io-index" 558 | checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f" 559 | dependencies = [ 560 | 
"memchr", 561 | ] 562 | 563 | [[package]] 564 | name = "zerocopy" 565 | version = "0.7.35" 566 | source = "registry+https://github.com/rust-lang/crates.io-index" 567 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 568 | dependencies = [ 569 | "byteorder", 570 | "zerocopy-derive", 571 | ] 572 | 573 | [[package]] 574 | name = "zerocopy-derive" 575 | version = "0.7.35" 576 | source = "registry+https://github.com/rust-lang/crates.io-index" 577 | checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 578 | dependencies = [ 579 | "proc-macro2", 580 | "quote", 581 | "syn", 582 | ] 583 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "regex-cursor" 3 | description = "regex fork that can search discontiguous haystacks" 4 | version = "0.1.5" 5 | edition = "2021" 6 | documentation = "https://docs.rs/regex-cursor" 7 | author = "Pascal Kuthe " 8 | repository = "https://github.com/pascalkuthe/regex-cursor" 9 | readme = "README.md" 10 | keywords = ["regex", "dfa", "automata", "automaton", "nfa"] 11 | license = "MIT OR Apache-2.0" 12 | categories = ["text-processing"] 13 | rust-version = "1.65" 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | 17 | [dependencies] 18 | log = "0.4.22" 19 | memchr = "2.7" 20 | regex-automata = "0.4.7" 21 | regex-syntax = "0.8.4" 22 | ropey = { version = "1.6.1", default-features = false, optional = true } 23 | 24 | [dev-dependencies] 25 | anyhow = "1.0.89" 26 | proptest = "1.5.0" 27 | regex-test = "0.1.1" 28 | 29 | [features] 30 | default = ["perf-inline", "ropey"] 31 | perf-inline = [] 32 | ropey = ["dep:ropey"] 33 | -------------------------------------------------------------------------------- /LICENSE-APACHE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Pascal Kuthe 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regex-cursor 2 | 3 | 4 | This crate provides routines for searching **discontiguous strings** for matches of a [regular expression] (aka "regex"). It is based on [regex-automata] and most of the code is adapted from the various crates in the [regex](https://github.com/rust-lang/regex) repository. 5 | 6 | It is intended as a prototype for upstream support for "streaming regex". The cursor based API in this crate is very similar to the API already exposed by `regex`/`regex-automata`. To that end a generic `Cursor` trait is provided that collections can implement. 7 | 8 | A sketch of the cursor API is shown below. The string is yielded in multiple byte chunks. Calling advance moves the cursor to the next chunk. Calling backtrack moves the cursor a chunk back. Backtracking is required by this crate. That makes it unsuitable for searching fully unbuffered streams like bytes sent over a TCP connection. 9 | 10 | ``` rust 11 | pub trait Cursor { 12 | fn chunk(&self) -> &[u8] { .. } 13 | fn advance(&mut self) -> bool { .. } 14 | fn backtrack(&mut self) -> bool { .. } 15 | } 16 | ``` 17 | 18 | Working on this crate showed me that regex backtracks a lot more than expected with most functionality fundamentally requiring backtracking. For network use cases that do not buffer their input the primary use case would likely be detecting a match (without necessarily requiring the matched byte range). Such use cases can be covered by manually feeding bytes into the hybrid and DFA engines from the regex-automata crate.
This approach also has the advantage of allowing the caller to pause the match (async) while waiting for more data allowing the caller to drive the search instead of the engine itself. 19 | 20 | The only part of this crate that could be applied to the fully streaming case is the streaming PikeVM implementation. However, there are some limitations: 21 | * only a single search can be run since the PikeVM may look ahead multiple bytes to disambiguate alternative matches 22 | * Prefilters longer than one byte can not work 23 | * utf-8 mode can not be supported (empty matches may occur between unicode boundaries) 24 | 25 | Currently, the PikeVM implementation is not written with this use case in mind and may call backtrack unnecessarily, but that could be addressed in the future, but especially the first point is very limiting. The pikevm also does not allow the user to drive the search and would block on network calls for example (no async). 26 | 27 | -------------------------------------------------------------------------------- /proptest-regressions/engines/dfa/test.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc 2795080e34081178522520583a3fffdcfeadb09aa47a298f991e102fb6064559 # shrinks to mut haystack = "𛅕", needle = "" 8 | cc 561c3e868d6f45d3071f185399fcd6031baede9ecbda8b4a1f3e9760775dc27e # shrinks to mut haystack = "Σ0🌀𑍇𑵐:𫠠𝕒 ", needle = ":" 9 | cc 63a23412cc7362942174b377418542dd6430d448b0f72833809e22588e872d09 # shrinks to mut haystack = "a", needle = "" 10 | cc 311b1045964903485e0577546cf1341422999100f2e3274f8d4ea61fea074b20 # shrinks to mut haystack = "®", needle = "." 
11 | -------------------------------------------------------------------------------- /proptest-regressions/engines/hybrid/test.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc 3152dced60f8c193302e2adbe9ebd67be558b4af65991b997e5f776920c0459f # shrinks to haystack = "", needle = "" 8 | cc 0a97b5285cbdc808df0e0e829c62fe77de165b9aaf8f15dc0d41a150407a4b01 # shrinks to haystack = "Y", needle = "Y" 9 | cc 3121032e282f21b11023cec49d0119661db16574d821f15b91400b6d66449702 # shrinks to haystack = "&&", needle = "&" 10 | cc f8813009c0bd8c6bdd386e9b17ce8bb83e513707c27985bc2757c56549c7290c # shrinks to haystack = ":a", needle = "$|:" 11 | cc 1cd08976b659689543c93e102417319e7dafe94333d0f2813f5c68dc935bb6cf # shrinks to haystack = "Σ /ⶠaAA ﷏00AAΣ/എ", needle = "/" 12 | cc 7fdff08fc051c9b641db028206943cbb84ca26f8a88e06eadaa5b09b66148d34 # shrinks to mut haystack = "𑊊", needle = "𑒀?." 13 | -------------------------------------------------------------------------------- /proptest-regressions/engines/pikevm/tests.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 
7 | cc 4c899804f8e28d294268b2c482879338edc3be0210465aeaf6a03d65626d386f # shrinks to haystack = "Ѩ", needle = "Ѩ*|A0" 8 | cc 9dcbeee2d5ffde3324638f38b2eefc96a95b0665810c02c12093976a0aba96c5 # shrinks to haystack = "", needle = "^" 9 | cc 0311c531b8a3e09dc21270ace24fc7cdec1d773228a9ce3843888afe4774c4a2 # shrinks to haystack = "", needle = "$" 10 | cc 578435f522160de6326c7cf57b367dc9e52679b796ecf8d331a9684a9ef4d1f7 # shrinks to haystack = " ", needle = "." 11 | -------------------------------------------------------------------------------- /proptest-regressions/literal/tests.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc a1f6f819109c893f29c5f71a0ac13dfcbf04de0dc6411615de2d9587b12d6edf # shrinks to haystack = "", needle = "🌀🤀𛱰a0Aa®ଏ¡𞥞®0" 8 | cc 9fc9553316dab0f5611d42ebdbfda893e991f183f013a13e105570d9bb935bbb # shrinks to haystack = "🀄", needle = [128] 9 | cc 14528483978ac457a80022577321d49eadc3952a4bc848dcf622730341424c50 # shrinks to haystack = "\"", needle = "\"" 10 | cc 0906f449ec7e583178f7865198d5c6c8589f6a760f57fe1e94fa71b751a13dcc # shrinks to haystack = "*", needle = "*" 11 | cc 3dc047ca1210586977bea6afe1c52f3f21b8f778358932316bce56a9c8dd069a # shrinks to mut haystack = "®", needle = "¯" 12 | cc d37b534f1d1d9b91a41efb745325c95e429901bd53d2bc4a31fd55997e5b243a # shrinks to mut haystack = "Ѩ", needle = "Ѩ" 13 | cc ea94b3aca8d5e5c4728504f773d8ec61d1e7a0e3aa8e186b9c953a199cd7e3e2 # shrinks to mut haystack = "A® a𛲜�a0 a0 𖬀 ", needle = "�" 14 | cc 80ea1772c0da540fd9e502978e22f1678ea0a06ec302d38891ecf36be39f966c # shrinks to mut haystack = "0Aa0 ��⺀ A", needle = "�" 15 | 
-------------------------------------------------------------------------------- /proptest-regressions/util/tests.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc 06febfa67a8673673da6a2a4d70869e49f8d45945ae98745208a6266253a5bed # shrinks to haystack = "®" 8 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | use_small_heuristics = "Max" 2 | newline_style = "Unix" 3 | use_field_init_shorthand = true 4 | 5 | imports_granularity = "Module" 6 | group_imports = "StdExternalCrate" 7 | format_macro_matchers = true 8 | format_macro_bodies = true 9 | -------------------------------------------------------------------------------- /src/cursor.rs: -------------------------------------------------------------------------------- 1 | pub trait IntoCursor { 2 | type Cursor: Cursor; 3 | fn into_cursor(self) -> Self::Cursor; 4 | } 5 | 6 | impl IntoCursor for C { 7 | type Cursor = Self; 8 | 9 | fn into_cursor(self) -> Self { 10 | self 11 | } 12 | } 13 | 14 | /// A cursor that allows traversing a discontiguous string like a rope. 15 | pub trait Cursor { 16 | /// Returns the current chunk. If [`utf8_aware`](Cursor::utf8_aware) returns true then this function 17 | /// must **never** return a chunk that splits a unicode codepoint. 18 | /// See [`utf8_aware`](Cursor::utf8_aware) for details. 19 | /// 20 | /// Must never return an empty byteslice unless the underlying collection is empty. 21 | fn chunk(&self) -> &[u8]; 22 | /// Whether this cursor is aware of utf-8 codepoint boundaries. 
23 | /// 24 | /// **`true`** means that this cursor must never split a unicode codepoint at a 25 | /// chunk boundary. In that case all regex features are supported. 26 | /// 27 | /// **`false`** means that this cursor cannot be used for utf-8 mode 28 | /// matching (only affects empty strings) and cannot be used to match 29 | /// unicode word boundaries. 30 | fn utf8_aware(&self) -> bool { 31 | true 32 | } 33 | /// Advances the cursor to the next chunk if possible. In that case `true` 34 | /// must be returned. If the end of data is reached this function should 35 | /// return `false` and **not change the chunk** 36 | fn advance(&mut self) -> bool; 37 | /// Moves the cursor to the previous chunk if possible. In that case `true` 38 | /// must be returned. If the start of data is reached this function should 39 | /// return `false` and **not change the chunk** 40 | fn backtrack(&mut self) -> bool; 41 | /// Returns the total length of the data. This does not 42 | /// take the current cursor position into account and should 43 | /// not change with calls to [`advance`](Cursor::advance) and [`backtrack`](Cursor::backtrack).
44 | fn total_bytes(&self) -> Option; 45 | /// The offset of the current chunk from the start of the haystack in bytes 46 | fn offset(&self) -> usize; 47 | } 48 | 49 | impl Cursor for &mut C { 50 | fn chunk(&self) -> &[u8] { 51 | C::chunk(self) 52 | } 53 | 54 | fn utf8_aware(&self) -> bool { 55 | C::utf8_aware(self) 56 | } 57 | 58 | fn advance(&mut self) -> bool { 59 | C::advance(self) 60 | } 61 | 62 | fn backtrack(&mut self) -> bool { 63 | C::backtrack(self) 64 | } 65 | 66 | fn total_bytes(&self) -> Option { 67 | C::total_bytes(self) 68 | } 69 | 70 | fn offset(&self) -> usize { 71 | C::offset(self) 72 | } 73 | } 74 | 75 | impl Cursor for &[u8] { 76 | fn chunk(&self) -> &[u8] { 77 | self 78 | } 79 | 80 | // true since there are no chunk boundaries 81 | fn utf8_aware(&self) -> bool { 82 | true 83 | } 84 | 85 | fn advance(&mut self) -> bool { 86 | false 87 | } 88 | 89 | fn backtrack(&mut self) -> bool { 90 | false 91 | } 92 | 93 | fn total_bytes(&self) -> Option { 94 | Some(self.len()) 95 | } 96 | fn offset(&self) -> usize { 97 | 0 98 | } 99 | } 100 | 101 | impl Cursor for &str { 102 | fn chunk(&self) -> &[u8] { 103 | self.as_bytes() 104 | } 105 | 106 | // true since there are no chunk boundaries 107 | fn utf8_aware(&self) -> bool { 108 | true 109 | } 110 | 111 | fn advance(&mut self) -> bool { 112 | false 113 | } 114 | 115 | fn backtrack(&mut self) -> bool { 116 | false 117 | } 118 | fn total_bytes(&self) -> Option { 119 | Some(::len(self)) 120 | } 121 | 122 | fn offset(&self) -> usize { 123 | 0 124 | } 125 | } 126 | 127 | #[cfg(feature = "ropey")] 128 | #[derive(Clone, Copy)] 129 | enum Pos { 130 | ChunkStart, 131 | ChunkEnd, 132 | } 133 | 134 | #[cfg(feature = "ropey")] 135 | #[derive(Clone)] 136 | pub struct RopeyCursor<'a> { 137 | iter: ropey::iter::Chunks<'a>, 138 | current: &'a [u8], 139 | pos: Pos, 140 | len: usize, 141 | offset: usize, 142 | } 143 | 144 | #[cfg(feature = "ropey")] 145 | impl<'a> RopeyCursor<'a> { 146 | pub fn new(slice: ropey::RopeSlice<'a>)
-> Self { 147 | let iter = slice.chunks(); 148 | let mut res = 149 | Self { current: &[], iter, pos: Pos::ChunkEnd, len: slice.len_bytes(), offset: 0 }; 150 | res.advance(); 151 | res 152 | } 153 | 154 | pub fn at(slice: ropey::RopeSlice<'a>, at: usize) -> Self { 155 | let (iter, offset, _, _) = slice.chunks_at_byte(at); 156 | if offset == slice.len_bytes() { 157 | let mut res = 158 | Self { current: &[], iter, pos: Pos::ChunkStart, len: slice.len_bytes(), offset }; 159 | res.backtrack(); 160 | res 161 | } else { 162 | let mut res = 163 | Self { current: &[], iter, pos: Pos::ChunkEnd, len: slice.len_bytes(), offset }; 164 | res.advance(); 165 | res 166 | } 167 | } 168 | } 169 | 170 | #[cfg(feature = "ropey")] 171 | impl Cursor for RopeyCursor<'_> { 172 | fn chunk(&self) -> &[u8] { 173 | self.current 174 | } 175 | 176 | fn advance(&mut self) -> bool { 177 | match self.pos { 178 | Pos::ChunkStart => { 179 | self.iter.next(); 180 | self.pos = Pos::ChunkEnd; 181 | } 182 | Pos::ChunkEnd => (), 183 | } 184 | for next in self.iter.by_ref() { 185 | if next.is_empty() { 186 | continue; 187 | } 188 | self.offset += self.current.len(); 189 | self.current = next.as_bytes(); 190 | return true; 191 | } 192 | false 193 | } 194 | 195 | fn backtrack(&mut self) -> bool { 196 | match self.pos { 197 | Pos::ChunkStart => {} 198 | Pos::ChunkEnd => { 199 | self.iter.prev(); 200 | self.pos = Pos::ChunkStart; 201 | } 202 | } 203 | while let Some(prev) = self.iter.prev() { 204 | if prev.is_empty() { 205 | continue; 206 | } 207 | self.offset -= prev.len(); 208 | self.current = prev.as_bytes(); 209 | return true; 210 | } 211 | false 212 | } 213 | 214 | fn utf8_aware(&self) -> bool { 215 | true 216 | } 217 | 218 | fn total_bytes(&self) -> Option { 219 | Some(self.len) 220 | } 221 | 222 | fn offset(&self) -> usize { 223 | self.offset 224 | } 225 | } 226 | 227 | #[cfg(feature = "ropey")] 228 | impl<'a> IntoCursor for ropey::RopeSlice<'a> { 229 | type Cursor = RopeyCursor<'a>; 230 | 231 | fn 
into_cursor(self) -> Self::Cursor { 232 | RopeyCursor::new(self) 233 | } 234 | } 235 | 236 | #[cfg(feature = "ropey")] 237 | impl<'a> IntoCursor for &'a ropey::Rope { 238 | type Cursor = RopeyCursor<'a>; 239 | 240 | fn into_cursor(self) -> Self::Cursor { 241 | RopeyCursor::new(self.slice(..)) 242 | } 243 | } 244 | #[cfg(all(feature = "ropey", test))] 245 | mod ropey_test { 246 | use ropey::Rope; 247 | 248 | use crate::cursor::IntoCursor; 249 | use crate::Cursor; 250 | 251 | #[test] 252 | fn smoke_test() { 253 | let rope = Rope::from_str("abc"); 254 | let mut cursor = rope.into_cursor(); 255 | assert_eq!(cursor.chunk(), "abc".as_bytes()); 256 | assert!(!cursor.advance()); 257 | assert_eq!(cursor.chunk(), "abc".as_bytes()); 258 | assert!(!cursor.backtrack()); 259 | assert_eq!(cursor.chunk(), "abc".as_bytes()); 260 | let rope = Rope::from("abc".repeat(5000)); 261 | let mut cursor = rope.into_cursor(); 262 | let mut offset = 0; 263 | loop { 264 | assert_eq!(cursor.offset(), offset); 265 | offset += cursor.chunk().len(); 266 | if !cursor.advance() { 267 | break; 268 | } 269 | } 270 | loop { 271 | offset -= cursor.chunk().len(); 272 | assert_eq!(cursor.offset(), offset); 273 | if !cursor.backtrack() { 274 | break; 275 | } 276 | } 277 | assert_eq!(cursor.offset(), 0); 278 | assert_eq!(offset, 0); 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /src/engines.rs: -------------------------------------------------------------------------------- 1 | pub mod dfa; 2 | pub mod hybrid; 3 | pub mod meta; 4 | pub mod pikevm; 5 | -------------------------------------------------------------------------------- /src/engines/dfa.rs: -------------------------------------------------------------------------------- 1 | pub use regex_automata::dfa::regex::Regex; 2 | use regex_automata::dfa::Automaton; 3 | use regex_automata::{Anchored, Match, MatchError}; 4 | 5 | use crate::cursor::Cursor; 6 | use crate::util::iter; 7 | use crate::Input; 
8 | 9 | pub use crate::engines::dfa::search::{try_search_fwd, try_search_rev}; 10 | 11 | mod accel; 12 | mod search; 13 | #[cfg(test)] 14 | mod test; 15 | 16 | /// Returns true if either the given input specifies an anchored search 17 | /// or if the underlying NFA is always anchored. 18 | fn is_anchored(regex: &Regex, input: &Input) -> bool { 19 | match input.get_anchored() { 20 | Anchored::No => regex.forward().is_always_start_anchored(), 21 | Anchored::Yes | Anchored::Pattern(_) => true, 22 | } 23 | } 24 | 25 | /// Returns an iterator over all non-overlapping leftmost matches in the 26 | /// given bytes. If no match exists, then the iterator yields no elements. 27 | /// 28 | /// # Panics 29 | /// 30 | /// This routine panics if the search could not complete. This can occur 31 | /// in a number of circumstances: 32 | /// 33 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 34 | /// For example, setting quit bytes or enabling heuristic support for 35 | /// Unicode word boundaries. The default configuration does not enable any 36 | /// option that could result in the lazy DFA quitting. 37 | /// * The configuration of the lazy DFA may also permit it to "give up" 38 | /// on a search if it makes ineffective use of its transition table 39 | /// cache. The default configuration does not enable this by default, 40 | /// although it is typically a good idea to. 41 | /// * When the provided `Input` configuration is not supported. For 42 | /// example, by providing an unsupported anchor mode. 43 | /// 44 | /// When a search panics, callers cannot know whether a match exists or 45 | /// not. 46 | /// 47 | /// The above conditions also apply to the iterator returned as well. For 48 | /// example, if the lazy DFA gives up or quits during a search using this 49 | /// method, then a panic will occur during iteration. 
50 | /// 51 | /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) 52 | /// if you want to handle these error conditions. 53 | /// 54 | /// # Example 55 | /// 56 | /// ``` 57 | /// use regex_automata::{hybrid::regex::Regex, Match}; 58 | /// 59 | /// let re = Regex::new("foo[0-9]+")?; 60 | /// let mut cache = re.create_cache(); 61 | /// 62 | /// let text = "foo1 foo12 foo123"; 63 | /// let matches: Vec = re.find_iter(&mut cache, text).collect(); 64 | /// assert_eq!(matches, vec![ 65 | /// Match::must(0, 0..4), 66 | /// Match::must(0, 5..10), 67 | /// Match::must(0, 11..17), 68 | /// ]); 69 | /// # Ok::<(), Box>(()) 70 | /// ``` 71 | #[inline] 72 | pub fn find_iter(regex: &Regex, input: Input) -> FindMatches<'_, C> { 73 | let it = iter::Searcher::new(input); 74 | FindMatches { re: regex, it } 75 | } 76 | 77 | /// Returns the start and end offset of the leftmost match. If no match 78 | /// exists, then `None` is returned. 79 | /// 80 | /// # Panics 81 | /// 82 | /// This routine panics if the search could not complete. This can occur 83 | /// in a number of circumstances: 84 | /// 85 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 86 | /// For example, setting quit bytes or enabling heuristic support for 87 | /// Unicode word boundaries. The default configuration does not enable any 88 | /// option that could result in the lazy DFA quitting. 89 | /// * The configuration of the lazy DFA may also permit it to "give up" 90 | /// on a search if it makes ineffective use of its transition table 91 | /// cache. The default configuration does not enable this by default, 92 | /// although it is typically a good idea to. 93 | /// * When the provided `Input` configuration is not supported. For 94 | /// example, by providing an unsupported anchor mode. 95 | /// 96 | /// When a search panics, callers cannot know whether a match exists or 97 | /// not. 
98 | /// 99 | /// Use [`Regex::try_search`] if you want to handle these error conditions. 100 | /// 101 | /// # Example 102 | /// 103 | /// ``` 104 | /// use regex_automata::{Match, hybrid::regex::Regex}; 105 | /// 106 | /// let re = Regex::new("foo[0-9]+")?; 107 | /// let mut cache = re.create_cache(); 108 | /// assert_eq!( 109 | /// Some(Match::must(0, 3..11)), 110 | /// re.find(&mut cache, "zzzfoo12345zzz"), 111 | /// ); 112 | /// 113 | /// // Even though a match is found after reading the first byte (`a`), 114 | /// // the default leftmost-first match semantics demand that we find the 115 | /// // earliest match that prefers earlier parts of the pattern over latter 116 | /// // parts. 117 | /// let re = Regex::new("abc|a")?; 118 | /// let mut cache = re.create_cache(); 119 | /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); 120 | /// # Ok::<(), Box>(()) 121 | /// ``` 122 | pub fn find(regex: &Regex, input: &mut Input) -> Option { 123 | try_search(regex, input).unwrap() 124 | } 125 | 126 | /// Returns the start and end offset of the leftmost match. If no match 127 | /// exists, then `None` is returned. 128 | /// 129 | /// This is like [`Regex::find`] but with two differences: 130 | /// 131 | /// 1. It is not generic over `Into` and instead accepts a 132 | /// `&Input`. This permits reusing the same `Input` for multiple searches 133 | /// without needing to create a new one. This _may_ help with latency. 134 | /// 2. It returns an error if the search could not complete where as 135 | /// [`Regex::find`] will panic. 136 | /// 137 | /// # Errors 138 | /// 139 | /// This routine errors if the search could not complete. This can occur 140 | /// in a number of circumstances: 141 | /// 142 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 143 | /// For example, setting quit bytes or enabling heuristic support for 144 | /// Unicode word boundaries. 
The default configuration does not enable any 145 | /// option that could result in the lazy DFA quitting. 146 | /// * The configuration of the lazy DFA may also permit it to "give up" 147 | /// on a search if it makes ineffective use of its transition table 148 | /// cache. The default configuration does not enable this by default, 149 | /// although it is typically a good idea to. 150 | /// * When the provided `Input` configuration is not supported. For 151 | /// example, by providing an unsupported anchor mode. 152 | /// 153 | /// When a search returns an error, callers cannot know whether a match 154 | /// exists or not. 155 | pub fn try_search( 156 | regex: &Regex, 157 | input: &mut Input, 158 | ) -> Result, MatchError> { 159 | let fwd = regex.forward(); 160 | let end = match try_search_fwd(fwd, input)? { 161 | None => return Ok(None), 162 | Some(end) => end, 163 | }; 164 | // This special cases an empty match at the beginning of the search. If 165 | // our end matches our start, then since a reverse DFA can't match past 166 | // the start, it must follow that our starting position is also our end 167 | // position. So short circuit and skip the reverse search. 168 | if input.start() == end.offset() { 169 | return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset()))); 170 | } 171 | // We can also skip the reverse search if we know our search was 172 | // anchored. This occurs either when the input config is anchored or 173 | // when we know the regex itself is anchored. In this case, we know the 174 | // start of the match, if one is found, must be the start of the 175 | // search. 176 | if is_anchored(regex, input) { 177 | return Ok(Some(Match::new(end.pattern(), input.start()..end.offset()))); 178 | } 179 | // N.B. I have tentatively convinced myself that it isn't necessary 180 | // to specify the specific pattern for the reverse search since the 181 | // reverse search will always find the same pattern to match as the 182 | // forward search. 
But I lack a rigorous proof. Why not just provide 183 | // the pattern anyway? Well, if it is needed, then leaving it out 184 | // gives us a chance to find a witness. (Also, if we don't need to 185 | // specify the pattern, then we don't need to build the reverse DFA 186 | // with 'starts_for_each_pattern' enabled. It doesn't matter too much 187 | // for the lazy DFA, but does make the overall DFA bigger.) 188 | // 189 | // We also need to be careful to disable 'earliest' for the reverse 190 | // search, since it could be enabled for the forward search. In the 191 | // reverse case, to satisfy "leftmost" criteria, we need to match as 192 | // much as we can. We also need to be careful to make the search 193 | // anchored. We don't want the reverse search to report any matches 194 | // other than the one beginning at the end of our forward search. 195 | 196 | let match_range = input.start()..end.offset(); 197 | let start = input.with(|mut revsearch| { 198 | revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false); 199 | try_search_rev(regex.reverse(), revsearch) 200 | }); 201 | let start = start?.expect("reverse search must match if forward search does"); 202 | debug_assert_eq!( 203 | start.pattern(), 204 | end.pattern(), 205 | "forward and reverse search must match same pattern", 206 | ); 207 | debug_assert!(start.offset() <= end.offset()); 208 | debug_assert!(end.offset() <= input.end()); 209 | debug_assert!(input.start() <= start.offset()); 210 | Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) 211 | } 212 | 213 | /// An iterator over all non-overlapping matches for an infallible search. 214 | /// 215 | /// The iterator yields a [`Match`] value until no more matches could be found. 216 | /// If the underlying regex engine returns an error, then a panic occurs. 217 | /// 218 | /// This iterator can be created with the [`Regex::find_iter`] method. 
219 | #[derive(Debug)] 220 | pub struct FindMatches<'r, C: Cursor> { 221 | re: &'r Regex, 222 | it: iter::Searcher, 223 | } 224 | 225 | impl<'r, C: Cursor> Iterator for FindMatches<'r, C> { 226 | type Item = Match; 227 | 228 | #[inline] 229 | fn next(&mut self) -> Option { 230 | let FindMatches { re, ref mut it } = *self; 231 | it.advance(|input| try_search(re, input)) 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/engines/dfa/accel.rs: -------------------------------------------------------------------------------- 1 | use crate::cursor::Cursor; 2 | use crate::Input; 3 | 4 | /// Search for between 1 and 3 needle bytes in the given haystack, starting the 5 | /// search at the given position. If `needles` has a length other than 1-3, 6 | /// then this panics. 7 | #[cfg_attr(feature = "perf-inline", inline(always))] 8 | pub(crate) fn find_fwd_imp(needles: &[u8], haystack: &[u8], at: usize) -> Option { 9 | let bs = needles; 10 | let i = match needles.len() { 11 | 1 => memchr::memchr(bs[0], &haystack[at..])?, 12 | 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?, 13 | 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?, 14 | 0 => panic!("cannot find with empty needles"), 15 | n => panic!("invalid needles length: {}", n), 16 | }; 17 | Some(at + i) 18 | } 19 | /// Search for between 1 and 3 needle bytes in the given input, starting the 20 | /// search at the given position. If `needles` has a length other than 1-3, 21 | /// then this panics. 
22 | #[cfg_attr(feature = "perf-inline", inline(always))] 23 | pub(crate) fn find_fwd( 24 | needles: &[u8], 25 | input: &mut Input, 26 | at: usize, 27 | ) -> Option { 28 | if let Some(pos) = find_fwd_imp(needles, input.chunk(), at) { 29 | return Some(pos); 30 | } 31 | while input.chunk_offset() + input.chunk().len() < input.end() && input.advance() { 32 | if let Some(pos) = find_fwd_imp(needles, input.chunk(), 0) { 33 | return Some(pos); 34 | } 35 | } 36 | None 37 | } 38 | 39 | /// Search for between 1 and 3 needle bytes in the given haystack in reverse, 40 | /// starting the search at the given position. If `needles` has a length other 41 | /// than 1-3, then this panics. 42 | #[cfg_attr(feature = "perf-inline", inline(always))] 43 | pub(crate) fn find_rev_imp(needles: &[u8], haystack: &[u8], at: usize) -> Option { 44 | let bs = needles; 45 | match needles.len() { 46 | 1 => memchr::memrchr(bs[0], &haystack[..at]), 47 | 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]), 48 | 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]), 49 | 0 => panic!("cannot find with empty needles"), 50 | n => panic!("invalid needles length: {}", n), 51 | } 52 | } 53 | /// Search for between 1 and 3 needle bytes in the given input, starting the 54 | /// search at the given position. If `needles` has a length other than 1-3, 55 | /// then this panics. 
56 | #[cfg_attr(feature = "perf-inline", inline(always))] 57 | pub(crate) fn find_rev( 58 | needles: &[u8], 59 | input: &mut Input, 60 | at: usize, 61 | ) -> Option { 62 | if let Some(pos) = find_rev_imp(needles, input.chunk(), at) { 63 | return Some(pos); 64 | } 65 | while input.start() < input.chunk_offset() && input.backtrack() { 66 | if let Some(pos) = find_rev_imp(needles, input.chunk(), input.chunk().len()) { 67 | return Some(pos); 68 | } 69 | } 70 | None 71 | } 72 | -------------------------------------------------------------------------------- /src/engines/dfa/test.rs: -------------------------------------------------------------------------------- 1 | use proptest::proptest; 2 | 3 | use crate::engines::dfa::find_iter; 4 | use crate::input::Input; 5 | use crate::test_rope::SingleByteChunks; 6 | 7 | #[test] 8 | fn searcher() { 9 | let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap(); 10 | let regex = super::Regex::builder() 11 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) 12 | .build("vec") 13 | .unwrap(); 14 | let rope = ropey::Rope::from_str(&text); 15 | let matches: Vec<_> = find_iter(®ex, Input::new(rope.slice(..))) 16 | .map(|range| rope.byte_slice(range.range())) 17 | .collect(); 18 | assert_eq!(matches.len(), 68); 19 | } 20 | 21 | #[test] 22 | fn anchor() { 23 | let haystack = ":a"; 24 | let needle = "$|:"; 25 | let foo = SingleByteChunks::new(haystack.as_bytes()); 26 | let regex = super::Regex::builder() 27 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true).unicode(false)) 28 | .build(needle) 29 | .unwrap(); 30 | let iter1: Vec<_> = regex.find_iter(haystack).collect(); 31 | let iter2: Vec<_> = find_iter(®ex, Input::new(foo)).collect(); 32 | assert_eq!(iter1, iter2); 33 | } 34 | 35 | #[test] 36 | fn end_of_input() { 37 | let haystack = "a b c"; 38 | let needle = "\\b"; 39 | let foo = SingleByteChunks::new(haystack.as_bytes()); 40 | let regex = super::Regex::builder() 41 | 
.syntax(regex_automata::util::syntax::Config::new().case_insensitive(true).unicode(false)) 42 | .build(needle) 43 | .unwrap(); 44 | let iter1: Vec<_> = regex.find_iter(haystack).collect(); 45 | let iter2: Vec<_> = find_iter(®ex, Input::new(foo)).collect(); 46 | assert_eq!(iter1, iter2); 47 | } 48 | 49 | #[test] 50 | fn hotloop_transition() { 51 | let haystack = "Σ /ⶠaAA ﷏00AAΣ/എ"; 52 | let needle = "/"; 53 | let foo = ropey::Rope::from_str(haystack); 54 | let regex = super::Regex::builder() 55 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) 56 | .build(needle) 57 | .unwrap(); 58 | let iter1: Vec<_> = regex.find_iter(haystack).collect(); 59 | let iter2: Vec<_> = find_iter(®ex, Input::new(&foo)).collect(); 60 | assert_eq!(iter1, iter2); 61 | } 62 | 63 | proptest! { 64 | #[test] 65 | fn matches(mut haystack: String, needle: String) { 66 | haystack = haystack.repeat(1024); 67 | let foo = ropey::Rope::from_str(&haystack); 68 | let Ok(regex) = super::Regex::builder() 69 | .syntax(regex_automata::util::syntax::Config::new() 70 | .case_insensitive(true) 71 | ) 72 | .build(&needle) else { 73 | return Ok(()) 74 | }; 75 | let iter1 = regex.find_iter( &haystack); 76 | let iter2 = find_iter(®ex, Input::new(&foo)); 77 | crate::util::iter::prop_assert_eq(iter1, iter2)?; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/engines/hybrid.rs: -------------------------------------------------------------------------------- 1 | pub use regex_automata::hybrid::regex::{Cache, Regex}; 2 | use regex_automata::{Anchored, Match, MatchError}; 3 | 4 | use crate::cursor::Cursor; 5 | use crate::input::Input; 6 | use crate::util::iter; 7 | 8 | pub use crate::engines::hybrid::search::{try_search_fwd, try_search_rev}; 9 | 10 | mod search; 11 | #[cfg(test)] 12 | mod test; 13 | 14 | /// Returns true if either the given input specifies an anchored search 15 | /// or if the underlying NFA is always anchored. 
16 | fn is_anchored(regex: &Regex, input: &Input) -> bool { 17 | match input.get_anchored() { 18 | Anchored::No => regex.forward().get_nfa().is_always_start_anchored(), 19 | Anchored::Yes | Anchored::Pattern(_) => true, 20 | } 21 | } 22 | 23 | /// Returns an iterator over all non-overlapping leftmost matches in the 24 | /// given bytes. If no match exists, then the iterator yields no elements. 25 | /// 26 | /// # Panics 27 | /// 28 | /// This routine panics if the search could not complete. This can occur 29 | /// in a number of circumstances: 30 | /// 31 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 32 | /// For example, setting quit bytes or enabling heuristic support for 33 | /// Unicode word boundaries. The default configuration does not enable any 34 | /// option that could result in the lazy DFA quitting. 35 | /// * The configuration of the lazy DFA may also permit it to "give up" 36 | /// on a search if it makes ineffective use of its transition table 37 | /// cache. The default configuration does not enable this by default, 38 | /// although it is typically a good idea to. 39 | /// * When the provided `Input` configuration is not supported. For 40 | /// example, by providing an unsupported anchor mode. 41 | /// 42 | /// When a search panics, callers cannot know whether a match exists or 43 | /// not. 44 | /// 45 | /// The above conditions also apply to the iterator returned as well. For 46 | /// example, if the lazy DFA gives up or quits during a search using this 47 | /// method, then a panic will occur during iteration. 48 | /// 49 | /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) 50 | /// if you want to handle these error conditions. 
51 | /// 52 | /// # Example 53 | /// 54 | /// ``` 55 | /// use regex_automata::{hybrid::regex::Regex, Match}; 56 | /// 57 | /// let re = Regex::new("foo[0-9]+")?; 58 | /// let mut cache = re.create_cache(); 59 | /// 60 | /// let text = "foo1 foo12 foo123"; 61 | /// let matches: Vec = re.find_iter(&mut cache, text).collect(); 62 | /// assert_eq!(matches, vec![ 63 | /// Match::must(0, 0..4), 64 | /// Match::must(0, 5..10), 65 | /// Match::must(0, 11..17), 66 | /// ]); 67 | /// # Ok::<(), Box>(()) 68 | /// ``` 69 | #[inline] 70 | pub fn find_iter<'r, 'c, C: Cursor>( 71 | regex: &'r Regex, 72 | cache: &'c mut Cache, 73 | input: Input, 74 | ) -> FindMatches<'r, 'c, C> { 75 | let it = iter::Searcher::new(input); 76 | FindMatches { re: regex, cache, it } 77 | } 78 | 79 | /// Returns the start and end offset of the leftmost match. If no match 80 | /// exists, then `None` is returned. 81 | /// 82 | /// # Panics 83 | /// 84 | /// This routine panics if the search could not complete. This can occur 85 | /// in a number of circumstances: 86 | /// 87 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 88 | /// For example, setting quit bytes or enabling heuristic support for 89 | /// Unicode word boundaries. The default configuration does not enable any 90 | /// option that could result in the lazy DFA quitting. 91 | /// * The configuration of the lazy DFA may also permit it to "give up" 92 | /// on a search if it makes ineffective use of its transition table 93 | /// cache. The default configuration does not enable this by default, 94 | /// although it is typically a good idea to. 95 | /// * When the provided `Input` configuration is not supported. For 96 | /// example, by providing an unsupported anchor mode. 97 | /// 98 | /// When a search panics, callers cannot know whether a match exists or 99 | /// not. 100 | /// 101 | /// Use [`Regex::try_search`] if you want to handle these error conditions. 
102 | /// 103 | /// # Example 104 | /// 105 | /// ``` 106 | /// use regex_automata::{Match, hybrid::regex::Regex}; 107 | /// 108 | /// let re = Regex::new("foo[0-9]+")?; 109 | /// let mut cache = re.create_cache(); 110 | /// assert_eq!( 111 | /// Some(Match::must(0, 3..11)), 112 | /// re.find(&mut cache, "zzzfoo12345zzz"), 113 | /// ); 114 | /// 115 | /// // Even though a match is found after reading the first byte (`a`), 116 | /// // the default leftmost-first match semantics demand that we find the 117 | /// // earliest match that prefers earlier parts of the pattern over latter 118 | /// // parts. 119 | /// let re = Regex::new("abc|a")?; 120 | /// let mut cache = re.create_cache(); 121 | /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); 122 | /// # Ok::<(), Box>(()) 123 | /// ``` 124 | pub fn find(regex: &Regex, cache: &mut Cache, input: &mut Input) -> Option { 125 | try_search(regex, cache, input).unwrap() 126 | } 127 | 128 | /// Returns the start and end offset of the leftmost match. If no match 129 | /// exists, then `None` is returned. 130 | /// 131 | /// This is like [`Regex::find`] but with two differences: 132 | /// 133 | /// 1. It is not generic over `Into` and instead accepts a 134 | /// `&Input`. This permits reusing the same `Input` for multiple searches 135 | /// without needing to create a new one. This _may_ help with latency. 136 | /// 2. It returns an error if the search could not complete where as 137 | /// [`Regex::find`] will panic. 138 | /// 139 | /// # Errors 140 | /// 141 | /// This routine errors if the search could not complete. This can occur 142 | /// in a number of circumstances: 143 | /// 144 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 145 | /// For example, setting quit bytes or enabling heuristic support for 146 | /// Unicode word boundaries. The default configuration does not enable any 147 | /// option that could result in the lazy DFA quitting. 
148 | /// * The configuration of the lazy DFA may also permit it to "give up" 149 | /// on a search if it makes ineffective use of its transition table 150 | /// cache. The default configuration does not enable this by default, 151 | /// although it is typically a good idea to. 152 | /// * When the provided `Input` configuration is not supported. For 153 | /// example, by providing an unsupported anchor mode. 154 | /// 155 | /// When a search returns an error, callers cannot know whether a match 156 | /// exists or not. 157 | pub fn try_search( 158 | regex: &Regex, 159 | cache: &mut Cache, 160 | input: &mut Input, 161 | ) -> Result, MatchError> { 162 | let (fcache, rcache) = cache.as_parts_mut(); 163 | let end = match try_search_fwd(regex.forward(), fcache, input)? { 164 | None => return Ok(None), 165 | Some(end) => end, 166 | }; 167 | // This special cases an empty match at the beginning of the search. If 168 | // our end matches our start, then since a reverse DFA can't match past 169 | // the start, it must follow that our starting position is also our end 170 | // position. So short circuit and skip the reverse search. 171 | if input.start() == end.offset() { 172 | return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset()))); 173 | } 174 | // We can also skip the reverse search if we know our search was 175 | // anchored. This occurs either when the input config is anchored or 176 | // when we know the regex itself is anchored. In this case, we know the 177 | // start of the match, if one is found, must be the start of the 178 | // search. 179 | if is_anchored(regex, input) { 180 | return Ok(Some(Match::new(end.pattern(), input.start()..end.offset()))); 181 | } 182 | // N.B. I have tentatively convinced myself that it isn't necessary 183 | // to specify the specific pattern for the reverse search since the 184 | // reverse search will always find the same pattern to match as the 185 | // forward search. But I lack a rigorous proof. 
Why not just provide 186 | // the pattern anyway? Well, if it is needed, then leaving it out 187 | // gives us a chance to find a witness. (Also, if we don't need to 188 | // specify the pattern, then we don't need to build the reverse DFA 189 | // with 'starts_for_each_pattern' enabled. It doesn't matter too much 190 | // for the lazy DFA, but does make the overall DFA bigger.) 191 | // 192 | // We also need to be careful to disable 'earliest' for the reverse 193 | // search, since it could be enabled for the forward search. In the 194 | // reverse case, to satisfy "leftmost" criteria, we need to match as 195 | // much as we can. We also need to be careful to make the search 196 | // anchored. We don't want the reverse search to report any matches 197 | // other than the one beginning at the end of our forward search. 198 | 199 | let match_range = input.start()..end.offset(); 200 | let start = input.with(|mut revsearch| { 201 | revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false); 202 | try_search_rev(regex.reverse(), rcache, revsearch) 203 | }); 204 | let start = start?.expect("reverse search must match if forward search does"); 205 | debug_assert_eq!( 206 | start.pattern(), 207 | end.pattern(), 208 | "forward and reverse search must match same pattern", 209 | ); 210 | debug_assert!(start.offset() <= end.offset()); 211 | debug_assert!(end.offset() <= input.end()); 212 | debug_assert!(input.start() <= start.offset()); 213 | Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) 214 | } 215 | 216 | /// An iterator over all non-overlapping matches for an infallible search. 217 | /// 218 | /// The iterator yields a [`Match`] value until no more matches could be found. 219 | /// If the underlying regex engine returns an error, then a panic occurs. 220 | /// 221 | /// The lifetime parameters are as follows: 222 | /// 223 | /// * `'r` represents the lifetime of the regex object. 
224 | /// * `'h` represents the lifetime of the haystack being searched. 225 | /// * `'c` represents the lifetime of the regex cache. 226 | /// 227 | /// This iterator can be created with the [`Regex::find_iter`] method. 228 | #[derive(Debug)] 229 | pub struct FindMatches<'r, 'c, C: Cursor> { 230 | re: &'r Regex, 231 | cache: &'c mut Cache, 232 | it: iter::Searcher, 233 | } 234 | 235 | impl<'r, 'c, C: Cursor> Iterator for FindMatches<'r, 'c, C> { 236 | type Item = Match; 237 | 238 | #[inline] 239 | fn next(&mut self) -> Option { 240 | let FindMatches { re, ref mut cache, ref mut it } = *self; 241 | it.advance(|input| try_search(re, cache, input)) 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /src/engines/hybrid/test.rs: -------------------------------------------------------------------------------- 1 | use proptest::proptest; 2 | 3 | use crate::engines::hybrid::find_iter; 4 | use crate::input::Input; 5 | 6 | #[test] 7 | fn searcher() { 8 | let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap(); 9 | let regex = super::Regex::builder() 10 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) 11 | .build("vec") 12 | .unwrap(); 13 | let mut cache = regex.create_cache(); 14 | let rope = ropey::Rope::from_str(&text); 15 | let matches: Vec<_> = find_iter(®ex, &mut cache, Input::new(&rope)) 16 | .map(|range| rope.byte_slice(range.range())) 17 | .collect(); 18 | assert_eq!(matches.len(), 68); 19 | } 20 | 21 | #[test] 22 | fn anchor() { 23 | let haystack = ":a"; 24 | let needle = "$|:"; 25 | let foo = ropey::Rope::from_str(haystack); 26 | let regex = super::Regex::builder() 27 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true).unicode(false)) 28 | .build(needle) 29 | .unwrap(); 30 | let mut cache1 = regex.create_cache(); 31 | let mut cache2 = regex.create_cache(); 32 | let iter1: Vec<_> = regex.find_iter(&mut cache1, haystack).collect(); 33 | let iter2: 
Vec<_> = find_iter(®ex, &mut cache2, Input::new(&foo)).collect(); 34 | assert_eq!(iter1, iter2); 35 | } 36 | 37 | #[test] 38 | fn hotloop_transition() { 39 | let haystack = "Σ /ⶠaAA ﷏00AAΣ/എ"; 40 | let needle = "/"; 41 | let foo = ropey::Rope::from_str(haystack); 42 | let regex = super::Regex::builder() 43 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) 44 | .build(needle) 45 | .unwrap(); 46 | let mut cache1 = regex.create_cache(); 47 | let mut cache2 = regex.create_cache(); 48 | let iter1: Vec<_> = regex.find_iter(&mut cache1, haystack).collect(); 49 | let iter2: Vec<_> = find_iter(®ex, &mut cache2, Input::new(&foo)).collect(); 50 | assert_eq!(iter1, iter2); 51 | } 52 | 53 | proptest! { 54 | #[test] 55 | fn matches(mut haystack: String, needle: String) { 56 | haystack = haystack.repeat(1024); 57 | let foo = ropey::Rope::from_str(&haystack); 58 | let Ok(regex) = super::Regex::builder() 59 | .syntax(regex_automata::util::syntax::Config::new() 60 | .case_insensitive(true) 61 | ) 62 | .build(&needle) else { 63 | return Ok(()) 64 | }; 65 | let mut cache1 = regex.create_cache(); 66 | let mut cache2 = regex.create_cache(); 67 | let iter1 = regex.find_iter(&mut cache1, &haystack); 68 | let iter2 = find_iter(®ex, &mut cache2, Input::new(&foo)); 69 | crate::util::iter::prop_assert_eq(iter1, iter2)?; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/engines/meta/error.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{nfa, MatchError, MatchErrorKind, PatternID}; 2 | use regex_syntax::{ast, hir}; 3 | 4 | /// An error that occurs when construction of a `Regex` fails. 5 | /// 6 | /// A build error is generally a result of one of two possible failure 7 | /// modes. First is a parse or syntax error in the concrete syntax of a 8 | /// pattern. 
Second is that the construction of the underlying regex matcher 9 | /// fails, usually because it gets too big with respect to limits like 10 | /// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit). 11 | /// 12 | /// This error provides very little introspection capabilities. You can: 13 | /// 14 | /// * Ask for the [`PatternID`] of the pattern that caused an error, if one 15 | /// is available. This is available for things like syntax errors, but not for 16 | /// cases where build limits are exceeded. 17 | /// * Ask for the underlying syntax error, but only if the error is a syntax 18 | /// error. 19 | /// * Ask for a human readable message corresponding to the underlying error. 20 | /// * The `BuildError::source` method (from the `std::error::Error` 21 | /// trait implementation) may be used to query for an underlying error if one 22 | /// exists. There are no API guarantees about which error is returned. 23 | /// 24 | /// When the `std` feature is enabled, this implements `std::error::Error`. 25 | #[derive(Clone, Debug)] 26 | pub struct BuildError { 27 | kind: BuildErrorKind, 28 | } 29 | 30 | #[derive(Clone, Debug)] 31 | enum BuildErrorKind { 32 | Syntax { pid: PatternID, err: regex_syntax::Error }, 33 | NFA(nfa::thompson::BuildError), 34 | } 35 | 36 | impl BuildError { 37 | /// If it is known which pattern ID caused this build error to occur, then 38 | /// this method returns it. 39 | /// 40 | /// Some errors are not associated with a particular pattern. However, any 41 | /// errors that occur as part of parsing a pattern are guaranteed to be 42 | /// associated with a pattern ID. 43 | /// 44 | /// # Example 45 | /// 46 | /// ``` 47 | /// use regex_automata::{meta::Regex, PatternID}; 48 | /// 49 | /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err(); 50 | /// assert_eq!(Some(PatternID::must(2)), err.pattern()); 51 | /// ``` 52 | pub fn pattern(&self) -> Option { 53 | match self.kind { 54 | BuildErrorKind::Syntax { pid, .. 
} => Some(pid), 55 | _ => None, 56 | } 57 | } 58 | 59 | /// If this error occurred because the regex exceeded the configured size 60 | /// limit before being built, then this returns the configured size limit. 61 | /// 62 | /// The limit returned is what was configured, and corresponds to the 63 | /// maximum amount of heap usage in bytes. 64 | pub fn size_limit(&self) -> Option { 65 | match self.kind { 66 | BuildErrorKind::NFA(ref err) => err.size_limit(), 67 | _ => None, 68 | } 69 | } 70 | 71 | /// If this error corresponds to a syntax error, then a reference to it is 72 | /// returned by this method. 73 | pub fn syntax_error(&self) -> Option<®ex_syntax::Error> { 74 | match self.kind { 75 | BuildErrorKind::Syntax { ref err, .. } => Some(err), 76 | _ => None, 77 | } 78 | } 79 | 80 | pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError { 81 | let err = regex_syntax::Error::from(err); 82 | BuildError { kind: BuildErrorKind::Syntax { pid, err } } 83 | } 84 | 85 | pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError { 86 | let err = regex_syntax::Error::from(err); 87 | BuildError { kind: BuildErrorKind::Syntax { pid, err } } 88 | } 89 | 90 | pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError { 91 | BuildError { kind: BuildErrorKind::NFA(err) } 92 | } 93 | } 94 | 95 | impl std::error::Error for BuildError { 96 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 97 | match self.kind { 98 | BuildErrorKind::Syntax { ref err, .. } => Some(err), 99 | BuildErrorKind::NFA(ref err) => Some(err), 100 | } 101 | } 102 | } 103 | 104 | impl core::fmt::Display for BuildError { 105 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 106 | match self.kind { 107 | BuildErrorKind::Syntax { pid, .. 
} => { 108 | write!(f, "error parsing pattern {}", pid.as_usize()) 109 | } 110 | BuildErrorKind::NFA(_) => write!(f, "error building NFA"), 111 | } 112 | } 113 | } 114 | 115 | /// An error that occurs when a regex engine "gives up" for some reason before 116 | /// finishing a search. Usually this occurs because of heuristic Unicode word 117 | /// boundary support or because of ineffective cache usage in the lazy DFA. 118 | /// 119 | /// When this error occurs, callers should retry the regex search with a 120 | /// different regex engine. 121 | /// 122 | /// Note that this has convenient `From` impls that will automatically 123 | /// convert a `MatchError` into this error. This works because the meta 124 | /// regex engine internals guarantee that errors like `HaystackTooLong` and 125 | /// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and 126 | /// `GaveUp`, which both correspond to this "failure" error. 127 | #[derive(Debug)] 128 | pub(crate) struct RetryFailError { 129 | offset: usize, 130 | } 131 | 132 | impl RetryFailError { 133 | pub(crate) fn from_offset(offset: usize) -> RetryFailError { 134 | RetryFailError { offset } 135 | } 136 | } 137 | 138 | impl std::error::Error for RetryFailError {} 139 | 140 | impl core::fmt::Display for RetryFailError { 141 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 142 | write!(f, "regex engine failed at offset {:?}", self.offset) 143 | } 144 | } 145 | 146 | impl From for RetryFailError { 147 | fn from(merr: MatchError) -> RetryFailError { 148 | use MatchErrorKind::*; 149 | 150 | match *merr.kind() { 151 | Quit { offset, .. } => RetryFailError::from_offset(offset), 152 | GaveUp { offset } => RetryFailError::from_offset(offset), 153 | // These can never occur because we avoid them by construction 154 | // or with higher level control flow logic. 
For example, the 155 | // backtracker's wrapper will never hand out a backtracker engine 156 | // when the haystack would be too long. 157 | _ => { 158 | unreachable!("found impossible error in meta engine: {}", merr) 159 | } 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/engines/meta/literal.rs: -------------------------------------------------------------------------------- 1 | use std::{vec, vec::Vec}; 2 | 3 | use log::debug; 4 | use regex_automata::MatchKind; 5 | use regex_syntax::hir::Hir; 6 | 7 | use crate::engines::meta::regex::RegexInfo; 8 | 9 | /// Pull out an alternation of literals from the given sequence of HIR 10 | /// expressions. 11 | /// 12 | /// There are numerous ways for this to fail. Generally, this only applies 13 | /// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there 14 | /// are "too few" alternates, in which case, the regex engine is likely faster. 15 | /// 16 | /// And currently, this only returns something when 'hirs.len() == 1'. 17 | pub(crate) fn alternation_literals(info: &RegexInfo, hirs: &[&Hir]) -> Option>> { 18 | use regex_syntax::hir::{HirKind, Literal}; 19 | 20 | // Might as well skip the work below if we know we can't build an 21 | // Aho-Corasick searcher. 22 | if !cfg!(feature = "perf-literal-multisubstring") { 23 | return None; 24 | } 25 | // This is pretty hacky, but basically, if `is_alternation_literal` is 26 | // true, then we can make several assumptions about the structure of our 27 | // HIR. This is what justifies the `unreachable!` statements below. 
28 | if hirs.len() != 1 29 | || !info.props()[0].look_set().is_empty() 30 | || info.props()[0].explicit_captures_len() > 0 31 | || !info.props()[0].is_alternation_literal() 32 | || info.config().get_match_kind() != MatchKind::LeftmostFirst 33 | { 34 | return None; 35 | } 36 | let hir = &hirs[0]; 37 | let alts = match *hir.kind() { 38 | HirKind::Alternation(ref alts) => alts, 39 | _ => return None, // one literal isn't worth it 40 | }; 41 | 42 | let mut lits = vec![]; 43 | for alt in alts { 44 | let mut lit = vec![]; 45 | match *alt.kind() { 46 | HirKind::Literal(Literal(ref bytes)) => lit.extend_from_slice(bytes), 47 | HirKind::Concat(ref exprs) => { 48 | for e in exprs { 49 | match *e.kind() { 50 | HirKind::Literal(Literal(ref bytes)) => { 51 | lit.extend_from_slice(bytes); 52 | } 53 | _ => unreachable!("expected literal, got {:?}", e), 54 | } 55 | } 56 | } 57 | _ => unreachable!("expected literal or concat, got {:?}", alt), 58 | } 59 | lits.push(lit); 60 | } 61 | // Why do this? Well, when the number of literals is small, it's likely 62 | // that we'll use the lazy DFA which is in turn likely to be faster than 63 | // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have 64 | // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use 65 | // the latter because it is so hungry (in time and space), and the former 66 | // is decently fast, but not as fast as a well oiled lazy DFA. 67 | // 68 | // However, once the number starts getting large, the lazy DFA is likely 69 | // to start thrashing because of the modest default cache size. When 70 | // exactly does this happen? Dunno. But at whatever point that is (we make 71 | // a guess below based on ad hoc benchmarking), we'll want to cut over to 72 | // Aho-Corasick, where even the contiguous NFA is likely to do much better. 
73 | if lits.len() < 3000 { 74 | debug!("skipping Aho-Corasick because there are too few literals"); 75 | return None; 76 | } 77 | Some(lits) 78 | } 79 | -------------------------------------------------------------------------------- /src/engines/meta/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Provides a regex matcher that composes several other regex matchers 3 | automatically. 4 | 5 | This module is home to a meta [`Regex`], which provides a convenient high 6 | level API for executing regular expressions in linear time. 7 | 8 | # Comparison with the `regex` crate 9 | 10 | A meta `Regex` is the implementation used directly by the `regex` crate. 11 | Indeed, the `regex` crate API is essentially just a light wrapper over a meta 12 | `Regex`. This means that if you need the full flexibility offered by this 13 | API, then you should be able to switch to using this API directly without 14 | any changes in match semantics or syntax. However, there are some API level 15 | differences: 16 | 17 | * The `regex` crate API returns match objects that include references to the 18 | haystack itself, which in turn makes it easy to access the matching strings 19 | without having to slice the haystack yourself. In contrast, a meta `Regex` 20 | returns match objects that only have offsets in them. 21 | * At time of writing, a meta `Regex` doesn't have some of the convenience 22 | routines that the `regex` crate has, such as replacements. Note though that 23 | [`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string) 24 | will handle the replacement string interpolation for you. 25 | * A meta `Regex` supports the [`Input`](crate::Input) abstraction, which 26 | provides a way to configure a search in more ways than is supported by the 27 | `regex` crate. 
For example, [`Input::anchored`](crate::Input::anchored) can
be used to run an anchored search, regardless of whether the pattern is itself
anchored with a `^`.
* A meta `Regex` supports multi-pattern searching everywhere.
Indeed, every [`Match`](crate::Match) returned by the search APIs
include a [`PatternID`](crate::PatternID) indicating which pattern
matched. In the single pattern case, all matches correspond to
[`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate
has distinct `Regex` and a `RegexSet` APIs. The former only supports a single
pattern, while the latter supports multiple patterns but cannot report the
offsets of a match.
* A meta `Regex` provides the explicit capability of bypassing its internal
memory pool for automatically acquiring mutable scratch space required by its
internal regex engines. Namely, a [`Cache`] can be explicitly provided to lower
level routines such as [`Regex::search_with`].

*/

pub use self::regex::{Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split, SplitN};
pub use regex_automata::meta::BuildError;

mod error;
// mod limited;
mod literal;
mod regex;
// mod reverse_inner;
// mod stopat;
mod strategy;
mod wrappers;
-------------------------------------------------------------------------------- /src/engines/meta/wrappers.rs: --------------------------------------------------------------------------------
/*!
This module contains a boat load of wrappers around each of our internal regex
engines. They encapsulate a few things:

1. The wrappers manage the conditional existence of the regex engine. Namely,
the PikeVM is the only required regex engine. The rest are optional. These
wrappers present a uniform API regardless of which engines are available. And
availability might be determined by compile time features or by dynamic
configuration via `meta::Config`. Encapsulating the conditional compilation
features is in particular a huge simplification for the higher level code that
composes these engines.
2. The wrappers manage construction of each engine, including skipping it if
the engine is unavailable or configured to not be used.
3. The wrappers manage whether an engine *can* be used for a particular
search configuration. For example, `BoundedBacktracker::get` only returns a
backtracking engine when the haystack is bigger than the maximum supported
length. The wrappers also sometimes take a position on when an engine *ought*
to be used, but only in cases where the logic is extremely local to the engine
itself. Otherwise, things like "choose between the backtracker and the one-pass
DFA" are managed by the higher level meta strategy code.

There are also corresponding wrappers for the various `Cache` types for each
regex engine that needs them. If an engine is unavailable or not used, then a
cache for it will *not* actually be allocated.
25 | */ 26 | 27 | use log::debug; 28 | use regex_automata::nfa::thompson::NFA; 29 | use regex_automata::util::prefilter::Prefilter; 30 | use regex_automata::util::primitives::NonMaxUsize; 31 | use regex_automata::{dfa, hybrid, HalfMatch, Match, MatchKind, PatternID}; 32 | 33 | use crate::cursor::Cursor; 34 | use crate::engines::meta::error::{BuildError, RetryFailError}; 35 | use crate::engines::meta::regex::RegexInfo; 36 | use crate::engines::pikevm; 37 | use crate::Input; 38 | 39 | #[derive(Debug)] 40 | pub(crate) struct PikeVM(PikeVMEngine); 41 | 42 | impl PikeVM { 43 | pub(crate) fn new( 44 | info: &RegexInfo, 45 | pre: Option, 46 | nfa: &NFA, 47 | ) -> Result { 48 | PikeVMEngine::new(info, pre, nfa).map(PikeVM) 49 | } 50 | 51 | pub(crate) fn create_cache(&self) -> PikeVMCache { 52 | PikeVMCache::new(self) 53 | } 54 | 55 | #[cfg_attr(feature = "perf-inline", inline(always))] 56 | pub(crate) fn get(&self) -> &PikeVMEngine { 57 | &self.0 58 | } 59 | } 60 | 61 | #[derive(Debug)] 62 | pub(crate) struct PikeVMEngine(pikevm::PikeVM); 63 | 64 | impl PikeVMEngine { 65 | pub(crate) fn new( 66 | info: &RegexInfo, 67 | pre: Option, 68 | nfa: &NFA, 69 | ) -> Result { 70 | let pikevm_config = 71 | pikevm::Config::new().match_kind(info.config().get_match_kind()).prefilter(pre); 72 | let engine = pikevm::Builder::new() 73 | .configure(pikevm_config) 74 | .build_from_nfa(nfa.clone()) 75 | .map_err(BuildError::nfa)?; 76 | debug!("PikeVM built"); 77 | Ok(PikeVMEngine(engine)) 78 | } 79 | 80 | #[cfg_attr(feature = "perf-inline", inline(always))] 81 | pub(crate) fn is_match(&self, cache: &mut PikeVMCache, input: &mut Input) -> bool { 82 | crate::engines::pikevm::is_match(&self.0, cache.0.as_mut().unwrap(), input) 83 | } 84 | 85 | #[cfg_attr(feature = "perf-inline", inline(always))] 86 | pub(crate) fn search_slots( 87 | &self, 88 | cache: &mut PikeVMCache, 89 | input: &mut Input, 90 | slots: &mut [Option], 91 | ) -> Option { 92 | crate::engines::pikevm::search_slots(&self.0, 
cache.0.as_mut().unwrap(), input, slots) 93 | } 94 | 95 | // #[cfg_attr(feature = "perf-inline", inline(always))] 96 | // pub(crate) fn which_overlapping_matches( 97 | // &self, 98 | // cache: &mut PikeVMCache, 99 | // input: &mut Input, 100 | // patset: &mut PatternSet, 101 | // ) { 102 | // self.0.which_overlapping_matches(cache.0.as_mut().unwrap(), input, patset) 103 | // } 104 | } 105 | 106 | #[derive(Clone, Debug)] 107 | pub(crate) struct PikeVMCache(Option); 108 | 109 | impl PikeVMCache { 110 | pub(crate) fn none() -> PikeVMCache { 111 | PikeVMCache(None) 112 | } 113 | 114 | pub(crate) fn new(builder: &PikeVM) -> PikeVMCache { 115 | PikeVMCache(Some(pikevm::Cache::new(&builder.get().0))) 116 | } 117 | 118 | pub(crate) fn reset(&mut self, builder: &PikeVM) { 119 | self.0.as_mut().unwrap().reset(&builder.get().0); 120 | } 121 | 122 | pub(crate) fn memory_usage(&self) -> usize { 123 | self.0.as_ref().map_or(0, |c| c.memory_usage()) 124 | } 125 | } 126 | 127 | #[derive(Debug)] 128 | pub(crate) struct Hybrid(Option); 129 | 130 | impl Hybrid { 131 | pub(crate) fn none() -> Hybrid { 132 | Hybrid(None) 133 | } 134 | 135 | pub(crate) fn new(info: &RegexInfo, pre: Option, nfa: &NFA, nfarev: &NFA) -> Hybrid { 136 | Hybrid(HybridEngine::new(info, pre, nfa, nfarev)) 137 | } 138 | 139 | pub(crate) fn create_cache(&self) -> HybridCache { 140 | HybridCache::new(self) 141 | } 142 | 143 | #[cfg_attr(feature = "perf-inline", inline(always))] 144 | pub(crate) fn get(&self, _input: &mut Input) -> Option<&HybridEngine> { 145 | let engine = self.0.as_ref()?; 146 | Some(engine) 147 | } 148 | 149 | pub(crate) fn is_some(&self) -> bool { 150 | self.0.is_some() 151 | } 152 | } 153 | 154 | #[derive(Debug)] 155 | pub(crate) struct HybridEngine(hybrid::regex::Regex); 156 | 157 | impl HybridEngine { 158 | pub(crate) fn new( 159 | info: &RegexInfo, 160 | pre: Option, 161 | nfa: &NFA, 162 | nfarev: &NFA, 163 | ) -> Option { 164 | { 165 | if !info.config().get_hybrid() { 166 | return None; 
167 | } 168 | let dfa_config = hybrid::dfa::Config::new() 169 | .match_kind(info.config().get_match_kind()) 170 | .prefilter(pre.clone()) 171 | // Enabling this is necessary for ensuring we can service any 172 | // kind of 'Input' search without error. For the lazy DFA, 173 | // this is not particularly costly, since the start states are 174 | // generated lazily. 175 | .starts_for_each_pattern(true) 176 | .byte_classes(info.config().get_byte_classes()) 177 | .unicode_word_boundary(true) 178 | .specialize_start_states(pre.is_some()) 179 | .cache_capacity(info.config().get_hybrid_cache_capacity()) 180 | // This makes it possible for building a lazy DFA to 181 | // fail even though the NFA has already been built. Namely, 182 | // if the cache capacity is too small to fit some minimum 183 | // number of states (which is small, like 4 or 5), then the 184 | // DFA will refuse to build. 185 | // 186 | // We shouldn't enable this to make building always work, since 187 | // this could cause the allocation of a cache bigger than the 188 | // provided capacity amount. 189 | // 190 | // This is effectively the only reason why building a lazy DFA 191 | // could fail. If it does, then we simply suppress the error 192 | // and return None. 193 | .skip_cache_capacity_check(false) 194 | // This and enabling heuristic Unicode word boundary support 195 | // above make it so the lazy DFA can quit at match time. 
196 | .minimum_cache_clear_count(Some(3)) 197 | .minimum_bytes_per_state(Some(10)); 198 | let result = hybrid::dfa::Builder::new() 199 | .configure(dfa_config.clone()) 200 | .build_from_nfa(nfa.clone()); 201 | let fwd = match result { 202 | Ok(fwd) => fwd, 203 | Err(_err) => { 204 | debug!("forward lazy DFA failed to build: {}", _err); 205 | return None; 206 | } 207 | }; 208 | let result = hybrid::dfa::Builder::new() 209 | .configure( 210 | dfa_config 211 | .clone() 212 | .match_kind(MatchKind::All) 213 | .prefilter(None) 214 | .specialize_start_states(false), 215 | ) 216 | .build_from_nfa(nfarev.clone()); 217 | let rev = match result { 218 | Ok(rev) => rev, 219 | Err(_err) => { 220 | debug!("reverse lazy DFA failed to build: {}", _err); 221 | return None; 222 | } 223 | }; 224 | let engine = hybrid::regex::Builder::new().build_from_dfas(fwd, rev); 225 | debug!("lazy DFA built"); 226 | Some(HybridEngine(engine)) 227 | } 228 | } 229 | 230 | #[cfg_attr(feature = "perf-inline", inline(always))] 231 | pub(crate) fn try_search( 232 | &self, 233 | cache: &mut HybridCache, 234 | input: &mut Input, 235 | ) -> Result, RetryFailError> { 236 | let cache = cache.0.as_mut().unwrap(); 237 | crate::engines::hybrid::try_search(&self.0, cache, input).map_err(|e| e.into()) 238 | } 239 | 240 | #[cfg_attr(feature = "perf-inline", inline(always))] 241 | pub(crate) fn try_search_half_fwd( 242 | &self, 243 | cache: &mut HybridCache, 244 | input: &mut Input, 245 | ) -> Result, RetryFailError> { 246 | let fwd = self.0.forward(); 247 | let fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0; 248 | crate::engines::hybrid::try_search_fwd(fwd, fwdcache, input).map_err(|e| e.into()) 249 | } 250 | 251 | // #[cfg_attr(feature = "perf-inline", inline(always))] 252 | // pub(crate) fn try_search_half_fwd_stopat( 253 | // &self, 254 | // cache: &mut HybridCache, 255 | // input: &mut Input, 256 | // ) -> Result, RetryFailError> { 257 | // let dfa = self.0.forward(); 258 | // let mut cache = 
cache.0.as_mut().unwrap().as_parts_mut().0; 259 | // crate::meta::stopat::hybrid_try_search_half_fwd(dfa, &mut cache, input) 260 | // } 261 | 262 | #[cfg_attr(feature = "perf-inline", inline(always))] 263 | pub(crate) fn try_search_half_rev( 264 | &self, 265 | cache: &mut HybridCache, 266 | input: &mut Input, 267 | ) -> Result, RetryFailError> { 268 | let rev = self.0.reverse(); 269 | let revcache = cache.0.as_mut().unwrap().as_parts_mut().1; 270 | crate::engines::hybrid::try_search_rev(rev, revcache, input).map_err(|e| e.into()) 271 | } 272 | 273 | // #[cfg_attr(feature = "perf-inline", inline(always))] 274 | // pub(crate) fn try_search_half_rev_limited( 275 | // &self, 276 | // cache: &mut HybridCache, 277 | // input: &mut Input, 278 | // min_start: usize, 279 | // ) -> Result, RetryError> { 280 | // let dfa = self.0.reverse(); 281 | // let mut cache = cache.0.as_mut().unwrap().as_parts_mut().1; 282 | // crate::meta::limited::hybrid_try_search_half_rev(dfa, &mut cache, input, min_start) 283 | // } 284 | 285 | // #[inline] 286 | // pub(crate) fn try_which_overlapping_matches( 287 | // &self, 288 | // cache: &mut HybridCache, 289 | // input: &mut Input, 290 | // patset: &mut PatternSet, 291 | // ) -> Result<(), RetryFailError> { 292 | // let fwd = self.0.forward(); 293 | // let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0; 294 | // fwd.try_which_overlapping_matches(&mut fwdcache, input, patset).map_err(|e| e.into()) 295 | // } 296 | } 297 | 298 | #[derive(Clone, Debug)] 299 | pub(crate) struct HybridCache(Option); 300 | 301 | impl HybridCache { 302 | pub(crate) fn none() -> HybridCache { 303 | HybridCache(None) 304 | } 305 | 306 | pub(crate) fn new(builder: &Hybrid) -> HybridCache { 307 | HybridCache(builder.0.as_ref().map(|e| e.0.create_cache())) 308 | } 309 | 310 | pub(crate) fn reset(&mut self, builder: &Hybrid) { 311 | if let Some(ref e) = builder.0 { 312 | self.0.as_mut().unwrap().reset(&e.0); 313 | } 314 | } 315 | 316 | pub(crate) fn 
memory_usage(&self) -> usize { 317 | { 318 | self.0.as_ref().map_or(0, |c| c.memory_usage()) 319 | } 320 | } 321 | } 322 | 323 | #[derive(Debug)] 324 | pub(crate) struct DFA(Option); 325 | 326 | impl DFA { 327 | pub(crate) fn none() -> DFA { 328 | DFA(None) 329 | } 330 | 331 | pub(crate) fn new(info: &RegexInfo, pre: Option, nfa: &NFA, nfarev: &NFA) -> DFA { 332 | DFA(DFAEngine::new(info, pre, nfa, nfarev)) 333 | } 334 | 335 | #[cfg_attr(feature = "perf-inline", inline(always))] 336 | pub(crate) fn get(&self, _input: &mut Input) -> Option<&DFAEngine> { 337 | let engine = self.0.as_ref()?; 338 | Some(engine) 339 | } 340 | 341 | pub(crate) fn is_some(&self) -> bool { 342 | self.0.is_some() 343 | } 344 | 345 | pub(crate) fn memory_usage(&self) -> usize { 346 | self.0.as_ref().map_or(0, |e| e.memory_usage()) 347 | } 348 | } 349 | 350 | #[derive(Debug)] 351 | pub(crate) struct DFAEngine(dfa::regex::Regex); 352 | 353 | impl DFAEngine { 354 | pub(crate) fn new( 355 | info: &RegexInfo, 356 | pre: Option, 357 | nfa: &NFA, 358 | nfarev: &NFA, 359 | ) -> Option { 360 | { 361 | if !info.config().get_dfa() { 362 | return None; 363 | } 364 | // If our NFA is anything but small, don't even bother with a DFA. 365 | if let Some(state_limit) = info.config().get_dfa_state_limit() { 366 | if nfa.states().len() > state_limit { 367 | debug!( 368 | "skipping full DFA because NFA has {} states, \ 369 | which exceeds the heuristic limit of {}", 370 | nfa.states().len(), 371 | state_limit, 372 | ); 373 | return None; 374 | } 375 | } 376 | // We cut the size limit in four because the total heap used by 377 | // DFA construction is determinization aux memory and the DFA 378 | // itself, and those things are configured independently in the 379 | // lower level DFA builder API. And then split that in two because 380 | // of forward and reverse DFAs. 
381 | let size_limit = info.config().get_dfa_size_limit().map(|n| n / 4); 382 | let dfa_config = dfa::dense::Config::new() 383 | .match_kind(info.config().get_match_kind()) 384 | .prefilter(pre.clone()) 385 | // Enabling this is necessary for ensuring we can service any 386 | // kind of 'Input' search without error. For the full DFA, this 387 | // can be quite costly. But since we have such a small bound 388 | // on the size of the DFA, in practice, any multl-regexes are 389 | // probably going to blow the limit anyway. 390 | .starts_for_each_pattern(true) 391 | .byte_classes(info.config().get_byte_classes()) 392 | .unicode_word_boundary(true) 393 | .specialize_start_states(pre.is_some()) 394 | .determinize_size_limit(size_limit) 395 | .dfa_size_limit(size_limit); 396 | let result = 397 | dfa::dense::Builder::new().configure(dfa_config.clone()).build_from_nfa(nfa); 398 | let fwd = match result { 399 | Ok(fwd) => fwd, 400 | Err(_err) => { 401 | debug!("forward full DFA failed to build: {}", _err); 402 | return None; 403 | } 404 | }; 405 | let result = dfa::dense::Builder::new() 406 | .configure( 407 | dfa_config 408 | .clone() 409 | // We never need unanchored reverse searches, so 410 | // there's no point in building it into the DFA, which 411 | // WILL take more space. (This isn't done for the lazy 412 | // DFA because the DFA is, well, lazy. It doesn't pay 413 | // the cost for supporting unanchored searches unless 414 | // you actually do an unanchored search, which we 415 | // don't.) 
416 | .start_kind(dfa::StartKind::Anchored) 417 | .match_kind(MatchKind::All) 418 | .prefilter(None) 419 | .specialize_start_states(false), 420 | ) 421 | .build_from_nfa(nfarev); 422 | let rev = match result { 423 | Ok(rev) => rev, 424 | Err(_err) => { 425 | debug!("reverse full DFA failed to build: {}", _err); 426 | return None; 427 | } 428 | }; 429 | let engine = dfa::regex::Builder::new().build_from_dfas(fwd, rev); 430 | debug!( 431 | "fully compiled forward and reverse DFAs built, {} bytes", 432 | engine.forward().memory_usage() + engine.reverse().memory_usage(), 433 | ); 434 | Some(DFAEngine(engine)) 435 | } 436 | } 437 | 438 | #[cfg_attr(feature = "perf-inline", inline(always))] 439 | pub(crate) fn try_search( 440 | &self, 441 | input: &mut Input, 442 | ) -> Result, RetryFailError> { 443 | crate::engines::dfa::try_search(&self.0, input).map_err(|err| err.into()) 444 | } 445 | 446 | #[cfg_attr(feature = "perf-inline", inline(always))] 447 | pub(crate) fn try_search_half_fwd( 448 | &self, 449 | input: &mut Input, 450 | ) -> Result, RetryFailError> { 451 | crate::engines::dfa::try_search_fwd(self.0.forward(), input).map_err(|e| e.into()) 452 | } 453 | 454 | // #[cfg_attr(feature = "perf-inline", inline(always))] 455 | // pub(crate) fn try_search_half_fwd_stopat( 456 | // &self, 457 | // input: &mut Input, 458 | // ) -> Result, RetryFailError> { 459 | // let dfa = self.0.forward(); 460 | // crate::meta::stopat::dfa_try_search_half_fwd(dfa, input) 461 | // } 462 | 463 | #[cfg_attr(feature = "perf-inline", inline(always))] 464 | pub(crate) fn try_search_half_rev( 465 | &self, 466 | input: &mut Input, 467 | ) -> Result, RetryFailError> { 468 | crate::engines::dfa::try_search_rev(self.0.reverse(), input).map_err(|e| e.into()) 469 | } 470 | 471 | // #[cfg_attr(feature = "perf-inline", inline(always))] 472 | // pub(crate) fn try_search_half_rev_limited( 473 | // &self, 474 | // input: &mut Input, 475 | // min_start: usize, 476 | // ) -> Result, RetryError> { 477 | // 
let dfa = self.0.reverse(); 478 | // crate::meta::limited::dfa_try_search_half_rev(dfa, input, min_start) 479 | // } 480 | 481 | // #[inline] 482 | // pub(crate) fn try_which_overlapping_matches( 483 | // &self, 484 | // input: &mut Input, 485 | // patset: &mut PatternSet, 486 | // ) -> Result<(), RetryFailError> { 487 | // use crate::dfa::Automaton; 488 | // self.0.forward().try_which_overlapping_matches(input, patset).map_err(|e| e.into()) 489 | // } 490 | 491 | pub(crate) fn memory_usage(&self) -> usize { 492 | self.0.forward().memory_usage() + self.0.reverse().memory_usage() 493 | } 494 | } 495 | 496 | // #[derive(Debug)] 497 | // pub(crate) struct ReverseHybrid(Option); 498 | 499 | // impl ReverseHybrid { 500 | // pub(crate) fn none() -> ReverseHybrid { 501 | // ReverseHybrid(None) 502 | // } 503 | 504 | // pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseHybrid { 505 | // ReverseHybrid(ReverseHybridEngine::new(info, nfarev)) 506 | // } 507 | 508 | // pub(crate) fn create_cache(&self) -> ReverseHybridCache { 509 | // ReverseHybridCache::new(self) 510 | // } 511 | 512 | // #[cfg_attr(feature = "perf-inline", inline(always))] 513 | // pub(crate) fn get(&self, _input: &mut Input) -> Option<&ReverseHybridEngine> { 514 | // let engine = self.0.as_ref()?; 515 | // Some(engine) 516 | // } 517 | // } 518 | 519 | // #[derive(Debug)] 520 | // pub(crate) struct ReverseHybridEngine(hybrid::dfa::DFA); 521 | 522 | // impl ReverseHybridEngine { 523 | // pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> Option { 524 | // if !info.config().get_hybrid() { 525 | // return None; 526 | // } 527 | // // Since we only use this for reverse searches, we can hard-code 528 | // // a number of things like match semantics, prefilters, starts 529 | // // for each pattern and so on. 
530 | // let dfa_config = hybrid::dfa::Config::new() 531 | // .match_kind(MatchKind::All) 532 | // .prefilter(None) 533 | // .starts_for_each_pattern(false) 534 | // .byte_classes(info.config().get_byte_classes()) 535 | // .unicode_word_boundary(true) 536 | // .specialize_start_states(false) 537 | // .cache_capacity(info.config().get_hybrid_cache_capacity()) 538 | // .skip_cache_capacity_check(false) 539 | // .minimum_cache_clear_count(Some(3)) 540 | // .minimum_bytes_per_state(Some(10)); 541 | // let result = 542 | // hybrid::dfa::Builder::new().configure(dfa_config).build_from_nfa(nfarev.clone()); 543 | // let rev = match result { 544 | // Ok(rev) => rev, 545 | // Err(_err) => { 546 | // debug!("lazy reverse DFA failed to build: {}", _err); 547 | // return None; 548 | // } 549 | // }; 550 | // debug!("lazy reverse DFA built"); 551 | // Some(ReverseHybridEngine(rev)) 552 | // } 553 | 554 | // #[cfg_attr(feature = "perf-inline", inline(always))] 555 | // pub(crate) fn try_search_half_rev_limited( 556 | // &self, 557 | // cache: &mut ReverseHybridCache, 558 | // input: &mut Input, 559 | // min_start: usize, 560 | // ) -> Result, RetryError> { 561 | // let dfa = &self.0; 562 | // let mut cache = cache.0.as_mut().unwrap(); 563 | // crate::meta::limited::hybrid_try_search_half_rev(dfa, &mut cache, input, min_start) 564 | // } 565 | // } 566 | 567 | // #[derive(Clone, Debug)] 568 | // pub(crate) struct ReverseHybridCache( 569 | // #[cfg(feature = "hybrid")] Option, 570 | // #[cfg(not(feature = "hybrid"))] (), 571 | // ); 572 | 573 | // impl ReverseHybridCache { 574 | // pub(crate) fn none() -> ReverseHybridCache { 575 | // #[cfg(feature = "hybrid")] 576 | // { 577 | // ReverseHybridCache(None) 578 | // } 579 | // #[cfg(not(feature = "hybrid"))] 580 | // { 581 | // ReverseHybridCache(()) 582 | // } 583 | // } 584 | 585 | // pub(crate) fn new(builder: &ReverseHybrid) -> ReverseHybridCache { 586 | // #[cfg(feature = "hybrid")] 587 | // { 588 | // 
ReverseHybridCache(builder.0.as_ref().map(|e| e.0.create_cache())) 589 | // } 590 | // #[cfg(not(feature = "hybrid"))] 591 | // { 592 | // ReverseHybridCache(()) 593 | // } 594 | // } 595 | 596 | // pub(crate) fn reset(&mut self, builder: &ReverseHybrid) { 597 | // #[cfg(feature = "hybrid")] 598 | // if let Some(ref e) = builder.0 { 599 | // self.0.as_mut().unwrap().reset(&e.0); 600 | // } 601 | // } 602 | 603 | // pub(crate) fn memory_usage(&self) -> usize { 604 | // #[cfg(feature = "hybrid")] 605 | // { 606 | // self.0.as_ref().map_or(0, |c| c.memory_usage()) 607 | // } 608 | // #[cfg(not(feature = "hybrid"))] 609 | // { 610 | // 0 611 | // } 612 | // } 613 | // } 614 | 615 | // #[derive(Debug)] 616 | // pub(crate) struct ReverseDFA(Option); 617 | 618 | // impl ReverseDFA { 619 | // pub(crate) fn none() -> ReverseDFA { 620 | // ReverseDFA(None) 621 | // } 622 | 623 | // pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseDFA { 624 | // ReverseDFA(ReverseDFAEngine::new(info, nfarev)) 625 | // } 626 | 627 | // #[cfg_attr(feature = "perf-inline", inline(always))] 628 | // pub(crate) fn get(&self, _input: &mut Input) -> Option<&ReverseDFAEngine> { 629 | // let engine = self.0.as_ref()?; 630 | // Some(engine) 631 | // } 632 | 633 | // pub(crate) fn is_some(&self) -> bool { 634 | // self.0.is_some() 635 | // } 636 | 637 | // pub(crate) fn memory_usage(&self) -> usize { 638 | // self.0.as_ref().map_or(0, |e| e.memory_usage()) 639 | // } 640 | // } 641 | 642 | // #[derive(Debug)] 643 | // pub(crate) struct ReverseDFAEngine( 644 | // #[cfg(feature = "dfa-build")] dfa::dense::DFA>, 645 | // #[cfg(not(feature = "dfa-build"))] (), 646 | // ); 647 | 648 | // impl ReverseDFAEngine { 649 | // pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> Option { 650 | // #[cfg(feature = "dfa-build")] 651 | // { 652 | // if !info.config().get_dfa() { 653 | // return None; 654 | // } 655 | // // If our NFA is anything but small, don't even bother with a DFA. 
656 | // if let Some(state_limit) = info.config().get_dfa_state_limit() { 657 | // if nfarev.states().len() > state_limit { 658 | // debug!( 659 | // "skipping full reverse DFA because NFA has {} states, \ 660 | // which exceeds the heuristic limit of {}", 661 | // nfarev.states().len(), 662 | // state_limit, 663 | // ); 664 | // return None; 665 | // } 666 | // } 667 | // // We cut the size limit in two because the total heap used by DFA 668 | // // construction is determinization aux memory and the DFA itself, 669 | // // and those things are configured independently in the lower level 670 | // // DFA builder API. 671 | // let size_limit = info.config().get_dfa_size_limit().map(|n| n / 2); 672 | // // Since we only use this for reverse searches, we can hard-code 673 | // // a number of things like match semantics, prefilters, starts 674 | // // for each pattern and so on. We also disable acceleration since 675 | // // it's incompatible with limited searches (which is the only 676 | // // operation we support for this kind of engine at the moment). 
677 | // let dfa_config = dfa::dense::Config::new() 678 | // .match_kind(MatchKind::All) 679 | // .prefilter(None) 680 | // .accelerate(false) 681 | // .start_kind(dfa::StartKind::Anchored) 682 | // .starts_for_each_pattern(false) 683 | // .byte_classes(info.config().get_byte_classes()) 684 | // .unicode_word_boundary(true) 685 | // .specialize_start_states(false) 686 | // .determinize_size_limit(size_limit) 687 | // .dfa_size_limit(size_limit); 688 | // let result = dfa::dense::Builder::new().configure(dfa_config).build_from_nfa(&nfarev); 689 | // let rev = match result { 690 | // Ok(rev) => rev, 691 | // Err(_err) => { 692 | // debug!("full reverse DFA failed to build: {}", _err); 693 | // return None; 694 | // } 695 | // }; 696 | // debug!("fully compiled reverse DFA built, {} bytes", rev.memory_usage()); 697 | // Some(ReverseDFAEngine(rev)) 698 | // } 699 | // #[cfg(not(feature = "dfa-build"))] 700 | // { 701 | // None 702 | // } 703 | // } 704 | 705 | // #[cfg_attr(feature = "perf-inline", inline(always))] 706 | // pub(crate) fn try_search_half_rev_limited( 707 | // &self, 708 | // input: &mut Input, 709 | // min_start: usize, 710 | // ) -> Result, RetryError> { 711 | // #[cfg(feature = "dfa-build")] 712 | // { 713 | // let dfa = &self.0; 714 | // crate::meta::limited::dfa_try_search_half_rev(dfa, input, min_start) 715 | // } 716 | // #[cfg(not(feature = "dfa-build"))] 717 | // { 718 | // // Impossible to reach because this engine is never constructed 719 | // // if the requisite features aren't enabled. 720 | // unreachable!() 721 | // } 722 | // } 723 | 724 | // pub(crate) fn memory_usage(&self) -> usize { 725 | // #[cfg(feature = "dfa-build")] 726 | // { 727 | // self.0.memory_usage() 728 | // } 729 | // #[cfg(not(feature = "dfa-build"))] 730 | // { 731 | // // Impossible to reach because this engine is never constructed 732 | // // if the requisite features aren't enabled. 
733 | // unreachable!() 734 | // } 735 | // } 736 | // } 737 | -------------------------------------------------------------------------------- /src/engines/pikevm/error.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::util::primitives::{PatternID, StateID}; 2 | use regex_automata::util::{captures, look}; 3 | 4 | /// An error that can occurred during the construction of a thompson NFA. 5 | /// 6 | /// This error does not provide many introspection capabilities. There are 7 | /// generally only two things you can do with it: 8 | /// 9 | /// * Obtain a human readable message via its `std::fmt::Display` impl. 10 | /// * Access an underlying [`regex_syntax::Error`] type from its `source` 11 | /// method via the `std::error::Error` trait. This error only occurs when using 12 | /// convenience routines for building an NFA directly from a pattern string. 13 | /// 14 | /// Otherwise, errors typically occur when a limit has been breeched. For 15 | /// example, if the total heap usage of the compiled NFA exceeds the limit 16 | /// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then 17 | /// building the NFA will fail. 18 | #[derive(Clone, Debug)] 19 | pub struct BuildError { 20 | kind: BuildErrorKind, 21 | } 22 | 23 | /// The kind of error that occurred during the construction of a thompson NFA. 24 | #[derive(Clone, Debug)] 25 | enum BuildErrorKind { 26 | /// An error that occurred while parsing a regular expression. Note that 27 | /// this error may be printed over multiple lines, and is generally 28 | /// intended to be end user readable on its own. 29 | Syntax(regex_syntax::Error), 30 | /// An error that occurs if the capturing groups provided to an NFA builder 31 | /// do not satisfy the documented invariants. For example, things like 32 | /// too many groups, missing groups, having the first (zeroth) group be 33 | /// named or duplicate group names within the same pattern. 
34 | Captures(captures::GroupInfoError), 35 | /// An error that occurs when an NFA contains a Unicode word boundary, but 36 | /// where the crate was compiled without the necessary data for dealing 37 | /// with Unicode word boundaries. 38 | Word(look::UnicodeWordBoundaryError), 39 | /// An error that occurs if too many patterns were given to the NFA 40 | /// compiler. 41 | TooManyPatterns { 42 | /// The number of patterns given, which exceeds the limit. 43 | given: usize, 44 | /// The limit on the number of patterns. 45 | limit: usize, 46 | }, 47 | /// An error that occurs if too states are produced while building an NFA. 48 | TooManyStates { 49 | /// The minimum number of states that are desired, which exceeds the 50 | /// limit. 51 | given: usize, 52 | /// The limit on the number of states. 53 | limit: usize, 54 | }, 55 | /// An error that occurs when NFA compilation exceeds a configured heap 56 | /// limit. 57 | ExceededSizeLimit { 58 | /// The configured limit, in bytes. 59 | limit: usize, 60 | }, 61 | /// An error that occurs when an invalid capture group index is added to 62 | /// the NFA. An "invalid" index can be one that would otherwise overflow 63 | /// a `usize` on the current target. 64 | InvalidCaptureIndex { 65 | /// The invalid index that was given. 66 | index: u32, 67 | }, 68 | /// An error that occurs when one tries to build an NFA simulation (such as 69 | /// the PikeVM) without any capturing groups. 70 | MissingCaptures, 71 | /// An error that occurs when one tries to build a reverse NFA with 72 | /// captures enabled. Currently, this isn't supported, but we probably 73 | /// should support it at some point. 74 | UnsupportedCaptures, 75 | } 76 | 77 | impl BuildError { 78 | /// If this error occurred because the NFA exceeded the configured size 79 | /// limit before being built, then this returns the configured size limit. 
80 | /// 81 | /// The limit returned is what was configured, and corresponds to the 82 | /// maximum amount of heap usage in bytes. 83 | pub fn size_limit(&self) -> Option { 84 | match self.kind { 85 | BuildErrorKind::ExceededSizeLimit { limit } => Some(limit), 86 | _ => None, 87 | } 88 | } 89 | 90 | fn kind(&self) -> &BuildErrorKind { 91 | &self.kind 92 | } 93 | 94 | pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError { 95 | BuildError { kind: BuildErrorKind::Syntax(err) } 96 | } 97 | 98 | pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError { 99 | BuildError { kind: BuildErrorKind::Captures(err) } 100 | } 101 | 102 | pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError { 103 | BuildError { kind: BuildErrorKind::Word(err) } 104 | } 105 | 106 | pub(crate) fn too_many_patterns(given: usize) -> BuildError { 107 | let limit = PatternID::LIMIT; 108 | BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } } 109 | } 110 | 111 | pub(crate) fn too_many_states(given: usize) -> BuildError { 112 | let limit = StateID::LIMIT; 113 | BuildError { kind: BuildErrorKind::TooManyStates { given, limit } } 114 | } 115 | 116 | pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError { 117 | BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } 118 | } 119 | 120 | pub(crate) fn invalid_capture_index(index: u32) -> BuildError { 121 | BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } } 122 | } 123 | 124 | pub(crate) fn missing_captures() -> BuildError { 125 | BuildError { kind: BuildErrorKind::MissingCaptures } 126 | } 127 | 128 | pub(crate) fn unsupported_captures() -> BuildError { 129 | BuildError { kind: BuildErrorKind::UnsupportedCaptures } 130 | } 131 | } 132 | 133 | impl std::error::Error for BuildError { 134 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 135 | match self.kind() { 136 | BuildErrorKind::Syntax(ref err) => Some(err), 137 | BuildErrorKind::Captures(ref err) => 
Some(err), 138 | _ => None, 139 | } 140 | } 141 | } 142 | 143 | impl core::fmt::Display for BuildError { 144 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 145 | match self.kind() { 146 | BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"), 147 | BuildErrorKind::Captures(_) => { 148 | write!(f, "error with capture groups") 149 | } 150 | BuildErrorKind::Word(_) => { 151 | write!(f, "NFA contains Unicode word boundary") 152 | } 153 | BuildErrorKind::TooManyPatterns { given, limit } => write!( 154 | f, 155 | "attempted to compile {} patterns, \ 156 | which exceeds the limit of {}", 157 | given, limit, 158 | ), 159 | BuildErrorKind::TooManyStates { given, limit } => write!( 160 | f, 161 | "attempted to compile {} NFA states, \ 162 | which exceeds the limit of {}", 163 | given, limit, 164 | ), 165 | BuildErrorKind::ExceededSizeLimit { limit } => { 166 | write!(f, "heap usage during NFA compilation exceeded limit of {}", limit,) 167 | } 168 | BuildErrorKind::InvalidCaptureIndex { index } => { 169 | write!(f, "capture group index {} is invalid (too big or discontinuous)", index,) 170 | } 171 | BuildErrorKind::MissingCaptures => write!( 172 | f, 173 | "operation requires the NFA to have capturing groups, \ 174 | but the NFA given contains none", 175 | ), 176 | BuildErrorKind::UnsupportedCaptures => write!( 177 | f, 178 | "currently captures must be disabled when compiling \ 179 | a reverse NFA", 180 | ), 181 | } 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/engines/pikevm/tests.rs: -------------------------------------------------------------------------------- 1 | use std::ops::RangeBounds; 2 | 3 | use proptest::{prop_assert_eq, proptest}; 4 | use regex_automata::nfa::thompson::pikevm::PikeVM; 5 | use regex_automata::nfa::thompson::Config; 6 | use regex_automata::util::escape::DebugHaystack; 7 | use regex_automata::util::syntax::Config as SyntaxConfig; 8 | 9 | use 
crate::engines::pikevm::find_iter; 10 | use crate::input::Input; 11 | use crate::test_rope::SingleByteChunks; 12 | 13 | use super::Cache; 14 | 15 | fn test(needle: &str, haystack: &[u8]) { 16 | test_with_bounds(needle, haystack, ..) 17 | } 18 | 19 | fn test_with_bounds(needle: &str, haystack: &[u8], bounds: impl RangeBounds + Clone) { 20 | for utf8 in [true, false] { 21 | let regex = PikeVM::builder() 22 | .syntax(SyntaxConfig::new().utf8(utf8)) 23 | .thompson(Config::new().utf8(utf8)) 24 | .build(needle) 25 | .unwrap(); 26 | let mut cache1 = regex.create_cache(); 27 | let mut cache2 = Cache::new(®ex); 28 | let input = regex_automata::Input::new(haystack).range(bounds.clone()); 29 | let iter1: Vec<_> = regex.find_iter(&mut cache1, input).collect(); 30 | let input = Input::new(SingleByteChunks::new(haystack)).range(bounds.clone()); 31 | let iter2: Vec<_> = find_iter(®ex, &mut cache2, input).collect(); 32 | assert_eq!(iter1, iter2, "matches of {needle} in {:?}", DebugHaystack(haystack)); 33 | } 34 | } 35 | 36 | #[test] 37 | fn smoke_test() { 38 | let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap(); 39 | let regex = 40 | PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build("vec").unwrap(); 41 | let mut cache = Cache::new(®ex); 42 | let rope = ropey::Rope::from_str(&text); 43 | let matches: Vec<_> = find_iter(®ex, &mut cache, Input::new(&rope)) 44 | .map(|range| rope.byte_slice(range.range())) 45 | .collect(); 46 | println!("found {matches:#?} in syntax.rs"); 47 | assert_eq!(matches.len(), 68); 48 | } 49 | 50 | #[test] 51 | fn any() { 52 | test(".", b" "); 53 | } 54 | 55 | #[test] 56 | fn look_around() { 57 | test("^bar", b"foobar"); 58 | test("foo$", b"foobar"); 59 | test(r"(?m)(?:^|a)+", b"a\naaa\n"); 60 | test_with_bounds(r"\b{end}", "𝛃".as_bytes(), 2..3); 61 | let haystack: String = 62 | (0..5 * 4096).map(|i| format!("foöbar foÖ{0}bar foö{0}bar", " ".repeat(i % 31))).collect(); 63 | let needle = r"\bfoö\b[ ]*\bbar\b"; 64 | 
test(needle, haystack.as_bytes()) 65 | } 66 | 67 | #[test] 68 | fn maybe_empty() { 69 | test(r"x*", b"x"); 70 | test(r"\bx*\b", b"x"); 71 | } 72 | 73 | proptest! { 74 | #[test] 75 | fn matches(haystack: String, needle: String) { 76 | let Ok(regex) = PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build(&needle) else { 77 | return Ok(()) 78 | }; 79 | let mut cache1 = regex.create_cache(); 80 | let mut cache2 = Cache::new(®ex); 81 | let iter1: Vec<_> = regex.find_iter(&mut cache1, &haystack).collect(); 82 | let iter2: Vec<_> = find_iter(®ex, &mut cache2, Input::new(SingleByteChunks::new(haystack.as_bytes()))).collect(); 83 | prop_assert_eq!(iter1, iter2); 84 | } 85 | #[test] 86 | fn matches_word(haystack: String, needle in r"\\b\PC+\\b") { 87 | let Ok(regex) = PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build(&needle) else { 88 | return Ok(()) 89 | }; 90 | let mut cache1 = regex.create_cache(); 91 | let mut cache2 = Cache::new(®ex); 92 | let iter1: Vec<_> = regex.find_iter(&mut cache1, &haystack).collect(); 93 | let iter2: Vec<_> = find_iter(®ex, &mut cache2, Input::new(SingleByteChunks::new(haystack.as_bytes()))).collect(); 94 | prop_assert_eq!(iter1, iter2); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | This crate provides routines for searching **discontiguous strings** for matches 3 | of a regular expression (aka "regex"). It is based on regex-automata and 4 | most of the code is adapted from the various crates in the 5 | [regex](https://github.com/rust-lang/regex) repository. 6 | 7 | It is intended as a prototype for upstream support for "streaming regex". The 8 | cursor based API in this crate is very similar to the API already exposed by 9 | `regex`/`regex-automata`. To that end a generic `Cursor` trait is provided that 10 | collections can implement. 
11 | 12 | A sketch of the cursor API is shown below. The string is yielded in multiple 13 | byte chunks. Calling advance moves the cursor to the next chunk. Calling 14 | backtrack moves the cursor a chunk back. Backtracking is required by this 15 | crate. That makes it unsuitable for searching fully unbuffered streams like 16 | bytes send over a TCP connection. 17 | 18 | ```rust_ignore 19 | pub trait Cursor { 20 | fn chunk(&self) -> &[u8] { .. } 21 | fn advance(&mut self) -> bool { .. } 22 | fn bracktrack(&mut self) -> bool { .. } 23 | } 24 | ``` 25 | 26 | Working on this crate showed me that regex backtracks a lot more than expected 27 | with most functionality fundamentally requiring backtracking. For network 28 | usecases that do not buffer their input the primary usecase would likely be 29 | detecting a match (without necessarily requiring the matched byte range). 30 | Such usecases can be covered by manually feeding bytes into the hybrid and DFA 31 | engines from the regex-automata crate. This approach also has the advantage 32 | of allowing the caller to pause the match (async) while waiting for more data 33 | allowing the caller to drive the search instead of the engine itself. 34 | 35 | The only part of this crate that could be applied to the fully streaming case is 36 | the streaming PikeVM implementation. However, there are some limitations: 37 | * only a single search can be run since the PikeVM may look ahead multiple bytes 38 | to disambiguate alternative matches 39 | * Prefilters longer than one byte can not work 40 | * utf-8 mode can not be supported (empty matches may occur between unicode 41 | boundaries) 42 | 43 | Currently, the PikeVM implementation is not written with this use case in mind 44 | and may call backtrack unnecessarily, but that could be addressed in the future, 45 | but especially the first point is very limiting. 
The pikevm also does not allow 46 | the user to drive the search and would block on network calls for example (no 47 | async). 48 | */ 49 | 50 | #[cfg(feature = "ropey")] 51 | pub use cursor::RopeyCursor; 52 | pub use cursor::{Cursor, IntoCursor}; 53 | pub use input::Input; 54 | pub use regex_automata; 55 | 56 | mod cursor; 57 | pub mod engines; 58 | mod input; 59 | mod literal; 60 | mod util; 61 | 62 | #[cfg(test)] 63 | mod test_rope; 64 | #[cfg(test)] 65 | mod tests; 66 | -------------------------------------------------------------------------------- /src/literal.rs: -------------------------------------------------------------------------------- 1 | pub use regex_automata::util::prefilter::Prefilter; 2 | pub use regex_automata::MatchKind; 3 | use regex_automata::Span; 4 | 5 | use crate::cursor::Cursor; 6 | use crate::Input; 7 | 8 | use FindChunkResult::*; 9 | 10 | #[cfg(test)] 11 | mod tests; 12 | 13 | pub fn find(prefilter: &Prefilter, input: &mut Input) -> Option { 14 | // TODO optimize this: 15 | // * potentially use an array vec 16 | // * specical case max_needle_len==2 (no accumulating necessary) 17 | // * specical case max_needle_len==min_needle_len (no ambiguety) 18 | if prefilter.max_needle_len() == 1 { 19 | find_1(prefilter, input) 20 | } else { 21 | find_n::(prefilter, input) 22 | } 23 | } 24 | 25 | pub fn prefix(prefilter: &Prefilter, input: &mut Input) -> Option { 26 | let mut offset = input.chunk_offset(); 27 | let chunk_pos = input.chunk_pos(); 28 | let chunk_end = input.get_chunk_end(); 29 | let mut res = if prefilter.max_needle_len() <= chunk_end - chunk_pos { 30 | prefilter 31 | .prefix(input.chunk(), Span { start: input.chunk_pos(), end: input.get_chunk_end() })? 
32 | } else { 33 | offset += chunk_pos; 34 | let mut buf = 35 | Vec::with_capacity(prefilter.max_needle_len().min(input.end() - input.start())); 36 | buf.extend_from_slice(&input.chunk()[chunk_pos..chunk_end]); 37 | while input.advance() && !buf.spare_capacity_mut().is_empty() { 38 | let mut chunk_len = input.chunk().len().min(buf.spare_capacity_mut().len()); 39 | if input.chunk_offset() + chunk_len <= input.end() { 40 | buf.extend_from_slice(&input.chunk()[..chunk_len]); 41 | } else { 42 | chunk_len = input.end() - input.chunk_offset(); 43 | buf.extend_from_slice(&input.chunk()[..chunk_len]); 44 | break; 45 | } 46 | } 47 | prefilter.prefix(&buf, Span { start: 0, end: buf.len() })? 48 | }; 49 | res.start += offset; 50 | res.end += offset; 51 | Some(res) 52 | } 53 | 54 | fn find_1(prefilter: &Prefilter, input: &mut Input) -> Option { 55 | debug_assert_eq!(prefilter.max_needle_len(), 1); 56 | let first_haystack = &input.chunk(); 57 | if let Some(mut res) = prefilter 58 | .find(first_haystack, Span { start: input.chunk_pos(), end: input.get_chunk_end() }) 59 | { 60 | res.start += input.chunk_offset(); 61 | res.end += input.chunk_offset(); 62 | return Some(res); 63 | } 64 | while input.chunk_offset() + input.chunk().len() < input.end() && input.advance() { 65 | let haystack = &input.chunk(); 66 | let Some(mut res) = prefilter.find(haystack, Span { start: 0, end: input.get_chunk_end() }) 67 | else { 68 | continue; 69 | }; 70 | 71 | res.start += input.chunk_offset(); 72 | res.end += input.chunk_offset(); 73 | return Some(res); 74 | } 75 | None 76 | } 77 | 78 | fn find_n( 79 | prefilter: &Prefilter, 80 | input: &mut Input, 81 | ) -> Option { 82 | // helper macro to make the code more readable 83 | macro_rules! 
find_chunk { 84 | ($chunk:expr, $buf_offset:expr, |$start: ident, $off: ident| $disambiguate: expr) => { 85 | match find_n_chunk::(prefilter, $chunk, $buf_offset) { 86 | FindChunkResult::Match(span) => return Some(span), 87 | FindChunkResult::AbigousMatch { $start, $off } if AMBIGUITY => { 88 | return Some($disambiguate); 89 | } 90 | _ => {} 91 | } 92 | }; 93 | } 94 | 95 | // simple case: only search in a single chunk specical casing this is nice 96 | // for performance and makes the rest of the logic simpler 97 | let first_chunk_end = input.get_chunk_end(); 98 | let mut first_chunk = input.chunk(); 99 | if first_chunk.len() != first_chunk_end { 100 | if let Some(mut res) = 101 | prefilter.find(first_chunk, Span { start: input.chunk_pos(), end: first_chunk_end }) 102 | { 103 | res.start += input.chunk_offset(); 104 | res.end += input.chunk_offset(); 105 | return Some(res); 106 | } 107 | return None; 108 | } 109 | first_chunk = &first_chunk[input.chunk_pos()..]; 110 | 111 | let max_needle_len = prefilter.max_needle_len(); 112 | let carry_over = max_needle_len - 1; 113 | let sliding_window = 2 * carry_over; 114 | 115 | // again special case the first chunk since that is the hot path 116 | // and also keeps the logic below simpler 117 | let mut buf_offset = input.chunk_offset() + input.chunk_pos(); 118 | if first_chunk.len() >= sliding_window { 119 | find_chunk!(first_chunk, input.chunk_offset() + input.chunk_pos(), |start, off| { 120 | let mut buf = Vec::with_capacity(max_needle_len); 121 | buf.extend_from_slice(&first_chunk[start..]); 122 | disambiguate_match(prefilter, input, buf, off) 123 | }); 124 | let carrry_over_start = first_chunk.len() - carry_over; 125 | first_chunk = &first_chunk[carrry_over_start..]; 126 | buf_offset += carrry_over_start; 127 | } 128 | let mut buf = Vec::with_capacity(2 * sliding_window); 129 | buf.extend_from_slice(first_chunk); 130 | 131 | while input.chunk_offset() + input.chunk().len() < input.end() && input.advance() { 132 | 
debug_assert!(buf.len() < sliding_window, "{} {sliding_window}", buf.len()); 133 | let mut chunk = &input.chunk()[..input.get_chunk_end()]; 134 | let mut chunk_offset = input.chunk_offset(); 135 | // this condition only triggers until we have filled the buffer for the first time 136 | if buf.len() < carry_over { 137 | if buf.len() + chunk.len() <= carry_over { 138 | buf.extend_from_slice(chunk); 139 | continue; 140 | } 141 | let copied = carry_over - buf.len(); 142 | buf.extend_from_slice(&chunk[..copied]); 143 | chunk = &chunk[copied..]; 144 | chunk_offset += copied; 145 | } 146 | debug_assert!(buf.len() >= carry_over, "{} {carry_over}", buf.len()); 147 | 148 | // if the chunk is too small just continue accumelating the condition 149 | // below implies chunk.len() <= sliding_window since buf.len() <= 150 | // sliding_window 151 | if buf.len() + chunk.len() <= buf.capacity() { 152 | buf.extend_from_slice(chunk); 153 | if buf.len() >= sliding_window { 154 | find_chunk!(&buf, buf_offset, |start, off| { 155 | buf.drain(..start); 156 | disambiguate_match(prefilter, input, buf, off) 157 | }); 158 | let carry_over_start = buf.len() - carry_over; 159 | buf.drain(..carry_over_start); 160 | buf_offset += carry_over_start; 161 | } 162 | continue; 163 | } 164 | 165 | buf.extend_from_slice(&chunk[..carry_over]); 166 | find_chunk!(&buf, buf_offset, |start, off| { 167 | buf.drain(..start); 168 | buf.extend_from_slice(&chunk[..max_needle_len - buf.len()]); 169 | let mut res = prefilter.prefix(&buf, Span { start: 0, end: buf.len() }).unwrap(); 170 | res.start += off; 171 | res.end += off; 172 | res 173 | }); 174 | buf.clear(); 175 | 176 | find_chunk!(chunk, chunk_offset, |start, off| { 177 | buf.extend_from_slice(&chunk[start..]); 178 | disambiguate_match(prefilter, input, buf, off) 179 | }); 180 | let carrry_over_start = chunk.len() - carry_over; 181 | buf_offset = chunk_offset + carrry_over_start; 182 | buf.extend_from_slice(&chunk[carrry_over_start..]); 183 | } 184 | 185 | if 
!buf.is_empty() { 186 | if let Some(mut res) = prefilter.find(&buf, Span { start: 0, end: buf.len() }) { 187 | res.start += buf_offset; 188 | res.end += buf_offset; 189 | return Some(res); 190 | } 191 | } 192 | None 193 | } 194 | 195 | #[must_use] 196 | enum FindChunkResult { 197 | // the prefilter found no matches in this chunk 198 | NoMatch, 199 | // the prefilter found a match at the (offset correctd) 200 | // span in this chunk 201 | Match(Span), 202 | // the prefilter found a match that could be ambigous 203 | // depending on what data follows the buffer 204 | AbigousMatch { start: usize, off: usize }, 205 | } 206 | 207 | fn disambiguate_match( 208 | prefilter: &Prefilter, 209 | input: &mut Input, 210 | mut buf: Vec, 211 | off: usize, 212 | ) -> Span { 213 | let max_needle_len = prefilter.max_needle_len(); 214 | debug_assert!(buf.len() < max_needle_len); 215 | while input.advance() { 216 | let chunk_end = input.get_chunk_end().min(max_needle_len - buf.len()); 217 | let chunk = input.chunk(); 218 | if chunk_end != chunk.len() { 219 | buf.extend_from_slice(&chunk[..chunk_end]); 220 | break; 221 | } 222 | buf.extend_from_slice(chunk); 223 | } 224 | debug_assert!(buf.len() <= max_needle_len); 225 | let mut res = prefilter.prefix(&buf, Span { start: 0, end: buf.len() }).unwrap(); 226 | res.start += off; 227 | res.end += off; 228 | res 229 | } 230 | 231 | fn find_n_chunk( 232 | prefilter: &Prefilter, 233 | buf: &[u8], 234 | off: usize, 235 | ) -> FindChunkResult { 236 | debug_assert!(buf.len() >= 2 * prefilter.max_needle_len() - 2); 237 | if let Some(mut res) = prefilter.find(buf, Span { start: 0, end: buf.len() }) { 238 | // This condition is neeed in case we find a match at the end of the 239 | // chunk. In that case there may be an even longer match once we 240 | // continue scanning. 
For example: 241 | // 242 | // pattern: "abc|a" 243 | // haystack: "xxabc" chunked into ["xxab", "c"] 244 | // matck_kind: leftmost-first 245 | // 246 | // In the first chunk we would find a match for "a" but we 247 | // should be matching "abc" instead (since that is the first 248 | // alternation). 249 | if AMBIGOUS && res.start + prefilter.max_needle_len() > buf.len() { 250 | AbigousMatch { start: res.start, off: res.start + off } 251 | } else { 252 | res.start += off; 253 | res.end += off; 254 | Match(res) 255 | } 256 | } else { 257 | NoMatch 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /src/literal/tests.rs: -------------------------------------------------------------------------------- 1 | use std::iter; 2 | 3 | use proptest::proptest; 4 | use regex_automata::util::prefilter::Prefilter; 5 | use regex_automata::Span; 6 | 7 | proptest! { 8 | #[test] 9 | fn matches(mut haystack: String, needle: String) { 10 | haystack = haystack.repeat(1024); 11 | let needles = &[needle.as_bytes()]; 12 | let Some(prefilter) = Prefilter::new(regex_automata::MatchKind::All, needles) else { 13 | return Ok(()) 14 | }; 15 | let mut span = Span{ start: 0, end: haystack.len() }; 16 | let iter1 = iter::from_fn(||{ 17 | let res = prefilter.find(haystack.as_bytes(), span)?; 18 | span.start = res.end; 19 | Some(res) 20 | }); 21 | let rope = ropey::Rope::from_str(&haystack); 22 | let mut input = crate::Input::new(&rope); 23 | let iter2= iter::from_fn(||{ 24 | let res = super::find(&prefilter, &mut input)?; 25 | input.move_to(res.end); 26 | Some(res) 27 | }); 28 | crate::util::iter::prop_assert_eq(iter1, iter2)?; 29 | } 30 | 31 | #[test] 32 | fn matches_range(mut haystack: String, needle: String) { 33 | haystack = haystack.repeat(1024); 34 | let start = haystack.len() / 3; 35 | let end = 2*start; 36 | let needles = &[needle.as_bytes()]; 37 | let Some(prefilter) = Prefilter::new(regex_automata::MatchKind::All, needles) else { 38 | 
return Ok(()) 39 | }; 40 | let mut span = Span{ start, end }; 41 | let iter1 = iter::from_fn(||{ 42 | let res = prefilter.find(haystack.as_bytes(), span)?; 43 | span.start = res.end; 44 | Some(res) 45 | }); 46 | let rope = ropey::Rope::from_str(&haystack); 47 | let mut input = crate::Input::new(&rope).range(start..end); 48 | let iter2 = iter::from_fn(||{ 49 | let res = super::find(&prefilter, &mut input)?; 50 | assert!(res.end <= end); 51 | input.move_to(res.end); 52 | Some(res) 53 | }); 54 | crate::util::iter::prop_assert_eq(iter1, iter2)?; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test_rope.rs: -------------------------------------------------------------------------------- 1 | use std::cell::Cell; 2 | use std::collections::hash_map::DefaultHasher; 3 | use std::hash::Hasher; 4 | use std::sync::atomic::{AtomicUsize, Ordering}; 5 | 6 | use regex_automata::util::escape::DebugHaystack; 7 | 8 | use crate::util::utf8; 9 | use crate::Cursor; 10 | 11 | #[derive(Debug)] 12 | struct XorShift64Star { 13 | state: Cell, 14 | } 15 | 16 | impl XorShift64Star { 17 | fn new() -> Self { 18 | // Any non-zero seed will do -- this uses the hash of a global counter. 19 | let mut seed = 0; 20 | while seed == 0 { 21 | let mut hasher = DefaultHasher::new(); 22 | static COUNTER: AtomicUsize = AtomicUsize::new(0); 23 | hasher.write_usize(COUNTER.fetch_add(1, Ordering::Relaxed)); 24 | seed = hasher.finish(); 25 | } 26 | 27 | XorShift64Star { state: Cell::new(seed) } 28 | } 29 | 30 | fn next(&self) -> u64 { 31 | let mut x = self.state.get(); 32 | debug_assert_ne!(x, 0); 33 | x ^= x >> 12; 34 | x ^= x << 25; 35 | x ^= x >> 27; 36 | self.state.set(x); 37 | x.wrapping_mul(0x2545_f491_4f6c_dd1d) 38 | } 39 | 40 | /// Return a value from `0..n`. 
41 | fn next_usize(&self, n: usize) -> usize { 42 | (self.next() % n as u64) as usize 43 | } 44 | } 45 | 46 | #[derive(Debug)] 47 | pub(crate) struct RandomSlices<'a> { 48 | haystack: &'a [u8], 49 | pos: usize, 50 | size: usize, 51 | ran: XorShift64Star, 52 | } 53 | 54 | impl<'a> RandomSlices<'a> { 55 | pub fn new(haystack: &'a [u8]) -> Self { 56 | let mut res = RandomSlices { haystack, pos: 0, size: 0, ran: XorShift64Star::new() }; 57 | res.advance(); 58 | res 59 | } 60 | } 61 | 62 | impl Cursor for RandomSlices<'_> { 63 | fn chunk(&self) -> &[u8] { 64 | debug_assert_eq!(self.haystack.is_empty(), self.size == 0); 65 | &self.haystack[self.pos..self.pos + self.size] 66 | } 67 | 68 | fn utf8_aware(&self) -> bool { 69 | true 70 | } 71 | 72 | fn advance(&mut self) -> bool { 73 | if self.pos + self.size == self.haystack.len() { 74 | return false; 75 | } 76 | let new_start = self.pos + self.size; 77 | let mut tries = u16::MAX; 78 | loop { 79 | let next_size = self.ran.next_usize(250) + 1; 80 | let new_end = (new_start + next_size).min(self.haystack.len()); 81 | if utf8::is_boundary(self.haystack, new_end) { 82 | self.pos = new_start; 83 | self.size = new_end - new_start; 84 | break; 85 | } 86 | if tries == 0 { 87 | panic!("faild to advance at {} {:?}", self.pos, DebugHaystack(self.haystack)) 88 | } 89 | tries -= 1; 90 | } 91 | true 92 | } 93 | 94 | fn backtrack(&mut self) -> bool { 95 | if self.pos == 0 { 96 | return false; 97 | } 98 | let mut tries = u16::MAX; 99 | let new_end = self.pos; 100 | loop { 101 | let next_size = self.ran.next_usize(250) + 1; 102 | let new_start = new_end.saturating_sub(next_size); 103 | if utf8::is_boundary(self.haystack, new_start) { 104 | self.pos = new_start; 105 | self.size = new_end - new_start; 106 | break; 107 | } 108 | if tries == 0 { 109 | panic!("faild to backtrack at {} {:?}", self.pos, DebugHaystack(self.haystack)) 110 | } 111 | tries -= 1; 112 | } 113 | true 114 | } 115 | 116 | fn total_bytes(&self) -> Option { 117 | 
Some(self.haystack.len()) 118 | } 119 | 120 | fn offset(&self) -> usize { 121 | self.pos 122 | } 123 | } 124 | 125 | #[derive(Debug)] 126 | pub(crate) struct SingleByteChunks<'a> { 127 | haystack: &'a [u8], 128 | pos: usize, 129 | end: usize, 130 | } 131 | 132 | impl<'a> SingleByteChunks<'a> { 133 | pub fn new(haystack: &'a [u8]) -> Self { 134 | Self { 135 | haystack, 136 | pos: 0, 137 | end: (1..haystack.len()) 138 | .find(|&i| utf8::is_boundary(haystack, i)) 139 | .unwrap_or(haystack.len()), 140 | } 141 | } 142 | } 143 | 144 | impl Cursor for SingleByteChunks<'_> { 145 | fn chunk(&self) -> &[u8] { 146 | debug_assert!(utf8::is_boundary(self.haystack, self.pos) || self.pos == 0); 147 | debug_assert!(utf8::is_boundary(self.haystack, self.end) || self.end == 0); 148 | &self.haystack[self.pos..self.end] 149 | } 150 | 151 | fn utf8_aware(&self) -> bool { 152 | true 153 | } 154 | 155 | fn advance(&mut self) -> bool { 156 | if self.end < self.haystack.len() { 157 | self.pos = self.end; 158 | self.end = (self.end + 1..self.haystack.len()) 159 | .find(|&i| utf8::is_boundary(self.haystack, i)) 160 | .unwrap_or(self.haystack.len()); 161 | true 162 | } else { 163 | false 164 | } 165 | } 166 | 167 | fn backtrack(&mut self) -> bool { 168 | if self.pos != 0 { 169 | self.end = self.pos; 170 | self.pos = 171 | (0..self.pos).rev().find(|&i| utf8::is_boundary(self.haystack, i)).unwrap_or(0); 172 | true 173 | } else { 174 | false 175 | } 176 | } 177 | 178 | fn total_bytes(&self) -> Option { 179 | Some(self.haystack.len()) 180 | } 181 | 182 | fn offset(&self) -> usize { 183 | self.pos 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/tests.rs: -------------------------------------------------------------------------------- 1 | use crate::{test_rope::SingleByteChunks, Input}; 2 | 3 | use { 4 | crate::engines::meta::{self, Regex}, 5 | anyhow::Result, 6 | regex_automata::util::syntax, 7 | regex_automata::MatchKind, 8 | 
regex_test::{CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner}, 9 | }; 10 | 11 | fn suite() -> anyhow::Result { 12 | let mut tests = regex_test::RegexTests::new(); 13 | macro_rules! load { 14 | ($name:expr) => {{ 15 | const DATA: &[u8] = include_bytes!(concat!("../../regex/testdata/", $name, ".toml")); 16 | tests.load_slice($name, DATA)?; 17 | }}; 18 | } 19 | 20 | load!("anchored"); 21 | load!("bytes"); 22 | load!("crazy"); 23 | load!("crlf"); 24 | load!("earliest"); 25 | load!("empty"); 26 | load!("expensive"); 27 | load!("flags"); 28 | load!("iter"); 29 | load!("leftmost-all"); 30 | load!("line-terminator"); 31 | load!("misc"); 32 | load!("multiline"); 33 | load!("no-unicode"); 34 | load!("overlapping"); 35 | load!("regression"); 36 | load!("set"); 37 | load!("substring"); 38 | load!("unicode"); 39 | load!("utf8"); 40 | load!("word-boundary"); 41 | load!("word-boundary-special"); 42 | load!("fowler/basic"); 43 | load!("fowler/nullsubexpr"); 44 | load!("fowler/repetition"); 45 | 46 | Ok(tests) 47 | } 48 | 49 | /// Configure a regex_automata::Input with the given test configuration. 50 | fn create_input(test: ®ex_test::RegexTest) -> crate::Input { 51 | use regex_automata::Anchored; 52 | 53 | let bounds = test.bounds(); 54 | let anchored = if test.anchored() { Anchored::Yes } else { Anchored::No }; 55 | let mut input = crate::Input::new(crate::test_rope::SingleByteChunks::new(test.haystack())) 56 | .range(bounds.start..bounds.end); 57 | input.anchored(anchored); 58 | input 59 | } 60 | 61 | /// Convert capture matches into the test suite's capture values. 62 | /// 63 | /// The given captures must represent a valid match, where the first capturing 64 | /// group has a non-None span. Otherwise this panics. 
65 | fn testify_captures(caps: ®ex_automata::util::captures::Captures) -> regex_test::Captures { 66 | assert!(caps.is_match(), "expected captures to represent a match"); 67 | let spans = 68 | caps.iter().map(|group| group.map(|m| regex_test::Span { start: m.start, end: m.end })); 69 | // These unwraps are OK because we assume our 'caps' represents a match, 70 | // and a match always gives a non-zero number of groups with the first 71 | // group being non-None. 72 | regex_test::Captures::new(caps.pattern().unwrap().as_usize(), spans).unwrap() 73 | } 74 | 75 | const BLACKLIST: &[&str] = &[ 76 | // These 'earliest' tests are blacklisted because the meta searcher doesn't 77 | // give the same offsets that the test expects. This is legal because the 78 | // 'earliest' routines don't guarantee a particular match offset other 79 | // than "the earliest the regex engine can report a match." Some regex 80 | // engines will quit earlier than others. The backtracker, for example, 81 | // can't really quit before finding the full leftmost-first match. Many of 82 | // the literal searchers also don't have the ability to quit fully or it's 83 | // otherwise not worth doing. (A literal searcher not quitting as early as 84 | // possible usually means looking at a few more bytes. That's no biggie.) 85 | "earliest/", 86 | ]; 87 | 88 | const RUNS: usize = 1; 89 | /// Tests the default configuration of the meta regex engine. 
90 | #[test] 91 | fn default() -> Result<()> { 92 | let builder = Regex::builder(); 93 | let mut runner = TestRunner::new()?; 94 | runner 95 | .expand(&["is_match", "find", "captures"], |test| test.compiles()) 96 | .blacklist_iter(BLACKLIST); 97 | for _ in 0..RUNS { 98 | runner.test_iter(suite()?.iter(), compiler(builder.clone())); 99 | } 100 | runner.assert(); 101 | Ok(()) 102 | } 103 | 104 | #[cfg(feature = "ropey")] 105 | #[test] 106 | fn rope_one_past_end() -> Result<()> { 107 | use crate::RopeyCursor; 108 | 109 | let builder = Regex::builder() 110 | .syntax(syntax::Config::new().case_insensitive(true).multi_line(true)) 111 | .build("git nix"); 112 | let rope = ropey::Rope::from_str("x"); 113 | builder.unwrap().find(Input::new(RopeyCursor::at(rope.slice(..), 1)).range(1..)); 114 | Ok(()) 115 | } 116 | 117 | #[test] 118 | fn prefix() -> Result<()> { 119 | let regex = Regex::builder().build("^foo$").unwrap(); 120 | let rope = ropey::Rope::from_str("xfoox"); 121 | let mut input = Input::new(rope.slice(..)); 122 | input.slice(1..4); 123 | let mat1 = regex.find(input).unwrap(); 124 | assert_eq!(mat1.start(), 1); 125 | assert_eq!(mat1.end(), 4); 126 | let rope = SingleByteChunks::new(b"xfoox"); 127 | let mut input = Input::new(rope); 128 | input.slice(1..4); 129 | let mat1 = regex.find(input).unwrap(); 130 | assert_eq!(mat1.start(), 1); 131 | assert_eq!(mat1.end(), 4); 132 | Ok(()) 133 | } 134 | 135 | /// Tests the default configuration minus the full DFA. 
136 | #[test] 137 | fn no_dfa() -> Result<()> { 138 | let mut builder = Regex::builder(); 139 | builder.configure(Regex::config().dfa(false)); 140 | let mut runner = TestRunner::new()?; 141 | runner 142 | .expand(&["is_match", "find", "captures"], |test| test.compiles()) 143 | .blacklist_iter(BLACKLIST); 144 | for _ in 0..RUNS { 145 | runner.test_iter(suite()?.iter(), compiler(builder.clone())); 146 | } 147 | runner.assert(); 148 | Ok(()) 149 | } 150 | 151 | /// Tests the default configuration minus the full DFA and lazy DFA. 152 | #[test] 153 | fn no_dfa_hybrid() -> Result<()> { 154 | let mut builder = Regex::builder(); 155 | builder.configure(Regex::config().dfa(false).hybrid(false)); 156 | let mut runner = TestRunner::new()?; 157 | runner 158 | .expand(&["is_match", "find", "captures"], |test| test.compiles()) 159 | .blacklist_iter(BLACKLIST); 160 | for _ in 0..RUNS { 161 | runner.test_iter(suite()?.iter(), compiler(builder.clone())); 162 | } 163 | runner.assert(); 164 | Ok(()) 165 | } 166 | 167 | fn compiler( 168 | mut builder: meta::Builder, 169 | ) -> impl FnMut(&RegexTest, &[String]) -> Result { 170 | move |test, regexes| { 171 | if !configure_meta_builder(test, &mut builder) { 172 | return Ok(CompiledRegex::skip()); 173 | } 174 | // println!("{} {builder:?}", test.full_name()); 175 | let re = builder.build_many(regexes)?; 176 | Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, test) })) 177 | } 178 | } 179 | 180 | fn run_test(re: &Regex, test: &RegexTest) -> TestResult { 181 | let mut input = create_input(test); 182 | match test.additional_name() { 183 | "is_match" => TestResult::matched(re.is_match(input)), 184 | "find" => match test.search_kind() { 185 | SearchKind::Earliest => { 186 | input.earliest(true); 187 | TestResult::matches( 188 | re.find_iter(input).take(test.match_limit().unwrap_or(std::usize::MAX)).map( 189 | |m| Match { 190 | id: m.pattern().as_usize(), 191 | span: Span { start: m.start(), end: m.end() }, 192 | }, 193 | ), 
194 | ) 195 | } 196 | SearchKind::Leftmost => TestResult::matches( 197 | re.find_iter(input).take(test.match_limit().unwrap_or(std::usize::MAX)).map(|m| { 198 | Match { 199 | id: m.pattern().as_usize(), 200 | span: Span { start: m.start(), end: m.end() }, 201 | } 202 | }), 203 | ), 204 | SearchKind::Overlapping => TestResult::skip(), 205 | }, 206 | "captures" => match test.search_kind() { 207 | SearchKind::Earliest => { 208 | input.earliest(true); 209 | let it = re 210 | .captures_iter(input) 211 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 212 | .map(|caps| testify_captures(&caps)); 213 | TestResult::captures(it) 214 | } 215 | SearchKind::Leftmost => { 216 | let it = re 217 | .captures_iter(input) 218 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 219 | .map(|caps| testify_captures(&caps)); 220 | TestResult::captures(it) 221 | } 222 | SearchKind::Overlapping => { 223 | // There is no overlapping regex API that supports captures. 224 | TestResult::skip() 225 | } 226 | }, 227 | name => TestResult::fail(&format!("unrecognized test name: {}", name)), 228 | } 229 | } 230 | 231 | /// Configures the given regex builder with all relevant settings on the given 232 | /// regex test. 233 | /// 234 | /// If the regex test has a setting that is unsupported, then this returns 235 | /// false (implying the test should be skipped). 236 | fn configure_meta_builder(test: &RegexTest, builder: &mut meta::Builder) -> bool { 237 | let match_kind = match test.match_kind() { 238 | regex_test::MatchKind::All => MatchKind::All, 239 | regex_test::MatchKind::LeftmostFirst => MatchKind::LeftmostFirst, 240 | regex_test::MatchKind::LeftmostLongest => return false, 241 | }; 242 | let meta_config = Regex::config() 243 | .match_kind(match_kind) 244 | .utf8_empty(test.utf8()) 245 | .line_terminator(test.line_terminator()); 246 | builder.configure(meta_config).syntax(config_syntax(test)); 247 | true 248 | } 249 | 250 | /// Configuration of the regex parser from a regex test. 
251 | fn config_syntax(test: &RegexTest) -> syntax::Config { 252 | syntax::Config::new() 253 | .case_insensitive(test.case_insensitive()) 254 | .unicode(test.unicode()) 255 | .utf8(test.utf8()) 256 | .line_terminator(test.line_terminator()) 257 | } 258 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod empty; 2 | pub mod iter; 3 | pub mod prefilter; 4 | pub mod primitives; 5 | pub mod sparse_set; 6 | pub mod utf8; 7 | 8 | // #[cfg(test)] 9 | // mod tests; 10 | -------------------------------------------------------------------------------- /src/util/empty.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | This module provides helper routines for dealing with zero-width matches. 3 | 4 | The main problem being solved here is this: 5 | 6 | 1. The caller wants to search something that they know is valid UTF-8, such 7 | as a Rust `&str`. 8 | 2. The regex used by the caller can match the empty string. For example, `a*`. 9 | 3. The caller should never get match offsets returned that occur within the 10 | encoding of a UTF-8 codepoint. It is logically incorrect, and also means that, 11 | e.g., slicing the `&str` at those offsets will lead to a panic. 12 | 13 | So the question here is, how do we prevent the caller from getting match 14 | offsets that split a codepoint? For example, strictly speaking, the regex `a*` 15 | matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since 16 | the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that 17 | underlies all of the matching engines in this crate doesn't have anything in 18 | its state graph that prevents matching between UTF-8 code units. Indeed, any 19 | engine derived from the `NFA` will match at those positions by virtue of the 20 | fact that the `NFA` is byte oriented. 
That is, its transitions are defined over 21 | bytes and the matching engines work by proceeding one byte at a time. 22 | 23 | (An alternative architecture would be to define the transitions in an `NFA` 24 | over codepoints, or `char`. And then make the matching engines proceed by 25 | decoding one codepoint at a time. This is a viable strategy, but it doesn't 26 | work for DFA matching engines because designing a fast and memory efficient 27 | transition table for an alphabet as large as Unicode is quite difficult. More 28 | to the point, the top-level `regex` crate supports matching on arbitrary bytes 29 | when Unicode mode is disabled and one is searching a `&[u8]`. So in that case, 30 | you can't just limit yourself to decoding codepoints and matching those. You 31 | really do need to be able to follow byte oriented transitions on the `NFA`.) 32 | 33 | In an older version of the regex crate, we handled this case not in the regex 34 | engine, but in the iterators over matches. Namely, since this case only arises 35 | when the match is empty, we "just" incremented the next starting position 36 | of the search by `N`, where `N` is the length of the codepoint encoded at 37 | the current position. The alternative or more "natural" solution of just 38 | incrementing by `1` would result in executing a search of `a*` on `☃` like 39 | this: 40 | 41 | * Start search at `0`. 42 | * Found match at `[0, 0]`. 43 | * Next start position is `0`. 44 | * To avoid an infinite loop, since it's an empty match, increment by `1`. 45 | * Start search at `1`. 46 | * Found match at `[1, 1]`. Oops. 47 | 48 | But if we instead incremented by `3` (the length in bytes of `☃`), then we get 49 | the following: 50 | 51 | * Start search at `0`. 52 | * Found match at `[0, 0]`. 53 | * Next start position is `0`. 54 | * To avoid an infinite loop, since it's an empty match, increment by `3`. 55 | * Start search at `3`. 56 | * Found match at `[3, 3]`. 57 | 58 | And we get the correct result. 
But does this technique work in all cases? 59 | Crucially, it requires that a zero-width match that splits a codepoint never 60 | occurs beyond the starting position of the search. Because if it did, merely 61 | incrementing the start position by the number of bytes in the codepoint at 62 | the current position wouldn't be enough. A zero-width match could just occur 63 | anywhere. It turns out that it is _almost_ true. We can convince ourselves by 64 | looking at all possible patterns that can match the empty string: 65 | 66 | * Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match 67 | the empty string. That is, assuming there isn't an `a` at the current position, 68 | they will all match the empty string at the start of a search. There is no way 69 | to move past it because any other match would not be "leftmost." 70 | * `^` only matches at the beginning of the haystack, where the start position 71 | is `0`. Since we know we're searching valid UTF-8 (if it isn't valid UTF-8, 72 | then this entire problem goes away because it implies your string type supports 73 | invalid UTF-8 and thus must deal with offsets that not only split a codepoint 74 | but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches 75 | between the code units of a codepoint because the start of a valid UTF-8 string 76 | is never within the encoding of a codepoint. 77 | * `$` basically the same logic as `^`, but for the end of a string. A valid 78 | UTF-8 string can't have an incomplete codepoint at the end of it. 79 | * `(?m:^)` follows similarly to `^`, but it can match immediately following 80 | a `\n`. However, since a `\n` is always a codepoint itself and can never 81 | appear within a codepoint, it follows that the position immediately following 82 | a `\n` in a string that is valid UTF-8 is guaranteed to not be between the 83 | code units of another codepoint. 
(One caveat here is that the line terminator 84 | for multi-line anchors can now be changed to any arbitrary byte, including 85 | things like `\x98` which might occur within a codepoint. However, this wasn't 86 | supported by the old regex crate. If it was, it pose the same problems as 87 | `(?-u:\B)`, as we'll discuss below.) 88 | * `(?m:$)` a similar argument as for `(?m:^)`. The only difference is that a 89 | `(?m:$)` matches just before a `\n`. But the same argument applies. 90 | * `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the 91 | CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`. 92 | Namely, since they only ever match at a boundary where one side is either a 93 | `\r` or a `\n`, neither of which can occur within a codepoint. 94 | * `\b` only matches at positions where both sides are valid codepoints, so 95 | this cannot split a codepoint. 96 | * `\B`, like `\b`, also only matches at positions where both sides are valid 97 | codepoints. So this cannot split a codepoint either. 98 | * `(?-u:\b)` matches only at positions where at least one side of it is an ASCII 99 | word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints 100 | (one of the many amazing qualities of UTF-8), it follows that this too cannot 101 | split a codepoint. 102 | * `(?-u:\B)` finally represents a problem. It can matches between *any* two 103 | bytes that are either both word bytes or non-word bytes. Since code units like 104 | `\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes, 105 | `(?-u:\B)` will match at the position between them. 106 | 107 | Thus, our approach of incrementing one codepoint at a time after seeing an 108 | empty match is flawed because `(?-u:\B)` can result in an empty match that 109 | splits a codepoint at a position past the starting point of a search. 
For 110 | example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2, 111 | 2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because 112 | they correspond to word boundaries since `a` is an ASCII word byte. 113 | 114 | So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from 115 | regexes that could match `&str`. That might sound extreme, but a lot of other 116 | things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and 117 | `(?-u:\W)` can match invalid UTF-8 too, including individual code units within a 118 | codepoint. The key difference is that those expressions could never produce an 119 | empty match. That ban happens when translating an `Ast` to an `Hir`, because 120 | that process can reason about whether an `Hir` can produce *non-empty* matches 121 | at invalid UTF-8 boundaries. Bottom line though is that we side-stepped the 122 | `(?-u:\B)` issue by banning it. 123 | 124 | If banning `(?-u:\B)` were the only issue with the old regex crate's approach, 125 | then I probably would have kept it. `\B` is rarely used, so it's not such a big 126 | deal to have to work-around it. However, the problem with the above approach 127 | is that it doesn't compose. The logic for avoiding splitting a codepoint only 128 | lived in the iterator, which means if anyone wants to implement their own 129 | iterator over regex matches, they have to deal with this extremely subtle edge 130 | case to get full correctness. 131 | 132 | Instead, in this crate, we take the approach of pushing this complexity down 133 | to the lowest layers of each regex engine. The approach is pretty simple: 134 | 135 | * If this corner case doesn't apply, don't do anything. (For example, if UTF-8 136 | mode isn't enabled or if the regex cannot match the empty string.) 137 | * If an empty match is reported, explicitly check if it splits a codepoint. 138 | * If it doesn't, we're done, return the match. 
139 | * If it does, then ignore the match and re-run the search. 140 | * Repeat the above process until the end of the haystack is reached or a match 141 | is found that doesn't split a codepoint or isn't zero width. 142 | 143 | And that's pretty much what this module provides. Every regex engine uses these 144 | methods in their lowest level public APIs, but just above the layer where 145 | their internal engine is used. That way, all regex engines can be arbitrarily 146 | composed without worrying about handling this case, and iterators don't need to 147 | handle it explicitly. 148 | 149 | (It turns out that a new feature I added, support for changing the line 150 | terminator in a regex to any arbitrary byte, also provokes the above problem. 151 | Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that 152 | support would need to be limited or banned when UTF-8 mode is enabled, just 153 | like we did for `(?-u:\B)`. But thankfully our more robust approach in this 154 | crate handles that case just fine too.) 
155 | */ 156 | 157 | use regex_automata::MatchError; 158 | 159 | use crate::cursor::Cursor; 160 | use crate::input::Input; 161 | 162 | #[cold] 163 | #[inline(never)] 164 | pub(crate) fn skip_splits_fwd( 165 | input: &mut Input, 166 | init_value: T, 167 | match_offset: usize, 168 | find: F, 169 | ) -> Result, MatchError> 170 | where 171 | F: FnMut(&mut Input) -> Result, MatchError>, 172 | { 173 | skip_splits(true, input, match_offset, init_value, find) 174 | } 175 | 176 | #[cold] 177 | #[inline(never)] 178 | pub(crate) fn skip_splits_rev( 179 | input: &mut Input, 180 | init_value: T, 181 | match_offset: usize, 182 | find: F, 183 | ) -> Result, MatchError> 184 | where 185 | F: FnMut(&mut Input) -> Result, MatchError>, 186 | { 187 | skip_splits(false, input, match_offset, init_value, find) 188 | } 189 | 190 | fn skip_splits( 191 | forward: bool, 192 | input: &mut Input, 193 | match_offset: usize, 194 | init_value: T, 195 | mut find: F, 196 | ) -> Result, MatchError> 197 | where 198 | F: FnMut(&mut Input) -> Result, MatchError>, 199 | { 200 | input.move_to(match_offset); 201 | // If our config says to do an anchored search, then we're definitely 202 | // done. We just need to determine whether we have a valid match or 203 | // not. If we don't, then we're not allowed to continue, so we report 204 | // no match. 205 | // 206 | // This is actually quite a subtle correctness thing. The key here is 207 | // that if we got an empty match that splits a codepoint after doing an 208 | // anchored search in UTF-8 mode, then that implies that we must have 209 | // *started* the search at a location that splits a codepoint. This 210 | // follows from the fact that if a match is reported from an anchored 211 | // search, then the start offset of the match *must* match the start 212 | // offset of the search. 213 | // 214 | // It also follows that no other non-empty match is possible. 
For 215 | // example, you might write a regex like '(?:)|SOMETHING' and start its 216 | // search in the middle of a codepoint. The first branch is an empty 217 | // regex that will bubble up a match at the first position, and then 218 | // get rejected here and report no match. But what if 'SOMETHING' could 219 | // have matched? We reason that such a thing is impossible, because 220 | // if it does, it must report a match that starts in the middle of a 221 | // codepoint. This in turn implies that a match is reported whose span 222 | // does not correspond to valid UTF-8, and this breaks the promise 223 | // made when UTF-8 mode is enabled. (That promise *can* be broken, for 224 | // example, by enabling UTF-8 mode but building an NFA by hand that 225 | // produces non-empty matches that span invalid UTF-8. This is an unchecked 226 | // but documented precondition violation of UTF-8 mode, and is documented 227 | // to have unspecified behavior.) 228 | // 229 | // I believe this actually means that if an anchored search is run, and 230 | // UTF-8 mode is enabled and the start position splits a codepoint, 231 | // then it is correct to immediately report no match without even 232 | // executing the regex engine. But it doesn't really seem worth writing 233 | // out that case in every regex engine to save a tiny bit of work in an 234 | // extremely pathological case, so we just handle it here. 235 | if input.get_anchored().is_anchored() { 236 | return Ok(input.is_char_boundary().then_some(init_value)); 237 | } 238 | // Otherwise, we have an unanchored search, so just keep looking for 239 | // matches until we have one that does not split a codepoint or we hit 240 | // EOI. 
241 | let mut value = init_value; 242 | while !input.is_char_boundary() { 243 | if forward { 244 | // The unwrap is OK here because overflowing usize while 245 | // iterating over a slice is impossible, as it would require 246 | // a slice of length greater than isize::MAX, which is itself 247 | // impossible. 248 | input.set_start(input.start().checked_add(1).unwrap()); 249 | } else { 250 | input.set_end(match input.end().checked_sub(1) { 251 | None => return Ok(None), 252 | Some(end) => end, 253 | }); 254 | } 255 | match find(input)? { 256 | None => return Ok(None), 257 | Some((new_value, new_match_end)) => { 258 | value = new_value; 259 | input.move_to(new_match_end) 260 | } 261 | } 262 | } 263 | Ok(Some(value)) 264 | } 265 | -------------------------------------------------------------------------------- /src/util/iter.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Generic helpers for iteration of matches from a regex engine in a haystack. 3 | 4 | The principal type in this module is a [`Searcher`]. A `Searcher` provides 5 | its own lower level iterator-like API in addition to methods for constructing 6 | types that implement `Iterator`. The documentation for `Searcher` explains a 7 | bit more about why these different APIs exist. 8 | 9 | Currently, this module supports iteration over any regex engine that works 10 | with the [`HalfMatch`], [`Match`] or [`Captures`] types. 11 | */ 12 | 13 | use std::fmt::Debug; 14 | 15 | use regex_automata::{HalfMatch, Match, MatchError}; 16 | 17 | use crate::cursor::Cursor; 18 | use crate::input::Input; 19 | 20 | /// A searcher for creating iterators and performing lower level iteration. 21 | /// 22 | /// This searcher encapsulates the logic required for finding all successive 23 | /// non-overlapping matches in a haystack. In theory, iteration would look 24 | /// something like this: 25 | /// 26 | /// 1. Setting the start position to `0`. 27 | /// 2. Execute a regex search. 
If no match, end iteration. 28 | /// 3. Report the match and set the start position to the end of the match. 29 | /// 4. Go back to (2). 30 | /// 31 | /// And if this were indeed the case, it's likely that `Searcher` wouldn't 32 | /// exist. Unfortunately, because a regex may match the empty string, the above 33 | /// logic won't work for all possible regexes. Namely, if an empty match is 34 | /// found, then step (3) would set the start position of the search to the 35 | /// position it was at. Thus, iteration would never end. 36 | /// 37 | /// Instead, a `Searcher` knows how to detect these cases and forcefully 38 | /// advance iteration in the case of an empty match that overlaps with a 39 | /// previous match. 40 | /// 41 | /// If you know that your regex cannot match any empty string, then the simple 42 | /// algorithm described above will work correctly. 43 | /// 44 | /// When possible, prefer the iterators defined on the regex engine you're 45 | /// using. This tries to abstract over the regex engine and is thus a bit more 46 | /// unwieldy to use. 47 | /// 48 | /// In particular, a `Searcher` is not itself an iterator. Instead, it provides 49 | /// `advance` routines that permit moving the search along explicitly. It also 50 | /// provides various routines, like [`Searcher::into_matches_iter`], that 51 | /// accept a closure (representing how a regex engine executes a search) and 52 | /// returns a conventional iterator. 53 | /// 54 | /// The lifetime parameters come from the [`Input`] type passed to 55 | /// [`Searcher::new`]: 56 | /// 57 | /// * `'h` is the lifetime of the underlying haystack. 58 | /// 59 | /// # Searcher vs Iterator 60 | /// 61 | /// Why does a search type with "advance" APIs exist at all when we also have 62 | /// iterators? Unfortunately, the reasoning behind this split is a complex 63 | /// combination of the following things: 64 | /// 65 | /// 1. 
While many of the regex engines expose their own iterators, it is also 66 | /// nice to expose this lower level iteration helper because it permits callers 67 | /// to provide their own `Input` configuration. Moreover, a `Searcher` can work 68 | /// with _any_ regex engine instead of only the ones defined in this crate. 69 | /// This way, everyone benefits from a shared iteration implementation. 70 | /// 2. There are many different regex engines that, while they have the same 71 | /// match semantics, they have slightly different APIs. Iteration is just 72 | /// complex enough to want to share code, and so we need a way of abstracting 73 | /// over those different regex engines. While we could define a new trait that 74 | /// describes any regex engine search API, it would wind up looking very close 75 | /// to a closure. While there may still be reasons for the more generic trait 76 | /// to exist, for now and for the purposes of iteration, we use a closure. 77 | /// Closures also provide a lot of easy flexibility at the call site, in that 78 | /// they permit the caller to borrow any kind of state they want for use during 79 | /// each search call. 80 | /// 3. As a result of using closures, and because closures are anonymous types 81 | /// that cannot be named, it is difficult to encapsulate them without both 82 | /// costs to speed and added complexity to the public API. For example, in 83 | /// defining an iterator type like 84 | /// [`dfa::regex::FindMatches`](crate::dfa::regex::FindMatches), 85 | /// if we use a closure internally, it's not possible to name this type in the 86 | /// return type of the iterator constructor. Thus, the only way around it is 87 | /// to erase the type by boxing it and turning it into a `Box`. 88 | /// This boxed closure is unlikely to be inlined _and_ it infects the public 89 | /// API in subtle ways. 
Namely, unless you declare the closure as implementing 90 | /// `Send` and `Sync`, then the resulting iterator type won't implement it 91 | /// either. But there are practical issues with requiring the closure to 92 | /// implement `Send` and `Sync` that result in other API complexities that 93 | /// are beyond the scope of this already long exposition. 94 | /// 4. Some regex engines expose more complex match information than just 95 | /// "which pattern matched" and "at what offsets." For example, the PikeVM 96 | /// exposes match spans for each capturing group that participated in the 97 | /// match. In such cases, it can be quite beneficial to reuse the capturing 98 | /// group allocation on subsequent searches. A proper iterator doesn't permit 99 | /// this API due to its interface, so it's useful to have something a bit lower 100 | /// level that permits callers to amortize allocations while also reusing a 101 | /// shared implementation of iteration. (See the documentation for 102 | /// [`Searcher::advance`] for an example of using the "advance" API with the 103 | /// PikeVM.) 104 | /// 105 | /// What this boils down to is that there are "advance" APIs which require 106 | /// handing a closure to it for every call, and there are also APIs to create 107 | /// iterators from a closure. The former are useful for _implementing_ 108 | /// iterators or when you need more flexibility, while the latter are useful 109 | /// for conveniently writing custom iterators on-the-fly. 110 | /// 111 | /// # Example: iterating with captures 112 | /// 113 | /// Several regex engines in this crate offer convenient iterator APIs over 114 | /// [`Captures`] values. To do so, this requires allocating a new `Captures` 115 | /// value for each iteration step. This can perhaps be more costly than you 116 | /// might want. 
Instead of implementing your own iterator to avoid that 117 | /// cost (which can be a little subtle if you want to handle empty matches 118 | /// correctly), you can use this `Searcher` to do it for you: 119 | /// 120 | /// ``` 121 | /// use regex_automata::{ 122 | /// nfa::thompson::pikevm::PikeVM, 123 | /// util::iter::Searcher, 124 | /// Input, Span, 125 | /// }; 126 | /// 127 | /// let re = PikeVM::new("foo(?P[0-9]+)")?; 128 | /// let haystack = "foo1 foo12 foo123"; 129 | /// 130 | /// let mut caps = re.create_captures(); 131 | /// let mut cache = re.create_cache(); 132 | /// let mut matches = vec![]; 133 | /// let mut searcher = Searcher::new(Input::new(haystack)); 134 | /// while let Some(_) = searcher.advance(|input| { 135 | /// re.search(&mut cache, input, &mut caps); 136 | /// Ok(caps.get_match()) 137 | /// }) { 138 | /// // The unwrap is OK since 'numbers' matches if the pattern matches. 139 | /// matches.push(caps.get_group_by_name("numbers").unwrap()); 140 | /// } 141 | /// assert_eq!(matches, vec![ 142 | /// Span::from(3..4), 143 | /// Span::from(8..10), 144 | /// Span::from(14..17), 145 | /// ]); 146 | /// 147 | /// # Ok::<(), Box>(()) 148 | /// ``` 149 | pub struct Searcher { 150 | /// The input parameters to give to each regex engine call. 151 | /// 152 | /// The start position of the search is mutated during iteration. 153 | input: Input, 154 | /// Records the end offset of the most recent match. This is necessary to 155 | /// handle a corner case for preventing empty matches from overlapping with 156 | /// the ending bounds of a prior match. 157 | last_match_end: Option, 158 | } 159 | 160 | impl Searcher { 161 | /// Create a new fallible non-overlapping matches iterator. 162 | /// 163 | /// The given `input` provides the parameters (including the haystack), 164 | /// while the `finder` represents a closure that calls the underlying regex 165 | /// engine. 
The closure may borrow any additional state that is needed, 166 | /// such as a prefilter scanner. 167 | pub fn new(input: Input) -> Searcher { 168 | Searcher { input, last_match_end: None } 169 | } 170 | 171 | /// Returns the current `Input` used by this searcher. 172 | /// 173 | /// The `Input` returned is generally equivalent to the one given to 174 | /// [`Searcher::new`], but its start position may be different to reflect 175 | /// the start of the next search to be executed. 176 | pub fn input(&mut self) -> &mut Input { 177 | &mut self.input 178 | } 179 | 180 | // /// Return the next half match for an infallible search if one exists, and 181 | // /// advance to the next position. 182 | // /// 183 | // /// This is like `try_advance_half`, except errors are converted into 184 | // /// panics. 185 | // /// 186 | // /// # Panics 187 | // /// 188 | // /// If the given closure returns an error, then this panics. This is useful 189 | // /// when you know your underlying regex engine has been configured to not 190 | // /// return an error. 191 | // /// 192 | // /// # Example 193 | // /// 194 | // /// This example shows how to use a `Searcher` to iterate over all matches 195 | // /// when using a DFA, which only provides "half" matches. 
196 | // /// 197 | // /// ``` 198 | // /// use regex_automata::{ 199 | // /// hybrid::dfa::DFA, 200 | // /// util::iter::Searcher, 201 | // /// HalfMatch, Input, 202 | // /// }; 203 | // /// 204 | // /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 205 | // /// let mut cache = re.create_cache(); 206 | // /// 207 | // /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); 208 | // /// let mut it = Searcher::new(input); 209 | // /// 210 | // /// let expected = Some(HalfMatch::must(0, 10)); 211 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 212 | // /// assert_eq!(expected, got); 213 | // /// 214 | // /// let expected = Some(HalfMatch::must(0, 21)); 215 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 216 | // /// assert_eq!(expected, got); 217 | // /// 218 | // /// let expected = Some(HalfMatch::must(0, 32)); 219 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 220 | // /// assert_eq!(expected, got); 221 | // /// 222 | // /// let expected = None; 223 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 224 | // /// assert_eq!(expected, got); 225 | // /// 226 | // /// # Ok::<(), Box>(()) 227 | // /// ``` 228 | // /// 229 | // /// This correctly moves iteration forward even when an empty match occurs: 230 | // /// 231 | // /// ``` 232 | // /// use regex_automata::{ 233 | // /// hybrid::dfa::DFA, 234 | // /// util::iter::Searcher, 235 | // /// HalfMatch, Input, 236 | // /// }; 237 | // /// 238 | // /// let re = DFA::new(r"a|")?; 239 | // /// let mut cache = re.create_cache(); 240 | // /// 241 | // /// let input = Input::new("abba"); 242 | // /// let mut it = Searcher::new(input); 243 | // /// 244 | // /// let expected = Some(HalfMatch::must(0, 1)); 245 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 246 | // /// assert_eq!(expected, got); 247 | // /// 248 | // /// let expected = 
Some(HalfMatch::must(0, 2)); 249 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 250 | // /// assert_eq!(expected, got); 251 | // /// 252 | // /// let expected = Some(HalfMatch::must(0, 4)); 253 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 254 | // /// assert_eq!(expected, got); 255 | // /// 256 | // /// let expected = None; 257 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 258 | // /// assert_eq!(expected, got); 259 | // /// 260 | // /// # Ok::<(), Box>(()) 261 | // /// ``` 262 | // #[inline] 263 | // pub fn advance_half(&mut self, finder: F) -> Option 264 | // where 265 | // F: FnMut(&mut Input) -> Result, MatchError>, 266 | // { 267 | // match self.try_advance_half(finder) { 268 | // Ok(m) => m, 269 | // Err(err) => panic!( 270 | // "unexpected regex half find error: {}\n\ 271 | // to handle find errors, use 'try' or 'search' methods", 272 | // err, 273 | // ), 274 | // } 275 | // } 276 | 277 | /// Return the next match for an infallible search if one exists, and 278 | /// advance to the next position. 279 | /// 280 | /// The search is advanced even in the presence of empty matches by 281 | /// forbidding empty matches from overlapping with any other match. 282 | /// 283 | /// This is like `try_advance`, except errors are converted into panics. 284 | /// 285 | /// # Panics 286 | /// 287 | /// If the given closure returns an error, then this panics. This is useful 288 | /// when you know your underlying regex engine has been configured to not 289 | /// return an error. 
290 | /// 291 | /// # Example 292 | /// 293 | /// This example shows how to use a `Searcher` to iterate over all matches 294 | /// when using a regex based on lazy DFAs: 295 | /// 296 | /// ``` 297 | /// use regex_automata::{ 298 | /// hybrid::regex::Regex, 299 | /// util::iter::Searcher, 300 | /// Match, Input, 301 | /// }; 302 | /// 303 | /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 304 | /// let mut cache = re.create_cache(); 305 | /// 306 | /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); 307 | /// let mut it = Searcher::new(input); 308 | /// 309 | /// let expected = Some(Match::must(0, 0..10)); 310 | /// let got = it.advance(|input| re.try_search(&mut cache, input)); 311 | /// assert_eq!(expected, got); 312 | /// 313 | /// let expected = Some(Match::must(0, 11..21)); 314 | /// let got = it.advance(|input| re.try_search(&mut cache, input)); 315 | /// assert_eq!(expected, got); 316 | /// 317 | /// let expected = Some(Match::must(0, 22..32)); 318 | /// let got = it.advance(|input| re.try_search(&mut cache, input)); 319 | /// assert_eq!(expected, got); 320 | /// 321 | /// let expected = None; 322 | /// let got = it.advance(|input| re.try_search(&mut cache, input)); 323 | /// assert_eq!(expected, got); 324 | /// 325 | /// # Ok::<(), Box>(()) 326 | /// ``` 327 | /// 328 | /// This example shows the same as above, but with the PikeVM. This example 329 | /// is useful because it shows how to use this API even when the regex 330 | /// engine doesn't directly return a `Match`. 
331 | /// 332 | /// ``` 333 | /// use regex_automata::{ 334 | /// nfa::thompson::pikevm::PikeVM, 335 | /// util::iter::Searcher, 336 | /// Match, Input, 337 | /// }; 338 | /// 339 | /// let re = PikeVM::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 340 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 341 | /// 342 | /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); 343 | /// let mut it = Searcher::new(input); 344 | /// 345 | /// let expected = Some(Match::must(0, 0..10)); 346 | /// let got = it.advance(|input| { 347 | /// re.search(&mut cache, input, &mut caps); 348 | /// Ok(caps.get_match()) 349 | /// }); 350 | /// // Note that if we wanted to extract capturing group spans, we could 351 | /// // do that here with 'caps'. 352 | /// assert_eq!(expected, got); 353 | /// 354 | /// let expected = Some(Match::must(0, 11..21)); 355 | /// let got = it.advance(|input| { 356 | /// re.search(&mut cache, input, &mut caps); 357 | /// Ok(caps.get_match()) 358 | /// }); 359 | /// assert_eq!(expected, got); 360 | /// 361 | /// let expected = Some(Match::must(0, 22..32)); 362 | /// let got = it.advance(|input| { 363 | /// re.search(&mut cache, input, &mut caps); 364 | /// Ok(caps.get_match()) 365 | /// }); 366 | /// assert_eq!(expected, got); 367 | /// 368 | /// let expected = None; 369 | /// let got = it.advance(|input| { 370 | /// re.search(&mut cache, input, &mut caps); 371 | /// Ok(caps.get_match()) 372 | /// }); 373 | /// assert_eq!(expected, got); 374 | /// 375 | /// # Ok::<(), Box>(()) 376 | /// ``` 377 | #[inline] 378 | pub fn advance(&mut self, finder: F) -> Option 379 | where 380 | F: FnMut(&mut Input) -> Result, MatchError>, 381 | { 382 | match self.try_advance(finder) { 383 | Ok(m) => m, 384 | Err(err) => panic!( 385 | "unexpected regex find error: {}\n\ 386 | to handle find errors, use 'try' or 'search' methods", 387 | err, 388 | ), 389 | } 390 | } 391 | 392 | /// Return the next half match for a fallible search if one exists, and 393 
| /// advance to the next position. 394 | /// 395 | /// This is like `advance_half`, except it permits callers to handle errors 396 | /// during iteration. 397 | #[inline] 398 | pub fn try_advance_half(&mut self, mut finder: F) -> Result, MatchError> 399 | where 400 | F: FnMut(&mut Input) -> Result, MatchError>, 401 | { 402 | let mut m = match finder(&mut self.input)? { 403 | None => return Ok(None), 404 | Some(m) => m, 405 | }; 406 | if Some(m.offset()) == self.last_match_end { 407 | m = match self.handle_overlapping_empty_half_match(m, finder)? { 408 | None => return Ok(None), 409 | Some(m) => m, 410 | }; 411 | } 412 | self.input.set_start(m.offset()); 413 | self.last_match_end = Some(m.offset()); 414 | Ok(Some(m)) 415 | } 416 | 417 | /// Return the next match for a fallible search if one exists, and advance 418 | /// to the next position. 419 | /// 420 | /// This is like `advance`, except it permits callers to handle errors 421 | /// during iteration. 422 | #[inline] 423 | pub fn try_advance(&mut self, mut finder: F) -> Result, MatchError> 424 | where 425 | F: FnMut(&mut Input) -> Result, MatchError>, 426 | { 427 | let end = self.input.end(); 428 | let mut m = match finder(&mut self.input)? { 429 | None => return Ok(None), 430 | Some(m) => m, 431 | }; 432 | assert!(m.end() <= end); 433 | if m.is_empty() && Some(m.end()) == self.last_match_end { 434 | m = match self.handle_overlapping_empty_match(m, finder)? { 435 | None => return Ok(None), 436 | Some(m) => m, 437 | }; 438 | } 439 | self.input.set_start(m.end()); 440 | self.last_match_end = Some(m.end()); 441 | Ok(Some(m)) 442 | } 443 | 444 | /// Given a closure that executes a single search, return an iterator over 445 | /// all successive non-overlapping half matches. 446 | /// 447 | /// The iterator returned yields result values. If the underlying regex 448 | /// engine is configured to never return an error, consider calling 449 | /// [`TryHalfMatchesIter::infallible`] to convert errors into panics. 
450 | /// 451 | /// # Example 452 | /// 453 | /// This example shows how to use a `Searcher` to create a proper 454 | /// iterator over half matches. 455 | /// 456 | /// ``` 457 | /// use regex_automata::{ 458 | /// hybrid::dfa::DFA, 459 | /// util::iter::Searcher, 460 | /// HalfMatch, Input, 461 | /// }; 462 | /// 463 | /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 464 | /// let mut cache = re.create_cache(); 465 | /// 466 | /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); 467 | /// let mut it = Searcher::new(input).into_half_matches_iter(|input| { 468 | /// re.try_search_fwd(&mut cache, input) 469 | /// }); 470 | /// 471 | /// let expected = Some(Ok(HalfMatch::must(0, 10))); 472 | /// assert_eq!(expected, it.next()); 473 | /// 474 | /// let expected = Some(Ok(HalfMatch::must(0, 21))); 475 | /// assert_eq!(expected, it.next()); 476 | /// 477 | /// let expected = Some(Ok(HalfMatch::must(0, 32))); 478 | /// assert_eq!(expected, it.next()); 479 | /// 480 | /// let expected = None; 481 | /// assert_eq!(expected, it.next()); 482 | /// 483 | /// # Ok::<(), Box>(()) 484 | /// ``` 485 | #[inline] 486 | pub fn into_half_matches_iter(self, finder: F) -> TryHalfMatchesIter 487 | where 488 | F: FnMut(&mut Input) -> Result, MatchError>, 489 | { 490 | TryHalfMatchesIter { it: self, finder } 491 | } 492 | 493 | /// Handles the special case of a match that begins where the previous 494 | /// match ended. Without this special handling, it'd be possible to get 495 | /// stuck where an empty match never results in forward progress. This 496 | /// also makes it more consistent with how presiding general purpose regex 497 | /// engines work. 
498 | #[cold] 499 | #[inline(never)] 500 | fn handle_overlapping_empty_half_match( 501 | &mut self, 502 | _: HalfMatch, 503 | mut finder: F, 504 | ) -> Result, MatchError> 505 | where 506 | F: FnMut(&mut Input) -> Result, MatchError>, 507 | { 508 | // Since we are only here when 'm.offset()' matches the offset of the 509 | // last match, it follows that this must have been an empty match. 510 | // Since we both need to make progress *and* prevent overlapping 511 | // matches, we discard this match and advance the search by 1. 512 | // 513 | // Note that this may start a search in the middle of a codepoint. The 514 | // regex engines themselves are expected to deal with that and not 515 | // report any matches within a codepoint if they are configured in 516 | // UTF-8 mode. 517 | self.input.set_start(self.input.start().checked_add(1).unwrap()); 518 | finder(&mut self.input) 519 | } 520 | 521 | /// Handles the special case of an empty match by ensuring that 1) the 522 | /// iterator always advances and 2) empty matches never overlap with other 523 | /// matches. 524 | /// 525 | /// (1) is necessary because we principally make progress by setting the 526 | /// starting location of the next search to the ending location of the last 527 | /// match. But if a match is empty, then this results in a search that does 528 | /// not advance and thus does not terminate. 529 | /// 530 | /// (2) is not strictly necessary, but makes intuitive sense and matches 531 | /// the presiding behavior of most general purpose regex engines. The 532 | /// "intuitive sense" here is that we want to report NON-overlapping 533 | /// matches. So for example, given the regex 'a|(?:)' against the haystack 534 | /// 'a', without the special handling, you'd get the matches [0, 1) and [1, 535 | /// 1), where the latter overlaps with the end bounds of the former. 
536 | /// 537 | /// Note that we mark this cold and forcefully prevent inlining because 538 | /// handling empty matches like this is extremely rare and does require 539 | /// quite a bit of code, comparatively. Keeping this code out of the main 540 | /// iterator function keeps it smaller and more amenable to inlining 541 | /// itself. 542 | #[cold] 543 | #[inline(never)] 544 | fn handle_overlapping_empty_match( 545 | &mut self, 546 | m: Match, 547 | mut finder: F, 548 | ) -> Result, MatchError> 549 | where 550 | F: FnMut(&mut Input) -> Result, MatchError>, 551 | { 552 | assert!(m.is_empty()); 553 | self.input.set_start(self.input.start().checked_add(1).unwrap()); 554 | finder(&mut self.input) 555 | } 556 | } 557 | 558 | impl Debug for Searcher { 559 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 560 | f.debug_struct("Searcher") 561 | .field("input", &self.input) 562 | .field("last_match_end", &self.last_match_end) 563 | .finish() 564 | } 565 | } 566 | 567 | /// An iterator over all non-overlapping half matches for a fallible search. 568 | /// 569 | /// The iterator yields a `Result` value until no more 570 | /// matches could be found. 571 | /// 572 | /// The type parameters are as follows: 573 | /// 574 | /// * `F` represents the type of a closure that executes the search. 575 | /// 576 | /// The lifetime parameters come from the [`Input`] type: 577 | /// 578 | /// * `'h` is the lifetime of the underlying haystack. 579 | /// 580 | /// When possible, prefer the iterators defined on the regex engine you're 581 | /// using. This tries to abstract over the regex engine and is thus a bit more 582 | /// unwieldy to use. 583 | /// 584 | /// This iterator is created by [`Searcher::into_half_matches_iter`]. 585 | pub struct TryHalfMatchesIter { 586 | it: Searcher, 587 | finder: F, 588 | } 589 | 590 | // impl TryHalfMatchesIter { 591 | // /// Return an infallible version of this iterator. 
592 | // /// 593 | // /// Any item yielded that corresponds to an error results in a panic. This 594 | // /// is useful if your underlying regex engine is configured in a way that 595 | // /// it is guaranteed to never return an error. 596 | // pub fn infallible(self) -> HalfMatchesIter { 597 | // HalfMatchesIter(self) 598 | // } 599 | 600 | // /// Returns the current `Input` used by this iterator. 601 | // /// 602 | // /// The `Input` returned is generally equivalent to the one used to 603 | // /// construct this iterator, but its start position may be different to 604 | // /// reflect the start of the next search to be executed. 605 | // pub fn input(&mut self) -> &mut Input { 606 | // self.it.input() 607 | // } 608 | // } 609 | 610 | impl Iterator for TryHalfMatchesIter 611 | where 612 | F: FnMut(&mut Input) -> Result, MatchError>, 613 | { 614 | type Item = Result; 615 | 616 | #[inline] 617 | fn next(&mut self) -> Option> { 618 | self.it.try_advance_half(&mut self.finder).transpose() 619 | } 620 | } 621 | 622 | impl core::fmt::Debug for TryHalfMatchesIter { 623 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 624 | f.debug_struct("TryHalfMatchesIter") 625 | .field("it", &self.it) 626 | .field("finder", &"") 627 | .finish() 628 | } 629 | } 630 | 631 | /// An iterator over all non-overlapping half matches for an infallible search. 632 | /// 633 | /// The iterator yields a [`HalfMatch`] value until no more matches could be 634 | /// found. 635 | /// 636 | /// The type parameters are as follows: 637 | /// 638 | /// * `F` represents the type of a closure that executes the search. 639 | /// 640 | /// The lifetime parameters come from the [`Input`] type: 641 | /// 642 | /// * `'h` is the lifetime of the underlying haystack. 643 | /// 644 | /// When possible, prefer the iterators defined on the regex engine you're 645 | /// using. This tries to abstract over the regex engine and is thus a bit more 646 | /// unwieldy to use. 
647 | /// 648 | /// This iterator is created by [`Searcher::into_half_matches_iter`] and 649 | /// then calling [`TryHalfMatchesIter::infallible`]. 650 | #[derive(Debug)] 651 | pub struct HalfMatchesIter(TryHalfMatchesIter); 652 | 653 | // impl HalfMatchesIter { 654 | // /// Returns the current `Input` used by this iterator. 655 | // /// 656 | // /// The `Input` returned is generally equivalent to the one used to 657 | // /// construct this iterator, but its start position may be different to 658 | // /// reflect the start of the next search to be executed. 659 | // pub fn input(&mut self) -> &mut Input { 660 | // self.0.it.input() 661 | // } 662 | // } 663 | 664 | // impl Iterator for HalfMatchesIter 665 | // where 666 | // F: FnMut(&mut Input) -> Result, MatchError>, 667 | // { 668 | // type Item = HalfMatch; 669 | 670 | // #[inline] 671 | // fn next(&mut self) -> Option { 672 | // match self.0.next()? { 673 | // Ok(m) => Some(m), 674 | // Err(err) => panic!( 675 | // "unexpected regex half find error: {}\n\ 676 | // to handle find errors, use 'try' or 'search' methods", 677 | // err, 678 | // ), 679 | // } 680 | // } 681 | // } 682 | 683 | // #[cfg(test)] 684 | // pub fn assert_eq( 685 | // mut iter1: impl Iterator, 686 | // mut iter2: impl Iterator, 687 | // ) { 688 | // let mut i = 0; 689 | // loop { 690 | // match (iter1.next(), iter2.next()) { 691 | // (None, None) => break, 692 | // (iter1, iter2) => assert_eq!(iter1, iter2, "{i}"), 693 | // } 694 | // i += 1; 695 | // } 696 | // } 697 | 698 | #[cfg(test)] 699 | pub fn prop_assert_eq( 700 | mut iter1: impl Iterator, 701 | mut iter2: impl Iterator, 702 | ) -> proptest::test_runner::TestCaseResult { 703 | let mut i = 0; 704 | let mut prev = None; 705 | loop { 706 | match (iter1.next(), iter2.next()) { 707 | (None, None) => break, 708 | (iter1, iter2) => { 709 | proptest::prop_assert_eq!(&iter1, &iter2, "i={}, prev={:?}", i, prev); 710 | prev = iter1; 711 | } 712 | } 713 | i += 1; 714 | } 715 | Ok(()) 716 | } 
717 | -------------------------------------------------------------------------------- /src/util/prefilter.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Defines a prefilter for accelerating regex searches. 3 | 4 | A prefilter can be created by building a [`Prefilter`] value. 5 | 6 | A prefilter represents one of the most important optimizations available for 7 | accelerating regex searches. The idea of a prefilter is to very quickly find 8 | candidate locations in a haystack where a regex _could_ match. Once a candidate 9 | is found, it is then intended for the regex engine to run at that position to 10 | determine whether the candidate is a match or a false positive. 11 | 12 | In the aforementioned description of the prefilter optimization also lay its 13 | demise. Namely, if a prefilter has a high false positive rate and it produces 14 | lots of candidates, then a prefilter can overall make a regex search slower. 15 | It can run more slowly because more time is spent ping-ponging between the 16 | prefilter search and the regex engine attempting to confirm each candidate as 17 | a match. This ping-ponging has overhead that adds up, and is exacerbated by 18 | a high false positive rate. 19 | 20 | Nevertheless, the optimization is still generally worth performing in most 21 | cases. Particularly given just how much throughput can be improved. (It is not 22 | uncommon for prefilter optimizations to improve throughput by one or two orders 23 | of magnitude.) 24 | 25 | Typically a prefilter is used to find occurrences of literal prefixes from a 26 | regex pattern, but this isn't required. A prefilter can be used to look for 27 | suffixes or even inner literals. 28 | 29 | Note that as of now, prefilters throw away information about which pattern 30 | each literal comes from. In other words, when a prefilter finds a match, 31 | there's no way to know which pattern (or patterns) it came from. 
Therefore, 32 | in order to confirm a match, you'll have to check all of the patterns by 33 | running the full regex engine. 34 | */ 35 | 36 | use log::debug; 37 | use regex_automata::MatchKind; 38 | use regex_syntax::hir::{literal, Hir}; 39 | 40 | /// Extracts all of the prefix literals from the given HIR expressions into a 41 | /// single `Seq`. The literals in the sequence are ordered with respect to the 42 | /// order of the given HIR expressions and consistent with the match semantics 43 | /// given. 44 | /// 45 | /// The sequence returned is "optimized." That is, they may be shrunk or even 46 | /// truncated according to heuristics with the intent of making them more 47 | /// useful as a prefilter. (Which translates to both using faster algorithms 48 | /// and minimizing the false positive rate.) 49 | /// 50 | /// Note that this erases any connection between the literals and which pattern 51 | /// (or patterns) they came from. 52 | /// 53 | /// The match kind given must correspond to the match semantics of the regex 54 | /// that is represented by the HIRs given. The match semantics may change the 55 | /// literal sequence returned. 
56 | pub(crate) fn prefixes(kind: MatchKind, hirs: &[H]) -> literal::Seq 57 | where 58 | H: core::borrow::Borrow, 59 | { 60 | let mut extractor = literal::Extractor::new(); 61 | extractor.kind(literal::ExtractKind::Prefix); 62 | 63 | let mut prefixes = literal::Seq::empty(); 64 | for hir in hirs { 65 | prefixes.union(&mut extractor.extract(hir.borrow())); 66 | } 67 | debug!( 68 | "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}", 69 | prefixes.len(), 70 | prefixes.is_exact(), 71 | prefixes 72 | ); 73 | match kind { 74 | MatchKind::All => { 75 | prefixes.sort(); 76 | prefixes.dedup(); 77 | } 78 | MatchKind::LeftmostFirst => { 79 | prefixes.optimize_for_prefix_by_preference(); 80 | } 81 | _ => unreachable!(), 82 | } 83 | debug!( 84 | "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}", 85 | prefixes.len(), 86 | prefixes.is_exact(), 87 | prefixes 88 | ); 89 | prefixes 90 | } 91 | -------------------------------------------------------------------------------- /src/util/primitives.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::util::primitives::SmallIndex; 2 | use regex_automata::PatternID; 3 | 4 | #[derive(Clone, Debug)] 5 | pub(crate) struct SmallIndexIter { 6 | rng: core::ops::Range, 7 | } 8 | 9 | impl Iterator for SmallIndexIter { 10 | type Item = SmallIndex; 11 | 12 | fn next(&mut self) -> Option { 13 | if self.rng.start >= self.rng.end { 14 | return None; 15 | } 16 | let next_id = self.rng.start + 1; 17 | let id = core::mem::replace(&mut self.rng.start, next_id); 18 | // new_unchecked is OK since we asserted that the number of 19 | // elements in this iterator will fit in an ID at construction. 20 | Some(SmallIndex::new_unchecked(id)) 21 | } 22 | } 23 | 24 | macro_rules! 
index_type_impls { 25 | ($name:ident, $err:ident, $iter:ident, $withiter:ident) => { 26 | #[derive(Clone, Debug)] 27 | pub(crate) struct $iter(SmallIndexIter); 28 | 29 | impl $iter { 30 | fn new(len: usize) -> $iter { 31 | assert!( 32 | len <= $name::LIMIT, 33 | "cannot create iterator for {} when number of \ 34 | elements exceed {:?}", 35 | stringify!($name), 36 | $name::LIMIT, 37 | ); 38 | $iter(SmallIndexIter { rng: 0..len }) 39 | } 40 | } 41 | 42 | impl Iterator for $iter { 43 | type Item = $name; 44 | 45 | fn next(&mut self) -> Option<$name> { 46 | self.0.next().map(|id| $name::new_unchecked(id.as_usize())) 47 | } 48 | } 49 | 50 | /// An iterator adapter that is like std::iter::Enumerate, but attaches 51 | /// small index values instead. It requires `ExactSizeIterator`. At 52 | /// construction, it ensures that the index of each element in the 53 | /// iterator is representable in the corresponding small index type. 54 | #[derive(Clone, Debug)] 55 | pub(crate) struct $withiter { 56 | it: I, 57 | ids: $iter, 58 | } 59 | 60 | impl $withiter { 61 | fn new(it: I) -> $withiter { 62 | let ids = $iter::new(it.len()); 63 | $withiter { it, ids } 64 | } 65 | } 66 | 67 | impl Iterator for $withiter { 68 | type Item = ($name, I::Item); 69 | 70 | fn next(&mut self) -> Option<($name, I::Item)> { 71 | let item = self.it.next()?; 72 | // Number of elements in this iterator must match, according 73 | // to contract of ExactSizeIterator. 74 | let id = self.ids.next().unwrap(); 75 | Some((id, item)) 76 | } 77 | } 78 | }; 79 | } 80 | 81 | index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter); 82 | // index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter); 83 | 84 | /// A utility trait that defines a couple of adapters for making it convenient 85 | /// to access indices as "small index" types. 
We require ExactSizeIterator so 86 | /// that iterator construction can do a single check to make sure the index of 87 | /// each element is representable by its small index type. 88 | pub(crate) trait IteratorIndexExt: Iterator { 89 | fn with_pattern_ids(self) -> WithPatternIDIter 90 | where 91 | Self: Sized + ExactSizeIterator, 92 | { 93 | WithPatternIDIter::new(self) 94 | } 95 | 96 | // fn with_state_ids(self) -> WithStateIDIter 97 | // where 98 | // Self: Sized + ExactSizeIterator, 99 | // { 100 | // WithStateIDIter::new(self) 101 | // } 102 | } 103 | 104 | impl IteratorIndexExt for I {} 105 | -------------------------------------------------------------------------------- /src/util/sparse_set.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | This module defines a sparse set data structure. Its most interesting 3 | properties are: 4 | 5 | * They preserve insertion order. 6 | * Set membership testing is done in constant time. 7 | * Set insertion is done in constant time. 8 | * Clearing the set is done in constant time. 9 | 10 | The cost for doing this is that the capacity of the set needs to be known up 11 | front, and the elements in the set are limited to state identifiers. 12 | 13 | These sets are principally used when traversing an NFA state graph. This 14 | happens at search time, for example, in the PikeVM. It also happens during DFA 15 | determinization. 16 | */ 17 | 18 | use std::vec; 19 | use std::vec::Vec; 20 | 21 | use regex_automata::util::primitives::StateID; 22 | 23 | /// A sparse set used for representing ordered NFA states. 24 | /// 25 | /// This supports constant time addition and membership testing. Clearing an 26 | /// entire set can also be done in constant time. Iteration yields elements 27 | /// in the order in which they were inserted. 28 | /// 29 | /// The data structure is based on: https://research.swtch.com/sparse 30 | /// Note though that we don't actually use uninitialized memory. 
We generally 31 | /// reuse sparse sets, so the initial allocation cost is bearable. However, its 32 | /// other properties listed above are extremely useful. 33 | #[derive(Clone)] 34 | pub(crate) struct SparseSet { 35 | /// The number of elements currently in this set. 36 | len: usize, 37 | /// Dense contains the ids in the order in which they were inserted. 38 | dense: Vec, 39 | /// Sparse maps ids to their location in dense. 40 | /// 41 | /// A state ID is in the set if and only if 42 | /// sparse[id] < len && id == dense[sparse[id]]. 43 | /// 44 | /// Note that these are indices into 'dense'. It's a little weird to use 45 | /// StateID here, but we know our length can never exceed the bounds of 46 | /// StateID (enforced by 'resize') and StateID will be at most 4 bytes 47 | /// whereas a usize is likely double that in most cases. 48 | sparse: Vec, 49 | } 50 | 51 | impl SparseSet { 52 | /// Create a new sparse set with the given capacity. 53 | /// 54 | /// Sparse sets have a fixed size and they cannot grow. Attempting to 55 | /// insert more distinct elements than the total capacity of the set will 56 | /// result in a panic. 57 | /// 58 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. 59 | #[inline] 60 | pub(crate) fn new(capacity: usize) -> SparseSet { 61 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; 62 | set.resize(capacity); 63 | set 64 | } 65 | 66 | /// Resizes this sparse set to have the new capacity given. 67 | /// 68 | /// This set is automatically cleared. 69 | /// 70 | /// This panics if the capacity given is bigger than `StateID::LIMIT`.
71 | #[inline] 72 | pub(crate) fn resize(&mut self, new_capacity: usize) { 73 | assert!( 74 | new_capacity <= StateID::LIMIT, 75 | "sparse set capacity cannot excced {:?}", 76 | StateID::LIMIT 77 | ); 78 | self.clear(); 79 | self.dense.resize(new_capacity, StateID::ZERO); 80 | self.sparse.resize(new_capacity, StateID::ZERO); 81 | } 82 | 83 | /// Returns the capacity of this set. 84 | /// 85 | /// The capacity represents a fixed limit on the number of distinct 86 | /// elements that are allowed in this set. The capacity cannot be changed. 87 | #[inline] 88 | pub(crate) fn capacity(&self) -> usize { 89 | self.dense.len() 90 | } 91 | 92 | /// Returns the number of elements in this set. 93 | #[inline] 94 | pub(crate) fn len(&self) -> usize { 95 | self.len 96 | } 97 | 98 | /// Returns true if and only if this set is empty. 99 | #[inline] 100 | pub(crate) fn is_empty(&self) -> bool { 101 | self.len() == 0 102 | } 103 | 104 | /// Insert the state ID value into this set and return true if the given 105 | /// state ID was not previously in this set. 106 | /// 107 | /// This operation is idempotent. If the given value is already in this 108 | /// set, then this is a no-op. 109 | /// 110 | /// If more than `capacity` ids are inserted, then this panics. 111 | /// 112 | /// This is marked as inline(always) since the compiler won't inline it 113 | /// otherwise, and it's a fairly hot piece of code in DFA determinization. 114 | #[cfg_attr(feature = "perf-inline", inline(always))] 115 | pub(crate) fn insert(&mut self, id: StateID) -> bool { 116 | if self.contains(id) { 117 | return false; 118 | } 119 | 120 | let i = self.len(); 121 | assert!( 122 | i < self.capacity(), 123 | "{:?} exceeds capacity of {:?} when inserting {:?}", 124 | i, 125 | self.capacity(), 126 | id, 127 | ); 128 | // OK since i < self.capacity() and self.capacity() is guaranteed to 129 | // be <= StateID::LIMIT. 
130 | let index = StateID::new_unchecked(i); 131 | self.dense[index] = id; 132 | self.sparse[id] = index; 133 | self.len += 1; 134 | true 135 | } 136 | 137 | /// Returns true if and only if this set contains the given value. 138 | #[inline] 139 | pub(crate) fn contains(&self, id: StateID) -> bool { 140 | let index = self.sparse[id]; 141 | index.as_usize() < self.len() && self.dense[index] == id 142 | } 143 | 144 | /// Clear this set such that it has no members. 145 | #[inline] 146 | pub(crate) fn clear(&mut self) { 147 | self.len = 0; 148 | } 149 | 150 | #[inline] 151 | pub(crate) fn iter(&self) -> SparseSetIter<'_> { 152 | SparseSetIter(self.dense[..self.len()].iter()) 153 | } 154 | 155 | /// Returns the heap memory usage, in bytes, used by this sparse set. 156 | #[inline] 157 | pub(crate) fn memory_usage(&self) -> usize { 158 | self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE 159 | } 160 | } 161 | 162 | impl core::fmt::Debug for SparseSet { 163 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 164 | let elements: Vec = self.iter().collect(); 165 | f.debug_tuple("SparseSet").field(&elements).finish() 166 | } 167 | } 168 | 169 | /// An iterator over all elements in a sparse set. 170 | /// 171 | /// The lifetime `'a` refers to the lifetime of the set being iterated over. 
172 | #[derive(Debug)] 173 | pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); 174 | 175 | impl<'a> Iterator for SparseSetIter<'a> { 176 | type Item = StateID; 177 | 178 | #[cfg_attr(feature = "perf-inline", inline(always))] 179 | fn next(&mut self) -> Option { 180 | self.0.next().copied() 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /src/util/tests.rs: -------------------------------------------------------------------------------- 1 | use crate::util::{decode, decode_last}; 2 | use crate::Input; 3 | use proptest::{prop_assert_eq, proptest}; 4 | use std::iter::successors; 5 | 6 | proptest! { 7 | #[test] 8 | fn test_decode(haystack: String) { 9 | let foo = ropey::Rope::from_str(&haystack); 10 | let mut input = Input::new(foo.slice(..)); 11 | let first_char = decode(&mut input, 0).transpose().unwrap(); 12 | let res: Vec<_> = successors(first_char.map(|c| (0, c)), |(i, c)| { 13 | decode(&mut input, i + c.len_utf8()) 14 | .transpose() 15 | .unwrap() 16 | .map(|c2| (i + c.len_utf8(), c2)) 17 | }) 18 | .collect(); 19 | let ref_chars: Vec<_> = haystack.char_indices().collect(); 20 | prop_assert_eq!(res, ref_chars); 21 | 22 | // let last_char = decode_last(&[], &mut input, 0).transpose().unwrap(); 23 | // let chars_rev = std::iter::successors(first_char.map(|c| (0, c)), |(i, c)| { 24 | // decode(&mut input, i + c.len_utf8()) 25 | // .transpose() 26 | // .unwrap() 27 | // .map(|c2| (i + c.len_utf8(), c2)) 28 | // }); 29 | } 30 | #[test] 31 | fn test_decode_last(haystack: String) { 32 | let foo = ropey::Rope::from_str(&haystack); 33 | let mut input = Input::new(foo.slice(..)); 34 | let end = haystack.len(); 35 | input.move_to(end); 36 | let first_char = decode_last(haystack[..input.haystack_off()].as_bytes(), &mut input, end).transpose().unwrap(); 37 | let res: Vec<_> = successors(first_char.map(|c| (end - c.len_utf8(), c)), |&(i, _)| { 38 | input.move_to(i); 39 | 
decode_last(haystack[..input.haystack_off()].as_bytes(), &mut input, i) 40 | .transpose() 41 | .unwrap() 42 | .map(|c2| (i - c2.len_utf8(), c2)) 43 | }) 44 | .collect(); 45 | let ref_chars: Vec<_> = haystack.char_indices().rev().collect(); 46 | prop_assert_eq!(res, ref_chars); 47 | 48 | // let last_char = decode_last(&[], &mut input, 0).transpose().unwrap(); 49 | // let chars_rev = std::iter::successors(first_char.map(|c| (0, c)), |(i, c)| { 50 | // decode(&mut input, i + c.len_utf8()) 51 | // .transpose() 52 | // .unwrap() 53 | // .map(|c2| (i + c.len_utf8(), c2)) 54 | // }); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/util/utf8.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Utilities for dealing with UTF-8. 3 | 4 | This module provides some UTF-8 related helper routines, including an 5 | incremental decoder. 6 | */ 7 | 8 | /// Returns true if and only if the given offset in the given bytes falls on a 9 | /// valid UTF-8 encoded codepoint boundary. 10 | /// 11 | /// If `bytes` is not valid UTF-8, then the behavior of this routine is 12 | /// unspecified. 13 | #[cfg_attr(feature = "perf-inline", inline(always))] 14 | pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool { 15 | match bytes.get(i) { 16 | // The position at the end of the bytes always represents an empty 17 | // string, which is a valid boundary. But anything after that doesn't 18 | // make much sense to call valid a boundary. 19 | None => i == bytes.len(), 20 | // Other than ASCII (where the most significant bit is never set), 21 | // valid starting bytes always have their most significant two bits 22 | // set, where as continuation bytes never have their second most 23 | // significant bit set. Therefore, this only returns true when bytes[i] 24 | // corresponds to a byte that begins a valid UTF-8 encoding of a 25 | // Unicode scalar value. 
26 | Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000, 27 | } 28 | } 29 | --------------------------------------------------------------------------------