├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── proptest-regressions ├── engines │ ├── dfa │ │ └── test.txt │ ├── hybrid │ │ └── test.txt │ └── pikevm │ │ └── tests.txt ├── literal │ └── tests.txt └── util │ └── tests.txt ├── rustfmt.toml ├── src ├── cursor.rs ├── engines.rs ├── engines │ ├── dfa.rs │ ├── dfa │ │ ├── accel.rs │ │ ├── search.rs │ │ └── test.rs │ ├── hybrid.rs │ ├── hybrid │ │ ├── search.rs │ │ └── test.rs │ ├── meta │ │ ├── error.rs │ │ ├── literal.rs │ │ ├── mod.rs │ │ ├── regex.rs │ │ ├── strategy.rs │ │ └── wrappers.rs │ ├── pikevm.rs │ └── pikevm │ │ ├── error.rs │ │ └── tests.rs ├── input.rs ├── lib.rs ├── literal.rs ├── literal │ └── tests.rs ├── test_rope.rs ├── tests.rs ├── util.rs └── util │ ├── empty.rs │ ├── iter.rs │ ├── prefilter.rs │ ├── primitives.rs │ ├── sparse_set.rs │ ├── tests.rs │ └── utf8.rs └── test_cases └── syntax.rs /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anyhow" 16 | version = "1.0.89" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" 19 | 20 | [[package]] 21 | name = "autocfg" 22 | version = "1.3.0" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" 25 | 26 | [[package]] 27 | name = "bit-set" 28 | version = "0.5.3" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" 31 | dependencies = [ 32 | "bit-vec", 33 | ] 34 | 35 | [[package]] 36 | name = "bit-vec" 37 | version = "0.6.3" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" 40 | 41 | [[package]] 42 | name = "bitflags" 43 | version = "2.6.0" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" 46 | 47 | [[package]] 48 | name = "bstr" 49 | version = "1.10.0" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "40723b8fb387abc38f4f4a37c09073622e41dd12327033091ef8950659e6dc0c" 52 | dependencies = [ 53 | "memchr", 54 | "serde", 55 | ] 56 | 57 | [[package]] 58 | name = "byteorder" 59 | version = "1.5.0" 60 | source = "registry+https://github.com/rust-lang/crates.io-index" 61 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 62 | 63 | [[package]] 64 | name = "cfg-if" 65 | version = "1.0.0" 66 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 67 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 68 | 69 | [[package]] 70 | name = "equivalent" 71 | version = "1.0.1" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" 74 | 75 | [[package]] 76 | name = "errno" 77 | version = "0.3.9" 78 | source = "registry+https://github.com/rust-lang/crates.io-index" 79 | checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" 80 | dependencies = [ 81 | "libc", 82 | "windows-sys 0.52.0", 83 | ] 84 | 85 | [[package]] 86 | name = "fastrand" 87 | version = "2.1.1" 88 | source = "registry+https://github.com/rust-lang/crates.io-index" 89 | checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" 90 | 91 | [[package]] 92 | name = "fnv" 93 | version = "1.0.7" 94 | source = "registry+https://github.com/rust-lang/crates.io-index" 95 | checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" 96 | 97 | [[package]] 98 | name = "getrandom" 99 | version = "0.2.15" 100 | source = "registry+https://github.com/rust-lang/crates.io-index" 101 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 102 | dependencies = [ 103 | "cfg-if", 104 | "libc", 105 | "wasi", 106 | ] 107 | 108 | [[package]] 109 | name = "hashbrown" 110 | version = "0.14.5" 111 | source = "registry+https://github.com/rust-lang/crates.io-index" 112 | checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" 113 | 114 | [[package]] 115 | name = "indexmap" 116 | version = "2.5.0" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" 119 | dependencies = [ 120 | "equivalent", 121 | "hashbrown", 122 | ] 123 | 124 | [[package]] 125 | name = "lazy_static" 126 | version = "1.5.0" 127 
| source = "registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 129 | 130 | [[package]] 131 | name = "libc" 132 | version = "0.2.158" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" 135 | 136 | [[package]] 137 | name = "libm" 138 | version = "0.2.8" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" 141 | 142 | [[package]] 143 | name = "linux-raw-sys" 144 | version = "0.4.14" 145 | source = "registry+https://github.com/rust-lang/crates.io-index" 146 | checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" 147 | 148 | [[package]] 149 | name = "log" 150 | version = "0.4.22" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" 153 | 154 | [[package]] 155 | name = "memchr" 156 | version = "2.7.4" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 159 | 160 | [[package]] 161 | name = "num-traits" 162 | version = "0.2.19" 163 | source = "registry+https://github.com/rust-lang/crates.io-index" 164 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 165 | dependencies = [ 166 | "autocfg", 167 | "libm", 168 | ] 169 | 170 | [[package]] 171 | name = "once_cell" 172 | version = "1.19.0" 173 | source = "registry+https://github.com/rust-lang/crates.io-index" 174 | checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" 175 | 176 | [[package]] 177 | name = "ppv-lite86" 178 | version = "0.2.20" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = 
"77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" 181 | dependencies = [ 182 | "zerocopy", 183 | ] 184 | 185 | [[package]] 186 | name = "proc-macro2" 187 | version = "1.0.86" 188 | source = "registry+https://github.com/rust-lang/crates.io-index" 189 | checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" 190 | dependencies = [ 191 | "unicode-ident", 192 | ] 193 | 194 | [[package]] 195 | name = "proptest" 196 | version = "1.5.0" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" 199 | dependencies = [ 200 | "bit-set", 201 | "bit-vec", 202 | "bitflags", 203 | "lazy_static", 204 | "num-traits", 205 | "rand", 206 | "rand_chacha", 207 | "rand_xorshift", 208 | "regex-syntax", 209 | "rusty-fork", 210 | "tempfile", 211 | "unarray", 212 | ] 213 | 214 | [[package]] 215 | name = "quick-error" 216 | version = "1.2.3" 217 | source = "registry+https://github.com/rust-lang/crates.io-index" 218 | checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" 219 | 220 | [[package]] 221 | name = "quote" 222 | version = "1.0.37" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 225 | dependencies = [ 226 | "proc-macro2", 227 | ] 228 | 229 | [[package]] 230 | name = "rand" 231 | version = "0.8.5" 232 | source = "registry+https://github.com/rust-lang/crates.io-index" 233 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 234 | dependencies = [ 235 | "libc", 236 | "rand_chacha", 237 | "rand_core", 238 | ] 239 | 240 | [[package]] 241 | name = "rand_chacha" 242 | version = "0.3.1" 243 | source = "registry+https://github.com/rust-lang/crates.io-index" 244 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 245 | dependencies = [ 246 | "ppv-lite86", 247 | "rand_core", 248 | 
] 249 | 250 | [[package]] 251 | name = "rand_core" 252 | version = "0.6.4" 253 | source = "registry+https://github.com/rust-lang/crates.io-index" 254 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 255 | dependencies = [ 256 | "getrandom", 257 | ] 258 | 259 | [[package]] 260 | name = "rand_xorshift" 261 | version = "0.3.0" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" 264 | dependencies = [ 265 | "rand_core", 266 | ] 267 | 268 | [[package]] 269 | name = "regex-automata" 270 | version = "0.4.7" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" 273 | dependencies = [ 274 | "aho-corasick", 275 | "memchr", 276 | "regex-syntax", 277 | ] 278 | 279 | [[package]] 280 | name = "regex-cursor" 281 | version = "0.1.5" 282 | dependencies = [ 283 | "anyhow", 284 | "log", 285 | "memchr", 286 | "proptest", 287 | "regex-automata", 288 | "regex-syntax", 289 | "regex-test", 290 | "ropey", 291 | ] 292 | 293 | [[package]] 294 | name = "regex-syntax" 295 | version = "0.8.4" 296 | source = "registry+https://github.com/rust-lang/crates.io-index" 297 | checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" 298 | 299 | [[package]] 300 | name = "regex-test" 301 | version = "0.1.1" 302 | source = "registry+https://github.com/rust-lang/crates.io-index" 303 | checksum = "da40f0939bc4c598b4326abdbb363a8987aa43d0526e5624aefcf3ed90344e62" 304 | dependencies = [ 305 | "anyhow", 306 | "bstr", 307 | "serde", 308 | "toml", 309 | ] 310 | 311 | [[package]] 312 | name = "ropey" 313 | version = "1.6.1" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "93411e420bcd1a75ddd1dc3caf18c23155eda2c090631a85af21ba19e97093b5" 316 | dependencies = [ 317 | "smallvec", 318 | "str_indices", 319 | ] 320 | 321 | 
[[package]] 322 | name = "rustix" 323 | version = "0.38.37" 324 | source = "registry+https://github.com/rust-lang/crates.io-index" 325 | checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" 326 | dependencies = [ 327 | "bitflags", 328 | "errno", 329 | "libc", 330 | "linux-raw-sys", 331 | "windows-sys 0.52.0", 332 | ] 333 | 334 | [[package]] 335 | name = "rusty-fork" 336 | version = "0.3.0" 337 | source = "registry+https://github.com/rust-lang/crates.io-index" 338 | checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" 339 | dependencies = [ 340 | "fnv", 341 | "quick-error", 342 | "tempfile", 343 | "wait-timeout", 344 | ] 345 | 346 | [[package]] 347 | name = "serde" 348 | version = "1.0.210" 349 | source = "registry+https://github.com/rust-lang/crates.io-index" 350 | checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" 351 | dependencies = [ 352 | "serde_derive", 353 | ] 354 | 355 | [[package]] 356 | name = "serde_derive" 357 | version = "1.0.210" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" 360 | dependencies = [ 361 | "proc-macro2", 362 | "quote", 363 | "syn", 364 | ] 365 | 366 | [[package]] 367 | name = "serde_spanned" 368 | version = "0.6.7" 369 | source = "registry+https://github.com/rust-lang/crates.io-index" 370 | checksum = "eb5b1b31579f3811bf615c144393417496f152e12ac8b7663bf664f4a815306d" 371 | dependencies = [ 372 | "serde", 373 | ] 374 | 375 | [[package]] 376 | name = "smallvec" 377 | version = "1.13.2" 378 | source = "registry+https://github.com/rust-lang/crates.io-index" 379 | checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" 380 | 381 | [[package]] 382 | name = "str_indices" 383 | version = "0.4.3" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = 
"e9557cb6521e8d009c51a8666f09356f4b817ba9ba0981a305bd86aee47bd35c" 386 | 387 | [[package]] 388 | name = "syn" 389 | version = "2.0.77" 390 | source = "registry+https://github.com/rust-lang/crates.io-index" 391 | checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" 392 | dependencies = [ 393 | "proc-macro2", 394 | "quote", 395 | "unicode-ident", 396 | ] 397 | 398 | [[package]] 399 | name = "tempfile" 400 | version = "3.12.0" 401 | source = "registry+https://github.com/rust-lang/crates.io-index" 402 | checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" 403 | dependencies = [ 404 | "cfg-if", 405 | "fastrand", 406 | "once_cell", 407 | "rustix", 408 | "windows-sys 0.59.0", 409 | ] 410 | 411 | [[package]] 412 | name = "toml" 413 | version = "0.8.19" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" 416 | dependencies = [ 417 | "serde", 418 | "serde_spanned", 419 | "toml_datetime", 420 | "toml_edit", 421 | ] 422 | 423 | [[package]] 424 | name = "toml_datetime" 425 | version = "0.6.8" 426 | source = "registry+https://github.com/rust-lang/crates.io-index" 427 | checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" 428 | dependencies = [ 429 | "serde", 430 | ] 431 | 432 | [[package]] 433 | name = "toml_edit" 434 | version = "0.22.21" 435 | source = "registry+https://github.com/rust-lang/crates.io-index" 436 | checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf" 437 | dependencies = [ 438 | "indexmap", 439 | "serde", 440 | "serde_spanned", 441 | "toml_datetime", 442 | "winnow", 443 | ] 444 | 445 | [[package]] 446 | name = "unarray" 447 | version = "0.1.4" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" 450 | 451 | [[package]] 452 | name = "unicode-ident" 453 | version = 
"1.0.13" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" 456 | 457 | [[package]] 458 | name = "wait-timeout" 459 | version = "0.2.0" 460 | source = "registry+https://github.com/rust-lang/crates.io-index" 461 | checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" 462 | dependencies = [ 463 | "libc", 464 | ] 465 | 466 | [[package]] 467 | name = "wasi" 468 | version = "0.11.0+wasi-snapshot-preview1" 469 | source = "registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 471 | 472 | [[package]] 473 | name = "windows-sys" 474 | version = "0.52.0" 475 | source = "registry+https://github.com/rust-lang/crates.io-index" 476 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 477 | dependencies = [ 478 | "windows-targets", 479 | ] 480 | 481 | [[package]] 482 | name = "windows-sys" 483 | version = "0.59.0" 484 | source = "registry+https://github.com/rust-lang/crates.io-index" 485 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 486 | dependencies = [ 487 | "windows-targets", 488 | ] 489 | 490 | [[package]] 491 | name = "windows-targets" 492 | version = "0.52.6" 493 | source = "registry+https://github.com/rust-lang/crates.io-index" 494 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 495 | dependencies = [ 496 | "windows_aarch64_gnullvm", 497 | "windows_aarch64_msvc", 498 | "windows_i686_gnu", 499 | "windows_i686_gnullvm", 500 | "windows_i686_msvc", 501 | "windows_x86_64_gnu", 502 | "windows_x86_64_gnullvm", 503 | "windows_x86_64_msvc", 504 | ] 505 | 506 | [[package]] 507 | name = "windows_aarch64_gnullvm" 508 | version = "0.52.6" 509 | source = "registry+https://github.com/rust-lang/crates.io-index" 510 | checksum = 
"32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 511 | 512 | [[package]] 513 | name = "windows_aarch64_msvc" 514 | version = "0.52.6" 515 | source = "registry+https://github.com/rust-lang/crates.io-index" 516 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 517 | 518 | [[package]] 519 | name = "windows_i686_gnu" 520 | version = "0.52.6" 521 | source = "registry+https://github.com/rust-lang/crates.io-index" 522 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 523 | 524 | [[package]] 525 | name = "windows_i686_gnullvm" 526 | version = "0.52.6" 527 | source = "registry+https://github.com/rust-lang/crates.io-index" 528 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 529 | 530 | [[package]] 531 | name = "windows_i686_msvc" 532 | version = "0.52.6" 533 | source = "registry+https://github.com/rust-lang/crates.io-index" 534 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 535 | 536 | [[package]] 537 | name = "windows_x86_64_gnu" 538 | version = "0.52.6" 539 | source = "registry+https://github.com/rust-lang/crates.io-index" 540 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 541 | 542 | [[package]] 543 | name = "windows_x86_64_gnullvm" 544 | version = "0.52.6" 545 | source = "registry+https://github.com/rust-lang/crates.io-index" 546 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 547 | 548 | [[package]] 549 | name = "windows_x86_64_msvc" 550 | version = "0.52.6" 551 | source = "registry+https://github.com/rust-lang/crates.io-index" 552 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 553 | 554 | [[package]] 555 | name = "winnow" 556 | version = "0.6.18" 557 | source = "registry+https://github.com/rust-lang/crates.io-index" 558 | checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f" 559 | dependencies = [ 560 | 
"memchr", 561 | ] 562 | 563 | [[package]] 564 | name = "zerocopy" 565 | version = "0.7.35" 566 | source = "registry+https://github.com/rust-lang/crates.io-index" 567 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 568 | dependencies = [ 569 | "byteorder", 570 | "zerocopy-derive", 571 | ] 572 | 573 | [[package]] 574 | name = "zerocopy-derive" 575 | version = "0.7.35" 576 | source = "registry+https://github.com/rust-lang/crates.io-index" 577 | checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 578 | dependencies = [ 579 | "proc-macro2", 580 | "quote", 581 | "syn", 582 | ] 583 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "regex-cursor" 3 | description = "regex fork that can search discontiguous haystacks" 4 | version = "0.1.5" 5 | edition = "2021" 6 | documentation = "https://docs.rs/regex-cursor" 7 | author = "Pascal Kuthe " 8 | repository = "https://github.com/pascalkuthe/regex-cursor" 9 | readme = "README.md" 10 | keywords = ["regex", "dfa", "automata", "automaton", "nfa"] 11 | license = "MIT OR Apache-2.0" 12 | categories = ["text-processing"] 13 | rust-version = "1.65" 14 | 15 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 16 | 17 | [dependencies] 18 | log = "0.4.22" 19 | memchr = "2.7" 20 | regex-automata = "0.4.7" 21 | regex-syntax = "0.8.4" 22 | ropey = { version = "1.6.1", default-features = false, optional = true } 23 | 24 | [dev-dependencies] 25 | anyhow = "1.0.89" 26 | proptest = "1.5.0" 27 | regex-test = "0.1.1" 28 | 29 | [features] 30 | default = ["perf-inline", "ropey"] 31 | perf-inline = [] 32 | ropey = ["dep:ropey"] 33 | -------------------------------------------------------------------------------- /LICENSE-APACHE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Pascal Kuthe 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # regex-cursor 2 | 3 | 4 | This crate provides routines for searching **discontiguous strings** for matches of a [regular expression] (aka "regex"). It is based on [regex-automata] and most of the code is adapted from the various crates in the [regex](https://github.com/rust-lang/regex) repository. 5 | 6 | It is intended as a prototype for upstream support for "streaming regex". The cursor based API in this crate is very similar to the API already exposed by `regex`/`regex-automata`. To that end a generic `Cursor` trait is provided that collections can implement. 7 | 8 | A sketch of the cursor API is shown below. The string is yielded in multiple byte chunks. Calling advance moves the cursor to the next chunk. Calling backtrack moves the cursor a chunk back. Backtracking is required by this crate. That makes it unsuitable for searching fully unbuffered streams like bytes sent over a TCP connection. 9 | 10 | ``` rust 11 | pub trait Cursor { 12 | fn chunk(&self) -> &[u8] { .. } 13 | fn advance(&mut self) -> bool { .. } 14 | fn backtrack(&mut self) -> bool { .. } 15 | } 16 | ``` 17 | 18 | Working on this crate showed me that regex backtracks a lot more than expected with most functionality fundamentally requiring backtracking. For network use cases that do not buffer their input the primary use case would likely be detecting a match (without necessarily requiring the matched byte range). Such use cases can be covered by manually feeding bytes into the hybrid and DFA engines from the regex-automata crate.
This approach also has the advantage of allowing the caller to pause the match (async) while waiting for more data allowing the caller to drive the search instead of the engine itself. 19 | 20 | The only part of this crate that could be applied to the fully streaming case is the streaming PikeVM implementation. However, there are some limitations: 21 | * only a single search can be run since the PikeVM may look ahead multiple bytes to disambiguate alternative matches 22 | * Prefilters longer than one byte can not work 23 | * utf-8 mode can not be supported (empty matches may occur between unicode boundaries) 24 | 25 | Currently, the PikeVM implementation is not written with this use case in mind and may call backtrack unnecessarily, but that could be addressed in the future, but especially the first point is very limiting. The pikevm also does not allow the user to drive the search and would block on network calls for example (no async). 26 | 27 | -------------------------------------------------------------------------------- /proptest-regressions/engines/dfa/test.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc 2795080e34081178522520583a3fffdcfeadb09aa47a298f991e102fb6064559 # shrinks to mut haystack = "𛅕", needle = "" 8 | cc 561c3e868d6f45d3071f185399fcd6031baede9ecbda8b4a1f3e9760775dc27e # shrinks to mut haystack = "Σ0🌀𑍇𑵐:𫠠𝕒 ", needle = ":" 9 | cc 63a23412cc7362942174b377418542dd6430d448b0f72833809e22588e872d09 # shrinks to mut haystack = "a", needle = "" 10 | cc 311b1045964903485e0577546cf1341422999100f2e3274f8d4ea61fea074b20 # shrinks to mut haystack = "®", needle = "." 
11 | -------------------------------------------------------------------------------- /proptest-regressions/engines/hybrid/test.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc 3152dced60f8c193302e2adbe9ebd67be558b4af65991b997e5f776920c0459f # shrinks to haystack = "", needle = "" 8 | cc 0a97b5285cbdc808df0e0e829c62fe77de165b9aaf8f15dc0d41a150407a4b01 # shrinks to haystack = "Y", needle = "Y" 9 | cc 3121032e282f21b11023cec49d0119661db16574d821f15b91400b6d66449702 # shrinks to haystack = "&&", needle = "&" 10 | cc f8813009c0bd8c6bdd386e9b17ce8bb83e513707c27985bc2757c56549c7290c # shrinks to haystack = ":a", needle = "$|:" 11 | cc 1cd08976b659689543c93e102417319e7dafe94333d0f2813f5c68dc935bb6cf # shrinks to haystack = "Σ /ⶠaAA ﷏00AAΣ/എ", needle = "/" 12 | cc 7fdff08fc051c9b641db028206943cbb84ca26f8a88e06eadaa5b09b66148d34 # shrinks to mut haystack = "𑊊", needle = "𑒀?." 13 | -------------------------------------------------------------------------------- /proptest-regressions/engines/pikevm/tests.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 
7 | cc 4c899804f8e28d294268b2c482879338edc3be0210465aeaf6a03d65626d386f # shrinks to haystack = "Ѩ", needle = "Ѩ*|A0" 8 | cc 9dcbeee2d5ffde3324638f38b2eefc96a95b0665810c02c12093976a0aba96c5 # shrinks to haystack = "", needle = "^" 9 | cc 0311c531b8a3e09dc21270ace24fc7cdec1d773228a9ce3843888afe4774c4a2 # shrinks to haystack = "", needle = "$" 10 | cc 578435f522160de6326c7cf57b367dc9e52679b796ecf8d331a9684a9ef4d1f7 # shrinks to haystack = " ", needle = "." 11 | -------------------------------------------------------------------------------- /proptest-regressions/literal/tests.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc a1f6f819109c893f29c5f71a0ac13dfcbf04de0dc6411615de2d9587b12d6edf # shrinks to haystack = "", needle = "🌀🤀𛱰a0Aa®ଏ¡𞥞®0" 8 | cc 9fc9553316dab0f5611d42ebdbfda893e991f183f013a13e105570d9bb935bbb # shrinks to haystack = "🀄", needle = [128] 9 | cc 14528483978ac457a80022577321d49eadc3952a4bc848dcf622730341424c50 # shrinks to haystack = "\"", needle = "\"" 10 | cc 0906f449ec7e583178f7865198d5c6c8589f6a760f57fe1e94fa71b751a13dcc # shrinks to haystack = "*", needle = "*" 11 | cc 3dc047ca1210586977bea6afe1c52f3f21b8f778358932316bce56a9c8dd069a # shrinks to mut haystack = "®", needle = "¯" 12 | cc d37b534f1d1d9b91a41efb745325c95e429901bd53d2bc4a31fd55997e5b243a # shrinks to mut haystack = "Ѩ", needle = "Ѩ" 13 | cc ea94b3aca8d5e5c4728504f773d8ec61d1e7a0e3aa8e186b9c953a199cd7e3e2 # shrinks to mut haystack = "A® a𛲜�a0 a0 𖬀 ", needle = "�" 14 | cc 80ea1772c0da540fd9e502978e22f1678ea0a06ec302d38891ecf36be39f966c # shrinks to mut haystack = "0Aa0 ��⺀ A", needle = "�" 15 | 
-------------------------------------------------------------------------------- /proptest-regressions/util/tests.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc 06febfa67a8673673da6a2a4d70869e49f8d45945ae98745208a6266253a5bed # shrinks to haystack = "®" 8 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | use_small_heuristics = "Max" 2 | newline_style = "Unix" 3 | use_field_init_shorthand = true 4 | 5 | imports_granularity = "Module" 6 | group_imports = "StdExternalCrate" 7 | format_macro_matchers = true 8 | format_macro_bodies = true 9 | -------------------------------------------------------------------------------- /src/cursor.rs: -------------------------------------------------------------------------------- 1 | pub trait IntoCursor { 2 | type Cursor: Cursor; 3 | fn into_cursor(self) -> Self::Cursor; 4 | } 5 | 6 | impl IntoCursor for C { 7 | type Cursor = Self; 8 | 9 | fn into_cursor(self) -> Self { 10 | self 11 | } 12 | } 13 | 14 | /// A cursor that allows traversing a discontiguous string like a rope. 15 | pub trait Cursor { 16 | /// Returns the current chunk. If [`utf8_aware`](Cursor::utf8_aware) returns true then this function 17 | /// must **never** return a chunk that splits a unicode codepoint. 18 | /// See [`utf8_aware`](Cursor::utf8_aware) for details. 19 | /// 20 | /// Must never return an empty byteslice unless the underlying collection is empty. 21 | fn chunk(&self) -> &[u8]; 22 | /// Whether this cursor is aware of utf-8 codepoint boundaries. 
23 | /// 24 | /// **`true`** means that this cursor must never split a unicode codepoint at a 25 | /// chunk boundary. In that case all regex features are supported. 26 | /// 27 | /// **`false`** means that this cursor cannot be used for utf-8 mode 28 | /// matching (only affects empty strings) and cannot be used to match 29 | /// unicode word boundaries. 30 | fn utf8_aware(&self) -> bool { 31 | true 32 | } 33 | /// Advances the cursor to the next chunk if possible. In that case `true` 34 | /// must be returned. If the end of data is reached this function should 35 | /// return `false` and **not change the chunk** 36 | fn advance(&mut self) -> bool; 37 | /// Moves the cursor to the previous chunk if possible. In that case `true` 38 | /// must be returned. If the start of data is reached this function should 39 | /// return `false` and **not change the chunk** 40 | fn backtrack(&mut self) -> bool; 41 | /// Returns the total length of the data. This does not 42 | /// take the current cursor position into account and should 43 | /// not change with calls to [`advance`](Cursor::advance) and [`backtrack`](Cursor::backtrack).
44 | fn total_bytes(&self) -> Option; 45 | /// The offset of the current chunk from the start of the haystack in bytes 46 | fn offset(&self) -> usize; 47 | } 48 | 49 | impl Cursor for &mut C { 50 | fn chunk(&self) -> &[u8] { 51 | C::chunk(self) 52 | } 53 | 54 | fn utf8_aware(&self) -> bool { 55 | C::utf8_aware(self) 56 | } 57 | 58 | fn advance(&mut self) -> bool { 59 | C::advance(self) 60 | } 61 | 62 | fn backtrack(&mut self) -> bool { 63 | C::backtrack(self) 64 | } 65 | 66 | fn total_bytes(&self) -> Option { 67 | C::total_bytes(self) 68 | } 69 | 70 | fn offset(&self) -> usize { 71 | C::offset(self) 72 | } 73 | } 74 | 75 | impl Cursor for &[u8] { 76 | fn chunk(&self) -> &[u8] { 77 | self 78 | } 79 | 80 | // true since there are no chunk boundaries 81 | fn utf8_aware(&self) -> bool { 82 | true 83 | } 84 | 85 | fn advance(&mut self) -> bool { 86 | false 87 | } 88 | 89 | fn backtrack(&mut self) -> bool { 90 | false 91 | } 92 | 93 | fn total_bytes(&self) -> Option { 94 | Some(self.len()) 95 | } 96 | fn offset(&self) -> usize { 97 | 0 98 | } 99 | } 100 | 101 | impl Cursor for &str { 102 | fn chunk(&self) -> &[u8] { 103 | self.as_bytes() 104 | } 105 | 106 | // true since there are no chunk boundaries 107 | fn utf8_aware(&self) -> bool { 108 | true 109 | } 110 | 111 | fn advance(&mut self) -> bool { 112 | false 113 | } 114 | 115 | fn backtrack(&mut self) -> bool { 116 | false 117 | } 118 | fn total_bytes(&self) -> Option { 119 | Some(::len(self)) 120 | } 121 | 122 | fn offset(&self) -> usize { 123 | 0 124 | } 125 | } 126 | 127 | #[cfg(feature = "ropey")] 128 | #[derive(Clone, Copy)] 129 | enum Pos { 130 | ChunkStart, 131 | ChunkEnd, 132 | } 133 | 134 | #[cfg(feature = "ropey")] 135 | #[derive(Clone)] 136 | pub struct RopeyCursor<'a> { 137 | iter: ropey::iter::Chunks<'a>, 138 | current: &'a [u8], 139 | pos: Pos, 140 | len: usize, 141 | offset: usize, 142 | } 143 | 144 | #[cfg(feature = "ropey")] 145 | impl<'a> RopeyCursor<'a> { 146 | pub fn new(slice: ropey::RopeSlice<'a>)
-> Self { 147 | let iter = slice.chunks(); 148 | let mut res = 149 | Self { current: &[], iter, pos: Pos::ChunkEnd, len: slice.len_bytes(), offset: 0 }; 150 | res.advance(); 151 | res 152 | } 153 | 154 | pub fn at(slice: ropey::RopeSlice<'a>, at: usize) -> Self { 155 | let (iter, offset, _, _) = slice.chunks_at_byte(at); 156 | if offset == slice.len_bytes() { 157 | let mut res = 158 | Self { current: &[], iter, pos: Pos::ChunkStart, len: slice.len_bytes(), offset }; 159 | res.backtrack(); 160 | res 161 | } else { 162 | let mut res = 163 | Self { current: &[], iter, pos: Pos::ChunkEnd, len: slice.len_bytes(), offset }; 164 | res.advance(); 165 | res 166 | } 167 | } 168 | } 169 | 170 | #[cfg(feature = "ropey")] 171 | impl Cursor for RopeyCursor<'_> { 172 | fn chunk(&self) -> &[u8] { 173 | self.current 174 | } 175 | 176 | fn advance(&mut self) -> bool { 177 | match self.pos { 178 | Pos::ChunkStart => { 179 | self.iter.next(); 180 | self.pos = Pos::ChunkEnd; 181 | } 182 | Pos::ChunkEnd => (), 183 | } 184 | for next in self.iter.by_ref() { 185 | if next.is_empty() { 186 | continue; 187 | } 188 | self.offset += self.current.len(); 189 | self.current = next.as_bytes(); 190 | return true; 191 | } 192 | false 193 | } 194 | 195 | fn backtrack(&mut self) -> bool { 196 | match self.pos { 197 | Pos::ChunkStart => {} 198 | Pos::ChunkEnd => { 199 | self.iter.prev(); 200 | self.pos = Pos::ChunkStart; 201 | } 202 | } 203 | while let Some(prev) = self.iter.prev() { 204 | if prev.is_empty() { 205 | continue; 206 | } 207 | self.offset -= prev.len(); 208 | self.current = prev.as_bytes(); 209 | return true; 210 | } 211 | false 212 | } 213 | 214 | fn utf8_aware(&self) -> bool { 215 | true 216 | } 217 | 218 | fn total_bytes(&self) -> Option { 219 | Some(self.len) 220 | } 221 | 222 | fn offset(&self) -> usize { 223 | self.offset 224 | } 225 | } 226 | 227 | #[cfg(feature = "ropey")] 228 | impl<'a> IntoCursor for ropey::RopeSlice<'a> { 229 | type Cursor = RopeyCursor<'a>; 230 | 231 | fn 
into_cursor(self) -> Self::Cursor { 232 | RopeyCursor::new(self) 233 | } 234 | } 235 | 236 | #[cfg(feature = "ropey")] 237 | impl<'a> IntoCursor for &'a ropey::Rope { 238 | type Cursor = RopeyCursor<'a>; 239 | 240 | fn into_cursor(self) -> Self::Cursor { 241 | RopeyCursor::new(self.slice(..)) 242 | } 243 | } 244 | #[cfg(all(feature = "ropey", test))] 245 | mod ropey_test { 246 | use ropey::Rope; 247 | 248 | use crate::cursor::IntoCursor; 249 | use crate::Cursor; 250 | 251 | #[test] 252 | fn smoke_test() { 253 | let rope = Rope::from_str("abc"); 254 | let mut cursor = rope.into_cursor(); 255 | assert_eq!(cursor.chunk(), "abc".as_bytes()); 256 | assert!(!cursor.advance()); 257 | assert_eq!(cursor.chunk(), "abc".as_bytes()); 258 | assert!(!cursor.backtrack()); 259 | assert_eq!(cursor.chunk(), "abc".as_bytes()); 260 | let rope = Rope::from("abc".repeat(5000)); 261 | let mut cursor = rope.into_cursor(); 262 | let mut offset = 0; 263 | loop { 264 | assert_eq!(cursor.offset(), offset); 265 | offset += cursor.chunk().len(); 266 | if !cursor.advance() { 267 | break; 268 | } 269 | } 270 | loop { 271 | offset -= cursor.chunk().len(); 272 | assert_eq!(cursor.offset(), offset); 273 | if !cursor.backtrack() { 274 | break; 275 | } 276 | } 277 | assert_eq!(cursor.offset(), 0); 278 | assert_eq!(offset, 0); 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /src/engines.rs: -------------------------------------------------------------------------------- 1 | pub mod dfa; 2 | pub mod hybrid; 3 | pub mod meta; 4 | pub mod pikevm; 5 | -------------------------------------------------------------------------------- /src/engines/dfa.rs: -------------------------------------------------------------------------------- 1 | pub use regex_automata::dfa::regex::Regex; 2 | use regex_automata::dfa::Automaton; 3 | use regex_automata::{Anchored, Match, MatchError}; 4 | 5 | use crate::cursor::Cursor; 6 | use crate::util::iter; 7 | use crate::Input; 
8 | 9 | pub use crate::engines::dfa::search::{try_search_fwd, try_search_rev}; 10 | 11 | mod accel; 12 | mod search; 13 | #[cfg(test)] 14 | mod test; 15 | 16 | /// Returns true if either the given input specifies an anchored search 17 | /// or if the underlying NFA is always anchored. 18 | fn is_anchored(regex: &Regex, input: &Input) -> bool { 19 | match input.get_anchored() { 20 | Anchored::No => regex.forward().is_always_start_anchored(), 21 | Anchored::Yes | Anchored::Pattern(_) => true, 22 | } 23 | } 24 | 25 | /// Returns an iterator over all non-overlapping leftmost matches in the 26 | /// given bytes. If no match exists, then the iterator yields no elements. 27 | /// 28 | /// # Panics 29 | /// 30 | /// This routine panics if the search could not complete. This can occur 31 | /// in a number of circumstances: 32 | /// 33 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 34 | /// For example, setting quit bytes or enabling heuristic support for 35 | /// Unicode word boundaries. The default configuration does not enable any 36 | /// option that could result in the lazy DFA quitting. 37 | /// * The configuration of the lazy DFA may also permit it to "give up" 38 | /// on a search if it makes ineffective use of its transition table 39 | /// cache. The default configuration does not enable this by default, 40 | /// although it is typically a good idea to. 41 | /// * When the provided `Input` configuration is not supported. For 42 | /// example, by providing an unsupported anchor mode. 43 | /// 44 | /// When a search panics, callers cannot know whether a match exists or 45 | /// not. 46 | /// 47 | /// The above conditions also apply to the iterator returned as well. For 48 | /// example, if the lazy DFA gives up or quits during a search using this 49 | /// method, then a panic will occur during iteration. 
50 | /// 51 | /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) 52 | /// if you want to handle these error conditions. 53 | /// 54 | /// # Example 55 | /// 56 | /// ``` 57 | /// use regex_automata::{hybrid::regex::Regex, Match}; 58 | /// 59 | /// let re = Regex::new("foo[0-9]+")?; 60 | /// let mut cache = re.create_cache(); 61 | /// 62 | /// let text = "foo1 foo12 foo123"; 63 | /// let matches: Vec = re.find_iter(&mut cache, text).collect(); 64 | /// assert_eq!(matches, vec![ 65 | /// Match::must(0, 0..4), 66 | /// Match::must(0, 5..10), 67 | /// Match::must(0, 11..17), 68 | /// ]); 69 | /// # Ok::<(), Box>(()) 70 | /// ``` 71 | #[inline] 72 | pub fn find_iter(regex: &Regex, input: Input) -> FindMatches<'_, C> { 73 | let it = iter::Searcher::new(input); 74 | FindMatches { re: regex, it } 75 | } 76 | 77 | /// Returns the start and end offset of the leftmost match. If no match 78 | /// exists, then `None` is returned. 79 | /// 80 | /// # Panics 81 | /// 82 | /// This routine panics if the search could not complete. This can occur 83 | /// in a number of circumstances: 84 | /// 85 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 86 | /// For example, setting quit bytes or enabling heuristic support for 87 | /// Unicode word boundaries. The default configuration does not enable any 88 | /// option that could result in the lazy DFA quitting. 89 | /// * The configuration of the lazy DFA may also permit it to "give up" 90 | /// on a search if it makes ineffective use of its transition table 91 | /// cache. The default configuration does not enable this by default, 92 | /// although it is typically a good idea to. 93 | /// * When the provided `Input` configuration is not supported. For 94 | /// example, by providing an unsupported anchor mode. 95 | /// 96 | /// When a search panics, callers cannot know whether a match exists or 97 | /// not. 
98 | /// 99 | /// Use [`Regex::try_search`] if you want to handle these error conditions. 100 | /// 101 | /// # Example 102 | /// 103 | /// ``` 104 | /// use regex_automata::{Match, hybrid::regex::Regex}; 105 | /// 106 | /// let re = Regex::new("foo[0-9]+")?; 107 | /// let mut cache = re.create_cache(); 108 | /// assert_eq!( 109 | /// Some(Match::must(0, 3..11)), 110 | /// re.find(&mut cache, "zzzfoo12345zzz"), 111 | /// ); 112 | /// 113 | /// // Even though a match is found after reading the first byte (`a`), 114 | /// // the default leftmost-first match semantics demand that we find the 115 | /// // earliest match that prefers earlier parts of the pattern over latter 116 | /// // parts. 117 | /// let re = Regex::new("abc|a")?; 118 | /// let mut cache = re.create_cache(); 119 | /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); 120 | /// # Ok::<(), Box>(()) 121 | /// ``` 122 | pub fn find(regex: &Regex, input: &mut Input) -> Option { 123 | try_search(regex, input).unwrap() 124 | } 125 | 126 | /// Returns the start and end offset of the leftmost match. If no match 127 | /// exists, then `None` is returned. 128 | /// 129 | /// This is like [`Regex::find`] but with two differences: 130 | /// 131 | /// 1. It is not generic over `Into` and instead accepts a 132 | /// `&Input`. This permits reusing the same `Input` for multiple searches 133 | /// without needing to create a new one. This _may_ help with latency. 134 | /// 2. It returns an error if the search could not complete where as 135 | /// [`Regex::find`] will panic. 136 | /// 137 | /// # Errors 138 | /// 139 | /// This routine errors if the search could not complete. This can occur 140 | /// in a number of circumstances: 141 | /// 142 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 143 | /// For example, setting quit bytes or enabling heuristic support for 144 | /// Unicode word boundaries. 
The default configuration does not enable any 145 | /// option that could result in the lazy DFA quitting. 146 | /// * The configuration of the lazy DFA may also permit it to "give up" 147 | /// on a search if it makes ineffective use of its transition table 148 | /// cache. The default configuration does not enable this by default, 149 | /// although it is typically a good idea to. 150 | /// * When the provided `Input` configuration is not supported. For 151 | /// example, by providing an unsupported anchor mode. 152 | /// 153 | /// When a search returns an error, callers cannot know whether a match 154 | /// exists or not. 155 | pub fn try_search( 156 | regex: &Regex, 157 | input: &mut Input, 158 | ) -> Result, MatchError> { 159 | let fwd = regex.forward(); 160 | let end = match try_search_fwd(fwd, input)? { 161 | None => return Ok(None), 162 | Some(end) => end, 163 | }; 164 | // This special cases an empty match at the beginning of the search. If 165 | // our end matches our start, then since a reverse DFA can't match past 166 | // the start, it must follow that our starting position is also our end 167 | // position. So short circuit and skip the reverse search. 168 | if input.start() == end.offset() { 169 | return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset()))); 170 | } 171 | // We can also skip the reverse search if we know our search was 172 | // anchored. This occurs either when the input config is anchored or 173 | // when we know the regex itself is anchored. In this case, we know the 174 | // start of the match, if one is found, must be the start of the 175 | // search. 176 | if is_anchored(regex, input) { 177 | return Ok(Some(Match::new(end.pattern(), input.start()..end.offset()))); 178 | } 179 | // N.B. I have tentatively convinced myself that it isn't necessary 180 | // to specify the specific pattern for the reverse search since the 181 | // reverse search will always find the same pattern to match as the 182 | // forward search. 
But I lack a rigorous proof. Why not just provide 183 | // the pattern anyway? Well, if it is needed, then leaving it out 184 | // gives us a chance to find a witness. (Also, if we don't need to 185 | // specify the pattern, then we don't need to build the reverse DFA 186 | // with 'starts_for_each_pattern' enabled. It doesn't matter too much 187 | // for the lazy DFA, but does make the overall DFA bigger.) 188 | // 189 | // We also need to be careful to disable 'earliest' for the reverse 190 | // search, since it could be enabled for the forward search. In the 191 | // reverse case, to satisfy "leftmost" criteria, we need to match as 192 | // much as we can. We also need to be careful to make the search 193 | // anchored. We don't want the reverse search to report any matches 194 | // other than the one beginning at the end of our forward search. 195 | 196 | let match_range = input.start()..end.offset(); 197 | let start = input.with(|mut revsearch| { 198 | revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false); 199 | try_search_rev(regex.reverse(), revsearch) 200 | }); 201 | let start = start?.expect("reverse search must match if forward search does"); 202 | debug_assert_eq!( 203 | start.pattern(), 204 | end.pattern(), 205 | "forward and reverse search must match same pattern", 206 | ); 207 | debug_assert!(start.offset() <= end.offset()); 208 | debug_assert!(end.offset() <= input.end()); 209 | debug_assert!(input.start() <= start.offset()); 210 | Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) 211 | } 212 | 213 | /// An iterator over all non-overlapping matches for an infallible search. 214 | /// 215 | /// The iterator yields a [`Match`] value until no more matches could be found. 216 | /// If the underlying regex engine returns an error, then a panic occurs. 217 | /// 218 | /// This iterator can be created with the [`Regex::find_iter`] method. 
219 | #[derive(Debug)] 220 | pub struct FindMatches<'r, C: Cursor> { 221 | re: &'r Regex, 222 | it: iter::Searcher, 223 | } 224 | 225 | impl<'r, C: Cursor> Iterator for FindMatches<'r, C> { 226 | type Item = Match; 227 | 228 | #[inline] 229 | fn next(&mut self) -> Option { 230 | let FindMatches { re, ref mut it } = *self; 231 | it.advance(|input| try_search(re, input)) 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/engines/dfa/accel.rs: -------------------------------------------------------------------------------- 1 | use crate::cursor::Cursor; 2 | use crate::Input; 3 | 4 | /// Search for between 1 and 3 needle bytes in the given haystack, starting the 5 | /// search at the given position. If `needles` has a length other than 1-3, 6 | /// then this panics. 7 | #[cfg_attr(feature = "perf-inline", inline(always))] 8 | pub(crate) fn find_fwd_imp(needles: &[u8], haystack: &[u8], at: usize) -> Option { 9 | let bs = needles; 10 | let i = match needles.len() { 11 | 1 => memchr::memchr(bs[0], &haystack[at..])?, 12 | 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?, 13 | 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?, 14 | 0 => panic!("cannot find with empty needles"), 15 | n => panic!("invalid needles length: {}", n), 16 | }; 17 | Some(at + i) 18 | } 19 | /// Search for between 1 and 3 needle bytes in the given input, starting the 20 | /// search at the given position. If `needles` has a length other than 1-3, 21 | /// then this panics. 
22 | #[cfg_attr(feature = "perf-inline", inline(always))] 23 | pub(crate) fn find_fwd( 24 | needles: &[u8], 25 | input: &mut Input, 26 | at: usize, 27 | ) -> Option { 28 | if let Some(pos) = find_fwd_imp(needles, input.chunk(), at) { 29 | return Some(pos); 30 | } 31 | while input.chunk_offset() + input.chunk().len() < input.end() && input.advance() { 32 | if let Some(pos) = find_fwd_imp(needles, input.chunk(), 0) { 33 | return Some(pos); 34 | } 35 | } 36 | None 37 | } 38 | 39 | /// Search for between 1 and 3 needle bytes in the given haystack in reverse, 40 | /// starting the search at the given position. If `needles` has a length other 41 | /// than 1-3, then this panics. 42 | #[cfg_attr(feature = "perf-inline", inline(always))] 43 | pub(crate) fn find_rev_imp(needles: &[u8], haystack: &[u8], at: usize) -> Option { 44 | let bs = needles; 45 | match needles.len() { 46 | 1 => memchr::memrchr(bs[0], &haystack[..at]), 47 | 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]), 48 | 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]), 49 | 0 => panic!("cannot find with empty needles"), 50 | n => panic!("invalid needles length: {}", n), 51 | } 52 | } 53 | /// Search for between 1 and 3 needle bytes in the given input, starting the 54 | /// search at the given position. If `needles` has a length other than 1-3, 55 | /// then this panics. 
56 | #[cfg_attr(feature = "perf-inline", inline(always))] 57 | pub(crate) fn find_rev( 58 | needles: &[u8], 59 | input: &mut Input, 60 | at: usize, 61 | ) -> Option { 62 | if let Some(pos) = find_rev_imp(needles, input.chunk(), at) { 63 | return Some(pos); 64 | } 65 | while input.start() < input.chunk_offset() && input.backtrack() { 66 | if let Some(pos) = find_rev_imp(needles, input.chunk(), input.chunk().len()) { 67 | return Some(pos); 68 | } 69 | } 70 | None 71 | } 72 | -------------------------------------------------------------------------------- /src/engines/dfa/test.rs: -------------------------------------------------------------------------------- 1 | use proptest::proptest; 2 | 3 | use crate::engines::dfa::find_iter; 4 | use crate::input::Input; 5 | use crate::test_rope::SingleByteChunks; 6 | 7 | #[test] 8 | fn searcher() { 9 | let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap(); 10 | let regex = super::Regex::builder() 11 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) 12 | .build("vec") 13 | .unwrap(); 14 | let rope = ropey::Rope::from_str(&text); 15 | let matches: Vec<_> = find_iter(®ex, Input::new(rope.slice(..))) 16 | .map(|range| rope.byte_slice(range.range())) 17 | .collect(); 18 | assert_eq!(matches.len(), 68); 19 | } 20 | 21 | #[test] 22 | fn anchor() { 23 | let haystack = ":a"; 24 | let needle = "$|:"; 25 | let foo = SingleByteChunks::new(haystack.as_bytes()); 26 | let regex = super::Regex::builder() 27 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true).unicode(false)) 28 | .build(needle) 29 | .unwrap(); 30 | let iter1: Vec<_> = regex.find_iter(haystack).collect(); 31 | let iter2: Vec<_> = find_iter(®ex, Input::new(foo)).collect(); 32 | assert_eq!(iter1, iter2); 33 | } 34 | 35 | #[test] 36 | fn end_of_input() { 37 | let haystack = "a b c"; 38 | let needle = "\\b"; 39 | let foo = SingleByteChunks::new(haystack.as_bytes()); 40 | let regex = super::Regex::builder() 41 | 
.syntax(regex_automata::util::syntax::Config::new().case_insensitive(true).unicode(false)) 42 | .build(needle) 43 | .unwrap(); 44 | let iter1: Vec<_> = regex.find_iter(haystack).collect(); 45 | let iter2: Vec<_> = find_iter(®ex, Input::new(foo)).collect(); 46 | assert_eq!(iter1, iter2); 47 | } 48 | 49 | #[test] 50 | fn hotloop_transition() { 51 | let haystack = "Σ /ⶠaAA ﷏00AAΣ/എ"; 52 | let needle = "/"; 53 | let foo = ropey::Rope::from_str(haystack); 54 | let regex = super::Regex::builder() 55 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) 56 | .build(needle) 57 | .unwrap(); 58 | let iter1: Vec<_> = regex.find_iter(haystack).collect(); 59 | let iter2: Vec<_> = find_iter(®ex, Input::new(&foo)).collect(); 60 | assert_eq!(iter1, iter2); 61 | } 62 | 63 | proptest! { 64 | #[test] 65 | fn matches(mut haystack: String, needle: String) { 66 | haystack = haystack.repeat(1024); 67 | let foo = ropey::Rope::from_str(&haystack); 68 | let Ok(regex) = super::Regex::builder() 69 | .syntax(regex_automata::util::syntax::Config::new() 70 | .case_insensitive(true) 71 | ) 72 | .build(&needle) else { 73 | return Ok(()) 74 | }; 75 | let iter1 = regex.find_iter( &haystack); 76 | let iter2 = find_iter(®ex, Input::new(&foo)); 77 | crate::util::iter::prop_assert_eq(iter1, iter2)?; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/engines/hybrid.rs: -------------------------------------------------------------------------------- 1 | pub use regex_automata::hybrid::regex::{Cache, Regex}; 2 | use regex_automata::{Anchored, Match, MatchError}; 3 | 4 | use crate::cursor::Cursor; 5 | use crate::input::Input; 6 | use crate::util::iter; 7 | 8 | pub use crate::engines::hybrid::search::{try_search_fwd, try_search_rev}; 9 | 10 | mod search; 11 | #[cfg(test)] 12 | mod test; 13 | 14 | /// Returns true if either the given input specifies an anchored search 15 | /// or if the underlying NFA is always anchored. 
16 | fn is_anchored(regex: &Regex, input: &Input) -> bool { 17 | match input.get_anchored() { 18 | Anchored::No => regex.forward().get_nfa().is_always_start_anchored(), 19 | Anchored::Yes | Anchored::Pattern(_) => true, 20 | } 21 | } 22 | 23 | /// Returns an iterator over all non-overlapping leftmost matches in the 24 | /// given bytes. If no match exists, then the iterator yields no elements. 25 | /// 26 | /// # Panics 27 | /// 28 | /// This routine panics if the search could not complete. This can occur 29 | /// in a number of circumstances: 30 | /// 31 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 32 | /// For example, setting quit bytes or enabling heuristic support for 33 | /// Unicode word boundaries. The default configuration does not enable any 34 | /// option that could result in the lazy DFA quitting. 35 | /// * The configuration of the lazy DFA may also permit it to "give up" 36 | /// on a search if it makes ineffective use of its transition table 37 | /// cache. The default configuration does not enable this by default, 38 | /// although it is typically a good idea to. 39 | /// * When the provided `Input` configuration is not supported. For 40 | /// example, by providing an unsupported anchor mode. 41 | /// 42 | /// When a search panics, callers cannot know whether a match exists or 43 | /// not. 44 | /// 45 | /// The above conditions also apply to the iterator returned as well. For 46 | /// example, if the lazy DFA gives up or quits during a search using this 47 | /// method, then a panic will occur during iteration. 48 | /// 49 | /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) 50 | /// if you want to handle these error conditions. 
51 | /// 52 | /// # Example 53 | /// 54 | /// ``` 55 | /// use regex_automata::{hybrid::regex::Regex, Match}; 56 | /// 57 | /// let re = Regex::new("foo[0-9]+")?; 58 | /// let mut cache = re.create_cache(); 59 | /// 60 | /// let text = "foo1 foo12 foo123"; 61 | /// let matches: Vec = re.find_iter(&mut cache, text).collect(); 62 | /// assert_eq!(matches, vec![ 63 | /// Match::must(0, 0..4), 64 | /// Match::must(0, 5..10), 65 | /// Match::must(0, 11..17), 66 | /// ]); 67 | /// # Ok::<(), Box>(()) 68 | /// ``` 69 | #[inline] 70 | pub fn find_iter<'r, 'c, C: Cursor>( 71 | regex: &'r Regex, 72 | cache: &'c mut Cache, 73 | input: Input, 74 | ) -> FindMatches<'r, 'c, C> { 75 | let it = iter::Searcher::new(input); 76 | FindMatches { re: regex, cache, it } 77 | } 78 | 79 | /// Returns the start and end offset of the leftmost match. If no match 80 | /// exists, then `None` is returned. 81 | /// 82 | /// # Panics 83 | /// 84 | /// This routine panics if the search could not complete. This can occur 85 | /// in a number of circumstances: 86 | /// 87 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 88 | /// For example, setting quit bytes or enabling heuristic support for 89 | /// Unicode word boundaries. The default configuration does not enable any 90 | /// option that could result in the lazy DFA quitting. 91 | /// * The configuration of the lazy DFA may also permit it to "give up" 92 | /// on a search if it makes ineffective use of its transition table 93 | /// cache. The default configuration does not enable this by default, 94 | /// although it is typically a good idea to. 95 | /// * When the provided `Input` configuration is not supported. For 96 | /// example, by providing an unsupported anchor mode. 97 | /// 98 | /// When a search panics, callers cannot know whether a match exists or 99 | /// not. 100 | /// 101 | /// Use [`Regex::try_search`] if you want to handle these error conditions. 
102 | /// 103 | /// # Example 104 | /// 105 | /// ``` 106 | /// use regex_automata::{Match, hybrid::regex::Regex}; 107 | /// 108 | /// let re = Regex::new("foo[0-9]+")?; 109 | /// let mut cache = re.create_cache(); 110 | /// assert_eq!( 111 | /// Some(Match::must(0, 3..11)), 112 | /// re.find(&mut cache, "zzzfoo12345zzz"), 113 | /// ); 114 | /// 115 | /// // Even though a match is found after reading the first byte (`a`), 116 | /// // the default leftmost-first match semantics demand that we find the 117 | /// // earliest match that prefers earlier parts of the pattern over latter 118 | /// // parts. 119 | /// let re = Regex::new("abc|a")?; 120 | /// let mut cache = re.create_cache(); 121 | /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); 122 | /// # Ok::<(), Box>(()) 123 | /// ``` 124 | pub fn find(regex: &Regex, cache: &mut Cache, input: &mut Input) -> Option { 125 | try_search(regex, cache, input).unwrap() 126 | } 127 | 128 | /// Returns the start and end offset of the leftmost match. If no match 129 | /// exists, then `None` is returned. 130 | /// 131 | /// This is like [`Regex::find`] but with two differences: 132 | /// 133 | /// 1. It is not generic over `Into` and instead accepts a 134 | /// `&Input`. This permits reusing the same `Input` for multiple searches 135 | /// without needing to create a new one. This _may_ help with latency. 136 | /// 2. It returns an error if the search could not complete where as 137 | /// [`Regex::find`] will panic. 138 | /// 139 | /// # Errors 140 | /// 141 | /// This routine errors if the search could not complete. This can occur 142 | /// in a number of circumstances: 143 | /// 144 | /// * The configuration of the lazy DFA may permit it to "quit" the search. 145 | /// For example, setting quit bytes or enabling heuristic support for 146 | /// Unicode word boundaries. The default configuration does not enable any 147 | /// option that could result in the lazy DFA quitting. 
148 | /// * The configuration of the lazy DFA may also permit it to "give up" 149 | /// on a search if it makes ineffective use of its transition table 150 | /// cache. The default configuration does not enable this by default, 151 | /// although it is typically a good idea to. 152 | /// * When the provided `Input` configuration is not supported. For 153 | /// example, by providing an unsupported anchor mode. 154 | /// 155 | /// When a search returns an error, callers cannot know whether a match 156 | /// exists or not. 157 | pub fn try_search( 158 | regex: &Regex, 159 | cache: &mut Cache, 160 | input: &mut Input, 161 | ) -> Result, MatchError> { 162 | let (fcache, rcache) = cache.as_parts_mut(); 163 | let end = match try_search_fwd(regex.forward(), fcache, input)? { 164 | None => return Ok(None), 165 | Some(end) => end, 166 | }; 167 | // This special cases an empty match at the beginning of the search. If 168 | // our end matches our start, then since a reverse DFA can't match past 169 | // the start, it must follow that our starting position is also our end 170 | // position. So short circuit and skip the reverse search. 171 | if input.start() == end.offset() { 172 | return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset()))); 173 | } 174 | // We can also skip the reverse search if we know our search was 175 | // anchored. This occurs either when the input config is anchored or 176 | // when we know the regex itself is anchored. In this case, we know the 177 | // start of the match, if one is found, must be the start of the 178 | // search. 179 | if is_anchored(regex, input) { 180 | return Ok(Some(Match::new(end.pattern(), input.start()..end.offset()))); 181 | } 182 | // N.B. I have tentatively convinced myself that it isn't necessary 183 | // to specify the specific pattern for the reverse search since the 184 | // reverse search will always find the same pattern to match as the 185 | // forward search. But I lack a rigorous proof. 
Why not just provide 186 | // the pattern anyway? Well, if it is needed, then leaving it out 187 | // gives us a chance to find a witness. (Also, if we don't need to 188 | // specify the pattern, then we don't need to build the reverse DFA 189 | // with 'starts_for_each_pattern' enabled. It doesn't matter too much 190 | // for the lazy DFA, but does make the overall DFA bigger.) 191 | // 192 | // We also need to be careful to disable 'earliest' for the reverse 193 | // search, since it could be enabled for the forward search. In the 194 | // reverse case, to satisfy "leftmost" criteria, we need to match as 195 | // much as we can. We also need to be careful to make the search 196 | // anchored. We don't want the reverse search to report any matches 197 | // other than the one beginning at the end of our forward search. 198 | 199 | let match_range = input.start()..end.offset(); 200 | let start = input.with(|mut revsearch| { 201 | revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false); 202 | try_search_rev(regex.reverse(), rcache, revsearch) 203 | }); 204 | let start = start?.expect("reverse search must match if forward search does"); 205 | debug_assert_eq!( 206 | start.pattern(), 207 | end.pattern(), 208 | "forward and reverse search must match same pattern", 209 | ); 210 | debug_assert!(start.offset() <= end.offset()); 211 | debug_assert!(end.offset() <= input.end()); 212 | debug_assert!(input.start() <= start.offset()); 213 | Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) 214 | } 215 | 216 | /// An iterator over all non-overlapping matches for an infallible search. 217 | /// 218 | /// The iterator yields a [`Match`] value until no more matches could be found. 219 | /// If the underlying regex engine returns an error, then a panic occurs. 220 | /// 221 | /// The lifetime parameters are as follows: 222 | /// 223 | /// * `'r` represents the lifetime of the regex object. 
224 | /// * `'h` represents the lifetime of the haystack being searched. 225 | /// * `'c` represents the lifetime of the regex cache. 226 | /// 227 | /// This iterator can be created with the [`Regex::find_iter`] method. 228 | #[derive(Debug)] 229 | pub struct FindMatches<'r, 'c, C: Cursor> { 230 | re: &'r Regex, 231 | cache: &'c mut Cache, 232 | it: iter::Searcher, 233 | } 234 | 235 | impl<'r, 'c, C: Cursor> Iterator for FindMatches<'r, 'c, C> { 236 | type Item = Match; 237 | 238 | #[inline] 239 | fn next(&mut self) -> Option { 240 | let FindMatches { re, ref mut cache, ref mut it } = *self; 241 | it.advance(|input| try_search(re, cache, input)) 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /src/engines/hybrid/test.rs: -------------------------------------------------------------------------------- 1 | use proptest::proptest; 2 | 3 | use crate::engines::hybrid::find_iter; 4 | use crate::input::Input; 5 | 6 | #[test] 7 | fn searcher() { 8 | let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap(); 9 | let regex = super::Regex::builder() 10 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) 11 | .build("vec") 12 | .unwrap(); 13 | let mut cache = regex.create_cache(); 14 | let rope = ropey::Rope::from_str(&text); 15 | let matches: Vec<_> = find_iter(®ex, &mut cache, Input::new(&rope)) 16 | .map(|range| rope.byte_slice(range.range())) 17 | .collect(); 18 | assert_eq!(matches.len(), 68); 19 | } 20 | 21 | #[test] 22 | fn anchor() { 23 | let haystack = ":a"; 24 | let needle = "$|:"; 25 | let foo = ropey::Rope::from_str(haystack); 26 | let regex = super::Regex::builder() 27 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true).unicode(false)) 28 | .build(needle) 29 | .unwrap(); 30 | let mut cache1 = regex.create_cache(); 31 | let mut cache2 = regex.create_cache(); 32 | let iter1: Vec<_> = regex.find_iter(&mut cache1, haystack).collect(); 33 | let iter2: 
Vec<_> = find_iter(®ex, &mut cache2, Input::new(&foo)).collect(); 34 | assert_eq!(iter1, iter2); 35 | } 36 | 37 | #[test] 38 | fn hotloop_transition() { 39 | let haystack = "Σ /ⶠaAA ﷏00AAΣ/എ"; 40 | let needle = "/"; 41 | let foo = ropey::Rope::from_str(haystack); 42 | let regex = super::Regex::builder() 43 | .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) 44 | .build(needle) 45 | .unwrap(); 46 | let mut cache1 = regex.create_cache(); 47 | let mut cache2 = regex.create_cache(); 48 | let iter1: Vec<_> = regex.find_iter(&mut cache1, haystack).collect(); 49 | let iter2: Vec<_> = find_iter(®ex, &mut cache2, Input::new(&foo)).collect(); 50 | assert_eq!(iter1, iter2); 51 | } 52 | 53 | proptest! { 54 | #[test] 55 | fn matches(mut haystack: String, needle: String) { 56 | haystack = haystack.repeat(1024); 57 | let foo = ropey::Rope::from_str(&haystack); 58 | let Ok(regex) = super::Regex::builder() 59 | .syntax(regex_automata::util::syntax::Config::new() 60 | .case_insensitive(true) 61 | ) 62 | .build(&needle) else { 63 | return Ok(()) 64 | }; 65 | let mut cache1 = regex.create_cache(); 66 | let mut cache2 = regex.create_cache(); 67 | let iter1 = regex.find_iter(&mut cache1, &haystack); 68 | let iter2 = find_iter(®ex, &mut cache2, Input::new(&foo)); 69 | crate::util::iter::prop_assert_eq(iter1, iter2)?; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/engines/meta/error.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::{nfa, MatchError, MatchErrorKind, PatternID}; 2 | use regex_syntax::{ast, hir}; 3 | 4 | /// An error that occurs when construction of a `Regex` fails. 5 | /// 6 | /// A build error is generally a result of one of two possible failure 7 | /// modes. First is a parse or syntax error in the concrete syntax of a 8 | /// pattern. 
Second is that the construction of the underlying regex matcher 9 | /// fails, usually because it gets too big with respect to limits like 10 | /// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit). 11 | /// 12 | /// This error provides very little introspection capabilities. You can: 13 | /// 14 | /// * Ask for the [`PatternID`] of the pattern that caused an error, if one 15 | /// is available. This is available for things like syntax errors, but not for 16 | /// cases where build limits are exceeded. 17 | /// * Ask for the underlying syntax error, but only if the error is a syntax 18 | /// error. 19 | /// * Ask for a human readable message corresponding to the underlying error. 20 | /// * The `BuildError::source` method (from the `std::error::Error` 21 | /// trait implementation) may be used to query for an underlying error if one 22 | /// exists. There are no API guarantees about which error is returned. 23 | /// 24 | /// When the `std` feature is enabled, this implements `std::error::Error`. 25 | #[derive(Clone, Debug)] 26 | pub struct BuildError { 27 | kind: BuildErrorKind, 28 | } 29 | 30 | #[derive(Clone, Debug)] 31 | enum BuildErrorKind { 32 | Syntax { pid: PatternID, err: regex_syntax::Error }, 33 | NFA(nfa::thompson::BuildError), 34 | } 35 | 36 | impl BuildError { 37 | /// If it is known which pattern ID caused this build error to occur, then 38 | /// this method returns it. 39 | /// 40 | /// Some errors are not associated with a particular pattern. However, any 41 | /// errors that occur as part of parsing a pattern are guaranteed to be 42 | /// associated with a pattern ID. 43 | /// 44 | /// # Example 45 | /// 46 | /// ``` 47 | /// use regex_automata::{meta::Regex, PatternID}; 48 | /// 49 | /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err(); 50 | /// assert_eq!(Some(PatternID::must(2)), err.pattern()); 51 | /// ``` 52 | pub fn pattern(&self) -> Option { 53 | match self.kind { 54 | BuildErrorKind::Syntax { pid, .. 
} => Some(pid), 55 | _ => None, 56 | } 57 | } 58 | 59 | /// If this error occurred because the regex exceeded the configured size 60 | /// limit before being built, then this returns the configured size limit. 61 | /// 62 | /// The limit returned is what was configured, and corresponds to the 63 | /// maximum amount of heap usage in bytes. 64 | pub fn size_limit(&self) -> Option { 65 | match self.kind { 66 | BuildErrorKind::NFA(ref err) => err.size_limit(), 67 | _ => None, 68 | } 69 | } 70 | 71 | /// If this error corresponds to a syntax error, then a reference to it is 72 | /// returned by this method. 73 | pub fn syntax_error(&self) -> Option<®ex_syntax::Error> { 74 | match self.kind { 75 | BuildErrorKind::Syntax { ref err, .. } => Some(err), 76 | _ => None, 77 | } 78 | } 79 | 80 | pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError { 81 | let err = regex_syntax::Error::from(err); 82 | BuildError { kind: BuildErrorKind::Syntax { pid, err } } 83 | } 84 | 85 | pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError { 86 | let err = regex_syntax::Error::from(err); 87 | BuildError { kind: BuildErrorKind::Syntax { pid, err } } 88 | } 89 | 90 | pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError { 91 | BuildError { kind: BuildErrorKind::NFA(err) } 92 | } 93 | } 94 | 95 | impl std::error::Error for BuildError { 96 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 97 | match self.kind { 98 | BuildErrorKind::Syntax { ref err, .. } => Some(err), 99 | BuildErrorKind::NFA(ref err) => Some(err), 100 | } 101 | } 102 | } 103 | 104 | impl core::fmt::Display for BuildError { 105 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 106 | match self.kind { 107 | BuildErrorKind::Syntax { pid, .. 
} => { 108 | write!(f, "error parsing pattern {}", pid.as_usize()) 109 | } 110 | BuildErrorKind::NFA(_) => write!(f, "error building NFA"), 111 | } 112 | } 113 | } 114 | 115 | /// An error that occurs when a regex engine "gives up" for some reason before 116 | /// finishing a search. Usually this occurs because of heuristic Unicode word 117 | /// boundary support or because of ineffective cache usage in the lazy DFA. 118 | /// 119 | /// When this error occurs, callers should retry the regex search with a 120 | /// different regex engine. 121 | /// 122 | /// Note that this has convenient `From` impls that will automatically 123 | /// convert a `MatchError` into this error. This works because the meta 124 | /// regex engine internals guarantee that errors like `HaystackTooLong` and 125 | /// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and 126 | /// `GaveUp`, which both correspond to this "failure" error. 127 | #[derive(Debug)] 128 | pub(crate) struct RetryFailError { 129 | offset: usize, 130 | } 131 | 132 | impl RetryFailError { 133 | pub(crate) fn from_offset(offset: usize) -> RetryFailError { 134 | RetryFailError { offset } 135 | } 136 | } 137 | 138 | impl std::error::Error for RetryFailError {} 139 | 140 | impl core::fmt::Display for RetryFailError { 141 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 142 | write!(f, "regex engine failed at offset {:?}", self.offset) 143 | } 144 | } 145 | 146 | impl From for RetryFailError { 147 | fn from(merr: MatchError) -> RetryFailError { 148 | use MatchErrorKind::*; 149 | 150 | match *merr.kind() { 151 | Quit { offset, .. } => RetryFailError::from_offset(offset), 152 | GaveUp { offset } => RetryFailError::from_offset(offset), 153 | // These can never occur because we avoid them by construction 154 | // or with higher level control flow logic. 
For example, the 155 | // backtracker's wrapper will never hand out a backtracker engine 156 | // when the haystack would be too long. 157 | _ => { 158 | unreachable!("found impossible error in meta engine: {}", merr) 159 | } 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/engines/meta/literal.rs: -------------------------------------------------------------------------------- 1 | use std::{vec, vec::Vec}; 2 | 3 | use log::debug; 4 | use regex_automata::MatchKind; 5 | use regex_syntax::hir::Hir; 6 | 7 | use crate::engines::meta::regex::RegexInfo; 8 | 9 | /// Pull out an alternation of literals from the given sequence of HIR 10 | /// expressions. 11 | /// 12 | /// There are numerous ways for this to fail. Generally, this only applies 13 | /// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there 14 | /// are "too few" alternates, in which case, the regex engine is likely faster. 15 | /// 16 | /// And currently, this only returns something when 'hirs.len() == 1'. 17 | pub(crate) fn alternation_literals(info: &RegexInfo, hirs: &[&Hir]) -> Option>> { 18 | use regex_syntax::hir::{HirKind, Literal}; 19 | 20 | // Might as well skip the work below if we know we can't build an 21 | // Aho-Corasick searcher. 22 | if !cfg!(feature = "perf-literal-multisubstring") { 23 | return None; 24 | } 25 | // This is pretty hacky, but basically, if `is_alternation_literal` is 26 | // true, then we can make several assumptions about the structure of our 27 | // HIR. This is what justifies the `unreachable!` statements below. 
28 | if hirs.len() != 1 29 | || !info.props()[0].look_set().is_empty() 30 | || info.props()[0].explicit_captures_len() > 0 31 | || !info.props()[0].is_alternation_literal() 32 | || info.config().get_match_kind() != MatchKind::LeftmostFirst 33 | { 34 | return None; 35 | } 36 | let hir = &hirs[0]; 37 | let alts = match *hir.kind() { 38 | HirKind::Alternation(ref alts) => alts, 39 | _ => return None, // one literal isn't worth it 40 | }; 41 | 42 | let mut lits = vec![]; 43 | for alt in alts { 44 | let mut lit = vec![]; 45 | match *alt.kind() { 46 | HirKind::Literal(Literal(ref bytes)) => lit.extend_from_slice(bytes), 47 | HirKind::Concat(ref exprs) => { 48 | for e in exprs { 49 | match *e.kind() { 50 | HirKind::Literal(Literal(ref bytes)) => { 51 | lit.extend_from_slice(bytes); 52 | } 53 | _ => unreachable!("expected literal, got {:?}", e), 54 | } 55 | } 56 | } 57 | _ => unreachable!("expected literal or concat, got {:?}", alt), 58 | } 59 | lits.push(lit); 60 | } 61 | // Why do this? Well, when the number of literals is small, it's likely 62 | // that we'll use the lazy DFA which is in turn likely to be faster than 63 | // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have 64 | // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use 65 | // the latter because it is so hungry (in time and space), and the former 66 | // is decently fast, but not as fast as a well oiled lazy DFA. 67 | // 68 | // However, once the number starts getting large, the lazy DFA is likely 69 | // to start thrashing because of the modest default cache size. When 70 | // exactly does this happen? Dunno. But at whatever point that is (we make 71 | // a guess below based on ad hoc benchmarking), we'll want to cut over to 72 | // Aho-Corasick, where even the contiguous NFA is likely to do much better. 
73 | if lits.len() < 3000 { 74 | debug!("skipping Aho-Corasick because there are too few literals"); 75 | return None; 76 | } 77 | Some(lits) 78 | } 79 | -------------------------------------------------------------------------------- /src/engines/meta/mod.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Provides a regex matcher that composes several other regex matchers 3 | automatically. 4 | 5 | This module is home to a meta [`Regex`], which provides a convenient high 6 | level API for executing regular expressions in linear time. 7 | 8 | # Comparison with the `regex` crate 9 | 10 | A meta `Regex` is the implementation used directly by the `regex` crate. 11 | Indeed, the `regex` crate API is essentially just a light wrapper over a meta 12 | `Regex`. This means that if you need the full flexibility offered by this 13 | API, then you should be able to switch to using this API directly without 14 | any changes in match semantics or syntax. However, there are some API level 15 | differences: 16 | 17 | * The `regex` crate API returns match objects that include references to the 18 | haystack itself, which in turn makes it easy to access the matching strings 19 | without having to slice the haystack yourself. In contrast, a meta `Regex` 20 | returns match objects that only have offsets in them. 21 | * At time of writing, a meta `Regex` doesn't have some of the convenience 22 | routines that the `regex` crate has, such as replacements. Note though that 23 | [`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string) 24 | will handle the replacement string interpolation for you. 25 | * A meta `Regex` supports the [`Input`](crate::Input) abstraction, which 26 | provides a way to configure a search in more ways than is supported by the 27 | `regex` crate. 
For example, [`Input::anchored`](crate::Input::anchored) can
be used to run an anchored search, regardless of whether the pattern is itself
anchored with a `^`.
* A meta `Regex` supports multi-pattern searching everywhere.
Indeed, every [`Match`](crate::Match) returned by the search APIs
include a [`PatternID`](crate::PatternID) indicating which pattern
matched. In the single pattern case, all matches correspond to
[`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate
has distinct `Regex` and a `RegexSet` APIs. The former only supports a single
pattern, while the latter supports multiple patterns but cannot report the
offsets of a match.
* A meta `Regex` provides the explicit capability of bypassing its internal
memory pool for automatically acquiring mutable scratch space required by its
internal regex engines. Namely, a [`Cache`] can be explicitly provided to lower
level routines such as [`Regex::search_with`].

*/

pub use self::regex::{Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split, SplitN};
pub use regex_automata::meta::BuildError;

mod error;
// mod limited;
mod literal;
mod regex;
// mod reverse_inner;
// mod stopat;
mod strategy;
mod wrappers;
-------------------------------------------------------------------------------- /src/engines/meta/wrappers.rs: --------------------------------------------------------------------------------
/*!
This module contains a boat load of wrappers around each of our internal regex
engines. They encapsulate a few things:

1. The wrappers manage the conditional existence of the regex engine. Namely,
the PikeVM is the only required regex engine. The rest are optional. These
wrappers present a uniform API regardless of which engines are available. And
availability might be determined by compile time features or by dynamic
configuration via `meta::Config`. Encapsulating the conditional compilation
features is in particular a huge simplification for the higher level code that
composes these engines.
2. The wrappers manage construction of each engine, including skipping it if
the engine is unavailable or configured to not be used.
3. The wrappers manage whether an engine *can* be used for a particular
search configuration. For example, `BoundedBacktracker::get` only returns a
backtracking engine when the haystack is bigger than the maximum supported
length. The wrappers also sometimes take a position on when an engine *ought*
to be used, but only in cases where the logic is extremely local to the engine
itself. Otherwise, things like "choose between the backtracker and the one-pass
DFA" are managed by the higher level meta strategy code.

There are also corresponding wrappers for the various `Cache` types for each
regex engine that needs them. If an engine is unavailable or not used, then a
cache for it will *not* actually be allocated.
25 | */ 26 | 27 | use log::debug; 28 | use regex_automata::nfa::thompson::NFA; 29 | use regex_automata::util::prefilter::Prefilter; 30 | use regex_automata::util::primitives::NonMaxUsize; 31 | use regex_automata::{dfa, hybrid, HalfMatch, Match, MatchKind, PatternID}; 32 | 33 | use crate::cursor::Cursor; 34 | use crate::engines::meta::error::{BuildError, RetryFailError}; 35 | use crate::engines::meta::regex::RegexInfo; 36 | use crate::engines::pikevm; 37 | use crate::Input; 38 | 39 | #[derive(Debug)] 40 | pub(crate) struct PikeVM(PikeVMEngine); 41 | 42 | impl PikeVM { 43 | pub(crate) fn new( 44 | info: &RegexInfo, 45 | pre: Option, 46 | nfa: &NFA, 47 | ) -> Result { 48 | PikeVMEngine::new(info, pre, nfa).map(PikeVM) 49 | } 50 | 51 | pub(crate) fn create_cache(&self) -> PikeVMCache { 52 | PikeVMCache::new(self) 53 | } 54 | 55 | #[cfg_attr(feature = "perf-inline", inline(always))] 56 | pub(crate) fn get(&self) -> &PikeVMEngine { 57 | &self.0 58 | } 59 | } 60 | 61 | #[derive(Debug)] 62 | pub(crate) struct PikeVMEngine(pikevm::PikeVM); 63 | 64 | impl PikeVMEngine { 65 | pub(crate) fn new( 66 | info: &RegexInfo, 67 | pre: Option, 68 | nfa: &NFA, 69 | ) -> Result { 70 | let pikevm_config = 71 | pikevm::Config::new().match_kind(info.config().get_match_kind()).prefilter(pre); 72 | let engine = pikevm::Builder::new() 73 | .configure(pikevm_config) 74 | .build_from_nfa(nfa.clone()) 75 | .map_err(BuildError::nfa)?; 76 | debug!("PikeVM built"); 77 | Ok(PikeVMEngine(engine)) 78 | } 79 | 80 | #[cfg_attr(feature = "perf-inline", inline(always))] 81 | pub(crate) fn is_match(&self, cache: &mut PikeVMCache, input: &mut Input) -> bool { 82 | crate::engines::pikevm::is_match(&self.0, cache.0.as_mut().unwrap(), input) 83 | } 84 | 85 | #[cfg_attr(feature = "perf-inline", inline(always))] 86 | pub(crate) fn search_slots( 87 | &self, 88 | cache: &mut PikeVMCache, 89 | input: &mut Input, 90 | slots: &mut [Option], 91 | ) -> Option { 92 | crate::engines::pikevm::search_slots(&self.0, 
cache.0.as_mut().unwrap(), input, slots) 93 | } 94 | 95 | // #[cfg_attr(feature = "perf-inline", inline(always))] 96 | // pub(crate) fn which_overlapping_matches( 97 | // &self, 98 | // cache: &mut PikeVMCache, 99 | // input: &mut Input, 100 | // patset: &mut PatternSet, 101 | // ) { 102 | // self.0.which_overlapping_matches(cache.0.as_mut().unwrap(), input, patset) 103 | // } 104 | } 105 | 106 | #[derive(Clone, Debug)] 107 | pub(crate) struct PikeVMCache(Option); 108 | 109 | impl PikeVMCache { 110 | pub(crate) fn none() -> PikeVMCache { 111 | PikeVMCache(None) 112 | } 113 | 114 | pub(crate) fn new(builder: &PikeVM) -> PikeVMCache { 115 | PikeVMCache(Some(pikevm::Cache::new(&builder.get().0))) 116 | } 117 | 118 | pub(crate) fn reset(&mut self, builder: &PikeVM) { 119 | self.0.as_mut().unwrap().reset(&builder.get().0); 120 | } 121 | 122 | pub(crate) fn memory_usage(&self) -> usize { 123 | self.0.as_ref().map_or(0, |c| c.memory_usage()) 124 | } 125 | } 126 | 127 | #[derive(Debug)] 128 | pub(crate) struct Hybrid(Option); 129 | 130 | impl Hybrid { 131 | pub(crate) fn none() -> Hybrid { 132 | Hybrid(None) 133 | } 134 | 135 | pub(crate) fn new(info: &RegexInfo, pre: Option, nfa: &NFA, nfarev: &NFA) -> Hybrid { 136 | Hybrid(HybridEngine::new(info, pre, nfa, nfarev)) 137 | } 138 | 139 | pub(crate) fn create_cache(&self) -> HybridCache { 140 | HybridCache::new(self) 141 | } 142 | 143 | #[cfg_attr(feature = "perf-inline", inline(always))] 144 | pub(crate) fn get(&self, _input: &mut Input) -> Option<&HybridEngine> { 145 | let engine = self.0.as_ref()?; 146 | Some(engine) 147 | } 148 | 149 | pub(crate) fn is_some(&self) -> bool { 150 | self.0.is_some() 151 | } 152 | } 153 | 154 | #[derive(Debug)] 155 | pub(crate) struct HybridEngine(hybrid::regex::Regex); 156 | 157 | impl HybridEngine { 158 | pub(crate) fn new( 159 | info: &RegexInfo, 160 | pre: Option, 161 | nfa: &NFA, 162 | nfarev: &NFA, 163 | ) -> Option { 164 | { 165 | if !info.config().get_hybrid() { 166 | return None; 
167 | } 168 | let dfa_config = hybrid::dfa::Config::new() 169 | .match_kind(info.config().get_match_kind()) 170 | .prefilter(pre.clone()) 171 | // Enabling this is necessary for ensuring we can service any 172 | // kind of 'Input' search without error. For the lazy DFA, 173 | // this is not particularly costly, since the start states are 174 | // generated lazily. 175 | .starts_for_each_pattern(true) 176 | .byte_classes(info.config().get_byte_classes()) 177 | .unicode_word_boundary(true) 178 | .specialize_start_states(pre.is_some()) 179 | .cache_capacity(info.config().get_hybrid_cache_capacity()) 180 | // This makes it possible for building a lazy DFA to 181 | // fail even though the NFA has already been built. Namely, 182 | // if the cache capacity is too small to fit some minimum 183 | // number of states (which is small, like 4 or 5), then the 184 | // DFA will refuse to build. 185 | // 186 | // We shouldn't enable this to make building always work, since 187 | // this could cause the allocation of a cache bigger than the 188 | // provided capacity amount. 189 | // 190 | // This is effectively the only reason why building a lazy DFA 191 | // could fail. If it does, then we simply suppress the error 192 | // and return None. 193 | .skip_cache_capacity_check(false) 194 | // This and enabling heuristic Unicode word boundary support 195 | // above make it so the lazy DFA can quit at match time. 
196 | .minimum_cache_clear_count(Some(3)) 197 | .minimum_bytes_per_state(Some(10)); 198 | let result = hybrid::dfa::Builder::new() 199 | .configure(dfa_config.clone()) 200 | .build_from_nfa(nfa.clone()); 201 | let fwd = match result { 202 | Ok(fwd) => fwd, 203 | Err(_err) => { 204 | debug!("forward lazy DFA failed to build: {}", _err); 205 | return None; 206 | } 207 | }; 208 | let result = hybrid::dfa::Builder::new() 209 | .configure( 210 | dfa_config 211 | .clone() 212 | .match_kind(MatchKind::All) 213 | .prefilter(None) 214 | .specialize_start_states(false), 215 | ) 216 | .build_from_nfa(nfarev.clone()); 217 | let rev = match result { 218 | Ok(rev) => rev, 219 | Err(_err) => { 220 | debug!("reverse lazy DFA failed to build: {}", _err); 221 | return None; 222 | } 223 | }; 224 | let engine = hybrid::regex::Builder::new().build_from_dfas(fwd, rev); 225 | debug!("lazy DFA built"); 226 | Some(HybridEngine(engine)) 227 | } 228 | } 229 | 230 | #[cfg_attr(feature = "perf-inline", inline(always))] 231 | pub(crate) fn try_search( 232 | &self, 233 | cache: &mut HybridCache, 234 | input: &mut Input, 235 | ) -> Result, RetryFailError> { 236 | let cache = cache.0.as_mut().unwrap(); 237 | crate::engines::hybrid::try_search(&self.0, cache, input).map_err(|e| e.into()) 238 | } 239 | 240 | #[cfg_attr(feature = "perf-inline", inline(always))] 241 | pub(crate) fn try_search_half_fwd( 242 | &self, 243 | cache: &mut HybridCache, 244 | input: &mut Input, 245 | ) -> Result, RetryFailError> { 246 | let fwd = self.0.forward(); 247 | let fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0; 248 | crate::engines::hybrid::try_search_fwd(fwd, fwdcache, input).map_err(|e| e.into()) 249 | } 250 | 251 | // #[cfg_attr(feature = "perf-inline", inline(always))] 252 | // pub(crate) fn try_search_half_fwd_stopat( 253 | // &self, 254 | // cache: &mut HybridCache, 255 | // input: &mut Input, 256 | // ) -> Result, RetryFailError> { 257 | // let dfa = self.0.forward(); 258 | // let mut cache = 
cache.0.as_mut().unwrap().as_parts_mut().0; 259 | // crate::meta::stopat::hybrid_try_search_half_fwd(dfa, &mut cache, input) 260 | // } 261 | 262 | #[cfg_attr(feature = "perf-inline", inline(always))] 263 | pub(crate) fn try_search_half_rev( 264 | &self, 265 | cache: &mut HybridCache, 266 | input: &mut Input, 267 | ) -> Result, RetryFailError> { 268 | let rev = self.0.reverse(); 269 | let revcache = cache.0.as_mut().unwrap().as_parts_mut().1; 270 | crate::engines::hybrid::try_search_rev(rev, revcache, input).map_err(|e| e.into()) 271 | } 272 | 273 | // #[cfg_attr(feature = "perf-inline", inline(always))] 274 | // pub(crate) fn try_search_half_rev_limited( 275 | // &self, 276 | // cache: &mut HybridCache, 277 | // input: &mut Input, 278 | // min_start: usize, 279 | // ) -> Result, RetryError> { 280 | // let dfa = self.0.reverse(); 281 | // let mut cache = cache.0.as_mut().unwrap().as_parts_mut().1; 282 | // crate::meta::limited::hybrid_try_search_half_rev(dfa, &mut cache, input, min_start) 283 | // } 284 | 285 | // #[inline] 286 | // pub(crate) fn try_which_overlapping_matches( 287 | // &self, 288 | // cache: &mut HybridCache, 289 | // input: &mut Input, 290 | // patset: &mut PatternSet, 291 | // ) -> Result<(), RetryFailError> { 292 | // let fwd = self.0.forward(); 293 | // let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0; 294 | // fwd.try_which_overlapping_matches(&mut fwdcache, input, patset).map_err(|e| e.into()) 295 | // } 296 | } 297 | 298 | #[derive(Clone, Debug)] 299 | pub(crate) struct HybridCache(Option); 300 | 301 | impl HybridCache { 302 | pub(crate) fn none() -> HybridCache { 303 | HybridCache(None) 304 | } 305 | 306 | pub(crate) fn new(builder: &Hybrid) -> HybridCache { 307 | HybridCache(builder.0.as_ref().map(|e| e.0.create_cache())) 308 | } 309 | 310 | pub(crate) fn reset(&mut self, builder: &Hybrid) { 311 | if let Some(ref e) = builder.0 { 312 | self.0.as_mut().unwrap().reset(&e.0); 313 | } 314 | } 315 | 316 | pub(crate) fn 
memory_usage(&self) -> usize { 317 | { 318 | self.0.as_ref().map_or(0, |c| c.memory_usage()) 319 | } 320 | } 321 | } 322 | 323 | #[derive(Debug)] 324 | pub(crate) struct DFA(Option); 325 | 326 | impl DFA { 327 | pub(crate) fn none() -> DFA { 328 | DFA(None) 329 | } 330 | 331 | pub(crate) fn new(info: &RegexInfo, pre: Option, nfa: &NFA, nfarev: &NFA) -> DFA { 332 | DFA(DFAEngine::new(info, pre, nfa, nfarev)) 333 | } 334 | 335 | #[cfg_attr(feature = "perf-inline", inline(always))] 336 | pub(crate) fn get(&self, _input: &mut Input) -> Option<&DFAEngine> { 337 | let engine = self.0.as_ref()?; 338 | Some(engine) 339 | } 340 | 341 | pub(crate) fn is_some(&self) -> bool { 342 | self.0.is_some() 343 | } 344 | 345 | pub(crate) fn memory_usage(&self) -> usize { 346 | self.0.as_ref().map_or(0, |e| e.memory_usage()) 347 | } 348 | } 349 | 350 | #[derive(Debug)] 351 | pub(crate) struct DFAEngine(dfa::regex::Regex); 352 | 353 | impl DFAEngine { 354 | pub(crate) fn new( 355 | info: &RegexInfo, 356 | pre: Option, 357 | nfa: &NFA, 358 | nfarev: &NFA, 359 | ) -> Option { 360 | { 361 | if !info.config().get_dfa() { 362 | return None; 363 | } 364 | // If our NFA is anything but small, don't even bother with a DFA. 365 | if let Some(state_limit) = info.config().get_dfa_state_limit() { 366 | if nfa.states().len() > state_limit { 367 | debug!( 368 | "skipping full DFA because NFA has {} states, \ 369 | which exceeds the heuristic limit of {}", 370 | nfa.states().len(), 371 | state_limit, 372 | ); 373 | return None; 374 | } 375 | } 376 | // We cut the size limit in four because the total heap used by 377 | // DFA construction is determinization aux memory and the DFA 378 | // itself, and those things are configured independently in the 379 | // lower level DFA builder API. And then split that in two because 380 | // of forward and reverse DFAs. 
381 | let size_limit = info.config().get_dfa_size_limit().map(|n| n / 4); 382 | let dfa_config = dfa::dense::Config::new() 383 | .match_kind(info.config().get_match_kind()) 384 | .prefilter(pre.clone()) 385 | // Enabling this is necessary for ensuring we can service any 386 | // kind of 'Input' search without error. For the full DFA, this 387 | // can be quite costly. But since we have such a small bound 388 | // on the size of the DFA, in practice, any multl-regexes are 389 | // probably going to blow the limit anyway. 390 | .starts_for_each_pattern(true) 391 | .byte_classes(info.config().get_byte_classes()) 392 | .unicode_word_boundary(true) 393 | .specialize_start_states(pre.is_some()) 394 | .determinize_size_limit(size_limit) 395 | .dfa_size_limit(size_limit); 396 | let result = 397 | dfa::dense::Builder::new().configure(dfa_config.clone()).build_from_nfa(nfa); 398 | let fwd = match result { 399 | Ok(fwd) => fwd, 400 | Err(_err) => { 401 | debug!("forward full DFA failed to build: {}", _err); 402 | return None; 403 | } 404 | }; 405 | let result = dfa::dense::Builder::new() 406 | .configure( 407 | dfa_config 408 | .clone() 409 | // We never need unanchored reverse searches, so 410 | // there's no point in building it into the DFA, which 411 | // WILL take more space. (This isn't done for the lazy 412 | // DFA because the DFA is, well, lazy. It doesn't pay 413 | // the cost for supporting unanchored searches unless 414 | // you actually do an unanchored search, which we 415 | // don't.) 
416 | .start_kind(dfa::StartKind::Anchored) 417 | .match_kind(MatchKind::All) 418 | .prefilter(None) 419 | .specialize_start_states(false), 420 | ) 421 | .build_from_nfa(nfarev); 422 | let rev = match result { 423 | Ok(rev) => rev, 424 | Err(_err) => { 425 | debug!("reverse full DFA failed to build: {}", _err); 426 | return None; 427 | } 428 | }; 429 | let engine = dfa::regex::Builder::new().build_from_dfas(fwd, rev); 430 | debug!( 431 | "fully compiled forward and reverse DFAs built, {} bytes", 432 | engine.forward().memory_usage() + engine.reverse().memory_usage(), 433 | ); 434 | Some(DFAEngine(engine)) 435 | } 436 | } 437 | 438 | #[cfg_attr(feature = "perf-inline", inline(always))] 439 | pub(crate) fn try_search( 440 | &self, 441 | input: &mut Input, 442 | ) -> Result, RetryFailError> { 443 | crate::engines::dfa::try_search(&self.0, input).map_err(|err| err.into()) 444 | } 445 | 446 | #[cfg_attr(feature = "perf-inline", inline(always))] 447 | pub(crate) fn try_search_half_fwd( 448 | &self, 449 | input: &mut Input, 450 | ) -> Result, RetryFailError> { 451 | crate::engines::dfa::try_search_fwd(self.0.forward(), input).map_err(|e| e.into()) 452 | } 453 | 454 | // #[cfg_attr(feature = "perf-inline", inline(always))] 455 | // pub(crate) fn try_search_half_fwd_stopat( 456 | // &self, 457 | // input: &mut Input, 458 | // ) -> Result, RetryFailError> { 459 | // let dfa = self.0.forward(); 460 | // crate::meta::stopat::dfa_try_search_half_fwd(dfa, input) 461 | // } 462 | 463 | #[cfg_attr(feature = "perf-inline", inline(always))] 464 | pub(crate) fn try_search_half_rev( 465 | &self, 466 | input: &mut Input, 467 | ) -> Result, RetryFailError> { 468 | crate::engines::dfa::try_search_rev(self.0.reverse(), input).map_err(|e| e.into()) 469 | } 470 | 471 | // #[cfg_attr(feature = "perf-inline", inline(always))] 472 | // pub(crate) fn try_search_half_rev_limited( 473 | // &self, 474 | // input: &mut Input, 475 | // min_start: usize, 476 | // ) -> Result, RetryError> { 477 | // 
let dfa = self.0.reverse(); 478 | // crate::meta::limited::dfa_try_search_half_rev(dfa, input, min_start) 479 | // } 480 | 481 | // #[inline] 482 | // pub(crate) fn try_which_overlapping_matches( 483 | // &self, 484 | // input: &mut Input, 485 | // patset: &mut PatternSet, 486 | // ) -> Result<(), RetryFailError> { 487 | // use crate::dfa::Automaton; 488 | // self.0.forward().try_which_overlapping_matches(input, patset).map_err(|e| e.into()) 489 | // } 490 | 491 | pub(crate) fn memory_usage(&self) -> usize { 492 | self.0.forward().memory_usage() + self.0.reverse().memory_usage() 493 | } 494 | } 495 | 496 | // #[derive(Debug)] 497 | // pub(crate) struct ReverseHybrid(Option); 498 | 499 | // impl ReverseHybrid { 500 | // pub(crate) fn none() -> ReverseHybrid { 501 | // ReverseHybrid(None) 502 | // } 503 | 504 | // pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseHybrid { 505 | // ReverseHybrid(ReverseHybridEngine::new(info, nfarev)) 506 | // } 507 | 508 | // pub(crate) fn create_cache(&self) -> ReverseHybridCache { 509 | // ReverseHybridCache::new(self) 510 | // } 511 | 512 | // #[cfg_attr(feature = "perf-inline", inline(always))] 513 | // pub(crate) fn get(&self, _input: &mut Input) -> Option<&ReverseHybridEngine> { 514 | // let engine = self.0.as_ref()?; 515 | // Some(engine) 516 | // } 517 | // } 518 | 519 | // #[derive(Debug)] 520 | // pub(crate) struct ReverseHybridEngine(hybrid::dfa::DFA); 521 | 522 | // impl ReverseHybridEngine { 523 | // pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> Option { 524 | // if !info.config().get_hybrid() { 525 | // return None; 526 | // } 527 | // // Since we only use this for reverse searches, we can hard-code 528 | // // a number of things like match semantics, prefilters, starts 529 | // // for each pattern and so on. 
530 | // let dfa_config = hybrid::dfa::Config::new() 531 | // .match_kind(MatchKind::All) 532 | // .prefilter(None) 533 | // .starts_for_each_pattern(false) 534 | // .byte_classes(info.config().get_byte_classes()) 535 | // .unicode_word_boundary(true) 536 | // .specialize_start_states(false) 537 | // .cache_capacity(info.config().get_hybrid_cache_capacity()) 538 | // .skip_cache_capacity_check(false) 539 | // .minimum_cache_clear_count(Some(3)) 540 | // .minimum_bytes_per_state(Some(10)); 541 | // let result = 542 | // hybrid::dfa::Builder::new().configure(dfa_config).build_from_nfa(nfarev.clone()); 543 | // let rev = match result { 544 | // Ok(rev) => rev, 545 | // Err(_err) => { 546 | // debug!("lazy reverse DFA failed to build: {}", _err); 547 | // return None; 548 | // } 549 | // }; 550 | // debug!("lazy reverse DFA built"); 551 | // Some(ReverseHybridEngine(rev)) 552 | // } 553 | 554 | // #[cfg_attr(feature = "perf-inline", inline(always))] 555 | // pub(crate) fn try_search_half_rev_limited( 556 | // &self, 557 | // cache: &mut ReverseHybridCache, 558 | // input: &mut Input, 559 | // min_start: usize, 560 | // ) -> Result, RetryError> { 561 | // let dfa = &self.0; 562 | // let mut cache = cache.0.as_mut().unwrap(); 563 | // crate::meta::limited::hybrid_try_search_half_rev(dfa, &mut cache, input, min_start) 564 | // } 565 | // } 566 | 567 | // #[derive(Clone, Debug)] 568 | // pub(crate) struct ReverseHybridCache( 569 | // #[cfg(feature = "hybrid")] Option, 570 | // #[cfg(not(feature = "hybrid"))] (), 571 | // ); 572 | 573 | // impl ReverseHybridCache { 574 | // pub(crate) fn none() -> ReverseHybridCache { 575 | // #[cfg(feature = "hybrid")] 576 | // { 577 | // ReverseHybridCache(None) 578 | // } 579 | // #[cfg(not(feature = "hybrid"))] 580 | // { 581 | // ReverseHybridCache(()) 582 | // } 583 | // } 584 | 585 | // pub(crate) fn new(builder: &ReverseHybrid) -> ReverseHybridCache { 586 | // #[cfg(feature = "hybrid")] 587 | // { 588 | // 
ReverseHybridCache(builder.0.as_ref().map(|e| e.0.create_cache())) 589 | // } 590 | // #[cfg(not(feature = "hybrid"))] 591 | // { 592 | // ReverseHybridCache(()) 593 | // } 594 | // } 595 | 596 | // pub(crate) fn reset(&mut self, builder: &ReverseHybrid) { 597 | // #[cfg(feature = "hybrid")] 598 | // if let Some(ref e) = builder.0 { 599 | // self.0.as_mut().unwrap().reset(&e.0); 600 | // } 601 | // } 602 | 603 | // pub(crate) fn memory_usage(&self) -> usize { 604 | // #[cfg(feature = "hybrid")] 605 | // { 606 | // self.0.as_ref().map_or(0, |c| c.memory_usage()) 607 | // } 608 | // #[cfg(not(feature = "hybrid"))] 609 | // { 610 | // 0 611 | // } 612 | // } 613 | // } 614 | 615 | // #[derive(Debug)] 616 | // pub(crate) struct ReverseDFA(Option); 617 | 618 | // impl ReverseDFA { 619 | // pub(crate) fn none() -> ReverseDFA { 620 | // ReverseDFA(None) 621 | // } 622 | 623 | // pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseDFA { 624 | // ReverseDFA(ReverseDFAEngine::new(info, nfarev)) 625 | // } 626 | 627 | // #[cfg_attr(feature = "perf-inline", inline(always))] 628 | // pub(crate) fn get(&self, _input: &mut Input) -> Option<&ReverseDFAEngine> { 629 | // let engine = self.0.as_ref()?; 630 | // Some(engine) 631 | // } 632 | 633 | // pub(crate) fn is_some(&self) -> bool { 634 | // self.0.is_some() 635 | // } 636 | 637 | // pub(crate) fn memory_usage(&self) -> usize { 638 | // self.0.as_ref().map_or(0, |e| e.memory_usage()) 639 | // } 640 | // } 641 | 642 | // #[derive(Debug)] 643 | // pub(crate) struct ReverseDFAEngine( 644 | // #[cfg(feature = "dfa-build")] dfa::dense::DFA>, 645 | // #[cfg(not(feature = "dfa-build"))] (), 646 | // ); 647 | 648 | // impl ReverseDFAEngine { 649 | // pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> Option { 650 | // #[cfg(feature = "dfa-build")] 651 | // { 652 | // if !info.config().get_dfa() { 653 | // return None; 654 | // } 655 | // // If our NFA is anything but small, don't even bother with a DFA. 
656 | // if let Some(state_limit) = info.config().get_dfa_state_limit() { 657 | // if nfarev.states().len() > state_limit { 658 | // debug!( 659 | // "skipping full reverse DFA because NFA has {} states, \ 660 | // which exceeds the heuristic limit of {}", 661 | // nfarev.states().len(), 662 | // state_limit, 663 | // ); 664 | // return None; 665 | // } 666 | // } 667 | // // We cut the size limit in two because the total heap used by DFA 668 | // // construction is determinization aux memory and the DFA itself, 669 | // // and those things are configured independently in the lower level 670 | // // DFA builder API. 671 | // let size_limit = info.config().get_dfa_size_limit().map(|n| n / 2); 672 | // // Since we only use this for reverse searches, we can hard-code 673 | // // a number of things like match semantics, prefilters, starts 674 | // // for each pattern and so on. We also disable acceleration since 675 | // // it's incompatible with limited searches (which is the only 676 | // // operation we support for this kind of engine at the moment). 
677 | // let dfa_config = dfa::dense::Config::new() 678 | // .match_kind(MatchKind::All) 679 | // .prefilter(None) 680 | // .accelerate(false) 681 | // .start_kind(dfa::StartKind::Anchored) 682 | // .starts_for_each_pattern(false) 683 | // .byte_classes(info.config().get_byte_classes()) 684 | // .unicode_word_boundary(true) 685 | // .specialize_start_states(false) 686 | // .determinize_size_limit(size_limit) 687 | // .dfa_size_limit(size_limit); 688 | // let result = dfa::dense::Builder::new().configure(dfa_config).build_from_nfa(&nfarev); 689 | // let rev = match result { 690 | // Ok(rev) => rev, 691 | // Err(_err) => { 692 | // debug!("full reverse DFA failed to build: {}", _err); 693 | // return None; 694 | // } 695 | // }; 696 | // debug!("fully compiled reverse DFA built, {} bytes", rev.memory_usage()); 697 | // Some(ReverseDFAEngine(rev)) 698 | // } 699 | // #[cfg(not(feature = "dfa-build"))] 700 | // { 701 | // None 702 | // } 703 | // } 704 | 705 | // #[cfg_attr(feature = "perf-inline", inline(always))] 706 | // pub(crate) fn try_search_half_rev_limited( 707 | // &self, 708 | // input: &mut Input, 709 | // min_start: usize, 710 | // ) -> Result, RetryError> { 711 | // #[cfg(feature = "dfa-build")] 712 | // { 713 | // let dfa = &self.0; 714 | // crate::meta::limited::dfa_try_search_half_rev(dfa, input, min_start) 715 | // } 716 | // #[cfg(not(feature = "dfa-build"))] 717 | // { 718 | // // Impossible to reach because this engine is never constructed 719 | // // if the requisite features aren't enabled. 720 | // unreachable!() 721 | // } 722 | // } 723 | 724 | // pub(crate) fn memory_usage(&self) -> usize { 725 | // #[cfg(feature = "dfa-build")] 726 | // { 727 | // self.0.memory_usage() 728 | // } 729 | // #[cfg(not(feature = "dfa-build"))] 730 | // { 731 | // // Impossible to reach because this engine is never constructed 732 | // // if the requisite features aren't enabled. 
733 | // unreachable!() 734 | // } 735 | // } 736 | // } 737 | -------------------------------------------------------------------------------- /src/engines/pikevm/error.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::util::primitives::{PatternID, StateID}; 2 | use regex_automata::util::{captures, look}; 3 | 4 | /// An error that can occurred during the construction of a thompson NFA. 5 | /// 6 | /// This error does not provide many introspection capabilities. There are 7 | /// generally only two things you can do with it: 8 | /// 9 | /// * Obtain a human readable message via its `std::fmt::Display` impl. 10 | /// * Access an underlying [`regex_syntax::Error`] type from its `source` 11 | /// method via the `std::error::Error` trait. This error only occurs when using 12 | /// convenience routines for building an NFA directly from a pattern string. 13 | /// 14 | /// Otherwise, errors typically occur when a limit has been breeched. For 15 | /// example, if the total heap usage of the compiled NFA exceeds the limit 16 | /// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then 17 | /// building the NFA will fail. 18 | #[derive(Clone, Debug)] 19 | pub struct BuildError { 20 | kind: BuildErrorKind, 21 | } 22 | 23 | /// The kind of error that occurred during the construction of a thompson NFA. 24 | #[derive(Clone, Debug)] 25 | enum BuildErrorKind { 26 | /// An error that occurred while parsing a regular expression. Note that 27 | /// this error may be printed over multiple lines, and is generally 28 | /// intended to be end user readable on its own. 29 | Syntax(regex_syntax::Error), 30 | /// An error that occurs if the capturing groups provided to an NFA builder 31 | /// do not satisfy the documented invariants. For example, things like 32 | /// too many groups, missing groups, having the first (zeroth) group be 33 | /// named or duplicate group names within the same pattern. 
34 | Captures(captures::GroupInfoError), 35 | /// An error that occurs when an NFA contains a Unicode word boundary, but 36 | /// where the crate was compiled without the necessary data for dealing 37 | /// with Unicode word boundaries. 38 | Word(look::UnicodeWordBoundaryError), 39 | /// An error that occurs if too many patterns were given to the NFA 40 | /// compiler. 41 | TooManyPatterns { 42 | /// The number of patterns given, which exceeds the limit. 43 | given: usize, 44 | /// The limit on the number of patterns. 45 | limit: usize, 46 | }, 47 | /// An error that occurs if too states are produced while building an NFA. 48 | TooManyStates { 49 | /// The minimum number of states that are desired, which exceeds the 50 | /// limit. 51 | given: usize, 52 | /// The limit on the number of states. 53 | limit: usize, 54 | }, 55 | /// An error that occurs when NFA compilation exceeds a configured heap 56 | /// limit. 57 | ExceededSizeLimit { 58 | /// The configured limit, in bytes. 59 | limit: usize, 60 | }, 61 | /// An error that occurs when an invalid capture group index is added to 62 | /// the NFA. An "invalid" index can be one that would otherwise overflow 63 | /// a `usize` on the current target. 64 | InvalidCaptureIndex { 65 | /// The invalid index that was given. 66 | index: u32, 67 | }, 68 | /// An error that occurs when one tries to build an NFA simulation (such as 69 | /// the PikeVM) without any capturing groups. 70 | MissingCaptures, 71 | /// An error that occurs when one tries to build a reverse NFA with 72 | /// captures enabled. Currently, this isn't supported, but we probably 73 | /// should support it at some point. 74 | UnsupportedCaptures, 75 | } 76 | 77 | impl BuildError { 78 | /// If this error occurred because the NFA exceeded the configured size 79 | /// limit before being built, then this returns the configured size limit. 
80 | /// 81 | /// The limit returned is what was configured, and corresponds to the 82 | /// maximum amount of heap usage in bytes. 83 | pub fn size_limit(&self) -> Option { 84 | match self.kind { 85 | BuildErrorKind::ExceededSizeLimit { limit } => Some(limit), 86 | _ => None, 87 | } 88 | } 89 | 90 | fn kind(&self) -> &BuildErrorKind { 91 | &self.kind 92 | } 93 | 94 | pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError { 95 | BuildError { kind: BuildErrorKind::Syntax(err) } 96 | } 97 | 98 | pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError { 99 | BuildError { kind: BuildErrorKind::Captures(err) } 100 | } 101 | 102 | pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError { 103 | BuildError { kind: BuildErrorKind::Word(err) } 104 | } 105 | 106 | pub(crate) fn too_many_patterns(given: usize) -> BuildError { 107 | let limit = PatternID::LIMIT; 108 | BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } } 109 | } 110 | 111 | pub(crate) fn too_many_states(given: usize) -> BuildError { 112 | let limit = StateID::LIMIT; 113 | BuildError { kind: BuildErrorKind::TooManyStates { given, limit } } 114 | } 115 | 116 | pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError { 117 | BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } 118 | } 119 | 120 | pub(crate) fn invalid_capture_index(index: u32) -> BuildError { 121 | BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } } 122 | } 123 | 124 | pub(crate) fn missing_captures() -> BuildError { 125 | BuildError { kind: BuildErrorKind::MissingCaptures } 126 | } 127 | 128 | pub(crate) fn unsupported_captures() -> BuildError { 129 | BuildError { kind: BuildErrorKind::UnsupportedCaptures } 130 | } 131 | } 132 | 133 | impl std::error::Error for BuildError { 134 | fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { 135 | match self.kind() { 136 | BuildErrorKind::Syntax(ref err) => Some(err), 137 | BuildErrorKind::Captures(ref err) => 
Some(err), 138 | _ => None, 139 | } 140 | } 141 | } 142 | 143 | impl core::fmt::Display for BuildError { 144 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 145 | match self.kind() { 146 | BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"), 147 | BuildErrorKind::Captures(_) => { 148 | write!(f, "error with capture groups") 149 | } 150 | BuildErrorKind::Word(_) => { 151 | write!(f, "NFA contains Unicode word boundary") 152 | } 153 | BuildErrorKind::TooManyPatterns { given, limit } => write!( 154 | f, 155 | "attempted to compile {} patterns, \ 156 | which exceeds the limit of {}", 157 | given, limit, 158 | ), 159 | BuildErrorKind::TooManyStates { given, limit } => write!( 160 | f, 161 | "attempted to compile {} NFA states, \ 162 | which exceeds the limit of {}", 163 | given, limit, 164 | ), 165 | BuildErrorKind::ExceededSizeLimit { limit } => { 166 | write!(f, "heap usage during NFA compilation exceeded limit of {}", limit,) 167 | } 168 | BuildErrorKind::InvalidCaptureIndex { index } => { 169 | write!(f, "capture group index {} is invalid (too big or discontinuous)", index,) 170 | } 171 | BuildErrorKind::MissingCaptures => write!( 172 | f, 173 | "operation requires the NFA to have capturing groups, \ 174 | but the NFA given contains none", 175 | ), 176 | BuildErrorKind::UnsupportedCaptures => write!( 177 | f, 178 | "currently captures must be disabled when compiling \ 179 | a reverse NFA", 180 | ), 181 | } 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/engines/pikevm/tests.rs: -------------------------------------------------------------------------------- 1 | use std::ops::RangeBounds; 2 | 3 | use proptest::{prop_assert_eq, proptest}; 4 | use regex_automata::nfa::thompson::pikevm::PikeVM; 5 | use regex_automata::nfa::thompson::Config; 6 | use regex_automata::util::escape::DebugHaystack; 7 | use regex_automata::util::syntax::Config as SyntaxConfig; 8 | 9 | use 
crate::engines::pikevm::find_iter; 10 | use crate::input::Input; 11 | use crate::test_rope::SingleByteChunks; 12 | 13 | use super::Cache; 14 | 15 | fn test(needle: &str, haystack: &[u8]) { 16 | test_with_bounds(needle, haystack, ..) 17 | } 18 | 19 | fn test_with_bounds(needle: &str, haystack: &[u8], bounds: impl RangeBounds + Clone) { 20 | for utf8 in [true, false] { 21 | let regex = PikeVM::builder() 22 | .syntax(SyntaxConfig::new().utf8(utf8)) 23 | .thompson(Config::new().utf8(utf8)) 24 | .build(needle) 25 | .unwrap(); 26 | let mut cache1 = regex.create_cache(); 27 | let mut cache2 = Cache::new(®ex); 28 | let input = regex_automata::Input::new(haystack).range(bounds.clone()); 29 | let iter1: Vec<_> = regex.find_iter(&mut cache1, input).collect(); 30 | let input = Input::new(SingleByteChunks::new(haystack)).range(bounds.clone()); 31 | let iter2: Vec<_> = find_iter(®ex, &mut cache2, input).collect(); 32 | assert_eq!(iter1, iter2, "matches of {needle} in {:?}", DebugHaystack(haystack)); 33 | } 34 | } 35 | 36 | #[test] 37 | fn smoke_test() { 38 | let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap(); 39 | let regex = 40 | PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build("vec").unwrap(); 41 | let mut cache = Cache::new(®ex); 42 | let rope = ropey::Rope::from_str(&text); 43 | let matches: Vec<_> = find_iter(®ex, &mut cache, Input::new(&rope)) 44 | .map(|range| rope.byte_slice(range.range())) 45 | .collect(); 46 | println!("found {matches:#?} in syntax.rs"); 47 | assert_eq!(matches.len(), 68); 48 | } 49 | 50 | #[test] 51 | fn any() { 52 | test(".", b" "); 53 | } 54 | 55 | #[test] 56 | fn look_around() { 57 | test("^bar", b"foobar"); 58 | test("foo$", b"foobar"); 59 | test(r"(?m)(?:^|a)+", b"a\naaa\n"); 60 | test_with_bounds(r"\b{end}", "𝛃".as_bytes(), 2..3); 61 | let haystack: String = 62 | (0..5 * 4096).map(|i| format!("foöbar foÖ{0}bar foö{0}bar", " ".repeat(i % 31))).collect(); 63 | let needle = r"\bfoö\b[ ]*\bbar\b"; 64 | 
test(needle, haystack.as_bytes()) 65 | } 66 | 67 | #[test] 68 | fn maybe_empty() { 69 | test(r"x*", b"x"); 70 | test(r"\bx*\b", b"x"); 71 | } 72 | 73 | proptest! { 74 | #[test] 75 | fn matches(haystack: String, needle: String) { 76 | let Ok(regex) = PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build(&needle) else { 77 | return Ok(()) 78 | }; 79 | let mut cache1 = regex.create_cache(); 80 | let mut cache2 = Cache::new(®ex); 81 | let iter1: Vec<_> = regex.find_iter(&mut cache1, &haystack).collect(); 82 | let iter2: Vec<_> = find_iter(®ex, &mut cache2, Input::new(SingleByteChunks::new(haystack.as_bytes()))).collect(); 83 | prop_assert_eq!(iter1, iter2); 84 | } 85 | #[test] 86 | fn matches_word(haystack: String, needle in r"\\b\PC+\\b") { 87 | let Ok(regex) = PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build(&needle) else { 88 | return Ok(()) 89 | }; 90 | let mut cache1 = regex.create_cache(); 91 | let mut cache2 = Cache::new(®ex); 92 | let iter1: Vec<_> = regex.find_iter(&mut cache1, &haystack).collect(); 93 | let iter2: Vec<_> = find_iter(®ex, &mut cache2, Input::new(SingleByteChunks::new(haystack.as_bytes()))).collect(); 94 | prop_assert_eq!(iter1, iter2); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | This crate provides routines for searching **discontiguous strings** for matches 3 | of a regular expression (aka "regex"). It is based on regex-automata and 4 | most of the code is adapted from the various crates in the 5 | [regex](https://github.com/rust-lang/regex) repository. 6 | 7 | It is intended as a prototype for upstream support for "streaming regex". The 8 | cursor based API in this crate is very similar to the API already exposed by 9 | `regex`/`regex-automata`. To that end a generic `Cursor` trait is provided that 10 | collections can implement. 
11 | 12 | A sketch of the cursor API is shown below. The string is yielded in multiple 13 | byte chunks. Calling advance moves the cursor to the next chunk. Calling 14 | backtrack moves the cursor a chunk back. Backtracking is required by this 15 | crate. That makes it unsuitable for searching fully unbuffered streams like 16 | bytes send over a TCP connection. 17 | 18 | ```rust_ignore 19 | pub trait Cursor { 20 | fn chunk(&self) -> &[u8] { .. } 21 | fn advance(&mut self) -> bool { .. } 22 | fn bracktrack(&mut self) -> bool { .. } 23 | } 24 | ``` 25 | 26 | Working on this crate showed me that regex backtracks a lot more than expected 27 | with most functionality fundamentally requiring backtracking. For network 28 | usecases that do not buffer their input the primary usecase would likely be 29 | detecting a match (without necessarily requiring the matched byte range). 30 | Such usecases can be covered by manually feeding bytes into the hybrid and DFA 31 | engines from the regex-automata crate. This approach also has the advantage 32 | of allowing the caller to pause the match (async) while waiting for more data 33 | allowing the caller to drive the search instead of the engine itself. 34 | 35 | The only part of this crate that could be applied to the fully streaming case is 36 | the streaming PikeVM implementation. However, there are some limitations: 37 | * only a single search can be run since the PikeVM may look ahead multiple bytes 38 | to disambiguate alternative matches 39 | * Prefilters longer than one byte can not work 40 | * utf-8 mode can not be supported (empty matches may occur between unicode 41 | boundaries) 42 | 43 | Currently, the PikeVM implementation is not written with this use case in mind 44 | and may call backtrack unnecessarily, but that could be addressed in the future, 45 | but especially the first point is very limiting. 
The pikevm also does not allow 46 | the user to drive the search and would block on network calls for example (no 47 | async). 48 | */ 49 | 50 | #[cfg(feature = "ropey")] 51 | pub use cursor::RopeyCursor; 52 | pub use cursor::{Cursor, IntoCursor}; 53 | pub use input::Input; 54 | pub use regex_automata; 55 | 56 | mod cursor; 57 | pub mod engines; 58 | mod input; 59 | mod literal; 60 | mod util; 61 | 62 | #[cfg(test)] 63 | mod test_rope; 64 | #[cfg(test)] 65 | mod tests; 66 | -------------------------------------------------------------------------------- /src/literal.rs: -------------------------------------------------------------------------------- 1 | pub use regex_automata::util::prefilter::Prefilter; 2 | pub use regex_automata::MatchKind; 3 | use regex_automata::Span; 4 | 5 | use crate::cursor::Cursor; 6 | use crate::Input; 7 | 8 | use FindChunkResult::*; 9 | 10 | #[cfg(test)] 11 | mod tests; 12 | 13 | pub fn find(prefilter: &Prefilter, input: &mut Input) -> Option { 14 | // TODO optimize this: 15 | // * potentially use an array vec 16 | // * specical case max_needle_len==2 (no accumulating necessary) 17 | // * specical case max_needle_len==min_needle_len (no ambiguety) 18 | if prefilter.max_needle_len() == 1 { 19 | find_1(prefilter, input) 20 | } else { 21 | find_n::(prefilter, input) 22 | } 23 | } 24 | 25 | pub fn prefix(prefilter: &Prefilter, input: &mut Input) -> Option { 26 | let mut offset = input.chunk_offset(); 27 | let chunk_pos = input.chunk_pos(); 28 | let chunk_end = input.get_chunk_end(); 29 | let mut res = if prefilter.max_needle_len() <= chunk_end - chunk_pos { 30 | prefilter 31 | .prefix(input.chunk(), Span { start: input.chunk_pos(), end: input.get_chunk_end() })? 
32 | } else { 33 | offset += chunk_pos; 34 | let mut buf = 35 | Vec::with_capacity(prefilter.max_needle_len().min(input.end() - input.start())); 36 | buf.extend_from_slice(&input.chunk()[chunk_pos..chunk_end]); 37 | while input.advance() && !buf.spare_capacity_mut().is_empty() { 38 | let mut chunk_len = input.chunk().len().min(buf.spare_capacity_mut().len()); 39 | if input.chunk_offset() + chunk_len <= input.end() { 40 | buf.extend_from_slice(&input.chunk()[..chunk_len]); 41 | } else { 42 | chunk_len = input.end() - input.chunk_offset(); 43 | buf.extend_from_slice(&input.chunk()[..chunk_len]); 44 | break; 45 | } 46 | } 47 | prefilter.prefix(&buf, Span { start: 0, end: buf.len() })? 48 | }; 49 | res.start += offset; 50 | res.end += offset; 51 | Some(res) 52 | } 53 | 54 | fn find_1(prefilter: &Prefilter, input: &mut Input) -> Option { 55 | debug_assert_eq!(prefilter.max_needle_len(), 1); 56 | let first_haystack = &input.chunk(); 57 | if let Some(mut res) = prefilter 58 | .find(first_haystack, Span { start: input.chunk_pos(), end: input.get_chunk_end() }) 59 | { 60 | res.start += input.chunk_offset(); 61 | res.end += input.chunk_offset(); 62 | return Some(res); 63 | } 64 | while input.chunk_offset() + input.chunk().len() < input.end() && input.advance() { 65 | let haystack = &input.chunk(); 66 | let Some(mut res) = prefilter.find(haystack, Span { start: 0, end: input.get_chunk_end() }) 67 | else { 68 | continue; 69 | }; 70 | 71 | res.start += input.chunk_offset(); 72 | res.end += input.chunk_offset(); 73 | return Some(res); 74 | } 75 | None 76 | } 77 | 78 | fn find_n( 79 | prefilter: &Prefilter, 80 | input: &mut Input, 81 | ) -> Option { 82 | // helper macro to make the code more readable 83 | macro_rules! 
find_chunk { 84 | ($chunk:expr, $buf_offset:expr, |$start: ident, $off: ident| $disambiguate: expr) => { 85 | match find_n_chunk::(prefilter, $chunk, $buf_offset) { 86 | FindChunkResult::Match(span) => return Some(span), 87 | FindChunkResult::AbigousMatch { $start, $off } if AMBIGUITY => { 88 | return Some($disambiguate); 89 | } 90 | _ => {} 91 | } 92 | }; 93 | } 94 | 95 | // simple case: only search in a single chunk specical casing this is nice 96 | // for performance and makes the rest of the logic simpler 97 | let first_chunk_end = input.get_chunk_end(); 98 | let mut first_chunk = input.chunk(); 99 | if first_chunk.len() != first_chunk_end { 100 | if let Some(mut res) = 101 | prefilter.find(first_chunk, Span { start: input.chunk_pos(), end: first_chunk_end }) 102 | { 103 | res.start += input.chunk_offset(); 104 | res.end += input.chunk_offset(); 105 | return Some(res); 106 | } 107 | return None; 108 | } 109 | first_chunk = &first_chunk[input.chunk_pos()..]; 110 | 111 | let max_needle_len = prefilter.max_needle_len(); 112 | let carry_over = max_needle_len - 1; 113 | let sliding_window = 2 * carry_over; 114 | 115 | // again special case the first chunk since that is the hot path 116 | // and also keeps the logic below simpler 117 | let mut buf_offset = input.chunk_offset() + input.chunk_pos(); 118 | if first_chunk.len() >= sliding_window { 119 | find_chunk!(first_chunk, input.chunk_offset() + input.chunk_pos(), |start, off| { 120 | let mut buf = Vec::with_capacity(max_needle_len); 121 | buf.extend_from_slice(&first_chunk[start..]); 122 | disambiguate_match(prefilter, input, buf, off) 123 | }); 124 | let carrry_over_start = first_chunk.len() - carry_over; 125 | first_chunk = &first_chunk[carrry_over_start..]; 126 | buf_offset += carrry_over_start; 127 | } 128 | let mut buf = Vec::with_capacity(2 * sliding_window); 129 | buf.extend_from_slice(first_chunk); 130 | 131 | while input.chunk_offset() + input.chunk().len() < input.end() && input.advance() { 132 | 
debug_assert!(buf.len() < sliding_window, "{} {sliding_window}", buf.len()); 133 | let mut chunk = &input.chunk()[..input.get_chunk_end()]; 134 | let mut chunk_offset = input.chunk_offset(); 135 | // this condition only triggers until we have filled the buffer for the first time 136 | if buf.len() < carry_over { 137 | if buf.len() + chunk.len() <= carry_over { 138 | buf.extend_from_slice(chunk); 139 | continue; 140 | } 141 | let copied = carry_over - buf.len(); 142 | buf.extend_from_slice(&chunk[..copied]); 143 | chunk = &chunk[copied..]; 144 | chunk_offset += copied; 145 | } 146 | debug_assert!(buf.len() >= carry_over, "{} {carry_over}", buf.len()); 147 | 148 | // if the chunk is too small just continue accumelating the condition 149 | // below implies chunk.len() <= sliding_window since buf.len() <= 150 | // sliding_window 151 | if buf.len() + chunk.len() <= buf.capacity() { 152 | buf.extend_from_slice(chunk); 153 | if buf.len() >= sliding_window { 154 | find_chunk!(&buf, buf_offset, |start, off| { 155 | buf.drain(..start); 156 | disambiguate_match(prefilter, input, buf, off) 157 | }); 158 | let carry_over_start = buf.len() - carry_over; 159 | buf.drain(..carry_over_start); 160 | buf_offset += carry_over_start; 161 | } 162 | continue; 163 | } 164 | 165 | buf.extend_from_slice(&chunk[..carry_over]); 166 | find_chunk!(&buf, buf_offset, |start, off| { 167 | buf.drain(..start); 168 | buf.extend_from_slice(&chunk[..max_needle_len - buf.len()]); 169 | let mut res = prefilter.prefix(&buf, Span { start: 0, end: buf.len() }).unwrap(); 170 | res.start += off; 171 | res.end += off; 172 | res 173 | }); 174 | buf.clear(); 175 | 176 | find_chunk!(chunk, chunk_offset, |start, off| { 177 | buf.extend_from_slice(&chunk[start..]); 178 | disambiguate_match(prefilter, input, buf, off) 179 | }); 180 | let carrry_over_start = chunk.len() - carry_over; 181 | buf_offset = chunk_offset + carrry_over_start; 182 | buf.extend_from_slice(&chunk[carrry_over_start..]); 183 | } 184 | 185 | if 
!buf.is_empty() { 186 | if let Some(mut res) = prefilter.find(&buf, Span { start: 0, end: buf.len() }) { 187 | res.start += buf_offset; 188 | res.end += buf_offset; 189 | return Some(res); 190 | } 191 | } 192 | None 193 | } 194 | 195 | #[must_use] 196 | enum FindChunkResult { 197 | // the prefilter found no matches in this chunk 198 | NoMatch, 199 | // the prefilter found a match at the (offset correctd) 200 | // span in this chunk 201 | Match(Span), 202 | // the prefilter found a match that could be ambigous 203 | // depending on what data follows the buffer 204 | AbigousMatch { start: usize, off: usize }, 205 | } 206 | 207 | fn disambiguate_match( 208 | prefilter: &Prefilter, 209 | input: &mut Input, 210 | mut buf: Vec, 211 | off: usize, 212 | ) -> Span { 213 | let max_needle_len = prefilter.max_needle_len(); 214 | debug_assert!(buf.len() < max_needle_len); 215 | while input.advance() { 216 | let chunk_end = input.get_chunk_end().min(max_needle_len - buf.len()); 217 | let chunk = input.chunk(); 218 | if chunk_end != chunk.len() { 219 | buf.extend_from_slice(&chunk[..chunk_end]); 220 | break; 221 | } 222 | buf.extend_from_slice(chunk); 223 | } 224 | debug_assert!(buf.len() <= max_needle_len); 225 | let mut res = prefilter.prefix(&buf, Span { start: 0, end: buf.len() }).unwrap(); 226 | res.start += off; 227 | res.end += off; 228 | res 229 | } 230 | 231 | fn find_n_chunk( 232 | prefilter: &Prefilter, 233 | buf: &[u8], 234 | off: usize, 235 | ) -> FindChunkResult { 236 | debug_assert!(buf.len() >= 2 * prefilter.max_needle_len() - 2); 237 | if let Some(mut res) = prefilter.find(buf, Span { start: 0, end: buf.len() }) { 238 | // This condition is neeed in case we find a match at the end of the 239 | // chunk. In that case there may be an even longer match once we 240 | // continue scanning. 
For example: 241 | // 242 | // pattern: "abc|a" 243 | // haystack: "xxabc" chunked into ["xxab", "c"] 244 | // matck_kind: leftmost-first 245 | // 246 | // In the first chunk we would find a match for "a" but we 247 | // should be matching "abc" instead (since that is the first 248 | // alternation). 249 | if AMBIGOUS && res.start + prefilter.max_needle_len() > buf.len() { 250 | AbigousMatch { start: res.start, off: res.start + off } 251 | } else { 252 | res.start += off; 253 | res.end += off; 254 | Match(res) 255 | } 256 | } else { 257 | NoMatch 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /src/literal/tests.rs: -------------------------------------------------------------------------------- 1 | use std::iter; 2 | 3 | use proptest::proptest; 4 | use regex_automata::util::prefilter::Prefilter; 5 | use regex_automata::Span; 6 | 7 | proptest! { 8 | #[test] 9 | fn matches(mut haystack: String, needle: String) { 10 | haystack = haystack.repeat(1024); 11 | let needles = &[needle.as_bytes()]; 12 | let Some(prefilter) = Prefilter::new(regex_automata::MatchKind::All, needles) else { 13 | return Ok(()) 14 | }; 15 | let mut span = Span{ start: 0, end: haystack.len() }; 16 | let iter1 = iter::from_fn(||{ 17 | let res = prefilter.find(haystack.as_bytes(), span)?; 18 | span.start = res.end; 19 | Some(res) 20 | }); 21 | let rope = ropey::Rope::from_str(&haystack); 22 | let mut input = crate::Input::new(&rope); 23 | let iter2= iter::from_fn(||{ 24 | let res = super::find(&prefilter, &mut input)?; 25 | input.move_to(res.end); 26 | Some(res) 27 | }); 28 | crate::util::iter::prop_assert_eq(iter1, iter2)?; 29 | } 30 | 31 | #[test] 32 | fn matches_range(mut haystack: String, needle: String) { 33 | haystack = haystack.repeat(1024); 34 | let start = haystack.len() / 3; 35 | let end = 2*start; 36 | let needles = &[needle.as_bytes()]; 37 | let Some(prefilter) = Prefilter::new(regex_automata::MatchKind::All, needles) else { 38 | 
return Ok(()) 39 | }; 40 | let mut span = Span{ start, end }; 41 | let iter1 = iter::from_fn(||{ 42 | let res = prefilter.find(haystack.as_bytes(), span)?; 43 | span.start = res.end; 44 | Some(res) 45 | }); 46 | let rope = ropey::Rope::from_str(&haystack); 47 | let mut input = crate::Input::new(&rope).range(start..end); 48 | let iter2 = iter::from_fn(||{ 49 | let res = super::find(&prefilter, &mut input)?; 50 | assert!(res.end <= end); 51 | input.move_to(res.end); 52 | Some(res) 53 | }); 54 | crate::util::iter::prop_assert_eq(iter1, iter2)?; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test_rope.rs: -------------------------------------------------------------------------------- 1 | use std::cell::Cell; 2 | use std::collections::hash_map::DefaultHasher; 3 | use std::hash::Hasher; 4 | use std::sync::atomic::{AtomicUsize, Ordering}; 5 | 6 | use regex_automata::util::escape::DebugHaystack; 7 | 8 | use crate::util::utf8; 9 | use crate::Cursor; 10 | 11 | #[derive(Debug)] 12 | struct XorShift64Star { 13 | state: Cell, 14 | } 15 | 16 | impl XorShift64Star { 17 | fn new() -> Self { 18 | // Any non-zero seed will do -- this uses the hash of a global counter. 19 | let mut seed = 0; 20 | while seed == 0 { 21 | let mut hasher = DefaultHasher::new(); 22 | static COUNTER: AtomicUsize = AtomicUsize::new(0); 23 | hasher.write_usize(COUNTER.fetch_add(1, Ordering::Relaxed)); 24 | seed = hasher.finish(); 25 | } 26 | 27 | XorShift64Star { state: Cell::new(seed) } 28 | } 29 | 30 | fn next(&self) -> u64 { 31 | let mut x = self.state.get(); 32 | debug_assert_ne!(x, 0); 33 | x ^= x >> 12; 34 | x ^= x << 25; 35 | x ^= x >> 27; 36 | self.state.set(x); 37 | x.wrapping_mul(0x2545_f491_4f6c_dd1d) 38 | } 39 | 40 | /// Return a value from `0..n`. 
41 | fn next_usize(&self, n: usize) -> usize { 42 | (self.next() % n as u64) as usize 43 | } 44 | } 45 | 46 | #[derive(Debug)] 47 | pub(crate) struct RandomSlices<'a> { 48 | haystack: &'a [u8], 49 | pos: usize, 50 | size: usize, 51 | ran: XorShift64Star, 52 | } 53 | 54 | impl<'a> RandomSlices<'a> { 55 | pub fn new(haystack: &'a [u8]) -> Self { 56 | let mut res = RandomSlices { haystack, pos: 0, size: 0, ran: XorShift64Star::new() }; 57 | res.advance(); 58 | res 59 | } 60 | } 61 | 62 | impl Cursor for RandomSlices<'_> { 63 | fn chunk(&self) -> &[u8] { 64 | debug_assert_eq!(self.haystack.is_empty(), self.size == 0); 65 | &self.haystack[self.pos..self.pos + self.size] 66 | } 67 | 68 | fn utf8_aware(&self) -> bool { 69 | true 70 | } 71 | 72 | fn advance(&mut self) -> bool { 73 | if self.pos + self.size == self.haystack.len() { 74 | return false; 75 | } 76 | let new_start = self.pos + self.size; 77 | let mut tries = u16::MAX; 78 | loop { 79 | let next_size = self.ran.next_usize(250) + 1; 80 | let new_end = (new_start + next_size).min(self.haystack.len()); 81 | if utf8::is_boundary(self.haystack, new_end) { 82 | self.pos = new_start; 83 | self.size = new_end - new_start; 84 | break; 85 | } 86 | if tries == 0 { 87 | panic!("faild to advance at {} {:?}", self.pos, DebugHaystack(self.haystack)) 88 | } 89 | tries -= 1; 90 | } 91 | true 92 | } 93 | 94 | fn backtrack(&mut self) -> bool { 95 | if self.pos == 0 { 96 | return false; 97 | } 98 | let mut tries = u16::MAX; 99 | let new_end = self.pos; 100 | loop { 101 | let next_size = self.ran.next_usize(250) + 1; 102 | let new_start = new_end.saturating_sub(next_size); 103 | if utf8::is_boundary(self.haystack, new_start) { 104 | self.pos = new_start; 105 | self.size = new_end - new_start; 106 | break; 107 | } 108 | if tries == 0 { 109 | panic!("faild to backtrack at {} {:?}", self.pos, DebugHaystack(self.haystack)) 110 | } 111 | tries -= 1; 112 | } 113 | true 114 | } 115 | 116 | fn total_bytes(&self) -> Option { 117 | 
Some(self.haystack.len()) 118 | } 119 | 120 | fn offset(&self) -> usize { 121 | self.pos 122 | } 123 | } 124 | 125 | #[derive(Debug)] 126 | pub(crate) struct SingleByteChunks<'a> { 127 | haystack: &'a [u8], 128 | pos: usize, 129 | end: usize, 130 | } 131 | 132 | impl<'a> SingleByteChunks<'a> { 133 | pub fn new(haystack: &'a [u8]) -> Self { 134 | Self { 135 | haystack, 136 | pos: 0, 137 | end: (1..haystack.len()) 138 | .find(|&i| utf8::is_boundary(haystack, i)) 139 | .unwrap_or(haystack.len()), 140 | } 141 | } 142 | } 143 | 144 | impl Cursor for SingleByteChunks<'_> { 145 | fn chunk(&self) -> &[u8] { 146 | debug_assert!(utf8::is_boundary(self.haystack, self.pos) || self.pos == 0); 147 | debug_assert!(utf8::is_boundary(self.haystack, self.end) || self.end == 0); 148 | &self.haystack[self.pos..self.end] 149 | } 150 | 151 | fn utf8_aware(&self) -> bool { 152 | true 153 | } 154 | 155 | fn advance(&mut self) -> bool { 156 | if self.end < self.haystack.len() { 157 | self.pos = self.end; 158 | self.end = (self.end + 1..self.haystack.len()) 159 | .find(|&i| utf8::is_boundary(self.haystack, i)) 160 | .unwrap_or(self.haystack.len()); 161 | true 162 | } else { 163 | false 164 | } 165 | } 166 | 167 | fn backtrack(&mut self) -> bool { 168 | if self.pos != 0 { 169 | self.end = self.pos; 170 | self.pos = 171 | (0..self.pos).rev().find(|&i| utf8::is_boundary(self.haystack, i)).unwrap_or(0); 172 | true 173 | } else { 174 | false 175 | } 176 | } 177 | 178 | fn total_bytes(&self) -> Option { 179 | Some(self.haystack.len()) 180 | } 181 | 182 | fn offset(&self) -> usize { 183 | self.pos 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/tests.rs: -------------------------------------------------------------------------------- 1 | use crate::{test_rope::SingleByteChunks, Input}; 2 | 3 | use { 4 | crate::engines::meta::{self, Regex}, 5 | anyhow::Result, 6 | regex_automata::util::syntax, 7 | regex_automata::MatchKind, 8 | 
regex_test::{CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner}, 9 | }; 10 | 11 | fn suite() -> anyhow::Result { 12 | let mut tests = regex_test::RegexTests::new(); 13 | macro_rules! load { 14 | ($name:expr) => {{ 15 | const DATA: &[u8] = include_bytes!(concat!("../../regex/testdata/", $name, ".toml")); 16 | tests.load_slice($name, DATA)?; 17 | }}; 18 | } 19 | 20 | load!("anchored"); 21 | load!("bytes"); 22 | load!("crazy"); 23 | load!("crlf"); 24 | load!("earliest"); 25 | load!("empty"); 26 | load!("expensive"); 27 | load!("flags"); 28 | load!("iter"); 29 | load!("leftmost-all"); 30 | load!("line-terminator"); 31 | load!("misc"); 32 | load!("multiline"); 33 | load!("no-unicode"); 34 | load!("overlapping"); 35 | load!("regression"); 36 | load!("set"); 37 | load!("substring"); 38 | load!("unicode"); 39 | load!("utf8"); 40 | load!("word-boundary"); 41 | load!("word-boundary-special"); 42 | load!("fowler/basic"); 43 | load!("fowler/nullsubexpr"); 44 | load!("fowler/repetition"); 45 | 46 | Ok(tests) 47 | } 48 | 49 | /// Configure a regex_automata::Input with the given test configuration. 50 | fn create_input(test: ®ex_test::RegexTest) -> crate::Input { 51 | use regex_automata::Anchored; 52 | 53 | let bounds = test.bounds(); 54 | let anchored = if test.anchored() { Anchored::Yes } else { Anchored::No }; 55 | let mut input = crate::Input::new(crate::test_rope::SingleByteChunks::new(test.haystack())) 56 | .range(bounds.start..bounds.end); 57 | input.anchored(anchored); 58 | input 59 | } 60 | 61 | /// Convert capture matches into the test suite's capture values. 62 | /// 63 | /// The given captures must represent a valid match, where the first capturing 64 | /// group has a non-None span. Otherwise this panics. 
65 | fn testify_captures(caps: ®ex_automata::util::captures::Captures) -> regex_test::Captures { 66 | assert!(caps.is_match(), "expected captures to represent a match"); 67 | let spans = 68 | caps.iter().map(|group| group.map(|m| regex_test::Span { start: m.start, end: m.end })); 69 | // These unwraps are OK because we assume our 'caps' represents a match, 70 | // and a match always gives a non-zero number of groups with the first 71 | // group being non-None. 72 | regex_test::Captures::new(caps.pattern().unwrap().as_usize(), spans).unwrap() 73 | } 74 | 75 | const BLACKLIST: &[&str] = &[ 76 | // These 'earliest' tests are blacklisted because the meta searcher doesn't 77 | // give the same offsets that the test expects. This is legal because the 78 | // 'earliest' routines don't guarantee a particular match offset other 79 | // than "the earliest the regex engine can report a match." Some regex 80 | // engines will quit earlier than others. The backtracker, for example, 81 | // can't really quit before finding the full leftmost-first match. Many of 82 | // the literal searchers also don't have the ability to quit fully or it's 83 | // otherwise not worth doing. (A literal searcher not quitting as early as 84 | // possible usually means looking at a few more bytes. That's no biggie.) 85 | "earliest/", 86 | ]; 87 | 88 | const RUNS: usize = 1; 89 | /// Tests the default configuration of the meta regex engine. 
90 | #[test] 91 | fn default() -> Result<()> { 92 | let builder = Regex::builder(); 93 | let mut runner = TestRunner::new()?; 94 | runner 95 | .expand(&["is_match", "find", "captures"], |test| test.compiles()) 96 | .blacklist_iter(BLACKLIST); 97 | for _ in 0..RUNS { 98 | runner.test_iter(suite()?.iter(), compiler(builder.clone())); 99 | } 100 | runner.assert(); 101 | Ok(()) 102 | } 103 | 104 | #[cfg(feature = "ropey")] 105 | #[test] 106 | fn rope_one_past_end() -> Result<()> { 107 | use crate::RopeyCursor; 108 | 109 | let builder = Regex::builder() 110 | .syntax(syntax::Config::new().case_insensitive(true).multi_line(true)) 111 | .build("git nix"); 112 | let rope = ropey::Rope::from_str("x"); 113 | builder.unwrap().find(Input::new(RopeyCursor::at(rope.slice(..), 1)).range(1..)); 114 | Ok(()) 115 | } 116 | 117 | #[test] 118 | fn prefix() -> Result<()> { 119 | let regex = Regex::builder().build("^foo$").unwrap(); 120 | let rope = ropey::Rope::from_str("xfoox"); 121 | let mut input = Input::new(rope.slice(..)); 122 | input.slice(1..4); 123 | let mat1 = regex.find(input).unwrap(); 124 | assert_eq!(mat1.start(), 1); 125 | assert_eq!(mat1.end(), 4); 126 | let rope = SingleByteChunks::new(b"xfoox"); 127 | let mut input = Input::new(rope); 128 | input.slice(1..4); 129 | let mat1 = regex.find(input).unwrap(); 130 | assert_eq!(mat1.start(), 1); 131 | assert_eq!(mat1.end(), 4); 132 | Ok(()) 133 | } 134 | 135 | /// Tests the default configuration minus the full DFA. 
136 | #[test] 137 | fn no_dfa() -> Result<()> { 138 | let mut builder = Regex::builder(); 139 | builder.configure(Regex::config().dfa(false)); 140 | let mut runner = TestRunner::new()?; 141 | runner 142 | .expand(&["is_match", "find", "captures"], |test| test.compiles()) 143 | .blacklist_iter(BLACKLIST); 144 | for _ in 0..RUNS { 145 | runner.test_iter(suite()?.iter(), compiler(builder.clone())); 146 | } 147 | runner.assert(); 148 | Ok(()) 149 | } 150 | 151 | /// Tests the default configuration minus the full DFA and lazy DFA. 152 | #[test] 153 | fn no_dfa_hybrid() -> Result<()> { 154 | let mut builder = Regex::builder(); 155 | builder.configure(Regex::config().dfa(false).hybrid(false)); 156 | let mut runner = TestRunner::new()?; 157 | runner 158 | .expand(&["is_match", "find", "captures"], |test| test.compiles()) 159 | .blacklist_iter(BLACKLIST); 160 | for _ in 0..RUNS { 161 | runner.test_iter(suite()?.iter(), compiler(builder.clone())); 162 | } 163 | runner.assert(); 164 | Ok(()) 165 | } 166 | 167 | fn compiler( 168 | mut builder: meta::Builder, 169 | ) -> impl FnMut(&RegexTest, &[String]) -> Result { 170 | move |test, regexes| { 171 | if !configure_meta_builder(test, &mut builder) { 172 | return Ok(CompiledRegex::skip()); 173 | } 174 | // println!("{} {builder:?}", test.full_name()); 175 | let re = builder.build_many(regexes)?; 176 | Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, test) })) 177 | } 178 | } 179 | 180 | fn run_test(re: &Regex, test: &RegexTest) -> TestResult { 181 | let mut input = create_input(test); 182 | match test.additional_name() { 183 | "is_match" => TestResult::matched(re.is_match(input)), 184 | "find" => match test.search_kind() { 185 | SearchKind::Earliest => { 186 | input.earliest(true); 187 | TestResult::matches( 188 | re.find_iter(input).take(test.match_limit().unwrap_or(std::usize::MAX)).map( 189 | |m| Match { 190 | id: m.pattern().as_usize(), 191 | span: Span { start: m.start(), end: m.end() }, 192 | }, 193 | ), 
194 | ) 195 | } 196 | SearchKind::Leftmost => TestResult::matches( 197 | re.find_iter(input).take(test.match_limit().unwrap_or(std::usize::MAX)).map(|m| { 198 | Match { 199 | id: m.pattern().as_usize(), 200 | span: Span { start: m.start(), end: m.end() }, 201 | } 202 | }), 203 | ), 204 | SearchKind::Overlapping => TestResult::skip(), 205 | }, 206 | "captures" => match test.search_kind() { 207 | SearchKind::Earliest => { 208 | input.earliest(true); 209 | let it = re 210 | .captures_iter(input) 211 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 212 | .map(|caps| testify_captures(&caps)); 213 | TestResult::captures(it) 214 | } 215 | SearchKind::Leftmost => { 216 | let it = re 217 | .captures_iter(input) 218 | .take(test.match_limit().unwrap_or(std::usize::MAX)) 219 | .map(|caps| testify_captures(&caps)); 220 | TestResult::captures(it) 221 | } 222 | SearchKind::Overlapping => { 223 | // There is no overlapping regex API that supports captures. 224 | TestResult::skip() 225 | } 226 | }, 227 | name => TestResult::fail(&format!("unrecognized test name: {}", name)), 228 | } 229 | } 230 | 231 | /// Configures the given regex builder with all relevant settings on the given 232 | /// regex test. 233 | /// 234 | /// If the regex test has a setting that is unsupported, then this returns 235 | /// false (implying the test should be skipped). 236 | fn configure_meta_builder(test: &RegexTest, builder: &mut meta::Builder) -> bool { 237 | let match_kind = match test.match_kind() { 238 | regex_test::MatchKind::All => MatchKind::All, 239 | regex_test::MatchKind::LeftmostFirst => MatchKind::LeftmostFirst, 240 | regex_test::MatchKind::LeftmostLongest => return false, 241 | }; 242 | let meta_config = Regex::config() 243 | .match_kind(match_kind) 244 | .utf8_empty(test.utf8()) 245 | .line_terminator(test.line_terminator()); 246 | builder.configure(meta_config).syntax(config_syntax(test)); 247 | true 248 | } 249 | 250 | /// Configuration of the regex parser from a regex test. 
251 | fn config_syntax(test: &RegexTest) -> syntax::Config { 252 | syntax::Config::new() 253 | .case_insensitive(test.case_insensitive()) 254 | .unicode(test.unicode()) 255 | .utf8(test.utf8()) 256 | .line_terminator(test.line_terminator()) 257 | } 258 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod empty; 2 | pub mod iter; 3 | pub mod prefilter; 4 | pub mod primitives; 5 | pub mod sparse_set; 6 | pub mod utf8; 7 | 8 | // #[cfg(test)] 9 | // mod tests; 10 | -------------------------------------------------------------------------------- /src/util/empty.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | This module provides helper routines for dealing with zero-width matches. 3 | 4 | The main problem being solved here is this: 5 | 6 | 1. The caller wants to search something that they know is valid UTF-8, such 7 | as a Rust `&str`. 8 | 2. The regex used by the caller can match the empty string. For example, `a*`. 9 | 3. The caller should never get match offsets returned that occur within the 10 | encoding of a UTF-8 codepoint. It is logically incorrect, and also means that, 11 | e.g., slicing the `&str` at those offsets will lead to a panic. 12 | 13 | So the question here is, how do we prevent the caller from getting match 14 | offsets that split a codepoint? For example, strictly speaking, the regex `a*` 15 | matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since 16 | the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that 17 | underlies all of the matching engines in this crate doesn't have anything in 18 | its state graph that prevents matching between UTF-8 code units. Indeed, any 19 | engine derived from the `NFA` will match at those positions by virtue of the 20 | fact that the `NFA` is byte oriented. 
That is, its transitions are defined over 21 | bytes and the matching engines work by proceeding one byte at a time. 22 | 23 | (An alternative architecture would be to define the transitions in an `NFA` 24 | over codepoints, or `char`. And then make the matching engines proceed by 25 | decoding one codepoint at a time. This is a viable strategy, but it doesn't 26 | work for DFA matching engines because designing a fast and memory efficient 27 | transition table for an alphabet as large as Unicode is quite difficult. More 28 | to the point, the top-level `regex` crate supports matching on arbitrary bytes 29 | when Unicode mode is disabled and one is searching a `&[u8]`. So in that case, 30 | you can't just limit yourself to decoding codepoints and matching those. You 31 | really do need to be able to follow byte oriented transitions on the `NFA`.) 32 | 33 | In an older version of the regex crate, we handled this case not in the regex 34 | engine, but in the iterators over matches. Namely, since this case only arises 35 | when the match is empty, we "just" incremented the next starting position 36 | of the search by `N`, where `N` is the length of the codepoint encoded at 37 | the current position. The alternative or more "natural" solution of just 38 | incrementing by `1` would result in executing a search of `a*` on `☃` like 39 | this: 40 | 41 | * Start search at `0`. 42 | * Found match at `[0, 0]`. 43 | * Next start position is `0`. 44 | * To avoid an infinite loop, since it's an empty match, increment by `1`. 45 | * Start search at `1`. 46 | * Found match at `[1, 1]`. Oops. 47 | 48 | But if we instead incremented by `3` (the length in bytes of `☃`), then we get 49 | the following: 50 | 51 | * Start search at `0`. 52 | * Found match at `[0, 0]`. 53 | * Next start position is `0`. 54 | * To avoid an infinite loop, since it's an empty match, increment by `3`. 55 | * Start search at `3`. 56 | * Found match at `[3, 3]`. 57 | 58 | And we get the correct result. 
But does this technique work in all cases? 59 | Crucially, it requires that a zero-width match that splits a codepoint never 60 | occurs beyond the starting position of the search. Because if it did, merely 61 | incrementing the start position by the number of bytes in the codepoint at 62 | the current position wouldn't be enough. A zero-width match could just occur 63 | anywhere. It turns out that it is _almost_ true. We can convince ourselves by 64 | looking at all possible patterns that can match the empty string: 65 | 66 | * Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match 67 | the empty string. That is, assuming there isn't an `a` at the current position, 68 | they will all match the empty string at the start of a search. There is no way 69 | to move past it because any other match would not be "leftmost." 70 | * `^` only matches at the beginning of the haystack, where the start position 71 | is `0`. Since we know we're searching valid UTF-8 (if it isn't valid UTF-8, 72 | then this entire problem goes away because it implies your string type supports 73 | invalid UTF-8 and thus must deal with offsets that not only split a codepoint 74 | but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches 75 | between the code units of a codepoint because the start of a valid UTF-8 string 76 | is never within the encoding of a codepoint. 77 | * `$` basically the same logic as `^`, but for the end of a string. A valid 78 | UTF-8 string can't have an incomplete codepoint at the end of it. 79 | * `(?m:^)` follows similarly to `^`, but it can match immediately following 80 | a `\n`. However, since a `\n` is always a codepoint itself and can never 81 | appear within a codepoint, it follows that the position immediately following 82 | a `\n` in a string that is valid UTF-8 is guaranteed to not be between the 83 | code units of another codepoint. 
(One caveat here is that the line terminator 84 | for multi-line anchors can now be changed to any arbitrary byte, including 85 | things like `\x98` which might occur within a codepoint. However, this wasn't 86 | supported by the old regex crate. If it was, it pose the same problems as 87 | `(?-u:\B)`, as we'll discuss below.) 88 | * `(?m:$)` a similar argument as for `(?m:^)`. The only difference is that a 89 | `(?m:$)` matches just before a `\n`. But the same argument applies. 90 | * `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the 91 | CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`. 92 | Namely, since they only ever match at a boundary where one side is either a 93 | `\r` or a `\n`, neither of which can occur within a codepoint. 94 | * `\b` only matches at positions where both sides are valid codepoints, so 95 | this cannot split a codepoint. 96 | * `\B`, like `\b`, also only matches at positions where both sides are valid 97 | codepoints. So this cannot split a codepoint either. 98 | * `(?-u:\b)` matches only at positions where at least one side of it is an ASCII 99 | word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints 100 | (one of the many amazing qualities of UTF-8), it follows that this too cannot 101 | split a codepoint. 102 | * `(?-u:\B)` finally represents a problem. It can matches between *any* two 103 | bytes that are either both word bytes or non-word bytes. Since code units like 104 | `\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes, 105 | `(?-u:\B)` will match at the position between them. 106 | 107 | Thus, our approach of incrementing one codepoint at a time after seeing an 108 | empty match is flawed because `(?-u:\B)` can result in an empty match that 109 | splits a codepoint at a position past the starting point of a search. 
For 110 | example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2, 111 | 2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because 112 | they correspond to word boundaries since `a` is an ASCII word byte. 113 | 114 | So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from 115 | regexes that could match `&str`. That might sound extreme, but a lot of other 116 | things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and 117 | `(?-u:\W)` can match invalid UTF-8 too, including individual code units within a 118 | codepoint. The key difference is that those expressions could never produce an 119 | empty match. That ban happens when translating an `Ast` to an `Hir`, because 120 | that process can reason about whether an `Hir` can produce *non-empty* matches 121 | at invalid UTF-8 boundaries. Bottom line though is that we side-stepped the 122 | `(?-u:\B)` issue by banning it. 123 | 124 | If banning `(?-u:\B)` were the only issue with the old regex crate's approach, 125 | then I probably would have kept it. `\B` is rarely used, so it's not such a big 126 | deal to have to work-around it. However, the problem with the above approach 127 | is that it doesn't compose. The logic for avoiding splitting a codepoint only 128 | lived in the iterator, which means if anyone wants to implement their own 129 | iterator over regex matches, they have to deal with this extremely subtle edge 130 | case to get full correctness. 131 | 132 | Instead, in this crate, we take the approach of pushing this complexity down 133 | to the lowest layers of each regex engine. The approach is pretty simple: 134 | 135 | * If this corner case doesn't apply, don't do anything. (For example, if UTF-8 136 | mode isn't enabled or if the regex cannot match the empty string.) 137 | * If an empty match is reported, explicitly check if it splits a codepoint. 138 | * If it doesn't, we're done, return the match. 
139 | * If it does, then ignore the match and re-run the search. 140 | * Repeat the above process until the end of the haystack is reached or a match 141 | is found that doesn't split a codepoint or isn't zero width. 142 | 143 | And that's pretty much what this module provides. Every regex engine uses these 144 | methods in their lowest level public APIs, but just above the layer where 145 | their internal engine is used. That way, all regex engines can be arbitrarily 146 | composed without worrying about handling this case, and iterators don't need to 147 | handle it explicitly. 148 | 149 | (It turns out that a new feature I added, support for changing the line 150 | terminator in a regex to any arbitrary byte, also provokes the above problem. 151 | Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that 152 | support would need to be limited or banned when UTF-8 mode is enabled, just 153 | like we did for `(?-u:\B)`. But thankfully our more robust approach in this 154 | crate handles that case just fine too.) 
155 | */ 156 | 157 | use regex_automata::MatchError; 158 | 159 | use crate::cursor::Cursor; 160 | use crate::input::Input; 161 | 162 | #[cold] 163 | #[inline(never)] 164 | pub(crate) fn skip_splits_fwd( 165 | input: &mut Input, 166 | init_value: T, 167 | match_offset: usize, 168 | find: F, 169 | ) -> Result, MatchError> 170 | where 171 | F: FnMut(&mut Input) -> Result, MatchError>, 172 | { 173 | skip_splits(true, input, match_offset, init_value, find) 174 | } 175 | 176 | #[cold] 177 | #[inline(never)] 178 | pub(crate) fn skip_splits_rev( 179 | input: &mut Input, 180 | init_value: T, 181 | match_offset: usize, 182 | find: F, 183 | ) -> Result, MatchError> 184 | where 185 | F: FnMut(&mut Input) -> Result, MatchError>, 186 | { 187 | skip_splits(false, input, match_offset, init_value, find) 188 | } 189 | 190 | fn skip_splits( 191 | forward: bool, 192 | input: &mut Input, 193 | match_offset: usize, 194 | init_value: T, 195 | mut find: F, 196 | ) -> Result, MatchError> 197 | where 198 | F: FnMut(&mut Input) -> Result, MatchError>, 199 | { 200 | input.move_to(match_offset); 201 | // If our config says to do an anchored search, then we're definitely 202 | // done. We just need to determine whether we have a valid match or 203 | // not. If we don't, then we're not allowed to continue, so we report 204 | // no match. 205 | // 206 | // This is actually quite a subtle correctness thing. The key here is 207 | // that if we got an empty match that splits a codepoint after doing an 208 | // anchored search in UTF-8 mode, then that implies that we must have 209 | // *started* the search at a location that splits a codepoint. This 210 | // follows from the fact that if a match is reported from an anchored 211 | // search, then the start offset of the match *must* match the start 212 | // offset of the search. 213 | // 214 | // It also follows that no other non-empty match is possible. 
For 215 | // example, you might write a regex like '(?:)|SOMETHING' and start its 216 | // search in the middle of a codepoint. The first branch is an empty 217 | // regex that will bubble up a match at the first position, and then 218 | // get rejected here and report no match. But what if 'SOMETHING' could 219 | // have matched? We reason that such a thing is impossible, because 220 | // if it does, it must report a match that starts in the middle of a 221 | // codepoint. This in turn implies that a match is reported whose span 222 | // does not correspond to valid UTF-8, and this breaks the promise 223 | // made when UTF-8 mode is enabled. (That promise *can* be broken, for 224 | // example, by enabling UTF-8 mode but building an NFA by hand that 225 | // produces non-empty matches that span invalid UTF-8. This is an unchecked 226 | // but documented precondition violation of UTF-8 mode, and is documented 227 | // to have unspecified behavior.) 228 | // 229 | // I believe this actually means that if an anchored search is run, and 230 | // UTF-8 mode is enabled and the start position splits a codepoint, 231 | // then it is correct to immediately report no match without even 232 | // executing the regex engine. But it doesn't really seem worth writing 233 | // out that case in every regex engine to save a tiny bit of work in an 234 | // extremely pathological case, so we just handle it here. 235 | if input.get_anchored().is_anchored() { 236 | return Ok(input.is_char_boundary().then_some(init_value)); 237 | } 238 | // Otherwise, we have an unanchored search, so just keep looking for 239 | // matches until we have one that does not split a codepoint or we hit 240 | // EOI. 
241 | let mut value = init_value; 242 | while !input.is_char_boundary() { 243 | if forward { 244 | // The unwrap is OK here because overflowing usize while 245 | // iterating over a slice is impossible, as it would require 246 | // a slice of length greater than isize::MAX, which is itself 247 | // impossible. 248 | input.set_start(input.start().checked_add(1).unwrap()); 249 | } else { 250 | input.set_end(match input.end().checked_sub(1) { 251 | None => return Ok(None), 252 | Some(end) => end, 253 | }); 254 | } 255 | match find(input)? { 256 | None => return Ok(None), 257 | Some((new_value, new_match_end)) => { 258 | value = new_value; 259 | input.move_to(new_match_end) 260 | } 261 | } 262 | } 263 | Ok(Some(value)) 264 | } 265 | -------------------------------------------------------------------------------- /src/util/iter.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Generic helpers for iteration of matches from a regex engine in a haystack. 3 | 4 | The principal type in this module is a [`Searcher`]. A `Searcher` provides 5 | its own lower level iterator-like API in addition to methods for constructing 6 | types that implement `Iterator`. The documentation for `Searcher` explains a 7 | bit more about why these different APIs exist. 8 | 9 | Currently, this module supports iteration over any regex engine that works 10 | with the [`HalfMatch`], [`Match`] or [`Captures`] types. 11 | */ 12 | 13 | use std::fmt::Debug; 14 | 15 | use regex_automata::{HalfMatch, Match, MatchError}; 16 | 17 | use crate::cursor::Cursor; 18 | use crate::input::Input; 19 | 20 | /// A searcher for creating iterators and performing lower level iteration. 21 | /// 22 | /// This searcher encapsulates the logic required for finding all successive 23 | /// non-overlapping matches in a haystack. In theory, iteration would look 24 | /// something like this: 25 | /// 26 | /// 1. Setting the start position to `0`. 27 | /// 2. Execute a regex search. 
If no match, end iteration. 28 | /// 3. Report the match and set the start position to the end of the match. 29 | /// 4. Go back to (2). 30 | /// 31 | /// And if this were indeed the case, it's likely that `Searcher` wouldn't 32 | /// exist. Unfortunately, because a regex may match the empty string, the above 33 | /// logic won't work for all possible regexes. Namely, if an empty match is 34 | /// found, then step (3) would set the start position of the search to the 35 | /// position it was at. Thus, iteration would never end. 36 | /// 37 | /// Instead, a `Searcher` knows how to detect these cases and forcefully 38 | /// advance iteration in the case of an empty match that overlaps with a 39 | /// previous match. 40 | /// 41 | /// If you know that your regex cannot match any empty string, then the simple 42 | /// algorithm described above will work correctly. 43 | /// 44 | /// When possible, prefer the iterators defined on the regex engine you're 45 | /// using. This tries to abstract over the regex engine and is thus a bit more 46 | /// unwieldy to use. 47 | /// 48 | /// In particular, a `Searcher` is not itself an iterator. Instead, it provides 49 | /// `advance` routines that permit moving the search along explicitly. It also 50 | /// provides various routines, like [`Searcher::into_matches_iter`], that 51 | /// accept a closure (representing how a regex engine executes a search) and 52 | /// returns a conventional iterator. 53 | /// 54 | /// The lifetime parameters come from the [`Input`] type passed to 55 | /// [`Searcher::new`]: 56 | /// 57 | /// * `'h` is the lifetime of the underlying haystack. 58 | /// 59 | /// # Searcher vs Iterator 60 | /// 61 | /// Why does a search type with "advance" APIs exist at all when we also have 62 | /// iterators? Unfortunately, the reasoning behind this split is a complex 63 | /// combination of the following things: 64 | /// 65 | /// 1. 
While many of the regex engines expose their own iterators, it is also 66 | /// nice to expose this lower level iteration helper because it permits callers 67 | /// to provide their own `Input` configuration. Moreover, a `Searcher` can work 68 | /// with _any_ regex engine instead of only the ones defined in this crate. 69 | /// This way, everyone benefits from a shared iteration implementation. 70 | /// 2. There are many different regex engines that, while they have the same 71 | /// match semantics, they have slightly different APIs. Iteration is just 72 | /// complex enough to want to share code, and so we need a way of abstracting 73 | /// over those different regex engines. While we could define a new trait that 74 | /// describes any regex engine search API, it would wind up looking very close 75 | /// to a closure. While there may still be reasons for the more generic trait 76 | /// to exist, for now and for the purposes of iteration, we use a closure. 77 | /// Closures also provide a lot of easy flexibility at the call site, in that 78 | /// they permit the caller to borrow any kind of state they want for use during 79 | /// each search call. 80 | /// 3. As a result of using closures, and because closures are anonymous types 81 | /// that cannot be named, it is difficult to encapsulate them without both 82 | /// costs to speed and added complexity to the public API. For example, in 83 | /// defining an iterator type like 84 | /// [`dfa::regex::FindMatches`](crate::dfa::regex::FindMatches), 85 | /// if we use a closure internally, it's not possible to name this type in the 86 | /// return type of the iterator constructor. Thus, the only way around it is 87 | /// to erase the type by boxing it and turning it into a `Box`. 88 | /// This boxed closure is unlikely to be inlined _and_ it infects the public 89 | /// API in subtle ways. 
Namely, unless you declare the closure as implementing 90 | /// `Send` and `Sync`, then the resulting iterator type won't implement it 91 | /// either. But there are practical issues with requiring the closure to 92 | /// implement `Send` and `Sync` that result in other API complexities that 93 | /// are beyond the scope of this already long exposition. 94 | /// 4. Some regex engines expose more complex match information than just 95 | /// "which pattern matched" and "at what offsets." For example, the PikeVM 96 | /// exposes match spans for each capturing group that participated in the 97 | /// match. In such cases, it can be quite beneficial to reuse the capturing 98 | /// group allocation on subsequent searches. A proper iterator doesn't permit 99 | /// this API due to its interface, so it's useful to have something a bit lower 100 | /// level that permits callers to amortize allocations while also reusing a 101 | /// shared implementation of iteration. (See the documentation for 102 | /// [`Searcher::advance`] for an example of using the "advance" API with the 103 | /// PikeVM.) 104 | /// 105 | /// What this boils down to is that there are "advance" APIs which require 106 | /// handing a closure to it for every call, and there are also APIs to create 107 | /// iterators from a closure. The former are useful for _implementing_ 108 | /// iterators or when you need more flexibility, while the latter are useful 109 | /// for conveniently writing custom iterators on-the-fly. 110 | /// 111 | /// # Example: iterating with captures 112 | /// 113 | /// Several regex engines in this crate offer convenient iterator APIs over 114 | /// [`Captures`] values. To do so, this requires allocating a new `Captures` 115 | /// value for each iteration step. This can perhaps be more costly than you 116 | /// might want. 
Instead of implementing your own iterator to avoid that 117 | /// cost (which can be a little subtle if you want to handle empty matches 118 | /// correctly), you can use this `Searcher` to do it for you: 119 | /// 120 | /// ``` 121 | /// use regex_automata::{ 122 | /// nfa::thompson::pikevm::PikeVM, 123 | /// util::iter::Searcher, 124 | /// Input, Span, 125 | /// }; 126 | /// 127 | /// let re = PikeVM::new("foo(?P[0-9]+)")?; 128 | /// let haystack = "foo1 foo12 foo123"; 129 | /// 130 | /// let mut caps = re.create_captures(); 131 | /// let mut cache = re.create_cache(); 132 | /// let mut matches = vec![]; 133 | /// let mut searcher = Searcher::new(Input::new(haystack)); 134 | /// while let Some(_) = searcher.advance(|input| { 135 | /// re.search(&mut cache, input, &mut caps); 136 | /// Ok(caps.get_match()) 137 | /// }) { 138 | /// // The unwrap is OK since 'numbers' matches if the pattern matches. 139 | /// matches.push(caps.get_group_by_name("numbers").unwrap()); 140 | /// } 141 | /// assert_eq!(matches, vec![ 142 | /// Span::from(3..4), 143 | /// Span::from(8..10), 144 | /// Span::from(14..17), 145 | /// ]); 146 | /// 147 | /// # Ok::<(), Box>(()) 148 | /// ``` 149 | pub struct Searcher { 150 | /// The input parameters to give to each regex engine call. 151 | /// 152 | /// The start position of the search is mutated during iteration. 153 | input: Input, 154 | /// Records the end offset of the most recent match. This is necessary to 155 | /// handle a corner case for preventing empty matches from overlapping with 156 | /// the ending bounds of a prior match. 157 | last_match_end: Option, 158 | } 159 | 160 | impl Searcher { 161 | /// Create a new fallible non-overlapping matches iterator. 162 | /// 163 | /// The given `input` provides the parameters (including the haystack), 164 | /// while the `finder` represents a closure that calls the underlying regex 165 | /// engine. 
The closure may borrow any additional state that is needed, 166 | /// such as a prefilter scanner. 167 | pub fn new(input: Input) -> Searcher { 168 | Searcher { input, last_match_end: None } 169 | } 170 | 171 | /// Returns the current `Input` used by this searcher. 172 | /// 173 | /// The `Input` returned is generally equivalent to the one given to 174 | /// [`Searcher::new`], but its start position may be different to reflect 175 | /// the start of the next search to be executed. 176 | pub fn input(&mut self) -> &mut Input { 177 | &mut self.input 178 | } 179 | 180 | // /// Return the next half match for an infallible search if one exists, and 181 | // /// advance to the next position. 182 | // /// 183 | // /// This is like `try_advance_half`, except errors are converted into 184 | // /// panics. 185 | // /// 186 | // /// # Panics 187 | // /// 188 | // /// If the given closure returns an error, then this panics. This is useful 189 | // /// when you know your underlying regex engine has been configured to not 190 | // /// return an error. 191 | // /// 192 | // /// # Example 193 | // /// 194 | // /// This example shows how to use a `Searcher` to iterate over all matches 195 | // /// when using a DFA, which only provides "half" matches. 
196 | // /// 197 | // /// ``` 198 | // /// use regex_automata::{ 199 | // /// hybrid::dfa::DFA, 200 | // /// util::iter::Searcher, 201 | // /// HalfMatch, Input, 202 | // /// }; 203 | // /// 204 | // /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 205 | // /// let mut cache = re.create_cache(); 206 | // /// 207 | // /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); 208 | // /// let mut it = Searcher::new(input); 209 | // /// 210 | // /// let expected = Some(HalfMatch::must(0, 10)); 211 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 212 | // /// assert_eq!(expected, got); 213 | // /// 214 | // /// let expected = Some(HalfMatch::must(0, 21)); 215 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 216 | // /// assert_eq!(expected, got); 217 | // /// 218 | // /// let expected = Some(HalfMatch::must(0, 32)); 219 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 220 | // /// assert_eq!(expected, got); 221 | // /// 222 | // /// let expected = None; 223 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 224 | // /// assert_eq!(expected, got); 225 | // /// 226 | // /// # Ok::<(), Box>(()) 227 | // /// ``` 228 | // /// 229 | // /// This correctly moves iteration forward even when an empty match occurs: 230 | // /// 231 | // /// ``` 232 | // /// use regex_automata::{ 233 | // /// hybrid::dfa::DFA, 234 | // /// util::iter::Searcher, 235 | // /// HalfMatch, Input, 236 | // /// }; 237 | // /// 238 | // /// let re = DFA::new(r"a|")?; 239 | // /// let mut cache = re.create_cache(); 240 | // /// 241 | // /// let input = Input::new("abba"); 242 | // /// let mut it = Searcher::new(input); 243 | // /// 244 | // /// let expected = Some(HalfMatch::must(0, 1)); 245 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 246 | // /// assert_eq!(expected, got); 247 | // /// 248 | // /// let expected = 
Some(HalfMatch::must(0, 2)); 249 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 250 | // /// assert_eq!(expected, got); 251 | // /// 252 | // /// let expected = Some(HalfMatch::must(0, 4)); 253 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 254 | // /// assert_eq!(expected, got); 255 | // /// 256 | // /// let expected = None; 257 | // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); 258 | // /// assert_eq!(expected, got); 259 | // /// 260 | // /// # Ok::<(), Box>(()) 261 | // /// ``` 262 | // #[inline] 263 | // pub fn advance_half(&mut self, finder: F) -> Option 264 | // where 265 | // F: FnMut(&mut Input) -> Result, MatchError>, 266 | // { 267 | // match self.try_advance_half(finder) { 268 | // Ok(m) => m, 269 | // Err(err) => panic!( 270 | // "unexpected regex half find error: {}\n\ 271 | // to handle find errors, use 'try' or 'search' methods", 272 | // err, 273 | // ), 274 | // } 275 | // } 276 | 277 | /// Return the next match for an infallible search if one exists, and 278 | /// advance to the next position. 279 | /// 280 | /// The search is advanced even in the presence of empty matches by 281 | /// forbidding empty matches from overlapping with any other match. 282 | /// 283 | /// This is like `try_advance`, except errors are converted into panics. 284 | /// 285 | /// # Panics 286 | /// 287 | /// If the given closure returns an error, then this panics. This is useful 288 | /// when you know your underlying regex engine has been configured to not 289 | /// return an error. 
290 | /// 291 | /// # Example 292 | /// 293 | /// This example shows how to use a `Searcher` to iterate over all matches 294 | /// when using a regex based on lazy DFAs: 295 | /// 296 | /// ``` 297 | /// use regex_automata::{ 298 | /// hybrid::regex::Regex, 299 | /// util::iter::Searcher, 300 | /// Match, Input, 301 | /// }; 302 | /// 303 | /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 304 | /// let mut cache = re.create_cache(); 305 | /// 306 | /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); 307 | /// let mut it = Searcher::new(input); 308 | /// 309 | /// let expected = Some(Match::must(0, 0..10)); 310 | /// let got = it.advance(|input| re.try_search(&mut cache, input)); 311 | /// assert_eq!(expected, got); 312 | /// 313 | /// let expected = Some(Match::must(0, 11..21)); 314 | /// let got = it.advance(|input| re.try_search(&mut cache, input)); 315 | /// assert_eq!(expected, got); 316 | /// 317 | /// let expected = Some(Match::must(0, 22..32)); 318 | /// let got = it.advance(|input| re.try_search(&mut cache, input)); 319 | /// assert_eq!(expected, got); 320 | /// 321 | /// let expected = None; 322 | /// let got = it.advance(|input| re.try_search(&mut cache, input)); 323 | /// assert_eq!(expected, got); 324 | /// 325 | /// # Ok::<(), Box>(()) 326 | /// ``` 327 | /// 328 | /// This example shows the same as above, but with the PikeVM. This example 329 | /// is useful because it shows how to use this API even when the regex 330 | /// engine doesn't directly return a `Match`. 
331 | /// 332 | /// ``` 333 | /// use regex_automata::{ 334 | /// nfa::thompson::pikevm::PikeVM, 335 | /// util::iter::Searcher, 336 | /// Match, Input, 337 | /// }; 338 | /// 339 | /// let re = PikeVM::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 340 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); 341 | /// 342 | /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); 343 | /// let mut it = Searcher::new(input); 344 | /// 345 | /// let expected = Some(Match::must(0, 0..10)); 346 | /// let got = it.advance(|input| { 347 | /// re.search(&mut cache, input, &mut caps); 348 | /// Ok(caps.get_match()) 349 | /// }); 350 | /// // Note that if we wanted to extract capturing group spans, we could 351 | /// // do that here with 'caps'. 352 | /// assert_eq!(expected, got); 353 | /// 354 | /// let expected = Some(Match::must(0, 11..21)); 355 | /// let got = it.advance(|input| { 356 | /// re.search(&mut cache, input, &mut caps); 357 | /// Ok(caps.get_match()) 358 | /// }); 359 | /// assert_eq!(expected, got); 360 | /// 361 | /// let expected = Some(Match::must(0, 22..32)); 362 | /// let got = it.advance(|input| { 363 | /// re.search(&mut cache, input, &mut caps); 364 | /// Ok(caps.get_match()) 365 | /// }); 366 | /// assert_eq!(expected, got); 367 | /// 368 | /// let expected = None; 369 | /// let got = it.advance(|input| { 370 | /// re.search(&mut cache, input, &mut caps); 371 | /// Ok(caps.get_match()) 372 | /// }); 373 | /// assert_eq!(expected, got); 374 | /// 375 | /// # Ok::<(), Box>(()) 376 | /// ``` 377 | #[inline] 378 | pub fn advance(&mut self, finder: F) -> Option 379 | where 380 | F: FnMut(&mut Input) -> Result, MatchError>, 381 | { 382 | match self.try_advance(finder) { 383 | Ok(m) => m, 384 | Err(err) => panic!( 385 | "unexpected regex find error: {}\n\ 386 | to handle find errors, use 'try' or 'search' methods", 387 | err, 388 | ), 389 | } 390 | } 391 | 392 | /// Return the next half match for a fallible search if one exists, and 393 
| /// advance to the next position. 394 | /// 395 | /// This is like `advance_half`, except it permits callers to handle errors 396 | /// during iteration. 397 | #[inline] 398 | pub fn try_advance_half(&mut self, mut finder: F) -> Result, MatchError> 399 | where 400 | F: FnMut(&mut Input) -> Result, MatchError>, 401 | { 402 | let mut m = match finder(&mut self.input)? { 403 | None => return Ok(None), 404 | Some(m) => m, 405 | }; 406 | if Some(m.offset()) == self.last_match_end { 407 | m = match self.handle_overlapping_empty_half_match(m, finder)? { 408 | None => return Ok(None), 409 | Some(m) => m, 410 | }; 411 | } 412 | self.input.set_start(m.offset()); 413 | self.last_match_end = Some(m.offset()); 414 | Ok(Some(m)) 415 | } 416 | 417 | /// Return the next match for a fallible search if one exists, and advance 418 | /// to the next position. 419 | /// 420 | /// This is like `advance`, except it permits callers to handle errors 421 | /// during iteration. 422 | #[inline] 423 | pub fn try_advance(&mut self, mut finder: F) -> Result, MatchError> 424 | where 425 | F: FnMut(&mut Input) -> Result, MatchError>, 426 | { 427 | let end = self.input.end(); 428 | let mut m = match finder(&mut self.input)? { 429 | None => return Ok(None), 430 | Some(m) => m, 431 | }; 432 | assert!(m.end() <= end); 433 | if m.is_empty() && Some(m.end()) == self.last_match_end { 434 | m = match self.handle_overlapping_empty_match(m, finder)? { 435 | None => return Ok(None), 436 | Some(m) => m, 437 | }; 438 | } 439 | self.input.set_start(m.end()); 440 | self.last_match_end = Some(m.end()); 441 | Ok(Some(m)) 442 | } 443 | 444 | /// Given a closure that executes a single search, return an iterator over 445 | /// all successive non-overlapping half matches. 446 | /// 447 | /// The iterator returned yields result values. If the underlying regex 448 | /// engine is configured to never return an error, consider calling 449 | /// [`TryHalfMatchesIter::infallible`] to convert errors into panics. 
450 | /// 451 | /// # Example 452 | /// 453 | /// This example shows how to use a `Searcher` to create a proper 454 | /// iterator over half matches. 455 | /// 456 | /// ``` 457 | /// use regex_automata::{ 458 | /// hybrid::dfa::DFA, 459 | /// util::iter::Searcher, 460 | /// HalfMatch, Input, 461 | /// }; 462 | /// 463 | /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; 464 | /// let mut cache = re.create_cache(); 465 | /// 466 | /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); 467 | /// let mut it = Searcher::new(input).into_half_matches_iter(|input| { 468 | /// re.try_search_fwd(&mut cache, input) 469 | /// }); 470 | /// 471 | /// let expected = Some(Ok(HalfMatch::must(0, 10))); 472 | /// assert_eq!(expected, it.next()); 473 | /// 474 | /// let expected = Some(Ok(HalfMatch::must(0, 21))); 475 | /// assert_eq!(expected, it.next()); 476 | /// 477 | /// let expected = Some(Ok(HalfMatch::must(0, 32))); 478 | /// assert_eq!(expected, it.next()); 479 | /// 480 | /// let expected = None; 481 | /// assert_eq!(expected, it.next()); 482 | /// 483 | /// # Ok::<(), Box>(()) 484 | /// ``` 485 | #[inline] 486 | pub fn into_half_matches_iter(self, finder: F) -> TryHalfMatchesIter 487 | where 488 | F: FnMut(&mut Input) -> Result, MatchError>, 489 | { 490 | TryHalfMatchesIter { it: self, finder } 491 | } 492 | 493 | /// Handles the special case of a match that begins where the previous 494 | /// match ended. Without this special handling, it'd be possible to get 495 | /// stuck where an empty match never results in forward progress. This 496 | /// also makes it more consistent with how presiding general purpose regex 497 | /// engines work. 
498 | #[cold] 499 | #[inline(never)] 500 | fn handle_overlapping_empty_half_match( 501 | &mut self, 502 | _: HalfMatch, 503 | mut finder: F, 504 | ) -> Result, MatchError> 505 | where 506 | F: FnMut(&mut Input) -> Result, MatchError>, 507 | { 508 | // Since we are only here when 'm.offset()' matches the offset of the 509 | // last match, it follows that this must have been an empty match. 510 | // Since we both need to make progress *and* prevent overlapping 511 | // matches, we discard this match and advance the search by 1. 512 | // 513 | // Note that this may start a search in the middle of a codepoint. The 514 | // regex engines themselves are expected to deal with that and not 515 | // report any matches within a codepoint if they are configured in 516 | // UTF-8 mode. 517 | self.input.set_start(self.input.start().checked_add(1).unwrap()); 518 | finder(&mut self.input) 519 | } 520 | 521 | /// Handles the special case of an empty match by ensuring that 1) the 522 | /// iterator always advances and 2) empty matches never overlap with other 523 | /// matches. 524 | /// 525 | /// (1) is necessary because we principally make progress by setting the 526 | /// starting location of the next search to the ending location of the last 527 | /// match. But if a match is empty, then this results in a search that does 528 | /// not advance and thus does not terminate. 529 | /// 530 | /// (2) is not strictly necessary, but makes intuitive sense and matches 531 | /// the presiding behavior of most general purpose regex engines. The 532 | /// "intuitive sense" here is that we want to report NON-overlapping 533 | /// matches. So for example, given the regex 'a|(?:)' against the haystack 534 | /// 'a', without the special handling, you'd get the matches [0, 1) and [1, 535 | /// 1), where the latter overlaps with the end bounds of the former. 
536 | /// 537 | /// Note that we mark this cold and forcefully prevent inlining because 538 | /// handling empty matches like this is extremely rare and does require 539 | /// quite a bit of code, comparatively. Keeping this code out of the main 540 | /// iterator function keeps it smaller and more amenable to inlining 541 | /// itself. 542 | #[cold] 543 | #[inline(never)] 544 | fn handle_overlapping_empty_match( 545 | &mut self, 546 | m: Match, 547 | mut finder: F, 548 | ) -> Result, MatchError> 549 | where 550 | F: FnMut(&mut Input) -> Result, MatchError>, 551 | { 552 | assert!(m.is_empty()); 553 | self.input.set_start(self.input.start().checked_add(1).unwrap()); 554 | finder(&mut self.input) 555 | } 556 | } 557 | 558 | impl Debug for Searcher { 559 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 560 | f.debug_struct("Searcher") 561 | .field("input", &self.input) 562 | .field("last_match_end", &self.last_match_end) 563 | .finish() 564 | } 565 | } 566 | 567 | /// An iterator over all non-overlapping half matches for a fallible search. 568 | /// 569 | /// The iterator yields a `Result` value until no more 570 | /// matches could be found. 571 | /// 572 | /// The type parameters are as follows: 573 | /// 574 | /// * `F` represents the type of a closure that executes the search. 575 | /// 576 | /// The lifetime parameters come from the [`Input`] type: 577 | /// 578 | /// * `'h` is the lifetime of the underlying haystack. 579 | /// 580 | /// When possible, prefer the iterators defined on the regex engine you're 581 | /// using. This tries to abstract over the regex engine and is thus a bit more 582 | /// unwieldy to use. 583 | /// 584 | /// This iterator is created by [`Searcher::into_half_matches_iter`]. 585 | pub struct TryHalfMatchesIter { 586 | it: Searcher, 587 | finder: F, 588 | } 589 | 590 | // impl TryHalfMatchesIter { 591 | // /// Return an infallible version of this iterator. 
592 | // /// 593 | // /// Any item yielded that corresponds to an error results in a panic. This 594 | // /// is useful if your underlying regex engine is configured in a way that 595 | // /// it is guaranteed to never return an error. 596 | // pub fn infallible(self) -> HalfMatchesIter { 597 | // HalfMatchesIter(self) 598 | // } 599 | 600 | // /// Returns the current `Input` used by this iterator. 601 | // /// 602 | // /// The `Input` returned is generally equivalent to the one used to 603 | // /// construct this iterator, but its start position may be different to 604 | // /// reflect the start of the next search to be executed. 605 | // pub fn input(&mut self) -> &mut Input { 606 | // self.it.input() 607 | // } 608 | // } 609 | 610 | impl Iterator for TryHalfMatchesIter 611 | where 612 | F: FnMut(&mut Input) -> Result, MatchError>, 613 | { 614 | type Item = Result; 615 | 616 | #[inline] 617 | fn next(&mut self) -> Option> { 618 | self.it.try_advance_half(&mut self.finder).transpose() 619 | } 620 | } 621 | 622 | impl core::fmt::Debug for TryHalfMatchesIter { 623 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 624 | f.debug_struct("TryHalfMatchesIter") 625 | .field("it", &self.it) 626 | .field("finder", &"") 627 | .finish() 628 | } 629 | } 630 | 631 | /// An iterator over all non-overlapping half matches for an infallible search. 632 | /// 633 | /// The iterator yields a [`HalfMatch`] value until no more matches could be 634 | /// found. 635 | /// 636 | /// The type parameters are as follows: 637 | /// 638 | /// * `F` represents the type of a closure that executes the search. 639 | /// 640 | /// The lifetime parameters come from the [`Input`] type: 641 | /// 642 | /// * `'h` is the lifetime of the underlying haystack. 643 | /// 644 | /// When possible, prefer the iterators defined on the regex engine you're 645 | /// using. This tries to abstract over the regex engine and is thus a bit more 646 | /// unwieldy to use. 
647 | /// 648 | /// This iterator is created by [`Searcher::into_half_matches_iter`] and 649 | /// then calling [`TryHalfMatchesIter::infallible`]. 650 | #[derive(Debug)] 651 | pub struct HalfMatchesIter(TryHalfMatchesIter); 652 | 653 | // impl HalfMatchesIter { 654 | // /// Returns the current `Input` used by this iterator. 655 | // /// 656 | // /// The `Input` returned is generally equivalent to the one used to 657 | // /// construct this iterator, but its start position may be different to 658 | // /// reflect the start of the next search to be executed. 659 | // pub fn input(&mut self) -> &mut Input { 660 | // self.0.it.input() 661 | // } 662 | // } 663 | 664 | // impl Iterator for HalfMatchesIter 665 | // where 666 | // F: FnMut(&mut Input) -> Result, MatchError>, 667 | // { 668 | // type Item = HalfMatch; 669 | 670 | // #[inline] 671 | // fn next(&mut self) -> Option { 672 | // match self.0.next()? { 673 | // Ok(m) => Some(m), 674 | // Err(err) => panic!( 675 | // "unexpected regex half find error: {}\n\ 676 | // to handle find errors, use 'try' or 'search' methods", 677 | // err, 678 | // ), 679 | // } 680 | // } 681 | // } 682 | 683 | // #[cfg(test)] 684 | // pub fn assert_eq( 685 | // mut iter1: impl Iterator, 686 | // mut iter2: impl Iterator, 687 | // ) { 688 | // let mut i = 0; 689 | // loop { 690 | // match (iter1.next(), iter2.next()) { 691 | // (None, None) => break, 692 | // (iter1, iter2) => assert_eq!(iter1, iter2, "{i}"), 693 | // } 694 | // i += 1; 695 | // } 696 | // } 697 | 698 | #[cfg(test)] 699 | pub fn prop_assert_eq( 700 | mut iter1: impl Iterator, 701 | mut iter2: impl Iterator, 702 | ) -> proptest::test_runner::TestCaseResult { 703 | let mut i = 0; 704 | let mut prev = None; 705 | loop { 706 | match (iter1.next(), iter2.next()) { 707 | (None, None) => break, 708 | (iter1, iter2) => { 709 | proptest::prop_assert_eq!(&iter1, &iter2, "i={}, prev={:?}", i, prev); 710 | prev = iter1; 711 | } 712 | } 713 | i += 1; 714 | } 715 | Ok(()) 716 | } 
717 | -------------------------------------------------------------------------------- /src/util/prefilter.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Defines a prefilter for accelerating regex searches. 3 | 4 | A prefilter can be created by building a [`Prefilter`] value. 5 | 6 | A prefilter represents one of the most important optimizations available for 7 | accelerating regex searches. The idea of a prefilter is to very quickly find 8 | candidate locations in a haystack where a regex _could_ match. Once a candidate 9 | is found, it is then intended for the regex engine to run at that position to 10 | determine whether the candidate is a match or a false positive. 11 | 12 | In the aforementioned description of the prefilter optimization also lay its 13 | demise. Namely, if a prefilter has a high false positive rate and it produces 14 | lots of candidates, then a prefilter can overall make a regex search slower. 15 | It can run more slowly because more time is spent ping-ponging between the 16 | prefilter search and the regex engine attempting to confirm each candidate as 17 | a match. This ping-ponging has overhead that adds up, and is exacerbated by 18 | a high false positive rate. 19 | 20 | Nevertheless, the optimization is still generally worth performing in most 21 | cases. Particularly given just how much throughput can be improved. (It is not 22 | uncommon for prefilter optimizations to improve throughput by one or two orders 23 | of magnitude.) 24 | 25 | Typically a prefilter is used to find occurrences of literal prefixes from a 26 | regex pattern, but this isn't required. A prefilter can be used to look for 27 | suffixes or even inner literals. 28 | 29 | Note that as of now, prefilters throw away information about which pattern 30 | each literal comes from. In other words, when a prefilter finds a match, 31 | there's no way to know which pattern (or patterns) it came from. 
Therefore, 32 | in order to confirm a match, you'll have to check all of the patterns by 33 | running the full regex engine. 34 | */ 35 | 36 | use log::debug; 37 | use regex_automata::MatchKind; 38 | use regex_syntax::hir::{literal, Hir}; 39 | 40 | /// Extracts all of the prefix literals from the given HIR expressions into a 41 | /// single `Seq`. The literals in the sequence are ordered with respect to the 42 | /// order of the given HIR expressions and consistent with the match semantics 43 | /// given. 44 | /// 45 | /// The sequence returned is "optimized." That is, they may be shrunk or even 46 | /// truncated according to heuristics with the intent of making them more 47 | /// useful as a prefilter. (Which translates to both using faster algorithms 48 | /// and minimizing the false positive rate.) 49 | /// 50 | /// Note that this erases any connection between the literals and which pattern 51 | /// (or patterns) they came from. 52 | /// 53 | /// The match kind given must correspond to the match semantics of the regex 54 | /// that is represented by the HIRs given. The match semantics may change the 55 | /// literal sequence returned. 
56 | pub(crate) fn prefixes(kind: MatchKind, hirs: &[H]) -> literal::Seq 57 | where 58 | H: core::borrow::Borrow, 59 | { 60 | let mut extractor = literal::Extractor::new(); 61 | extractor.kind(literal::ExtractKind::Prefix); 62 | 63 | let mut prefixes = literal::Seq::empty(); 64 | for hir in hirs { 65 | prefixes.union(&mut extractor.extract(hir.borrow())); 66 | } 67 | debug!( 68 | "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}", 69 | prefixes.len(), 70 | prefixes.is_exact(), 71 | prefixes 72 | ); 73 | match kind { 74 | MatchKind::All => { 75 | prefixes.sort(); 76 | prefixes.dedup(); 77 | } 78 | MatchKind::LeftmostFirst => { 79 | prefixes.optimize_for_prefix_by_preference(); 80 | } 81 | _ => unreachable!(), 82 | } 83 | debug!( 84 | "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}", 85 | prefixes.len(), 86 | prefixes.is_exact(), 87 | prefixes 88 | ); 89 | prefixes 90 | } 91 | -------------------------------------------------------------------------------- /src/util/primitives.rs: -------------------------------------------------------------------------------- 1 | use regex_automata::util::primitives::SmallIndex; 2 | use regex_automata::PatternID; 3 | 4 | #[derive(Clone, Debug)] 5 | pub(crate) struct SmallIndexIter { 6 | rng: core::ops::Range, 7 | } 8 | 9 | impl Iterator for SmallIndexIter { 10 | type Item = SmallIndex; 11 | 12 | fn next(&mut self) -> Option { 13 | if self.rng.start >= self.rng.end { 14 | return None; 15 | } 16 | let next_id = self.rng.start + 1; 17 | let id = core::mem::replace(&mut self.rng.start, next_id); 18 | // new_unchecked is OK since we asserted that the number of 19 | // elements in this iterator will fit in an ID at construction. 20 | Some(SmallIndex::new_unchecked(id)) 21 | } 22 | } 23 | 24 | macro_rules! 
index_type_impls { 25 | ($name:ident, $err:ident, $iter:ident, $withiter:ident) => { 26 | #[derive(Clone, Debug)] 27 | pub(crate) struct $iter(SmallIndexIter); 28 | 29 | impl $iter { 30 | fn new(len: usize) -> $iter { 31 | assert!( 32 | len <= $name::LIMIT, 33 | "cannot create iterator for {} when number of \ 34 | elements exceed {:?}", 35 | stringify!($name), 36 | $name::LIMIT, 37 | ); 38 | $iter(SmallIndexIter { rng: 0..len }) 39 | } 40 | } 41 | 42 | impl Iterator for $iter { 43 | type Item = $name; 44 | 45 | fn next(&mut self) -> Option<$name> { 46 | self.0.next().map(|id| $name::new_unchecked(id.as_usize())) 47 | } 48 | } 49 | 50 | /// An iterator adapter that is like std::iter::Enumerate, but attaches 51 | /// small index values instead. It requires `ExactSizeIterator`. At 52 | /// construction, it ensures that the index of each element in the 53 | /// iterator is representable in the corresponding small index type. 54 | #[derive(Clone, Debug)] 55 | pub(crate) struct $withiter { 56 | it: I, 57 | ids: $iter, 58 | } 59 | 60 | impl $withiter { 61 | fn new(it: I) -> $withiter { 62 | let ids = $iter::new(it.len()); 63 | $withiter { it, ids } 64 | } 65 | } 66 | 67 | impl Iterator for $withiter { 68 | type Item = ($name, I::Item); 69 | 70 | fn next(&mut self) -> Option<($name, I::Item)> { 71 | let item = self.it.next()?; 72 | // Number of elements in this iterator must match, according 73 | // to contract of ExactSizeIterator. 74 | let id = self.ids.next().unwrap(); 75 | Some((id, item)) 76 | } 77 | } 78 | }; 79 | } 80 | 81 | index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter); 82 | // index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter); 83 | 84 | /// A utility trait that defines a couple of adapters for making it convenient 85 | /// to access indices as "small index" types. 
We require ExactSizeIterator so 86 | /// that iterator construction can do a single check to make sure the index of 87 | /// each element is representable by its small index type. 88 | pub(crate) trait IteratorIndexExt: Iterator { 89 | fn with_pattern_ids(self) -> WithPatternIDIter 90 | where 91 | Self: Sized + ExactSizeIterator, 92 | { 93 | WithPatternIDIter::new(self) 94 | } 95 | 96 | // fn with_state_ids(self) -> WithStateIDIter 97 | // where 98 | // Self: Sized + ExactSizeIterator, 99 | // { 100 | // WithStateIDIter::new(self) 101 | // } 102 | } 103 | 104 | impl IteratorIndexExt for I {} 105 | -------------------------------------------------------------------------------- /src/util/sparse_set.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | This module defines a sparse set data structure. Its most interesting 3 | properties are: 4 | 5 | * They preserve insertion order. 6 | * Set membership testing is done in constant time. 7 | * Set insertion is done in constant time. 8 | * Clearing the set is done in constant time. 9 | 10 | The cost for doing this is that the capacity of the set needs to be known up 11 | front, and the elements in the set are limited to state identifiers. 12 | 13 | These sets are principally used when traversing an NFA state graph. This 14 | happens at search time, for example, in the PikeVM. It also happens during DFA 15 | determinization. 16 | */ 17 | 18 | use std::vec; 19 | use std::vec::Vec; 20 | 21 | use regex_automata::util::primitives::StateID; 22 | 23 | /// A sparse set used for representing ordered NFA states. 24 | /// 25 | /// This supports constant time addition and membership testing. Clearing an 26 | /// entire set can also be done in constant time. Iteration yields elements 27 | /// in the order in which they were inserted. 28 | /// 29 | /// The data structure is based on: https://research.swtch.com/sparse 30 | /// Note though that we don't actually use uninitialized memory. 
We generally 31 | /// reuse sparse sets, so the initial allocation cost is bearable. However, its 32 | /// other properties listed above are extremely useful. 33 | #[derive(Clone)] 34 | pub(crate) struct SparseSet { 35 | /// The number of elements currently in this set. 36 | len: usize, 37 | /// Dense contains the ids in the order in which they were inserted. 38 | dense: Vec, 39 | /// Sparse maps ids to their location in dense. 40 | /// 41 | /// A state ID is in the set if and only if 42 | /// sparse[id] < len && id == dense[sparse[id]]. 43 | /// 44 | /// Note that these are indices into 'dense'. It's a little weird to use 45 | /// StateID here, but we know our length can never exceed the bounds of 46 | /// StateID (enforced by 'resize') and StateID will be at most 4 bytes 47 | /// whereas a usize is likely double that in most cases. 48 | sparse: Vec, 49 | } 50 | 51 | impl SparseSet { 52 | /// Create a new sparse set with the given capacity. 53 | /// 54 | /// Sparse sets have a fixed size and they cannot grow. Attempting to 55 | /// insert more distinct elements than the total capacity of the set will 56 | /// result in a panic. 57 | /// 58 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. 59 | #[inline] 60 | pub(crate) fn new(capacity: usize) -> SparseSet { 61 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; 62 | set.resize(capacity); 63 | set 64 | } 65 | 66 | /// Resizes this sparse set to have the new capacity given. 67 | /// 68 | /// This set is automatically cleared. 69 | /// 70 | /// This panics if the capacity given is bigger than `StateID::LIMIT`.
71 | #[inline] 72 | pub(crate) fn resize(&mut self, new_capacity: usize) { 73 | assert!( 74 | new_capacity <= StateID::LIMIT, 75 | "sparse set capacity cannot excced {:?}", 76 | StateID::LIMIT 77 | ); 78 | self.clear(); 79 | self.dense.resize(new_capacity, StateID::ZERO); 80 | self.sparse.resize(new_capacity, StateID::ZERO); 81 | } 82 | 83 | /// Returns the capacity of this set. 84 | /// 85 | /// The capacity represents a fixed limit on the number of distinct 86 | /// elements that are allowed in this set. The capacity cannot be changed. 87 | #[inline] 88 | pub(crate) fn capacity(&self) -> usize { 89 | self.dense.len() 90 | } 91 | 92 | /// Returns the number of elements in this set. 93 | #[inline] 94 | pub(crate) fn len(&self) -> usize { 95 | self.len 96 | } 97 | 98 | /// Returns true if and only if this set is empty. 99 | #[inline] 100 | pub(crate) fn is_empty(&self) -> bool { 101 | self.len() == 0 102 | } 103 | 104 | /// Insert the state ID value into this set and return true if the given 105 | /// state ID was not previously in this set. 106 | /// 107 | /// This operation is idempotent. If the given value is already in this 108 | /// set, then this is a no-op. 109 | /// 110 | /// If more than `capacity` ids are inserted, then this panics. 111 | /// 112 | /// This is marked as inline(always) since the compiler won't inline it 113 | /// otherwise, and it's a fairly hot piece of code in DFA determinization. 114 | #[cfg_attr(feature = "perf-inline", inline(always))] 115 | pub(crate) fn insert(&mut self, id: StateID) -> bool { 116 | if self.contains(id) { 117 | return false; 118 | } 119 | 120 | let i = self.len(); 121 | assert!( 122 | i < self.capacity(), 123 | "{:?} exceeds capacity of {:?} when inserting {:?}", 124 | i, 125 | self.capacity(), 126 | id, 127 | ); 128 | // OK since i < self.capacity() and self.capacity() is guaranteed to 129 | // be <= StateID::LIMIT. 
130 | let index = StateID::new_unchecked(i); 131 | self.dense[index] = id; 132 | self.sparse[id] = index; 133 | self.len += 1; 134 | true 135 | } 136 | 137 | /// Returns true if and only if this set contains the given value. 138 | #[inline] 139 | pub(crate) fn contains(&self, id: StateID) -> bool { 140 | let index = self.sparse[id]; 141 | index.as_usize() < self.len() && self.dense[index] == id 142 | } 143 | 144 | /// Clear this set such that it has no members. 145 | #[inline] 146 | pub(crate) fn clear(&mut self) { 147 | self.len = 0; 148 | } 149 | 150 | #[inline] 151 | pub(crate) fn iter(&self) -> SparseSetIter<'_> { 152 | SparseSetIter(self.dense[..self.len()].iter()) 153 | } 154 | 155 | /// Returns the heap memory usage, in bytes, used by this sparse set. 156 | #[inline] 157 | pub(crate) fn memory_usage(&self) -> usize { 158 | self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE 159 | } 160 | } 161 | 162 | impl core::fmt::Debug for SparseSet { 163 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { 164 | let elements: Vec = self.iter().collect(); 165 | f.debug_tuple("SparseSet").field(&elements).finish() 166 | } 167 | } 168 | 169 | /// An iterator over all elements in a sparse set. 170 | /// 171 | /// The lifetime `'a` refers to the lifetime of the set being iterated over. 
172 | #[derive(Debug)] 173 | pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); 174 | 175 | impl<'a> Iterator for SparseSetIter<'a> { 176 | type Item = StateID; 177 | 178 | #[cfg_attr(feature = "perf-inline", inline(always))] 179 | fn next(&mut self) -> Option { 180 | self.0.next().copied() 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /src/util/tests.rs: -------------------------------------------------------------------------------- 1 | use crate::util::{decode, decode_last}; 2 | use crate::Input; 3 | use proptest::{prop_assert_eq, proptest}; 4 | use std::iter::successors; 5 | 6 | proptest! { 7 | #[test] 8 | fn test_decode(haystack: String) { 9 | let foo = ropey::Rope::from_str(&haystack); 10 | let mut input = Input::new(foo.slice(..)); 11 | let first_char = decode(&mut input, 0).transpose().unwrap(); 12 | let res: Vec<_> = successors(first_char.map(|c| (0, c)), |(i, c)| { 13 | decode(&mut input, i + c.len_utf8()) 14 | .transpose() 15 | .unwrap() 16 | .map(|c2| (i + c.len_utf8(), c2)) 17 | }) 18 | .collect(); 19 | let ref_chars: Vec<_> = haystack.char_indices().collect(); 20 | prop_assert_eq!(res, ref_chars); 21 | 22 | // let last_char = decode_last(&[], &mut input, 0).transpose().unwrap(); 23 | // let chars_rev = std::iter::successors(first_char.map(|c| (0, c)), |(i, c)| { 24 | // decode(&mut input, i + c.len_utf8()) 25 | // .transpose() 26 | // .unwrap() 27 | // .map(|c2| (i + c.len_utf8(), c2)) 28 | // }); 29 | } 30 | #[test] 31 | fn test_decode_last(haystack: String) { 32 | let foo = ropey::Rope::from_str(&haystack); 33 | let mut input = Input::new(foo.slice(..)); 34 | let end = haystack.len(); 35 | input.move_to(end); 36 | let first_char = decode_last(haystack[..input.haystack_off()].as_bytes(), &mut input, end).transpose().unwrap(); 37 | let res: Vec<_> = successors(first_char.map(|c| (end - c.len_utf8(), c)), |&(i, _)| { 38 | input.move_to(i); 39 | 
decode_last(haystack[..input.haystack_off()].as_bytes(), &mut input, i) 40 | .transpose() 41 | .unwrap() 42 | .map(|c2| (i - c2.len_utf8(), c2)) 43 | }) 44 | .collect(); 45 | let ref_chars: Vec<_> = haystack.char_indices().rev().collect(); 46 | prop_assert_eq!(res, ref_chars); 47 | 48 | // let last_char = decode_last(&[], &mut input, 0).transpose().unwrap(); 49 | // let chars_rev = std::iter::successors(first_char.map(|c| (0, c)), |(i, c)| { 50 | // decode(&mut input, i + c.len_utf8()) 51 | // .transpose() 52 | // .unwrap() 53 | // .map(|c2| (i + c.len_utf8(), c2)) 54 | // }); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/util/utf8.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Utilities for dealing with UTF-8. 3 | 4 | This module provides some UTF-8 related helper routines, including an 5 | incremental decoder. 6 | */ 7 | 8 | /// Returns true if and only if the given offset in the given bytes falls on a 9 | /// valid UTF-8 encoded codepoint boundary. 10 | /// 11 | /// If `bytes` is not valid UTF-8, then the behavior of this routine is 12 | /// unspecified. 13 | #[cfg_attr(feature = "perf-inline", inline(always))] 14 | pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool { 15 | match bytes.get(i) { 16 | // The position at the end of the bytes always represents an empty 17 | // string, which is a valid boundary. But anything after that doesn't 18 | // make much sense to call valid a boundary. 19 | None => i == bytes.len(), 20 | // Other than ASCII (where the most significant bit is never set), 21 | // valid starting bytes always have their most significant two bits 22 | // set, where as continuation bytes never have their second most 23 | // significant bit set. Therefore, this only returns true when bytes[i] 24 | // corresponds to a byte that begins a valid UTF-8 encoding of a 25 | // Unicode scalar value. 
26 | Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000, 27 | } 28 | } 29 | --------------------------------------------------------------------------------