├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches ├── bench.rs ├── bench_default.rs └── bench_dynamic.rs ├── src ├── dfa │ ├── minimizer.rs │ ├── mod.rs │ ├── prefix_searcher.rs │ └── trie.rs ├── error.rs ├── graph.rs ├── lib.rs ├── look.rs ├── nfa │ ├── has_looks.rs │ ├── mod.rs │ └── no_looks.rs ├── regex.rs ├── runner │ ├── anchored.rs │ ├── forward_backward.rs │ ├── mod.rs │ └── program.rs └── unicode.rs └── tests └── matches.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | *.swp 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | sudo: false 3 | 4 | # necessary for `travis-cargo coveralls --no-sudo` 5 | addons: 6 | apt: 7 | packages: 8 | - libcurl4-openssl-dev 9 | - libelf-dev 10 | - libdw-dev 11 | 12 | rust: 13 | - nightly 14 | 15 | before_script: 16 | - | 17 | pip install 'travis-cargo<0.2' --user && 18 | export PATH=$HOME/.local/bin:$PATH 19 | 20 | script: 21 | - | 22 | travis-cargo build && 23 | travis-cargo test && 24 | travis-cargo doc 25 | after_success: 26 | - travis-cargo doc-upload 27 | - travis-cargo coveralls --no-sudo 28 | 29 | env: 30 | global: 31 | - TRAVIS_CARGO_NIGHTLY_FEATURE="" 32 | - secure: 
FsSMY8g9LrxT3iLZPnTIAQNdIGmMNCYuwh0k1e8okpILNerRLSbDrCTtKgf4BA/wpRCYyZwb1O7K+8nd279WTaiWdQpDRGuY+JCXG91cKhXCZIydPWG3NAKD5p+WFLP16fwZkni54IAStwKymAiyZlcGN2/9lHy1KtbPUuStRuTKHIP6cEJfupVm0xcd0FuQZGDYY3h2YsgRz8JGW53nIwxRbv2Kti/4bwxCBOGgdo7nTYBidHfFWrMZQdKIrohBeRp0h1ALIGrS4ASrHeuk6wISOT57UhzlgQhZXstp18FIR3EyEbJyNzdu+0pvh4dVRxmSl5vCvtzyzph2szkJEYRgxz8JNDW3V4ya/rt/KURUxJNke3TCjcOV9uIxfnCQo0fuC2R4kO2zM7zen8K0gK77GmFfDoelM6f7KS0SL3ymxIQ7TR2eEHkXa4h+VrzuJcyGHiaruHolh6Hui6T5gSc6/1Y9Abovva95gguLmAMgtRAx19yaVBbRwr9VasC35bJBjFW9KhVCIptn2rZ0JUCNWscdnzKBI5c5ffv6ZNtCqapGf/rQyZuEZJgNzcr2BIKQwtUoyPojWu8i87FEAg9GIgjlDWBSR+dxMeehuXt1kuEsVXl1BZkqSrwFyK64PT+il8eKyyqsDZAfqmZfIi2TfRN292cZkFLfWew79Co= 33 | 34 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "regex_dfa" 3 | version = "0.5.0" 4 | authors = ["Joe Neeman "] 5 | description = "A crate for turning regexes into DFAs." 
6 | documentation = "http://jneem.github.io/regex-dfa" 7 | homepage = "http://jneem.github.io/regex-dfa" 8 | repository = "http://github.com/jneem/regex-dfa" 9 | readme = "README.md" 10 | license = "MIT/Apache-2.0" 11 | 12 | [dependencies] 13 | itertools = "0.4" 14 | lazy_static = "0.1" 15 | memchr = "0.1" 16 | num-traits = "0.1" 17 | range-map = "0.1.5" 18 | refinery = "0.1" 19 | regex-syntax = "0.2" 20 | utf8-ranges = "0.1" 21 | 22 | [dev-dependencies] 23 | matches = "0.1" 24 | quickcheck = "0.2" 25 | regex = "0.1.41" 26 | rand = "0.3" 27 | serde_json = "0.6" 28 | 29 | [[bench]] 30 | name = "dynamic" 31 | path = "benches/bench_dynamic.rs" 32 | test = false 33 | bench = true 34 | 35 | [[bench]] 36 | name = "default" 37 | path = "benches/bench_default.rs" 38 | test = true 39 | bench = true 40 | 41 | [[test]] 42 | name = "examples" 43 | path = "tests/matches.rs" 44 | 45 | [[test]] 46 | name = "crate" 47 | path = "src/lib.rs" 48 | 49 | [profile.bench] 50 | debug = true 51 | lto = true 52 | 53 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 The Rust Project Developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | regex_dfa 2 | ========= 3 | 4 | A crate for compiling regular expressions down to deterministic finite 5 | automata. 6 | 7 | [![Build status](https://travis-ci.org/jneem/regex-dfa.svg)](https://travis-ci.org/jneem/regex-dfa) 8 | [![Coverage Status](https://coveralls.io/repos/jneem/regex-dfa/badge.svg?branch=master&service=github)](https://coveralls.io/github/jneem/regex-dfa?branch=master) 9 | 10 | [Documentation](http://jneem.github.io/regex-dfa/regex_dfa/index.html) 11 | 12 | # Why? 13 | 14 | Some regular expression implementations (e.g. [rust's regex 15 | library](http://github.com/rust-lang/regex)) are based on simulating 16 | non-deterministic finite automata (NFAs). By turning NFAs into DFAs, we can 17 | get a speed boost, at the cost of some compilation time and memory usage. 
18 | [Preliminary benchmarks](http://bl.ocks.org/jneem/raw/3f08ade195796358d027/?data=%5B%7B%22x%22%3A%201506622%2C%20%22y%22%3A%201646883%2C%20%22bench%22%3A%20%22%28%3Fi%29Twain%22%2C%20%22ratio%22%3A%200.914832444077691%7D%2C%20%7B%22x%22%3A%2015187577%2C%20%22y%22%3A%20125978857%2C%20%22bench%22%3A%20%22%5Ba-q%5D%5B%5Eu-z%5D%7B13%7Dx%22%2C%20%22ratio%22%3A%200.12055655497811034%7D%2C%20%7B%22x%22%3A%201630324%2C%20%22y%22%3A%201631615%2C%20%22bench%22%3A%20%22Tom%7CSawyer%7CHuckleberry%7CFinn%22%2C%20%22ratio%22%3A%200.9992087594193483%7D%2C%20%7B%22x%22%3A%201501634%2C%20%22y%22%3A%20243419316%2C%20%22bench%22%3A%20%22.%7B0%2C2%7D%28Tom%7CSawyer%7CHuckleberry%7CFinn%29%22%2C%20%22ratio%22%3A%200.0061689188215449595%7D%2C%20%7B%22x%22%3A%201506470%2C%20%22y%22%3A%20295351074%2C%20%22bench%22%3A%20%22.%7B2%2C4%7D%28Tom%7CSawyer%7CHuckleberry%7CFinn%29%22%2C%20%22ratio%22%3A%200.005100607827821882%7D%2C%20%7B%22x%22%3A%201583452%2C%20%22y%22%3A%201724976%2C%20%22bench%22%3A%20%22Tom.%7B10%2C25%7Driver%7Criver.%7B10%2C25%7DTom%22%2C%20%22ratio%22%3A%200.9179559599669792%7D%2C%20%7B%22x%22%3A%201290763%2C%20%22y%22%3A%207366270%2C%20%22bench%22%3A%20%22%5B%5C%22%27%5D%5B%5E%5C%22%27%5D%7B0%2C30%7D%5B%3F%21%5C%5C.%5D%5B%5C%22%27%5D%22%2C%20%22ratio%22%3A%200.17522613208584534%7D%2C%20%7B%22x%22%3A%201571131%2C%20%22y%22%3A%20137662852%2C%20%22bench%22%3A%20%22%28%3Fi%29Tom%7CSawyer%7CHuckleberry%7CFinn%22%2C%20%22ratio%22%3A%200.011412890094707612%7D%2C%20%7B%22x%22%3A%201493996%2C%20%22y%22%3A%20131818244%2C%20%22bench%22%3A%20%22%28%5BA-Za-z%5Dawyer%7C%5BA-Za-z%5Dinn%29%5C%5Cs%22%2C%20%22ratio%22%3A%200.011333757412213746%7D%2C%20%7B%22x%22%3A%201499437%2C%20%22y%22%3A%2094501284%2C%20%22bench%22%3A%20%22%5C%5Cb%5C%5Cw%2Bnn%5C%5Cb%22%2C%20%22ratio%22%3A%200.015866842613482375%7D%2C%20%7B%22x%22%3A%201494150%2C%20%22y%22%3A%201620677%2C%20%22bench%22%3A%20%22Huck%5Ba-zA-Z%5D%2B%7CSaw%5Ba-zA-Z%5D%2B%22%2C%20%22ratio%22%3A%200.9219295393221475%7D%2C%20%7B%22x%22%3A%20
4604887%2C%20%22y%22%3A%2084880712%2C%20%22bench%22%3A%20%22%5C%5Cs%5Ba-zA-Z%5D%7B0%2C12%7Ding%5C%5Cs%22%2C%20%22ratio%22%3A%200.0542512767800534%7D%2C%20%7B%22x%22%3A%201657821%2C%20%22y%22%3A%201664874%2C%20%22bench%22%3A%20%22%5Ba-z%5Dshing%22%2C%20%22ratio%22%3A%200.9957636433748139%7D%2C%20%7B%22x%22%3A%20100143%2C%20%22y%22%3A%2099739%2C%20%22bench%22%3A%20%22Twain%22%2C%20%22ratio%22%3A%201.0040505719929014%7D%2C%20%7B%22x%22%3A%204877680%2C%20%22y%22%3A%2071127171%2C%20%22bench%22%3A%20%22%5Ba-zA-Z%5D%2Bing%22%2C%20%22ratio%22%3A%200.06857688744572732%7D%2C%20%7B%22x%22%3A%201584255%2C%20%22y%22%3A%2062666721%2C%20%22bench%22%3A%20%22%5C%5CbF%5C%5Cw%2Bn%5C%5Cb%22%2C%20%22ratio%22%3A%200.025280642974761677%7D%5D) 19 | show a substantial speed improvement over rust's default `regex` crate. 20 | 21 | # Limitations 22 | 23 | - Turning an NFA into a DFA can take a lot of memory, especially when unicode character classes are involved. 24 | - Subgroup captures are a bit tricky, and this crate does not handle them. 25 | - `regex_dfa` currently only works on nightly rust. 26 | 27 | # License 28 | 29 | `regex_dfa` is distributed under the MIT license and the Apache license (version 2.0). 30 | See LICENSE-APACHE and LICENSE-MIT for details. 31 | 32 | -------------------------------------------------------------------------------- /benches/bench.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. 2 | // Copyright 2015-2016 Joe Neeman. 3 | // 4 | // Licensed under the Apache License, Version 2.0 or the MIT license 6 | // , at your 7 | // option. This file may not be copied, modified, or distributed 8 | // except according to those terms. 
9 | #![allow(non_snake_case)] 10 | 11 | use std::iter::repeat; 12 | use rand::{Rng, thread_rng}; 13 | use test::Bencher; 14 | 15 | fn bench_assert_non_match(b: &mut Bencher, re: ::Regex, text: &str) { 16 | b.iter(|| if re.is_match(text) { panic!("match") }); 17 | } 18 | 19 | fn bench_assert_match(b: &mut Bencher, re: ::Regex, text: &str) { 20 | b.iter(|| if !re.is_match(text) { panic!("no match") }); 21 | } 22 | 23 | #[bench] 24 | fn compile_word_boundary(b: &mut Bencher) { 25 | b.iter(|| regex!(r"\btest\b")); 26 | } 27 | 28 | #[bench] 29 | fn literal(b: &mut Bencher) { 30 | let re = regex!("y"); 31 | let text = format!("{}y", repeat("x").take(50).collect::()); 32 | bench_assert_match(b, re, &text); 33 | } 34 | 35 | #[bench] 36 | fn longer_literal(b: &mut Bencher) { 37 | let re = regex!("foobar"); 38 | let text = format!("{}foobar", repeat("f").take(10000).collect::()); 39 | bench_assert_match(b, re, &text); 40 | } 41 | 42 | #[bench] 43 | fn longer_literal_no_regex(b: &mut Bencher) { 44 | let re = "foobar"; 45 | let text = format!("{}foobar", repeat("f").take(10000).collect::()); 46 | b.iter(|| text.find(re)); 47 | } 48 | 49 | #[bench] 50 | fn not_literal(b: &mut Bencher) { 51 | let re = regex!(".y"); 52 | let text = format!("{}y", repeat("x").take(10000).collect::()); 53 | b.bytes = 10000; 54 | bench_assert_match(b, re, &text); 55 | } 56 | 57 | #[bench] 58 | fn match_class(b: &mut Bencher) { 59 | let re = regex!("[abcdw]"); 60 | let text = format!("{}w", repeat("xxxx").take(20).collect::()); 61 | bench_assert_match(b, re, &text); 62 | } 63 | 64 | #[bench] 65 | fn match_class_in_range(b: &mut Bencher) { 66 | // 'b' is between 'a' and 'c', so the class range checking doesn't help. 
67 | let re = regex!("[ac]"); 68 | let text = format!("{}c", repeat("bbbb").take(20).collect::()); 69 | bench_assert_match(b, re, &text); 70 | } 71 | 72 | #[bench] 73 | fn match_class_unicode(b: &mut Bencher) { 74 | let re = regex!(r"\pL"); 75 | let text = format!("{}a", repeat("☃5☃5").take(20).collect::()); 76 | bench_assert_match(b, re, &text); 77 | } 78 | 79 | #[bench] 80 | fn anchored_literal_short_non_match(b: &mut Bencher) { 81 | let re = regex!("^zbc(d|e)"); 82 | let text = "abcdefghijklmnopqrstuvwxyz"; 83 | bench_assert_non_match(b, re, &text); 84 | } 85 | 86 | #[bench] 87 | fn anchored_literal_long_non_match(b: &mut Bencher) { 88 | let re = regex!("^zbc(d|e)"); 89 | let text: String = repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect(); 90 | bench_assert_non_match(b, re, &text); 91 | } 92 | 93 | #[bench] 94 | fn anchored_literal_short_match(b: &mut Bencher) { 95 | let re = regex!("^.bc(d|e)"); 96 | let text = "abcdefghijklmnopqrstuvwxyz"; 97 | bench_assert_match(b, re, text); 98 | } 99 | 100 | #[bench] 101 | fn anchored_literal_long_match(b: &mut Bencher) { 102 | let re = regex!("^.bc(d|e)"); 103 | let text: String = repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect(); 104 | bench_assert_match(b, re, &text); 105 | } 106 | 107 | #[bench] 108 | fn one_pass_short_a(b: &mut Bencher) { 109 | let re = regex!("^.bc(d|e)*$"); 110 | let text = "abcddddddeeeededd"; 111 | bench_assert_match(b, re, text); 112 | } 113 | 114 | #[bench] 115 | fn one_pass_short_a_not(b: &mut Bencher) { 116 | let re = regex!(".bc(d|e)*$"); 117 | let text = "abcddddddeeeededd"; 118 | bench_assert_match(b, re, text); 119 | } 120 | 121 | #[bench] 122 | fn one_pass_short_b(b: &mut Bencher) { 123 | let re = regex!("^.bc(?:d|e)*$"); 124 | let text = "abcddddddeeeededd"; 125 | bench_assert_match(b, re, text); 126 | } 127 | 128 | #[bench] 129 | fn one_pass_short_b_not(b: &mut Bencher) { 130 | let re = regex!(".bc(?:d|e)*$"); 131 | let text = "abcddddddeeeededd"; 132 | bench_assert_match(b, 
re, text); 133 | } 134 | 135 | #[bench] 136 | fn one_pass_long_prefix(b: &mut Bencher) { 137 | let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$"); 138 | let text = "abcdefghijklmnopqrstuvwxyz"; 139 | bench_assert_match(b, re, text); 140 | } 141 | 142 | #[bench] 143 | fn one_pass_long_prefix_not(b: &mut Bencher) { 144 | let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$"); 145 | let text = "abcdefghijklmnopqrstuvwxyz"; 146 | bench_assert_match(b, re, text); 147 | } 148 | 149 | #[bench] 150 | fn backtrack(b: &mut Bencher) { 151 | let re = regex!("a*b"); 152 | let text: String = repeat("aaaaaaaaaaaaaaaaaaaaaaaaaaaa").take(100).collect(); 153 | b.bytes = text.len() as u64; 154 | bench_assert_non_match(b, re, &text); 155 | } 156 | 157 | #[bench] 158 | fn skip(b: &mut Bencher) { 159 | let re = regex!("a[b-zA-Z]+a"); 160 | let text: String = repeat("aaaaaaaaaaaaaaaaaaaaaaaaaaaa").take(100).collect(); 161 | b.bytes = text.len() as u64; 162 | bench_assert_non_match(b, re, &text); 163 | } 164 | 165 | macro_rules! 
throughput( 166 | ($name:ident, $regex:expr, $size:expr) => ( 167 | #[bench] 168 | fn $name(b: &mut Bencher) { 169 | let text = gen_text($size); 170 | b.bytes = $size; 171 | let re = $regex; 172 | b.iter(|| if re.is_match(&text) { panic!("match") }); 173 | } 174 | ); 175 | ); 176 | 177 | fn easy0() -> ::Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 178 | fn easy1() -> ::Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") } 179 | fn medium() -> ::Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 180 | fn hard() -> ::Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 181 | 182 | fn gen_text(n: usize) -> String { 183 | let mut rng = thread_rng(); 184 | let mut bytes = rng.gen_ascii_chars().map(|n| n as u8).take(n) 185 | .collect::>(); 186 | for (i, b) in bytes.iter_mut().enumerate() { 187 | if i % 20 == 0 { 188 | *b = b'\n' 189 | } 190 | } 191 | String::from_utf8(bytes).unwrap() 192 | } 193 | 194 | throughput!(easy0_32, easy0(), 32); 195 | throughput!(easy0_1K, easy0(), 1<<10); 196 | throughput!(easy0_32K, easy0(), 32<<10); 197 | throughput!(easy0_1MB, easy0(), 1<<20); 198 | 199 | throughput!(easy1_32, easy1(), 32); 200 | throughput!(easy1_1K, easy1(), 1<<10); 201 | throughput!(easy1_32K, easy1(), 32<<10); 202 | throughput!(easy1_1MB, easy1(), 1<<20); 203 | 204 | throughput!(medium_32, medium(), 32); 205 | throughput!(medium_1K, medium(), 1<<10); 206 | throughput!(medium_32K,medium(), 32<<10); 207 | throughput!(medium_1MB, medium(), 1<<20); 208 | 209 | throughput!(hard_32, hard(), 32); 210 | throughput!(hard_1K, hard(), 1<<10); 211 | throughput!(hard_32K,hard(), 32<<10); 212 | throughput!(hard_1MB, hard(), 1<<20); 213 | 214 | -------------------------------------------------------------------------------- /benches/bench_default.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Rust Project Developers. 2 | // Copyright 2015-2016 Joe Neeman. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 or the MIT license 6 | // , at your 7 | // option. This file may not be copied, modified, or distributed 8 | // except according to those terms. 9 | 10 | #![feature(test)] 11 | 12 | extern crate rand; 13 | extern crate regex; 14 | extern crate regex_dfa; 15 | extern crate test; 16 | 17 | // Due to macro scoping rules, this definition only applies for the modules 18 | // defined below. Effectively, it allows us to use the same tests for both 19 | // native and dynamic regexes. 20 | macro_rules! regex( 21 | ($re:expr) => ( 22 | ::regex_dfa::Regex::new($re).unwrap() 23 | ); 24 | ); 25 | 26 | type Regex = ::regex_dfa::Regex; 27 | 28 | mod bench; 29 | -------------------------------------------------------------------------------- /benches/bench_dynamic.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | #![feature(test)] 10 | 11 | extern crate rand; 12 | extern crate regex; 13 | extern crate test; 14 | 15 | // Due to macro scoping rules, this definition only applies for the modules 16 | // defined below. Effectively, it allows us to use the same tests for both 17 | // native and dynamic regexes. 18 | macro_rules! regex( 19 | ($re:expr) => ( 20 | match ::regex::Regex::new($re) { 21 | Ok(re) => re, 22 | Err(err) => panic!("{}", err), 23 | } 24 | ); 25 | ); 26 | 27 | type Regex = ::regex::Regex; 28 | 29 | mod bench; 30 | -------------------------------------------------------------------------------- /src/dfa/minimizer.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use dfa::{Dfa, RetTrait}; 10 | use nfa::{Accept, StateIdx, StateSet}; 11 | use range_map::{RangeMultiMap, RangeSet}; 12 | use refinery::Partition; 13 | use std::collections::{HashSet, HashMap}; 14 | 15 | pub struct Minimizer { 16 | partition: Partition, 17 | distinguishers: HashSet, 18 | // The reversed transitions of the dfa. 19 | rev: Vec>, 20 | } 21 | 22 | impl Minimizer { 23 | // Partition the states according to 24 | // - when they accept, 25 | // - what they return if they do sometimes accept, and 26 | // - what set of bytes do we expect to see next. 27 | fn initial_partition(dfa: &Dfa) -> Vec> { 28 | let mut part: HashMap<(Accept, Option<&Ret>, RangeSet), Vec> = HashMap::new(); 29 | for (idx, st) in dfa.states.iter().enumerate() { 30 | let chars = st.transitions.to_range_set(); 31 | part.entry((st.accept, dfa.ret(idx), chars)).or_insert_with(Vec::new).push(idx); 32 | } 33 | part.into_iter().map(|x| x.1).collect() 34 | } 35 | 36 | // Refine the current partition based on the fact that everything in `splitter` is distinct 37 | // from everything not in it. 
38 | fn refine(&mut self, splitter: &[StateIdx]) { 39 | let dists = &mut self.distinguishers; 40 | 41 | self.partition.refine_with_callback(splitter, |p, int_idx, diff_idx| { 42 | if dists.contains(&int_idx) || p.part(diff_idx).len() < p.part(int_idx).len() { 43 | dists.insert(diff_idx); 44 | } else { 45 | dists.insert(int_idx); 46 | } 47 | }); 48 | } 49 | 50 | fn next_distinguisher(&mut self) -> Option { 51 | let maybe_elt = self.distinguishers.iter().next().cloned(); 52 | if let Some(elt) = maybe_elt { 53 | self.distinguishers.remove(&elt); 54 | } 55 | maybe_elt 56 | } 57 | 58 | fn get_input_sets(&mut self, part_idx: usize) -> Vec { 59 | let inputs: Vec<_> = self.partition.part(part_idx) 60 | .iter() 61 | .flat_map(|s| self.rev[*s].ranges_values().cloned()) 62 | .collect(); 63 | if inputs.is_empty() { 64 | return Vec::new(); 65 | } 66 | 67 | let inputs = RangeMultiMap::from_vec(inputs); 68 | let mut sets: Vec = inputs.group() 69 | .ranges_values() 70 | .map(|&(_, ref x)| x.clone()) 71 | .collect(); 72 | for set in &mut sets { 73 | set.sort(); 74 | } 75 | sets.sort(); 76 | sets.dedup(); 77 | sets 78 | } 79 | 80 | fn compute_partition(&mut self) { 81 | while let Some(dist) = self.next_distinguisher() { 82 | let sets = self.get_input_sets(dist); 83 | 84 | for set in &sets { 85 | self.refine(set); 86 | } 87 | } 88 | } 89 | 90 | pub fn minimize(dfa: &Dfa) -> Dfa { 91 | let mut min = Minimizer::new(dfa); 92 | 93 | min.compute_partition(); 94 | 95 | let mut ret = Dfa::new(); 96 | 97 | // We need to re-index the states: build a map that maps old indices to 98 | // new indices. 99 | let mut old_state_to_new = vec![0; dfa.num_states()]; 100 | for part in min.partition.iter() { 101 | // This unwrap is safe because we don't allow any empty sets into the partition. 
102 | let rep_idx = *part.iter().next().unwrap(); 103 | ret.states.push(dfa.states[rep_idx].clone()); 104 | 105 | for &state in part.iter() { 106 | old_state_to_new[state] = ret.states.len() - 1; 107 | } 108 | } 109 | 110 | ret.map_states(|s: StateIdx| old_state_to_new[s]); 111 | ret.init = dfa.init.iter() 112 | .map(|x| x.map(|s: StateIdx| old_state_to_new[s])) 113 | .collect(); 114 | ret 115 | } 116 | 117 | fn new(dfa: &Dfa) -> Minimizer { 118 | let init = Minimizer::initial_partition(dfa); 119 | let part = Partition::new(init.into_iter().map(|set| set.into_iter()), dfa.num_states()); 120 | 121 | // According to Hopcroft's algorithm, we're allowed to leave out one of the distinguishers 122 | // (at least, as long as it isn't a set of accepting states). Choose the one with the 123 | // most states to leave out. 124 | let mut dists: HashSet = (0..part.num_parts()).collect(); 125 | let worst = (0..dists.len()) 126 | .filter(|i| dfa.states[part.part(*i)[0]].accept == Accept::Never) 127 | .max_by_key(|i| part.part(*i).len()); 128 | if let Some(worst) = worst { 129 | dists.remove(&worst); 130 | } 131 | 132 | Minimizer { 133 | partition: part, 134 | distinguishers: dists, 135 | rev: dfa.reversed_transitions(), 136 | } 137 | } 138 | } 139 | 140 | 141 | -------------------------------------------------------------------------------- /src/dfa/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 

mod trie;
mod prefix_searcher;
mod minimizer;

use dfa::minimizer::Minimizer;
use dfa::prefix_searcher::PrefixSearcher;
use graph::Graph;
use look::Look;
use itertools::Itertools;
use nfa::{Accept, StateIdx};
use range_map::{RangeMap, RangeMultiMap};
use refinery::Partition;
use runner::program::TableInsts;
use std;
use std::fmt::{Debug, Formatter};
use std::hash::Hash;
use std::mem;
use std::u32;

pub use dfa::prefix_searcher::PrefixPart;

/// A single DFA state: its outgoing byte transitions, when it accepts, and
/// what value it returns on acceptance.
#[derive(Clone, PartialEq, Debug)]
pub struct State<Ret> {
    pub transitions: RangeMap<u8, StateIdx>,
    pub accept: Accept,
    pub ret: Option<Ret>,
}

impl<Ret> State<Ret> {
    /// Creates a state with no outgoing transitions.
    pub fn new(accept: Accept, ret: Option<Ret>) -> State<Ret> {
        State {
            transitions: RangeMap::new(),
            accept: accept,
            ret: ret,
        }
    }
}

/// Marker trait for types that a DFA can return on a match; blanket-implemented
/// for everything with the required bounds.
pub trait RetTrait: Clone + Copy + Debug + Eq + Hash {}
impl<T: Clone + Copy + Debug + Eq + Hash> RetTrait for T {}

#[derive(Clone, PartialEq)]
pub struct Dfa<Ret> {
    states: Vec<State<Ret>>,

    /// This is a vector of length `Look::num()` containing all possible starting positions.
    ///
    /// `init[Look::Boundary]` is the starting position if we are at the beginning of the
    /// input.
    ///
    /// `init[Look::Full]` is the default starting position.
    ///
    /// All other positions in `init` are only used if we are specifically asked to start
    /// there; this is mainly useful in the forward-backward engine.
    pub init: Vec<Option<StateIdx>>,
}

impl<Ret: RetTrait> Dfa<Ret> {
    /// Returns a `Dfa` with no states.
    pub fn new() -> Dfa<Ret> {
        Dfa {
            states: Vec::new(),
            init: vec![None; Look::num()],
        }
    }

    /// Returns the number of states.
    pub fn num_states(&self) -> usize {
        self.states.len()
    }

    /// Appends a fresh state (with no transitions) and returns its index.
    pub fn add_state(&mut self, accept: Accept, ret: Option<Ret>) -> StateIdx {
        self.states.push(State::new(accept, ret));
        self.states.len() - 1
    }

    /// Replaces the outgoing transitions of state `from`.
    pub fn set_transitions(&mut self, from: StateIdx, transitions: RangeMap<u8, StateIdx>) {
        self.states[from].transitions = transitions;
    }

    /// The starting state associated with look-behind `look`, if any.
    pub fn init_state(&self, look: Look) -> Option<StateIdx> {
        self.init[look.as_usize()]
    }

    pub fn init_at_start(&self) -> Option<StateIdx> {
        self.init_state(Look::Boundary)
    }

    pub fn init_otherwise(&self) -> Option<StateIdx> {
        self.init_state(Look::Full)
    }

    /// Returns true if this `Dfa` only matches things at the beginning of the input.
    pub fn is_anchored(&self) -> bool {
        self.init_otherwise().is_none() && self.init_at_start().is_some()
    }

    /// Get transitions from a given state.
    pub fn transitions(&self, state: StateIdx) -> &RangeMap<u8, StateIdx> {
        &self.states[state].transitions
    }

    /// Returns the conditions under which the given state accepts.
    pub fn accept(&self, state: StateIdx) -> &Accept {
        &self.states[state].accept
    }

    /// The value that will be returned if we accept in state `state`.
    pub fn ret(&self, state: StateIdx) -> Option<&Ret> {
        self.states[state].ret.as_ref()
    }

    /// Changes the return value.
    pub fn map_ret<T: RetTrait, F: FnMut(Ret) -> T>(self, mut f: F) -> Dfa<T> {
        let mut ret: Dfa<T> = Dfa::new();
        ret.init = self.init;

        for st in self.states {
            let new_st = State {
                transitions: st.transitions,
                accept: st.accept,
                ret: st.ret.map(&mut f),
            };
            ret.states.push(new_st);
        }
        ret
    }

    /// Returns an equivalent DFA with a minimal number of states.
    ///
    /// Uses Hopcroft's algorithm.
140 | fn minimize(&self) -> Dfa { 141 | Minimizer::minimize(self) 142 | } 143 | 144 | /// Returns the transitions of this automaton, reversed. 145 | fn reversed_transitions(&self) -> Vec> { 146 | let mut ret = vec![RangeMultiMap::new(); self.states.len()]; 147 | 148 | for (source, st) in self.states.iter().enumerate() { 149 | for &(range, target) in st.transitions.ranges_values() { 150 | ret[target].insert(range, source); 151 | } 152 | } 153 | 154 | ret 155 | } 156 | 157 | /// Returns a set of strings that match the beginning of this `Dfa`. 158 | /// 159 | /// If the set is non-empty, every match of this `Dfa` is guaranteed to start with one of these 160 | /// strings. 161 | pub fn prefix_strings(&self) -> Vec { 162 | // It might seem silly to look for prefixes starting at the anchored state, but it's useful 163 | // for forward-backward matching. In cases where the regex is honestly anchored, we won't 164 | // ask to make a prefix anyway. 165 | if let Some(state) = self.init_state(Look::Boundary) { 166 | PrefixSearcher::extract(self, state) 167 | } else { 168 | Vec::new() 169 | } 170 | } 171 | 172 | /* 173 | pub fn critical_strings(&self) -> Vec<(Vec, StateIdx)> { 174 | unimplemented!(); 175 | } 176 | */ 177 | 178 | // Finds the bytes that are treated equivalently by this Dfa. 179 | // 180 | // Returns a Vec of length 256 such that vec[i] == vec[j] when i and j are two equivalent 181 | // bytes. Also returns the log of the number of classes, rounded up. 
182 | fn byte_equivalence_classes(&self) -> (Vec, u32) { 183 | let mut part = Partition::new(Some(0..256).into_iter(), 256); 184 | let mut buf = Vec::with_capacity(256); 185 | 186 | for st in &self.states { 187 | let group = st.transitions.keys_values().group_by_lazy(|x| x.1); 188 | for (_, keys_values) in &group { 189 | buf.clear(); 190 | for (key, _) in keys_values { 191 | buf.push(key as usize); 192 | } 193 | part.refine(&buf); 194 | } 195 | } 196 | 197 | let mut ret = vec![0; 256]; 198 | for (i, p) in part.iter().enumerate() { 199 | for &x in p { 200 | ret[x] = i as u8; 201 | } 202 | } 203 | let size = (part.num_parts() - 1) as u32; 204 | 205 | (ret, 32 - size.leading_zeros()) 206 | } 207 | 208 | /// Compiles this `Dfa` into instructions for execution. 209 | pub fn compile(&self) -> TableInsts { 210 | let (byte_class, log_num_classes) = self.byte_equivalence_classes(); 211 | 212 | let mut table = vec![u32::MAX; self.num_states() << log_num_classes]; 213 | let accept: Vec> = self.states.iter() 214 | .map(|st| if st.accept == Accept::Always { st.ret } else { None }) 215 | .collect(); 216 | let accept_at_eoi: Vec> = self.states.iter() 217 | .map(|st| if st.accept != Accept::Never { st.ret } else { None }) 218 | .collect(); 219 | 220 | for (idx, st) in self.states.iter().enumerate() { 221 | for (ch, &tgt_state) in st.transitions.keys_values() { 222 | let class = byte_class[ch as usize]; 223 | table[(idx << log_num_classes) + class as usize] = tgt_state as u32; 224 | } 225 | } 226 | 227 | TableInsts { 228 | log_num_classes: log_num_classes, 229 | byte_class: byte_class, 230 | accept: accept, 231 | accept_at_eoi: accept_at_eoi, 232 | table: table, 233 | } 234 | } 235 | 236 | /// Finds an equivalent DFA with the minimal number of states. 237 | pub fn optimize(self) -> Dfa { 238 | let mut ret = self.minimize(); 239 | ret.sort_states(); 240 | ret 241 | } 242 | 243 | /// Deletes any transitions that return to the initial state. 
244 | /// 245 | /// This results in a new Dfa with the following properties: 246 | /// - if the original Dfa has a match then the new Dfa also has a match that ends in the same 247 | /// position (and vice versa), and 248 | /// - the new Dfa doesn't need to backtrack to find matches: if it fails then it can be 249 | /// restarted at the same position it failed in. 250 | /// 251 | /// The reason for this method is that it makes prefixes more effective: where the original Dfa 252 | /// would just loop back to the start state, the new Dfa will signal a failure. Then we can use 253 | /// a `Prefix` to scan ahead for a good place to resume matching. 254 | /// 255 | /// # Panics 256 | /// - if `self` is not anchored. 257 | pub fn cut_loop_to_init(mut self) -> Dfa { 258 | if !self.is_anchored() { 259 | panic!("only anchored Dfas can be cut"); 260 | } 261 | 262 | // The unwrap is safe because we just checked that we are anchored. 263 | let init = self.init_at_start().unwrap(); 264 | for st in &mut self.states { 265 | st.transitions.retain_values(|x| *x != init); 266 | } 267 | self 268 | } 269 | 270 | fn map_states StateIdx>(&mut self, mut map: F) { 271 | for st in &mut self.states { 272 | st.transitions.map_values(|x| map(*x)); 273 | } 274 | let init: Vec<_> = self.init.iter().map(|x| x.map(&mut map)).collect(); 275 | self.init = init; 276 | } 277 | 278 | /// Sorts states in depth-first alphabetical order. 279 | /// 280 | /// This has the following advantages: 281 | /// - the construction of a `Dfa` becomes deterministic: without sorting, the states aren't in 282 | /// deterministic order because `minimize` using hashing. 283 | /// - better locality: after sorting, many transitions just go straight to the next state. 284 | /// - we prune unreachable states. 285 | fn sort_states(&mut self) { 286 | let sorted = self.dfs_order(self.init.iter().filter_map(|x| *x)); 287 | 288 | // Not every old state will necessary get mapped to a new one (unreachable states won't). 
289 | let mut state_map: Vec> = vec![None; self.states.len()]; 290 | let mut old_states = vec![State::new(Accept::Never, None); self.states.len()]; 291 | mem::swap(&mut old_states, &mut self.states); 292 | 293 | for (new_idx, old_idx) in sorted.into_iter().enumerate() { 294 | state_map[old_idx] = Some(new_idx); 295 | mem::swap(&mut old_states[old_idx], &mut self.states[new_idx]); 296 | } 297 | 298 | // Fix the transitions and initialization to point to the new states. The `unwrap` here is 299 | // basically the assertion that all reachable states should be mapped to new states. 300 | self.map_states(|s| state_map[s].unwrap()); 301 | } 302 | 303 | /* 304 | // Finds all the transitions between states that only match a single byte. 305 | fn single_byte_transitions(&self) -> HashMap<(StateIdx, StateIdx), u8> { 306 | use std::collections::hash_map::Entry::*; 307 | 308 | let mut ret = HashMap::new(); 309 | let mut seen = HashSet::new(); 310 | for (src_idx, st) in self.states.iter().enumerate() { 311 | for &(range, tgt_idx) in st.transitions.ranges_values() { 312 | if range.start == range.end && !seen.contains(&(src_idx, tgt_idx)) { 313 | match ret.entry((src_idx, tgt_idx)) { 314 | Occupied(e) => { 315 | e.remove(); 316 | seen.insert((src_idx, tgt_idx)); 317 | }, 318 | Vacant(e) => { e.insert(range.start); }, 319 | } 320 | } 321 | } 322 | } 323 | ret 324 | } 325 | 326 | // Finds all the single-byte transitions that must be traversed in order to get to an accepting 327 | // state. 328 | fn mandatory_single_byte_transitions(&self, max_steps: usize) -> Vec<(StateIdx, StateIdx, u8)> { 329 | let map = self.single_byte_transitions(); 330 | let interesting_bytes: HashSet = map.values().cloned().collect(); 331 | 332 | // In order to get from the initial state to state i, we need to see all the bytes in 333 | // mandatory_bytes[i] at least once. (At least, that's the goal of mandatory_bytes; we 334 | // start out with too many elements in it and gradually remove them.) 
335 | let mut mandatory_bytes = vec![interesting_bytes.clone(); self.num_states()]; 336 | mandatory_bytes[0] = HashSet::new(); 337 | 338 | let mut visited = HashSet::::new(); 339 | let mut active = HashSet::::new(); 340 | let mut next = HashSet::::new(); 341 | next.insert(0); 342 | let mut steps_left = max_steps; 343 | 344 | fn intersect(a: &mut HashSet, b: &HashSet) -> bool { 345 | let old_size = a.len(); 346 | *a = a.intersection(b).cloned().collect(); 347 | a.len() < old_size 348 | } 349 | 350 | while steps_left > 0 { 351 | steps_left -= 1; 352 | mem::swap(&mut active, &mut next); 353 | next.clear(); 354 | 355 | for &src in &active { 356 | // If we found an accepting state, keep it in the active set but don't go any 357 | // further. 358 | if self.accept(src) != &Accept::Never { 359 | next.insert(src); 360 | continue; 361 | } 362 | 363 | visited.insert(src); 364 | for tgt in self.transitions(src).ranges_values().map(|x| x.1).dedup() { 365 | let mut bytes = mandatory_bytes[src].clone(); 366 | if let Some(b) = map.get(&(src, tgt)) { 367 | bytes.insert(*b); 368 | } 369 | if intersect(&mut mandatory_bytes[tgt], &bytes) || !visited.contains(&tgt) { 370 | next.insert(tgt); 371 | } 372 | } 373 | } 374 | } 375 | 376 | let critical_bytes = next.into_iter() 377 | .fold(interesting_bytes, 378 | |x, state| x.intersection(&mandatory_bytes[state]).cloned().collect()); 379 | 380 | let mut ret: Vec<_> = map.into_iter() 381 | .filter(|&(pair, byte)| critical_bytes.contains(&byte) && visited.contains(&pair.0)) 382 | .map(|(pair, byte)| (pair.0, pair.1, byte)) 383 | .collect(); 384 | ret.sort(); 385 | ret 386 | } 387 | */ 388 | } 389 | 390 | impl Debug for Dfa { 391 | fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { 392 | try!(f.write_fmt(format_args!("Dfa ({} states):\n", self.states.len()))); 393 | 394 | try!(f.write_fmt(format_args!("Init: {:?}\n", self.init))); 395 | 396 | for (st_idx, st) in self.states.iter().enumerate().take(40) { 397 | 
try!(f.write_fmt(format_args!("\tState {} (accepting: {:?}):\n", st_idx, st.accept))); 398 | if let Some(ref ret) = st.ret { 399 | try!(f.write_fmt(format_args!("\t\t{:?}\n", ret))); 400 | } 401 | 402 | if !st.transitions.is_empty() { 403 | try!(f.write_str("\t\tTransitions:\n")); 404 | // Cap it at 5 transitions, since it gets unreadable otherwise. 405 | for &(range, target) in st.transitions.ranges_values().take(5) { 406 | try!(f.write_fmt(format_args!("\t\t\t{} -- {} => {}\n", 407 | range.start, range.end, target))); 408 | } 409 | if st.transitions.num_ranges() > 5 { 410 | try!(f.write_str("\t\t\t...\n")); 411 | } 412 | } 413 | } 414 | if self.states.len() > 40 { 415 | try!(f.write_fmt(format_args!("\t...({} more states)\n", self.states.len() - 40))); 416 | } 417 | Ok(()) 418 | } 419 | } 420 | 421 | #[cfg(test)] 422 | pub mod tests { 423 | use dfa::*; 424 | use itertools::Itertools; 425 | use look::Look; 426 | use nfa::{Accept, Nfa, StateIdx}; 427 | use range_map::{Range, RangeMap}; 428 | use std::usize; 429 | 430 | // Creates a non-backtracking dfa from a regex string. 
431 | pub fn make_dfa_bounded(re: &str, max_states: usize) -> ::Result> { 432 | let nfa = try!(Nfa::from_regex(re)); 433 | let nfa = nfa.remove_looks(); 434 | println!("after remove_looks: {:?}", nfa); 435 | let nfa = try!(nfa.byte_me(max_states)); 436 | println!("after byte: {:?}", nfa); 437 | 438 | let dfa = try!(nfa.determinize(max_states)); 439 | Ok(dfa.optimize()) 440 | } 441 | 442 | pub fn make_dfa(re: &str) -> ::Result> { 443 | make_dfa_bounded(re, usize::MAX) 444 | } 445 | 446 | pub fn make_anchored(re: &str) -> Dfa<(Look, u8)> { 447 | let nfa = Nfa::from_regex(re).unwrap() 448 | .remove_looks() 449 | .byte_me(usize::MAX).unwrap() 450 | .anchor(usize::MAX).unwrap(); 451 | 452 | nfa.determinize(usize::MAX).unwrap() 453 | .optimize() 454 | .cut_loop_to_init() 455 | .optimize() 456 | } 457 | 458 | pub fn trans_dfa_anchored(size: usize, trans: &[(StateIdx, StateIdx, Range)]) 459 | -> Dfa<(Look, u8)> { 460 | let mut ret = Dfa::new(); 461 | for _ in 0..size { 462 | ret.add_state(Accept::Never, None); 463 | } 464 | for (src, trans) in trans.iter().group_by(|x| x.0) { 465 | let rm: RangeMap = trans.into_iter() 466 | .map(|x| (x.2, x.1)) 467 | .collect(); 468 | ret.set_transitions(src, rm); 469 | } 470 | ret 471 | } 472 | 473 | #[test] 474 | fn test_anchored_dfa_simple() { 475 | let dfa = make_anchored("a"); 476 | let mut tgt = trans_dfa_anchored(2, &[(0, 1, Range::new(b'a', b'a'))]); 477 | tgt.init[Look::Boundary.as_usize()] = Some(0); 478 | tgt.states[1].accept = Accept::Always; 479 | tgt.states[1].ret = Some((Look::Full, 0)); 480 | 481 | assert_eq!(dfa, tgt); 482 | } 483 | 484 | #[test] 485 | fn test_forward_backward_simple() { 486 | // TODO 487 | } 488 | 489 | #[test] 490 | fn test_anchored_dfa_anchored_end() { 491 | let dfa = make_anchored("a$"); 492 | let mut tgt = trans_dfa_anchored(2, &[(0, 1, Range::new(b'a', b'a')), 493 | (1, 1, Range::new(b'a', b'a'))]); 494 | tgt.init[Look::Boundary.as_usize()] = Some(0); 495 | tgt.states[1].accept = Accept::AtEoi; 496 | 
tgt.states[1].ret = Some((Look::Boundary, 0)); 497 | 498 | assert_eq!(dfa, tgt); 499 | } 500 | 501 | #[test] 502 | fn test_anchored_dfa_literal_prefix() { 503 | let dfa = make_anchored("abc[A-z]"); 504 | let pref = dfa.prefix_strings().into_iter().map(|p| p.0).collect::>(); 505 | assert_eq!(pref, vec!["abc".as_bytes()]); 506 | } 507 | 508 | #[test] 509 | fn test_minimize() { 510 | let auto = make_dfa("a*?b*?").unwrap(); 511 | // 1, because our highest-priority match is an empty string. 512 | assert_eq!(auto.states.len(), 1); 513 | 514 | let auto = make_dfa(r"^a").unwrap(); 515 | assert_eq!(auto.states.len(), 2); 516 | 517 | let mut auto = make_dfa("[cgt]gggtaaa|tttaccc[acg]").unwrap(); 518 | // Since `minimize` is non-deterministic (involving random hashes), run this a bunch of 519 | // times. 520 | for _ in 0..100 { 521 | auto = auto.optimize(); 522 | assert_eq!(auto.states.len(), 16); 523 | } 524 | } 525 | 526 | #[test] 527 | fn test_class_normalized() { 528 | let mut re = make_dfa("[abcdw]").unwrap(); 529 | re.sort_states(); 530 | assert_eq!(re.states.len(), 2); 531 | assert_eq!(re.states[0].transitions.num_ranges(), 2) 532 | } 533 | 534 | #[test] 535 | fn test_max_states() { 536 | assert!(make_dfa_bounded("foo", 3).is_err()); 537 | assert!(make_dfa_bounded("foo", 4).is_ok()); 538 | } 539 | 540 | #[test] 541 | fn test_adjacent_predicates() { 542 | assert!(make_dfa_bounded(r"\btest\b\B", 100).unwrap().states.is_empty()); 543 | assert!(make_dfa_bounded(r"\btest\B\b", 100).unwrap().states.is_empty()); 544 | assert!(make_dfa_bounded(r"test1\b\Btest2", 100).unwrap().states.is_empty()); 545 | } 546 | 547 | #[test] 548 | fn test_syntax_error() { 549 | assert!(make_dfa_bounded("(abc", 10).is_err()); 550 | } 551 | 552 | #[test] 553 | fn match_priority() { 554 | macro_rules! 
eq { 555 | ($re1:expr, $re2:expr) => { 556 | { 557 | let dfa1 = make_dfa($re1).unwrap(); 558 | let dfa2 = make_dfa($re2).unwrap(); 559 | assert_eq!(dfa1, dfa2); 560 | } 561 | }; 562 | } 563 | eq!("(a|aa)", "a"); 564 | eq!("abcd*?", "abc"); 565 | //eq!("a*?", ""); // TODO: figure out how empty regexes should behave 566 | } 567 | 568 | // TODO: add a test checking that minimize() doesn't clobber return values. 569 | 570 | /* 571 | #[test] 572 | fn critical_transitions() { 573 | fn crit(max_steps: usize, re: &str, answer: &[(StateIdx, StateIdx, u8)]) { 574 | let dfa = make_dfa(re).unwrap(); 575 | println!("{:?}", dfa); 576 | assert_eq!(&dfa.mandatory_single_byte_transitions(max_steps)[..], answer); 577 | } 578 | 579 | fn crit_anchored(max_steps: usize, re: &str, answer: &[(StateIdx, StateIdx, u8)]) { 580 | let dfa = make_anchored(re); 581 | println!("{:?}", dfa); 582 | assert_eq!(&dfa.mandatory_single_byte_transitions(max_steps)[..], answer); 583 | } 584 | 585 | crit(10, "a", &[(0, 1, b'a')]); 586 | crit(10, "aaa", &[(0, 1, b'a'), (1, 2, b'a'), (2, 3, b'a')]); 587 | crit(2, "aaa", &[(0, 1, b'a'), (1, 2, b'a')]); 588 | crit(10, "a*|ab", &[]); 589 | crit(10, "a+|ab", &[(0, 1, b'a')]); 590 | crit(10, "brown|fox", &[(2, 3, b'o'), (6, 7, b'o')]); 591 | crit(10, "quick|brown", &[]); 592 | crit(10, "zzzzzzzzzz|abracadabraz", &[]); 593 | crit(10, "eeeeeeeeez|abracadabz", &[(9, 10, b'z')]); 594 | crit(10, ".*x", &[(0, 1, b'x')]); 595 | crit_anchored(10, "\\bx", &[(0, 260, b'x')]); 596 | } 597 | */ 598 | } 599 | -------------------------------------------------------------------------------- /src/dfa/prefix_searcher.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use dfa::{Dfa, RetTrait}; 10 | use dfa::trie::Trie; 11 | use nfa::{Accept, StateIdx}; 12 | use std::cmp::{Ordering, PartialOrd}; 13 | use std::collections::{HashSet, VecDeque}; 14 | use std::mem::swap; 15 | 16 | // TODO: These limits are pretty arbitrary (copied from the regex crate). 17 | const NUM_PREFIX_LIMIT: usize = 30; 18 | const PREFIX_LEN_LIMIT: usize = 15; 19 | 20 | /// A pair of a byte sequence and the index of the state that we are in after encountering that 21 | /// sequence. 22 | #[derive(Clone, Debug, PartialEq)] 23 | pub struct PrefixPart(pub Vec, pub StateIdx); 24 | 25 | pub struct PrefixSearcher { 26 | active: VecDeque, 27 | current: PrefixPart, 28 | suffixes: Trie, 29 | finished: Vec, 30 | 31 | // The set of prefixes is complete if: 32 | // - we're done with active prefixes before we go over any of our limits, and 33 | // - we didn't encounter any states that accept conditionally. 34 | complete: bool, 35 | 36 | max_prefixes: usize, 37 | max_len: usize, 38 | } 39 | 40 | impl PrefixSearcher { 41 | pub fn extract(dfa: &Dfa, state: StateIdx) -> Vec { 42 | let mut searcher = PrefixSearcher::new(); 43 | searcher.search(dfa, state); 44 | searcher.finished 45 | } 46 | 47 | fn new() -> PrefixSearcher { 48 | PrefixSearcher { 49 | active: VecDeque::new(), 50 | current: PrefixPart(Vec::new(), 0), 51 | suffixes: Trie::new(), 52 | finished: Vec::new(), 53 | complete: true, 54 | max_prefixes: NUM_PREFIX_LIMIT, 55 | max_len: PREFIX_LEN_LIMIT, 56 | } 57 | } 58 | 59 | fn bail_out(&mut self) { 60 | let mut current = PrefixPart(Vec::new(), 0); 61 | let mut active = VecDeque::new(); 62 | swap(&mut current, &mut self.current); 63 | swap(&mut active, &mut self.active); 64 | 65 | self.finished.extend(active.into_iter()); 66 | self.finished.push(current); 67 | self.complete = false; 68 | } 69 | 70 | fn add(&mut self, new_prefs: Vec) { 71 | debug_assert!(new_prefs.len() + self.active.len() + self.finished.len() <= self.max_prefixes); 72 | 73 | for p in 
new_prefs.into_iter() { 74 | if p.0.len() >= self.max_len { 75 | self.finished.push(p); 76 | } else { 77 | self.active.push_back(p); 78 | } 79 | } 80 | } 81 | 82 | fn too_many(&mut self, more: usize) -> bool { 83 | self.active.len() + self.finished.len() + more > self.max_prefixes 84 | } 85 | 86 | fn search(&mut self, dfa: &Dfa, state: StateIdx) { 87 | self.active.push_back(PrefixPart(Vec::new(), state)); 88 | self.suffixes.insert(vec![].into_iter(), state); 89 | while !self.active.is_empty() { 90 | self.current = self.active.pop_front().unwrap(); 91 | 92 | let trans = dfa.transitions(self.current.1); 93 | let mut next_prefs = Vec::new(); 94 | for (ch, next_state) in trans.keys_values() { 95 | let mut next_pref = self.current.0.clone(); 96 | next_pref.push(ch); 97 | next_prefs.push(PrefixPart(next_pref, *next_state)); 98 | } 99 | 100 | // Discard any new prefix that is the suffix of some existing prefix. 101 | next_prefs.retain(|pref| { 102 | let rev_bytes = pref.0.iter().cloned().rev(); 103 | !self.suffixes 104 | .prefixes(rev_bytes) 105 | .any(|s| s == pref.1) 106 | }); 107 | for pref in &next_prefs { 108 | self.suffixes.insert(pref.0.iter().cloned().rev(), pref.1); 109 | } 110 | 111 | // Stop searching if we have too many prefixes already, or if we've run into an accept 112 | // state. In principle, we could continue expanding the other prefixes even after we 113 | // run into an accept state, but there doesn't seem much point in having some short 114 | // prefixes and other long prefixes. 115 | if self.too_many(next_prefs.len()) 116 | || *dfa.accept(self.current.1) != Accept::Never { 117 | self.bail_out(); 118 | break; 119 | } 120 | 121 | 122 | self.add(next_prefs); 123 | } 124 | } 125 | } 126 | 127 | // A critical segment is a sequence of bytes that we must match if we want to get to a particular 128 | // state. That sequence need not necessarily correspond to a unique path in the DFA, however. 
129 | // Therefore, we store the sequence of bytes and also a set of possible paths that we might have 130 | // traversed while reading those bytes. 131 | #[derive(Clone, Debug, PartialEq)] 132 | pub struct CriticalSegment { 133 | bytes: Vec, 134 | paths: HashSet>, 135 | } 136 | 137 | // The stdlib seems to have searching functions for &str, but not for &[u8]. If they get added, we 138 | // can remove this. 139 | fn find(haystack: &[u8], needle: &[u8]) -> Option { 140 | haystack.windows(needle.len()) 141 | .enumerate() 142 | .find(|x| x.1 == needle) 143 | .map(|y| y.0) 144 | } 145 | 146 | // For two critical segments a and b, we say a <= b if it is more specific than b: either a's 147 | // byte sequence contains b's byte sequence or else the byte sequences are the same and a's set of 148 | // paths is a subset of b's set of paths. 149 | // TODO: not sure if this is necessary 150 | impl PartialOrd for CriticalSegment { 151 | fn partial_cmp(&self, other: &CriticalSegment) -> Option { 152 | fn less(a: &CriticalSegment, b: &CriticalSegment) -> bool { 153 | let a_len = a.bytes.len(); 154 | let b_len = b.bytes.len(); 155 | (a_len > b_len && find(&a.bytes, &b.bytes).is_some()) 156 | || (a.bytes == b.bytes && a.paths.is_subset(&b.paths)) 157 | } 158 | if less(self, other) { 159 | Some(Ordering::Less) 160 | } else if less(other, self) { 161 | Some(Ordering::Greater) 162 | } else { 163 | None 164 | } 165 | } 166 | } 167 | 168 | /* 169 | impl CriticalSegment { 170 | pub fn intersection(xs: &[CriticalSegment], ys: &[CriticalSegment]) -> Vec { 171 | let common = maximal_common_substrings( 172 | xs.iter().map(|x| &x.bytes[..]), 173 | ys.iter().map(|y| &y.bytes[..])); 174 | let mut ret = Vec::new(); 175 | 176 | for s in common { 177 | let mut paths = HashSet::new(); 178 | for x in xs.iter().chain(ys.iter()) { 179 | // We look for only the first occurence of the substring in x. 
180 | if let Some(pos) = find(&x.bytes, &s) { 181 | paths.extend(x.paths.iter().map(|p| p[pos..(pos + s.len())].to_vec())); 182 | } 183 | } 184 | ret.push(CriticalSegment { bytes: s, paths: paths }); 185 | } 186 | ret 187 | } 188 | } 189 | 190 | // Finds all strings that are 191 | // - a substring of some element of xs, 192 | // - a substring of some element of ys, and 193 | // - maximal among all strings satisfying the first two conditions. 194 | // 195 | // Note that this implementation is *extremely* naive -- an efficient implementation would probably 196 | // want to use a generalized suffix tree. But since the strings we deal with here are small, we can 197 | // sort of get away with it. 198 | fn maximal_common_substrings<'a, I, J>(xs: I, ys: J) -> HashSet> 199 | where I: Iterator, J: Iterator { 200 | let mut ys_substrings = HashSet::new(); 201 | let mut common_substrings = HashSet::new(); 202 | 203 | for y in ys { 204 | let len = y.len(); 205 | for i in 0..len { 206 | for j in i..len { 207 | ys_substrings.insert(&y[i..(j + 1)]); 208 | } 209 | } 210 | } 211 | 212 | for x in xs { 213 | let len = x.len(); 214 | for i in 0..len { 215 | for j in i..len { 216 | if ys_substrings.contains(&x[i..(j + 1)]) { 217 | common_substrings.insert(x[i..(j + 1)].to_vec()); 218 | } 219 | } 220 | } 221 | } 222 | 223 | // Now prune out anything that isn't maximal. 224 | let mut ret = common_substrings.clone(); 225 | for s in &common_substrings { 226 | let len = s.len(); 227 | for i in 0..len { 228 | for j in i..len { 229 | // Make sure we're only looking at proper substrings of s. 
230 | if i > 0 || j < len - 1 { 231 | ret.remove(&s[i..(j + 1)]); 232 | } 233 | } 234 | } 235 | } 236 | ret 237 | } 238 | */ 239 | 240 | #[cfg(test)] 241 | mod tests { 242 | use dfa; 243 | use look::Look; 244 | use quickcheck::{QuickCheck, quickcheck, StdGen, TestResult}; 245 | use rand; 246 | use super::*; 247 | //use super::{find, maximal_common_substrings}; 248 | 249 | fn qc(size: usize) -> QuickCheck> { 250 | QuickCheck::new().gen(StdGen::new(rand::thread_rng(), size)) 251 | } 252 | 253 | macro_rules! test_prefix { 254 | ($name:ident, $re_str:expr, $answer:expr, $max_num:expr, $max_len:expr) => { 255 | #[test] 256 | fn $name() { 257 | let dfa = dfa::tests::make_dfa($re_str).unwrap(); 258 | println!("{:?}", dfa); 259 | let mut pref = PrefixSearcher::new(); 260 | pref.max_prefixes = $max_num; 261 | pref.max_len = $max_len; 262 | pref.search(&dfa, dfa.init_state(Look::Full).unwrap()); 263 | let mut prefs = pref.finished.into_iter().map(|x| x.0).collect::>(); 264 | prefs.sort(); 265 | 266 | let answer: Vec> = $answer.iter() 267 | .map(|s| s.as_bytes().to_owned()) 268 | .collect(); 269 | assert_eq!(prefs, answer); 270 | } 271 | }; 272 | } 273 | 274 | test_prefix!(long, 275 | "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ", 276 | vec!["XABCDEFGHIJKLMNOPQRSTUVWXYZ", 277 | "YABCDEFGHIJKLMNOPQRSTUVWXYZ", 278 | "ZABCDEFGHIJKLMNOPQRSTUVWXYZ",], 279 | 3, 30); 280 | 281 | test_prefix!(case_insensitive, 282 | "(?i)abc[a-z]", 283 | vec!["ABC", "ABc", "AbC", "Abc", "aBC", "aBc", "abC", "abc"], 284 | 30, 5); 285 | 286 | test_prefix!(byte_set, 287 | "[ac]", 288 | vec!["a", "c"], 289 | 30, 5); 290 | 291 | test_prefix!(pruned_repetition, 292 | "a+bc", 293 | vec!["abc"], 294 | 10, 10); 295 | 296 | test_prefix!(pruned_empty_repetition, 297 | "[a-zA-Z]*bc", 298 | vec!["bc"], 299 | 10, 10); 300 | 301 | /* 302 | #[test] 303 | fn common_substrings() { 304 | fn sound(xs: Vec>, ys: Vec>) -> bool { 305 | let result = maximal_common_substrings(xs.iter().map(|x| &x[..]), ys.iter().map(|y| &y[..])); 306 | 
307 | // Everything in the result should be a substring of something in xs. 308 | result.iter().all(|x| xs.iter().any(|y| find(&y, &x).is_some())) 309 | // Everything in the result should be a substring of something in xs. 310 | && result.iter().all(|x| ys.iter().any(|y| find(&y, &x).is_some())) 311 | // Nothing in the result should be a strict substring of anything else. 312 | && result.iter().all( 313 | |x| !result.iter().any(|y| y.len() > x.len() && find(&y, &x).is_some())) 314 | } 315 | 316 | // If z is a substring of something in xs and something in ys then it must be a substring 317 | // of something in result. 318 | fn complete(xs: Vec>, ys: Vec>, z: Vec) -> TestResult { 319 | if z.is_empty() 320 | || !xs.iter().any(|x| find(&x, &z).is_some()) 321 | || !ys.iter().any(|y| find(&y, &z).is_some()) { 322 | return TestResult::discard(); 323 | } 324 | 325 | let result = maximal_common_substrings(xs.iter().map(|x| &x[..]), ys.iter().map(|y| &y[..])); 326 | TestResult::from_bool(result.iter().any(|x| find(&x, &z).is_some())) 327 | } 328 | 329 | qc(10).quickcheck(sound as fn(_, _) -> _); 330 | qc(10).quickcheck(complete as fn(_, _, _) -> _); 331 | } 332 | */ 333 | } 334 | 335 | -------------------------------------------------------------------------------- /src/dfa/trie.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | // This is a fairly simple-minded implementation of a trie. Since we don't really have any special 10 | // needs, this module could be replaced by a different crate (if we can find one that's 11 | // well-supported). 
12 | 13 | #[derive(Clone, Debug)] 14 | pub struct Trie { 15 | value: Option, 16 | sub_tries: Vec, 17 | } 18 | 19 | impl Trie { 20 | pub fn new() -> Trie { 21 | Trie { 22 | value: None, 23 | sub_tries: Vec::new(), 24 | } 25 | } 26 | 27 | pub fn insert>(&mut self, mut key: I, value: usize) { 28 | if let Some(head) = key.next() { 29 | if self.sub_tries.is_empty() { 30 | self.sub_tries = vec![Trie::new(); 256]; 31 | } 32 | self.sub_tries[head as usize].insert(key, value); 33 | } else { 34 | if self.value.is_some() { 35 | panic!("tried to insert the same key twice"); 36 | } 37 | self.value = Some(value); 38 | } 39 | } 40 | 41 | pub fn prefixes<'a, I: Iterator>(&'a self, input: I) -> TrieIter<'a, I> { 42 | TrieIter { 43 | trie: Some(self), 44 | input: input, 45 | } 46 | } 47 | } 48 | 49 | pub struct TrieIter<'a, I: Iterator> { 50 | trie: Option<&'a Trie>, 51 | input: I, 52 | } 53 | 54 | impl<'a, I: Iterator> Iterator for TrieIter<'a, I> { 55 | type Item = usize; 56 | 57 | fn next(&mut self) -> Option { 58 | let mut next_trie = self.trie; 59 | let mut ret = None; 60 | while let Some(t) = next_trie { 61 | next_trie = self.input.next().and_then(|c| t.sub_tries.get(c as usize)); 62 | if let Some(v) = t.value { 63 | ret = Some(v); 64 | break; 65 | } 66 | } 67 | self.trie = next_trie; 68 | ret 69 | } 70 | } 71 | 72 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use regex_syntax; 10 | use std::error; 11 | use std::fmt; 12 | 13 | #[derive(Debug)] 14 | pub enum Error { 15 | RegexSyntax(regex_syntax::Error), 16 | TooManyStates, 17 | InvalidEngine(&'static str), 18 | } 19 | 20 | use error::Error::*; 21 | impl fmt::Display for Error { 22 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 23 | match *self { 24 | RegexSyntax(ref e) => write!(f, "Regex syntax error: {}", e), 25 | TooManyStates => write!(f, "State overflow"), 26 | InvalidEngine(s) => write!(f, "Invalid engine: {}", s), 27 | } 28 | } 29 | } 30 | 31 | impl error::Error for Error { 32 | fn description(&self) -> &str { 33 | match *self { 34 | RegexSyntax(ref e) => e.description(), 35 | TooManyStates => "This NFA required too many states to represent as a DFA.", 36 | InvalidEngine(_) => "The regex was not compatible with the requested engine.", 37 | } 38 | } 39 | } 40 | 41 | impl From for Error { 42 | fn from(e: regex_syntax::Error) -> Error { 43 | RegexSyntax(e) 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/graph.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use dfa::{Dfa, RetTrait}; 10 | use nfa::{Nfa, NoLooks, StateIdx}; 11 | use num_traits::PrimInt; 12 | use std::collections::HashSet; 13 | use std::fmt::Debug; 14 | 15 | #[derive(Clone, Copy, Debug, PartialEq)] 16 | pub enum DfsInstruction { 17 | Continue, 18 | #[allow(dead_code)] 19 | TurnBack, 20 | Stop, 21 | } 22 | 23 | pub trait Graph { 24 | fn num_states(&self) -> usize; 25 | 26 | fn neighbors<'a>(&'a self, i: StateIdx) -> Box + 'a>; 27 | 28 | /// Does a depth-first search of this graph. 
    ///
    /// Every time the search visits a new state, `visit` will be called. Every time the search
    /// detects a loop, `cycle` will be called. The return values of these callbacks tell the
    /// search how to proceed:
    /// - on `Continue`, the search will proceed normally
    /// - on `TurnBack`, the search will stop searching the current branch
    /// - on `Stop`, the search will terminate early.
    //
    // Both callbacks receive the current path as a slice of states; `cycle`
    // receives the portion of the path that forms the loop.
    // NOTE(review): the type-parameter list of this method (and the generic
    // arguments of the `Vec`/`Box` annotations below) appear to have been
    // stripped by the extraction; tokens are kept as dumped.
    fn dfs(&self, init: Inits, mut visit: Visit, mut cycle: Cycle)
    where
        Visit: FnMut(&[StateIdx]) -> DfsInstruction,
        Cycle: FnMut(&[StateIdx]) -> DfsInstruction,
        Inits: Iterator,
    {
        // The DFS stack of states; kept in sync with `remaining_children_stack`,
        // which holds, for each stacked state, the children not yet explored.
        let mut stack: Vec = Vec::with_capacity(self.num_states());
        let mut remaining_children_stack: Vec>>
            = Vec::with_capacity(self.num_states());
        // `visiting`: currently on the stack; `done`: fully explored, never revisit.
        let mut visiting: Vec = vec![false; self.num_states()];
        let mut done: Vec = vec![false; self.num_states()];

        // For nodes that we are currently visiting, this is their position on the stack.
        let mut stack_pos: Vec = vec![0; self.num_states()];

        let start_states: Vec = init.collect();

        for &start_idx in &start_states {
            if !done[start_idx] {
                match visit(&[start_idx][..]) {
                    DfsInstruction::Continue => {},
                    DfsInstruction::TurnBack => {
                        done[start_idx] = true;
                        continue;
                    },
                    DfsInstruction::Stop => { return; },
                }

                visiting[start_idx] = true;
                stack.push(start_idx);
                remaining_children_stack.push(self.neighbors(start_idx));
                stack_pos[start_idx] = 0;

                while !stack.is_empty() {
                    // We keep stack and remaining_children_stack synchronized.
                    debug_assert!(!remaining_children_stack.is_empty());

                    let cur = *stack.last().unwrap();
                    let next_child = remaining_children_stack.last_mut().unwrap().next();

                    if let Some(child) = next_child {
                        if visiting[child] {
                            // We found a cycle: report it (and maybe terminate early).
                            // Since we turn back on finding a cycle anyway, we treat Continue
                            // and TurnBack the same (i.e. we don't need to handle either one
                            // explicitly).
                            if cycle(&stack[stack_pos[child]..]) == DfsInstruction::Stop {
                                return;
                            }
                        } else if !done[child] {
                            // This is a new state: report it and push it onto the stack.
                            stack.push(child);
                            match visit(&stack[stack_pos[child]..]) {
                                DfsInstruction::Stop => { return; },
                                DfsInstruction::TurnBack => {
                                    stack.pop();
                                    done[child] = true;
                                },
                                DfsInstruction::Continue => {
                                    remaining_children_stack.push(self.neighbors(child));
                                    visiting[child] = true;
                                    stack_pos[child] = stack.len() - 1;
                                },
                            }
                        }
                        continue;
                    }

                    // If we got this far, the current node is out of children. Pop it from the
                    // stack.
                    visiting[cur] = false;
                    done[cur] = true;
                    stack.pop();
                    remaining_children_stack.pop();
                }
            }
        }
    }

    /// The same as `dfs`, but runs on a graph with cuts in it.
    ///
    /// Instead of running on the full graph, runs on the graph where pairs in `cuts` are
    /// disconnected.
120 | fn dfs_with_cut( 121 | &self, 122 | init: Inits, 123 | cuts: &HashSet<(StateIdx, StateIdx)>, 124 | mut visit: Visit, 125 | mut cycle: Cycle) 126 | where 127 | Visit: FnMut(&[StateIdx]) -> DfsInstruction, 128 | Cycle: FnMut(&[StateIdx]) -> DfsInstruction, 129 | Inits: Iterator, 130 | { 131 | let should_cut = |s: &[StateIdx]| { 132 | let len = s.len(); 133 | len >= 2 && cuts.contains(&(s[len-2], s[len-1])) 134 | }; 135 | let my_visit = |s: &[StateIdx]| 136 | if should_cut(s) { DfsInstruction::TurnBack } else { visit(s) }; 137 | let my_cycle = |s: &[StateIdx]| 138 | if should_cut(s) { DfsInstruction::TurnBack } else { cycle(s) }; 139 | self.dfs(init, my_visit, my_cycle); 140 | } 141 | 142 | /// Returns a list of states, visited in depth-first order. 143 | fn dfs_order>(&self, init: I) -> Vec { 144 | use self::DfsInstruction::*; 145 | 146 | let mut ret: Vec = Vec::new(); 147 | // The unwrap is ok because dfa guarantees never to pass an empty slice. 148 | self.dfs(init, |st| { ret.push(*st.last().unwrap()); Continue }, |_| Continue); 149 | ret 150 | } 151 | 152 | /// Checks whether this graph has any cycles. 
153 | #[allow(unused)] 154 | fn has_cycles(&self) -> bool { 155 | use self::DfsInstruction::*; 156 | 157 | let mut found = false; 158 | self.dfs(0..self.num_states(), |_| Continue, |_| { found = true; Stop }); 159 | found 160 | } 161 | } 162 | 163 | impl Graph for Dfa { 164 | fn num_states(&self) -> usize { 165 | Dfa::num_states(self) 166 | } 167 | 168 | fn neighbors<'a>(&'a self, i: StateIdx) -> Box + 'a> { 169 | Box::new(self.transitions(i).ranges_values().map(|x| x.1)) 170 | } 171 | } 172 | 173 | impl Graph for Nfa { 174 | fn num_states(&self) -> usize { 175 | Nfa::num_states(self) 176 | } 177 | 178 | fn neighbors<'a>(&'a self, i: usize) -> Box + 'a> { 179 | Box::new(self.consuming(i).ranges_values().map(|x| x.1)) 180 | } 181 | } 182 | 183 | #[cfg(test)] 184 | mod tests { 185 | use dfa::tests::make_dfa; 186 | use graph::Graph; 187 | 188 | #[test] 189 | fn cycles() { 190 | macro_rules! cyc { 191 | ($re:expr, $res:expr) => { 192 | { 193 | let dfa = make_dfa($re).unwrap(); 194 | println!("{:?}", dfa); 195 | assert_eq!(dfa.has_cycles(), $res); 196 | } 197 | }; 198 | } 199 | 200 | cyc!("abcde", false); 201 | cyc!("ab*d", true); 202 | cyc!("ab*", true); 203 | cyc!("ab*?", false); 204 | cyc!("ab+", true); 205 | cyc!("ab+?", false); 206 | cyc!("(ab*?|cde)", false); 207 | cyc!("(ab*?|cde)f", true); 208 | cyc!("(abc)*?", false); 209 | cyc!("(abc)*?def", true); 210 | } 211 | } 212 | 213 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | /*! 10 | This crate provides tools for converting regular expressions into deterministic finite automata 11 | (DFAs). 
The most interesting type is `Regex`, which is a virtual machine for executing a DFA.

# Example: creating and running a `Regex`

```rust
use regex_dfa::Regex;
let re = Regex::new(r"\d{4}-\d{2}-\d{2}").unwrap();
assert_eq!(re.find("My birthday is 1986-08-22!"), Some((15, 25)));
```

The most useful function in this crate is `Regex::find`, which looks for the first substring of the
given string that matches the language of the DFA.

# Comparison to the `regex` crate

Compared to rust's standard `regex` crate, the main feature of `regex_dfa` is that `regex_dfa`
*eagerly* compiles a regular expression into a DFA, whereas `regex` does so lazily. There are
advantages and disadvantages to the eager approach. To begin with, doing all the compilation
up-front means that there is less to do at match time. If we get around to writing a compiler
plugin for compiling the regular expression at compile time, this would be an even bigger win.
Another advantage is that since we don't care so much about compilation speed, we have more
opportunities to look for optimizations.

The main disadvantage to eager compilation is memory usage. Even fairly simple regular expressions
may take several tens of kilobytes to represent as a DFA. More complicated ones (especially regular
expressions that use unicode word boundaries or character classes) may require much more. This
disadvantage is specific to eager compilation, since lazy DFA compilation only needs to create DFA
states for those characters that are actually seen (i.e., probably a tiny fraction of the entire
unicode character class). For this reason, `regex_dfa` allows you to restrict the amount of memory
it uses: simply use the method `Regex::new_bounded`, which will fail and report an error if it
would otherwise need to use too much memory.

# Roadmap

There are two substantial features that need to be added before this crate can be considered
feature-complete.

## SIMD optimizations

There are some nice tricks available for using SIMD instructions to quickly scan over uninteresting
parts of the input. The `regex` crate is capable (with a nightly compiler) of doing some of these
already, and we should imitate it.

## Compiler plugin

Since the main advantage of this crate is that it can do work ahead of time, it would make total
sense to do it all at the program's compile time. This feature will probably wait until rust's
compiler plugin story stabilizes a bit.
*/

#![cfg_attr(test, feature(test))]
#[cfg(test)]
extern crate quickcheck;

#[cfg(test)]
#[macro_use]
extern crate matches;

#[cfg(test)]
extern crate rand;

#[cfg(test)]
extern crate test;

extern crate itertools;
extern crate memchr;
extern crate num_traits;
extern crate range_map;
extern crate refinery;
extern crate regex_syntax;
extern crate utf8_ranges;

#[macro_use]
extern crate lazy_static;

mod dfa;
mod error;
mod look;
mod graph;
mod nfa;
mod regex;
mod runner;
mod unicode;

pub use error::Error;
pub use regex::Regex;
// NOTE(review): the generic parameters of this alias were stripped by the
// extraction (presumably `Result<T> = ::std::result::Result<T, Error>`).
pub type Result = ::std::result::Result;

// ---------------------------------------------------------------- /src/look.rs:
// Copyright 2015-2016 Joe Neeman.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
8 | 9 | #![allow(dead_code)] 10 | 11 | use range_map::{Range, RangeSet}; 12 | use std::cmp::Ordering; 13 | use unicode::PERLW; 14 | 15 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Ord)] 16 | pub enum Look { 17 | Full, 18 | WordChar, 19 | NotWordChar, 20 | NewLine, 21 | Boundary, 22 | Empty, 23 | } 24 | 25 | lazy_static! { 26 | static ref FULL: RangeSet = RangeSet::full(); 27 | static ref WORD_CHAR: RangeSet = 28 | PERLW.iter().map(|&(x, y)| Range::new(x as u32, y as u32)).collect(); 29 | static ref NOT_WORD_CHAR: RangeSet = WORD_CHAR.negated(); 30 | static ref NEW_LINE: RangeSet = RangeSet::single('\n' as u32); 31 | static ref EMPTY: RangeSet = RangeSet::new(); 32 | } 33 | 34 | static ALL: [Look; 6] = [Look::Full, Look::WordChar, Look::NotWordChar, 35 | Look::NewLine, Look::Boundary, Look::Empty]; 36 | 37 | impl PartialOrd for Look { 38 | fn partial_cmp(&self, other: &Look) -> Option { 39 | if self == other { 40 | Some(Ordering::Equal) 41 | } else if self.intersection(other) == *self { 42 | Some(Ordering::Less) 43 | } else if self.intersection(other) == *other { 44 | Some(Ordering::Greater) 45 | } else { 46 | None 47 | } 48 | } 49 | } 50 | 51 | impl Look { 52 | pub fn intersection(&self, other: &Look) -> Look { 53 | use self::Look::*; 54 | match *self { 55 | Full => *other, 56 | WordChar => match *other { 57 | Full => WordChar, 58 | WordChar => WordChar, 59 | _ => Empty, 60 | }, 61 | NotWordChar => match *other { 62 | Full => NotWordChar, 63 | NotWordChar => NotWordChar, 64 | NewLine => NewLine, 65 | Boundary => Boundary, 66 | _ => Empty, 67 | }, 68 | NewLine => match *other { 69 | Full => NewLine, 70 | NotWordChar => NewLine, 71 | NewLine => NewLine, 72 | Boundary => Boundary, 73 | _ => Empty, 74 | }, 75 | Boundary => match *other { 76 | WordChar => Empty, 77 | Empty => Empty, 78 | _ => Boundary, 79 | }, 80 | Empty => Empty, 81 | } 82 | } 83 | 84 | pub fn supersets(&self) -> Vec { 85 | ALL.iter().cloned().filter(|x| *self <= *x).collect() 86 | } 87 | 88 | 
    // The set of characters this predicate accepts. Note that `Boundary`
    // maps to the empty set: it is only satisfiable at the edge of the input.
    pub fn as_set(&self) -> &RangeSet {
        use self::Look::*;

        match *self {
            Full => &FULL,
            WordChar => &WORD_CHAR,
            NotWordChar => &NOT_WORD_CHAR,
            NewLine => &NEW_LINE,
            Boundary => &EMPTY,
            Empty => &EMPTY,
        }
    }

    // Whether this predicate is satisfied at the beginning/end of the input.
    pub fn allows_eoi(&self) -> bool {
        use self::Look::*;

        match *self {
            Full => true,
            WordChar => false,
            NotWordChar => true,
            NewLine => true,
            Boundary => true,
            Empty => false,
        }
    }

    pub fn is_full(&self) -> bool {
        match *self {
            Look::Full => true,
            _ => false,
        }
    }

    // A dense index for this variant, in the same order as `ALL`.
    pub fn as_usize(&self) -> usize {
        use self::Look::*;

        match *self {
            Full => 0,
            WordChar => 1,
            NotWordChar => 2,
            NewLine => 3,
            Boundary => 4,
            Empty => 5,
        }
    }

    // The number of variants (the length of `ALL`).
    pub fn num() -> usize { 6 }

    pub fn all() -> &'static [Look] {
        &ALL
    }
}

#[cfg(test)]
mod tests {
    use quickcheck::{Arbitrary, Gen, quickcheck};
    use super::*;

    impl Arbitrary for Look {
        // NOTE(review): the type-parameter list was stripped by the extraction
        // (presumably `fn arbitrary<G: Gen>`).
        fn arbitrary(g: &mut G) -> Look {
            use look::Look::*;

            *g.choose(&[Full, WordChar, NotWordChar, NewLine, Boundary, Empty]).unwrap()
        }
    }

    #[test]
    fn intersection_commutes() {
        fn prop(a: Look, b: Look) -> bool {
            a.intersection(&b) == b.intersection(&a)
        }
        quickcheck(prop as fn(_, _) -> _);
    }

    #[test]
    fn intersection_ordering() {
        fn prop(a: Look, b: Look) -> bool {
            a.intersection(&b) <= a
        }
        quickcheck(prop as fn(_, _) -> _);
    }

    #[test]
    fn intersection_eoi() {
        fn prop(a: Look, b: Look) -> bool {
            a.intersection(&b).allows_eoi() == (a.allows_eoi() && b.allows_eoi())
        }
        quickcheck(prop as fn(_, _) -> _);
    }

    #[test]
    fn intersection_set() {
        fn prop(a: Look, b: Look) -> bool {
            a.intersection(&b).as_set() == &a.as_set().intersection(b.as_set())
        }
        quickcheck(prop as fn(_, _) -> _);
    }
}

// ---------------------------------------------------------------- /src/nfa/has_looks.rs:
// Copyright 2015-2016 Joe Neeman.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! This module contains two main pieces of functionality: building an NFA from a regular
//! expression and processing an NFA to remove all non-consuming transitions. The first of these is
//! carried out by the `from_regex` function and it is fairly straightforward. The second is
//! possibly more unusual, and so we describe it here in some detail.
//!
//! For your standard classroom NFA, it's trivial to remove non-consuming transitions: for every
//! consuming transition with source state `s` and target state `t`, take the eps-closure of `t`
//! and then add a transition from `s` to everything in that eps-closure. Finally, all
//! non-consuming transitions are deleted. Here it is in ASCII art, where a non-consuming
//! transition is denoted by an epsilon (ε):
//!
//! ```text
//!            ε          b
//!        a  /-------> 3 -------> 4
//! 1 -----> 2  ε
//!           \-------> 5
//! ```
//!
//! becomes
//!
//! ```text
//!        a              b
//!   /---------------> 3 -------> 4
//!  /     a
//! 1 -----> 2
//!  \     a
//!   \---------------> 5
//! ```
//!
//! The situation becomes (just a little) trickier when the non-consuming transitions are allowed to
//! have predicates that look forward or back by one token. We need to support this sort of
//!
//! transition if we want to support word boundaries (and the fact that doing so is a bit tricky is
//! probably the main reason that the standard `regex` crate doesn't support DFA simulation if the
//! regex contains word boundaries). So now we allow our non-consuming transitions to be of the
//! form `(P, Q)`, where `P` and `Q` are sets of tokens. You can pass through such a non-consuming
//! transition if and only if the previous token belonged to `P` and the next token belongs to `Q`.
//! (The code for this is in `nfa::LookPair`, which is a teeny bit more complicated because it also
//! allows checking for the edge (beginning for `P`, end for `Q`) of the input.)
//!
//! If `Q` is the set of all tokens, then supporting these kinds of non-consuming transitions is
//! almost the same as the previous case. The first difference is that when we take the
//! eps-closure, we also need to keep track of the predicates on the non-consuming transitions that
//! we passed through. For example, if we have a configuration like
//!
//! ```text
//!              (P2, Q2)
//!   (P1, Q1)  /------------> 3
//! 1 --------> 2  (P3, Q3)
//!             \------------> 4
//! ```
//!
//! then states 2, 3, and 4 all belong to the eps-closure of 1. In order to get from 1 to 3, we
//! need to pass through the predicate `(P1 ∩ P2, Q1 ∩ Q2)`; in order to get from 1 to 4, we need
//! to pass through the predicate `(P1 ∩ P3, Q1 ∩ Q3)`.
//!
//! Assuming for now that all of the `Q` predicates are the set of all possible tokens, we remove
//! the non-consuming transitions as follows: take every consuming transition with source state `s`
//! and target state `t`. Then for every `u` in the eps-closure of `t` with predicate `(P, Q)`
//! leading from `t` to `u`, we add a consuming transition from `s` to `u` *if and only if the
//! consumed token belongs to `P`*. Then we delete all the non-consuming transitions. Going back to
//! the first example, suppose that `P1` contains `a` but `P2` does not. Then
//!
//! ```text
//!         (P1, Q1)         b
//!        a  /--------------> 3 -------> 4
//! 1 -----> 2  (P2, Q2)
//!           \--------------> 5
//! ```
//!
//! becomes
//!
//! ```text
//!        a              b
//!   /---------------> 3 -------> 4
//!  /     a
//! 1 -----> 2
//!
//!                     5
//! ```
//!
//! There is actually one more complication that we won't discuss in detail here: the procedure
//! above doesn't account properly for the eps-closure of the initial state, since it only does
//! things to the eps-closure of a state that follows a transition. In order to handle the
//! eps-closure of the initial state, we actually introduce a collection of initial states, some of
//! which are only active if the previous character of the input satisfied some predicate.
//!
//! Finally, in the case that the `Q` predicates are not the set of all possible tokens, we need to
//! add extra states. For every consuming transition from `s` to `t` and every `u` in the
//! eps-closure of `t` with predicate `(P, Q)` leading from `t` to `u`, we add a new state `u'`.
//! The consuming transitions leading out from `u'` are those consuming transitions leading out
//! from `u` whose tokens belong to `Q`. Then we add a consuming transition from `s` to `u'` if the
//! token that was consumed in going from `s` to `t` belongs to `P`. In ASCII art, if `P` contains
//! `a` but not `b`, and if `Q` contains `c` but not `d` then
//!
//! ```text
//!    a        (P, Q)          c
//! 1 -----> 2 -------------> 3 --------> 4
//!    b  ^                    \    d
//! 5 ----/                     \-------> 5
//! ```
//!
//! becomes
//!
//! ```text
//!    a                           c
//!   /--------------------> 3' -----\
//!  /     a                  c       \
//! 1 -----> 2               3 --------> 4
//!    b  ^                   \    d
//! 5 ----/                    \-------> 5
//! ```
//!
//! There are a couple of caveats to this transformation also. The first is that we process *all*
//! of the look-behind (i.e. `P`) predicates before we process any of the look-ahead (i.e. `Q`)
//! predicates. The reason for this can be seen in the example above: if state 4 had any
//! non-consuming transitions leading out of it, then in processing that non-consuming transition
//! we might need to add more consuming transitions leading out of 3. That would in turn affect the
//! consuming transitions that we add to 3'. Therefore, we need to add the extra transitions coming
//! out of 3 (which are due to a look-behind predicate) before we add the transitions coming
//! out of 3' (which are due to a look-ahead predicate).
//!
//! The second caveat to the transformation above comes in the handling of accepting states. When a
//! non-consuming transition leads to an accepting state, it means that the source of that
//! transition should become a conditionally accepting state.

use look::Look;
use nfa::{Accept, HasLooks, LookPair, Nfa, NoLooks, StateIdx};
use std::cmp::max;
use std::collections::HashSet;
use std::ops::Deref;
use range_map::{Range, RangeSet};
use regex_syntax::{CharClass, ClassRange, Expr, Repeater};

// Converts a `CharClass` into a `RangeSet`
// NOTE(review): generic parameters throughout this file were stripped by the
// extraction (`RangeSet`, `Nfa`, `::Result>`, …); tokens are kept as dumped.
fn class_to_set(cc: &CharClass) -> RangeSet {
    cc.iter().map(|r| Range::new(r.start as u32, r.end as u32)).collect()
}

impl Nfa {
    /// Asserts that the invariants that are supposed to hold do.
    fn check_invariants(&self) {
        // The init state is implicitly the first one, so there are no explicit init states.
        debug_assert!(self.init.is_empty());

        // The final state is accepting, and no others are.
        debug_assert!(self.states.last().unwrap().accept == Accept::Always);
        debug_assert!(self.states.iter().rev().skip(1).all(|s| s.accept == Accept::Never));

        // No state has both a look transition and a consuming transition.
        debug_assert!(self.states.iter().all(|s| s.looking.is_empty() || s.consuming.is_empty()));

        // All targets of a consuming transition are just the next state.
        debug_assert!(self.states.iter()
            .enumerate()
            .all(|(idx, s)| s.consuming.ranges_values().all(|&(_, val)| val == idx + 1)));
    }

    /// Creates a new Nfa from a regex string.
    pub fn from_regex(re: &str) -> ::Result> {
        let expr = try!(Expr::parse(re));
        let mut ret = Nfa::new();

        // State 0 is the implicit initial state; the expression's states follow
        // it, and the last state added becomes the unique accepting state.
        ret.add_state(Accept::Never);
        ret.add_expr(&expr);
        ret.add_eps(0, 1);

        let len = ret.num_states();
        ret.states[len - 1].accept = Accept::Always;

        ret.check_invariants();
        Ok(ret)
    }

    /// Adds a non-input consuming transition between states `source` and `target`.
    ///
    /// The transition will be traversed if the last consumed byte matches `behind` and the next
    /// available byte matches `ahead`.
    pub fn add_look(&mut self, source: StateIdx, target: StateIdx, behind: Look, ahead: Look) {
        let look = LookPair {
            behind: behind,
            ahead: ahead,
            target_state: target,
        };
        self.states[source].looking.push(look);
    }

    /// Removes all look transitions, converting this Nfa into an `Nfa`.
    //
    // This implements the transformation described in the module docs: consuming
    // transitions are redirected through the look-closure of their targets, new
    // look-ahead states are added where needed, and look transitions are dropped.
    pub fn remove_looks(mut self) -> Nfa {
        if self.states.is_empty() {
            return Nfa::with_capacity(0);
        }

        // For every state with out transitions, add transitions from it to everything in the closure
        // of the target. Note that (according to `check_invariants`) the target state is always
        // the next state.
        let old_len = self.num_states();
        let mut new_states: Vec<(StateIdx, Look, StateIdx)> = Vec::new();
        for src_idx in 0..self.states.len() {
            if !self.states[src_idx].consuming.is_empty() {
                let consuming = self.states[src_idx].consuming.clone();
                for look in self.closure(src_idx + 1) {
                    // Add transitions into the look target, restricted to the
                    // bytes allowed by the look-behind predicate.
                    let new_idx = self.add_look_state(look);
                    let filtered_consuming = consuming.intersection(look.behind.as_set());
                    for &(range, _) in filtered_consuming.ranges_values() {
                        self.add_transition(src_idx, new_idx, range);
                    }
                    // If the look target is actually a new state, hold off on adding transitions
                    // out of it, because we need to make sure that all the transitions from
                    // look.target_state have been added first.
                    if new_idx >= old_len {
                        new_states.push((new_idx, look.ahead, look.target_state));
                    }
                }
            }
        }

        // Add the new initial states: everything that was immediately reachable from state 0 is now
        // an initial state.
        for look in self.closure(0) {
            let new_idx = self.add_look_state(look);
            self.init.push((look.behind, new_idx));
            if new_idx >= old_len {
                new_states.push((new_idx, look.ahead, look.target_state));
            }
        }

        // Now add transitions out of the new states.
        for (src_idx, look, tgt_idx) in new_states {
            let out_consuming = self.states[tgt_idx].consuming.intersection(look.as_set());
            for &(range, tgt) in out_consuming.ranges_values() {
                self.states[src_idx].consuming.insert(range, tgt);
            }
        }

        // Get rid of all looking transitions.
        for st in &mut self.states {
            st.looking.clear();
        }

        let mut ret: Nfa = self.transmuted();
        ret.trim_unreachable();
        ret
    }

    // Adds a new state for a LookPair, if necessary.
    // It is necessary to add a new state if and
    // only if the LookPair needs to look ahead.
    //
    // Returns the index of the new state.
    fn add_look_state(&mut self, look: LookPair) -> StateIdx {
        if look.ahead.is_full() {
            // No look-ahead restriction: the existing target state can be reused.
            look.target_state
        } else {
            let tgt_idx = look.target_state;
            let new_idx = self.add_state(Accept::Never);

            // If the target state accepts at end of input and the look allows eoi, then the new
            // state must also accept at eoi.
            if self.states[tgt_idx].accept != Accept::Never && look.ahead.allows_eoi() {
                self.states[new_idx].accept = Accept::AtEoi;
                self.states[new_idx].accept_look = Look::Boundary;
            }

            // If the target state of the look is accepting, add a new look-ahead accepting state.
            if self.states[tgt_idx].accept == Accept::Always
                    && !look.ahead.as_set().is_empty() {
                let acc_idx = self.add_look_ahead_state(look.ahead, 1, new_idx);
                for range in look.ahead.as_set().ranges() {
                    self.add_transition(new_idx, acc_idx, range);
                }
            }
            new_idx
        }
    }

    /// Finds (transitively) the set of all non-consuming transitions that can be made starting
    /// from `state`.
    ///
    /// The search is done depth-first so that priority is preserved.
    fn closure(&self, state: StateIdx) -> Vec {
        let mut stack: Vec = Vec::new();
        let mut seen: HashSet = HashSet::new();
        let mut ret: Vec = Vec::new();
        let mut next_looks: Vec = Vec::new();

        // Push in reverse so that higher-priority looks are popped first.
        stack.extend(self.states[state].looking.iter().cloned().rev());
        while let Some(last_look) = stack.pop() {
            ret.push(last_look);
            next_looks.clear();

            // Chained looks compose by intersecting their predicates; empty or
            // already-seen intersections are pruned.
            for next_look in &self.states[last_look.target_state].looking {
                let int = next_look.intersection(&last_look);
                if !int.is_empty() && !seen.contains(&int) {
                    seen.insert(int);
                    next_looks.push(int);
                }
            }

            stack.extend(next_looks.drain(..).rev());
        }

        ret
    }

    /// Adds an eps transition between the given states.
    fn add_eps(&mut self, from: StateIdx, to: StateIdx) {
        self.add_look(from, to, Look::Full, Look::Full);
    }

    /// Appends a single state that transitions to the next state on observing one of the chars in
    /// the given range.
    fn add_state_with_chars(&mut self, chars: &RangeSet) {
        let idx = self.num_states();
        self.add_state(Accept::Never);
        for range in chars.ranges() {
            self.add_transition(idx, idx + 1, range);
        }
    }

    /// Appends two states, with a given transition between them.
    fn add_single_transition(&mut self, chars: &RangeSet) {
        self.add_state_with_chars(chars);
        self.add_state(Accept::Never);
    }

    /// Appends a sequence of states that recognizes a literal.
334 | fn add_literal(&mut self, chars: I, case_insensitive: bool) 335 | where C: Deref, 336 | I: Iterator 337 | { 338 | for ch in chars { 339 | let ranges = if case_insensitive { 340 | let cc = CharClass::new(vec![ClassRange { start: *ch, end: *ch }]); 341 | class_to_set(&cc.case_fold()) 342 | } else { 343 | RangeSet::single(*ch as u32) 344 | }; 345 | self.add_state_with_chars(&ranges); 346 | } 347 | self.add_state(Accept::Never); 348 | } 349 | 350 | /// Appends a sequence of states that recognizes the concatenation of `exprs`. 351 | fn add_concat_exprs(&mut self, exprs: &[Expr]) { 352 | if let Some((expr, rest)) = exprs.split_first() { 353 | self.add_expr(expr); 354 | 355 | for expr in rest { 356 | let cur_len = self.num_states(); 357 | self.add_eps(cur_len - 1, cur_len); 358 | self.add_expr(expr); 359 | } 360 | } else { 361 | self.add_state(Accept::Never); 362 | } 363 | } 364 | 365 | /// Appends a sequence of states that recognizes one of the expressions in `alts`. 366 | /// 367 | /// The earlier expressions in `alts` get higher priority when matching. 368 | fn add_alternate_exprs(&mut self, alts: &[Expr]) { 369 | // Add the new initial state that feeds into the alternate. 370 | let init_idx = self.num_states(); 371 | self.add_state(Accept::Never); 372 | 373 | let mut expr_end_indices = Vec::::with_capacity(alts.len()); 374 | for expr in alts { 375 | let expr_init_idx = self.states.len(); 376 | self.add_eps(init_idx, expr_init_idx); 377 | self.add_expr(expr); 378 | expr_end_indices.push(self.states.len() - 1); 379 | } 380 | 381 | // Make the final state of each alternative point to our new final state. 382 | self.add_state(Accept::Never); 383 | let final_idx = self.states.len() - 1; 384 | for idx in expr_end_indices { 385 | self.add_eps(idx, final_idx); 386 | } 387 | } 388 | 389 | /// Appends new states, representing multiple copies of `expr`. 
390 | fn add_repeat(&mut self, expr: &Expr, rep: Repeater, greedy: bool) { 391 | match rep { 392 | Repeater::ZeroOrOne => { 393 | self.add_repeat_up_to(expr, 1, greedy); 394 | }, 395 | Repeater::ZeroOrMore => { 396 | self.add_repeat_zero_or_more(expr, greedy); 397 | }, 398 | Repeater::OneOrMore => { 399 | self.add_repeat_min_max(expr, 1, None, greedy); 400 | }, 401 | Repeater::Range{ min, max } => { 402 | self.add_repeat_min_max(expr, min, max, greedy); 403 | } 404 | } 405 | } 406 | 407 | /// Repeats `expr` a fixed number of times (which must be positive). 408 | fn add_repeat_exact(&mut self, expr: &Expr, n: u32) { 409 | assert!(n > 0); 410 | self.add_expr(expr); 411 | for _ in 1..n { 412 | let idx = self.states.len(); 413 | self.add_expr(expr); 414 | self.add_eps(idx - 1, idx); 415 | } 416 | } 417 | 418 | /// Repeats `expr` between zero and `n` times (`n` must be positive). 419 | fn add_repeat_up_to(&mut self, expr: &Expr, n: u32, greedy: bool) { 420 | assert!(n > 0); 421 | 422 | self.add_state(Accept::Never); 423 | let mut init_indices = Vec::::with_capacity(n as usize); 424 | for _ in 0..n { 425 | init_indices.push(self.states.len() as StateIdx); 426 | self.add_expr(expr); 427 | } 428 | let final_idx = self.states.len() - 1; 429 | for idx in init_indices { 430 | self.add_alt_eps(idx - 1, idx, final_idx, greedy); 431 | } 432 | } 433 | 434 | /// Adds an eps transition from `from` to both `to1` and `to2`. If `greedy` is true, `to1` is 435 | /// preferred, and otherwise `to2` is preferred. 436 | fn add_alt_eps(&mut self, from: usize, to1: usize, to2: usize, greedy: bool) { 437 | if greedy { 438 | self.add_eps(from, to1); 439 | self.add_eps(from, to2); 440 | } else { 441 | self.add_eps(from, to2); 442 | self.add_eps(from, to1); 443 | } 444 | } 445 | 446 | /// Appends new states, representing multiple copies of `expr`. 447 | /// 448 | /// The new states represent a language that accepts at least `min` and at most `maybe_max` 449 | /// copies of `expr`. 
(If `maybe_max` is `None`, there is no upper bound.) 450 | fn add_repeat_min_max(&mut self, expr: &Expr, min: u32, maybe_max: Option, greedy: bool) { 451 | if min == 0 && maybe_max == Some(0) { 452 | // We add a state anyway, in order to maintain the convention that every expr should 453 | // add at least one state (otherwise keeping track of indices becomes much more 454 | // tedious). 455 | self.add_state(Accept::Never); 456 | return; 457 | } 458 | 459 | if min > 0 { 460 | self.add_repeat_exact(expr, min); 461 | 462 | // If anything else comes after this, we need to connect the two parts. 463 | if maybe_max != Some(min) { 464 | let len = self.num_states(); 465 | self.add_eps(len - 1, len); 466 | } 467 | } 468 | 469 | if let Some(max) = maybe_max { 470 | if max > min { 471 | self.add_repeat_up_to(expr, max - min, greedy); 472 | } 473 | } else { 474 | self.add_repeat_zero_or_more(expr, greedy); 475 | } 476 | } 477 | 478 | /// Repeats the given expression zero or more times. 479 | fn add_repeat_zero_or_more(&mut self, expr: &Expr, greedy: bool) { 480 | let start_idx = self.num_states(); 481 | self.add_state(Accept::Never); 482 | self.add_expr(expr); 483 | self.add_state(Accept::Never); 484 | let end_idx = self.num_states() - 1; 485 | 486 | self.add_alt_eps(start_idx, start_idx + 1, end_idx, greedy); 487 | self.add_alt_eps(end_idx - 1, start_idx + 1, end_idx, greedy); 488 | } 489 | 490 | /// Adds two new states, with a look connecting them. 491 | fn add_look_pair(&mut self, behind: Look, ahead: Look) { 492 | let idx = self.add_state(Accept::Never); 493 | self.add_look(idx, idx + 1, behind, ahead); 494 | self.add_state(Accept::Never); 495 | } 496 | 497 | /// Adds an extra predicate between the last two states (there must be at least two states). 498 | fn extra_look(&mut self, behind: Look, ahead: Look) { 499 | let len = self.states.len(); 500 | self.add_look(len - 2, len - 1, behind, ahead); 501 | } 502 | 503 | /// Appends a bunch of new states, representing `expr`. 
504 | /// 505 | /// This maintains the invariant that the last state is always empty (i.e. it doesn't have any 506 | /// transitions leading out of it). It is also guaranteed to add at least one new state. 507 | fn add_expr(&mut self, expr: &Expr) { 508 | use regex_syntax::Expr::*; 509 | 510 | match *expr { 511 | Empty => { self.add_state(Accept::Never); }, 512 | Class(ref c) => self.add_single_transition(&class_to_set(c)), 513 | AnyChar => self.add_single_transition(&RangeSet::full()), 514 | AnyCharNoNL => { 515 | let nls = b"\n\r".into_iter().map(|b| *b as u32); 516 | self.add_single_transition(&RangeSet::except(nls)) 517 | }, 518 | Concat(ref es) => self.add_concat_exprs(es), 519 | Alternate(ref es) => self.add_alternate_exprs(es), 520 | Literal { ref chars, casei } => self.add_literal(chars.iter(), casei), 521 | StartLine => self.add_look_pair(Look::NewLine, Look::Full), 522 | StartText => self.add_look_pair(Look::Boundary, Look::Full), 523 | EndLine => self.add_look_pair(Look::Full, Look::NewLine), 524 | EndText => self.add_look_pair(Look::Full, Look::Boundary), 525 | WordBoundary => { 526 | self.add_look_pair(Look::WordChar, Look::NotWordChar); 527 | self.extra_look(Look::NotWordChar, Look::WordChar); 528 | }, 529 | NotWordBoundary => { 530 | self.add_look_pair(Look::WordChar, Look::WordChar); 531 | self.extra_look(Look::NotWordChar, Look::NotWordChar); 532 | }, 533 | Repeat { ref e, r, greedy } => self.add_repeat(e, r, greedy), 534 | 535 | // We don't support capture groups, so there is no need to keep track of 536 | // the group name or number. 537 | Group { ref e, .. } => self.add_expr(e), 538 | 539 | } 540 | } 541 | } 542 | 543 | #[cfg(test)] 544 | mod tests { 545 | use look::Look; 546 | use nfa::{Accept, NoLooks, Nfa, StateIdx}; 547 | use nfa::tests::{re_nfa, trans_nfa}; 548 | 549 | // Creates an Nfa with the given transitions, with initial state zero, and with the final 550 | // state the only accepting state. 
551 | fn trans_nfa_extra(size: usize, transitions: &[(StateIdx, StateIdx, char)]) 552 | -> Nfa { 553 | let mut ret: Nfa = trans_nfa(size, transitions); 554 | 555 | ret.states[size-1].accept = Accept::Always; 556 | ret.init.push((Look::Full, 0)); 557 | ret 558 | } 559 | 560 | #[test] 561 | fn single() { 562 | let nfa = re_nfa("a"); 563 | let target = trans_nfa_extra(2, &[(0, 1, 'a')]); 564 | 565 | assert_eq!(nfa, target); 566 | } 567 | 568 | #[test] 569 | fn alternate() { 570 | let nfa = re_nfa("a|b"); 571 | let mut target = trans_nfa_extra(3, &[(0, 2, 'a'), (1, 2, 'b')]); 572 | target.init.push((Look::Full, 1)); 573 | 574 | assert_eq!(nfa, target); 575 | } 576 | 577 | // TODO: once remove_looks supports laziness, test it. 578 | 579 | #[test] 580 | fn plus() { 581 | let nfa = re_nfa("a+"); 582 | // It's possible to generate a smaller NFA for '+', but we don't currently do it. 583 | let target = trans_nfa_extra(3, &[(0, 1, 'a'), (0, 2, 'a'), (1, 1, 'a'), (1, 2, 'a')]); 584 | 585 | assert_eq!(nfa, target); 586 | } 587 | 588 | #[test] 589 | fn star() { 590 | let nfa = re_nfa("a*"); 591 | // It's possible to generate a smaller NFA for '*', but we don't currently do it. 
592 | let mut target = trans_nfa_extra(2, &[(0, 0, 'a'), (0, 1, 'a')]); 593 | target.init.push((Look::Full, 1)); 594 | 595 | assert_eq!(nfa, target); 596 | } 597 | 598 | #[test] 599 | fn rep_fixed() { 600 | assert_eq!(re_nfa("a{3}"), re_nfa("aaa")); 601 | } 602 | 603 | #[test] 604 | fn rep_range() { 605 | assert_eq!(re_nfa("a{2,4}"), re_nfa("aaa{0,2}")); 606 | } 607 | 608 | #[test] 609 | fn sequence() { 610 | let nfa = re_nfa("ab"); 611 | let target = trans_nfa_extra(3, &[(0, 1, 'a'), (1, 2, 'b')]); 612 | 613 | assert_eq!(nfa, target); 614 | } 615 | 616 | #[test] 617 | fn anchored_start() { 618 | let nfa = re_nfa("^a"); 619 | let mut target = trans_nfa(2, &[(0, 1, 'a')]); 620 | target.init.push((Look::Boundary, 0)); 621 | target.states[1].accept = Accept::Always; 622 | 623 | assert_eq!(nfa, target); 624 | } 625 | 626 | #[test] 627 | fn anchored_end() { 628 | let nfa = re_nfa("a$"); 629 | let mut target = trans_nfa_extra(2, &[(0, 1, 'a')]); 630 | target.states[1].accept = Accept::AtEoi; 631 | target.states[1].accept_look = Look::Boundary; 632 | target.states[1].accept_state = 1; 633 | 634 | assert_eq!(nfa, target); 635 | } 636 | 637 | #[test] 638 | fn word_boundary_start() { 639 | let nfa = re_nfa(r"\ba"); 640 | let mut target = trans_nfa(2, &[(1, 0, 'a')]); 641 | target.init.push((Look::NotWordChar, 1)); 642 | target.states[0].accept = Accept::Always; 643 | 644 | assert_eq!(nfa, target); 645 | } 646 | 647 | #[test] 648 | fn word_boundary_end() { 649 | let nfa = re_nfa(r"a\b"); 650 | let mut target = trans_nfa_extra(3, &[(0, 1, 'a')]); 651 | for range in Look::NotWordChar.as_set().ranges() { 652 | target.add_transition(1, 2, range); 653 | } 654 | target.states[1].accept = Accept::AtEoi; 655 | target.states[1].accept_look = Look::Boundary; 656 | target.states[1].accept_state = 1; 657 | target.states[2].accept = Accept::Always; 658 | target.states[2].accept_look = Look::NotWordChar; 659 | target.states[2].accept_state = 1; 660 | target.states[2].accept_tokens = 1; 661 
| 662 | assert_eq!(nfa, target); 663 | } 664 | 665 | #[test] 666 | fn word_boundary_ambiguous() { 667 | let nfa = re_nfa(r"\b(a| )"); 668 | let mut target = trans_nfa(3, &[(1, 0, ' '), (2, 0, 'a')]); 669 | target.states[0].accept = Accept::Always; 670 | target.init.push((Look::WordChar, 1)); 671 | target.init.push((Look::NotWordChar, 2)); 672 | 673 | assert_eq!(nfa, target); 674 | } 675 | 676 | #[test] 677 | fn empty() { 678 | assert_eq!(re_nfa(""), trans_nfa_extra(1, &[])); 679 | } 680 | } 681 | 682 | -------------------------------------------------------------------------------- /src/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use look::Look; 10 | use num_traits::PrimInt; 11 | use range_map::{Range, RangeMultiMap}; 12 | use std::fmt::{self, Debug, Formatter}; 13 | use std::marker::PhantomData; 14 | 15 | mod has_looks; 16 | mod no_looks; 17 | 18 | // TODO: it would be nice to make StateIdx a new type instead of a type alias. The problem is that 19 | // we need to be able to index Vecs with it, and we can't impl Index for Vec 20 | // because of coherence rules. 21 | pub type StateIdx = usize; 22 | 23 | /// How we represent a set of states. The two important criteria are: 24 | /// 25 | /// - it should be reasonably fast even when there are thousands of states (this knocks out 26 | /// BitSet), and 27 | /// - it should be hashable (this knocks out HashSet). 28 | /// 29 | /// Note that efficient insertion and O(1) queries are not important. Therefore, we use a sorted 30 | /// Vec. (But be careful to keep it sorted!) 
31 | pub type StateSet = Vec; 32 | 33 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] 34 | struct LookPair { 35 | pub behind: Look, 36 | pub ahead: Look, 37 | pub target_state: StateIdx, 38 | } 39 | 40 | impl LookPair { 41 | fn is_empty(&self) -> bool { 42 | self.behind == Look::Empty || self.ahead == Look::Empty 43 | } 44 | 45 | fn intersection(&self, other: &LookPair) -> LookPair { 46 | LookPair { 47 | behind: self.behind.intersection(&other.behind), 48 | ahead: self.ahead.intersection(&other.ahead), 49 | target_state: self.target_state, 50 | } 51 | } 52 | } 53 | 54 | /// The enum for determining whether a state is accepting. Classical NFAs would only allow `Never` 55 | /// and `Always` here, but we also allow `AtEoi`, which means that the state should accept if and 56 | /// only if we've reached the end of the input. 57 | #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 58 | pub enum Accept { 59 | Never, 60 | AtEoi, 61 | Always, 62 | } 63 | 64 | #[derive(Clone, Eq, PartialEq)] 65 | struct State { 66 | accept: Accept, 67 | // If accept == Always and accept_tokens > 0, then we had to do some look-ahead in order to 68 | // determine that we have a match. In that case, accept_state is the index of the state that 69 | // should have accepted. 70 | accept_state: StateIdx, 71 | // In the case that we had some look-ahead, `accept_look` says what kind of char was involved 72 | // in the look-ahead, and `accept_tokens` says how many input tokens were consumed while 73 | // looking ahead. There are some restrictions on these values: 74 | // - if accept is Never then accept_look is Full and accept_tokens is zero 75 | // - if accept is AtEoi then accept_look is Boundary and accept_tokens is zero 76 | // - accept_look is never Empty 77 | // - if accept is Always then accept_look is neither empty nor Boundary 78 | // - if Tok is u32 then accept_tokens is either 0 or 1 79 | // - if Tok is u8 then accept_tokens is at most 4. 
80 | accept_look: Look, 81 | accept_tokens: u8, 82 | 83 | // The transitions that consume input. 84 | consuming: RangeMultiMap, 85 | // Transitions that do not consume input, but that are allowed to look forward and backward one 86 | // token. 87 | looking: Vec, 88 | } 89 | 90 | /// A non-deterministic finite automaton. 91 | /// 92 | /// `Tok` is the type of symbol that the automaton consumes. For most operations, only `u8` and 93 | /// `u32` are supported. 94 | /// 95 | /// Match priority 96 | /// ============== 97 | /// 98 | /// The whole point of a *non-deterministic* finite automaton is that it can match an input in 99 | /// multiple ways. This implementation supports match priorities, meaning that in the event of 100 | /// multiple matches, there is exactly one that is preferred to all the others. In this 101 | /// implementation, the transitions out of each state are ordered and we prefer a match that makes 102 | /// an earlier transition over one that makes a later one. 103 | /// 104 | /// The `Variant` parameter 105 | /// ======================= 106 | /// 107 | /// There are basically two versions of this struct with different representations and invariants, 108 | /// but they share enough code in common that it made more sense to write one struct and use a type 109 | /// parameter to determine which version it is. This is the meaning of the `Variant` type 110 | /// parameter, and it has two possible values: `HasLooks` and `NoLooks`. 111 | /// 112 | /// If `Variant == HasLooks` then the `init` field is unused. The only legal values for a state's 113 | /// `accept` field are `Always` and `Never`, and all the `accept_*` fields are unused. The 114 | /// automaton implicitly has a single initial state (state 0). Methods specific to 115 | /// `Nfa<_, HasLooks>` are in `has_looks.rs`. 116 | /// 117 | /// If `Variant == NoLooks` then the states' `looking` fields are unused. 
Initial states are 118 | /// explicitly given in `init` and in the states' `accept.*` fields. Methods specific to 119 | /// `Nfa<_, NoLooks>` are in `no_looks.rs`. 120 | /// 121 | /// The typical life-cycle of an `Nfa` is as follows: 122 | /// 123 | /// - First, create an `Nfa` using `from_regex`. 124 | /// - Call `nfa.remove_looks()` to turn the `Nfa` to an `Nfa`. 125 | /// - Call `nfa.byte_me()` to turn the `Nfa` into an `Nfa`. 126 | /// - Call one of the `nfa.determinize_*()` methods to make a `Dfa`. 127 | /// 128 | /// There are also some operations modifying `Nfa` that can be called between the last 129 | /// two steps. 130 | #[derive(Clone, Eq, PartialEq)] 131 | pub struct Nfa { 132 | states: Vec>, 133 | // The various possible sets of states that the automaton can start in, depending on what the 134 | // most recent `char` of input was. 135 | // 136 | // We decide the initial state by looking at the previous char of input. For every element of 137 | // `self.init` whose first entry matches that char, we start in the corresponding NFA state. 138 | // Note that these states are ordered: states that appear earlier are given higher priority for 139 | // matching. 140 | init: Vec<(Look, StateIdx)>, 141 | phantom: PhantomData, 142 | } 143 | 144 | pub trait Lookability {} 145 | 146 | #[derive(Copy, Clone, Debug, PartialEq)] 147 | pub struct HasLooks; 148 | #[derive(Copy, Clone, Debug, PartialEq)] 149 | pub struct NoLooks; 150 | 151 | impl Lookability for HasLooks {} 152 | impl Lookability for NoLooks {} 153 | 154 | impl Nfa { 155 | pub fn new() -> Nfa { 156 | Nfa::with_capacity(0) 157 | } 158 | 159 | /// Creates a new `Nfa` that can `add_state()` `n` times without re-allocating. 160 | pub fn with_capacity(n: usize) -> Nfa { 161 | Nfa { 162 | states: Vec::with_capacity(n), 163 | init: Vec::new(), 164 | phantom: PhantomData, 165 | } 166 | } 167 | 168 | /// Returns my consuming transitions, but with the source and destination swapped. 
169 | /// 170 | /// If I have a transition from state `i` to state `j` that consumes token `c`, then 171 | /// `ret[j]` will contain a mapping from `c` to `i`, where `ret` is the value returned by this 172 | /// method. 173 | /// 174 | /// Note that information about match priorities is lost. 175 | pub fn reversed_transitions(&self) -> Vec> { 176 | let mut ret = vec![RangeMultiMap::new(); self.states.len()]; 177 | 178 | for (source_idx, st) in self.states.iter().enumerate() { 179 | for &(range, target_idx) in st.consuming.ranges_values() { 180 | ret[target_idx].insert(range, source_idx); 181 | } 182 | } 183 | ret 184 | } 185 | 186 | /// Adds a new state and returns its index. 187 | pub fn add_state(&mut self, accept: Accept) -> StateIdx { 188 | let state_idx = self.states.len(); 189 | self.states.push(State { 190 | accept: accept, 191 | accept_state: state_idx, 192 | accept_look: if accept == Accept::AtEoi { Look::Boundary } else { Look::Full }, 193 | accept_tokens: 0, 194 | consuming: RangeMultiMap::new(), 195 | looking: Vec::new(), 196 | }); 197 | state_idx 198 | } 199 | 200 | /// Adds a new state and returns its index. 201 | /// 202 | /// The new state is always accepting; it represents the case that we accept after looking 203 | /// ahead a few tokens. 204 | pub fn add_look_ahead_state(&mut self, look: Look, tokens: u8, accept_state: StateIdx) 205 | -> StateIdx { 206 | debug_assert!(look != Look::Boundary && look != Look::Full && look != Look::Empty); 207 | debug_assert!(tokens > 0); 208 | 209 | let state_idx = self.states.len(); 210 | self.states.push(State { 211 | accept: Accept::Always, 212 | accept_state: accept_state, 213 | accept_look: look, 214 | accept_tokens: tokens, 215 | consuming: RangeMultiMap::new(), 216 | looking: Vec::new(), 217 | }); 218 | state_idx 219 | } 220 | 221 | /// Adds a transition that moves from `source` to `target` on consuming a token in `range`. 
222 | pub fn add_transition(&mut self, source: StateIdx, target: StateIdx, range: Range) { 223 | self.states[source].consuming.insert(range, target); 224 | } 225 | 226 | /// Returns the set of consuming transitions out of the given state. 227 | pub fn consuming(&self, i: StateIdx) -> &RangeMultiMap { 228 | &self.states[i].consuming 229 | } 230 | 231 | /// Returns the number of states. 232 | pub fn num_states(&self) -> usize { 233 | self.states.len() 234 | } 235 | 236 | // You've just done some operation that has changed state indices (probably by deleting 237 | // un-needed states). Now re-label the existing transitions according to the new state indices. 238 | fn map_states(&mut self, map: F) where F: Fn(StateIdx) -> Option { 239 | for st in &mut self.states { 240 | st.consuming.retain_values(|x| map(*x).is_some()); 241 | // The unwrap is ok because we've just filtered all the `None`s (and `map` is Fn, not 242 | // FnMut). 243 | st.consuming.map_values(|x| map(*x).unwrap()); 244 | 245 | st.looking = st.looking.iter() 246 | .filter(|look| map(look.target_state).is_some()) 247 | // The unwrap is ok because we've just filtered all the `None`s. 248 | .map(|look| LookPair { target_state: map(look.target_state).unwrap(), .. *look }) 249 | .collect(); 250 | 251 | st.accept_state = map(st.accept_state).expect("bug in map_states"); 252 | } 253 | 254 | self.init = self.init.iter() 255 | .filter_map(|pair| map(pair.1).map(|idx| (pair.0, idx))) 256 | .collect(); 257 | } 258 | 259 | // Changes the `Lookability` marker without allocating anything. 260 | fn transmuted(self) -> Nfa { 261 | Nfa { 262 | states: self.states, 263 | init: self.init, 264 | phantom: PhantomData, 265 | } 266 | } 267 | 268 | /// Returns true if this Nfa only matches things at the beginning of the input. 269 | pub fn is_anchored(&self) -> bool { 270 | self.init.iter().all(|pair| pair.0 == Look::Boundary) 271 | } 272 | 273 | /// Returns true if this Nfa never matches anything. 
274 | pub fn is_empty(&self) -> bool { 275 | self.states.is_empty() 276 | } 277 | } 278 | 279 | impl Debug for Nfa { 280 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 281 | try!(f.write_fmt(format_args!("Nfa ({} states):\n", self.states.len()))); 282 | 283 | try!(f.write_fmt(format_args!("Init: {:?}\n", self.init))); 284 | 285 | for (st_idx, st) in self.states.iter().enumerate().take(40) { 286 | try!(f.write_fmt(format_args!("\tState {} ({:?}):\n", st_idx, st.accept))); 287 | 288 | if st.accept != Accept::Never { 289 | try!(f.write_fmt(format_args!("\t\tlook {:?}, tokens {:?}, state {:?}\n", 290 | st.accept_look, st.accept_tokens, st.accept_state))); 291 | } 292 | if !st.consuming.is_empty() { 293 | try!(f.write_str("\t\tConsuming:\n")); 294 | // Cap it at 10 transitions, since it gets unreadable otherwise. 295 | for &(range, target) in st.consuming.ranges_values().take(10) { 296 | try!(f.write_fmt(format_args!("\t\t\t{:?} -- {:?} => {}\n", 297 | range.start, range.end, target))); 298 | } 299 | if st.consuming.num_ranges() > 10 { 300 | try!(f.write_str("\t\t\t...\n")); 301 | } 302 | } 303 | if !st.looking.is_empty() { 304 | try!(f.write_str("\t\tLooking:\n")); 305 | for look in &st.looking { 306 | try!(f.write_fmt(format_args!("\t\t\t({:?},{:?}) => {}\n", 307 | look.behind, look.ahead, look.target_state))); 308 | } 309 | } 310 | } 311 | if self.states.len() > 40 { 312 | try!(f.write_fmt(format_args!("\t... ({} more states)\n", self.states.len() - 40))); 313 | } 314 | Ok(()) 315 | } 316 | } 317 | 318 | #[cfg(test)] 319 | pub mod tests { 320 | use nfa::{Accept, NoLooks, Nfa, StateIdx}; 321 | use num_traits::PrimInt; 322 | use range_map::Range; 323 | use std::fmt::Debug; 324 | 325 | // Creates an Nfa from a regular expression string. 
326 | pub fn re_nfa(re: &str) -> Nfa { 327 | let nfa = Nfa::from_regex(re).unwrap(); 328 | println!("before remove looks: {:?}", nfa); 329 | let nfa = nfa.remove_looks(); 330 | println!("after remove looks: {:?}", nfa); 331 | nfa 332 | //Nfa::from_regex(re).unwrap().remove_looks() 333 | } 334 | 335 | // Creates an Nfa with the given transitions. 336 | pub fn trans_range_nfa(size: usize, transitions: &[(StateIdx, StateIdx, Range)]) 337 | -> Nfa 338 | where Tok: Debug + PrimInt { 339 | let mut ret: Nfa = Nfa::with_capacity(size); 340 | for _ in 0..size { 341 | ret.add_state(Accept::Never); 342 | } 343 | for &(src, tgt, range) in transitions { 344 | ret.add_transition(src, tgt, range); 345 | } 346 | ret 347 | } 348 | 349 | // Creates an Nfa with the given transitions, each of which only takes a single char. 350 | pub fn trans_nfa(size: usize, transitions: &[(StateIdx, StateIdx, char)]) 351 | -> Nfa 352 | where Tok: Debug + PrimInt { 353 | let tok = |x: char| -> Tok { Tok::from(x as u32).unwrap() }; 354 | let range_trans: Vec<_> = transitions.iter() 355 | .map(|x| (x.0, x.1, Range::new(tok(x.2), tok(x.2)))) 356 | .collect(); 357 | trans_range_nfa(size, &range_trans) 358 | } 359 | } 360 | 361 | -------------------------------------------------------------------------------- /src/nfa/no_looks.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use dfa::Dfa; 10 | use error::Error; 11 | use itertools::Itertools; 12 | use look::Look; 13 | use nfa::{Accept, Nfa, NoLooks, State, StateIdx, StateSet}; 14 | use num_traits::PrimInt; 15 | use range_map::{Range, RangeMap, RangeMultiMap}; 16 | use std::{char, u8, usize}; 17 | use std::cmp::max; 18 | use std::collections::{HashMap, HashSet}; 19 | use std::fmt::Debug; 20 | use std::marker::PhantomData; 21 | use std::mem::swap; 22 | use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences}; 23 | 24 | // This provides a more compact way of representing UTF-8 sequences. 25 | // 26 | // A sequence of bytes belongs to this set if its first byte is in `head[0]`, its second byte is 27 | // in `head[1]`, etc., and its last byte belongs to one of the ranges in `last_byte`. 28 | // 29 | // This representation is handy for making NFAs because compared to the representation in 30 | // `Utf8Sequences`, it adds many fewer states. Basically, we are doing some crude minimization 31 | // before creating the states. 32 | struct MergedUtf8Sequences { 33 | pub head: Vec, 34 | pub last_byte: Vec, 35 | } 36 | 37 | // Returns this range as a pair of chars, or none if this is an empty range. 38 | fn to_char_pair(r: Range) -> Option<(char, char)> { 39 | // Round up self.start to the nearest legal codepoint. 40 | let start = if r.start > 0xD7FF && r.start < 0xE000 { 41 | 0xE000 42 | } else { 43 | r.start 44 | }; 45 | 46 | // Round down self.end. 47 | let end = if r.end > 0x10FFFF { 48 | 0x10FFFF 49 | } else if r.end < 0xE000 && r.end > 0xD7FF { 50 | 0xD7FF 51 | } else { 52 | r.end 53 | }; 54 | 55 | if start > end { 56 | None 57 | } else { 58 | Some((char::from_u32(start).unwrap(), char::from_u32(end).unwrap())) 59 | } 60 | } 61 | 62 | impl MergedUtf8Sequences { 63 | // Panics if not all the input sequences have the same leading byte ranges. 
64 | fn merge(iter: I) -> MergedUtf8Sequences where I: Iterator { 65 | let mut head = Vec::new(); 66 | let mut last_byte = Vec::new(); 67 | 68 | for seq in iter { 69 | let len = seq.len(); 70 | let h = &seq.as_slice()[..len-1]; 71 | if head.is_empty() { 72 | head.extend_from_slice(h); 73 | } else if &head[..] != h { 74 | panic!("invalid sequences to merge"); 75 | } 76 | 77 | last_byte.push(seq.as_slice()[len-1]); 78 | } 79 | 80 | MergedUtf8Sequences { 81 | head: head, 82 | last_byte: last_byte, 83 | } 84 | } 85 | 86 | fn from_sequences<'a, I>(iter: I) -> Box + 'a> 87 | where I: Iterator + 'a { 88 | fn head(u: &Utf8Sequence) -> Vec { 89 | let len = u.len(); 90 | u.as_slice()[..len-1].to_owned() 91 | } 92 | 93 | Box::new(iter 94 | .group_by(head) 95 | .into_iter() 96 | .map(|(_, seqs)| MergedUtf8Sequences::merge(seqs.into_iter()))) 97 | } 98 | 99 | fn from_ranges<'a, I>(iter: I) -> Box + 'a> 100 | where I: Iterator> + 'a { 101 | MergedUtf8Sequences::from_sequences( 102 | iter.filter_map(to_char_pair) 103 | .flat_map(|r| Utf8Sequences::new(r.0, r.1))) 104 | } 105 | 106 | fn num_bytes(&self) -> u8 { 107 | (self.head.len() + 1) as u8 108 | } 109 | } 110 | 111 | // Creates a byte-based Dfa that matches all the chars in `look.as_set()`. 112 | fn make_char_dfa(look: Look) -> Dfa<(Look, u8)> { 113 | let mut nfa: Nfa = Nfa::with_capacity(2); 114 | nfa.add_state(Accept::Never); 115 | nfa.add_look_ahead_state(look, 1, 0); 116 | // TODO: shouldn't adding both Full and Boundary be redundant? 117 | nfa.init.push((Look::Full, 0)); 118 | nfa.init.push((Look::Boundary, 0)); 119 | nfa.states[0].consuming 120 | = RangeMultiMap::from_vec(look.as_set().ranges().map(|x| (x, 1)).collect()); 121 | 122 | // These unwraps are OK because the only failures are caused by having too many states. 123 | nfa.byte_me(usize::MAX).unwrap() 124 | .determinize(usize::MAX).unwrap() 125 | .optimize() 126 | } 127 | 128 | // Creates a byte-based Dfa that matches backwards all the chars in `look.as_set()`. 
fn make_rev_char_dfa(look: Look) -> Dfa<(Look, u8)> {
    let mut nfa: Nfa<u8, NoLooks> = Nfa::with_capacity(0); // TODO: better capacity
    nfa.add_state(Accept::Never);
    nfa.init.push((Look::Full, 0));
    nfa.init.push((Look::Boundary, 0));

    // This is more-or-less C&P from add_utf8_sequence.
    for seq in MergedUtf8Sequences::from_ranges(look.as_set().ranges()) {
        let mut last_state = nfa.add_state(Accept::Never);

        // Since we match in reverse, the last byte of each sequence is consumed first...
        for range in &seq.last_byte {
            nfa.add_transition(0, last_state, Range::new(range.start, range.end));
        }
        // ...followed by the head bytes in reverse order.
        for range in seq.head.iter().rev() {
            let cur_state = nfa.add_state(Accept::Never);

            nfa.add_transition(last_state, cur_state, Range::new(range.start, range.end));
            last_state = cur_state;
        }

        nfa.states[last_state].accept = Accept::Always;
        nfa.states[last_state].accept_look = look;
        nfa.states[last_state].accept_state = 0;
        nfa.states[last_state].accept_tokens = seq.num_bytes();
    }

    // This unwrap is OK because the only failures are caused by having too many states.
    nfa.determinize(usize::MAX).unwrap()
        .optimize()
}

// We cache optimized Dfas for the expensive looks. See `Nfa::add_min_utf8_sequences`
// for an explanation.
lazy_static! {
    static ref WORD_CHAR_DFA: Dfa<(Look, u8)> = make_char_dfa(Look::WordChar);
    static ref NOT_WORD_CHAR_DFA: Dfa<(Look, u8)> = make_char_dfa(Look::NotWordChar);
    static ref REV_WORD_CHAR_DFA: Dfa<(Look, u8)> = make_rev_char_dfa(Look::WordChar);
    static ref REV_NOT_WORD_CHAR_DFA: Dfa<(Look, u8)> = make_rev_char_dfa(Look::NotWordChar);
}

impl<Tok: Debug + PrimInt> Nfa<Tok, NoLooks> {
    // Returns the set of all states that can be reached from some initial state.
171 | fn reachable_from(&self, states: I) -> HashSet where I: Iterator { 172 | let mut active: HashSet = states.collect(); 173 | let mut next_active: HashSet = HashSet::new(); 174 | let mut ret = active.clone(); 175 | 176 | while !active.is_empty() { 177 | for &s in &active { 178 | for &(_, t) in self.states[s].consuming.ranges_values() { 179 | if !ret.contains(&t) { 180 | ret.insert(t); 181 | next_active.insert(t); 182 | } 183 | } 184 | } 185 | swap(&mut active, &mut next_active); 186 | next_active.clear(); 187 | } 188 | ret 189 | } 190 | 191 | // Reverses this Nfa, but only the transitions (i.e. doesn't do anything about initial and 192 | // final states). 193 | fn reversed_simple(&self) -> Nfa { 194 | let rev_transitions = self.reversed_transitions(); 195 | let mut ret: Nfa = Nfa::with_capacity(self.states.len()); 196 | 197 | for trans in rev_transitions { 198 | let idx = ret.add_state(Accept::Never); 199 | ret.states[idx].consuming = trans; 200 | } 201 | 202 | ret 203 | } 204 | 205 | // Returns the set of all states that can be reached from an initial state and that can reach 206 | // some accepting state. 207 | fn reachable_states(&self) -> HashSet { 208 | let init_states = self.init.iter().map(|pair| pair.1); 209 | let final_states = self.states.iter().enumerate() 210 | .filter(|&(_, state)| state.accept != Accept::Never) 211 | .map(|(idx, _)| idx); 212 | 213 | let forward = self.reachable_from(init_states); 214 | let backward = self.reversed_simple().reachable_from(final_states); 215 | forward.intersection(&backward).cloned().collect() 216 | } 217 | 218 | /// Optimizes this Nfa by removing all states that cannot be reached from an initial state 219 | /// and all states that cannot lead to an accepting state. 
220 | pub fn trim_unreachable(&mut self) { 221 | let reachable = self.reachable_states(); 222 | 223 | let mut old_states = Vec::new(); 224 | swap(&mut self.states, &mut old_states); 225 | let mut old_to_new = vec![None; old_states.len()]; 226 | 227 | let (new_to_old, new_states): (Vec<_>, Vec>) = old_states.into_iter() 228 | .enumerate() 229 | .filter(|&(i, _)| reachable.contains(&i)) 230 | .unzip(); 231 | self.states = new_states; 232 | 233 | for (new, &old) in new_to_old.iter().enumerate() { 234 | old_to_new[old] = Some(new); 235 | } 236 | 237 | self.map_states(|s| old_to_new[s]); 238 | } 239 | 240 | // Returns an `Accept` that will accept whenever anything in `states` would accept. 241 | fn accept_union(&self, states: &StateSet) -> Accept { 242 | states.iter().map(|s| self.states[*s].accept).max().unwrap_or(Accept::Never) 243 | } 244 | } 245 | 246 | impl Nfa { 247 | /// Converts this `Nfa` into one that consumes the input byte-by-byte. 248 | pub fn byte_me(self, max_states: usize) -> ::Result> { 249 | let mut ret = Nfa:: { 250 | states: self.states.iter().map(|s| State { 251 | accept: s.accept, 252 | accept_look: s.accept_look, 253 | accept_state: s.accept_state, 254 | accept_tokens: s.accept_tokens, 255 | consuming: RangeMultiMap::new(), 256 | looking: Vec::new(), 257 | }).collect(), 258 | init: self.init, 259 | phantom: PhantomData, 260 | }; 261 | 262 | for (i, state) in self.states.into_iter().enumerate() { 263 | // Group transitions by the target state, and add them in batches. Most of the time, we 264 | // can merge a bunch of Utf8Sequences before adding them, which saves a bunch of 265 | // states. 266 | for (tgt, transitions) in state.consuming.ranges_values().group_by(|x| x.1) { 267 | try!(ret.add_utf8_sequences(i, transitions.into_iter().map(|x| x.0), tgt, max_states)); 268 | } 269 | } 270 | Ok(ret) 271 | } 272 | } 273 | 274 | impl Nfa { 275 | /// Converts this `Nfa` into a `Dfa`. 
276 | pub fn determinize(&self, max_states: usize) -> ::Result> { 277 | Determinizer::determinize(self, max_states, MatchChoice::TransitionOrder, self.init.clone()) 278 | } 279 | 280 | /// Converts this `Nfa` into a `Dfa`. 281 | /// 282 | /// Whenever this `Nfa` matches some text, the `Dfa` also will. But if this `Nfa` has multiple 283 | /// possible endpoints for a match then the returned `Dfa` is only guaranteed to match the 284 | /// longest one. 285 | pub fn determinize_longest(&self, max_states: usize) -> ::Result> { 286 | Determinizer::determinize(self, max_states, MatchChoice::LongestMatch, self.init.clone()) 287 | } 288 | 289 | /// Returns the reversal of this `Nfa`. 290 | /// 291 | /// If `self` matches some string of bytes, then the return value of this method will match 292 | /// the same strings of bytes reversed. 293 | /// 294 | /// Note that this loses information about match priorities. 295 | pub fn reverse(&self, max_states: usize) -> ::Result> { 296 | let mut ret = self.reversed_simple(); 297 | 298 | // Turn our initial states into ret's accepting states. 
299 | for &(look, i) in &self.init { 300 | match look { 301 | Look::Full => { 302 | ret.states[i].accept = Accept::Always; 303 | ret.states[i].accept_look = Look::Full; 304 | }, 305 | Look::Boundary => { 306 | ret.states[i].accept = max(ret.states[i].accept, Accept::AtEoi); 307 | ret.states[i].accept_look = max(ret.states[i].accept_look, Look::Boundary); 308 | }, 309 | Look::NewLine => { 310 | let accept_state = ret.add_look_ahead_state(Look::NewLine, 1, i); 311 | ret.add_transition(i, accept_state, Range::new(b'\n', b'\n')); 312 | ret.states[i].accept = max(ret.states[i].accept, Accept::AtEoi); 313 | ret.states[i].accept_look = max(ret.states[i].accept_look, Look::Boundary); 314 | }, 315 | Look::WordChar | Look::NotWordChar => { 316 | // It would make more sense to put this outside the loop, but having it inside 317 | // prevents a deadlock: constructing REV_*_DFA ends up calling reverse(), but 318 | // with no look-ahead so it never gets inside this loop. 319 | let dfa: &Dfa<_> = if look == Look::WordChar { 320 | &REV_WORD_CHAR_DFA 321 | } else { 322 | ret.states[i].accept = max(ret.states[i].accept, Accept::AtEoi); 323 | ret.states[i].accept_look = max(ret.states[i].accept_look, Look::Boundary); 324 | &REV_NOT_WORD_CHAR_DFA 325 | }; 326 | let accept_state = ret.add_look_ahead_state(look, 1, i); 327 | try!(ret.add_min_utf8_sequences(i, dfa, accept_state, max_states)); 328 | }, 329 | Look::Empty => { 330 | panic!("Empty cannot be an init look"); 331 | }, 332 | } 333 | } 334 | 335 | // Turn our accepting states into ret's initial states. 336 | ret.init.clear(); 337 | for st in &self.states { 338 | if st.accept != Accept::Never { 339 | ret.init.push((st.accept_look, st.accept_state)); 340 | } 341 | } 342 | Ok(ret) 343 | } 344 | 345 | /// Can we accept immediately if the beginning of the input matches `look`? 
346 | fn init_accept(&self, look: Look) -> Accept { 347 | let set = self.init.iter() 348 | .filter(|pair| look <= pair.0) 349 | .map(|pair| pair.1) 350 | .collect::>(); 351 | self.accept_union(&set) 352 | } 353 | 354 | /// This essentially modifies `self` by adding a `^.*` at the beginning. 355 | /// 356 | /// The result is actually a little bit different, because `.` matches a whole code point, 357 | /// whereas the `^.*` that we add works at the byte level. 358 | pub fn anchor(mut self, max_states: usize) -> ::Result> { 359 | let loop_accept = self.init_accept(Look::Full); 360 | let loop_state = self.add_state(loop_accept); 361 | let init_accept = self.init_accept(Look::Boundary); 362 | let init_state = self.add_state(init_accept); 363 | 364 | // Swap out init so that we can iterate over it while modifying `self`. 365 | let mut init = Vec::new(); 366 | swap(&mut init, &mut self.init); 367 | 368 | for &(look, st_idx) in &init { 369 | if look.allows_eoi() { 370 | // TODO: shouldn't need to clone here. 
371 | for &(range, target) in self.states[st_idx].consuming.clone().ranges_values() { 372 | self.add_transition(init_state, target, range); 373 | } 374 | } 375 | 376 | match look { 377 | Look::Boundary => {}, 378 | Look::Full => { 379 | for &(range, target) in self.states[st_idx].consuming.clone().ranges_values() { 380 | self.add_transition(loop_state, target, range); 381 | } 382 | }, 383 | Look::NewLine => { 384 | self.add_transition(init_state, st_idx, Range::new(b'\n', b'\n')); 385 | self.add_transition(loop_state, st_idx, Range::new(b'\n', b'\n')); 386 | }, 387 | Look::WordChar | Look::NotWordChar => { 388 | let dfa: &Dfa<_> = 389 | if look == Look::WordChar { &WORD_CHAR_DFA } else { &NOT_WORD_CHAR_DFA }; 390 | 391 | try!(self.add_min_utf8_sequences(loop_state, dfa, st_idx, max_states)); 392 | try!(self.add_min_utf8_sequences(init_state, dfa, st_idx, max_states)); 393 | }, 394 | Look::Empty => { 395 | panic!("Cannot start with an empty look"); 396 | }, 397 | } 398 | 399 | // Once we've found an init state that accepts immediately, don't look for any others 400 | // (since any matches that we find starting from them are lower priority that the one 401 | // we've found already). This check is *almost* unnecessary, since similar pruning 402 | // happens when we turn the NFA into a DFA. The important case that needs to be handled 403 | // here is the case that a high-priority init state has no transitions out of it. Such 404 | // a state will be completely removed by this function, and so we need to acknowledge 405 | // its existence here. 406 | if self.states[st_idx].accept == Accept::Always { 407 | break; 408 | } 409 | } 410 | 411 | // Wire up the initial and loop states, but only if they aren't accepting. That's because 412 | // if they are accepting then the accept should take priority over the transition (since 413 | // making the transition means that we are searching for a match that starts later). 
414 | if init_accept != Accept::Always { 415 | self.add_transition(init_state, loop_state, Range::full()); 416 | } 417 | if loop_accept != Accept::Always { 418 | self.add_transition(loop_state, loop_state, Range::full()); 419 | } 420 | 421 | // The new Nfa is only allowed to start at the beginning of the input, and only at the new 422 | // initial state. 423 | self.init.push((Look::Boundary, init_state)); 424 | self.trim_unreachable(); 425 | Ok(self) 426 | } 427 | 428 | // This does the same thing as add_utf8_sequences, but it gets the transitions from a dfa, 429 | // which should have zero as its only starting state, and for which every accepting state 430 | // should be Accept::Always. 431 | // 432 | // This is probably used in conjunction with make_char_dfa, which ends up having the same 433 | // effect as add_utf8_sequences, but adds fewer states. 434 | fn add_min_utf8_sequences( 435 | &mut self, 436 | start_state: StateIdx, 437 | dfa: &Dfa<(Look, u8)>, 438 | end_state: StateIdx, 439 | max_states: usize, 440 | ) -> ::Result<()> { 441 | let offset = self.states.len(); 442 | // If end_accept is true, then it isn't actually important that we end in state 443 | // `end_state`: we can create a new look_ahead state to end in. 
444 | let end_accept = self.states[end_state].accept_tokens > 0; 445 | 446 | if self.states.len() + dfa.num_states() > max_states { 447 | return Err(Error::TooManyStates); 448 | } 449 | for _ in 0..dfa.num_states() { 450 | self.add_state(Accept::Never); 451 | } 452 | for d_idx in 0..dfa.num_states() { 453 | let n_src = if d_idx == 0 { start_state } else { d_idx + offset }; 454 | for &(range, d_tgt) in dfa.transitions(d_idx).ranges_values() { 455 | let n_tgt = if dfa.accept(d_tgt) == &Accept::Always && !end_accept { 456 | end_state 457 | } else { 458 | let n_tgt = d_tgt + offset; 459 | self.states[n_tgt].accept = *dfa.accept(d_tgt); 460 | if let Some(&(look, bytes)) = dfa.ret(d_tgt) { 461 | self.states[n_tgt].accept_look = look; 462 | self.states[n_tgt].accept_state = start_state; 463 | self.states[n_tgt].accept_tokens = bytes; 464 | } 465 | n_tgt 466 | }; 467 | self.add_transition(n_src, n_tgt, range); 468 | } 469 | } 470 | 471 | Ok(()) 472 | } 473 | 474 | // Adds a path from `start_state` to `end_state` for all byte sequences matching `seq`. 475 | // 476 | // If `end_state` is a look-ahead state, makes a new accepting state instead (so that we know 477 | // how many bytes of look-ahead we used). 
478 | fn add_utf8_sequence( 479 | &mut self, 480 | start_state: StateIdx, 481 | mut end_state: StateIdx, 482 | seq: MergedUtf8Sequences 483 | ) { 484 | let mut last_state = start_state; 485 | for range in &seq.head { 486 | let cur_state = self.add_state(Accept::Never); 487 | 488 | self.add_transition(last_state, cur_state, Range::new(range.start, range.end)); 489 | last_state = cur_state; 490 | } 491 | 492 | if self.states[end_state].accept_tokens > 0 { 493 | let look = self.states[end_state].accept_look; 494 | let acc_state = self.states[end_state].accept_state; 495 | end_state = self.add_look_ahead_state(look, seq.num_bytes(), acc_state); 496 | } 497 | for range in &seq.last_byte { 498 | self.add_transition(last_state, end_state, Range::new(range.start, range.end)); 499 | } 500 | } 501 | 502 | // Adds a byte path from `start_state` to `end_state` for every char in `ranges`. 503 | fn add_utf8_sequences( 504 | &mut self, 505 | start_state: StateIdx, 506 | ranges: I, 507 | end_state: StateIdx, 508 | max_states: usize 509 | ) -> ::Result<()> 510 | where I: Iterator> { 511 | for m in MergedUtf8Sequences::from_ranges(ranges) { 512 | self.add_utf8_sequence(start_state, end_state, m); 513 | if self.states.len() > max_states { 514 | return Err(Error::TooManyStates); 515 | } 516 | } 517 | Ok(()) 518 | } 519 | 520 | // Finds the transitions out of the given set of states, as a RangeMap. 521 | fn transition_map(&self, states: &[StateIdx]) -> RangeMap> { 522 | let mut transitions = states.into_iter() 523 | .flat_map(|s| self.states[*s].consuming.ranges_values().cloned()) 524 | .collect::>() 525 | .group(); 526 | 527 | // `scratch` is large enough to be indexed by anything in `elts`. It is full of `false`. 528 | fn uniquify(elts: &mut Vec, scratch: &mut Vec) { 529 | elts.retain(|&e| { 530 | let ret = !scratch[e]; 531 | scratch[e] = true; 532 | ret 533 | }); 534 | 535 | // Clean up scratch, so that it is full of `false` again. 
536 | for e in elts { 537 | scratch[*e] = false; 538 | } 539 | } 540 | 541 | let mut scratch = vec![false; self.num_states()]; 542 | for pair in transitions.as_mut_slice() { 543 | uniquify(&mut pair.1, &mut scratch); 544 | } 545 | 546 | transitions 547 | } 548 | } 549 | 550 | #[derive(PartialEq)] 551 | enum MatchChoice { 552 | TransitionOrder, 553 | LongestMatch, 554 | } 555 | 556 | // This contains all the intermediate data structures that we need when turning an `Nfa` into a 557 | // `Dfa`. 558 | struct Determinizer<'a> { 559 | nfa: &'a Nfa, 560 | dfa: Dfa<(Look, u8)>, 561 | state_map: HashMap, 562 | active_states: Vec, 563 | max_states: usize, 564 | match_choice: MatchChoice, 565 | } 566 | 567 | impl<'a> Determinizer<'a> { 568 | // Turns an Nfa into an almost-equivalent (up to the difference between shortest and longest 569 | // matches) Dfa. 570 | // 571 | // `init` is a vector of length Look::num(). Each entry gives a set of initial states that 572 | // will be turned into the initial states of the dfa. 573 | fn determinize(nfa: &Nfa, 574 | max_states: usize, 575 | match_choice: MatchChoice, 576 | init: Vec<(Look, StateIdx)>) -> ::Result> { 577 | let mut det = Determinizer::new(nfa, max_states, match_choice); 578 | try!(det.run(init)); 579 | Ok(det.dfa) 580 | } 581 | 582 | fn new(nfa: &'a Nfa, 583 | max_states: usize, 584 | match_choice: MatchChoice) -> Determinizer<'a> { 585 | Determinizer { 586 | nfa: nfa, 587 | dfa: Dfa::new(), 588 | state_map: HashMap::new(), 589 | active_states: Vec::new(), 590 | max_states: max_states, 591 | match_choice: match_choice, 592 | } 593 | } 594 | 595 | // Checks whether we should accept in the given set of states. 596 | // 597 | // Returns a tuple: the first element says when we accept, the second says what look-ahead (if 598 | // any) led to us accepting, and the third says how many bytes of look-ahead we needed before 599 | // knowing that we can accept. 
600 | // 601 | // There is one annoying corner case: there could be two states in the set `s` with different 602 | // values of `accept_tokens`, where the higher priority state says `Accept::AtEoi` and the 603 | // lower priority state says `Accept::Always`. In this case, we return `(AtEoi, look, bytes)` 604 | // where `look` and `bytes` come from the lower priority state. This doesn't lose any 605 | // information, since if a state says `Accept::AtEoi` then its `accept_look` and 606 | // `accept_tokens` are guaranteed to be `Boundary` and `0`. 607 | fn accept(&self, s: &[StateIdx]) -> (Accept, Look, u8) { 608 | let mut accept_states = s.iter().cloned() 609 | .filter(|i| self.nfa.states[*i].accept != Accept::Never); 610 | let mut accept_always_states = s.iter().cloned() 611 | .filter(|i| self.nfa.states[*i].accept == Accept::Always); 612 | 613 | let (first_accept, other_accept) = if self.match_choice == MatchChoice::TransitionOrder { 614 | (accept_states.next(), accept_always_states.next()) 615 | } else { 616 | (accept_states.min_by_key(|i| self.nfa.states[*i].accept_tokens), 617 | accept_always_states.min_by_key(|i| self.nfa.states[*i].accept_tokens)) 618 | }; 619 | 620 | // Returns the intersection of state.accept_look over all states in s that accept 621 | // unconditionally and have the given number of look-ahead bytes. 622 | let look_intersection = |toks: u8| { 623 | s.iter().cloned() 624 | .filter(|i| self.nfa.states[*i].accept == Accept::Always) 625 | .filter(|i| self.nfa.states[*i].accept_tokens == toks) 626 | .fold(Look::Full, |x, y| x.intersection(&self.nfa.states[y].accept_look)) 627 | }; 628 | 629 | if let Some(first_accept) = first_accept { 630 | let st = &self.nfa.states[first_accept]; 631 | 632 | if st.accept == Accept::AtEoi { 633 | // Check if there is a lower-priority Accept::Always. 
634 | if let Some(other_accept) = other_accept { 635 | let other_st = &self.nfa.states[other_accept]; 636 | if other_st.accept_tokens > 0 { 637 | let look = look_intersection(other_st.accept_tokens); 638 | return (Accept::AtEoi, look, other_st.accept_tokens); 639 | } 640 | } 641 | (Accept::AtEoi, Look::Boundary, 0) 642 | } else { 643 | (Accept::Always, look_intersection(st.accept_tokens), st.accept_tokens) 644 | } 645 | } else { 646 | // There are no accepting states. 647 | (Accept::Never, Look::Empty, 0) 648 | } 649 | } 650 | 651 | // Tries to add a new state to the Dfa. 652 | // 653 | // If the state already exists, returns the index of the old one. If there are too many states, 654 | // returns an error. 655 | fn add_state(&mut self, mut s: StateSet) -> ::Result { 656 | // When we choose our matches by transition order, discard any states that have lower 657 | // priority than the best match we've found. 658 | if self.match_choice == MatchChoice::TransitionOrder { 659 | if let Some(accept_idx) = s.iter().position(|&i| self.nfa.states[i].accept == Accept::Always) { 660 | s.truncate(accept_idx + 1); 661 | } 662 | } 663 | 664 | if self.state_map.contains_key(&s) { 665 | Ok(*self.state_map.get(&s).unwrap()) 666 | } else if self.dfa.num_states() >= self.max_states { 667 | Err(Error::TooManyStates) 668 | } else { 669 | let (acc, look, bytes_ago) = self.accept(&s); 670 | let ret = if acc != Accept::Never { Some ((look, bytes_ago)) } else { None }; 671 | let new_state = self.dfa.add_state(acc, ret); 672 | 673 | self.active_states.push(s.clone()); 674 | self.state_map.insert(s, new_state); 675 | Ok(new_state) 676 | } 677 | } 678 | 679 | // Creates a deterministic automaton representing the same language as our `nfa`. 680 | // Puts the new Dfa in self.dfa. 
681 | fn run(&mut self, init: Vec<(Look, StateIdx)>) -> ::Result<()> { 682 | if self.nfa.states.is_empty() { 683 | return Ok(()); 684 | } 685 | 686 | for &look in Look::all() { 687 | let init_states: StateSet = init.iter().cloned() 688 | .filter(|&(x, _)| look == x) 689 | .map(|(_, y)| y) 690 | .collect(); 691 | if !init_states.is_empty() { 692 | let new_state_idx = try!(self.add_state(init_states)); 693 | self.dfa.init[look.as_usize()] = Some(new_state_idx); 694 | } 695 | } 696 | 697 | while !self.active_states.is_empty() { 698 | let state = self.active_states.pop().unwrap(); 699 | // This unwrap is ok because anything in active_states must also be in state_map. 700 | let state_idx = *self.state_map.get(&state).unwrap(); 701 | let trans = self.nfa.transition_map(&state); 702 | 703 | let mut dfa_trans = Vec::new(); 704 | for &(range, ref target) in trans.ranges_values() { 705 | let target_idx = try!(self.add_state(target.clone())); 706 | dfa_trans.push((range, target_idx)); 707 | } 708 | self.dfa.set_transitions(state_idx, dfa_trans.into_iter().collect()); 709 | } 710 | Ok(()) 711 | } 712 | } 713 | 714 | #[cfg(test)] 715 | mod tests { 716 | use look::Look; 717 | use dfa::Dfa; 718 | use nfa::{Accept, Nfa, NoLooks}; 719 | use nfa::tests::{re_nfa, trans_nfa, trans_range_nfa}; 720 | use range_map::Range; 721 | use std::usize; 722 | 723 | fn re_nfa_anchored(re: &str) -> Nfa { 724 | re_nfa(re).byte_me(usize::MAX).unwrap().anchor(usize::MAX).unwrap() 725 | } 726 | 727 | fn re_dfa(re: &str) -> Dfa<(Look, u8)> { 728 | re_nfa(re).byte_me(usize::MAX).unwrap().determinize(usize::MAX).unwrap() 729 | } 730 | 731 | #[test] 732 | fn anchor_simple() { 733 | let nfa = re_nfa_anchored("a"); 734 | let mut target = trans_range_nfa(3, &[(2, 0, Range::new(b'a', b'a')), 735 | (2, 1, Range::full()), 736 | (1, 0, Range::new(b'a', b'a')), 737 | (1, 1, Range::full())]); 738 | target.init.push((Look::Boundary, 2)); 739 | target.states[0].accept = Accept::Always; 740 | 741 | assert_eq!(nfa, 
target); 742 | } 743 | 744 | #[test] 745 | fn anchor_nl() { 746 | let nfa = re_nfa_anchored(r"(?m)^a"); 747 | let mut target = trans_nfa(4, &[(3, 1, 'a'), 748 | (0, 1, 'a'), 749 | (2, 0, '\n'), 750 | (3, 0, '\n')]); 751 | target.init.push((Look::Boundary, 3)); 752 | target.states[1].accept = Accept::Always; 753 | 754 | let mut target = target.byte_me(usize::MAX).unwrap(); 755 | target.states[2].consuming.insert(Range::full(), 2); 756 | target.states[3].consuming.insert(Range::full(), 2); 757 | 758 | assert_eq!(nfa, target); 759 | } 760 | 761 | #[test] 762 | fn anchor_already_anchored() { 763 | let nfa = re_nfa_anchored("^a"); 764 | let mut target = trans_nfa(2, &[(1, 0, 'a')]); 765 | target.init.push((Look::Boundary, 1)); 766 | target.states[0].accept = Accept::Always; 767 | 768 | assert_eq!(nfa, target); 769 | } 770 | 771 | #[test] 772 | fn determinize_pruning() { 773 | assert_eq!(re_dfa("a|aa"), re_dfa("a")); 774 | } 775 | 776 | macro_rules! check_rev_inits { 777 | ($name:ident, $re:expr, $inits:expr) => { 778 | #[test] 779 | fn $name() { 780 | let rev = re_nfa($re).byte_me(usize::MAX).unwrap().reverse(usize::MAX).unwrap(); 781 | println!("{:?}", rev.init); 782 | for &look in Look::all() { 783 | println!("checking look {:?}", look); 784 | if $inits.contains(&look) { 785 | assert!(rev.init.iter().any(|pair| pair.0 == look)); 786 | } else { 787 | assert!(!rev.init.iter().any(|pair| pair.0 == look)); 788 | } 789 | } 790 | } 791 | }; 792 | } 793 | 794 | check_rev_inits!(rev_init_simple, "abc", [Look::Full]); 795 | check_rev_inits!(rev_init_boundary, "abc$", [Look::Boundary]); 796 | check_rev_inits!(rev_init_simple_and_boundary, "(abc$|abc)", [Look::Full, Look::Boundary]); 797 | check_rev_inits!(rev_init_new_line, "(?m)abc$", [Look::Boundary, Look::NewLine]); 798 | check_rev_inits!(rev_init_word, r" \b", [Look::WordChar]); 799 | check_rev_inits!(rev_init_not_word, r"abc\b", [Look::Boundary, Look::NotWordChar]); 800 | check_rev_inits!(rev_init_word_or_not_word, r".\b", 
[Look::Boundary, Look::NotWordChar, Look::WordChar]); 801 | } 802 | -------------------------------------------------------------------------------- /src/regex.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use error::Error; 10 | use nfa::{Nfa, NoLooks}; 11 | use runner::anchored::AnchoredEngine; 12 | use runner::forward_backward::{ForwardBackwardEngine, Prefix}; 13 | use runner::Engine; 14 | use std; 15 | use std::fmt::Debug; 16 | 17 | #[derive(Debug)] 18 | pub struct Regex { 19 | engine: Box>, 20 | } 21 | 22 | // An engine that doesn't match anything. 23 | #[derive(Clone, Debug)] 24 | struct EmptyEngine; 25 | 26 | impl Engine for EmptyEngine { 27 | fn find(&self, _: &str) -> Option<(usize, usize, Ret)> { None } 28 | fn clone_box(&self) -> Box> { Box::new(EmptyEngine) } 29 | } 30 | 31 | impl Clone for Regex { 32 | fn clone(&self) -> Regex { 33 | Regex { 34 | engine: self.engine.clone_box(), 35 | } 36 | } 37 | } 38 | 39 | impl Regex { 40 | /// Creates a new `Regex` from a regular expression string. 41 | pub fn new(re: &str) -> ::Result { 42 | Regex::new_bounded(re, std::usize::MAX) 43 | } 44 | 45 | /// Creates a new `Regex` from a regular expression string, but only if it doesn't require too 46 | /// many states. 
47 | pub fn new_bounded(re: &str, max_states: usize) -> ::Result { 48 | let nfa = try!(Nfa::from_regex(re)); 49 | let nfa = nfa.remove_looks(); 50 | 51 | let eng = if nfa.is_empty() { 52 | Box::new(EmptyEngine) as Box> 53 | } else if nfa.is_anchored() { 54 | Box::new(try!(Regex::make_anchored(nfa, max_states))) as Box> 55 | } else { 56 | Box::new(try!(Regex::make_forward_backward(nfa, max_states))) as Box> 57 | }; 58 | 59 | Ok(Regex { engine: eng }) 60 | } 61 | 62 | fn make_anchored(nfa: Nfa, max_states: usize) 63 | -> ::Result> { 64 | let nfa = try!(nfa.byte_me(max_states)); 65 | let dfa = try!(nfa.determinize(max_states)) 66 | .optimize() 67 | .map_ret(|(_, bytes)| bytes); 68 | let prog = dfa.compile(); 69 | 70 | Ok(AnchoredEngine::new(prog)) 71 | } 72 | 73 | fn make_forward_backward(nfa: Nfa, max_states: usize) 74 | -> ::Result> { 75 | if nfa.is_anchored() { 76 | return Err(Error::InvalidEngine("anchors rule out the forward-backward engine")); 77 | } 78 | 79 | let f_nfa = try!(try!(nfa.clone().byte_me(max_states)).anchor(max_states)); 80 | let b_nfa = try!(try!(nfa.byte_me(max_states)).reverse(max_states)); 81 | 82 | let f_dfa = try!(f_nfa.determinize(max_states)).optimize(); 83 | let b_dfa = try!(b_nfa.determinize_longest(max_states)).optimize(); 84 | let b_dfa = b_dfa.map_ret(|(_, bytes)| bytes); 85 | 86 | let b_prog = b_dfa.compile(); 87 | let f_dfa = f_dfa.map_ret(|(look, bytes)| { 88 | let b_dfa_state = b_dfa.init[look.as_usize()].expect("BUG: back dfa must have this init"); 89 | (b_dfa_state, bytes) 90 | }); 91 | 92 | let mut f_prog = f_dfa.compile(); 93 | let prefix = Prefix::from_parts(f_dfa.prefix_strings()); 94 | match prefix { 95 | Prefix::Empty => {}, 96 | _ => { 97 | // If there is a non-trivial prefix, we can usually speed up matching by deleting 98 | // transitions that return to the start state. That way, instead of returning to 99 | // the start state, we will just fail to match. 
Then we get to search for the 100 | // prefix before trying to match again. 101 | let f_dfa = f_dfa.cut_loop_to_init().optimize(); 102 | f_prog = f_dfa.compile(); 103 | }, 104 | } 105 | 106 | Ok(ForwardBackwardEngine::new(f_prog, prefix, b_prog)) 107 | } 108 | 109 | /// Returns the index range of the first match, if there is a match. The indices returned are 110 | /// byte indices of the string. The first index is inclusive; the second is exclusive. 111 | pub fn find(&self, s: &str) -> Option<(usize, usize)> { 112 | if let Some((start, end, look_behind)) = self.engine.find(s) { 113 | Some((start + look_behind as usize, end)) 114 | } else { 115 | None 116 | } 117 | } 118 | 119 | pub fn is_match(&self, s: &str) -> bool { 120 | // TODO: for the forward-backward engine, this could be faster because we don't need 121 | // to run backward. 122 | self.find(s).is_some() 123 | } 124 | } 125 | 126 | -------------------------------------------------------------------------------- /src/runner/anchored.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use std::fmt::Debug; 10 | use runner::Engine; 11 | use runner::program::TableInsts; 12 | 13 | #[derive(Clone, Debug)] 14 | pub struct AnchoredEngine { 15 | prog: TableInsts, 16 | } 17 | 18 | impl AnchoredEngine { 19 | pub fn new(prog: TableInsts) -> AnchoredEngine { 20 | AnchoredEngine { 21 | prog: prog, 22 | } 23 | } 24 | } 25 | 26 | impl Engine for AnchoredEngine { 27 | fn find(&self, s: &str) -> Option<(usize, usize, Ret)> { 28 | let input = s.as_bytes(); 29 | if self.prog.is_empty() { 30 | None 31 | } else if let Ok(end) = self.prog.find_from(input, 0, 0) { 32 | Some((0, end.0, end.1)) 33 | } else { 34 | None 35 | } 36 | } 37 | 38 | fn clone_box(&self) -> Box> { 39 | Box::new(self.clone()) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/runner/forward_backward.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use std::fmt::Debug; 10 | //use dfa::{Dfa, PrefixPart, RetTrait}; 11 | use dfa::PrefixPart; 12 | use itertools::Itertools; 13 | use memchr::memchr; 14 | use runner::Engine; 15 | use runner::program::TableInsts; 16 | 17 | #[derive(Clone, Debug)] 18 | pub struct ForwardBackwardEngine { 19 | forward: TableInsts<(usize, u8)>, 20 | backward: TableInsts, 21 | prefix: Prefix, 22 | } 23 | 24 | impl ForwardBackwardEngine { 25 | pub fn new(forward: TableInsts<(usize, u8)>, prefix: Prefix, backward: TableInsts) -> Self { 26 | ForwardBackwardEngine { 27 | forward: forward, 28 | backward: backward, 29 | prefix: prefix, 30 | } 31 | } 32 | 33 | fn find_with_searcher(&self, input: &[u8], search: SearchFn) 34 | -> Option<(usize, usize, Ret)> 35 | where SearchFn: Fn(&[u8], usize) -> Option { 36 | let mut pos = 0; 37 | while let Some(start) = search(input, pos) { 38 | match self.forward.find_from(input, start, 0) { 39 | Ok((end, (rev_state, look_ahead))) => { 40 | let rev_pos = end.saturating_sub(look_ahead as usize); 41 | let (start_pos, ret) = self.backward 42 | .longest_backward_find_from(input, rev_pos, rev_state) 43 | .expect("BUG: matched forward but failed to match backward"); 44 | return Some((start_pos, rev_pos, ret)); 45 | 46 | }, 47 | Err(end) => { 48 | pos = end + 1; 49 | }, 50 | } 51 | } 52 | 53 | None 54 | } 55 | 56 | } 57 | 58 | impl Engine for ForwardBackwardEngine { 59 | fn find(&self, s: &str) -> Option<(usize, usize, Ret)> { 60 | let input = s.as_bytes(); 61 | if self.forward.is_empty() { 62 | return None; 63 | } 64 | 65 | match self.prefix { 66 | Prefix::Empty => self.find_with_searcher( 67 | input, 68 | |s, pos| if pos <= s.len() { Some(pos) } else { None } 69 | ), 70 | Prefix::ByteSet { ref bytes, offset } => self.find_with_searcher( 71 | input, 72 | |s, pos| if pos + offset <= s.len() { 73 | s[(pos + offset)..].iter().position(|c| bytes[*c as usize]).map(|x| x + pos) 74 | } else { 75 | None 76 | } 77 | ), 78 | Prefix::Byte { byte, offset } => 
self.find_with_searcher( 79 | input, 80 | |s, pos| if pos + offset <= s.len() { 81 | memchr(byte, &input[(pos + offset)..]).map(|x| x + pos) 82 | } else { 83 | None 84 | } 85 | ), 86 | //Prefix::ByteBackwards { .. } => unimplemented!(), 87 | } 88 | } 89 | 90 | fn clone_box(&self) -> Box> { 91 | Box::new(self.clone()) 92 | } 93 | } 94 | 95 | /// A `Prefix` is the first part of a DFA. Anything matching the DFA should start with 96 | /// something matching the `Prefix`. 97 | /// 98 | /// The purpose of a `Prefix` is that scanning through the input looking for the `Prefix` should be 99 | /// much faster than running the DFA naively. 100 | #[derive(Clone, Debug)] 101 | pub enum Prefix { 102 | // Matches every position. 103 | Empty, 104 | // Matches a single byte in a particular set and then rewinds some number of bytes. 105 | ByteSet { bytes: Vec, offset: usize }, 106 | // Matches a specific byte and then rewinds some number of bytes. 107 | Byte { byte: u8, offset: usize }, 108 | // Matches a specific byte and then runs a DFA backwards. 109 | //ByteBackwards { byte: u8, rev: Dfa<()> }, 110 | } 111 | 112 | // How big we allow the byte sets to be. In order for byte sets to be a performance win, finding a 113 | // byte in the set needs to be sufficiently rare; therefore, we only use small sets. There might be 114 | // room for a better heuristic, though: we could use large sets that only have rare bytes. 
115 | const MAX_BYTE_SET_SIZE: usize = 16; 116 | 117 | impl Prefix { 118 | fn byte_prefix(parts: &[PrefixPart]) -> Option { 119 | fn common_prefix<'a>(s1: &'a [u8], s2: &'a [u8]) -> &'a [u8] { 120 | let prefix_len = s1.iter().zip(s2.iter()) 121 | .take_while(|pair| pair.0 == pair.1) 122 | .count(); 123 | &s1[0..prefix_len] 124 | } 125 | 126 | let mut parts = parts.iter(); 127 | if let Some(first) = parts.next() { 128 | let lit = parts.fold(&first.0[..], |acc, p| common_prefix(acc, &p.0)); 129 | if !lit.is_empty() { 130 | // See if the common prefix contains a full codepoint. If it does, search for the last 131 | // byte of that codepoint. 132 | let cp_last_byte = ((!lit[0]).leading_zeros() as usize).saturating_sub(1); 133 | if cp_last_byte < lit.len() { 134 | return Some(Prefix::Byte { byte: lit[cp_last_byte], offset: cp_last_byte }); 135 | } 136 | } 137 | } 138 | 139 | None 140 | } 141 | 142 | fn byte_set_prefix(parts: &[PrefixPart]) -> Option { 143 | let crit_byte_pos = |p: &PrefixPart| ((!p.0[0]).leading_zeros() as usize).saturating_sub(1); 144 | let crit_byte_posns: Vec = parts.iter().map(crit_byte_pos).dedup().collect(); 145 | 146 | if crit_byte_posns.len() == 1 { 147 | let crit_byte = crit_byte_posns[0]; 148 | if parts.iter().all(|x| x.0.len() > crit_byte) { 149 | let mut crit_bytes: Vec = parts.iter().map(|x| x.0[crit_byte]).collect(); 150 | crit_bytes.sort(); 151 | crit_bytes.dedup(); 152 | 153 | if crit_bytes.len() <= MAX_BYTE_SET_SIZE { 154 | let mut ret = vec![false; 256]; 155 | for &b in &crit_bytes { 156 | ret[b as usize] = true; 157 | } 158 | return Some(Prefix::ByteSet { bytes: ret, offset: crit_byte }); 159 | } 160 | } 161 | } 162 | 163 | None 164 | } 165 | 166 | /* 167 | pub fn from_dfa(dfa: &Dfa) -> Prefix { 168 | let parts = dfa.prefix_strings(); 169 | let first_try = Prefix::from_parts(parts); 170 | 171 | /* 172 | match first_try { 173 | Prefix::Byte {..} => first_try, 174 | _ => { 175 | let crit_strings = dfa.critical_strings(); 176 | 
// NOTE(review): tail of the commented-out `from_dfa` sketch, then `from_parts` — the public entry
// point: drop empty parts, then fall back `byte_prefix` -> `byte_set_prefix` -> `Prefix::Empty`.
// The `#[cfg(test)]` module pins which variant each input set selects (single common prefix =>
// `Byte`, shared critical position with distinct bytes => `ByteSet`, nothing usable => `Empty`).
// The span ends with the start of src/runner/mod.rs: the object-safe `Engine` trait (`find`
// returning match bounds plus a `Ret` payload, and `clone_box` for cloning trait objects).
// Extraction stripped generics here too: `Vec` in `from_parts` was presumably `Vec<PrefixPart>`,
// `Box>` presumably `Box<Engine<Ret>>`, and the trait header likely read
// `pub trait Engine<Ret>: Debug` (`Ret` is used in its signatures but not declared) — TODO
// confirm against upstream.
unimplemented!(); 177 | first_try 178 | }, 179 | } 180 | */ 181 | unimplemented!(); 182 | } 183 | */ 184 | 185 | /// Converts a set of `PrefixParts` into a `Prefix` that matches any of the strings. 186 | pub fn from_parts(mut parts: Vec) -> Prefix { 187 | parts.retain(|x| !x.0.is_empty()); 188 | 189 | if let Some(pref) = Prefix::byte_prefix(&parts) { 190 | pref 191 | } else if let Some(pref) = Prefix::byte_set_prefix(&parts) { 192 | pref 193 | } else { 194 | Prefix::Empty 195 | } 196 | } 197 | } 198 | 199 | #[cfg(test)] 200 | mod tests { 201 | use dfa::PrefixPart; 202 | use super::*; 203 | 204 | fn pref(strs: Vec<&str>) -> Prefix { 205 | Prefix::from_parts( 206 | strs.into_iter() 207 | .enumerate() 208 | .map(|(i, s)| PrefixPart(s.as_bytes().to_vec(), i)) 209 | .collect()) 210 | } 211 | 212 | #[test] 213 | fn test_prefix_choice() { 214 | use super::Prefix::*; 215 | 216 | assert!(matches!(pref(vec![]), Empty)); 217 | assert!(matches!(pref(vec![""]), Empty)); 218 | assert!(matches!(pref(vec!["a"]), Byte {..})); 219 | assert!(matches!(pref(vec!["", "a", ""]), Byte {..})); 220 | assert!(matches!(pref(vec!["abc"]), Byte {..})); 221 | assert!(matches!(pref(vec!["abc", ""]), Byte {..})); 222 | assert!(matches!(pref(vec!["a", "b", "c"]), ByteSet {..})); 223 | assert!(matches!(pref(vec!["a", "b", "", "c"]), ByteSet {..})); 224 | assert!(matches!(pref(vec!["a", "baa", "", "c"]), ByteSet {..})); 225 | assert!(matches!(pref(vec!["ab", "baa", "", "cb"]), ByteSet {..})); 226 | assert!(matches!(pref(vec!["ab", "aaa", "", "acb"]), Byte {..})); 227 | assert!(matches!(pref(vec!["ab", "abc", "abd"]), Byte {..})); 228 | } 229 | } 230 | 231 | -------------------------------------------------------------------------------- /src/runner/mod.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | 3 | pub trait Engine: Debug { 4 | fn find(&self, s: &str) -> Option<(usize, usize, Ret)>; 5 | fn clone_box(&self) -> Box>; 6 | } 7 | 8 |
// NOTE(review): remainder of src/runner/mod.rs (submodule declarations) followed by all of
// src/runner/program.rs: `TableInsts`, a table-driven DFA. Transitions live in a flat
// `(1 << log_num_classes) x num_instructions` table indexed by
// `(state << log_num_classes) + byte_class[b]`, with `u32::MAX` marking "no transition" (a dead
// state). `find_from` scans forward, recording the most recent `accept` hit as a provisional
// result; on reaching a dead state it returns the last hit, or `Err(pos)` if there was none, and
// at end of input it prefers an `accept_at_eoi` hit over a plain `accept` hit.
// `longest_backward_find_from` is the mirror image, walking `input[..pos]` in reverse (the
// `pos + 1` on an accept converts the reversed index back to a forward slice boundary).
// `find_from` manually inlines `next_state` and phrases the dead-state test as
// `state >= accept.len()` to help LLVM elide bounds checks — do not "simplify" without
// re-benchmarking; the surrounding comments record measured effects. Uses pre-1.13 `try!` rather
// than `?`, consistent with the crate's 2015-2016 era. Extraction damage: generic parameters are
// stripped throughout (`TableInsts` is presumably `TableInsts<Ret>`, `Vec` presumably
// `Vec<TableStateIdx>`/`Vec<u8>`, `Vec>` presumably `Vec<Option<Ret>>`), and line 4 of the
// license header (a URL in angle brackets) is missing — TODO restore from upstream.
pub mod anchored; 9 | pub mod forward_backward; 10 | pub mod program; 11 | -------------------------------------------------------------------------------- /src/runner/program.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use std::fmt::{Debug, Formatter, Error as FmtError}; 10 | use std::u32; 11 | 12 | pub type TableStateIdx = u32; 13 | 14 | /// A DFA program implemented as a lookup table. 15 | #[derive(Clone)] 16 | pub struct TableInsts { 17 | /// The log (rounded up) of the number of different equivalence classes of bytes. 18 | // We could save a bit more memory by storing the actual number instead of the log, because 19 | // then `table` could have length num_classes x num_instructions. However, then we need to 20 | // multiply (instead of just shifting) to look up the next state, and that slows us down by 21 | // 10-20%. 22 | // 23 | // TODO: we can probably save more memory by splitting classes into ASCII/non-ASCII. Often, 24 | // many states share the same non-ASCII transitions, so those tables can be merged. 25 | pub log_num_classes: u32, 26 | /// A vec of length 256 mapping from bytes to their class indices. 27 | pub byte_class: Vec, 28 | /// A `(1 << log_num_classes) x num_instructions`-long table. 29 | /// 30 | /// For a given input byte `b` in state `state`, we look up the next state using 31 | /// `table[state << log_num_classes + b]`. 32 | pub table: Vec, 33 | /// If `accept[st]` is not `None` then `st` is accepting, and `accept[st]` is the data 34 | /// to return. 35 | pub accept: Vec>, 36 | /// Same as `accept`, but applies only at the end of the input. 
37 | pub accept_at_eoi: Vec>, 38 | } 39 | 40 | impl Debug for TableInsts { 41 | fn fmt(&self, f: &mut Formatter) -> Result<(), FmtError> { 42 | try!(f.write_fmt(format_args!("TableInsts ({} log_classes, {} instructions):\n", 43 | self.log_num_classes, 44 | self.accept.len()))); 45 | try!(f.write_str("Byte classes: ")); 46 | try!(f.debug_map() 47 | .entries((0..256).map(|b| (b, self.byte_class[b]))) 48 | .finish()); 49 | 50 | let num_classes = 1 << self.log_num_classes; 51 | for idx in 0..self.accept.len() { 52 | try!(f.write_fmt(format_args!("State {}:\n", idx))); 53 | try!(f.debug_map() 54 | .entries((0usize..num_classes) 55 | .map(|c| (c, self.table[(idx << self.log_num_classes) + c])) 56 | .filter(|x| x.1 != u32::MAX)) 57 | .finish()); 58 | try!(f.write_str("\n")); 59 | } 60 | 61 | try!(f.write_str("Accept: ")); 62 | for idx in 0..self.accept.len() { 63 | if let Some(ref ret) = self.accept[idx] { 64 | try!(f.write_fmt(format_args!("{} -> {:?}, ", idx, ret))); 65 | } 66 | } 67 | 68 | try!(f.write_str("Accept_at_eoi: ")); 69 | for idx in 0..self.accept_at_eoi.len() { 70 | if let Some(ref ret) = self.accept_at_eoi[idx] { 71 | try!(f.write_fmt(format_args!("{} -> {:?}, ", idx, ret))); 72 | } 73 | } 74 | Ok(()) 75 | } 76 | } 77 | 78 | impl TableInsts { 79 | fn next_state(&self, state: usize, input: u8) -> Option { 80 | let class = self.byte_class[input as usize]; 81 | let next_state = self.table[(state << self.log_num_classes) + class as usize]; 82 | if next_state != u32::MAX { 83 | Some(next_state as usize) 84 | } else { 85 | None 86 | } 87 | } 88 | 89 | pub fn num_states(&self) -> usize { 90 | self.accept.len() 91 | } 92 | 93 | pub fn find_from(&self, input: &[u8], pos: usize, state: usize) 94 | -> Result<(usize, Ret), usize> { 95 | let mut state = state as u32; 96 | let mut ret = Err(input.len()); 97 | 98 | if state as usize >= self.accept.len() { 99 | panic!("BUG"); 100 | } 101 | for pos in pos..input.len() { 102 | if let Some(accept_ret) = self.accept[state as 
usize] { 103 | ret = Ok((pos, accept_ret)); 104 | } 105 | 106 | // We've manually inlined next_state here, for better performance (measurably better 107 | // than using #[inline(always)]). 108 | // For some reason, these bounds checks (even though LLVM leaves them in) don't seem to 109 | // hurt performance. 110 | let class = self.byte_class[input[pos] as usize]; 111 | state = self.table[((state as usize) << self.log_num_classes) + class as usize]; 112 | 113 | // Since everything in `self.table` is either a valid state or u32::MAX, this is the 114 | // same as checking if state == u32::MAX. We write it this way in the hope that 115 | // rustc/LLVM will be able to elide the bounds check at the top of the loop. 116 | if state as usize >= self.accept.len() { 117 | if ret.is_err() { 118 | return Err(pos); 119 | } 120 | break; 121 | } 122 | } 123 | 124 | // If we made it to the end of the input, prefer a return value that is specific to EOI 125 | // over one that can occur anywhere. 126 | if (state as usize) < self.accept.len() { 127 | if let Some(accept_ret) = self.accept_at_eoi[state as usize] { 128 | return Ok((input.len(), accept_ret)) 129 | } 130 | } 131 | ret 132 | } 133 | 134 | pub fn longest_backward_find_from(&self, input: &[u8], pos: usize, mut state: usize) 135 | -> Option<(usize, Ret)> { 136 | let mut ret = None; 137 | for pos in (0..pos).rev() { 138 | if let Some(next_ret) = self.accept[state] { 139 | ret = Some((pos + 1, next_ret)); 140 | } 141 | if let Some(next_state) = self.next_state(state, input[pos]) { 142 | state = next_state; 143 | } else { 144 | return ret; 145 | } 146 | } 147 | 148 | if let Some(end_ret) = self.accept_at_eoi[state] { 149 | Some((0, end_ret)) 150 | } else { 151 | ret 152 | } 153 | } 154 | 155 | pub fn is_empty(&self) -> bool { 156 | self.num_states() == 0 157 | } 158 | } 159 | 160 | -------------------------------------------------------------------------------- /src/unicode.rs: 
-------------------------------------------------------------------------------- 1 | // TODO: This was copied from the regex-syntax crate. At some point, this should presumably live in 2 | // a third crate. 3 | pub const PERLW: &'static [(char, char)] = &[ 4 | ('\u{30}', '\u{39}'), ('\u{41}', '\u{5a}'), ('\u{5f}', '\u{5f}'), 5 | ('\u{61}', '\u{7a}'), ('\u{aa}', '\u{aa}'), ('\u{b5}', '\u{b5}'), 6 | ('\u{ba}', '\u{ba}'), ('\u{c0}', '\u{d6}'), ('\u{d8}', '\u{f6}'), 7 | ('\u{f8}', '\u{2c1}'), ('\u{2c6}', '\u{2d1}'), ('\u{2e0}', '\u{2e4}'), 8 | ('\u{2ec}', '\u{2ec}'), ('\u{2ee}', '\u{2ee}'), ('\u{300}', '\u{374}'), 9 | ('\u{376}', '\u{377}'), ('\u{37a}', '\u{37d}'), ('\u{37f}', '\u{37f}'), 10 | ('\u{386}', '\u{386}'), ('\u{388}', '\u{38a}'), ('\u{38c}', '\u{38c}'), 11 | ('\u{38e}', '\u{3a1}'), ('\u{3a3}', '\u{3f5}'), ('\u{3f7}', '\u{481}'), 12 | ('\u{483}', '\u{52f}'), ('\u{531}', '\u{556}'), ('\u{559}', '\u{559}'), 13 | ('\u{561}', '\u{587}'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), 14 | ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), 15 | ('\u{5d0}', '\u{5ea}'), ('\u{5f0}', '\u{5f2}'), ('\u{610}', '\u{61a}'), 16 | ('\u{620}', '\u{669}'), ('\u{66e}', '\u{6d3}'), ('\u{6d5}', '\u{6dc}'), 17 | ('\u{6df}', '\u{6e8}'), ('\u{6ea}', '\u{6fc}'), ('\u{6ff}', '\u{6ff}'), 18 | ('\u{710}', '\u{74a}'), ('\u{74d}', '\u{7b1}'), ('\u{7c0}', '\u{7f5}'), 19 | ('\u{7fa}', '\u{7fa}'), ('\u{800}', '\u{82d}'), ('\u{840}', '\u{85b}'), 20 | ('\u{8a0}', '\u{8b4}'), ('\u{8e3}', '\u{963}'), ('\u{966}', '\u{96f}'), 21 | ('\u{971}', '\u{983}'), ('\u{985}', '\u{98c}'), ('\u{98f}', '\u{990}'), 22 | ('\u{993}', '\u{9a8}'), ('\u{9aa}', '\u{9b0}'), ('\u{9b2}', '\u{9b2}'), 23 | ('\u{9b6}', '\u{9b9}'), ('\u{9bc}', '\u{9c4}'), ('\u{9c7}', '\u{9c8}'), 24 | ('\u{9cb}', '\u{9ce}'), ('\u{9d7}', '\u{9d7}'), ('\u{9dc}', '\u{9dd}'), 25 | ('\u{9df}', '\u{9e3}'), ('\u{9e6}', '\u{9f1}'), ('\u{a01}', '\u{a03}'), 26 | ('\u{a05}', '\u{a0a}'), ('\u{a0f}', '\u{a10}'), ('\u{a13}', 
'\u{a28}'), 27 | ('\u{a2a}', '\u{a30}'), ('\u{a32}', '\u{a33}'), ('\u{a35}', '\u{a36}'), 28 | ('\u{a38}', '\u{a39}'), ('\u{a3c}', '\u{a3c}'), ('\u{a3e}', '\u{a42}'), 29 | ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), 30 | ('\u{a59}', '\u{a5c}'), ('\u{a5e}', '\u{a5e}'), ('\u{a66}', '\u{a75}'), 31 | ('\u{a81}', '\u{a83}'), ('\u{a85}', '\u{a8d}'), ('\u{a8f}', '\u{a91}'), 32 | ('\u{a93}', '\u{aa8}'), ('\u{aaa}', '\u{ab0}'), ('\u{ab2}', '\u{ab3}'), 33 | ('\u{ab5}', '\u{ab9}'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', '\u{ac9}'), 34 | ('\u{acb}', '\u{acd}'), ('\u{ad0}', '\u{ad0}'), ('\u{ae0}', '\u{ae3}'), 35 | ('\u{ae6}', '\u{aef}'), ('\u{af9}', '\u{af9}'), ('\u{b01}', '\u{b03}'), 36 | ('\u{b05}', '\u{b0c}'), ('\u{b0f}', '\u{b10}'), ('\u{b13}', '\u{b28}'), 37 | ('\u{b2a}', '\u{b30}'), ('\u{b32}', '\u{b33}'), ('\u{b35}', '\u{b39}'), 38 | ('\u{b3c}', '\u{b44}'), ('\u{b47}', '\u{b48}'), ('\u{b4b}', '\u{b4d}'), 39 | ('\u{b56}', '\u{b57}'), ('\u{b5c}', '\u{b5d}'), ('\u{b5f}', '\u{b63}'), 40 | ('\u{b66}', '\u{b6f}'), ('\u{b71}', '\u{b71}'), ('\u{b82}', '\u{b83}'), 41 | ('\u{b85}', '\u{b8a}'), ('\u{b8e}', '\u{b90}'), ('\u{b92}', '\u{b95}'), 42 | ('\u{b99}', '\u{b9a}'), ('\u{b9c}', '\u{b9c}'), ('\u{b9e}', '\u{b9f}'), 43 | ('\u{ba3}', '\u{ba4}'), ('\u{ba8}', '\u{baa}'), ('\u{bae}', '\u{bb9}'), 44 | ('\u{bbe}', '\u{bc2}'), ('\u{bc6}', '\u{bc8}'), ('\u{bca}', '\u{bcd}'), 45 | ('\u{bd0}', '\u{bd0}'), ('\u{bd7}', '\u{bd7}'), ('\u{be6}', '\u{bef}'), 46 | ('\u{c00}', '\u{c03}'), ('\u{c05}', '\u{c0c}'), ('\u{c0e}', '\u{c10}'), 47 | ('\u{c12}', '\u{c28}'), ('\u{c2a}', '\u{c39}'), ('\u{c3d}', '\u{c44}'), 48 | ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), 49 | ('\u{c58}', '\u{c5a}'), ('\u{c60}', '\u{c63}'), ('\u{c66}', '\u{c6f}'), 50 | ('\u{c81}', '\u{c83}'), ('\u{c85}', '\u{c8c}'), ('\u{c8e}', '\u{c90}'), 51 | ('\u{c92}', '\u{ca8}'), ('\u{caa}', '\u{cb3}'), ('\u{cb5}', '\u{cb9}'), 52 | ('\u{cbc}', '\u{cc4}'), ('\u{cc6}', '\u{cc8}'), 
('\u{cca}', '\u{ccd}'), 53 | ('\u{cd5}', '\u{cd6}'), ('\u{cde}', '\u{cde}'), ('\u{ce0}', '\u{ce3}'), 54 | ('\u{ce6}', '\u{cef}'), ('\u{cf1}', '\u{cf2}'), ('\u{d01}', '\u{d03}'), 55 | ('\u{d05}', '\u{d0c}'), ('\u{d0e}', '\u{d10}'), ('\u{d12}', '\u{d3a}'), 56 | ('\u{d3d}', '\u{d44}'), ('\u{d46}', '\u{d48}'), ('\u{d4a}', '\u{d4e}'), 57 | ('\u{d57}', '\u{d57}'), ('\u{d5f}', '\u{d63}'), ('\u{d66}', '\u{d6f}'), 58 | ('\u{d7a}', '\u{d7f}'), ('\u{d82}', '\u{d83}'), ('\u{d85}', '\u{d96}'), 59 | ('\u{d9a}', '\u{db1}'), ('\u{db3}', '\u{dbb}'), ('\u{dbd}', '\u{dbd}'), 60 | ('\u{dc0}', '\u{dc6}'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), 61 | ('\u{dd6}', '\u{dd6}'), ('\u{dd8}', '\u{ddf}'), ('\u{de6}', '\u{def}'), 62 | ('\u{df2}', '\u{df3}'), ('\u{e01}', '\u{e3a}'), ('\u{e40}', '\u{e4e}'), 63 | ('\u{e50}', '\u{e59}'), ('\u{e81}', '\u{e82}'), ('\u{e84}', '\u{e84}'), 64 | ('\u{e87}', '\u{e88}'), ('\u{e8a}', '\u{e8a}'), ('\u{e8d}', '\u{e8d}'), 65 | ('\u{e94}', '\u{e97}'), ('\u{e99}', '\u{e9f}'), ('\u{ea1}', '\u{ea3}'), 66 | ('\u{ea5}', '\u{ea5}'), ('\u{ea7}', '\u{ea7}'), ('\u{eaa}', '\u{eab}'), 67 | ('\u{ead}', '\u{eb9}'), ('\u{ebb}', '\u{ebd}'), ('\u{ec0}', '\u{ec4}'), 68 | ('\u{ec6}', '\u{ec6}'), ('\u{ec8}', '\u{ecd}'), ('\u{ed0}', '\u{ed9}'), 69 | ('\u{edc}', '\u{edf}'), ('\u{f00}', '\u{f00}'), ('\u{f18}', '\u{f19}'), 70 | ('\u{f20}', '\u{f29}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), 71 | ('\u{f39}', '\u{f39}'), ('\u{f3e}', '\u{f47}'), ('\u{f49}', '\u{f6c}'), 72 | ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), 73 | ('\u{fc6}', '\u{fc6}'), ('\u{1000}', '\u{1049}'), ('\u{1050}', 74 | '\u{109d}'), ('\u{10a0}', '\u{10c5}'), ('\u{10c7}', '\u{10c7}'), 75 | ('\u{10cd}', '\u{10cd}'), ('\u{10d0}', '\u{10fa}'), ('\u{10fc}', 76 | '\u{1248}'), ('\u{124a}', '\u{124d}'), ('\u{1250}', '\u{1256}'), 77 | ('\u{1258}', '\u{1258}'), ('\u{125a}', '\u{125d}'), ('\u{1260}', 78 | '\u{1288}'), ('\u{128a}', '\u{128d}'), ('\u{1290}', '\u{12b0}'), 79 | ('\u{12b2}', 
'\u{12b5}'), ('\u{12b8}', '\u{12be}'), ('\u{12c0}', 80 | '\u{12c0}'), ('\u{12c2}', '\u{12c5}'), ('\u{12c8}', '\u{12d6}'), 81 | ('\u{12d8}', '\u{1310}'), ('\u{1312}', '\u{1315}'), ('\u{1318}', 82 | '\u{135a}'), ('\u{135d}', '\u{135f}'), ('\u{1380}', '\u{138f}'), 83 | ('\u{13a0}', '\u{13f5}'), ('\u{13f8}', '\u{13fd}'), ('\u{1401}', 84 | '\u{166c}'), ('\u{166f}', '\u{167f}'), ('\u{1681}', '\u{169a}'), 85 | ('\u{16a0}', '\u{16ea}'), ('\u{16ee}', '\u{16f8}'), ('\u{1700}', 86 | '\u{170c}'), ('\u{170e}', '\u{1714}'), ('\u{1720}', '\u{1734}'), 87 | ('\u{1740}', '\u{1753}'), ('\u{1760}', '\u{176c}'), ('\u{176e}', 88 | '\u{1770}'), ('\u{1772}', '\u{1773}'), ('\u{1780}', '\u{17d3}'), 89 | ('\u{17d7}', '\u{17d7}'), ('\u{17dc}', '\u{17dd}'), ('\u{17e0}', 90 | '\u{17e9}'), ('\u{180b}', '\u{180d}'), ('\u{1810}', '\u{1819}'), 91 | ('\u{1820}', '\u{1877}'), ('\u{1880}', '\u{18aa}'), ('\u{18b0}', 92 | '\u{18f5}'), ('\u{1900}', '\u{191e}'), ('\u{1920}', '\u{192b}'), 93 | ('\u{1930}', '\u{193b}'), ('\u{1946}', '\u{196d}'), ('\u{1970}', 94 | '\u{1974}'), ('\u{1980}', '\u{19ab}'), ('\u{19b0}', '\u{19c9}'), 95 | ('\u{19d0}', '\u{19d9}'), ('\u{1a00}', '\u{1a1b}'), ('\u{1a20}', 96 | '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a89}'), 97 | ('\u{1a90}', '\u{1a99}'), ('\u{1aa7}', '\u{1aa7}'), ('\u{1ab0}', 98 | '\u{1abe}'), ('\u{1b00}', '\u{1b4b}'), ('\u{1b50}', '\u{1b59}'), 99 | ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '\u{1bf3}'), ('\u{1c00}', 100 | '\u{1c37}'), ('\u{1c40}', '\u{1c49}'), ('\u{1c4d}', '\u{1c7d}'), 101 | ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1cf6}'), ('\u{1cf8}', 102 | '\u{1cf9}'), ('\u{1d00}', '\u{1df5}'), ('\u{1dfc}', '\u{1f15}'), 103 | ('\u{1f18}', '\u{1f1d}'), ('\u{1f20}', '\u{1f45}'), ('\u{1f48}', 104 | '\u{1f4d}'), ('\u{1f50}', '\u{1f57}'), ('\u{1f59}', '\u{1f59}'), 105 | ('\u{1f5b}', '\u{1f5b}'), ('\u{1f5d}', '\u{1f5d}'), ('\u{1f5f}', 106 | '\u{1f7d}'), ('\u{1f80}', '\u{1fb4}'), ('\u{1fb6}', '\u{1fbc}'), 107 | ('\u{1fbe}', '\u{1fbe}'), ('\u{1fc2}', 
'\u{1fc4}'), ('\u{1fc6}', 108 | '\u{1fcc}'), ('\u{1fd0}', '\u{1fd3}'), ('\u{1fd6}', '\u{1fdb}'), 109 | ('\u{1fe0}', '\u{1fec}'), ('\u{1ff2}', '\u{1ff4}'), ('\u{1ff6}', 110 | '\u{1ffc}'), ('\u{200c}', '\u{200d}'), ('\u{203f}', '\u{2040}'), 111 | ('\u{2054}', '\u{2054}'), ('\u{2071}', '\u{2071}'), ('\u{207f}', 112 | '\u{207f}'), ('\u{2090}', '\u{209c}'), ('\u{20d0}', '\u{20f0}'), 113 | ('\u{2102}', '\u{2102}'), ('\u{2107}', '\u{2107}'), ('\u{210a}', 114 | '\u{2113}'), ('\u{2115}', '\u{2115}'), ('\u{2119}', '\u{211d}'), 115 | ('\u{2124}', '\u{2124}'), ('\u{2126}', '\u{2126}'), ('\u{2128}', 116 | '\u{2128}'), ('\u{212a}', '\u{212d}'), ('\u{212f}', '\u{2139}'), 117 | ('\u{213c}', '\u{213f}'), ('\u{2145}', '\u{2149}'), ('\u{214e}', 118 | '\u{214e}'), ('\u{2160}', '\u{2188}'), ('\u{24b6}', '\u{24e9}'), 119 | ('\u{2c00}', '\u{2c2e}'), ('\u{2c30}', '\u{2c5e}'), ('\u{2c60}', 120 | '\u{2ce4}'), ('\u{2ceb}', '\u{2cf3}'), ('\u{2d00}', '\u{2d25}'), 121 | ('\u{2d27}', '\u{2d27}'), ('\u{2d2d}', '\u{2d2d}'), ('\u{2d30}', 122 | '\u{2d67}'), ('\u{2d6f}', '\u{2d6f}'), ('\u{2d7f}', '\u{2d96}'), 123 | ('\u{2da0}', '\u{2da6}'), ('\u{2da8}', '\u{2dae}'), ('\u{2db0}', 124 | '\u{2db6}'), ('\u{2db8}', '\u{2dbe}'), ('\u{2dc0}', '\u{2dc6}'), 125 | ('\u{2dc8}', '\u{2dce}'), ('\u{2dd0}', '\u{2dd6}'), ('\u{2dd8}', 126 | '\u{2dde}'), ('\u{2de0}', '\u{2dff}'), ('\u{2e2f}', '\u{2e2f}'), 127 | ('\u{3005}', '\u{3007}'), ('\u{3021}', '\u{302f}'), ('\u{3031}', 128 | '\u{3035}'), ('\u{3038}', '\u{303c}'), ('\u{3041}', '\u{3096}'), 129 | ('\u{3099}', '\u{309a}'), ('\u{309d}', '\u{309f}'), ('\u{30a1}', 130 | '\u{30fa}'), ('\u{30fc}', '\u{30ff}'), ('\u{3105}', '\u{312d}'), 131 | ('\u{3131}', '\u{318e}'), ('\u{31a0}', '\u{31ba}'), ('\u{31f0}', 132 | '\u{31ff}'), ('\u{3400}', '\u{4db5}'), ('\u{4e00}', '\u{9fd5}'), 133 | ('\u{a000}', '\u{a48c}'), ('\u{a4d0}', '\u{a4fd}'), ('\u{a500}', 134 | '\u{a60c}'), ('\u{a610}', '\u{a62b}'), ('\u{a640}', '\u{a672}'), 135 | ('\u{a674}', '\u{a67d}'), ('\u{a67f}', 
'\u{a6f1}'), ('\u{a717}', 136 | '\u{a71f}'), ('\u{a722}', '\u{a788}'), ('\u{a78b}', '\u{a7ad}'), 137 | ('\u{a7b0}', '\u{a7b7}'), ('\u{a7f7}', '\u{a827}'), ('\u{a840}', 138 | '\u{a873}'), ('\u{a880}', '\u{a8c4}'), ('\u{a8d0}', '\u{a8d9}'), 139 | ('\u{a8e0}', '\u{a8f7}'), ('\u{a8fb}', '\u{a8fb}'), ('\u{a8fd}', 140 | '\u{a8fd}'), ('\u{a900}', '\u{a92d}'), ('\u{a930}', '\u{a953}'), 141 | ('\u{a960}', '\u{a97c}'), ('\u{a980}', '\u{a9c0}'), ('\u{a9cf}', 142 | '\u{a9d9}'), ('\u{a9e0}', '\u{a9fe}'), ('\u{aa00}', '\u{aa36}'), 143 | ('\u{aa40}', '\u{aa4d}'), ('\u{aa50}', '\u{aa59}'), ('\u{aa60}', 144 | '\u{aa76}'), ('\u{aa7a}', '\u{aac2}'), ('\u{aadb}', '\u{aadd}'), 145 | ('\u{aae0}', '\u{aaef}'), ('\u{aaf2}', '\u{aaf6}'), ('\u{ab01}', 146 | '\u{ab06}'), ('\u{ab09}', '\u{ab0e}'), ('\u{ab11}', '\u{ab16}'), 147 | ('\u{ab20}', '\u{ab26}'), ('\u{ab28}', '\u{ab2e}'), ('\u{ab30}', 148 | '\u{ab5a}'), ('\u{ab5c}', '\u{ab65}'), ('\u{ab70}', '\u{abea}'), 149 | ('\u{abec}', '\u{abed}'), ('\u{abf0}', '\u{abf9}'), ('\u{ac00}', 150 | '\u{d7a3}'), ('\u{d7b0}', '\u{d7c6}'), ('\u{d7cb}', '\u{d7fb}'), 151 | ('\u{f900}', '\u{fa6d}'), ('\u{fa70}', '\u{fad9}'), ('\u{fb00}', 152 | '\u{fb06}'), ('\u{fb13}', '\u{fb17}'), ('\u{fb1d}', '\u{fb28}'), 153 | ('\u{fb2a}', '\u{fb36}'), ('\u{fb38}', '\u{fb3c}'), ('\u{fb3e}', 154 | '\u{fb3e}'), ('\u{fb40}', '\u{fb41}'), ('\u{fb43}', '\u{fb44}'), 155 | ('\u{fb46}', '\u{fbb1}'), ('\u{fbd3}', '\u{fd3d}'), ('\u{fd50}', 156 | '\u{fd8f}'), ('\u{fd92}', '\u{fdc7}'), ('\u{fdf0}', '\u{fdfb}'), 157 | ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('\u{fe33}', 158 | '\u{fe34}'), ('\u{fe4d}', '\u{fe4f}'), ('\u{fe70}', '\u{fe74}'), 159 | ('\u{fe76}', '\u{fefc}'), ('\u{ff10}', '\u{ff19}'), ('\u{ff21}', 160 | '\u{ff3a}'), ('\u{ff3f}', '\u{ff3f}'), ('\u{ff41}', '\u{ff5a}'), 161 | ('\u{ff66}', '\u{ffbe}'), ('\u{ffc2}', '\u{ffc7}'), ('\u{ffca}', 162 | '\u{ffcf}'), ('\u{ffd2}', '\u{ffd7}'), ('\u{ffda}', '\u{ffdc}'), 163 | ('\u{10000}', '\u{1000b}'), ('\u{1000d}', 
'\u{10026}'), ('\u{10028}', 164 | '\u{1003a}'), ('\u{1003c}', '\u{1003d}'), ('\u{1003f}', '\u{1004d}'), 165 | ('\u{10050}', '\u{1005d}'), ('\u{10080}', '\u{100fa}'), ('\u{10140}', 166 | '\u{10174}'), ('\u{101fd}', '\u{101fd}'), ('\u{10280}', '\u{1029c}'), 167 | ('\u{102a0}', '\u{102d0}'), ('\u{102e0}', '\u{102e0}'), ('\u{10300}', 168 | '\u{1031f}'), ('\u{10330}', '\u{1034a}'), ('\u{10350}', '\u{1037a}'), 169 | ('\u{10380}', '\u{1039d}'), ('\u{103a0}', '\u{103c3}'), ('\u{103c8}', 170 | '\u{103cf}'), ('\u{103d1}', '\u{103d5}'), ('\u{10400}', '\u{1049d}'), 171 | ('\u{104a0}', '\u{104a9}'), ('\u{10500}', '\u{10527}'), ('\u{10530}', 172 | '\u{10563}'), ('\u{10600}', '\u{10736}'), ('\u{10740}', '\u{10755}'), 173 | ('\u{10760}', '\u{10767}'), ('\u{10800}', '\u{10805}'), ('\u{10808}', 174 | '\u{10808}'), ('\u{1080a}', '\u{10835}'), ('\u{10837}', '\u{10838}'), 175 | ('\u{1083c}', '\u{1083c}'), ('\u{1083f}', '\u{10855}'), ('\u{10860}', 176 | '\u{10876}'), ('\u{10880}', '\u{1089e}'), ('\u{108e0}', '\u{108f2}'), 177 | ('\u{108f4}', '\u{108f5}'), ('\u{10900}', '\u{10915}'), ('\u{10920}', 178 | '\u{10939}'), ('\u{10980}', '\u{109b7}'), ('\u{109be}', '\u{109bf}'), 179 | ('\u{10a00}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', 180 | '\u{10a13}'), ('\u{10a15}', '\u{10a17}'), ('\u{10a19}', '\u{10a33}'), 181 | ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10a60}', 182 | '\u{10a7c}'), ('\u{10a80}', '\u{10a9c}'), ('\u{10ac0}', '\u{10ac7}'), 183 | ('\u{10ac9}', '\u{10ae6}'), ('\u{10b00}', '\u{10b35}'), ('\u{10b40}', 184 | '\u{10b55}'), ('\u{10b60}', '\u{10b72}'), ('\u{10b80}', '\u{10b91}'), 185 | ('\u{10c00}', '\u{10c48}'), ('\u{10c80}', '\u{10cb2}'), ('\u{10cc0}', 186 | '\u{10cf2}'), ('\u{11000}', '\u{11046}'), ('\u{11066}', '\u{1106f}'), 187 | ('\u{1107f}', '\u{110ba}'), ('\u{110d0}', '\u{110e8}'), ('\u{110f0}', 188 | '\u{110f9}'), ('\u{11100}', '\u{11134}'), ('\u{11136}', '\u{1113f}'), 189 | ('\u{11150}', '\u{11173}'), ('\u{11176}', '\u{11176}'), 
('\u{11180}', 190 | '\u{111c4}'), ('\u{111ca}', '\u{111cc}'), ('\u{111d0}', '\u{111da}'), 191 | ('\u{111dc}', '\u{111dc}'), ('\u{11200}', '\u{11211}'), ('\u{11213}', 192 | '\u{11237}'), ('\u{11280}', '\u{11286}'), ('\u{11288}', '\u{11288}'), 193 | ('\u{1128a}', '\u{1128d}'), ('\u{1128f}', '\u{1129d}'), ('\u{1129f}', 194 | '\u{112a8}'), ('\u{112b0}', '\u{112ea}'), ('\u{112f0}', '\u{112f9}'), 195 | ('\u{11300}', '\u{11303}'), ('\u{11305}', '\u{1130c}'), ('\u{1130f}', 196 | '\u{11310}'), ('\u{11313}', '\u{11328}'), ('\u{1132a}', '\u{11330}'), 197 | ('\u{11332}', '\u{11333}'), ('\u{11335}', '\u{11339}'), ('\u{1133c}', 198 | '\u{11344}'), ('\u{11347}', '\u{11348}'), ('\u{1134b}', '\u{1134d}'), 199 | ('\u{11350}', '\u{11350}'), ('\u{11357}', '\u{11357}'), ('\u{1135d}', 200 | '\u{11363}'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), 201 | ('\u{11480}', '\u{114c5}'), ('\u{114c7}', '\u{114c7}'), ('\u{114d0}', 202 | '\u{114d9}'), ('\u{11580}', '\u{115b5}'), ('\u{115b8}', '\u{115c0}'), 203 | ('\u{115d8}', '\u{115dd}'), ('\u{11600}', '\u{11640}'), ('\u{11644}', 204 | '\u{11644}'), ('\u{11650}', '\u{11659}'), ('\u{11680}', '\u{116b7}'), 205 | ('\u{116c0}', '\u{116c9}'), ('\u{11700}', '\u{11719}'), ('\u{1171d}', 206 | '\u{1172b}'), ('\u{11730}', '\u{11739}'), ('\u{118a0}', '\u{118e9}'), 207 | ('\u{118ff}', '\u{118ff}'), ('\u{11ac0}', '\u{11af8}'), ('\u{12000}', 208 | '\u{12399}'), ('\u{12400}', '\u{1246e}'), ('\u{12480}', '\u{12543}'), 209 | ('\u{13000}', '\u{1342e}'), ('\u{14400}', '\u{14646}'), ('\u{16800}', 210 | '\u{16a38}'), ('\u{16a40}', '\u{16a5e}'), ('\u{16a60}', '\u{16a69}'), 211 | ('\u{16ad0}', '\u{16aed}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b00}', 212 | '\u{16b36}'), ('\u{16b40}', '\u{16b43}'), ('\u{16b50}', '\u{16b59}'), 213 | ('\u{16b63}', '\u{16b77}'), ('\u{16b7d}', '\u{16b8f}'), ('\u{16f00}', 214 | '\u{16f44}'), ('\u{16f50}', '\u{16f7e}'), ('\u{16f8f}', '\u{16f9f}'), 215 | ('\u{1b000}', '\u{1b001}'), ('\u{1bc00}', '\u{1bc6a}'), ('\u{1bc70}', 216 | 
'\u{1bc7c}'), ('\u{1bc80}', '\u{1bc88}'), ('\u{1bc90}', '\u{1bc99}'), 217 | ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1d165}', '\u{1d169}'), ('\u{1d16d}', 218 | '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), 219 | ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1d400}', 220 | '\u{1d454}'), ('\u{1d456}', '\u{1d49c}'), ('\u{1d49e}', '\u{1d49f}'), 221 | ('\u{1d4a2}', '\u{1d4a2}'), ('\u{1d4a5}', '\u{1d4a6}'), ('\u{1d4a9}', 222 | '\u{1d4ac}'), ('\u{1d4ae}', '\u{1d4b9}'), ('\u{1d4bb}', '\u{1d4bb}'), 223 | ('\u{1d4bd}', '\u{1d4c3}'), ('\u{1d4c5}', '\u{1d505}'), ('\u{1d507}', 224 | '\u{1d50a}'), ('\u{1d50d}', '\u{1d514}'), ('\u{1d516}', '\u{1d51c}'), 225 | ('\u{1d51e}', '\u{1d539}'), ('\u{1d53b}', '\u{1d53e}'), ('\u{1d540}', 226 | '\u{1d544}'), ('\u{1d546}', '\u{1d546}'), ('\u{1d54a}', '\u{1d550}'), 227 | ('\u{1d552}', '\u{1d6a5}'), ('\u{1d6a8}', '\u{1d6c0}'), ('\u{1d6c2}', 228 | '\u{1d6da}'), ('\u{1d6dc}', '\u{1d6fa}'), ('\u{1d6fc}', '\u{1d714}'), 229 | ('\u{1d716}', '\u{1d734}'), ('\u{1d736}', '\u{1d74e}'), ('\u{1d750}', 230 | '\u{1d76e}'), ('\u{1d770}', '\u{1d788}'), ('\u{1d78a}', '\u{1d7a8}'), 231 | ('\u{1d7aa}', '\u{1d7c2}'), ('\u{1d7c4}', '\u{1d7cb}'), ('\u{1d7ce}', 232 | '\u{1d7ff}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), 233 | ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', 234 | '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('\u{1e800}', '\u{1e8c4}'), 235 | ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1ee00}', '\u{1ee03}'), ('\u{1ee05}', 236 | '\u{1ee1f}'), ('\u{1ee21}', '\u{1ee22}'), ('\u{1ee24}', '\u{1ee24}'), 237 | ('\u{1ee27}', '\u{1ee27}'), ('\u{1ee29}', '\u{1ee32}'), ('\u{1ee34}', 238 | '\u{1ee37}'), ('\u{1ee39}', '\u{1ee39}'), ('\u{1ee3b}', '\u{1ee3b}'), 239 | ('\u{1ee42}', '\u{1ee42}'), ('\u{1ee47}', '\u{1ee47}'), ('\u{1ee49}', 240 | '\u{1ee49}'), ('\u{1ee4b}', '\u{1ee4b}'), ('\u{1ee4d}', '\u{1ee4f}'), 241 | ('\u{1ee51}', '\u{1ee52}'), ('\u{1ee54}', '\u{1ee54}'), ('\u{1ee57}', 242 | '\u{1ee57}'), 
('\u{1ee59}', '\u{1ee59}'), ('\u{1ee5b}', '\u{1ee5b}'), 243 | ('\u{1ee5d}', '\u{1ee5d}'), ('\u{1ee5f}', '\u{1ee5f}'), ('\u{1ee61}', 244 | '\u{1ee62}'), ('\u{1ee64}', '\u{1ee64}'), ('\u{1ee67}', '\u{1ee6a}'), 245 | ('\u{1ee6c}', '\u{1ee72}'), ('\u{1ee74}', '\u{1ee77}'), ('\u{1ee79}', 246 | '\u{1ee7c}'), ('\u{1ee7e}', '\u{1ee7e}'), ('\u{1ee80}', '\u{1ee89}'), 247 | ('\u{1ee8b}', '\u{1ee9b}'), ('\u{1eea1}', '\u{1eea3}'), ('\u{1eea5}', 248 | '\u{1eea9}'), ('\u{1eeab}', '\u{1eebb}'), ('\u{1f130}', '\u{1f149}'), 249 | ('\u{1f150}', '\u{1f169}'), ('\u{1f170}', '\u{1f189}'), ('\u{20000}', 250 | '\u{2a6d6}'), ('\u{2a700}', '\u{2b734}'), ('\u{2b740}', '\u{2b81d}'), 251 | ('\u{2b820}', '\u{2cea1}'), ('\u{2f800}', '\u{2fa1d}'), ('\u{e0100}', 252 | '\u{e01ef}') 253 | ]; 254 | 255 | 256 | --------------------------------------------------------------------------------