├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches ├── bench.rs ├── bench_default.rs └── bench_dynamic.rs ├── src ├── dfa │ ├── minimizer.rs │ ├── mod.rs │ ├── prefix_searcher.rs │ └── trie.rs ├── error.rs ├── graph.rs ├── lib.rs ├── look.rs ├── nfa │ ├── has_looks.rs │ ├── mod.rs │ └── no_looks.rs ├── regex.rs ├── runner │ ├── anchored.rs │ ├── forward_backward.rs │ ├── mod.rs │ └── program.rs └── unicode.rs └── tests └── matches.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | *.swp 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | sudo: false 3 | 4 | # necessary for `travis-cargo coveralls --no-sudo` 5 | addons: 6 | apt: 7 | packages: 8 | - libcurl4-openssl-dev 9 | - libelf-dev 10 | - libdw-dev 11 | 12 | rust: 13 | - nightly 14 | 15 | before_script: 16 | - | 17 | pip install 'travis-cargo<0.2' --user && 18 | export PATH=$HOME/.local/bin:$PATH 19 | 20 | script: 21 | - | 22 | travis-cargo build && 23 | travis-cargo test && 24 | travis-cargo doc 25 | after_success: 26 | - travis-cargo doc-upload 27 | - travis-cargo coveralls --no-sudo 28 | 29 | env: 30 | global: 31 | - TRAVIS_CARGO_NIGHTLY_FEATURE="" 32 | - secure: 
FsSMY8g9LrxT3iLZPnTIAQNdIGmMNCYuwh0k1e8okpILNerRLSbDrCTtKgf4BA/wpRCYyZwb1O7K+8nd279WTaiWdQpDRGuY+JCXG91cKhXCZIydPWG3NAKD5p+WFLP16fwZkni54IAStwKymAiyZlcGN2/9lHy1KtbPUuStRuTKHIP6cEJfupVm0xcd0FuQZGDYY3h2YsgRz8JGW53nIwxRbv2Kti/4bwxCBOGgdo7nTYBidHfFWrMZQdKIrohBeRp0h1ALIGrS4ASrHeuk6wISOT57UhzlgQhZXstp18FIR3EyEbJyNzdu+0pvh4dVRxmSl5vCvtzyzph2szkJEYRgxz8JNDW3V4ya/rt/KURUxJNke3TCjcOV9uIxfnCQo0fuC2R4kO2zM7zen8K0gK77GmFfDoelM6f7KS0SL3ymxIQ7TR2eEHkXa4h+VrzuJcyGHiaruHolh6Hui6T5gSc6/1Y9Abovva95gguLmAMgtRAx19yaVBbRwr9VasC35bJBjFW9KhVCIptn2rZ0JUCNWscdnzKBI5c5ffv6ZNtCqapGf/rQyZuEZJgNzcr2BIKQwtUoyPojWu8i87FEAg9GIgjlDWBSR+dxMeehuXt1kuEsVXl1BZkqSrwFyK64PT+il8eKyyqsDZAfqmZfIi2TfRN292cZkFLfWew79Co= 33 | 34 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "regex_dfa" 3 | version = "0.5.0" 4 | authors = ["Joe Neeman "] 5 | description = "A crate for turning regexes into DFAs." 
6 | documentation = "http://jneem.github.io/regex-dfa" 7 | homepage = "http://jneem.github.io/regex-dfa" 8 | repository = "http://github.com/jneem/regex-dfa" 9 | readme = "README.md" 10 | license = "MIT/Apache-2.0" 11 | 12 | [dependencies] 13 | itertools = "0.4" 14 | lazy_static = "0.1" 15 | memchr = "0.1" 16 | num-traits = "0.1" 17 | range-map = "0.1.5" 18 | refinery = "0.1" 19 | regex-syntax = "0.2" 20 | utf8-ranges = "0.1" 21 | 22 | [dev-dependencies] 23 | matches = "0.1" 24 | quickcheck = "0.2" 25 | regex = "0.1.41" 26 | rand = "0.3" 27 | serde_json = "0.6" 28 | 29 | [[bench]] 30 | name = "dynamic" 31 | path = "benches/bench_dynamic.rs" 32 | test = false 33 | bench = true 34 | 35 | [[bench]] 36 | name = "default" 37 | path = "benches/bench_default.rs" 38 | test = true 39 | bench = true 40 | 41 | [[test]] 42 | name = "examples" 43 | path = "tests/matches.rs" 44 | 45 | [[test]] 46 | name = "crate" 47 | path = "src/lib.rs" 48 | 49 | [profile.bench] 50 | debug = true 51 | lto = true 52 | 53 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 The Rust Project Developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | regex_dfa 2 | ========= 3 | 4 | A crate for compiling regular expressions down to deterministic finite 5 | automata. 6 | 7 | [![Build status](https://travis-ci.org/jneem/regex-dfa.svg)](https://travis-ci.org/jneem/regex-dfa) 8 | [![Coverage Status](https://coveralls.io/repos/jneem/regex-dfa/badge.svg?branch=master&service=github)](https://coveralls.io/github/jneem/regex-dfa?branch=master) 9 | 10 | [Documentation](http://jneem.github.io/regex-dfa/regex_dfa/index.html) 11 | 12 | # Why? 13 | 14 | Some regular expression implementations (e.g. [rust's regex 15 | library](http://github.com/rust-lang/regex)) are based on simulating 16 | non-deterministic finite automata (NFAs). By turning NFAs into DFAs, we can 17 | get a speed boost, at the cost of some compilation time and memory usage. 
18 | [Preliminary benchmarks](http://bl.ocks.org/jneem/raw/3f08ade195796358d027/?data=%5B%7B%22x%22%3A%201506622%2C%20%22y%22%3A%201646883%2C%20%22bench%22%3A%20%22%28%3Fi%29Twain%22%2C%20%22ratio%22%3A%200.914832444077691%7D%2C%20%7B%22x%22%3A%2015187577%2C%20%22y%22%3A%20125978857%2C%20%22bench%22%3A%20%22%5Ba-q%5D%5B%5Eu-z%5D%7B13%7Dx%22%2C%20%22ratio%22%3A%200.12055655497811034%7D%2C%20%7B%22x%22%3A%201630324%2C%20%22y%22%3A%201631615%2C%20%22bench%22%3A%20%22Tom%7CSawyer%7CHuckleberry%7CFinn%22%2C%20%22ratio%22%3A%200.9992087594193483%7D%2C%20%7B%22x%22%3A%201501634%2C%20%22y%22%3A%20243419316%2C%20%22bench%22%3A%20%22.%7B0%2C2%7D%28Tom%7CSawyer%7CHuckleberry%7CFinn%29%22%2C%20%22ratio%22%3A%200.0061689188215449595%7D%2C%20%7B%22x%22%3A%201506470%2C%20%22y%22%3A%20295351074%2C%20%22bench%22%3A%20%22.%7B2%2C4%7D%28Tom%7CSawyer%7CHuckleberry%7CFinn%29%22%2C%20%22ratio%22%3A%200.005100607827821882%7D%2C%20%7B%22x%22%3A%201583452%2C%20%22y%22%3A%201724976%2C%20%22bench%22%3A%20%22Tom.%7B10%2C25%7Driver%7Criver.%7B10%2C25%7DTom%22%2C%20%22ratio%22%3A%200.9179559599669792%7D%2C%20%7B%22x%22%3A%201290763%2C%20%22y%22%3A%207366270%2C%20%22bench%22%3A%20%22%5B%5C%22%27%5D%5B%5E%5C%22%27%5D%7B0%2C30%7D%5B%3F%21%5C%5C.%5D%5B%5C%22%27%5D%22%2C%20%22ratio%22%3A%200.17522613208584534%7D%2C%20%7B%22x%22%3A%201571131%2C%20%22y%22%3A%20137662852%2C%20%22bench%22%3A%20%22%28%3Fi%29Tom%7CSawyer%7CHuckleberry%7CFinn%22%2C%20%22ratio%22%3A%200.011412890094707612%7D%2C%20%7B%22x%22%3A%201493996%2C%20%22y%22%3A%20131818244%2C%20%22bench%22%3A%20%22%28%5BA-Za-z%5Dawyer%7C%5BA-Za-z%5Dinn%29%5C%5Cs%22%2C%20%22ratio%22%3A%200.011333757412213746%7D%2C%20%7B%22x%22%3A%201499437%2C%20%22y%22%3A%2094501284%2C%20%22bench%22%3A%20%22%5C%5Cb%5C%5Cw%2Bnn%5C%5Cb%22%2C%20%22ratio%22%3A%200.015866842613482375%7D%2C%20%7B%22x%22%3A%201494150%2C%20%22y%22%3A%201620677%2C%20%22bench%22%3A%20%22Huck%5Ba-zA-Z%5D%2B%7CSaw%5Ba-zA-Z%5D%2B%22%2C%20%22ratio%22%3A%200.9219295393221475%7D%2C%20%7B%22x%22%3A%20
4604887%2C%20%22y%22%3A%2084880712%2C%20%22bench%22%3A%20%22%5C%5Cs%5Ba-zA-Z%5D%7B0%2C12%7Ding%5C%5Cs%22%2C%20%22ratio%22%3A%200.0542512767800534%7D%2C%20%7B%22x%22%3A%201657821%2C%20%22y%22%3A%201664874%2C%20%22bench%22%3A%20%22%5Ba-z%5Dshing%22%2C%20%22ratio%22%3A%200.9957636433748139%7D%2C%20%7B%22x%22%3A%20100143%2C%20%22y%22%3A%2099739%2C%20%22bench%22%3A%20%22Twain%22%2C%20%22ratio%22%3A%201.0040505719929014%7D%2C%20%7B%22x%22%3A%204877680%2C%20%22y%22%3A%2071127171%2C%20%22bench%22%3A%20%22%5Ba-zA-Z%5D%2Bing%22%2C%20%22ratio%22%3A%200.06857688744572732%7D%2C%20%7B%22x%22%3A%201584255%2C%20%22y%22%3A%2062666721%2C%20%22bench%22%3A%20%22%5C%5CbF%5C%5Cw%2Bn%5C%5Cb%22%2C%20%22ratio%22%3A%200.025280642974761677%7D%5D) 19 | show a substantial speed improvement over rust's default `regex` crate. 20 | 21 | # Limitations 22 | 23 | - Turning an NFA into a DFA can take a lot of memory, especially when unicode character classes are involved. 24 | - Subgroup captures are a bit tricky, and this crate does not handle them. 25 | - `regex_dfa` currently only works on nightly rust. 26 | 27 | # License 28 | 29 | `regex_dfa` is distributed under the MIT license and the Apache license (version 2.0). 30 | See LICENSE-APACHE and LICENSE-MIT for details. 31 | 32 | -------------------------------------------------------------------------------- /benches/bench.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. 2 | // Copyright 2015-2016 Joe Neeman. 3 | // 4 | // Licensed under the Apache License, Version 2.0 or the MIT license 6 | // , at your 7 | // option. This file may not be copied, modified, or distributed 8 | // except according to those terms. 
9 | #![allow(non_snake_case)] 10 | 11 | use std::iter::repeat; 12 | use rand::{Rng, thread_rng}; 13 | use test::Bencher; 14 | 15 | fn bench_assert_non_match(b: &mut Bencher, re: ::Regex, text: &str) { 16 | b.iter(|| if re.is_match(text) { panic!("match") }); 17 | } 18 | 19 | fn bench_assert_match(b: &mut Bencher, re: ::Regex, text: &str) { 20 | b.iter(|| if !re.is_match(text) { panic!("no match") }); 21 | } 22 | 23 | #[bench] 24 | fn compile_word_boundary(b: &mut Bencher) { 25 | b.iter(|| regex!(r"\btest\b")); 26 | } 27 | 28 | #[bench] 29 | fn literal(b: &mut Bencher) { 30 | let re = regex!("y"); 31 | let text = format!("{}y", repeat("x").take(50).collect::()); 32 | bench_assert_match(b, re, &text); 33 | } 34 | 35 | #[bench] 36 | fn longer_literal(b: &mut Bencher) { 37 | let re = regex!("foobar"); 38 | let text = format!("{}foobar", repeat("f").take(10000).collect::()); 39 | bench_assert_match(b, re, &text); 40 | } 41 | 42 | #[bench] 43 | fn longer_literal_no_regex(b: &mut Bencher) { 44 | let re = "foobar"; 45 | let text = format!("{}foobar", repeat("f").take(10000).collect::()); 46 | b.iter(|| text.find(re)); 47 | } 48 | 49 | #[bench] 50 | fn not_literal(b: &mut Bencher) { 51 | let re = regex!(".y"); 52 | let text = format!("{}y", repeat("x").take(10000).collect::()); 53 | b.bytes = 10000; 54 | bench_assert_match(b, re, &text); 55 | } 56 | 57 | #[bench] 58 | fn match_class(b: &mut Bencher) { 59 | let re = regex!("[abcdw]"); 60 | let text = format!("{}w", repeat("xxxx").take(20).collect::()); 61 | bench_assert_match(b, re, &text); 62 | } 63 | 64 | #[bench] 65 | fn match_class_in_range(b: &mut Bencher) { 66 | // 'b' is between 'a' and 'c', so the class range checking doesn't help. 
67 | let re = regex!("[ac]"); 68 | let text = format!("{}c", repeat("bbbb").take(20).collect::()); 69 | bench_assert_match(b, re, &text); 70 | } 71 | 72 | #[bench] 73 | fn match_class_unicode(b: &mut Bencher) { 74 | let re = regex!(r"\pL"); 75 | let text = format!("{}a", repeat("☃5☃5").take(20).collect::()); 76 | bench_assert_match(b, re, &text); 77 | } 78 | 79 | #[bench] 80 | fn anchored_literal_short_non_match(b: &mut Bencher) { 81 | let re = regex!("^zbc(d|e)"); 82 | let text = "abcdefghijklmnopqrstuvwxyz"; 83 | bench_assert_non_match(b, re, &text); 84 | } 85 | 86 | #[bench] 87 | fn anchored_literal_long_non_match(b: &mut Bencher) { 88 | let re = regex!("^zbc(d|e)"); 89 | let text: String = repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect(); 90 | bench_assert_non_match(b, re, &text); 91 | } 92 | 93 | #[bench] 94 | fn anchored_literal_short_match(b: &mut Bencher) { 95 | let re = regex!("^.bc(d|e)"); 96 | let text = "abcdefghijklmnopqrstuvwxyz"; 97 | bench_assert_match(b, re, text); 98 | } 99 | 100 | #[bench] 101 | fn anchored_literal_long_match(b: &mut Bencher) { 102 | let re = regex!("^.bc(d|e)"); 103 | let text: String = repeat("abcdefghijklmnopqrstuvwxyz").take(15).collect(); 104 | bench_assert_match(b, re, &text); 105 | } 106 | 107 | #[bench] 108 | fn one_pass_short_a(b: &mut Bencher) { 109 | let re = regex!("^.bc(d|e)*$"); 110 | let text = "abcddddddeeeededd"; 111 | bench_assert_match(b, re, text); 112 | } 113 | 114 | #[bench] 115 | fn one_pass_short_a_not(b: &mut Bencher) { 116 | let re = regex!(".bc(d|e)*$"); 117 | let text = "abcddddddeeeededd"; 118 | bench_assert_match(b, re, text); 119 | } 120 | 121 | #[bench] 122 | fn one_pass_short_b(b: &mut Bencher) { 123 | let re = regex!("^.bc(?:d|e)*$"); 124 | let text = "abcddddddeeeededd"; 125 | bench_assert_match(b, re, text); 126 | } 127 | 128 | #[bench] 129 | fn one_pass_short_b_not(b: &mut Bencher) { 130 | let re = regex!(".bc(?:d|e)*$"); 131 | let text = "abcddddddeeeededd"; 132 | bench_assert_match(b, 
re, text); 133 | } 134 | 135 | #[bench] 136 | fn one_pass_long_prefix(b: &mut Bencher) { 137 | let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$"); 138 | let text = "abcdefghijklmnopqrstuvwxyz"; 139 | bench_assert_match(b, re, text); 140 | } 141 | 142 | #[bench] 143 | fn one_pass_long_prefix_not(b: &mut Bencher) { 144 | let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$"); 145 | let text = "abcdefghijklmnopqrstuvwxyz"; 146 | bench_assert_match(b, re, text); 147 | } 148 | 149 | #[bench] 150 | fn backtrack(b: &mut Bencher) { 151 | let re = regex!("a*b"); 152 | let text: String = repeat("aaaaaaaaaaaaaaaaaaaaaaaaaaaa").take(100).collect(); 153 | b.bytes = text.len() as u64; 154 | bench_assert_non_match(b, re, &text); 155 | } 156 | 157 | #[bench] 158 | fn skip(b: &mut Bencher) { 159 | let re = regex!("a[b-zA-Z]+a"); 160 | let text: String = repeat("aaaaaaaaaaaaaaaaaaaaaaaaaaaa").take(100).collect(); 161 | b.bytes = text.len() as u64; 162 | bench_assert_non_match(b, re, &text); 163 | } 164 | 165 | macro_rules! 
throughput( 166 | ($name:ident, $regex:expr, $size:expr) => ( 167 | #[bench] 168 | fn $name(b: &mut Bencher) { 169 | let text = gen_text($size); 170 | b.bytes = $size; 171 | let re = $regex; 172 | b.iter(|| if re.is_match(&text) { panic!("match") }); 173 | } 174 | ); 175 | ); 176 | 177 | fn easy0() -> ::Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 178 | fn easy1() -> ::Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") } 179 | fn medium() -> ::Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 180 | fn hard() -> ::Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 181 | 182 | fn gen_text(n: usize) -> String { 183 | let mut rng = thread_rng(); 184 | let mut bytes = rng.gen_ascii_chars().map(|n| n as u8).take(n) 185 | .collect::>(); 186 | for (i, b) in bytes.iter_mut().enumerate() { 187 | if i % 20 == 0 { 188 | *b = b'\n' 189 | } 190 | } 191 | String::from_utf8(bytes).unwrap() 192 | } 193 | 194 | throughput!(easy0_32, easy0(), 32); 195 | throughput!(easy0_1K, easy0(), 1<<10); 196 | throughput!(easy0_32K, easy0(), 32<<10); 197 | throughput!(easy0_1MB, easy0(), 1<<20); 198 | 199 | throughput!(easy1_32, easy1(), 32); 200 | throughput!(easy1_1K, easy1(), 1<<10); 201 | throughput!(easy1_32K, easy1(), 32<<10); 202 | throughput!(easy1_1MB, easy1(), 1<<20); 203 | 204 | throughput!(medium_32, medium(), 32); 205 | throughput!(medium_1K, medium(), 1<<10); 206 | throughput!(medium_32K,medium(), 32<<10); 207 | throughput!(medium_1MB, medium(), 1<<20); 208 | 209 | throughput!(hard_32, hard(), 32); 210 | throughput!(hard_1K, hard(), 1<<10); 211 | throughput!(hard_32K,hard(), 32<<10); 212 | throughput!(hard_1MB, hard(), 1<<20); 213 | 214 | -------------------------------------------------------------------------------- /benches/bench_default.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Rust Project Developers. 2 | // Copyright 2015-2016 Joe Neeman. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 or the MIT license 6 | // , at your 7 | // option. This file may not be copied, modified, or distributed 8 | // except according to those terms. 9 | 10 | #![feature(test)] 11 | 12 | extern crate rand; 13 | extern crate regex; 14 | extern crate regex_dfa; 15 | extern crate test; 16 | 17 | // Due to macro scoping rules, this definition only applies for the modules 18 | // defined below. Effectively, it allows us to use the same tests for both 19 | // native and dynamic regexes. 20 | macro_rules! regex( 21 | ($re:expr) => ( 22 | ::regex_dfa::Regex::new($re).unwrap() 23 | ); 24 | ); 25 | 26 | type Regex = ::regex_dfa::Regex; 27 | 28 | mod bench; 29 | -------------------------------------------------------------------------------- /benches/bench_dynamic.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | #![feature(test)] 10 | 11 | extern crate rand; 12 | extern crate regex; 13 | extern crate test; 14 | 15 | // Due to macro scoping rules, this definition only applies for the modules 16 | // defined below. Effectively, it allows us to use the same tests for both 17 | // native and dynamic regexes. 18 | macro_rules! regex( 19 | ($re:expr) => ( 20 | match ::regex::Regex::new($re) { 21 | Ok(re) => re, 22 | Err(err) => panic!("{}", err), 23 | } 24 | ); 25 | ); 26 | 27 | type Regex = ::regex::Regex; 28 | 29 | mod bench; 30 | -------------------------------------------------------------------------------- /src/dfa/minimizer.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use dfa::{Dfa, RetTrait}; 10 | use nfa::{Accept, StateIdx, StateSet}; 11 | use range_map::{RangeMultiMap, RangeSet}; 12 | use refinery::Partition; 13 | use std::collections::{HashSet, HashMap}; 14 | 15 | pub struct Minimizer { 16 | partition: Partition, 17 | distinguishers: HashSet, 18 | // The reversed transitions of the dfa. 19 | rev: Vec>, 20 | } 21 | 22 | impl Minimizer { 23 | // Partition the states according to 24 | // - when they accept, 25 | // - what they return if they do sometimes accept, and 26 | // - what set of bytes do we expect to see next. 27 | fn initial_partition(dfa: &Dfa) -> Vec> { 28 | let mut part: HashMap<(Accept, Option<&Ret>, RangeSet), Vec> = HashMap::new(); 29 | for (idx, st) in dfa.states.iter().enumerate() { 30 | let chars = st.transitions.to_range_set(); 31 | part.entry((st.accept, dfa.ret(idx), chars)).or_insert_with(Vec::new).push(idx); 32 | } 33 | part.into_iter().map(|x| x.1).collect() 34 | } 35 | 36 | // Refine the current partition based on the fact that everything in `splitter` is distinct 37 | // from everything not in it. 
38 | fn refine(&mut self, splitter: &[StateIdx]) { 39 | let dists = &mut self.distinguishers; 40 | 41 | self.partition.refine_with_callback(splitter, |p, int_idx, diff_idx| { 42 | if dists.contains(&int_idx) || p.part(diff_idx).len() < p.part(int_idx).len() { 43 | dists.insert(diff_idx); 44 | } else { 45 | dists.insert(int_idx); 46 | } 47 | }); 48 | } 49 | 50 | fn next_distinguisher(&mut self) -> Option { 51 | let maybe_elt = self.distinguishers.iter().next().cloned(); 52 | if let Some(elt) = maybe_elt { 53 | self.distinguishers.remove(&elt); 54 | } 55 | maybe_elt 56 | } 57 | 58 | fn get_input_sets(&mut self, part_idx: usize) -> Vec { 59 | let inputs: Vec<_> = self.partition.part(part_idx) 60 | .iter() 61 | .flat_map(|s| self.rev[*s].ranges_values().cloned()) 62 | .collect(); 63 | if inputs.is_empty() { 64 | return Vec::new(); 65 | } 66 | 67 | let inputs = RangeMultiMap::from_vec(inputs); 68 | let mut sets: Vec = inputs.group() 69 | .ranges_values() 70 | .map(|&(_, ref x)| x.clone()) 71 | .collect(); 72 | for set in &mut sets { 73 | set.sort(); 74 | } 75 | sets.sort(); 76 | sets.dedup(); 77 | sets 78 | } 79 | 80 | fn compute_partition(&mut self) { 81 | while let Some(dist) = self.next_distinguisher() { 82 | let sets = self.get_input_sets(dist); 83 | 84 | for set in &sets { 85 | self.refine(set); 86 | } 87 | } 88 | } 89 | 90 | pub fn minimize(dfa: &Dfa) -> Dfa { 91 | let mut min = Minimizer::new(dfa); 92 | 93 | min.compute_partition(); 94 | 95 | let mut ret = Dfa::new(); 96 | 97 | // We need to re-index the states: build a map that maps old indices to 98 | // new indices. 99 | let mut old_state_to_new = vec![0; dfa.num_states()]; 100 | for part in min.partition.iter() { 101 | // This unwrap is safe because we don't allow any empty sets into the partition. 
102 | let rep_idx = *part.iter().next().unwrap(); 103 | ret.states.push(dfa.states[rep_idx].clone()); 104 | 105 | for &state in part.iter() { 106 | old_state_to_new[state] = ret.states.len() - 1; 107 | } 108 | } 109 | 110 | ret.map_states(|s: StateIdx| old_state_to_new[s]); 111 | ret.init = dfa.init.iter() 112 | .map(|x| x.map(|s: StateIdx| old_state_to_new[s])) 113 | .collect(); 114 | ret 115 | } 116 | 117 | fn new(dfa: &Dfa) -> Minimizer { 118 | let init = Minimizer::initial_partition(dfa); 119 | let part = Partition::new(init.into_iter().map(|set| set.into_iter()), dfa.num_states()); 120 | 121 | // According to Hopcroft's algorithm, we're allowed to leave out one of the distinguishers 122 | // (at least, as long as it isn't a set of accepting states). Choose the one with the 123 | // most states to leave out. 124 | let mut dists: HashSet = (0..part.num_parts()).collect(); 125 | let worst = (0..dists.len()) 126 | .filter(|i| dfa.states[part.part(*i)[0]].accept == Accept::Never) 127 | .max_by_key(|i| part.part(*i).len()); 128 | if let Some(worst) = worst { 129 | dists.remove(&worst); 130 | } 131 | 132 | Minimizer { 133 | partition: part, 134 | distinguishers: dists, 135 | rev: dfa.reversed_transitions(), 136 | } 137 | } 138 | } 139 | 140 | 141 | -------------------------------------------------------------------------------- /src/dfa/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 

mod trie;
mod prefix_searcher;
mod minimizer;

use dfa::minimizer::Minimizer;
use dfa::prefix_searcher::PrefixSearcher;
use graph::Graph;
use look::Look;
use itertools::Itertools;
use nfa::{Accept, StateIdx};
use range_map::{RangeMap, RangeMultiMap};
use refinery::Partition;
use runner::program::TableInsts;
use std;
use std::fmt::{Debug, Formatter};
use std::hash::Hash;
use std::mem;
use std::u32;

pub use dfa::prefix_searcher::PrefixPart;

/// A single DFA state: its outgoing byte transitions, when it accepts, and
/// what value it returns on acceptance.
#[derive(Clone, PartialEq, Debug)]
pub struct State<Ret> {
    pub transitions: RangeMap<u8, StateIdx>,
    pub accept: Accept,
    pub ret: Option<Ret>,
}

impl<Ret> State<Ret> {
    /// Creates a state with no outgoing transitions.
    pub fn new(accept: Accept, ret: Option<Ret>) -> State<Ret> {
        State {
            transitions: RangeMap::new(),
            accept: accept,
            ret: ret,
        }
    }
}

/// Marker trait for types that a DFA can return on a match; blanket-implemented
/// for everything with the required bounds.
pub trait RetTrait: Clone + Copy + Debug + Eq + Hash {}
impl<T: Clone + Copy + Debug + Eq + Hash> RetTrait for T {}

#[derive(Clone, PartialEq)]
pub struct Dfa<Ret> {
    states: Vec<State<Ret>>,

    /// This is a vector of length `Look::num()` containing all possible starting positions.
    ///
    /// `init[Look::Boundary]` is the starting position if we are at the beginning of the
    /// input.
    ///
    /// `init[Look::Full]` is the default starting position.
    ///
    /// All other positions in `init` are only used if we are specifically asked to start
    /// there; this is mainly useful in the forward-backward engine.
    pub init: Vec<Option<StateIdx>>,
}

impl<Ret: RetTrait> Dfa<Ret> {
    /// Returns a `Dfa` with no states.
    pub fn new() -> Dfa<Ret> {
        Dfa {
            states: Vec::new(),
            init: vec![None; Look::num()],
        }
    }

    /// Returns the number of states.
    pub fn num_states(&self) -> usize {
        self.states.len()
    }

    /// Appends a fresh state (with no transitions) and returns its index.
    pub fn add_state(&mut self, accept: Accept, ret: Option<Ret>) -> StateIdx {
        self.states.push(State::new(accept, ret));
        self.states.len() - 1
    }

    /// Replaces the outgoing transitions of state `from`.
    pub fn set_transitions(&mut self, from: StateIdx, transitions: RangeMap<u8, StateIdx>) {
        self.states[from].transitions = transitions;
    }

    /// The starting state associated with look-behind `look`, if any.
    pub fn init_state(&self, look: Look) -> Option<StateIdx> {
        self.init[look.as_usize()]
    }

    pub fn init_at_start(&self) -> Option<StateIdx> {
        self.init_state(Look::Boundary)
    }

    pub fn init_otherwise(&self) -> Option<StateIdx> {
        self.init_state(Look::Full)
    }

    /// Returns true if this `Dfa` only matches things at the beginning of the input.
    pub fn is_anchored(&self) -> bool {
        self.init_otherwise().is_none() && self.init_at_start().is_some()
    }

    /// Get transitions from a given state.
    pub fn transitions(&self, state: StateIdx) -> &RangeMap<u8, StateIdx> {
        &self.states[state].transitions
    }

    /// Returns the conditions under which the given state accepts.
    pub fn accept(&self, state: StateIdx) -> &Accept {
        &self.states[state].accept
    }

    /// The value that will be returned if we accept in state `state`.
    pub fn ret(&self, state: StateIdx) -> Option<&Ret> {
        self.states[state].ret.as_ref()
    }

    /// Changes the return value.
    pub fn map_ret<T: RetTrait, F: FnMut(Ret) -> T>(self, mut f: F) -> Dfa<T> {
        let mut ret: Dfa<T> = Dfa::new();
        ret.init = self.init;

        for st in self.states {
            let new_st = State {
                transitions: st.transitions,
                accept: st.accept,
                ret: st.ret.map(&mut f),
            };
            ret.states.push(new_st);
        }
        ret
    }

    /// Returns an equivalent DFA with a minimal number of states.
    ///
    /// Uses Hopcroft's algorithm.
140 | fn minimize(&self) -> Dfa { 141 | Minimizer::minimize(self) 142 | } 143 | 144 | /// Returns the transitions of this automaton, reversed. 145 | fn reversed_transitions(&self) -> Vec> { 146 | let mut ret = vec![RangeMultiMap::new(); self.states.len()]; 147 | 148 | for (source, st) in self.states.iter().enumerate() { 149 | for &(range, target) in st.transitions.ranges_values() { 150 | ret[target].insert(range, source); 151 | } 152 | } 153 | 154 | ret 155 | } 156 | 157 | /// Returns a set of strings that match the beginning of this `Dfa`. 158 | /// 159 | /// If the set is non-empty, every match of this `Dfa` is guaranteed to start with one of these 160 | /// strings. 161 | pub fn prefix_strings(&self) -> Vec { 162 | // It might seem silly to look for prefixes starting at the anchored state, but it's useful 163 | // for forward-backward matching. In cases where the regex is honestly anchored, we won't 164 | // ask to make a prefix anyway. 165 | if let Some(state) = self.init_state(Look::Boundary) { 166 | PrefixSearcher::extract(self, state) 167 | } else { 168 | Vec::new() 169 | } 170 | } 171 | 172 | /* 173 | pub fn critical_strings(&self) -> Vec<(Vec, StateIdx)> { 174 | unimplemented!(); 175 | } 176 | */ 177 | 178 | // Finds the bytes that are treated equivalently by this Dfa. 179 | // 180 | // Returns a Vec of length 256 such that vec[i] == vec[j] when i and j are two equivalent 181 | // bytes. Also returns the log of the number of classes, rounded up. 
182 | fn byte_equivalence_classes(&self) -> (Vec, u32) { 183 | let mut part = Partition::new(Some(0..256).into_iter(), 256); 184 | let mut buf = Vec::with_capacity(256); 185 | 186 | for st in &self.states { 187 | let group = st.transitions.keys_values().group_by_lazy(|x| x.1); 188 | for (_, keys_values) in &group { 189 | buf.clear(); 190 | for (key, _) in keys_values { 191 | buf.push(key as usize); 192 | } 193 | part.refine(&buf); 194 | } 195 | } 196 | 197 | let mut ret = vec![0; 256]; 198 | for (i, p) in part.iter().enumerate() { 199 | for &x in p { 200 | ret[x] = i as u8; 201 | } 202 | } 203 | let size = (part.num_parts() - 1) as u32; 204 | 205 | (ret, 32 - size.leading_zeros()) 206 | } 207 | 208 | /// Compiles this `Dfa` into instructions for execution. 209 | pub fn compile(&self) -> TableInsts { 210 | let (byte_class, log_num_classes) = self.byte_equivalence_classes(); 211 | 212 | let mut table = vec![u32::MAX; self.num_states() << log_num_classes]; 213 | let accept: Vec> = self.states.iter() 214 | .map(|st| if st.accept == Accept::Always { st.ret } else { None }) 215 | .collect(); 216 | let accept_at_eoi: Vec> = self.states.iter() 217 | .map(|st| if st.accept != Accept::Never { st.ret } else { None }) 218 | .collect(); 219 | 220 | for (idx, st) in self.states.iter().enumerate() { 221 | for (ch, &tgt_state) in st.transitions.keys_values() { 222 | let class = byte_class[ch as usize]; 223 | table[(idx << log_num_classes) + class as usize] = tgt_state as u32; 224 | } 225 | } 226 | 227 | TableInsts { 228 | log_num_classes: log_num_classes, 229 | byte_class: byte_class, 230 | accept: accept, 231 | accept_at_eoi: accept_at_eoi, 232 | table: table, 233 | } 234 | } 235 | 236 | /// Finds an equivalent DFA with the minimal number of states. 237 | pub fn optimize(self) -> Dfa { 238 | let mut ret = self.minimize(); 239 | ret.sort_states(); 240 | ret 241 | } 242 | 243 | /// Deletes any transitions that return to the initial state. 
244 | /// 245 | /// This results in a new Dfa with the following properties: 246 | /// - if the original Dfa has a match then the new Dfa also has a match that ends in the same 247 | /// position (and vice versa), and 248 | /// - the new Dfa doesn't need to backtrack to find matches: if it fails then it can be 249 | /// restarted at the same position it failed in. 250 | /// 251 | /// The reason for this method is that it makes prefixes more effective: where the original Dfa 252 | /// would just loop back to the start state, the new Dfa will signal a failure. Then we can use 253 | /// a `Prefix` to scan ahead for a good place to resume matching. 254 | /// 255 | /// # Panics 256 | /// - if `self` is not anchored. 257 | pub fn cut_loop_to_init(mut self) -> Dfa { 258 | if !self.is_anchored() { 259 | panic!("only anchored Dfas can be cut"); 260 | } 261 | 262 | // The unwrap is safe because we just checked that we are anchored. 263 | let init = self.init_at_start().unwrap(); 264 | for st in &mut self.states { 265 | st.transitions.retain_values(|x| *x != init); 266 | } 267 | self 268 | } 269 | 270 | fn map_states StateIdx>(&mut self, mut map: F) { 271 | for st in &mut self.states { 272 | st.transitions.map_values(|x| map(*x)); 273 | } 274 | let init: Vec<_> = self.init.iter().map(|x| x.map(&mut map)).collect(); 275 | self.init = init; 276 | } 277 | 278 | /// Sorts states in depth-first alphabetical order. 279 | /// 280 | /// This has the following advantages: 281 | /// - the construction of a `Dfa` becomes deterministic: without sorting, the states aren't in 282 | /// deterministic order because `minimize` using hashing. 283 | /// - better locality: after sorting, many transitions just go straight to the next state. 284 | /// - we prune unreachable states. 285 | fn sort_states(&mut self) { 286 | let sorted = self.dfs_order(self.init.iter().filter_map(|x| *x)); 287 | 288 | // Not every old state will necessary get mapped to a new one (unreachable states won't). 
289 | let mut state_map: Vec> = vec![None; self.states.len()]; 290 | let mut old_states = vec![State::new(Accept::Never, None); self.states.len()]; 291 | mem::swap(&mut old_states, &mut self.states); 292 | 293 | for (new_idx, old_idx) in sorted.into_iter().enumerate() { 294 | state_map[old_idx] = Some(new_idx); 295 | mem::swap(&mut old_states[old_idx], &mut self.states[new_idx]); 296 | } 297 | 298 | // Fix the transitions and initialization to point to the new states. The `unwrap` here is 299 | // basically the assertion that all reachable states should be mapped to new states. 300 | self.map_states(|s| state_map[s].unwrap()); 301 | } 302 | 303 | /* 304 | // Finds all the transitions between states that only match a single byte. 305 | fn single_byte_transitions(&self) -> HashMap<(StateIdx, StateIdx), u8> { 306 | use std::collections::hash_map::Entry::*; 307 | 308 | let mut ret = HashMap::new(); 309 | let mut seen = HashSet::new(); 310 | for (src_idx, st) in self.states.iter().enumerate() { 311 | for &(range, tgt_idx) in st.transitions.ranges_values() { 312 | if range.start == range.end && !seen.contains(&(src_idx, tgt_idx)) { 313 | match ret.entry((src_idx, tgt_idx)) { 314 | Occupied(e) => { 315 | e.remove(); 316 | seen.insert((src_idx, tgt_idx)); 317 | }, 318 | Vacant(e) => { e.insert(range.start); }, 319 | } 320 | } 321 | } 322 | } 323 | ret 324 | } 325 | 326 | // Finds all the single-byte transitions that must be traversed in order to get to an accepting 327 | // state. 328 | fn mandatory_single_byte_transitions(&self, max_steps: usize) -> Vec<(StateIdx, StateIdx, u8)> { 329 | let map = self.single_byte_transitions(); 330 | let interesting_bytes: HashSet = map.values().cloned().collect(); 331 | 332 | // In order to get from the initial state to state i, we need to see all the bytes in 333 | // mandatory_bytes[i] at least once. (At least, that's the goal of mandatory_bytes; we 334 | // start out with too many elements in it and gradually remove them.) 
335 | let mut mandatory_bytes = vec![interesting_bytes.clone(); self.num_states()]; 336 | mandatory_bytes[0] = HashSet::new(); 337 | 338 | let mut visited = HashSet::::new(); 339 | let mut active = HashSet::::new(); 340 | let mut next = HashSet::::new(); 341 | next.insert(0); 342 | let mut steps_left = max_steps; 343 | 344 | fn intersect(a: &mut HashSet, b: &HashSet) -> bool { 345 | let old_size = a.len(); 346 | *a = a.intersection(b).cloned().collect(); 347 | a.len() < old_size 348 | } 349 | 350 | while steps_left > 0 { 351 | steps_left -= 1; 352 | mem::swap(&mut active, &mut next); 353 | next.clear(); 354 | 355 | for &src in &active { 356 | // If we found an accepting state, keep it in the active set but don't go any 357 | // further. 358 | if self.accept(src) != &Accept::Never { 359 | next.insert(src); 360 | continue; 361 | } 362 | 363 | visited.insert(src); 364 | for tgt in self.transitions(src).ranges_values().map(|x| x.1).dedup() { 365 | let mut bytes = mandatory_bytes[src].clone(); 366 | if let Some(b) = map.get(&(src, tgt)) { 367 | bytes.insert(*b); 368 | } 369 | if intersect(&mut mandatory_bytes[tgt], &bytes) || !visited.contains(&tgt) { 370 | next.insert(tgt); 371 | } 372 | } 373 | } 374 | } 375 | 376 | let critical_bytes = next.into_iter() 377 | .fold(interesting_bytes, 378 | |x, state| x.intersection(&mandatory_bytes[state]).cloned().collect()); 379 | 380 | let mut ret: Vec<_> = map.into_iter() 381 | .filter(|&(pair, byte)| critical_bytes.contains(&byte) && visited.contains(&pair.0)) 382 | .map(|(pair, byte)| (pair.0, pair.1, byte)) 383 | .collect(); 384 | ret.sort(); 385 | ret 386 | } 387 | */ 388 | } 389 | 390 | impl Debug for Dfa { 391 | fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { 392 | try!(f.write_fmt(format_args!("Dfa ({} states):\n", self.states.len()))); 393 | 394 | try!(f.write_fmt(format_args!("Init: {:?}\n", self.init))); 395 | 396 | for (st_idx, st) in self.states.iter().enumerate().take(40) { 397 | 
try!(f.write_fmt(format_args!("\tState {} (accepting: {:?}):\n", st_idx, st.accept))); 398 | if let Some(ref ret) = st.ret { 399 | try!(f.write_fmt(format_args!("\t\t{:?}\n", ret))); 400 | } 401 | 402 | if !st.transitions.is_empty() { 403 | try!(f.write_str("\t\tTransitions:\n")); 404 | // Cap it at 5 transitions, since it gets unreadable otherwise. 405 | for &(range, target) in st.transitions.ranges_values().take(5) { 406 | try!(f.write_fmt(format_args!("\t\t\t{} -- {} => {}\n", 407 | range.start, range.end, target))); 408 | } 409 | if st.transitions.num_ranges() > 5 { 410 | try!(f.write_str("\t\t\t...\n")); 411 | } 412 | } 413 | } 414 | if self.states.len() > 40 { 415 | try!(f.write_fmt(format_args!("\t...({} more states)\n", self.states.len() - 40))); 416 | } 417 | Ok(()) 418 | } 419 | } 420 | 421 | #[cfg(test)] 422 | pub mod tests { 423 | use dfa::*; 424 | use itertools::Itertools; 425 | use look::Look; 426 | use nfa::{Accept, Nfa, StateIdx}; 427 | use range_map::{Range, RangeMap}; 428 | use std::usize; 429 | 430 | // Creates a non-backtracking dfa from a regex string. 
431 | pub fn make_dfa_bounded(re: &str, max_states: usize) -> ::Result> { 432 | let nfa = try!(Nfa::from_regex(re)); 433 | let nfa = nfa.remove_looks(); 434 | println!("after remove_looks: {:?}", nfa); 435 | let nfa = try!(nfa.byte_me(max_states)); 436 | println!("after byte: {:?}", nfa); 437 | 438 | let dfa = try!(nfa.determinize(max_states)); 439 | Ok(dfa.optimize()) 440 | } 441 | 442 | pub fn make_dfa(re: &str) -> ::Result> { 443 | make_dfa_bounded(re, usize::MAX) 444 | } 445 | 446 | pub fn make_anchored(re: &str) -> Dfa<(Look, u8)> { 447 | let nfa = Nfa::from_regex(re).unwrap() 448 | .remove_looks() 449 | .byte_me(usize::MAX).unwrap() 450 | .anchor(usize::MAX).unwrap(); 451 | 452 | nfa.determinize(usize::MAX).unwrap() 453 | .optimize() 454 | .cut_loop_to_init() 455 | .optimize() 456 | } 457 | 458 | pub fn trans_dfa_anchored(size: usize, trans: &[(StateIdx, StateIdx, Range)]) 459 | -> Dfa<(Look, u8)> { 460 | let mut ret = Dfa::new(); 461 | for _ in 0..size { 462 | ret.add_state(Accept::Never, None); 463 | } 464 | for (src, trans) in trans.iter().group_by(|x| x.0) { 465 | let rm: RangeMap = trans.into_iter() 466 | .map(|x| (x.2, x.1)) 467 | .collect(); 468 | ret.set_transitions(src, rm); 469 | } 470 | ret 471 | } 472 | 473 | #[test] 474 | fn test_anchored_dfa_simple() { 475 | let dfa = make_anchored("a"); 476 | let mut tgt = trans_dfa_anchored(2, &[(0, 1, Range::new(b'a', b'a'))]); 477 | tgt.init[Look::Boundary.as_usize()] = Some(0); 478 | tgt.states[1].accept = Accept::Always; 479 | tgt.states[1].ret = Some((Look::Full, 0)); 480 | 481 | assert_eq!(dfa, tgt); 482 | } 483 | 484 | #[test] 485 | fn test_forward_backward_simple() { 486 | // TODO 487 | } 488 | 489 | #[test] 490 | fn test_anchored_dfa_anchored_end() { 491 | let dfa = make_anchored("a$"); 492 | let mut tgt = trans_dfa_anchored(2, &[(0, 1, Range::new(b'a', b'a')), 493 | (1, 1, Range::new(b'a', b'a'))]); 494 | tgt.init[Look::Boundary.as_usize()] = Some(0); 495 | tgt.states[1].accept = Accept::AtEoi; 496 | 
tgt.states[1].ret = Some((Look::Boundary, 0)); 497 | 498 | assert_eq!(dfa, tgt); 499 | } 500 | 501 | #[test] 502 | fn test_anchored_dfa_literal_prefix() { 503 | let dfa = make_anchored("abc[A-z]"); 504 | let pref = dfa.prefix_strings().into_iter().map(|p| p.0).collect::>(); 505 | assert_eq!(pref, vec!["abc".as_bytes()]); 506 | } 507 | 508 | #[test] 509 | fn test_minimize() { 510 | let auto = make_dfa("a*?b*?").unwrap(); 511 | // 1, because our highest-priority match is an empty string. 512 | assert_eq!(auto.states.len(), 1); 513 | 514 | let auto = make_dfa(r"^a").unwrap(); 515 | assert_eq!(auto.states.len(), 2); 516 | 517 | let mut auto = make_dfa("[cgt]gggtaaa|tttaccc[acg]").unwrap(); 518 | // Since `minimize` is non-deterministic (involving random hashes), run this a bunch of 519 | // times. 520 | for _ in 0..100 { 521 | auto = auto.optimize(); 522 | assert_eq!(auto.states.len(), 16); 523 | } 524 | } 525 | 526 | #[test] 527 | fn test_class_normalized() { 528 | let mut re = make_dfa("[abcdw]").unwrap(); 529 | re.sort_states(); 530 | assert_eq!(re.states.len(), 2); 531 | assert_eq!(re.states[0].transitions.num_ranges(), 2) 532 | } 533 | 534 | #[test] 535 | fn test_max_states() { 536 | assert!(make_dfa_bounded("foo", 3).is_err()); 537 | assert!(make_dfa_bounded("foo", 4).is_ok()); 538 | } 539 | 540 | #[test] 541 | fn test_adjacent_predicates() { 542 | assert!(make_dfa_bounded(r"\btest\b\B", 100).unwrap().states.is_empty()); 543 | assert!(make_dfa_bounded(r"\btest\B\b", 100).unwrap().states.is_empty()); 544 | assert!(make_dfa_bounded(r"test1\b\Btest2", 100).unwrap().states.is_empty()); 545 | } 546 | 547 | #[test] 548 | fn test_syntax_error() { 549 | assert!(make_dfa_bounded("(abc", 10).is_err()); 550 | } 551 | 552 | #[test] 553 | fn match_priority() { 554 | macro_rules! 
eq { 555 | ($re1:expr, $re2:expr) => { 556 | { 557 | let dfa1 = make_dfa($re1).unwrap(); 558 | let dfa2 = make_dfa($re2).unwrap(); 559 | assert_eq!(dfa1, dfa2); 560 | } 561 | }; 562 | } 563 | eq!("(a|aa)", "a"); 564 | eq!("abcd*?", "abc"); 565 | //eq!("a*?", ""); // TODO: figure out how empty regexes should behave 566 | } 567 | 568 | // TODO: add a test checking that minimize() doesn't clobber return values. 569 | 570 | /* 571 | #[test] 572 | fn critical_transitions() { 573 | fn crit(max_steps: usize, re: &str, answer: &[(StateIdx, StateIdx, u8)]) { 574 | let dfa = make_dfa(re).unwrap(); 575 | println!("{:?}", dfa); 576 | assert_eq!(&dfa.mandatory_single_byte_transitions(max_steps)[..], answer); 577 | } 578 | 579 | fn crit_anchored(max_steps: usize, re: &str, answer: &[(StateIdx, StateIdx, u8)]) { 580 | let dfa = make_anchored(re); 581 | println!("{:?}", dfa); 582 | assert_eq!(&dfa.mandatory_single_byte_transitions(max_steps)[..], answer); 583 | } 584 | 585 | crit(10, "a", &[(0, 1, b'a')]); 586 | crit(10, "aaa", &[(0, 1, b'a'), (1, 2, b'a'), (2, 3, b'a')]); 587 | crit(2, "aaa", &[(0, 1, b'a'), (1, 2, b'a')]); 588 | crit(10, "a*|ab", &[]); 589 | crit(10, "a+|ab", &[(0, 1, b'a')]); 590 | crit(10, "brown|fox", &[(2, 3, b'o'), (6, 7, b'o')]); 591 | crit(10, "quick|brown", &[]); 592 | crit(10, "zzzzzzzzzz|abracadabraz", &[]); 593 | crit(10, "eeeeeeeeez|abracadabz", &[(9, 10, b'z')]); 594 | crit(10, ".*x", &[(0, 1, b'x')]); 595 | crit_anchored(10, "\\bx", &[(0, 260, b'x')]); 596 | } 597 | */ 598 | } 599 | -------------------------------------------------------------------------------- /src/dfa/prefix_searcher.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use dfa::{Dfa, RetTrait}; 10 | use dfa::trie::Trie; 11 | use nfa::{Accept, StateIdx}; 12 | use std::cmp::{Ordering, PartialOrd}; 13 | use std::collections::{HashSet, VecDeque}; 14 | use std::mem::swap; 15 | 16 | // TODO: These limits are pretty arbitrary (copied from the regex crate). 17 | const NUM_PREFIX_LIMIT: usize = 30; 18 | const PREFIX_LEN_LIMIT: usize = 15; 19 | 20 | /// A pair of a byte sequence and the index of the state that we are in after encountering that 21 | /// sequence. 22 | #[derive(Clone, Debug, PartialEq)] 23 | pub struct PrefixPart(pub Vec, pub StateIdx); 24 | 25 | pub struct PrefixSearcher { 26 | active: VecDeque, 27 | current: PrefixPart, 28 | suffixes: Trie, 29 | finished: Vec, 30 | 31 | // The set of prefixes is complete if: 32 | // - we're done with active prefixes before we go over any of our limits, and 33 | // - we didn't encounter any states that accept conditionally. 34 | complete: bool, 35 | 36 | max_prefixes: usize, 37 | max_len: usize, 38 | } 39 | 40 | impl PrefixSearcher { 41 | pub fn extract(dfa: &Dfa, state: StateIdx) -> Vec { 42 | let mut searcher = PrefixSearcher::new(); 43 | searcher.search(dfa, state); 44 | searcher.finished 45 | } 46 | 47 | fn new() -> PrefixSearcher { 48 | PrefixSearcher { 49 | active: VecDeque::new(), 50 | current: PrefixPart(Vec::new(), 0), 51 | suffixes: Trie::new(), 52 | finished: Vec::new(), 53 | complete: true, 54 | max_prefixes: NUM_PREFIX_LIMIT, 55 | max_len: PREFIX_LEN_LIMIT, 56 | } 57 | } 58 | 59 | fn bail_out(&mut self) { 60 | let mut current = PrefixPart(Vec::new(), 0); 61 | let mut active = VecDeque::new(); 62 | swap(&mut current, &mut self.current); 63 | swap(&mut active, &mut self.active); 64 | 65 | self.finished.extend(active.into_iter()); 66 | self.finished.push(current); 67 | self.complete = false; 68 | } 69 | 70 | fn add(&mut self, new_prefs: Vec) { 71 | debug_assert!(new_prefs.len() + self.active.len() + self.finished.len() <= self.max_prefixes); 72 | 73 | for p in 
new_prefs.into_iter() { 74 | if p.0.len() >= self.max_len { 75 | self.finished.push(p); 76 | } else { 77 | self.active.push_back(p); 78 | } 79 | } 80 | } 81 | 82 | fn too_many(&mut self, more: usize) -> bool { 83 | self.active.len() + self.finished.len() + more > self.max_prefixes 84 | } 85 | 86 | fn search(&mut self, dfa: &Dfa, state: StateIdx) { 87 | self.active.push_back(PrefixPart(Vec::new(), state)); 88 | self.suffixes.insert(vec![].into_iter(), state); 89 | while !self.active.is_empty() { 90 | self.current = self.active.pop_front().unwrap(); 91 | 92 | let trans = dfa.transitions(self.current.1); 93 | let mut next_prefs = Vec::new(); 94 | for (ch, next_state) in trans.keys_values() { 95 | let mut next_pref = self.current.0.clone(); 96 | next_pref.push(ch); 97 | next_prefs.push(PrefixPart(next_pref, *next_state)); 98 | } 99 | 100 | // Discard any new prefix that is the suffix of some existing prefix. 101 | next_prefs.retain(|pref| { 102 | let rev_bytes = pref.0.iter().cloned().rev(); 103 | !self.suffixes 104 | .prefixes(rev_bytes) 105 | .any(|s| s == pref.1) 106 | }); 107 | for pref in &next_prefs { 108 | self.suffixes.insert(pref.0.iter().cloned().rev(), pref.1); 109 | } 110 | 111 | // Stop searching if we have too many prefixes already, or if we've run into an accept 112 | // state. In principle, we could continue expanding the other prefixes even after we 113 | // run into an accept state, but there doesn't seem much point in having some short 114 | // prefixes and other long prefixes. 115 | if self.too_many(next_prefs.len()) 116 | || *dfa.accept(self.current.1) != Accept::Never { 117 | self.bail_out(); 118 | break; 119 | } 120 | 121 | 122 | self.add(next_prefs); 123 | } 124 | } 125 | } 126 | 127 | // A critical segment is a sequence of bytes that we must match if we want to get to a particular 128 | // state. That sequence need not necessarily correspond to a unique path in the DFA, however. 
129 | // Therefore, we store the sequence of bytes and also a set of possible paths that we might have 130 | // traversed while reading those bytes. 131 | #[derive(Clone, Debug, PartialEq)] 132 | pub struct CriticalSegment { 133 | bytes: Vec, 134 | paths: HashSet>, 135 | } 136 | 137 | // The stdlib seems to have searching functions for &str, but not for &[u8]. If they get added, we 138 | // can remove this. 139 | fn find(haystack: &[u8], needle: &[u8]) -> Option { 140 | haystack.windows(needle.len()) 141 | .enumerate() 142 | .find(|x| x.1 == needle) 143 | .map(|y| y.0) 144 | } 145 | 146 | // For two critical segments a and b, we say a <= b if it is more specific than b: either a's 147 | // byte sequence contains b's byte sequence or else the byte sequences are the same and a's set of 148 | // paths is a subset of b's set of paths. 149 | // TODO: not sure if this is necessary 150 | impl PartialOrd for CriticalSegment { 151 | fn partial_cmp(&self, other: &CriticalSegment) -> Option { 152 | fn less(a: &CriticalSegment, b: &CriticalSegment) -> bool { 153 | let a_len = a.bytes.len(); 154 | let b_len = b.bytes.len(); 155 | (a_len > b_len && find(&a.bytes, &b.bytes).is_some()) 156 | || (a.bytes == b.bytes && a.paths.is_subset(&b.paths)) 157 | } 158 | if less(self, other) { 159 | Some(Ordering::Less) 160 | } else if less(other, self) { 161 | Some(Ordering::Greater) 162 | } else { 163 | None 164 | } 165 | } 166 | } 167 | 168 | /* 169 | impl CriticalSegment { 170 | pub fn intersection(xs: &[CriticalSegment], ys: &[CriticalSegment]) -> Vec { 171 | let common = maximal_common_substrings( 172 | xs.iter().map(|x| &x.bytes[..]), 173 | ys.iter().map(|y| &y.bytes[..])); 174 | let mut ret = Vec::new(); 175 | 176 | for s in common { 177 | let mut paths = HashSet::new(); 178 | for x in xs.iter().chain(ys.iter()) { 179 | // We look for only the first occurence of the substring in x. 
180 | if let Some(pos) = find(&x.bytes, &s) { 181 | paths.extend(x.paths.iter().map(|p| p[pos..(pos + s.len())].to_vec())); 182 | } 183 | } 184 | ret.push(CriticalSegment { bytes: s, paths: paths }); 185 | } 186 | ret 187 | } 188 | } 189 | 190 | // Finds all strings that are 191 | // - a substring of some element of xs, 192 | // - a substring of some element of ys, and 193 | // - maximal among all strings satisfying the first two conditions. 194 | // 195 | // Note that this implementation is *extremely* naive -- an efficient implementation would probably 196 | // want to use a generalized suffix tree. But since the strings we deal with here are small, we can 197 | // sort of get away with it. 198 | fn maximal_common_substrings<'a, I, J>(xs: I, ys: J) -> HashSet> 199 | where I: Iterator, J: Iterator { 200 | let mut ys_substrings = HashSet::new(); 201 | let mut common_substrings = HashSet::new(); 202 | 203 | for y in ys { 204 | let len = y.len(); 205 | for i in 0..len { 206 | for j in i..len { 207 | ys_substrings.insert(&y[i..(j + 1)]); 208 | } 209 | } 210 | } 211 | 212 | for x in xs { 213 | let len = x.len(); 214 | for i in 0..len { 215 | for j in i..len { 216 | if ys_substrings.contains(&x[i..(j + 1)]) { 217 | common_substrings.insert(x[i..(j + 1)].to_vec()); 218 | } 219 | } 220 | } 221 | } 222 | 223 | // Now prune out anything that isn't maximal. 224 | let mut ret = common_substrings.clone(); 225 | for s in &common_substrings { 226 | let len = s.len(); 227 | for i in 0..len { 228 | for j in i..len { 229 | // Make sure we're only looking at proper substrings of s. 
230 | if i > 0 || j < len - 1 { 231 | ret.remove(&s[i..(j + 1)]); 232 | } 233 | } 234 | } 235 | } 236 | ret 237 | } 238 | */ 239 | 240 | #[cfg(test)] 241 | mod tests { 242 | use dfa; 243 | use look::Look; 244 | use quickcheck::{QuickCheck, quickcheck, StdGen, TestResult}; 245 | use rand; 246 | use super::*; 247 | //use super::{find, maximal_common_substrings}; 248 | 249 | fn qc(size: usize) -> QuickCheck> { 250 | QuickCheck::new().gen(StdGen::new(rand::thread_rng(), size)) 251 | } 252 | 253 | macro_rules! test_prefix { 254 | ($name:ident, $re_str:expr, $answer:expr, $max_num:expr, $max_len:expr) => { 255 | #[test] 256 | fn $name() { 257 | let dfa = dfa::tests::make_dfa($re_str).unwrap(); 258 | println!("{:?}", dfa); 259 | let mut pref = PrefixSearcher::new(); 260 | pref.max_prefixes = $max_num; 261 | pref.max_len = $max_len; 262 | pref.search(&dfa, dfa.init_state(Look::Full).unwrap()); 263 | let mut prefs = pref.finished.into_iter().map(|x| x.0).collect::>(); 264 | prefs.sort(); 265 | 266 | let answer: Vec> = $answer.iter() 267 | .map(|s| s.as_bytes().to_owned()) 268 | .collect(); 269 | assert_eq!(prefs, answer); 270 | } 271 | }; 272 | } 273 | 274 | test_prefix!(long, 275 | "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ", 276 | vec!["XABCDEFGHIJKLMNOPQRSTUVWXYZ", 277 | "YABCDEFGHIJKLMNOPQRSTUVWXYZ", 278 | "ZABCDEFGHIJKLMNOPQRSTUVWXYZ",], 279 | 3, 30); 280 | 281 | test_prefix!(case_insensitive, 282 | "(?i)abc[a-z]", 283 | vec!["ABC", "ABc", "AbC", "Abc", "aBC", "aBc", "abC", "abc"], 284 | 30, 5); 285 | 286 | test_prefix!(byte_set, 287 | "[ac]", 288 | vec!["a", "c"], 289 | 30, 5); 290 | 291 | test_prefix!(pruned_repetition, 292 | "a+bc", 293 | vec!["abc"], 294 | 10, 10); 295 | 296 | test_prefix!(pruned_empty_repetition, 297 | "[a-zA-Z]*bc", 298 | vec!["bc"], 299 | 10, 10); 300 | 301 | /* 302 | #[test] 303 | fn common_substrings() { 304 | fn sound(xs: Vec>, ys: Vec>) -> bool { 305 | let result = maximal_common_substrings(xs.iter().map(|x| &x[..]), ys.iter().map(|y| &y[..])); 306 | 
307 | // Everything in the result should be a substring of something in xs. 308 | result.iter().all(|x| xs.iter().any(|y| find(&y, &x).is_some())) 309 | // Everything in the result should be a substring of something in xs. 310 | && result.iter().all(|x| ys.iter().any(|y| find(&y, &x).is_some())) 311 | // Nothing in the result should be a strict substring of anything else. 312 | && result.iter().all( 313 | |x| !result.iter().any(|y| y.len() > x.len() && find(&y, &x).is_some())) 314 | } 315 | 316 | // If z is a substring of something in xs and something in ys then it must be a substring 317 | // of something in result. 318 | fn complete(xs: Vec>, ys: Vec>, z: Vec) -> TestResult { 319 | if z.is_empty() 320 | || !xs.iter().any(|x| find(&x, &z).is_some()) 321 | || !ys.iter().any(|y| find(&y, &z).is_some()) { 322 | return TestResult::discard(); 323 | } 324 | 325 | let result = maximal_common_substrings(xs.iter().map(|x| &x[..]), ys.iter().map(|y| &y[..])); 326 | TestResult::from_bool(result.iter().any(|x| find(&x, &z).is_some())) 327 | } 328 | 329 | qc(10).quickcheck(sound as fn(_, _) -> _); 330 | qc(10).quickcheck(complete as fn(_, _, _) -> _); 331 | } 332 | */ 333 | } 334 | 335 | -------------------------------------------------------------------------------- /src/dfa/trie.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | // This is a fairly simple-minded implementation of a trie. Since we don't really have any special 10 | // needs, this module could be replaced by a different crate (if we can find one that's 11 | // well-supported). 
12 | 13 | #[derive(Clone, Debug)] 14 | pub struct Trie { 15 | value: Option, 16 | sub_tries: Vec, 17 | } 18 | 19 | impl Trie { 20 | pub fn new() -> Trie { 21 | Trie { 22 | value: None, 23 | sub_tries: Vec::new(), 24 | } 25 | } 26 | 27 | pub fn insert>(&mut self, mut key: I, value: usize) { 28 | if let Some(head) = key.next() { 29 | if self.sub_tries.is_empty() { 30 | self.sub_tries = vec![Trie::new(); 256]; 31 | } 32 | self.sub_tries[head as usize].insert(key, value); 33 | } else { 34 | if self.value.is_some() { 35 | panic!("tried to insert the same key twice"); 36 | } 37 | self.value = Some(value); 38 | } 39 | } 40 | 41 | pub fn prefixes<'a, I: Iterator>(&'a self, input: I) -> TrieIter<'a, I> { 42 | TrieIter { 43 | trie: Some(self), 44 | input: input, 45 | } 46 | } 47 | } 48 | 49 | pub struct TrieIter<'a, I: Iterator> { 50 | trie: Option<&'a Trie>, 51 | input: I, 52 | } 53 | 54 | impl<'a, I: Iterator> Iterator for TrieIter<'a, I> { 55 | type Item = usize; 56 | 57 | fn next(&mut self) -> Option { 58 | let mut next_trie = self.trie; 59 | let mut ret = None; 60 | while let Some(t) = next_trie { 61 | next_trie = self.input.next().and_then(|c| t.sub_tries.get(c as usize)); 62 | if let Some(v) = t.value { 63 | ret = Some(v); 64 | break; 65 | } 66 | } 67 | self.trie = next_trie; 68 | ret 69 | } 70 | } 71 | 72 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use regex_syntax; 10 | use std::error; 11 | use std::fmt; 12 | 13 | #[derive(Debug)] 14 | pub enum Error { 15 | RegexSyntax(regex_syntax::Error), 16 | TooManyStates, 17 | InvalidEngine(&'static str), 18 | } 19 | 20 | use error::Error::*; 21 | impl fmt::Display for Error { 22 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 23 | match *self { 24 | RegexSyntax(ref e) => write!(f, "Regex syntax error: {}", e), 25 | TooManyStates => write!(f, "State overflow"), 26 | InvalidEngine(s) => write!(f, "Invalid engine: {}", s), 27 | } 28 | } 29 | } 30 | 31 | impl error::Error for Error { 32 | fn description(&self) -> &str { 33 | match *self { 34 | RegexSyntax(ref e) => e.description(), 35 | TooManyStates => "This NFA required too many states to represent as a DFA.", 36 | InvalidEngine(_) => "The regex was not compatible with the requested engine.", 37 | } 38 | } 39 | } 40 | 41 | impl From for Error { 42 | fn from(e: regex_syntax::Error) -> Error { 43 | RegexSyntax(e) 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/graph.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use dfa::{Dfa, RetTrait}; 10 | use nfa::{Nfa, NoLooks, StateIdx}; 11 | use num_traits::PrimInt; 12 | use std::collections::HashSet; 13 | use std::fmt::Debug; 14 | 15 | #[derive(Clone, Copy, Debug, PartialEq)] 16 | pub enum DfsInstruction { 17 | Continue, 18 | #[allow(dead_code)] 19 | TurnBack, 20 | Stop, 21 | } 22 | 23 | pub trait Graph { 24 | fn num_states(&self) -> usize; 25 | 26 | fn neighbors<'a>(&'a self, i: StateIdx) -> Box + 'a>; 27 | 28 | /// Does a depth-first search of this graph. 
    ///
    /// Every time the search visits a new state, `visit` will be called. Every time the search
    /// detects a loop, `cycle` will be called. The return values of these callbacks tell the
    /// search how to proceed:
    /// - on `Continue`, the search will proceed normally
    /// - on `TurnBack`, the search will stop searching the current branch
    /// - on `Stop`, the search will terminate early.
    //
    // Both callbacks receive the current path as a slice of states; `cycle`
    // receives the portion of the path that forms the loop.
    // NOTE(review): the type-parameter list of this method (and the generic
    // arguments of the `Vec`/`Box` annotations below) appear to have been
    // stripped by the extraction; tokens are kept as dumped.
    fn dfs(&self, init: Inits, mut visit: Visit, mut cycle: Cycle)
    where
        Visit: FnMut(&[StateIdx]) -> DfsInstruction,
        Cycle: FnMut(&[StateIdx]) -> DfsInstruction,
        Inits: Iterator,
    {
        // The DFS stack of states; kept in sync with `remaining_children_stack`,
        // which holds, for each stacked state, the children not yet explored.
        let mut stack: Vec = Vec::with_capacity(self.num_states());
        let mut remaining_children_stack: Vec>>
            = Vec::with_capacity(self.num_states());
        // `visiting`: currently on the stack; `done`: fully explored, never revisit.
        let mut visiting: Vec = vec![false; self.num_states()];
        let mut done: Vec = vec![false; self.num_states()];

        // For nodes that we are currently visiting, this is their position on the stack.
        let mut stack_pos: Vec = vec![0; self.num_states()];

        let start_states: Vec = init.collect();

        for &start_idx in &start_states {
            if !done[start_idx] {
                match visit(&[start_idx][..]) {
                    DfsInstruction::Continue => {},
                    DfsInstruction::TurnBack => {
                        done[start_idx] = true;
                        continue;
                    },
                    DfsInstruction::Stop => { return; },
                }

                visiting[start_idx] = true;
                stack.push(start_idx);
                remaining_children_stack.push(self.neighbors(start_idx));
                stack_pos[start_idx] = 0;

                while !stack.is_empty() {
                    // We keep stack and remaining_children_stack synchronized.
                    debug_assert!(!remaining_children_stack.is_empty());

                    let cur = *stack.last().unwrap();
                    let next_child = remaining_children_stack.last_mut().unwrap().next();

                    if let Some(child) = next_child {
                        if visiting[child] {
                            // We found a cycle: report it (and maybe terminate early).
                            // Since we turn back on finding a cycle anyway, we treat Continue
                            // and TurnBack the same (i.e. we don't need to handle either one
                            // explicitly).
                            if cycle(&stack[stack_pos[child]..]) == DfsInstruction::Stop {
                                return;
                            }
                        } else if !done[child] {
                            // This is a new state: report it and push it onto the stack.
                            stack.push(child);
                            match visit(&stack[stack_pos[child]..]) {
                                DfsInstruction::Stop => { return; },
                                DfsInstruction::TurnBack => {
                                    stack.pop();
                                    done[child] = true;
                                },
                                DfsInstruction::Continue => {
                                    remaining_children_stack.push(self.neighbors(child));
                                    visiting[child] = true;
                                    stack_pos[child] = stack.len() - 1;
                                },
                            }
                        }
                        continue;
                    }

                    // If we got this far, the current node is out of children. Pop it from the
                    // stack.
                    visiting[cur] = false;
                    done[cur] = true;
                    stack.pop();
                    remaining_children_stack.pop();
                }
            }
        }
    }

    /// The same as `dfs`, but runs on a graph with cuts in it.
    ///
    /// Instead of running on the full graph, runs on the graph where pairs in `cuts` are
    /// disconnected.
120 | fn dfs_with_cut( 121 | &self, 122 | init: Inits, 123 | cuts: &HashSet<(StateIdx, StateIdx)>, 124 | mut visit: Visit, 125 | mut cycle: Cycle) 126 | where 127 | Visit: FnMut(&[StateIdx]) -> DfsInstruction, 128 | Cycle: FnMut(&[StateIdx]) -> DfsInstruction, 129 | Inits: Iterator, 130 | { 131 | let should_cut = |s: &[StateIdx]| { 132 | let len = s.len(); 133 | len >= 2 && cuts.contains(&(s[len-2], s[len-1])) 134 | }; 135 | let my_visit = |s: &[StateIdx]| 136 | if should_cut(s) { DfsInstruction::TurnBack } else { visit(s) }; 137 | let my_cycle = |s: &[StateIdx]| 138 | if should_cut(s) { DfsInstruction::TurnBack } else { cycle(s) }; 139 | self.dfs(init, my_visit, my_cycle); 140 | } 141 | 142 | /// Returns a list of states, visited in depth-first order. 143 | fn dfs_order>(&self, init: I) -> Vec { 144 | use self::DfsInstruction::*; 145 | 146 | let mut ret: Vec = Vec::new(); 147 | // The unwrap is ok because dfa guarantees never to pass an empty slice. 148 | self.dfs(init, |st| { ret.push(*st.last().unwrap()); Continue }, |_| Continue); 149 | ret 150 | } 151 | 152 | /// Checks whether this graph has any cycles. 
153 | #[allow(unused)] 154 | fn has_cycles(&self) -> bool { 155 | use self::DfsInstruction::*; 156 | 157 | let mut found = false; 158 | self.dfs(0..self.num_states(), |_| Continue, |_| { found = true; Stop }); 159 | found 160 | } 161 | } 162 | 163 | impl Graph for Dfa { 164 | fn num_states(&self) -> usize { 165 | Dfa::num_states(self) 166 | } 167 | 168 | fn neighbors<'a>(&'a self, i: StateIdx) -> Box + 'a> { 169 | Box::new(self.transitions(i).ranges_values().map(|x| x.1)) 170 | } 171 | } 172 | 173 | impl Graph for Nfa { 174 | fn num_states(&self) -> usize { 175 | Nfa::num_states(self) 176 | } 177 | 178 | fn neighbors<'a>(&'a self, i: usize) -> Box + 'a> { 179 | Box::new(self.consuming(i).ranges_values().map(|x| x.1)) 180 | } 181 | } 182 | 183 | #[cfg(test)] 184 | mod tests { 185 | use dfa::tests::make_dfa; 186 | use graph::Graph; 187 | 188 | #[test] 189 | fn cycles() { 190 | macro_rules! cyc { 191 | ($re:expr, $res:expr) => { 192 | { 193 | let dfa = make_dfa($re).unwrap(); 194 | println!("{:?}", dfa); 195 | assert_eq!(dfa.has_cycles(), $res); 196 | } 197 | }; 198 | } 199 | 200 | cyc!("abcde", false); 201 | cyc!("ab*d", true); 202 | cyc!("ab*", true); 203 | cyc!("ab*?", false); 204 | cyc!("ab+", true); 205 | cyc!("ab+?", false); 206 | cyc!("(ab*?|cde)", false); 207 | cyc!("(ab*?|cde)f", true); 208 | cyc!("(abc)*?", false); 209 | cyc!("(abc)*?def", true); 210 | } 211 | } 212 | 213 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | /*! 10 | This crate provides tools for converting regular expressions into deterministic finite automata 11 | (DFAs). 
The most interesting type is `Regex`, which is a virtual machine for executing a DFA.

# Example: creating and running a `Regex`

```rust
use regex_dfa::Regex;
let re = Regex::new(r"\d{4}-\d{2}-\d{2}").unwrap();
assert_eq!(re.find("My birthday is 1986-08-22!"), Some((15, 25)));
```

The most useful function in this crate is `Regex::find`, which looks for the first substring of the
given string that matches the language of the DFA.

# Comparison to the `regex` crate

Compared to rust's standard `regex` crate, the main feature of `regex_dfa` is that `regex_dfa`
*eagerly* compiles a regular expression into a DFA, whereas `regex` does so lazily. There are
advantages and disadvantages to the eager approach. To begin with, doing all the compilation
up-front means that there is less to do at match time. If we get around to writing a compiler
plugin for compiling the regular expression at compile time, this would be an even bigger win.
Another advantage is that since we don't care so much about compilation speed, we have more
opportunities to look for optimizations.

The main disadvantage to eager compilation is memory usage. Even fairly simple regular expressions
may take several tens of kilobytes to represent as a DFA. More complicated ones (especially regular
expressions that use unicode word boundaries or character classes) may require much more. This
disadvantage is specific to eager compilation, since lazy DFA compilation only needs to create DFA
states for those characters that are actually seen (i.e., probably a tiny fraction of the entire
unicode character class). For this reason, `regex_dfa` allows you to restrict the amount of memory
it uses: simply use the method `Regex::new_bounded`, which will fail and report an error if it
would otherwise need to use too much memory.

# Roadmap

There are two substantial features that need to be added before this crate can be considered
feature-complete.

## SIMD optimizations

There are some nice tricks available for using SIMD instructions to quickly scan over uninteresting
parts of the input. The `regex` crate is capable (with a nightly compiler) of doing some of these
already, and we should imitate it.

## Compiler plugin

Since the main advantage of this crate is that it can do work ahead of time, it would make total
sense to do it all at the program's compile time. This feature will probably wait until rust's
compiler plugin story stabilizes a bit.
*/

#![cfg_attr(test, feature(test))]
#[cfg(test)]
extern crate quickcheck;

#[cfg(test)]
#[macro_use]
extern crate matches;

#[cfg(test)]
extern crate rand;

#[cfg(test)]
extern crate test;

extern crate itertools;
extern crate memchr;
extern crate num_traits;
extern crate range_map;
extern crate refinery;
extern crate regex_syntax;
extern crate utf8_ranges;

#[macro_use]
extern crate lazy_static;

mod dfa;
mod error;
mod look;
mod graph;
mod nfa;
mod regex;
mod runner;
mod unicode;

pub use error::Error;
pub use regex::Regex;
// NOTE(review): the generic parameters of this alias were stripped by the
// extraction (presumably `Result<T> = ::std::result::Result<T, Error>`).
pub type Result = ::std::result::Result;

// ---------------------------------------------------------------- /src/look.rs:
// Copyright 2015-2016 Joe Neeman.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
8 | 9 | #![allow(dead_code)] 10 | 11 | use range_map::{Range, RangeSet}; 12 | use std::cmp::Ordering; 13 | use unicode::PERLW; 14 | 15 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, Ord)] 16 | pub enum Look { 17 | Full, 18 | WordChar, 19 | NotWordChar, 20 | NewLine, 21 | Boundary, 22 | Empty, 23 | } 24 | 25 | lazy_static! { 26 | static ref FULL: RangeSet = RangeSet::full(); 27 | static ref WORD_CHAR: RangeSet = 28 | PERLW.iter().map(|&(x, y)| Range::new(x as u32, y as u32)).collect(); 29 | static ref NOT_WORD_CHAR: RangeSet = WORD_CHAR.negated(); 30 | static ref NEW_LINE: RangeSet = RangeSet::single('\n' as u32); 31 | static ref EMPTY: RangeSet = RangeSet::new(); 32 | } 33 | 34 | static ALL: [Look; 6] = [Look::Full, Look::WordChar, Look::NotWordChar, 35 | Look::NewLine, Look::Boundary, Look::Empty]; 36 | 37 | impl PartialOrd for Look { 38 | fn partial_cmp(&self, other: &Look) -> Option { 39 | if self == other { 40 | Some(Ordering::Equal) 41 | } else if self.intersection(other) == *self { 42 | Some(Ordering::Less) 43 | } else if self.intersection(other) == *other { 44 | Some(Ordering::Greater) 45 | } else { 46 | None 47 | } 48 | } 49 | } 50 | 51 | impl Look { 52 | pub fn intersection(&self, other: &Look) -> Look { 53 | use self::Look::*; 54 | match *self { 55 | Full => *other, 56 | WordChar => match *other { 57 | Full => WordChar, 58 | WordChar => WordChar, 59 | _ => Empty, 60 | }, 61 | NotWordChar => match *other { 62 | Full => NotWordChar, 63 | NotWordChar => NotWordChar, 64 | NewLine => NewLine, 65 | Boundary => Boundary, 66 | _ => Empty, 67 | }, 68 | NewLine => match *other { 69 | Full => NewLine, 70 | NotWordChar => NewLine, 71 | NewLine => NewLine, 72 | Boundary => Boundary, 73 | _ => Empty, 74 | }, 75 | Boundary => match *other { 76 | WordChar => Empty, 77 | Empty => Empty, 78 | _ => Boundary, 79 | }, 80 | Empty => Empty, 81 | } 82 | } 83 | 84 | pub fn supersets(&self) -> Vec { 85 | ALL.iter().cloned().filter(|x| *self <= *x).collect() 86 | } 87 | 88 | 
    // The set of characters this predicate accepts. Note that `Boundary`
    // maps to the empty set: it is only satisfiable at the edge of the input.
    pub fn as_set(&self) -> &RangeSet {
        use self::Look::*;

        match *self {
            Full => &FULL,
            WordChar => &WORD_CHAR,
            NotWordChar => &NOT_WORD_CHAR,
            NewLine => &NEW_LINE,
            Boundary => &EMPTY,
            Empty => &EMPTY,
        }
    }

    // Whether this predicate is satisfied at the beginning/end of the input.
    pub fn allows_eoi(&self) -> bool {
        use self::Look::*;

        match *self {
            Full => true,
            WordChar => false,
            NotWordChar => true,
            NewLine => true,
            Boundary => true,
            Empty => false,
        }
    }

    pub fn is_full(&self) -> bool {
        match *self {
            Look::Full => true,
            _ => false,
        }
    }

    // A dense index for this variant, in the same order as `ALL`.
    pub fn as_usize(&self) -> usize {
        use self::Look::*;

        match *self {
            Full => 0,
            WordChar => 1,
            NotWordChar => 2,
            NewLine => 3,
            Boundary => 4,
            Empty => 5,
        }
    }

    // The number of variants (the length of `ALL`).
    pub fn num() -> usize { 6 }

    pub fn all() -> &'static [Look] {
        &ALL
    }
}

#[cfg(test)]
mod tests {
    use quickcheck::{Arbitrary, Gen, quickcheck};
    use super::*;

    impl Arbitrary for Look {
        // NOTE(review): the type-parameter list was stripped by the extraction
        // (presumably `fn arbitrary<G: Gen>`).
        fn arbitrary(g: &mut G) -> Look {
            use look::Look::*;

            *g.choose(&[Full, WordChar, NotWordChar, NewLine, Boundary, Empty]).unwrap()
        }
    }

    #[test]
    fn intersection_commutes() {
        fn prop(a: Look, b: Look) -> bool {
            a.intersection(&b) == b.intersection(&a)
        }
        quickcheck(prop as fn(_, _) -> _);
    }

    #[test]
    fn intersection_ordering() {
        fn prop(a: Look, b: Look) -> bool {
            a.intersection(&b) <= a
        }
        quickcheck(prop as fn(_, _) -> _);
    }

    #[test]
    fn intersection_eoi() {
        fn prop(a: Look, b: Look) -> bool {
            a.intersection(&b).allows_eoi() == (a.allows_eoi() && b.allows_eoi())
        }
        quickcheck(prop as fn(_, _) -> _);
    }

    #[test]
    fn intersection_set() {
        fn prop(a: Look, b: Look) -> bool {
            a.intersection(&b).as_set() == &a.as_set().intersection(b.as_set())
        }
        quickcheck(prop as fn(_, _) -> _);
    }
}

// ---------------------------------------------------------------- /src/nfa/has_looks.rs:
// Copyright 2015-2016 Joe Neeman.
//
// Licensed under the Apache License, Version 2.0 or the MIT license
// , at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! This module contains two main pieces of functionality: building an NFA from a regular
//! expression and processing an NFA to remove all non-consuming transitions. The first of these is
//! carried out by the `from_regex` function and it is fairly straightforward. The second is
//! possibly more unusual, and so we describe it here in some detail.
//!
//! For your standard classroom NFA, it's trivial to remove non-consuming transitions: for every
//! consuming transition with source state `s` and target state `t`, take the eps-closure of `t`
//! and then add a transition from `s` to everything in that eps-closure. Finally, all
//! non-consuming transitions are deleted. Here it is in ASCII art, where a non-consuming
//! transition is denoted by an epsilon (ε):
//!
//! ```text
//!            ε          b
//!        a  /-------> 3 -------> 4
//! 1 -----> 2  ε
//!           \-------> 5
//! ```
//!
//! becomes
//!
//! ```text
//!        a              b
//!   /---------------> 3 -------> 4
//!  /     a
//! 1 -----> 2
//!  \     a
//!   \---------------> 5
//! ```
//!
//! The situation becomes (just a little) trickier when the non-consuming transitions are allowed to
//! have predicates that look forward or back by one token. We need to support this sort of
//!
//! transition if we want to support word boundaries (and the fact that doing so is a bit tricky is
//! probably the main reason that the standard `regex` crate doesn't support DFA simulation if the
//! regex contains word boundaries). So now we allow our non-consuming transitions to be of the
//! form `(P, Q)`, where `P` and `Q` are sets of tokens. You can pass through such a non-consuming
//! transition if and only if the previous token belonged to `P` and the next token belongs to `Q`.
//! (The code for this is in `nfa::LookPair`, which is a teeny bit more complicated because it also
//! allows checking for the edge (beginning for `P`, end for `Q`) of the input.)
//!
//! If `Q` is the set of all tokens, then supporting these kinds of non-consuming transitions is
//! almost the same as the previous case. The first difference is that when we take the
//! eps-closure, we also need to keep track of the predicates on the non-consuming transitions that
//! we passed through. For example, if we have a configuration like
//!
//! ```text
//!              (P2, Q2)
//!   (P1, Q1)  /------------> 3
//! 1 --------> 2  (P3, Q3)
//!             \------------> 4
//! ```
//!
//! then states 2, 3, and 4 all belong to the eps-closure of 1. In order to get from 1 to 3, we
//! need to pass through the predicate `(P1 ∩ P2, Q1 ∩ Q2)`; in order to get from 1 to 4, we need
//! to pass through the predicate `(P1 ∩ P3, Q1 ∩ Q3)`.
//!
//! Assuming for now that all of the `Q` predicates are the set of all possible tokens, we remove
//! the non-consuming transitions as follows: take every consuming transition with source state `s`
//! and target state `t`. Then for every `u` in the eps-closure of `t` with predicate `(P, Q)`
//! leading from `t` to `u`, we add a consuming transition from `s` to `u` *if and only if the
//! consumed token belongs to `P`*. Then we delete all the non-consuming transitions. Going back to
//! the first example, suppose that `P1` contains `a` but `P2` does not. Then
//!
//! ```text
//!         (P1, Q1)         b
//!        a  /--------------> 3 -------> 4
//! 1 -----> 2  (P2, Q2)
//!           \--------------> 5
//! ```
//!
//! becomes
//!
//! ```text
//!        a              b
//!   /---------------> 3 -------> 4
//!  /     a
//! 1 -----> 2
//!
//!                     5
//! ```
//!
//! There is actually one more complication that we won't discuss in detail here: the procedure
//! above doesn't account properly for the eps-closure of the initial state, since it only does
//! things to the eps-closure of a state that follows a transition. In order to handle the
//! eps-closure of the initial state, we actually introduce a collection of initial states, some of
//! which are only active if the previous character of the input satisfied some predicate.
//!
//! Finally, in the case that the `Q` predicates are not the set of all possible tokens, we need to
//! add extra states. For every consuming transition from `s` to `t` and every `u` in the
//! eps-closure of `t` with predicate `(P, Q)` leading from `t` to `u`, we add a new state `u'`.
//! The consuming transitions leading out from `u'` are those consuming transitions leading out
//! from `u` whose tokens belong to `Q`. Then we add a consuming transition from `s` to `u'` if the
//! token that was consumed in going from `s` to `t` belongs to `P`. In ASCII art, if `P` contains
//! `a` but not `b`, and if `Q` contains `c` but not `d` then
//!
//! ```text
//!    a        (P, Q)          c
//! 1 -----> 2 -------------> 3 --------> 4
//!    b  ^                    \    d
//! 5 ----/                     \-------> 5
//! ```
//!
//! becomes
//!
//! ```text
//!    a                           c
//!   /--------------------> 3' -----\
//!  /     a                  c       \
//! 1 -----> 2               3 --------> 4
//!    b  ^                   \    d
//! 5 ----/                    \-------> 5
//! ```
//!
//! There are a couple of caveats to this transformation also. The first is that we process *all*
//! of the look-behind (i.e. `P`) predicates before we process any of the look-ahead (i.e. `Q`)
//! predicates. The reason for this can be seen in the example above: if state 4 had any
//! non-consuming transitions leading out of it, then in processing that non-consuming transition
//! we might need to add more consuming transitions leading out of 3. That would in turn affect the
//! consuming transitions that we add to 3'. Therefore, we need to add the extra transitions coming
//! out of 3 (which are due to a look-behind predicate) before we add the transitions coming
//! out of 3' (which are due to a look-ahead predicate).
//!
//! The second caveat to the transformation above comes in the handling of accepting states. When a
//! non-consuming transition leads to an accepting state, it means that the source of that
//! transition should become a conditionally accepting state.

use look::Look;
use nfa::{Accept, HasLooks, LookPair, Nfa, NoLooks, StateIdx};
use std::cmp::max;
use std::collections::HashSet;
use std::ops::Deref;
use range_map::{Range, RangeSet};
use regex_syntax::{CharClass, ClassRange, Expr, Repeater};

// Converts a `CharClass` into a `RangeSet`
// NOTE(review): generic parameters throughout this file were stripped by the
// extraction (`RangeSet`, `Nfa`, `::Result>`, …); tokens are kept as dumped.
fn class_to_set(cc: &CharClass) -> RangeSet {
    cc.iter().map(|r| Range::new(r.start as u32, r.end as u32)).collect()
}

impl Nfa {
    /// Asserts that the invariants that are supposed to hold do.
    fn check_invariants(&self) {
        // The init state is implicitly the first one, so there are no explicit init states.
        debug_assert!(self.init.is_empty());

        // The final state is accepting, and no others are.
        debug_assert!(self.states.last().unwrap().accept == Accept::Always);
        debug_assert!(self.states.iter().rev().skip(1).all(|s| s.accept == Accept::Never));

        // No state has both a look transition and a consuming transition.
        debug_assert!(self.states.iter().all(|s| s.looking.is_empty() || s.consuming.is_empty()));

        // All targets of a consuming transition are just the next state.
        debug_assert!(self.states.iter()
            .enumerate()
            .all(|(idx, s)| s.consuming.ranges_values().all(|&(_, val)| val == idx + 1)));
    }

    /// Creates a new Nfa from a regex string.
    pub fn from_regex(re: &str) -> ::Result> {
        let expr = try!(Expr::parse(re));
        let mut ret = Nfa::new();

        // State 0 is the implicit initial state; the expression's states follow
        // it, and the last state added becomes the unique accepting state.
        ret.add_state(Accept::Never);
        ret.add_expr(&expr);
        ret.add_eps(0, 1);

        let len = ret.num_states();
        ret.states[len - 1].accept = Accept::Always;

        ret.check_invariants();
        Ok(ret)
    }

    /// Adds a non-input consuming transition between states `source` and `target`.
    ///
    /// The transition will be traversed if the last consumed byte matches `behind` and the next
    /// available byte matches `ahead`.
    pub fn add_look(&mut self, source: StateIdx, target: StateIdx, behind: Look, ahead: Look) {
        let look = LookPair {
            behind: behind,
            ahead: ahead,
            target_state: target,
        };
        self.states[source].looking.push(look);
    }

    /// Removes all look transitions, converting this Nfa into an `Nfa`.
    //
    // This implements the transformation described in the module docs: consuming
    // transitions are redirected through the look-closure of their targets, new
    // look-ahead states are added where needed, and look transitions are dropped.
    pub fn remove_looks(mut self) -> Nfa {
        if self.states.is_empty() {
            return Nfa::with_capacity(0);
        }

        // For every state with out transitions, add transitions from it to everything in the closure
        // of the target. Note that (according to `check_invariants`) the target state is always
        // the next state.
        let old_len = self.num_states();
        let mut new_states: Vec<(StateIdx, Look, StateIdx)> = Vec::new();
        for src_idx in 0..self.states.len() {
            if !self.states[src_idx].consuming.is_empty() {
                let consuming = self.states[src_idx].consuming.clone();
                for look in self.closure(src_idx + 1) {
                    // Add transitions into the look target, restricted to the
                    // bytes allowed by the look-behind predicate.
                    let new_idx = self.add_look_state(look);
                    let filtered_consuming = consuming.intersection(look.behind.as_set());
                    for &(range, _) in filtered_consuming.ranges_values() {
                        self.add_transition(src_idx, new_idx, range);
                    }
                    // If the look target is actually a new state, hold off on adding transitions
                    // out of it, because we need to make sure that all the transitions from
                    // look.target_state have been added first.
                    if new_idx >= old_len {
                        new_states.push((new_idx, look.ahead, look.target_state));
                    }
                }
            }
        }

        // Add the new initial states: everything that was immediately reachable from state 0 is now
        // an initial state.
        for look in self.closure(0) {
            let new_idx = self.add_look_state(look);
            self.init.push((look.behind, new_idx));
            if new_idx >= old_len {
                new_states.push((new_idx, look.ahead, look.target_state));
            }
        }

        // Now add transitions out of the new states.
        for (src_idx, look, tgt_idx) in new_states {
            let out_consuming = self.states[tgt_idx].consuming.intersection(look.as_set());
            for &(range, tgt) in out_consuming.ranges_values() {
                self.states[src_idx].consuming.insert(range, tgt);
            }
        }

        // Get rid of all looking transitions.
        for st in &mut self.states {
            st.looking.clear();
        }

        let mut ret: Nfa = self.transmuted();
        ret.trim_unreachable();
        ret
    }

    // Adds a new state for a LookPair, if necessary.
    // It is necessary to add a new state if and
    // only if the LookPair needs to look ahead.
    //
    // Returns the index of the new state.
    fn add_look_state(&mut self, look: LookPair) -> StateIdx {
        if look.ahead.is_full() {
            // No look-ahead restriction: the existing target state can be reused.
            look.target_state
        } else {
            let tgt_idx = look.target_state;
            let new_idx = self.add_state(Accept::Never);

            // If the target state accepts at end of input and the look allows eoi, then the new
            // state must also accept at eoi.
            if self.states[tgt_idx].accept != Accept::Never && look.ahead.allows_eoi() {
                self.states[new_idx].accept = Accept::AtEoi;
                self.states[new_idx].accept_look = Look::Boundary;
            }

            // If the target state of the look is accepting, add a new look-ahead accepting state.
            if self.states[tgt_idx].accept == Accept::Always
                    && !look.ahead.as_set().is_empty() {
                let acc_idx = self.add_look_ahead_state(look.ahead, 1, new_idx);
                for range in look.ahead.as_set().ranges() {
                    self.add_transition(new_idx, acc_idx, range);
                }
            }
            new_idx
        }
    }

    /// Finds (transitively) the set of all non-consuming transitions that can be made starting
    /// from `state`.
    ///
    /// The search is done depth-first so that priority is preserved.
    fn closure(&self, state: StateIdx) -> Vec {
        let mut stack: Vec = Vec::new();
        let mut seen: HashSet = HashSet::new();
        let mut ret: Vec = Vec::new();
        let mut next_looks: Vec = Vec::new();

        // Push in reverse so that higher-priority looks are popped first.
        stack.extend(self.states[state].looking.iter().cloned().rev());
        while let Some(last_look) = stack.pop() {
            ret.push(last_look);
            next_looks.clear();

            // Chained looks compose by intersecting their predicates; empty or
            // already-seen intersections are pruned.
            for next_look in &self.states[last_look.target_state].looking {
                let int = next_look.intersection(&last_look);
                if !int.is_empty() && !seen.contains(&int) {
                    seen.insert(int);
                    next_looks.push(int);
                }
            }

            stack.extend(next_looks.drain(..).rev());
        }

        ret
    }

    /// Adds an eps transition between the given states.
    fn add_eps(&mut self, from: StateIdx, to: StateIdx) {
        self.add_look(from, to, Look::Full, Look::Full);
    }

    /// Appends a single state that transitions to the next state on observing one of the chars in
    /// the given range.
    fn add_state_with_chars(&mut self, chars: &RangeSet) {
        let idx = self.num_states();
        self.add_state(Accept::Never);
        for range in chars.ranges() {
            self.add_transition(idx, idx + 1, range);
        }
    }

    /// Appends two states, with a given transition between them.
    fn add_single_transition(&mut self, chars: &RangeSet) {
        self.add_state_with_chars(chars);
        self.add_state(Accept::Never);
    }

    /// Appends a sequence of states that recognizes a literal.
334 | fn add_literal(&mut self, chars: I, case_insensitive: bool) 335 | where C: Deref, 336 | I: Iterator 337 | { 338 | for ch in chars { 339 | let ranges = if case_insensitive { 340 | let cc = CharClass::new(vec![ClassRange { start: *ch, end: *ch }]); 341 | class_to_set(&cc.case_fold()) 342 | } else { 343 | RangeSet::single(*ch as u32) 344 | }; 345 | self.add_state_with_chars(&ranges); 346 | } 347 | self.add_state(Accept::Never); 348 | } 349 | 350 | /// Appends a sequence of states that recognizes the concatenation of `exprs`. 351 | fn add_concat_exprs(&mut self, exprs: &[Expr]) { 352 | if let Some((expr, rest)) = exprs.split_first() { 353 | self.add_expr(expr); 354 | 355 | for expr in rest { 356 | let cur_len = self.num_states(); 357 | self.add_eps(cur_len - 1, cur_len); 358 | self.add_expr(expr); 359 | } 360 | } else { 361 | self.add_state(Accept::Never); 362 | } 363 | } 364 | 365 | /// Appends a sequence of states that recognizes one of the expressions in `alts`. 366 | /// 367 | /// The earlier expressions in `alts` get higher priority when matching. 368 | fn add_alternate_exprs(&mut self, alts: &[Expr]) { 369 | // Add the new initial state that feeds into the alternate. 370 | let init_idx = self.num_states(); 371 | self.add_state(Accept::Never); 372 | 373 | let mut expr_end_indices = Vec::::with_capacity(alts.len()); 374 | for expr in alts { 375 | let expr_init_idx = self.states.len(); 376 | self.add_eps(init_idx, expr_init_idx); 377 | self.add_expr(expr); 378 | expr_end_indices.push(self.states.len() - 1); 379 | } 380 | 381 | // Make the final state of each alternative point to our new final state. 382 | self.add_state(Accept::Never); 383 | let final_idx = self.states.len() - 1; 384 | for idx in expr_end_indices { 385 | self.add_eps(idx, final_idx); 386 | } 387 | } 388 | 389 | /// Appends new states, representing multiple copies of `expr`. 
390 | fn add_repeat(&mut self, expr: &Expr, rep: Repeater, greedy: bool) { 391 | match rep { 392 | Repeater::ZeroOrOne => { 393 | self.add_repeat_up_to(expr, 1, greedy); 394 | }, 395 | Repeater::ZeroOrMore => { 396 | self.add_repeat_zero_or_more(expr, greedy); 397 | }, 398 | Repeater::OneOrMore => { 399 | self.add_repeat_min_max(expr, 1, None, greedy); 400 | }, 401 | Repeater::Range{ min, max } => { 402 | self.add_repeat_min_max(expr, min, max, greedy); 403 | } 404 | } 405 | } 406 | 407 | /// Repeats `expr` a fixed number of times (which must be positive). 408 | fn add_repeat_exact(&mut self, expr: &Expr, n: u32) { 409 | assert!(n > 0); 410 | self.add_expr(expr); 411 | for _ in 1..n { 412 | let idx = self.states.len(); 413 | self.add_expr(expr); 414 | self.add_eps(idx - 1, idx); 415 | } 416 | } 417 | 418 | /// Repeats `expr` between zero and `n` times (`n` must be positive). 419 | fn add_repeat_up_to(&mut self, expr: &Expr, n: u32, greedy: bool) { 420 | assert!(n > 0); 421 | 422 | self.add_state(Accept::Never); 423 | let mut init_indices = Vec::::with_capacity(n as usize); 424 | for _ in 0..n { 425 | init_indices.push(self.states.len() as StateIdx); 426 | self.add_expr(expr); 427 | } 428 | let final_idx = self.states.len() - 1; 429 | for idx in init_indices { 430 | self.add_alt_eps(idx - 1, idx, final_idx, greedy); 431 | } 432 | } 433 | 434 | /// Adds an eps transition from `from` to both `to1` and `to2`. If `greedy` is true, `to1` is 435 | /// preferred, and otherwise `to2` is preferred. 436 | fn add_alt_eps(&mut self, from: usize, to1: usize, to2: usize, greedy: bool) { 437 | if greedy { 438 | self.add_eps(from, to1); 439 | self.add_eps(from, to2); 440 | } else { 441 | self.add_eps(from, to2); 442 | self.add_eps(from, to1); 443 | } 444 | } 445 | 446 | /// Appends new states, representing multiple copies of `expr`. 447 | /// 448 | /// The new states represent a language that accepts at least `min` and at most `maybe_max` 449 | /// copies of `expr`. 
(If `maybe_max` is `None`, there is no upper bound.) 450 | fn add_repeat_min_max(&mut self, expr: &Expr, min: u32, maybe_max: Option, greedy: bool) { 451 | if min == 0 && maybe_max == Some(0) { 452 | // We add a state anyway, in order to maintain the convention that every expr should 453 | // add at least one state (otherwise keeping track of indices becomes much more 454 | // tedious). 455 | self.add_state(Accept::Never); 456 | return; 457 | } 458 | 459 | if min > 0 { 460 | self.add_repeat_exact(expr, min); 461 | 462 | // If anything else comes after this, we need to connect the two parts. 463 | if maybe_max != Some(min) { 464 | let len = self.num_states(); 465 | self.add_eps(len - 1, len); 466 | } 467 | } 468 | 469 | if let Some(max) = maybe_max { 470 | if max > min { 471 | self.add_repeat_up_to(expr, max - min, greedy); 472 | } 473 | } else { 474 | self.add_repeat_zero_or_more(expr, greedy); 475 | } 476 | } 477 | 478 | /// Repeats the given expression zero or more times. 479 | fn add_repeat_zero_or_more(&mut self, expr: &Expr, greedy: bool) { 480 | let start_idx = self.num_states(); 481 | self.add_state(Accept::Never); 482 | self.add_expr(expr); 483 | self.add_state(Accept::Never); 484 | let end_idx = self.num_states() - 1; 485 | 486 | self.add_alt_eps(start_idx, start_idx + 1, end_idx, greedy); 487 | self.add_alt_eps(end_idx - 1, start_idx + 1, end_idx, greedy); 488 | } 489 | 490 | /// Adds two new states, with a look connecting them. 491 | fn add_look_pair(&mut self, behind: Look, ahead: Look) { 492 | let idx = self.add_state(Accept::Never); 493 | self.add_look(idx, idx + 1, behind, ahead); 494 | self.add_state(Accept::Never); 495 | } 496 | 497 | /// Adds an extra predicate between the last two states (there must be at least two states). 498 | fn extra_look(&mut self, behind: Look, ahead: Look) { 499 | let len = self.states.len(); 500 | self.add_look(len - 2, len - 1, behind, ahead); 501 | } 502 | 503 | /// Appends a bunch of new states, representing `expr`. 
504 | /// 505 | /// This maintains the invariant that the last state is always empty (i.e. it doesn't have any 506 | /// transitions leading out of it). It is also guaranteed to add at least one new state. 507 | fn add_expr(&mut self, expr: &Expr) { 508 | use regex_syntax::Expr::*; 509 | 510 | match *expr { 511 | Empty => { self.add_state(Accept::Never); }, 512 | Class(ref c) => self.add_single_transition(&class_to_set(c)), 513 | AnyChar => self.add_single_transition(&RangeSet::full()), 514 | AnyCharNoNL => { 515 | let nls = b"\n\r".into_iter().map(|b| *b as u32); 516 | self.add_single_transition(&RangeSet::except(nls)) 517 | }, 518 | Concat(ref es) => self.add_concat_exprs(es), 519 | Alternate(ref es) => self.add_alternate_exprs(es), 520 | Literal { ref chars, casei } => self.add_literal(chars.iter(), casei), 521 | StartLine => self.add_look_pair(Look::NewLine, Look::Full), 522 | StartText => self.add_look_pair(Look::Boundary, Look::Full), 523 | EndLine => self.add_look_pair(Look::Full, Look::NewLine), 524 | EndText => self.add_look_pair(Look::Full, Look::Boundary), 525 | WordBoundary => { 526 | self.add_look_pair(Look::WordChar, Look::NotWordChar); 527 | self.extra_look(Look::NotWordChar, Look::WordChar); 528 | }, 529 | NotWordBoundary => { 530 | self.add_look_pair(Look::WordChar, Look::WordChar); 531 | self.extra_look(Look::NotWordChar, Look::NotWordChar); 532 | }, 533 | Repeat { ref e, r, greedy } => self.add_repeat(e, r, greedy), 534 | 535 | // We don't support capture groups, so there is no need to keep track of 536 | // the group name or number. 537 | Group { ref e, .. } => self.add_expr(e), 538 | 539 | } 540 | } 541 | } 542 | 543 | #[cfg(test)] 544 | mod tests { 545 | use look::Look; 546 | use nfa::{Accept, NoLooks, Nfa, StateIdx}; 547 | use nfa::tests::{re_nfa, trans_nfa}; 548 | 549 | // Creates an Nfa with the given transitions, with initial state zero, and with the final 550 | // state the only accepting state. 
551 | fn trans_nfa_extra(size: usize, transitions: &[(StateIdx, StateIdx, char)]) 552 | -> Nfa { 553 | let mut ret: Nfa = trans_nfa(size, transitions); 554 | 555 | ret.states[size-1].accept = Accept::Always; 556 | ret.init.push((Look::Full, 0)); 557 | ret 558 | } 559 | 560 | #[test] 561 | fn single() { 562 | let nfa = re_nfa("a"); 563 | let target = trans_nfa_extra(2, &[(0, 1, 'a')]); 564 | 565 | assert_eq!(nfa, target); 566 | } 567 | 568 | #[test] 569 | fn alternate() { 570 | let nfa = re_nfa("a|b"); 571 | let mut target = trans_nfa_extra(3, &[(0, 2, 'a'), (1, 2, 'b')]); 572 | target.init.push((Look::Full, 1)); 573 | 574 | assert_eq!(nfa, target); 575 | } 576 | 577 | // TODO: once remove_looks supports laziness, test it. 578 | 579 | #[test] 580 | fn plus() { 581 | let nfa = re_nfa("a+"); 582 | // It's possible to generate a smaller NFA for '+', but we don't currently do it. 583 | let target = trans_nfa_extra(3, &[(0, 1, 'a'), (0, 2, 'a'), (1, 1, 'a'), (1, 2, 'a')]); 584 | 585 | assert_eq!(nfa, target); 586 | } 587 | 588 | #[test] 589 | fn star() { 590 | let nfa = re_nfa("a*"); 591 | // It's possible to generate a smaller NFA for '*', but we don't currently do it. 
592 | let mut target = trans_nfa_extra(2, &[(0, 0, 'a'), (0, 1, 'a')]); 593 | target.init.push((Look::Full, 1)); 594 | 595 | assert_eq!(nfa, target); 596 | } 597 | 598 | #[test] 599 | fn rep_fixed() { 600 | assert_eq!(re_nfa("a{3}"), re_nfa("aaa")); 601 | } 602 | 603 | #[test] 604 | fn rep_range() { 605 | assert_eq!(re_nfa("a{2,4}"), re_nfa("aaa{0,2}")); 606 | } 607 | 608 | #[test] 609 | fn sequence() { 610 | let nfa = re_nfa("ab"); 611 | let target = trans_nfa_extra(3, &[(0, 1, 'a'), (1, 2, 'b')]); 612 | 613 | assert_eq!(nfa, target); 614 | } 615 | 616 | #[test] 617 | fn anchored_start() { 618 | let nfa = re_nfa("^a"); 619 | let mut target = trans_nfa(2, &[(0, 1, 'a')]); 620 | target.init.push((Look::Boundary, 0)); 621 | target.states[1].accept = Accept::Always; 622 | 623 | assert_eq!(nfa, target); 624 | } 625 | 626 | #[test] 627 | fn anchored_end() { 628 | let nfa = re_nfa("a$"); 629 | let mut target = trans_nfa_extra(2, &[(0, 1, 'a')]); 630 | target.states[1].accept = Accept::AtEoi; 631 | target.states[1].accept_look = Look::Boundary; 632 | target.states[1].accept_state = 1; 633 | 634 | assert_eq!(nfa, target); 635 | } 636 | 637 | #[test] 638 | fn word_boundary_start() { 639 | let nfa = re_nfa(r"\ba"); 640 | let mut target = trans_nfa(2, &[(1, 0, 'a')]); 641 | target.init.push((Look::NotWordChar, 1)); 642 | target.states[0].accept = Accept::Always; 643 | 644 | assert_eq!(nfa, target); 645 | } 646 | 647 | #[test] 648 | fn word_boundary_end() { 649 | let nfa = re_nfa(r"a\b"); 650 | let mut target = trans_nfa_extra(3, &[(0, 1, 'a')]); 651 | for range in Look::NotWordChar.as_set().ranges() { 652 | target.add_transition(1, 2, range); 653 | } 654 | target.states[1].accept = Accept::AtEoi; 655 | target.states[1].accept_look = Look::Boundary; 656 | target.states[1].accept_state = 1; 657 | target.states[2].accept = Accept::Always; 658 | target.states[2].accept_look = Look::NotWordChar; 659 | target.states[2].accept_state = 1; 660 | target.states[2].accept_tokens = 1; 661 
| 662 | assert_eq!(nfa, target); 663 | } 664 | 665 | #[test] 666 | fn word_boundary_ambiguous() { 667 | let nfa = re_nfa(r"\b(a| )"); 668 | let mut target = trans_nfa(3, &[(1, 0, ' '), (2, 0, 'a')]); 669 | target.states[0].accept = Accept::Always; 670 | target.init.push((Look::WordChar, 1)); 671 | target.init.push((Look::NotWordChar, 2)); 672 | 673 | assert_eq!(nfa, target); 674 | } 675 | 676 | #[test] 677 | fn empty() { 678 | assert_eq!(re_nfa(""), trans_nfa_extra(1, &[])); 679 | } 680 | } 681 | 682 | -------------------------------------------------------------------------------- /src/nfa/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use look::Look; 10 | use num_traits::PrimInt; 11 | use range_map::{Range, RangeMultiMap}; 12 | use std::fmt::{self, Debug, Formatter}; 13 | use std::marker::PhantomData; 14 | 15 | mod has_looks; 16 | mod no_looks; 17 | 18 | // TODO: it would be nice to make StateIdx a new type instead of a type alias. The problem is that 19 | // we need to be able to index Vecs with it, and we can't impl Index for Vec 20 | // because of coherence rules. 21 | pub type StateIdx = usize; 22 | 23 | /// How we represent a set of states. The two important criteria are: 24 | /// 25 | /// - it should be reasonably fast even when there are thousands of states (this knocks out 26 | /// BitSet), and 27 | /// - it should be hashable (this knocks out HashSet). 28 | /// 29 | /// Note that efficient insertion and O(1) queries are not important. Therefore, we use a sorted 30 | /// Vec. (But be careful to keep it sorted!) 
31 | pub type StateSet = Vec; 32 | 33 | #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] 34 | struct LookPair { 35 | pub behind: Look, 36 | pub ahead: Look, 37 | pub target_state: StateIdx, 38 | } 39 | 40 | impl LookPair { 41 | fn is_empty(&self) -> bool { 42 | self.behind == Look::Empty || self.ahead == Look::Empty 43 | } 44 | 45 | fn intersection(&self, other: &LookPair) -> LookPair { 46 | LookPair { 47 | behind: self.behind.intersection(&other.behind), 48 | ahead: self.ahead.intersection(&other.ahead), 49 | target_state: self.target_state, 50 | } 51 | } 52 | } 53 | 54 | /// The enum for determining whether a state is accepting. Classical NFAs would only allow `Never` 55 | /// and `Always` here, but we also allow `AtEoi`, which means that the state should accept if and 56 | /// only if we've reached the end of the input. 57 | #[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 58 | pub enum Accept { 59 | Never, 60 | AtEoi, 61 | Always, 62 | } 63 | 64 | #[derive(Clone, Eq, PartialEq)] 65 | struct State { 66 | accept: Accept, 67 | // If accept == Always and accept_tokens > 0, then we had to do some look-ahead in order to 68 | // determine that we have a match. In that case, accept_state is the index of the state that 69 | // should have accepted. 70 | accept_state: StateIdx, 71 | // In the case that we had some look-ahead, `accept_look` says what kind of char was involved 72 | // in the look-ahead, and `accept_tokens` says how many input tokens were consumed while 73 | // looking ahead. There are some restrictions on these values: 74 | // - if accept is Never then accept_look is Full and accept_tokens is zero 75 | // - if accept is AtEoi then accept_look is Boundary and accept_tokens is zero 76 | // - accept_look is never Empty 77 | // - if accept is Always then accept_look is neither empty nor Boundary 78 | // - if Tok is u32 then accept_tokens is either 0 or 1 79 | // - if Tok is u8 then accept_tokens is at most 4. 
80 | accept_look: Look, 81 | accept_tokens: u8, 82 | 83 | // The transitions that consume input. 84 | consuming: RangeMultiMap, 85 | // Transitions that do not consume input, but that are allowed to look forward and backward one 86 | // token. 87 | looking: Vec, 88 | } 89 | 90 | /// A non-deterministic finite automaton. 91 | /// 92 | /// `Tok` is the type of symbol that the automaton consumes. For most operations, only `u8` and 93 | /// `u32` are supported. 94 | /// 95 | /// Match priority 96 | /// ============== 97 | /// 98 | /// The whole point of a *non-deterministic* finite automaton is that it can match an input in 99 | /// multiple ways. This implementation supports match priorities, meaning that in the event of 100 | /// multiple matches, there is exactly one that is preferred to all the others. In this 101 | /// implementation, the transitions out of each state are ordered and we prefer a match that makes 102 | /// an earlier transition over one that makes a later one. 103 | /// 104 | /// The `Variant` parameter 105 | /// ======================= 106 | /// 107 | /// There are basically two versions of this struct with different representations and invariants, 108 | /// but they share enough code in common that it made more sense to write one struct and use a type 109 | /// parameter to determine which version it is. This is the meaning of the `Variant` type 110 | /// parameter, and it has two possible values: `HasLooks` and `NoLooks`. 111 | /// 112 | /// If `Variant == HasLooks` then the `init` field is unused. The only legal values for a state's 113 | /// `accept` field are `Always` and `Never`, and all the `accept_*` fields are unused. The 114 | /// automaton implicitly has a single initial state (state 0). Methods specific to 115 | /// `Nfa<_, HasLooks>` are in `has_looks.rs`. 116 | /// 117 | /// If `Variant == NoLooks` then the states' `looking` fields are unused. 
Initial states are 118 | /// explicitly given in `init` and in the states' `accept.*` fields. Methods specific to 119 | /// `Nfa<_, NoLooks>` are in `no_looks.rs`. 120 | /// 121 | /// The typical life-cycle of an `Nfa` is as follows: 122 | /// 123 | /// - First, create an `Nfa` using `from_regex`. 124 | /// - Call `nfa.remove_looks()` to turn the `Nfa` to an `Nfa`. 125 | /// - Call `nfa.byte_me()` to turn the `Nfa` into an `Nfa`. 126 | /// - Call one of the `nfa.determinize_*()` methods to make a `Dfa`. 127 | /// 128 | /// There are also some operations modifying `Nfa` that can be called between the last 129 | /// two steps. 130 | #[derive(Clone, Eq, PartialEq)] 131 | pub struct Nfa { 132 | states: Vec>, 133 | // The various possible sets of states that the automaton can start in, depending on what the 134 | // most recent `char` of input was. 135 | // 136 | // We decide the initial state by looking at the previous char of input. For every element of 137 | // `self.init` whose first entry matches that char, we start in the corresponding NFA state. 138 | // Note that these states are ordered: states that appear earlier are given higher priority for 139 | // matching. 140 | init: Vec<(Look, StateIdx)>, 141 | phantom: PhantomData, 142 | } 143 | 144 | pub trait Lookability {} 145 | 146 | #[derive(Copy, Clone, Debug, PartialEq)] 147 | pub struct HasLooks; 148 | #[derive(Copy, Clone, Debug, PartialEq)] 149 | pub struct NoLooks; 150 | 151 | impl Lookability for HasLooks {} 152 | impl Lookability for NoLooks {} 153 | 154 | impl Nfa { 155 | pub fn new() -> Nfa { 156 | Nfa::with_capacity(0) 157 | } 158 | 159 | /// Creates a new `Nfa` that can `add_state()` `n` times without re-allocating. 160 | pub fn with_capacity(n: usize) -> Nfa { 161 | Nfa { 162 | states: Vec::with_capacity(n), 163 | init: Vec::new(), 164 | phantom: PhantomData, 165 | } 166 | } 167 | 168 | /// Returns my consuming transitions, but with the source and destination swapped. 
169 | /// 170 | /// If I have a transition from state `i` to state `j` that consumes token `c`, then 171 | /// `ret[j]` will contain a mapping from `c` to `i`, where `ret` is the value returned by this 172 | /// method. 173 | /// 174 | /// Note that information about match priorities is lost. 175 | pub fn reversed_transitions(&self) -> Vec> { 176 | let mut ret = vec![RangeMultiMap::new(); self.states.len()]; 177 | 178 | for (source_idx, st) in self.states.iter().enumerate() { 179 | for &(range, target_idx) in st.consuming.ranges_values() { 180 | ret[target_idx].insert(range, source_idx); 181 | } 182 | } 183 | ret 184 | } 185 | 186 | /// Adds a new state and returns its index. 187 | pub fn add_state(&mut self, accept: Accept) -> StateIdx { 188 | let state_idx = self.states.len(); 189 | self.states.push(State { 190 | accept: accept, 191 | accept_state: state_idx, 192 | accept_look: if accept == Accept::AtEoi { Look::Boundary } else { Look::Full }, 193 | accept_tokens: 0, 194 | consuming: RangeMultiMap::new(), 195 | looking: Vec::new(), 196 | }); 197 | state_idx 198 | } 199 | 200 | /// Adds a new state and returns its index. 201 | /// 202 | /// The new state is always accepting; it represents the case that we accept after looking 203 | /// ahead a few tokens. 204 | pub fn add_look_ahead_state(&mut self, look: Look, tokens: u8, accept_state: StateIdx) 205 | -> StateIdx { 206 | debug_assert!(look != Look::Boundary && look != Look::Full && look != Look::Empty); 207 | debug_assert!(tokens > 0); 208 | 209 | let state_idx = self.states.len(); 210 | self.states.push(State { 211 | accept: Accept::Always, 212 | accept_state: accept_state, 213 | accept_look: look, 214 | accept_tokens: tokens, 215 | consuming: RangeMultiMap::new(), 216 | looking: Vec::new(), 217 | }); 218 | state_idx 219 | } 220 | 221 | /// Adds a transition that moves from `source` to `target` on consuming a token in `range`. 
222 | pub fn add_transition(&mut self, source: StateIdx, target: StateIdx, range: Range) { 223 | self.states[source].consuming.insert(range, target); 224 | } 225 | 226 | /// Returns the set of consuming transitions out of the given state. 227 | pub fn consuming(&self, i: StateIdx) -> &RangeMultiMap { 228 | &self.states[i].consuming 229 | } 230 | 231 | /// Returns the number of states. 232 | pub fn num_states(&self) -> usize { 233 | self.states.len() 234 | } 235 | 236 | // You've just done some operation that has changed state indices (probably by deleting 237 | // un-needed states). Now re-label the existing transitions according to the new state indices. 238 | fn map_states(&mut self, map: F) where F: Fn(StateIdx) -> Option { 239 | for st in &mut self.states { 240 | st.consuming.retain_values(|x| map(*x).is_some()); 241 | // The unwrap is ok because we've just filtered all the `None`s (and `map` is Fn, not 242 | // FnMut). 243 | st.consuming.map_values(|x| map(*x).unwrap()); 244 | 245 | st.looking = st.looking.iter() 246 | .filter(|look| map(look.target_state).is_some()) 247 | // The unwrap is ok because we've just filtered all the `None`s. 248 | .map(|look| LookPair { target_state: map(look.target_state).unwrap(), .. *look }) 249 | .collect(); 250 | 251 | st.accept_state = map(st.accept_state).expect("bug in map_states"); 252 | } 253 | 254 | self.init = self.init.iter() 255 | .filter_map(|pair| map(pair.1).map(|idx| (pair.0, idx))) 256 | .collect(); 257 | } 258 | 259 | // Changes the `Lookability` marker without allocating anything. 260 | fn transmuted(self) -> Nfa { 261 | Nfa { 262 | states: self.states, 263 | init: self.init, 264 | phantom: PhantomData, 265 | } 266 | } 267 | 268 | /// Returns true if this Nfa only matches things at the beginning of the input. 269 | pub fn is_anchored(&self) -> bool { 270 | self.init.iter().all(|pair| pair.0 == Look::Boundary) 271 | } 272 | 273 | /// Returns true if this Nfa never matches anything. 
274 | pub fn is_empty(&self) -> bool { 275 | self.states.is_empty() 276 | } 277 | } 278 | 279 | impl Debug for Nfa { 280 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 281 | try!(f.write_fmt(format_args!("Nfa ({} states):\n", self.states.len()))); 282 | 283 | try!(f.write_fmt(format_args!("Init: {:?}\n", self.init))); 284 | 285 | for (st_idx, st) in self.states.iter().enumerate().take(40) { 286 | try!(f.write_fmt(format_args!("\tState {} ({:?}):\n", st_idx, st.accept))); 287 | 288 | if st.accept != Accept::Never { 289 | try!(f.write_fmt(format_args!("\t\tlook {:?}, tokens {:?}, state {:?}\n", 290 | st.accept_look, st.accept_tokens, st.accept_state))); 291 | } 292 | if !st.consuming.is_empty() { 293 | try!(f.write_str("\t\tConsuming:\n")); 294 | // Cap it at 10 transitions, since it gets unreadable otherwise. 295 | for &(range, target) in st.consuming.ranges_values().take(10) { 296 | try!(f.write_fmt(format_args!("\t\t\t{:?} -- {:?} => {}\n", 297 | range.start, range.end, target))); 298 | } 299 | if st.consuming.num_ranges() > 10 { 300 | try!(f.write_str("\t\t\t...\n")); 301 | } 302 | } 303 | if !st.looking.is_empty() { 304 | try!(f.write_str("\t\tLooking:\n")); 305 | for look in &st.looking { 306 | try!(f.write_fmt(format_args!("\t\t\t({:?},{:?}) => {}\n", 307 | look.behind, look.ahead, look.target_state))); 308 | } 309 | } 310 | } 311 | if self.states.len() > 40 { 312 | try!(f.write_fmt(format_args!("\t... ({} more states)\n", self.states.len() - 40))); 313 | } 314 | Ok(()) 315 | } 316 | } 317 | 318 | #[cfg(test)] 319 | pub mod tests { 320 | use nfa::{Accept, NoLooks, Nfa, StateIdx}; 321 | use num_traits::PrimInt; 322 | use range_map::Range; 323 | use std::fmt::Debug; 324 | 325 | // Creates an Nfa from a regular expression string. 
326 | pub fn re_nfa(re: &str) -> Nfa { 327 | let nfa = Nfa::from_regex(re).unwrap(); 328 | println!("before remove looks: {:?}", nfa); 329 | let nfa = nfa.remove_looks(); 330 | println!("after remove looks: {:?}", nfa); 331 | nfa 332 | //Nfa::from_regex(re).unwrap().remove_looks() 333 | } 334 | 335 | // Creates an Nfa with the given transitions. 336 | pub fn trans_range_nfa(size: usize, transitions: &[(StateIdx, StateIdx, Range)]) 337 | -> Nfa 338 | where Tok: Debug + PrimInt { 339 | let mut ret: Nfa = Nfa::with_capacity(size); 340 | for _ in 0..size { 341 | ret.add_state(Accept::Never); 342 | } 343 | for &(src, tgt, range) in transitions { 344 | ret.add_transition(src, tgt, range); 345 | } 346 | ret 347 | } 348 | 349 | // Creates an Nfa with the given transitions, each of which only takes a single char. 350 | pub fn trans_nfa(size: usize, transitions: &[(StateIdx, StateIdx, char)]) 351 | -> Nfa 352 | where Tok: Debug + PrimInt { 353 | let tok = |x: char| -> Tok { Tok::from(x as u32).unwrap() }; 354 | let range_trans: Vec<_> = transitions.iter() 355 | .map(|x| (x.0, x.1, Range::new(tok(x.2), tok(x.2)))) 356 | .collect(); 357 | trans_range_nfa(size, &range_trans) 358 | } 359 | } 360 | 361 | -------------------------------------------------------------------------------- /src/nfa/no_looks.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use dfa::Dfa; 10 | use error::Error; 11 | use itertools::Itertools; 12 | use look::Look; 13 | use nfa::{Accept, Nfa, NoLooks, State, StateIdx, StateSet}; 14 | use num_traits::PrimInt; 15 | use range_map::{Range, RangeMap, RangeMultiMap}; 16 | use std::{char, u8, usize}; 17 | use std::cmp::max; 18 | use std::collections::{HashMap, HashSet}; 19 | use std::fmt::Debug; 20 | use std::marker::PhantomData; 21 | use std::mem::swap; 22 | use utf8_ranges::{Utf8Range, Utf8Sequence, Utf8Sequences}; 23 | 24 | // This provides a more compact way of representing UTF-8 sequences. 25 | // 26 | // A sequence of bytes belongs to this set if its first byte is in `head[0]`, its second byte is 27 | // in `head[1]`, etc., and its last byte belongs to one of the ranges in `last_byte`. 28 | // 29 | // This representation is handy for making NFAs because compared to the representation in 30 | // `Utf8Sequences`, it adds many fewer states. Basically, we are doing some crude minimization 31 | // before creating the states. 32 | struct MergedUtf8Sequences { 33 | pub head: Vec, 34 | pub last_byte: Vec, 35 | } 36 | 37 | // Returns this range as a pair of chars, or none if this is an empty range. 38 | fn to_char_pair(r: Range) -> Option<(char, char)> { 39 | // Round up self.start to the nearest legal codepoint. 40 | let start = if r.start > 0xD7FF && r.start < 0xE000 { 41 | 0xE000 42 | } else { 43 | r.start 44 | }; 45 | 46 | // Round down self.end. 47 | let end = if r.end > 0x10FFFF { 48 | 0x10FFFF 49 | } else if r.end < 0xE000 && r.end > 0xD7FF { 50 | 0xD7FF 51 | } else { 52 | r.end 53 | }; 54 | 55 | if start > end { 56 | None 57 | } else { 58 | Some((char::from_u32(start).unwrap(), char::from_u32(end).unwrap())) 59 | } 60 | } 61 | 62 | impl MergedUtf8Sequences { 63 | // Panics if not all the input sequences have the same leading byte ranges. 
64 | fn merge(iter: I) -> MergedUtf8Sequences where I: Iterator { 65 | let mut head = Vec::new(); 66 | let mut last_byte = Vec::new(); 67 | 68 | for seq in iter { 69 | let len = seq.len(); 70 | let h = &seq.as_slice()[..len-1]; 71 | if head.is_empty() { 72 | head.extend_from_slice(h); 73 | } else if &head[..] != h { 74 | panic!("invalid sequences to merge"); 75 | } 76 | 77 | last_byte.push(seq.as_slice()[len-1]); 78 | } 79 | 80 | MergedUtf8Sequences { 81 | head: head, 82 | last_byte: last_byte, 83 | } 84 | } 85 | 86 | fn from_sequences<'a, I>(iter: I) -> Box + 'a> 87 | where I: Iterator + 'a { 88 | fn head(u: &Utf8Sequence) -> Vec { 89 | let len = u.len(); 90 | u.as_slice()[..len-1].to_owned() 91 | } 92 | 93 | Box::new(iter 94 | .group_by(head) 95 | .into_iter() 96 | .map(|(_, seqs)| MergedUtf8Sequences::merge(seqs.into_iter()))) 97 | } 98 | 99 | fn from_ranges<'a, I>(iter: I) -> Box + 'a> 100 | where I: Iterator> + 'a { 101 | MergedUtf8Sequences::from_sequences( 102 | iter.filter_map(to_char_pair) 103 | .flat_map(|r| Utf8Sequences::new(r.0, r.1))) 104 | } 105 | 106 | fn num_bytes(&self) -> u8 { 107 | (self.head.len() + 1) as u8 108 | } 109 | } 110 | 111 | // Creates a byte-based Dfa that matches all the chars in `look.as_set()`. 112 | fn make_char_dfa(look: Look) -> Dfa<(Look, u8)> { 113 | let mut nfa: Nfa = Nfa::with_capacity(2); 114 | nfa.add_state(Accept::Never); 115 | nfa.add_look_ahead_state(look, 1, 0); 116 | // TODO: shouldn't adding both Full and Boundary be redundant? 117 | nfa.init.push((Look::Full, 0)); 118 | nfa.init.push((Look::Boundary, 0)); 119 | nfa.states[0].consuming 120 | = RangeMultiMap::from_vec(look.as_set().ranges().map(|x| (x, 1)).collect()); 121 | 122 | // These unwraps are OK because the only failures are caused by having too many states. 123 | nfa.byte_me(usize::MAX).unwrap() 124 | .determinize(usize::MAX).unwrap() 125 | .optimize() 126 | } 127 | 128 | // Creates a byte-based Dfa that matches backwards all the chars in `look.as_set()`. 
fn make_rev_char_dfa(look: Look) -> Dfa<(Look, u8)> {
    let mut nfa: Nfa<u8, NoLooks> = Nfa::with_capacity(0); // TODO: better capacity
    nfa.add_state(Accept::Never);
    nfa.init.push((Look::Full, 0));
    nfa.init.push((Look::Boundary, 0));

    // This is more-or-less C&P from add_utf8_sequence.
    for seq in MergedUtf8Sequences::from_ranges(look.as_set().ranges()) {
        let mut last_state = nfa.add_state(Accept::Never);

        // Since we match in reverse, the last byte of each sequence is consumed first...
        for range in &seq.last_byte {
            nfa.add_transition(0, last_state, Range::new(range.start, range.end));
        }
        // ...followed by the head bytes in reverse order.
        for range in seq.head.iter().rev() {
            let cur_state = nfa.add_state(Accept::Never);

            nfa.add_transition(last_state, cur_state, Range::new(range.start, range.end));
            last_state = cur_state;
        }

        nfa.states[last_state].accept = Accept::Always;
        nfa.states[last_state].accept_look = look;
        nfa.states[last_state].accept_state = 0;
        nfa.states[last_state].accept_tokens = seq.num_bytes();
    }

    // This unwrap is OK because the only failures are caused by having too many states.
    nfa.determinize(usize::MAX).unwrap()
        .optimize()
}

// We cache optimized Dfas for the expensive looks. See `Nfa::add_min_utf8_sequences`
// for an explanation.
lazy_static! {
    static ref WORD_CHAR_DFA: Dfa<(Look, u8)> = make_char_dfa(Look::WordChar);
    static ref NOT_WORD_CHAR_DFA: Dfa<(Look, u8)> = make_char_dfa(Look::NotWordChar);
    static ref REV_WORD_CHAR_DFA: Dfa<(Look, u8)> = make_rev_char_dfa(Look::WordChar);
    static ref REV_NOT_WORD_CHAR_DFA: Dfa<(Look, u8)> = make_rev_char_dfa(Look::NotWordChar);
}

impl<Tok: Debug + PrimInt> Nfa<Tok, NoLooks> {
    // Returns the set of all states that can be reached from some initial state.
171 | fn reachable_from(&self, states: I) -> HashSet where I: Iterator { 172 | let mut active: HashSet = states.collect(); 173 | let mut next_active: HashSet = HashSet::new(); 174 | let mut ret = active.clone(); 175 | 176 | while !active.is_empty() { 177 | for &s in &active { 178 | for &(_, t) in self.states[s].consuming.ranges_values() { 179 | if !ret.contains(&t) { 180 | ret.insert(t); 181 | next_active.insert(t); 182 | } 183 | } 184 | } 185 | swap(&mut active, &mut next_active); 186 | next_active.clear(); 187 | } 188 | ret 189 | } 190 | 191 | // Reverses this Nfa, but only the transitions (i.e. doesn't do anything about initial and 192 | // final states). 193 | fn reversed_simple(&self) -> Nfa { 194 | let rev_transitions = self.reversed_transitions(); 195 | let mut ret: Nfa = Nfa::with_capacity(self.states.len()); 196 | 197 | for trans in rev_transitions { 198 | let idx = ret.add_state(Accept::Never); 199 | ret.states[idx].consuming = trans; 200 | } 201 | 202 | ret 203 | } 204 | 205 | // Returns the set of all states that can be reached from an initial state and that can reach 206 | // some accepting state. 207 | fn reachable_states(&self) -> HashSet { 208 | let init_states = self.init.iter().map(|pair| pair.1); 209 | let final_states = self.states.iter().enumerate() 210 | .filter(|&(_, state)| state.accept != Accept::Never) 211 | .map(|(idx, _)| idx); 212 | 213 | let forward = self.reachable_from(init_states); 214 | let backward = self.reversed_simple().reachable_from(final_states); 215 | forward.intersection(&backward).cloned().collect() 216 | } 217 | 218 | /// Optimizes this Nfa by removing all states that cannot be reached from an initial state 219 | /// and all states that cannot lead to an accepting state. 
220 | pub fn trim_unreachable(&mut self) { 221 | let reachable = self.reachable_states(); 222 | 223 | let mut old_states = Vec::new(); 224 | swap(&mut self.states, &mut old_states); 225 | let mut old_to_new = vec![None; old_states.len()]; 226 | 227 | let (new_to_old, new_states): (Vec<_>, Vec>) = old_states.into_iter() 228 | .enumerate() 229 | .filter(|&(i, _)| reachable.contains(&i)) 230 | .unzip(); 231 | self.states = new_states; 232 | 233 | for (new, &old) in new_to_old.iter().enumerate() { 234 | old_to_new[old] = Some(new); 235 | } 236 | 237 | self.map_states(|s| old_to_new[s]); 238 | } 239 | 240 | // Returns an `Accept` that will accept whenever anything in `states` would accept. 241 | fn accept_union(&self, states: &StateSet) -> Accept { 242 | states.iter().map(|s| self.states[*s].accept).max().unwrap_or(Accept::Never) 243 | } 244 | } 245 | 246 | impl Nfa { 247 | /// Converts this `Nfa` into one that consumes the input byte-by-byte. 248 | pub fn byte_me(self, max_states: usize) -> ::Result> { 249 | let mut ret = Nfa:: { 250 | states: self.states.iter().map(|s| State { 251 | accept: s.accept, 252 | accept_look: s.accept_look, 253 | accept_state: s.accept_state, 254 | accept_tokens: s.accept_tokens, 255 | consuming: RangeMultiMap::new(), 256 | looking: Vec::new(), 257 | }).collect(), 258 | init: self.init, 259 | phantom: PhantomData, 260 | }; 261 | 262 | for (i, state) in self.states.into_iter().enumerate() { 263 | // Group transitions by the target state, and add them in batches. Most of the time, we 264 | // can merge a bunch of Utf8Sequences before adding them, which saves a bunch of 265 | // states. 266 | for (tgt, transitions) in state.consuming.ranges_values().group_by(|x| x.1) { 267 | try!(ret.add_utf8_sequences(i, transitions.into_iter().map(|x| x.0), tgt, max_states)); 268 | } 269 | } 270 | Ok(ret) 271 | } 272 | } 273 | 274 | impl Nfa { 275 | /// Converts this `Nfa` into a `Dfa`. 
276 | pub fn determinize(&self, max_states: usize) -> ::Result> { 277 | Determinizer::determinize(self, max_states, MatchChoice::TransitionOrder, self.init.clone()) 278 | } 279 | 280 | /// Converts this `Nfa` into a `Dfa`. 281 | /// 282 | /// Whenever this `Nfa` matches some text, the `Dfa` also will. But if this `Nfa` has multiple 283 | /// possible endpoints for a match then the returned `Dfa` is only guaranteed to match the 284 | /// longest one. 285 | pub fn determinize_longest(&self, max_states: usize) -> ::Result> { 286 | Determinizer::determinize(self, max_states, MatchChoice::LongestMatch, self.init.clone()) 287 | } 288 | 289 | /// Returns the reversal of this `Nfa`. 290 | /// 291 | /// If `self` matches some string of bytes, then the return value of this method will match 292 | /// the same strings of bytes reversed. 293 | /// 294 | /// Note that this loses information about match priorities. 295 | pub fn reverse(&self, max_states: usize) -> ::Result> { 296 | let mut ret = self.reversed_simple(); 297 | 298 | // Turn our initial states into ret's accepting states. 
299 | for &(look, i) in &self.init { 300 | match look { 301 | Look::Full => { 302 | ret.states[i].accept = Accept::Always; 303 | ret.states[i].accept_look = Look::Full; 304 | }, 305 | Look::Boundary => { 306 | ret.states[i].accept = max(ret.states[i].accept, Accept::AtEoi); 307 | ret.states[i].accept_look = max(ret.states[i].accept_look, Look::Boundary); 308 | }, 309 | Look::NewLine => { 310 | let accept_state = ret.add_look_ahead_state(Look::NewLine, 1, i); 311 | ret.add_transition(i, accept_state, Range::new(b'\n', b'\n')); 312 | ret.states[i].accept = max(ret.states[i].accept, Accept::AtEoi); 313 | ret.states[i].accept_look = max(ret.states[i].accept_look, Look::Boundary); 314 | }, 315 | Look::WordChar | Look::NotWordChar => { 316 | // It would make more sense to put this outside the loop, but having it inside 317 | // prevents a deadlock: constructing REV_*_DFA ends up calling reverse(), but 318 | // with no look-ahead so it never gets inside this loop. 319 | let dfa: &Dfa<_> = if look == Look::WordChar { 320 | &REV_WORD_CHAR_DFA 321 | } else { 322 | ret.states[i].accept = max(ret.states[i].accept, Accept::AtEoi); 323 | ret.states[i].accept_look = max(ret.states[i].accept_look, Look::Boundary); 324 | &REV_NOT_WORD_CHAR_DFA 325 | }; 326 | let accept_state = ret.add_look_ahead_state(look, 1, i); 327 | try!(ret.add_min_utf8_sequences(i, dfa, accept_state, max_states)); 328 | }, 329 | Look::Empty => { 330 | panic!("Empty cannot be an init look"); 331 | }, 332 | } 333 | } 334 | 335 | // Turn our accepting states into ret's initial states. 336 | ret.init.clear(); 337 | for st in &self.states { 338 | if st.accept != Accept::Never { 339 | ret.init.push((st.accept_look, st.accept_state)); 340 | } 341 | } 342 | Ok(ret) 343 | } 344 | 345 | /// Can we accept immediately if the beginning of the input matches `look`? 
346 | fn init_accept(&self, look: Look) -> Accept { 347 | let set = self.init.iter() 348 | .filter(|pair| look <= pair.0) 349 | .map(|pair| pair.1) 350 | .collect::>(); 351 | self.accept_union(&set) 352 | } 353 | 354 | /// This essentially modifies `self` by adding a `^.*` at the beginning. 355 | /// 356 | /// The result is actually a little bit different, because `.` matches a whole code point, 357 | /// whereas the `^.*` that we add works at the byte level. 358 | pub fn anchor(mut self, max_states: usize) -> ::Result> { 359 | let loop_accept = self.init_accept(Look::Full); 360 | let loop_state = self.add_state(loop_accept); 361 | let init_accept = self.init_accept(Look::Boundary); 362 | let init_state = self.add_state(init_accept); 363 | 364 | // Swap out init so that we can iterate over it while modifying `self`. 365 | let mut init = Vec::new(); 366 | swap(&mut init, &mut self.init); 367 | 368 | for &(look, st_idx) in &init { 369 | if look.allows_eoi() { 370 | // TODO: shouldn't need to clone here. 
371 | for &(range, target) in self.states[st_idx].consuming.clone().ranges_values() { 372 | self.add_transition(init_state, target, range); 373 | } 374 | } 375 | 376 | match look { 377 | Look::Boundary => {}, 378 | Look::Full => { 379 | for &(range, target) in self.states[st_idx].consuming.clone().ranges_values() { 380 | self.add_transition(loop_state, target, range); 381 | } 382 | }, 383 | Look::NewLine => { 384 | self.add_transition(init_state, st_idx, Range::new(b'\n', b'\n')); 385 | self.add_transition(loop_state, st_idx, Range::new(b'\n', b'\n')); 386 | }, 387 | Look::WordChar | Look::NotWordChar => { 388 | let dfa: &Dfa<_> = 389 | if look == Look::WordChar { &WORD_CHAR_DFA } else { &NOT_WORD_CHAR_DFA }; 390 | 391 | try!(self.add_min_utf8_sequences(loop_state, dfa, st_idx, max_states)); 392 | try!(self.add_min_utf8_sequences(init_state, dfa, st_idx, max_states)); 393 | }, 394 | Look::Empty => { 395 | panic!("Cannot start with an empty look"); 396 | }, 397 | } 398 | 399 | // Once we've found an init state that accepts immediately, don't look for any others 400 | // (since any matches that we find starting from them are lower priority that the one 401 | // we've found already). This check is *almost* unnecessary, since similar pruning 402 | // happens when we turn the NFA into a DFA. The important case that needs to be handled 403 | // here is the case that a high-priority init state has no transitions out of it. Such 404 | // a state will be completely removed by this function, and so we need to acknowledge 405 | // its existence here. 406 | if self.states[st_idx].accept == Accept::Always { 407 | break; 408 | } 409 | } 410 | 411 | // Wire up the initial and loop states, but only if they aren't accepting. That's because 412 | // if they are accepting then the accept should take priority over the transition (since 413 | // making the transition means that we are searching for a match that starts later). 
414 | if init_accept != Accept::Always { 415 | self.add_transition(init_state, loop_state, Range::full()); 416 | } 417 | if loop_accept != Accept::Always { 418 | self.add_transition(loop_state, loop_state, Range::full()); 419 | } 420 | 421 | // The new Nfa is only allowed to start at the beginning of the input, and only at the new 422 | // initial state. 423 | self.init.push((Look::Boundary, init_state)); 424 | self.trim_unreachable(); 425 | Ok(self) 426 | } 427 | 428 | // This does the same thing as add_utf8_sequences, but it gets the transitions from a dfa, 429 | // which should have zero as its only starting state, and for which every accepting state 430 | // should be Accept::Always. 431 | // 432 | // This is probably used in conjunction with make_char_dfa, which ends up having the same 433 | // effect as add_utf8_sequences, but adds fewer states. 434 | fn add_min_utf8_sequences( 435 | &mut self, 436 | start_state: StateIdx, 437 | dfa: &Dfa<(Look, u8)>, 438 | end_state: StateIdx, 439 | max_states: usize, 440 | ) -> ::Result<()> { 441 | let offset = self.states.len(); 442 | // If end_accept is true, then it isn't actually important that we end in state 443 | // `end_state`: we can create a new look_ahead state to end in. 
444 | let end_accept = self.states[end_state].accept_tokens > 0; 445 | 446 | if self.states.len() + dfa.num_states() > max_states { 447 | return Err(Error::TooManyStates); 448 | } 449 | for _ in 0..dfa.num_states() { 450 | self.add_state(Accept::Never); 451 | } 452 | for d_idx in 0..dfa.num_states() { 453 | let n_src = if d_idx == 0 { start_state } else { d_idx + offset }; 454 | for &(range, d_tgt) in dfa.transitions(d_idx).ranges_values() { 455 | let n_tgt = if dfa.accept(d_tgt) == &Accept::Always && !end_accept { 456 | end_state 457 | } else { 458 | let n_tgt = d_tgt + offset; 459 | self.states[n_tgt].accept = *dfa.accept(d_tgt); 460 | if let Some(&(look, bytes)) = dfa.ret(d_tgt) { 461 | self.states[n_tgt].accept_look = look; 462 | self.states[n_tgt].accept_state = start_state; 463 | self.states[n_tgt].accept_tokens = bytes; 464 | } 465 | n_tgt 466 | }; 467 | self.add_transition(n_src, n_tgt, range); 468 | } 469 | } 470 | 471 | Ok(()) 472 | } 473 | 474 | // Adds a path from `start_state` to `end_state` for all byte sequences matching `seq`. 475 | // 476 | // If `end_state` is a look-ahead state, makes a new accepting state instead (so that we know 477 | // how many bytes of look-ahead we used). 
478 | fn add_utf8_sequence( 479 | &mut self, 480 | start_state: StateIdx, 481 | mut end_state: StateIdx, 482 | seq: MergedUtf8Sequences 483 | ) { 484 | let mut last_state = start_state; 485 | for range in &seq.head { 486 | let cur_state = self.add_state(Accept::Never); 487 | 488 | self.add_transition(last_state, cur_state, Range::new(range.start, range.end)); 489 | last_state = cur_state; 490 | } 491 | 492 | if self.states[end_state].accept_tokens > 0 { 493 | let look = self.states[end_state].accept_look; 494 | let acc_state = self.states[end_state].accept_state; 495 | end_state = self.add_look_ahead_state(look, seq.num_bytes(), acc_state); 496 | } 497 | for range in &seq.last_byte { 498 | self.add_transition(last_state, end_state, Range::new(range.start, range.end)); 499 | } 500 | } 501 | 502 | // Adds a byte path from `start_state` to `end_state` for every char in `ranges`. 503 | fn add_utf8_sequences( 504 | &mut self, 505 | start_state: StateIdx, 506 | ranges: I, 507 | end_state: StateIdx, 508 | max_states: usize 509 | ) -> ::Result<()> 510 | where I: Iterator> { 511 | for m in MergedUtf8Sequences::from_ranges(ranges) { 512 | self.add_utf8_sequence(start_state, end_state, m); 513 | if self.states.len() > max_states { 514 | return Err(Error::TooManyStates); 515 | } 516 | } 517 | Ok(()) 518 | } 519 | 520 | // Finds the transitions out of the given set of states, as a RangeMap. 521 | fn transition_map(&self, states: &[StateIdx]) -> RangeMap> { 522 | let mut transitions = states.into_iter() 523 | .flat_map(|s| self.states[*s].consuming.ranges_values().cloned()) 524 | .collect::>() 525 | .group(); 526 | 527 | // `scratch` is large enough to be indexed by anything in `elts`. It is full of `false`. 528 | fn uniquify(elts: &mut Vec, scratch: &mut Vec) { 529 | elts.retain(|&e| { 530 | let ret = !scratch[e]; 531 | scratch[e] = true; 532 | ret 533 | }); 534 | 535 | // Clean up scratch, so that it is full of `false` again. 
536 | for e in elts { 537 | scratch[*e] = false; 538 | } 539 | } 540 | 541 | let mut scratch = vec![false; self.num_states()]; 542 | for pair in transitions.as_mut_slice() { 543 | uniquify(&mut pair.1, &mut scratch); 544 | } 545 | 546 | transitions 547 | } 548 | } 549 | 550 | #[derive(PartialEq)] 551 | enum MatchChoice { 552 | TransitionOrder, 553 | LongestMatch, 554 | } 555 | 556 | // This contains all the intermediate data structures that we need when turning an `Nfa` into a 557 | // `Dfa`. 558 | struct Determinizer<'a> { 559 | nfa: &'a Nfa, 560 | dfa: Dfa<(Look, u8)>, 561 | state_map: HashMap, 562 | active_states: Vec, 563 | max_states: usize, 564 | match_choice: MatchChoice, 565 | } 566 | 567 | impl<'a> Determinizer<'a> { 568 | // Turns an Nfa into an almost-equivalent (up to the difference between shortest and longest 569 | // matches) Dfa. 570 | // 571 | // `init` is a vector of length Look::num(). Each entry gives a set of initial states that 572 | // will be turned into the initial states of the dfa. 573 | fn determinize(nfa: &Nfa, 574 | max_states: usize, 575 | match_choice: MatchChoice, 576 | init: Vec<(Look, StateIdx)>) -> ::Result> { 577 | let mut det = Determinizer::new(nfa, max_states, match_choice); 578 | try!(det.run(init)); 579 | Ok(det.dfa) 580 | } 581 | 582 | fn new(nfa: &'a Nfa, 583 | max_states: usize, 584 | match_choice: MatchChoice) -> Determinizer<'a> { 585 | Determinizer { 586 | nfa: nfa, 587 | dfa: Dfa::new(), 588 | state_map: HashMap::new(), 589 | active_states: Vec::new(), 590 | max_states: max_states, 591 | match_choice: match_choice, 592 | } 593 | } 594 | 595 | // Checks whether we should accept in the given set of states. 596 | // 597 | // Returns a tuple: the first element says when we accept, the second says what look-ahead (if 598 | // any) led to us accepting, and the third says how many bytes of look-ahead we needed before 599 | // knowing that we can accept. 
600 | // 601 | // There is one annoying corner case: there could be two states in the set `s` with different 602 | // values of `accept_tokens`, where the higher priority state says `Accept::AtEoi` and the 603 | // lower priority state says `Accept::Always`. In this case, we return `(AtEoi, look, bytes)` 604 | // where `look` and `bytes` come from the lower priority state. This doesn't lose any 605 | // information, since if a state says `Accept::AtEoi` then its `accept_look` and 606 | // `accept_tokens` are guaranteed to be `Boundary` and `0`. 607 | fn accept(&self, s: &[StateIdx]) -> (Accept, Look, u8) { 608 | let mut accept_states = s.iter().cloned() 609 | .filter(|i| self.nfa.states[*i].accept != Accept::Never); 610 | let mut accept_always_states = s.iter().cloned() 611 | .filter(|i| self.nfa.states[*i].accept == Accept::Always); 612 | 613 | let (first_accept, other_accept) = if self.match_choice == MatchChoice::TransitionOrder { 614 | (accept_states.next(), accept_always_states.next()) 615 | } else { 616 | (accept_states.min_by_key(|i| self.nfa.states[*i].accept_tokens), 617 | accept_always_states.min_by_key(|i| self.nfa.states[*i].accept_tokens)) 618 | }; 619 | 620 | // Returns the intersection of state.accept_look over all states in s that accept 621 | // unconditionally and have the given number of look-ahead bytes. 622 | let look_intersection = |toks: u8| { 623 | s.iter().cloned() 624 | .filter(|i| self.nfa.states[*i].accept == Accept::Always) 625 | .filter(|i| self.nfa.states[*i].accept_tokens == toks) 626 | .fold(Look::Full, |x, y| x.intersection(&self.nfa.states[y].accept_look)) 627 | }; 628 | 629 | if let Some(first_accept) = first_accept { 630 | let st = &self.nfa.states[first_accept]; 631 | 632 | if st.accept == Accept::AtEoi { 633 | // Check if there is a lower-priority Accept::Always. 
634 | if let Some(other_accept) = other_accept { 635 | let other_st = &self.nfa.states[other_accept]; 636 | if other_st.accept_tokens > 0 { 637 | let look = look_intersection(other_st.accept_tokens); 638 | return (Accept::AtEoi, look, other_st.accept_tokens); 639 | } 640 | } 641 | (Accept::AtEoi, Look::Boundary, 0) 642 | } else { 643 | (Accept::Always, look_intersection(st.accept_tokens), st.accept_tokens) 644 | } 645 | } else { 646 | // There are no accepting states. 647 | (Accept::Never, Look::Empty, 0) 648 | } 649 | } 650 | 651 | // Tries to add a new state to the Dfa. 652 | // 653 | // If the state already exists, returns the index of the old one. If there are too many states, 654 | // returns an error. 655 | fn add_state(&mut self, mut s: StateSet) -> ::Result { 656 | // When we choose our matches by transition order, discard any states that have lower 657 | // priority than the best match we've found. 658 | if self.match_choice == MatchChoice::TransitionOrder { 659 | if let Some(accept_idx) = s.iter().position(|&i| self.nfa.states[i].accept == Accept::Always) { 660 | s.truncate(accept_idx + 1); 661 | } 662 | } 663 | 664 | if self.state_map.contains_key(&s) { 665 | Ok(*self.state_map.get(&s).unwrap()) 666 | } else if self.dfa.num_states() >= self.max_states { 667 | Err(Error::TooManyStates) 668 | } else { 669 | let (acc, look, bytes_ago) = self.accept(&s); 670 | let ret = if acc != Accept::Never { Some ((look, bytes_ago)) } else { None }; 671 | let new_state = self.dfa.add_state(acc, ret); 672 | 673 | self.active_states.push(s.clone()); 674 | self.state_map.insert(s, new_state); 675 | Ok(new_state) 676 | } 677 | } 678 | 679 | // Creates a deterministic automaton representing the same language as our `nfa`. 680 | // Puts the new Dfa in self.dfa. 
681 | fn run(&mut self, init: Vec<(Look, StateIdx)>) -> ::Result<()> { 682 | if self.nfa.states.is_empty() { 683 | return Ok(()); 684 | } 685 | 686 | for &look in Look::all() { 687 | let init_states: StateSet = init.iter().cloned() 688 | .filter(|&(x, _)| look == x) 689 | .map(|(_, y)| y) 690 | .collect(); 691 | if !init_states.is_empty() { 692 | let new_state_idx = try!(self.add_state(init_states)); 693 | self.dfa.init[look.as_usize()] = Some(new_state_idx); 694 | } 695 | } 696 | 697 | while !self.active_states.is_empty() { 698 | let state = self.active_states.pop().unwrap(); 699 | // This unwrap is ok because anything in active_states must also be in state_map. 700 | let state_idx = *self.state_map.get(&state).unwrap(); 701 | let trans = self.nfa.transition_map(&state); 702 | 703 | let mut dfa_trans = Vec::new(); 704 | for &(range, ref target) in trans.ranges_values() { 705 | let target_idx = try!(self.add_state(target.clone())); 706 | dfa_trans.push((range, target_idx)); 707 | } 708 | self.dfa.set_transitions(state_idx, dfa_trans.into_iter().collect()); 709 | } 710 | Ok(()) 711 | } 712 | } 713 | 714 | #[cfg(test)] 715 | mod tests { 716 | use look::Look; 717 | use dfa::Dfa; 718 | use nfa::{Accept, Nfa, NoLooks}; 719 | use nfa::tests::{re_nfa, trans_nfa, trans_range_nfa}; 720 | use range_map::Range; 721 | use std::usize; 722 | 723 | fn re_nfa_anchored(re: &str) -> Nfa { 724 | re_nfa(re).byte_me(usize::MAX).unwrap().anchor(usize::MAX).unwrap() 725 | } 726 | 727 | fn re_dfa(re: &str) -> Dfa<(Look, u8)> { 728 | re_nfa(re).byte_me(usize::MAX).unwrap().determinize(usize::MAX).unwrap() 729 | } 730 | 731 | #[test] 732 | fn anchor_simple() { 733 | let nfa = re_nfa_anchored("a"); 734 | let mut target = trans_range_nfa(3, &[(2, 0, Range::new(b'a', b'a')), 735 | (2, 1, Range::full()), 736 | (1, 0, Range::new(b'a', b'a')), 737 | (1, 1, Range::full())]); 738 | target.init.push((Look::Boundary, 2)); 739 | target.states[0].accept = Accept::Always; 740 | 741 | assert_eq!(nfa, 
target); 742 | } 743 | 744 | #[test] 745 | fn anchor_nl() { 746 | let nfa = re_nfa_anchored(r"(?m)^a"); 747 | let mut target = trans_nfa(4, &[(3, 1, 'a'), 748 | (0, 1, 'a'), 749 | (2, 0, '\n'), 750 | (3, 0, '\n')]); 751 | target.init.push((Look::Boundary, 3)); 752 | target.states[1].accept = Accept::Always; 753 | 754 | let mut target = target.byte_me(usize::MAX).unwrap(); 755 | target.states[2].consuming.insert(Range::full(), 2); 756 | target.states[3].consuming.insert(Range::full(), 2); 757 | 758 | assert_eq!(nfa, target); 759 | } 760 | 761 | #[test] 762 | fn anchor_already_anchored() { 763 | let nfa = re_nfa_anchored("^a"); 764 | let mut target = trans_nfa(2, &[(1, 0, 'a')]); 765 | target.init.push((Look::Boundary, 1)); 766 | target.states[0].accept = Accept::Always; 767 | 768 | assert_eq!(nfa, target); 769 | } 770 | 771 | #[test] 772 | fn determinize_pruning() { 773 | assert_eq!(re_dfa("a|aa"), re_dfa("a")); 774 | } 775 | 776 | macro_rules! check_rev_inits { 777 | ($name:ident, $re:expr, $inits:expr) => { 778 | #[test] 779 | fn $name() { 780 | let rev = re_nfa($re).byte_me(usize::MAX).unwrap().reverse(usize::MAX).unwrap(); 781 | println!("{:?}", rev.init); 782 | for &look in Look::all() { 783 | println!("checking look {:?}", look); 784 | if $inits.contains(&look) { 785 | assert!(rev.init.iter().any(|pair| pair.0 == look)); 786 | } else { 787 | assert!(!rev.init.iter().any(|pair| pair.0 == look)); 788 | } 789 | } 790 | } 791 | }; 792 | } 793 | 794 | check_rev_inits!(rev_init_simple, "abc", [Look::Full]); 795 | check_rev_inits!(rev_init_boundary, "abc$", [Look::Boundary]); 796 | check_rev_inits!(rev_init_simple_and_boundary, "(abc$|abc)", [Look::Full, Look::Boundary]); 797 | check_rev_inits!(rev_init_new_line, "(?m)abc$", [Look::Boundary, Look::NewLine]); 798 | check_rev_inits!(rev_init_word, r" \b", [Look::WordChar]); 799 | check_rev_inits!(rev_init_not_word, r"abc\b", [Look::Boundary, Look::NotWordChar]); 800 | check_rev_inits!(rev_init_word_or_not_word, r".\b", 
[Look::Boundary, Look::NotWordChar, Look::WordChar]); 801 | } 802 | -------------------------------------------------------------------------------- /src/regex.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use error::Error; 10 | use nfa::{Nfa, NoLooks}; 11 | use runner::anchored::AnchoredEngine; 12 | use runner::forward_backward::{ForwardBackwardEngine, Prefix}; 13 | use runner::Engine; 14 | use std; 15 | use std::fmt::Debug; 16 | 17 | #[derive(Debug)] 18 | pub struct Regex { 19 | engine: Box>, 20 | } 21 | 22 | // An engine that doesn't match anything. 23 | #[derive(Clone, Debug)] 24 | struct EmptyEngine; 25 | 26 | impl Engine for EmptyEngine { 27 | fn find(&self, _: &str) -> Option<(usize, usize, Ret)> { None } 28 | fn clone_box(&self) -> Box> { Box::new(EmptyEngine) } 29 | } 30 | 31 | impl Clone for Regex { 32 | fn clone(&self) -> Regex { 33 | Regex { 34 | engine: self.engine.clone_box(), 35 | } 36 | } 37 | } 38 | 39 | impl Regex { 40 | /// Creates a new `Regex` from a regular expression string. 41 | pub fn new(re: &str) -> ::Result { 42 | Regex::new_bounded(re, std::usize::MAX) 43 | } 44 | 45 | /// Creates a new `Regex` from a regular expression string, but only if it doesn't require too 46 | /// many states. 
47 | pub fn new_bounded(re: &str, max_states: usize) -> ::Result { 48 | let nfa = try!(Nfa::from_regex(re)); 49 | let nfa = nfa.remove_looks(); 50 | 51 | let eng = if nfa.is_empty() { 52 | Box::new(EmptyEngine) as Box> 53 | } else if nfa.is_anchored() { 54 | Box::new(try!(Regex::make_anchored(nfa, max_states))) as Box> 55 | } else { 56 | Box::new(try!(Regex::make_forward_backward(nfa, max_states))) as Box> 57 | }; 58 | 59 | Ok(Regex { engine: eng }) 60 | } 61 | 62 | fn make_anchored(nfa: Nfa, max_states: usize) 63 | -> ::Result> { 64 | let nfa = try!(nfa.byte_me(max_states)); 65 | let dfa = try!(nfa.determinize(max_states)) 66 | .optimize() 67 | .map_ret(|(_, bytes)| bytes); 68 | let prog = dfa.compile(); 69 | 70 | Ok(AnchoredEngine::new(prog)) 71 | } 72 | 73 | fn make_forward_backward(nfa: Nfa, max_states: usize) 74 | -> ::Result> { 75 | if nfa.is_anchored() { 76 | return Err(Error::InvalidEngine("anchors rule out the forward-backward engine")); 77 | } 78 | 79 | let f_nfa = try!(try!(nfa.clone().byte_me(max_states)).anchor(max_states)); 80 | let b_nfa = try!(try!(nfa.byte_me(max_states)).reverse(max_states)); 81 | 82 | let f_dfa = try!(f_nfa.determinize(max_states)).optimize(); 83 | let b_dfa = try!(b_nfa.determinize_longest(max_states)).optimize(); 84 | let b_dfa = b_dfa.map_ret(|(_, bytes)| bytes); 85 | 86 | let b_prog = b_dfa.compile(); 87 | let f_dfa = f_dfa.map_ret(|(look, bytes)| { 88 | let b_dfa_state = b_dfa.init[look.as_usize()].expect("BUG: back dfa must have this init"); 89 | (b_dfa_state, bytes) 90 | }); 91 | 92 | let mut f_prog = f_dfa.compile(); 93 | let prefix = Prefix::from_parts(f_dfa.prefix_strings()); 94 | match prefix { 95 | Prefix::Empty => {}, 96 | _ => { 97 | // If there is a non-trivial prefix, we can usually speed up matching by deleting 98 | // transitions that return to the start state. That way, instead of returning to 99 | // the start state, we will just fail to match. 
Then we get to search for the 100 | // prefix before trying to match again. 101 | let f_dfa = f_dfa.cut_loop_to_init().optimize(); 102 | f_prog = f_dfa.compile(); 103 | }, 104 | } 105 | 106 | Ok(ForwardBackwardEngine::new(f_prog, prefix, b_prog)) 107 | } 108 | 109 | /// Returns the index range of the first match, if there is a match. The indices returned are 110 | /// byte indices of the string. The first index is inclusive; the second is exclusive. 111 | pub fn find(&self, s: &str) -> Option<(usize, usize)> { 112 | if let Some((start, end, look_behind)) = self.engine.find(s) { 113 | Some((start + look_behind as usize, end)) 114 | } else { 115 | None 116 | } 117 | } 118 | 119 | pub fn is_match(&self, s: &str) -> bool { 120 | // TODO: for the forward-backward engine, this could be faster because we don't need 121 | // to run backward. 122 | self.find(s).is_some() 123 | } 124 | } 125 | 126 | -------------------------------------------------------------------------------- /src/runner/anchored.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use std::fmt::Debug; 10 | use runner::Engine; 11 | use runner::program::TableInsts; 12 | 13 | #[derive(Clone, Debug)] 14 | pub struct AnchoredEngine { 15 | prog: TableInsts, 16 | } 17 | 18 | impl AnchoredEngine { 19 | pub fn new(prog: TableInsts) -> AnchoredEngine { 20 | AnchoredEngine { 21 | prog: prog, 22 | } 23 | } 24 | } 25 | 26 | impl Engine for AnchoredEngine { 27 | fn find(&self, s: &str) -> Option<(usize, usize, Ret)> { 28 | let input = s.as_bytes(); 29 | if self.prog.is_empty() { 30 | None 31 | } else if let Ok(end) = self.prog.find_from(input, 0, 0) { 32 | Some((0, end.0, end.1)) 33 | } else { 34 | None 35 | } 36 | } 37 | 38 | fn clone_box(&self) -> Box> { 39 | Box::new(self.clone()) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/runner/forward_backward.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 
8 | 9 | use std::fmt::Debug; 10 | //use dfa::{Dfa, PrefixPart, RetTrait}; 11 | use dfa::PrefixPart; 12 | use itertools::Itertools; 13 | use memchr::memchr; 14 | use runner::Engine; 15 | use runner::program::TableInsts; 16 | 17 | #[derive(Clone, Debug)] 18 | pub struct ForwardBackwardEngine { 19 | forward: TableInsts<(usize, u8)>, 20 | backward: TableInsts, 21 | prefix: Prefix, 22 | } 23 | 24 | impl ForwardBackwardEngine { 25 | pub fn new(forward: TableInsts<(usize, u8)>, prefix: Prefix, backward: TableInsts) -> Self { 26 | ForwardBackwardEngine { 27 | forward: forward, 28 | backward: backward, 29 | prefix: prefix, 30 | } 31 | } 32 | 33 | fn find_with_searcher(&self, input: &[u8], search: SearchFn) 34 | -> Option<(usize, usize, Ret)> 35 | where SearchFn: Fn(&[u8], usize) -> Option { 36 | let mut pos = 0; 37 | while let Some(start) = search(input, pos) { 38 | match self.forward.find_from(input, start, 0) { 39 | Ok((end, (rev_state, look_ahead))) => { 40 | let rev_pos = end.saturating_sub(look_ahead as usize); 41 | let (start_pos, ret) = self.backward 42 | .longest_backward_find_from(input, rev_pos, rev_state) 43 | .expect("BUG: matched forward but failed to match backward"); 44 | return Some((start_pos, rev_pos, ret)); 45 | 46 | }, 47 | Err(end) => { 48 | pos = end + 1; 49 | }, 50 | } 51 | } 52 | 53 | None 54 | } 55 | 56 | } 57 | 58 | impl Engine for ForwardBackwardEngine { 59 | fn find(&self, s: &str) -> Option<(usize, usize, Ret)> { 60 | let input = s.as_bytes(); 61 | if self.forward.is_empty() { 62 | return None; 63 | } 64 | 65 | match self.prefix { 66 | Prefix::Empty => self.find_with_searcher( 67 | input, 68 | |s, pos| if pos <= s.len() { Some(pos) } else { None } 69 | ), 70 | Prefix::ByteSet { ref bytes, offset } => self.find_with_searcher( 71 | input, 72 | |s, pos| if pos + offset <= s.len() { 73 | s[(pos + offset)..].iter().position(|c| bytes[*c as usize]).map(|x| x + pos) 74 | } else { 75 | None 76 | } 77 | ), 78 | Prefix::Byte { byte, offset } => 
self.find_with_searcher( 79 | input, 80 | |s, pos| if pos + offset <= s.len() { 81 | memchr(byte, &input[(pos + offset)..]).map(|x| x + pos) 82 | } else { 83 | None 84 | } 85 | ), 86 | //Prefix::ByteBackwards { .. } => unimplemented!(), 87 | } 88 | } 89 | 90 | fn clone_box(&self) -> Box> { 91 | Box::new(self.clone()) 92 | } 93 | } 94 | 95 | /// A `Prefix` is the first part of a DFA. Anything matching the DFA should start with 96 | /// something matching the `Prefix`. 97 | /// 98 | /// The purpose of a `Prefix` is that scanning through the input looking for the `Prefix` should be 99 | /// much faster than running the DFA naively. 100 | #[derive(Clone, Debug)] 101 | pub enum Prefix { 102 | // Matches every position. 103 | Empty, 104 | // Matches a single byte in a particular set and then rewinds some number of bytes. 105 | ByteSet { bytes: Vec, offset: usize }, 106 | // Matches a specific byte and then rewinds some number of bytes. 107 | Byte { byte: u8, offset: usize }, 108 | // Matches a specific byte and then runs a DFA backwards. 109 | //ByteBackwards { byte: u8, rev: Dfa<()> }, 110 | } 111 | 112 | // How big we allow the byte sets to be. In order for byte sets to be a performance win, finding a 113 | // byte in the set needs to be sufficiently rare; therefore, we only use small sets. There might be 114 | // room for a better heuristic, though: we could use large sets that only have rare bytes. 
115 | const MAX_BYTE_SET_SIZE: usize = 16; 116 | 117 | impl Prefix { 118 | fn byte_prefix(parts: &[PrefixPart]) -> Option { 119 | fn common_prefix<'a>(s1: &'a [u8], s2: &'a [u8]) -> &'a [u8] { 120 | let prefix_len = s1.iter().zip(s2.iter()) 121 | .take_while(|pair| pair.0 == pair.1) 122 | .count(); 123 | &s1[0..prefix_len] 124 | } 125 | 126 | let mut parts = parts.iter(); 127 | if let Some(first) = parts.next() { 128 | let lit = parts.fold(&first.0[..], |acc, p| common_prefix(acc, &p.0)); 129 | if !lit.is_empty() { 130 | // See if the common prefix contains a full codepoint. If it does, search for the last 131 | // byte of that codepoint. 132 | let cp_last_byte = ((!lit[0]).leading_zeros() as usize).saturating_sub(1); 133 | if cp_last_byte < lit.len() { 134 | return Some(Prefix::Byte { byte: lit[cp_last_byte], offset: cp_last_byte }); 135 | } 136 | } 137 | } 138 | 139 | None 140 | } 141 | 142 | fn byte_set_prefix(parts: &[PrefixPart]) -> Option { 143 | let crit_byte_pos = |p: &PrefixPart| ((!p.0[0]).leading_zeros() as usize).saturating_sub(1); 144 | let crit_byte_posns: Vec = parts.iter().map(crit_byte_pos).dedup().collect(); 145 | 146 | if crit_byte_posns.len() == 1 { 147 | let crit_byte = crit_byte_posns[0]; 148 | if parts.iter().all(|x| x.0.len() > crit_byte) { 149 | let mut crit_bytes: Vec = parts.iter().map(|x| x.0[crit_byte]).collect(); 150 | crit_bytes.sort(); 151 | crit_bytes.dedup(); 152 | 153 | if crit_bytes.len() <= MAX_BYTE_SET_SIZE { 154 | let mut ret = vec![false; 256]; 155 | for &b in &crit_bytes { 156 | ret[b as usize] = true; 157 | } 158 | return Some(Prefix::ByteSet { bytes: ret, offset: crit_byte }); 159 | } 160 | } 161 | } 162 | 163 | None 164 | } 165 | 166 | /* 167 | pub fn from_dfa(dfa: &Dfa) -> Prefix { 168 | let parts = dfa.prefix_strings(); 169 | let first_try = Prefix::from_parts(parts); 170 | 171 | /* 172 | match first_try { 173 | Prefix::Byte {..} => first_try, 174 | _ => { 175 | let crit_strings = dfa.critical_strings(); 176 | 
// NOTE(review): tail of the commented-out `from_dfa` sketch, then `from_parts` — the public entry
// point: drop empty parts, then fall back `byte_prefix` -> `byte_set_prefix` -> `Prefix::Empty`.
// The `#[cfg(test)]` module pins which variant each input set selects (single common prefix =>
// `Byte`, shared critical position with distinct bytes => `ByteSet`, nothing usable => `Empty`).
// The span ends with the start of src/runner/mod.rs: the object-safe `Engine` trait (`find`
// returning match bounds plus a `Ret` payload, and `clone_box` for cloning trait objects).
// Extraction stripped generics here too: `Vec` in `from_parts` was presumably `Vec<PrefixPart>`,
// `Box>` presumably `Box<Engine<Ret>>`, and the trait header likely read
// `pub trait Engine<Ret>: Debug` (`Ret` is used in its signatures but not declared) — TODO
// confirm against upstream.
unimplemented!(); 177 | first_try 178 | }, 179 | } 180 | */ 181 | unimplemented!(); 182 | } 183 | */ 184 | 185 | /// Converts a set of `PrefixParts` into a `Prefix` that matches any of the strings. 186 | pub fn from_parts(mut parts: Vec) -> Prefix { 187 | parts.retain(|x| !x.0.is_empty()); 188 | 189 | if let Some(pref) = Prefix::byte_prefix(&parts) { 190 | pref 191 | } else if let Some(pref) = Prefix::byte_set_prefix(&parts) { 192 | pref 193 | } else { 194 | Prefix::Empty 195 | } 196 | } 197 | } 198 | 199 | #[cfg(test)] 200 | mod tests { 201 | use dfa::PrefixPart; 202 | use super::*; 203 | 204 | fn pref(strs: Vec<&str>) -> Prefix { 205 | Prefix::from_parts( 206 | strs.into_iter() 207 | .enumerate() 208 | .map(|(i, s)| PrefixPart(s.as_bytes().to_vec(), i)) 209 | .collect()) 210 | } 211 | 212 | #[test] 213 | fn test_prefix_choice() { 214 | use super::Prefix::*; 215 | 216 | assert!(matches!(pref(vec![]), Empty)); 217 | assert!(matches!(pref(vec![""]), Empty)); 218 | assert!(matches!(pref(vec!["a"]), Byte {..})); 219 | assert!(matches!(pref(vec!["", "a", ""]), Byte {..})); 220 | assert!(matches!(pref(vec!["abc"]), Byte {..})); 221 | assert!(matches!(pref(vec!["abc", ""]), Byte {..})); 222 | assert!(matches!(pref(vec!["a", "b", "c"]), ByteSet {..})); 223 | assert!(matches!(pref(vec!["a", "b", "", "c"]), ByteSet {..})); 224 | assert!(matches!(pref(vec!["a", "baa", "", "c"]), ByteSet {..})); 225 | assert!(matches!(pref(vec!["ab", "baa", "", "cb"]), ByteSet {..})); 226 | assert!(matches!(pref(vec!["ab", "aaa", "", "acb"]), Byte {..})); 227 | assert!(matches!(pref(vec!["ab", "abc", "abd"]), Byte {..})); 228 | } 229 | } 230 | 231 | -------------------------------------------------------------------------------- /src/runner/mod.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | 3 | pub trait Engine: Debug { 4 | fn find(&self, s: &str) -> Option<(usize, usize, Ret)>; 5 | fn clone_box(&self) -> Box>; 6 | } 7 | 8 |
// NOTE(review): remainder of src/runner/mod.rs (submodule declarations) followed by all of
// src/runner/program.rs: `TableInsts`, a table-driven DFA. Transitions live in a flat
// `(1 << log_num_classes) x num_instructions` table indexed by
// `(state << log_num_classes) + byte_class[b]`, with `u32::MAX` marking "no transition" (a dead
// state). `find_from` scans forward, recording the most recent `accept` hit as a provisional
// result; on reaching a dead state it returns the last hit, or `Err(pos)` if there was none, and
// at end of input it prefers an `accept_at_eoi` hit over a plain `accept` hit.
// `longest_backward_find_from` is the mirror image, walking `input[..pos]` in reverse (the
// `pos + 1` on an accept converts the reversed index back to a forward slice boundary).
// `find_from` manually inlines `next_state` and phrases the dead-state test as
// `state >= accept.len()` to help LLVM elide bounds checks — do not "simplify" without
// re-benchmarking; the surrounding comments record measured effects. Uses pre-1.13 `try!` rather
// than `?`, consistent with the crate's 2015-2016 era. Extraction damage: generic parameters are
// stripped throughout (`TableInsts` is presumably `TableInsts<Ret>`, `Vec` presumably
// `Vec<TableStateIdx>`/`Vec<u8>`, `Vec>` presumably `Vec<Option<Ret>>`), and line 4 of the
// license header (a URL in angle brackets) is missing — TODO restore from upstream.
pub mod anchored; 9 | pub mod forward_backward; 10 | pub mod program; 11 | -------------------------------------------------------------------------------- /src/runner/program.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2015-2016 Joe Neeman. 2 | // 3 | // Licensed under the Apache License, Version 2.0 or the MIT license 5 | // , at your 6 | // option. This file may not be copied, modified, or distributed 7 | // except according to those terms. 8 | 9 | use std::fmt::{Debug, Formatter, Error as FmtError}; 10 | use std::u32; 11 | 12 | pub type TableStateIdx = u32; 13 | 14 | /// A DFA program implemented as a lookup table. 15 | #[derive(Clone)] 16 | pub struct TableInsts { 17 | /// The log (rounded up) of the number of different equivalence classes of bytes. 18 | // We could save a bit more memory by storing the actual number instead of the log, because 19 | // then `table` could have length num_classes x num_instructions. However, then we need to 20 | // multiply (instead of just shifting) to look up the next state, and that slows us down by 21 | // 10-20%. 22 | // 23 | // TODO: we can probably save more memory by splitting classes into ASCII/non-ASCII. Often, 24 | // many states share the same non-ASCII transitions, so those tables can be merged. 25 | pub log_num_classes: u32, 26 | /// A vec of length 256 mapping from bytes to their class indices. 27 | pub byte_class: Vec, 28 | /// A `(1 << log_num_classes) x num_instructions`-long table. 29 | /// 30 | /// For a given input byte `b` in state `state`, we look up the next state using 31 | /// `table[state << log_num_classes + b]`. 32 | pub table: Vec, 33 | /// If `accept[st]` is not `None` then `st` is accepting, and `accept[st]` is the data 34 | /// to return. 35 | pub accept: Vec>, 36 | /// Same as `accept`, but applies only at the end of the input. 
37 | pub accept_at_eoi: Vec>, 38 | } 39 | 40 | impl Debug for TableInsts { 41 | fn fmt(&self, f: &mut Formatter) -> Result<(), FmtError> { 42 | try!(f.write_fmt(format_args!("TableInsts ({} log_classes, {} instructions):\n", 43 | self.log_num_classes, 44 | self.accept.len()))); 45 | try!(f.write_str("Byte classes: ")); 46 | try!(f.debug_map() 47 | .entries((0..256).map(|b| (b, self.byte_class[b]))) 48 | .finish()); 49 | 50 | let num_classes = 1 << self.log_num_classes; 51 | for idx in 0..self.accept.len() { 52 | try!(f.write_fmt(format_args!("State {}:\n", idx))); 53 | try!(f.debug_map() 54 | .entries((0usize..num_classes) 55 | .map(|c| (c, self.table[(idx << self.log_num_classes) + c])) 56 | .filter(|x| x.1 != u32::MAX)) 57 | .finish()); 58 | try!(f.write_str("\n")); 59 | } 60 | 61 | try!(f.write_str("Accept: ")); 62 | for idx in 0..self.accept.len() { 63 | if let Some(ref ret) = self.accept[idx] { 64 | try!(f.write_fmt(format_args!("{} -> {:?}, ", idx, ret))); 65 | } 66 | } 67 | 68 | try!(f.write_str("Accept_at_eoi: ")); 69 | for idx in 0..self.accept_at_eoi.len() { 70 | if let Some(ref ret) = self.accept_at_eoi[idx] { 71 | try!(f.write_fmt(format_args!("{} -> {:?}, ", idx, ret))); 72 | } 73 | } 74 | Ok(()) 75 | } 76 | } 77 | 78 | impl TableInsts { 79 | fn next_state(&self, state: usize, input: u8) -> Option { 80 | let class = self.byte_class[input as usize]; 81 | let next_state = self.table[(state << self.log_num_classes) + class as usize]; 82 | if next_state != u32::MAX { 83 | Some(next_state as usize) 84 | } else { 85 | None 86 | } 87 | } 88 | 89 | pub fn num_states(&self) -> usize { 90 | self.accept.len() 91 | } 92 | 93 | pub fn find_from(&self, input: &[u8], pos: usize, state: usize) 94 | -> Result<(usize, Ret), usize> { 95 | let mut state = state as u32; 96 | let mut ret = Err(input.len()); 97 | 98 | if state as usize >= self.accept.len() { 99 | panic!("BUG"); 100 | } 101 | for pos in pos..input.len() { 102 | if let Some(accept_ret) = self.accept[state as 
usize] { 103 | ret = Ok((pos, accept_ret)); 104 | } 105 | 106 | // We've manually inlined next_state here, for better performance (measurably better 107 | // than using #[inline(always)]). 108 | // For some reason, these bounds checks (even though LLVM leaves them in) don't seem to 109 | // hurt performance. 110 | let class = self.byte_class[input[pos] as usize]; 111 | state = self.table[((state as usize) << self.log_num_classes) + class as usize]; 112 | 113 | // Since everything in `self.table` is either a valid state or u32::MAX, this is the 114 | // same as checking if state == u32::MAX. We write it this way in the hope that 115 | // rustc/LLVM will be able to elide the bounds check at the top of the loop. 116 | if state as usize >= self.accept.len() { 117 | if ret.is_err() { 118 | return Err(pos); 119 | } 120 | break; 121 | } 122 | } 123 | 124 | // If we made it to the end of the input, prefer a return value that is specific to EOI 125 | // over one that can occur anywhere. 126 | if (state as usize) < self.accept.len() { 127 | if let Some(accept_ret) = self.accept_at_eoi[state as usize] { 128 | return Ok((input.len(), accept_ret)) 129 | } 130 | } 131 | ret 132 | } 133 | 134 | pub fn longest_backward_find_from(&self, input: &[u8], pos: usize, mut state: usize) 135 | -> Option<(usize, Ret)> { 136 | let mut ret = None; 137 | for pos in (0..pos).rev() { 138 | if let Some(next_ret) = self.accept[state] { 139 | ret = Some((pos + 1, next_ret)); 140 | } 141 | if let Some(next_state) = self.next_state(state, input[pos]) { 142 | state = next_state; 143 | } else { 144 | return ret; 145 | } 146 | } 147 | 148 | if let Some(end_ret) = self.accept_at_eoi[state] { 149 | Some((0, end_ret)) 150 | } else { 151 | ret 152 | } 153 | } 154 | 155 | pub fn is_empty(&self) -> bool { 156 | self.num_states() == 0 157 | } 158 | } 159 | 160 | -------------------------------------------------------------------------------- /src/unicode.rs: 
-------------------------------------------------------------------------------- 1 | // TODO: This was copied from the regex-syntax crate. At some point, this should presumably live in 2 | // a third crate. 3 | pub const PERLW: &'static [(char, char)] = &[ 4 | ('\u{30}', '\u{39}'), ('\u{41}', '\u{5a}'), ('\u{5f}', '\u{5f}'), 5 | ('\u{61}', '\u{7a}'), ('\u{aa}', '\u{aa}'), ('\u{b5}', '\u{b5}'), 6 | ('\u{ba}', '\u{ba}'), ('\u{c0}', '\u{d6}'), ('\u{d8}', '\u{f6}'), 7 | ('\u{f8}', '\u{2c1}'), ('\u{2c6}', '\u{2d1}'), ('\u{2e0}', '\u{2e4}'), 8 | ('\u{2ec}', '\u{2ec}'), ('\u{2ee}', '\u{2ee}'), ('\u{300}', '\u{374}'), 9 | ('\u{376}', '\u{377}'), ('\u{37a}', '\u{37d}'), ('\u{37f}', '\u{37f}'), 10 | ('\u{386}', '\u{386}'), ('\u{388}', '\u{38a}'), ('\u{38c}', '\u{38c}'), 11 | ('\u{38e}', '\u{3a1}'), ('\u{3a3}', '\u{3f5}'), ('\u{3f7}', '\u{481}'), 12 | ('\u{483}', '\u{52f}'), ('\u{531}', '\u{556}'), ('\u{559}', '\u{559}'), 13 | ('\u{561}', '\u{587}'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), 14 | ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), 15 | ('\u{5d0}', '\u{5ea}'), ('\u{5f0}', '\u{5f2}'), ('\u{610}', '\u{61a}'), 16 | ('\u{620}', '\u{669}'), ('\u{66e}', '\u{6d3}'), ('\u{6d5}', '\u{6dc}'), 17 | ('\u{6df}', '\u{6e8}'), ('\u{6ea}', '\u{6fc}'), ('\u{6ff}', '\u{6ff}'), 18 | ('\u{710}', '\u{74a}'), ('\u{74d}', '\u{7b1}'), ('\u{7c0}', '\u{7f5}'), 19 | ('\u{7fa}', '\u{7fa}'), ('\u{800}', '\u{82d}'), ('\u{840}', '\u{85b}'), 20 | ('\u{8a0}', '\u{8b4}'), ('\u{8e3}', '\u{963}'), ('\u{966}', '\u{96f}'), 21 | ('\u{971}', '\u{983}'), ('\u{985}', '\u{98c}'), ('\u{98f}', '\u{990}'), 22 | ('\u{993}', '\u{9a8}'), ('\u{9aa}', '\u{9b0}'), ('\u{9b2}', '\u{9b2}'), 23 | ('\u{9b6}', '\u{9b9}'), ('\u{9bc}', '\u{9c4}'), ('\u{9c7}', '\u{9c8}'), 24 | ('\u{9cb}', '\u{9ce}'), ('\u{9d7}', '\u{9d7}'), ('\u{9dc}', '\u{9dd}'), 25 | ('\u{9df}', '\u{9e3}'), ('\u{9e6}', '\u{9f1}'), ('\u{a01}', '\u{a03}'), 26 | ('\u{a05}', '\u{a0a}'), ('\u{a0f}', '\u{a10}'), ('\u{a13}', 
'\u{a28}'), 27 | ('\u{a2a}', '\u{a30}'), ('\u{a32}', '\u{a33}'), ('\u{a35}', '\u{a36}'), 28 | ('\u{a38}', '\u{a39}'), ('\u{a3c}', '\u{a3c}'), ('\u{a3e}', '\u{a42}'), 29 | ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), 30 | ('\u{a59}', '\u{a5c}'), ('\u{a5e}', '\u{a5e}'), ('\u{a66}', '\u{a75}'), 31 | ('\u{a81}', '\u{a83}'), ('\u{a85}', '\u{a8d}'), ('\u{a8f}', '\u{a91}'), 32 | ('\u{a93}', '\u{aa8}'), ('\u{aaa}', '\u{ab0}'), ('\u{ab2}', '\u{ab3}'), 33 | ('\u{ab5}', '\u{ab9}'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', '\u{ac9}'), 34 | ('\u{acb}', '\u{acd}'), ('\u{ad0}', '\u{ad0}'), ('\u{ae0}', '\u{ae3}'), 35 | ('\u{ae6}', '\u{aef}'), ('\u{af9}', '\u{af9}'), ('\u{b01}', '\u{b03}'), 36 | ('\u{b05}', '\u{b0c}'), ('\u{b0f}', '\u{b10}'), ('\u{b13}', '\u{b28}'), 37 | ('\u{b2a}', '\u{b30}'), ('\u{b32}', '\u{b33}'), ('\u{b35}', '\u{b39}'), 38 | ('\u{b3c}', '\u{b44}'), ('\u{b47}', '\u{b48}'), ('\u{b4b}', '\u{b4d}'), 39 | ('\u{b56}', '\u{b57}'), ('\u{b5c}', '\u{b5d}'), ('\u{b5f}', '\u{b63}'), 40 | ('\u{b66}', '\u{b6f}'), ('\u{b71}', '\u{b71}'), ('\u{b82}', '\u{b83}'), 41 | ('\u{b85}', '\u{b8a}'), ('\u{b8e}', '\u{b90}'), ('\u{b92}', '\u{b95}'), 42 | ('\u{b99}', '\u{b9a}'), ('\u{b9c}', '\u{b9c}'), ('\u{b9e}', '\u{b9f}'), 43 | ('\u{ba3}', '\u{ba4}'), ('\u{ba8}', '\u{baa}'), ('\u{bae}', '\u{bb9}'), 44 | ('\u{bbe}', '\u{bc2}'), ('\u{bc6}', '\u{bc8}'), ('\u{bca}', '\u{bcd}'), 45 | ('\u{bd0}', '\u{bd0}'), ('\u{bd7}', '\u{bd7}'), ('\u{be6}', '\u{bef}'), 46 | ('\u{c00}', '\u{c03}'), ('\u{c05}', '\u{c0c}'), ('\u{c0e}', '\u{c10}'), 47 | ('\u{c12}', '\u{c28}'), ('\u{c2a}', '\u{c39}'), ('\u{c3d}', '\u{c44}'), 48 | ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), 49 | ('\u{c58}', '\u{c5a}'), ('\u{c60}', '\u{c63}'), ('\u{c66}', '\u{c6f}'), 50 | ('\u{c81}', '\u{c83}'), ('\u{c85}', '\u{c8c}'), ('\u{c8e}', '\u{c90}'), 51 | ('\u{c92}', '\u{ca8}'), ('\u{caa}', '\u{cb3}'), ('\u{cb5}', '\u{cb9}'), 52 | ('\u{cbc}', '\u{cc4}'), ('\u{cc6}', '\u{cc8}'), 
('\u{cca}', '\u{ccd}'), 53 | ('\u{cd5}', '\u{cd6}'), ('\u{cde}', '\u{cde}'), ('\u{ce0}', '\u{ce3}'), 54 | ('\u{ce6}', '\u{cef}'), ('\u{cf1}', '\u{cf2}'), ('\u{d01}', '\u{d03}'), 55 | ('\u{d05}', '\u{d0c}'), ('\u{d0e}', '\u{d10}'), ('\u{d12}', '\u{d3a}'), 56 | ('\u{d3d}', '\u{d44}'), ('\u{d46}', '\u{d48}'), ('\u{d4a}', '\u{d4e}'), 57 | ('\u{d57}', '\u{d57}'), ('\u{d5f}', '\u{d63}'), ('\u{d66}', '\u{d6f}'), 58 | ('\u{d7a}', '\u{d7f}'), ('\u{d82}', '\u{d83}'), ('\u{d85}', '\u{d96}'), 59 | ('\u{d9a}', '\u{db1}'), ('\u{db3}', '\u{dbb}'), ('\u{dbd}', '\u{dbd}'), 60 | ('\u{dc0}', '\u{dc6}'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), 61 | ('\u{dd6}', '\u{dd6}'), ('\u{dd8}', '\u{ddf}'), ('\u{de6}', '\u{def}'), 62 | ('\u{df2}', '\u{df3}'), ('\u{e01}', '\u{e3a}'), ('\u{e40}', '\u{e4e}'), 63 | ('\u{e50}', '\u{e59}'), ('\u{e81}', '\u{e82}'), ('\u{e84}', '\u{e84}'), 64 | ('\u{e87}', '\u{e88}'), ('\u{e8a}', '\u{e8a}'), ('\u{e8d}', '\u{e8d}'), 65 | ('\u{e94}', '\u{e97}'), ('\u{e99}', '\u{e9f}'), ('\u{ea1}', '\u{ea3}'), 66 | ('\u{ea5}', '\u{ea5}'), ('\u{ea7}', '\u{ea7}'), ('\u{eaa}', '\u{eab}'), 67 | ('\u{ead}', '\u{eb9}'), ('\u{ebb}', '\u{ebd}'), ('\u{ec0}', '\u{ec4}'), 68 | ('\u{ec6}', '\u{ec6}'), ('\u{ec8}', '\u{ecd}'), ('\u{ed0}', '\u{ed9}'), 69 | ('\u{edc}', '\u{edf}'), ('\u{f00}', '\u{f00}'), ('\u{f18}', '\u{f19}'), 70 | ('\u{f20}', '\u{f29}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), 71 | ('\u{f39}', '\u{f39}'), ('\u{f3e}', '\u{f47}'), ('\u{f49}', '\u{f6c}'), 72 | ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), 73 | ('\u{fc6}', '\u{fc6}'), ('\u{1000}', '\u{1049}'), ('\u{1050}', 74 | '\u{109d}'), ('\u{10a0}', '\u{10c5}'), ('\u{10c7}', '\u{10c7}'), 75 | ('\u{10cd}', '\u{10cd}'), ('\u{10d0}', '\u{10fa}'), ('\u{10fc}', 76 | '\u{1248}'), ('\u{124a}', '\u{124d}'), ('\u{1250}', '\u{1256}'), 77 | ('\u{1258}', '\u{1258}'), ('\u{125a}', '\u{125d}'), ('\u{1260}', 78 | '\u{1288}'), ('\u{128a}', '\u{128d}'), ('\u{1290}', '\u{12b0}'), 79 | ('\u{12b2}', 
'\u{12b5}'), ('\u{12b8}', '\u{12be}'), ('\u{12c0}', 80 | '\u{12c0}'), ('\u{12c2}', '\u{12c5}'), ('\u{12c8}', '\u{12d6}'), 81 | ('\u{12d8}', '\u{1310}'), ('\u{1312}', '\u{1315}'), ('\u{1318}', 82 | '\u{135a}'), ('\u{135d}', '\u{135f}'), ('\u{1380}', '\u{138f}'), 83 | ('\u{13a0}', '\u{13f5}'), ('\u{13f8}', '\u{13fd}'), ('\u{1401}', 84 | '\u{166c}'), ('\u{166f}', '\u{167f}'), ('\u{1681}', '\u{169a}'), 85 | ('\u{16a0}', '\u{16ea}'), ('\u{16ee}', '\u{16f8}'), ('\u{1700}', 86 | '\u{170c}'), ('\u{170e}', '\u{1714}'), ('\u{1720}', '\u{1734}'), 87 | ('\u{1740}', '\u{1753}'), ('\u{1760}', '\u{176c}'), ('\u{176e}', 88 | '\u{1770}'), ('\u{1772}', '\u{1773}'), ('\u{1780}', '\u{17d3}'), 89 | ('\u{17d7}', '\u{17d7}'), ('\u{17dc}', '\u{17dd}'), ('\u{17e0}', 90 | '\u{17e9}'), ('\u{180b}', '\u{180d}'), ('\u{1810}', '\u{1819}'), 91 | ('\u{1820}', '\u{1877}'), ('\u{1880}', '\u{18aa}'), ('\u{18b0}', 92 | '\u{18f5}'), ('\u{1900}', '\u{191e}'), ('\u{1920}', '\u{192b}'), 93 | ('\u{1930}', '\u{193b}'), ('\u{1946}', '\u{196d}'), ('\u{1970}', 94 | '\u{1974}'), ('\u{1980}', '\u{19ab}'), ('\u{19b0}', '\u{19c9}'), 95 | ('\u{19d0}', '\u{19d9}'), ('\u{1a00}', '\u{1a1b}'), ('\u{1a20}', 96 | '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a89}'), 97 | ('\u{1a90}', '\u{1a99}'), ('\u{1aa7}', '\u{1aa7}'), ('\u{1ab0}', 98 | '\u{1abe}'), ('\u{1b00}', '\u{1b4b}'), ('\u{1b50}', '\u{1b59}'), 99 | ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '\u{1bf3}'), ('\u{1c00}', 100 | '\u{1c37}'), ('\u{1c40}', '\u{1c49}'), ('\u{1c4d}', '\u{1c7d}'), 101 | ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1cf6}'), ('\u{1cf8}', 102 | '\u{1cf9}'), ('\u{1d00}', '\u{1df5}'), ('\u{1dfc}', '\u{1f15}'), 103 | ('\u{1f18}', '\u{1f1d}'), ('\u{1f20}', '\u{1f45}'), ('\u{1f48}', 104 | '\u{1f4d}'), ('\u{1f50}', '\u{1f57}'), ('\u{1f59}', '\u{1f59}'), 105 | ('\u{1f5b}', '\u{1f5b}'), ('\u{1f5d}', '\u{1f5d}'), ('\u{1f5f}', 106 | '\u{1f7d}'), ('\u{1f80}', '\u{1fb4}'), ('\u{1fb6}', '\u{1fbc}'), 107 | ('\u{1fbe}', '\u{1fbe}'), ('\u{1fc2}', 
'\u{1fc4}'), ('\u{1fc6}', 108 | '\u{1fcc}'), ('\u{1fd0}', '\u{1fd3}'), ('\u{1fd6}', '\u{1fdb}'), 109 | ('\u{1fe0}', '\u{1fec}'), ('\u{1ff2}', '\u{1ff4}'), ('\u{1ff6}', 110 | '\u{1ffc}'), ('\u{200c}', '\u{200d}'), ('\u{203f}', '\u{2040}'), 111 | ('\u{2054}', '\u{2054}'), ('\u{2071}', '\u{2071}'), ('\u{207f}', 112 | '\u{207f}'), ('\u{2090}', '\u{209c}'), ('\u{20d0}', '\u{20f0}'), 113 | ('\u{2102}', '\u{2102}'), ('\u{2107}', '\u{2107}'), ('\u{210a}', 114 | '\u{2113}'), ('\u{2115}', '\u{2115}'), ('\u{2119}', '\u{211d}'), 115 | ('\u{2124}', '\u{2124}'), ('\u{2126}', '\u{2126}'), ('\u{2128}', 116 | '\u{2128}'), ('\u{212a}', '\u{212d}'), ('\u{212f}', '\u{2139}'), 117 | ('\u{213c}', '\u{213f}'), ('\u{2145}', '\u{2149}'), ('\u{214e}', 118 | '\u{214e}'), ('\u{2160}', '\u{2188}'), ('\u{24b6}', '\u{24e9}'), 119 | ('\u{2c00}', '\u{2c2e}'), ('\u{2c30}', '\u{2c5e}'), ('\u{2c60}', 120 | '\u{2ce4}'), ('\u{2ceb}', '\u{2cf3}'), ('\u{2d00}', '\u{2d25}'), 121 | ('\u{2d27}', '\u{2d27}'), ('\u{2d2d}', '\u{2d2d}'), ('\u{2d30}', 122 | '\u{2d67}'), ('\u{2d6f}', '\u{2d6f}'), ('\u{2d7f}', '\u{2d96}'), 123 | ('\u{2da0}', '\u{2da6}'), ('\u{2da8}', '\u{2dae}'), ('\u{2db0}', 124 | '\u{2db6}'), ('\u{2db8}', '\u{2dbe}'), ('\u{2dc0}', '\u{2dc6}'), 125 | ('\u{2dc8}', '\u{2dce}'), ('\u{2dd0}', '\u{2dd6}'), ('\u{2dd8}', 126 | '\u{2dde}'), ('\u{2de0}', '\u{2dff}'), ('\u{2e2f}', '\u{2e2f}'), 127 | ('\u{3005}', '\u{3007}'), ('\u{3021}', '\u{302f}'), ('\u{3031}', 128 | '\u{3035}'), ('\u{3038}', '\u{303c}'), ('\u{3041}', '\u{3096}'), 129 | ('\u{3099}', '\u{309a}'), ('\u{309d}', '\u{309f}'), ('\u{30a1}', 130 | '\u{30fa}'), ('\u{30fc}', '\u{30ff}'), ('\u{3105}', '\u{312d}'), 131 | ('\u{3131}', '\u{318e}'), ('\u{31a0}', '\u{31ba}'), ('\u{31f0}', 132 | '\u{31ff}'), ('\u{3400}', '\u{4db5}'), ('\u{4e00}', '\u{9fd5}'), 133 | ('\u{a000}', '\u{a48c}'), ('\u{a4d0}', '\u{a4fd}'), ('\u{a500}', 134 | '\u{a60c}'), ('\u{a610}', '\u{a62b}'), ('\u{a640}', '\u{a672}'), 135 | ('\u{a674}', '\u{a67d}'), ('\u{a67f}', 
'\u{a6f1}'), ('\u{a717}', 136 | '\u{a71f}'), ('\u{a722}', '\u{a788}'), ('\u{a78b}', '\u{a7ad}'), 137 | ('\u{a7b0}', '\u{a7b7}'), ('\u{a7f7}', '\u{a827}'), ('\u{a840}', 138 | '\u{a873}'), ('\u{a880}', '\u{a8c4}'), ('\u{a8d0}', '\u{a8d9}'), 139 | ('\u{a8e0}', '\u{a8f7}'), ('\u{a8fb}', '\u{a8fb}'), ('\u{a8fd}', 140 | '\u{a8fd}'), ('\u{a900}', '\u{a92d}'), ('\u{a930}', '\u{a953}'), 141 | ('\u{a960}', '\u{a97c}'), ('\u{a980}', '\u{a9c0}'), ('\u{a9cf}', 142 | '\u{a9d9}'), ('\u{a9e0}', '\u{a9fe}'), ('\u{aa00}', '\u{aa36}'), 143 | ('\u{aa40}', '\u{aa4d}'), ('\u{aa50}', '\u{aa59}'), ('\u{aa60}', 144 | '\u{aa76}'), ('\u{aa7a}', '\u{aac2}'), ('\u{aadb}', '\u{aadd}'), 145 | ('\u{aae0}', '\u{aaef}'), ('\u{aaf2}', '\u{aaf6}'), ('\u{ab01}', 146 | '\u{ab06}'), ('\u{ab09}', '\u{ab0e}'), ('\u{ab11}', '\u{ab16}'), 147 | ('\u{ab20}', '\u{ab26}'), ('\u{ab28}', '\u{ab2e}'), ('\u{ab30}', 148 | '\u{ab5a}'), ('\u{ab5c}', '\u{ab65}'), ('\u{ab70}', '\u{abea}'), 149 | ('\u{abec}', '\u{abed}'), ('\u{abf0}', '\u{abf9}'), ('\u{ac00}', 150 | '\u{d7a3}'), ('\u{d7b0}', '\u{d7c6}'), ('\u{d7cb}', '\u{d7fb}'), 151 | ('\u{f900}', '\u{fa6d}'), ('\u{fa70}', '\u{fad9}'), ('\u{fb00}', 152 | '\u{fb06}'), ('\u{fb13}', '\u{fb17}'), ('\u{fb1d}', '\u{fb28}'), 153 | ('\u{fb2a}', '\u{fb36}'), ('\u{fb38}', '\u{fb3c}'), ('\u{fb3e}', 154 | '\u{fb3e}'), ('\u{fb40}', '\u{fb41}'), ('\u{fb43}', '\u{fb44}'), 155 | ('\u{fb46}', '\u{fbb1}'), ('\u{fbd3}', '\u{fd3d}'), ('\u{fd50}', 156 | '\u{fd8f}'), ('\u{fd92}', '\u{fdc7}'), ('\u{fdf0}', '\u{fdfb}'), 157 | ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('\u{fe33}', 158 | '\u{fe34}'), ('\u{fe4d}', '\u{fe4f}'), ('\u{fe70}', '\u{fe74}'), 159 | ('\u{fe76}', '\u{fefc}'), ('\u{ff10}', '\u{ff19}'), ('\u{ff21}', 160 | '\u{ff3a}'), ('\u{ff3f}', '\u{ff3f}'), ('\u{ff41}', '\u{ff5a}'), 161 | ('\u{ff66}', '\u{ffbe}'), ('\u{ffc2}', '\u{ffc7}'), ('\u{ffca}', 162 | '\u{ffcf}'), ('\u{ffd2}', '\u{ffd7}'), ('\u{ffda}', '\u{ffdc}'), 163 | ('\u{10000}', '\u{1000b}'), ('\u{1000d}', 
'\u{10026}'), ('\u{10028}', 164 | '\u{1003a}'), ('\u{1003c}', '\u{1003d}'), ('\u{1003f}', '\u{1004d}'), 165 | ('\u{10050}', '\u{1005d}'), ('\u{10080}', '\u{100fa}'), ('\u{10140}', 166 | '\u{10174}'), ('\u{101fd}', '\u{101fd}'), ('\u{10280}', '\u{1029c}'), 167 | ('\u{102a0}', '\u{102d0}'), ('\u{102e0}', '\u{102e0}'), ('\u{10300}', 168 | '\u{1031f}'), ('\u{10330}', '\u{1034a}'), ('\u{10350}', '\u{1037a}'), 169 | ('\u{10380}', '\u{1039d}'), ('\u{103a0}', '\u{103c3}'), ('\u{103c8}', 170 | '\u{103cf}'), ('\u{103d1}', '\u{103d5}'), ('\u{10400}', '\u{1049d}'), 171 | ('\u{104a0}', '\u{104a9}'), ('\u{10500}', '\u{10527}'), ('\u{10530}', 172 | '\u{10563}'), ('\u{10600}', '\u{10736}'), ('\u{10740}', '\u{10755}'), 173 | ('\u{10760}', '\u{10767}'), ('\u{10800}', '\u{10805}'), ('\u{10808}', 174 | '\u{10808}'), ('\u{1080a}', '\u{10835}'), ('\u{10837}', '\u{10838}'), 175 | ('\u{1083c}', '\u{1083c}'), ('\u{1083f}', '\u{10855}'), ('\u{10860}', 176 | '\u{10876}'), ('\u{10880}', '\u{1089e}'), ('\u{108e0}', '\u{108f2}'), 177 | ('\u{108f4}', '\u{108f5}'), ('\u{10900}', '\u{10915}'), ('\u{10920}', 178 | '\u{10939}'), ('\u{10980}', '\u{109b7}'), ('\u{109be}', '\u{109bf}'), 179 | ('\u{10a00}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', 180 | '\u{10a13}'), ('\u{10a15}', '\u{10a17}'), ('\u{10a19}', '\u{10a33}'), 181 | ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10a60}', 182 | '\u{10a7c}'), ('\u{10a80}', '\u{10a9c}'), ('\u{10ac0}', '\u{10ac7}'), 183 | ('\u{10ac9}', '\u{10ae6}'), ('\u{10b00}', '\u{10b35}'), ('\u{10b40}', 184 | '\u{10b55}'), ('\u{10b60}', '\u{10b72}'), ('\u{10b80}', '\u{10b91}'), 185 | ('\u{10c00}', '\u{10c48}'), ('\u{10c80}', '\u{10cb2}'), ('\u{10cc0}', 186 | '\u{10cf2}'), ('\u{11000}', '\u{11046}'), ('\u{11066}', '\u{1106f}'), 187 | ('\u{1107f}', '\u{110ba}'), ('\u{110d0}', '\u{110e8}'), ('\u{110f0}', 188 | '\u{110f9}'), ('\u{11100}', '\u{11134}'), ('\u{11136}', '\u{1113f}'), 189 | ('\u{11150}', '\u{11173}'), ('\u{11176}', '\u{11176}'), 
('\u{11180}', 190 | '\u{111c4}'), ('\u{111ca}', '\u{111cc}'), ('\u{111d0}', '\u{111da}'), 191 | ('\u{111dc}', '\u{111dc}'), ('\u{11200}', '\u{11211}'), ('\u{11213}', 192 | '\u{11237}'), ('\u{11280}', '\u{11286}'), ('\u{11288}', '\u{11288}'), 193 | ('\u{1128a}', '\u{1128d}'), ('\u{1128f}', '\u{1129d}'), ('\u{1129f}', 194 | '\u{112a8}'), ('\u{112b0}', '\u{112ea}'), ('\u{112f0}', '\u{112f9}'), 195 | ('\u{11300}', '\u{11303}'), ('\u{11305}', '\u{1130c}'), ('\u{1130f}', 196 | '\u{11310}'), ('\u{11313}', '\u{11328}'), ('\u{1132a}', '\u{11330}'), 197 | ('\u{11332}', '\u{11333}'), ('\u{11335}', '\u{11339}'), ('\u{1133c}', 198 | '\u{11344}'), ('\u{11347}', '\u{11348}'), ('\u{1134b}', '\u{1134d}'), 199 | ('\u{11350}', '\u{11350}'), ('\u{11357}', '\u{11357}'), ('\u{1135d}', 200 | '\u{11363}'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), 201 | ('\u{11480}', '\u{114c5}'), ('\u{114c7}', '\u{114c7}'), ('\u{114d0}', 202 | '\u{114d9}'), ('\u{11580}', '\u{115b5}'), ('\u{115b8}', '\u{115c0}'), 203 | ('\u{115d8}', '\u{115dd}'), ('\u{11600}', '\u{11640}'), ('\u{11644}', 204 | '\u{11644}'), ('\u{11650}', '\u{11659}'), ('\u{11680}', '\u{116b7}'), 205 | ('\u{116c0}', '\u{116c9}'), ('\u{11700}', '\u{11719}'), ('\u{1171d}', 206 | '\u{1172b}'), ('\u{11730}', '\u{11739}'), ('\u{118a0}', '\u{118e9}'), 207 | ('\u{118ff}', '\u{118ff}'), ('\u{11ac0}', '\u{11af8}'), ('\u{12000}', 208 | '\u{12399}'), ('\u{12400}', '\u{1246e}'), ('\u{12480}', '\u{12543}'), 209 | ('\u{13000}', '\u{1342e}'), ('\u{14400}', '\u{14646}'), ('\u{16800}', 210 | '\u{16a38}'), ('\u{16a40}', '\u{16a5e}'), ('\u{16a60}', '\u{16a69}'), 211 | ('\u{16ad0}', '\u{16aed}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b00}', 212 | '\u{16b36}'), ('\u{16b40}', '\u{16b43}'), ('\u{16b50}', '\u{16b59}'), 213 | ('\u{16b63}', '\u{16b77}'), ('\u{16b7d}', '\u{16b8f}'), ('\u{16f00}', 214 | '\u{16f44}'), ('\u{16f50}', '\u{16f7e}'), ('\u{16f8f}', '\u{16f9f}'), 215 | ('\u{1b000}', '\u{1b001}'), ('\u{1bc00}', '\u{1bc6a}'), ('\u{1bc70}', 216 | 
'\u{1bc7c}'), ('\u{1bc80}', '\u{1bc88}'), ('\u{1bc90}', '\u{1bc99}'), 217 | ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1d165}', '\u{1d169}'), ('\u{1d16d}', 218 | '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), 219 | ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1d400}', 220 | '\u{1d454}'), ('\u{1d456}', '\u{1d49c}'), ('\u{1d49e}', '\u{1d49f}'), 221 | ('\u{1d4a2}', '\u{1d4a2}'), ('\u{1d4a5}', '\u{1d4a6}'), ('\u{1d4a9}', 222 | '\u{1d4ac}'), ('\u{1d4ae}', '\u{1d4b9}'), ('\u{1d4bb}', '\u{1d4bb}'), 223 | ('\u{1d4bd}', '\u{1d4c3}'), ('\u{1d4c5}', '\u{1d505}'), ('\u{1d507}', 224 | '\u{1d50a}'), ('\u{1d50d}', '\u{1d514}'), ('\u{1d516}', '\u{1d51c}'), 225 | ('\u{1d51e}', '\u{1d539}'), ('\u{1d53b}', '\u{1d53e}'), ('\u{1d540}', 226 | '\u{1d544}'), ('\u{1d546}', '\u{1d546}'), ('\u{1d54a}', '\u{1d550}'), 227 | ('\u{1d552}', '\u{1d6a5}'), ('\u{1d6a8}', '\u{1d6c0}'), ('\u{1d6c2}', 228 | '\u{1d6da}'), ('\u{1d6dc}', '\u{1d6fa}'), ('\u{1d6fc}', '\u{1d714}'), 229 | ('\u{1d716}', '\u{1d734}'), ('\u{1d736}', '\u{1d74e}'), ('\u{1d750}', 230 | '\u{1d76e}'), ('\u{1d770}', '\u{1d788}'), ('\u{1d78a}', '\u{1d7a8}'), 231 | ('\u{1d7aa}', '\u{1d7c2}'), ('\u{1d7c4}', '\u{1d7cb}'), ('\u{1d7ce}', 232 | '\u{1d7ff}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), 233 | ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', 234 | '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('\u{1e800}', '\u{1e8c4}'), 235 | ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1ee00}', '\u{1ee03}'), ('\u{1ee05}', 236 | '\u{1ee1f}'), ('\u{1ee21}', '\u{1ee22}'), ('\u{1ee24}', '\u{1ee24}'), 237 | ('\u{1ee27}', '\u{1ee27}'), ('\u{1ee29}', '\u{1ee32}'), ('\u{1ee34}', 238 | '\u{1ee37}'), ('\u{1ee39}', '\u{1ee39}'), ('\u{1ee3b}', '\u{1ee3b}'), 239 | ('\u{1ee42}', '\u{1ee42}'), ('\u{1ee47}', '\u{1ee47}'), ('\u{1ee49}', 240 | '\u{1ee49}'), ('\u{1ee4b}', '\u{1ee4b}'), ('\u{1ee4d}', '\u{1ee4f}'), 241 | ('\u{1ee51}', '\u{1ee52}'), ('\u{1ee54}', '\u{1ee54}'), ('\u{1ee57}', 242 | '\u{1ee57}'), 
('\u{1ee59}', '\u{1ee59}'), ('\u{1ee5b}', '\u{1ee5b}'), 243 | ('\u{1ee5d}', '\u{1ee5d}'), ('\u{1ee5f}', '\u{1ee5f}'), ('\u{1ee61}', 244 | '\u{1ee62}'), ('\u{1ee64}', '\u{1ee64}'), ('\u{1ee67}', '\u{1ee6a}'), 245 | ('\u{1ee6c}', '\u{1ee72}'), ('\u{1ee74}', '\u{1ee77}'), ('\u{1ee79}', 246 | '\u{1ee7c}'), ('\u{1ee7e}', '\u{1ee7e}'), ('\u{1ee80}', '\u{1ee89}'), 247 | ('\u{1ee8b}', '\u{1ee9b}'), ('\u{1eea1}', '\u{1eea3}'), ('\u{1eea5}', 248 | '\u{1eea9}'), ('\u{1eeab}', '\u{1eebb}'), ('\u{1f130}', '\u{1f149}'), 249 | ('\u{1f150}', '\u{1f169}'), ('\u{1f170}', '\u{1f189}'), ('\u{20000}', 250 | '\u{2a6d6}'), ('\u{2a700}', '\u{2b734}'), ('\u{2b740}', '\u{2b81d}'), 251 | ('\u{2b820}', '\u{2cea1}'), ('\u{2f800}', '\u{2fa1d}'), ('\u{e0100}', 252 | '\u{e01ef}') 253 | ]; 254 | 255 | 256 | --------------------------------------------------------------------------------