├── .gitignore ├── .travis.yml ├── LICENSE-APACHE ├── LICENSE-MIT ├── Makefile ├── README.md ├── benchmark ├── README.md ├── golang ├── regex-dna │ ├── Makefile │ ├── README.md │ ├── regex-dna-single.rs │ ├── regex-dna.c │ ├── regex-dna.go │ ├── regex-dna.py │ ├── regex-dna.rs │ └── shootout-fasta.rs └── rust ├── cargo-lite.conf ├── ctags.rust ├── regex-match-tests.py ├── regex-unicode-tables.py ├── session.vim └── src ├── compile.rs ├── lib.rs ├── macro.rs ├── parse.rs ├── re.rs ├── test ├── bench.rs ├── matches.rs ├── mod.rs └── tests.rs ├── testdata ├── LICENSE ├── README ├── basic.dat ├── nullsubexpr.dat └── repetition.dat ├── unicode.rs └── vm.rs /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | doc 3 | tags 4 | build 5 | scratch.rs 6 | expanded.rs 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | before_install: 3 | - yes | sudo add-apt-repository ppa:hansjorg/rust 4 | - sudo apt-get update 5 | install: 6 | - sudo apt-get install rust-nightly 7 | script: 8 | - rustc -L . --crate-type lib ./src/lib.rs 9 | 10 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006-2009 Graydon Hoare 2 | Copyright (c) 2009-2014 Mozilla Foundation 3 | 4 | Permission is hereby granted, free of charge, to any 5 | person obtaining a copy of this software and associated 6 | documentation files (the "Software"), to deal in the 7 | Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software 11 | is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice 15 | shall be included in all copies or substantial portions 16 | of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 19 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 20 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 21 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 22 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 23 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 25 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 | DEALINGS IN THE SOFTWARE. 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RUSTC ?= rustc 2 | RUSTDOC ?= rustdoc 3 | BUILD_DIR ?= ./build 4 | RUST_PATH ?= $(BUILD_DIR) 5 | RUSTFLAGS ?= --opt-level=3 6 | RUSTTESTFLAGS ?= 7 | REGEXP_LIB ?= $(BUILD_DIR)/.libregex.timestamp 8 | REGEXP_LIB_FILES = src/compile.rs src/lib.rs src/parse.rs src/re.rs \ 9 | src/unicode.rs src/vm.rs 10 | REGEXP_MACRO_LIB ?= $(BUILD_DIR)/.libregex_macros.timestamp 11 | REGEXP_MACRO_LIB_FILES = src/macro.rs 12 | REGEXP_TEST_FILES = src/test/bench.rs src/test/matches.rs \ 13 | src/test/mod.rs src/test/tests.rs 14 | MOZILLA_RUST ?= $(HOME)/clones/rust 15 | REGEXP_DYN_FLAGS = 16 | 17 | ifdef REGEXP_DYNAMIC 18 | REGEXP_DYN_FLAGS = --cfg dynamic 19 | endif 20 | 21 | all: $(REGEXP_LIB) $(REGEXP_MACRO_LIB) 22 | 23 | install: 24 | cargo-lite install 25 | 26 | $(REGEXP_LIB): $(REGEXP_LIB_FILES) 27 | @mkdir -p $(BUILD_DIR) 28 | $(RUSTC) $(RUSTFLAGS) ./src/lib.rs --out-dir=$(BUILD_DIR) 29 | @touch $(REGEXP_LIB) 30 | 31 | $(REGEXP_MACRO_LIB): $(REGEXP_LIB) $(REGEXP_MACRO_LIB_FILES) 32 | @mkdir -p $(BUILD_DIR) 33 | $(RUSTC) -L $(BUILD_DIR) $(RUSTFLAGS) ./src/macro.rs --out-dir=$(BUILD_DIR) 34 | @touch $(REGEXP_MACRO_LIB) 35 | 36 | match-tests: 37 | ./regex-match-tests.py ./src/testdata/*.dat > ./src/test/matches.rs 38 | 39 | unicode-tables: 40 | ./regex-unicode-tables.py > ./src/unicode.rs 41 | 42 | docs: $(REGEXP_LIB_FILES) $(REGEXP_MACRO_LIB_FILES) 43 | rm -rf doc 44 | $(RUSTDOC) -L $(RUST_PATH) --test ./src/lib.rs 45 | $(RUSTDOC) -L $(RUST_PATH) ./src/lib.rs 46 | $(RUSTDOC) -L $(RUST_PATH) ./src/macro.rs 47 | # WTF is rustdoc doing? 
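# Fix up permissions on the generated doc/ tree before uploading it below.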
48 | chmod 755 doc 49 | in-dir doc fix-perms 50 | rscp ./doc/* gopher:~/www/burntsushi.net/rustdoc/ 51 | 52 | test: build/tests 53 | RUST_TEST_TASKS=1 RUST_LOG=regex ./build/tests 54 | 55 | build/tests: $(REGEXP_LIB) $(REGEXP_MACRO_LIB) $(REGEXP_TEST_FILES) 56 | $(RUSTC) $(RUSTTESTFLAGS) -L $(RUST_PATH) --test $(REGEXP_DYN_FLAGS) src/lib.rs -o ./build/tests 57 | 58 | bench: build/bench 59 | RUST_TEST_TASKS=1 RUST_LOG=regex ./build/bench --bench 60 | 61 | bench-perf: build/bench 62 | RUST_TEST_TASKS=1 RUST_LOG=regex perf record -g --call-graph dwarf -s ./build/bench --bench 63 | 64 | build/bench: $(REGEXP_LIB) $(REGEXP_MACRO_LIB) $(REGEXP_TEST_FILES) 65 | $(RUSTC) $(RUSTFLAGS) -g -Z lto -L $(RUST_PATH) --test --cfg bench $(REGEXP_DYN_FLAGS) src/lib.rs -o ./build/bench 66 | 67 | scratch: build/scratch 68 | RUST_TEST_TASKS=1 RUST_LOG=regex ./build/scratch 69 | 70 | build/scratch: $(REGEXP_MACRO_LIB) scratch.rs 71 | $(RUSTC) -L $(BUILD_DIR) $(RUSTTESTFLAGS) scratch.rs -o ./build/scratch 72 | 73 | ctags: 74 | ctags --recurse --options=ctags.rust --languages=Rust 75 | 76 | clean: 77 | rm -f $(BUILD_DIR)/.*.timestamp $(BUILD_DIR)/* 78 | 79 | push: 80 | git push origin master 81 | git push github master 82 | 83 | mozilla: 84 | mkdir -p $(MOZILLA_RUST)/src/libregex 85 | mkdir -p $(MOZILLA_RUST)/src/libregex_macros 86 | rm -rf $(MOZILLA_RUST)/src/libregex/* 87 | cp -a ./src/* $(MOZILLA_RUST)/src/libregex/ 88 | rm $(MOZILLA_RUST)/src/libregex/macro.rs 89 | cp ./src/macro.rs $(MOZILLA_RUST)/src/libregex_macros/lib.rs 90 | cp *.py $(MOZILLA_RUST)/src/etc/ 91 | cp ./benchmark/regex-dna/regex-dna.rs $(MOZILLA_RUST)/src/test/bench/shootout-regex-dna.rs 92 | 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Initial regexp code for [Rust's regex crate](https://github.com/rust-lang-nursery/regex). 2 | 3 | Do not use. 
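
For a sense of what the crate provides, here is a minimal usage sketch in the pre-1.0 Rust dialect this repository targets; it only exercises the `regex!` macro, `find_iter`, and `replace_all`/`NoExpand` calls that the programs under `benchmark/regex-dna/` already use, and the pattern strings are taken from those programs.

```
#![feature(macro_rules, phase)]

extern crate regex;
#[phase(syntax)] extern crate regex_macros;

use regex::NoExpand;

fn main() {
    // Compiled at build time by the regex! syntax extension.
    let re = regex!("agg[act]taaa|ttta[agt]cct");

    // Count matches, the same way count_matches() does in the benchmarks.
    let mut n = 0;
    for _ in re.find_iter("aggctaaa tttatcct") { n += 1; }
    println!("{} matches", n);

    // Literal (non-expanding) replacement, as the FASTA cleanup step does.
    let cleaned = regex!(">[^\n]*\n|\n").replace_all(">header\nacgt\n", NoExpand(""));
    println!("{}", cleaned);
}
```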
4 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | Rust 2 | ---- 3 | ``` 4 | rustc --opt-level=3 -Z lto -g --test --cfg bench src/lib.rs -o ./build/bench 5 | ./build/bench --bench 6 | 7 | literal 125 ns/iter (+/- 0) 8 | not_literal 944 ns/iter (+/- 29) 9 | match_class 1259 ns/iter (+/- 25) 10 | match_class_in_range 1342 ns/iter (+/- 6) 11 | replace_all 1130 ns/iter (+/- 18) 12 | anchored_literal_short_non_match 432 ns/iter (+/- 4) 13 | anchored_literal_long_non_match 5825 ns/iter (+/- 157) 14 | anchored_literal_short_match 147 ns/iter (+/- 3) 15 | anchored_literal_long_match 137 ns/iter (+/- 2) 16 | one_pass_short_a 1002 ns/iter (+/- 14) 17 | one_pass_short_a_not 1500 ns/iter (+/- 32) 18 | one_pass_short_b 734 ns/iter (+/- 10) 19 | one_pass_short_b_not 974 ns/iter (+/- 10) 20 | one_pass_long_prefix 508 ns/iter (+/- 4) 21 | one_pass_long_prefix_not 510 ns/iter (+/- 6) 22 | easy0_32 263 ns/iter (+/- 15) = 121 MB/s 23 | easy0_1K 1477 ns/iter (+/- 139) = 693 MB/s 24 | easy0_32K 40140 ns/iter (+/- 917) = 816 MB/s 25 | easy1_32 328 ns/iter (+/- 71) = 97 MB/s 26 | easy1_1K 1774 ns/iter (+/- 524) = 577 MB/s 27 | easy1_32K 48362 ns/iter (+/- 3161) = 677 MB/s 28 | medium_32 774 ns/iter (+/- 34) = 41 MB/s 29 | medium_1K 15623 ns/iter (+/- 293) = 65 MB/s 30 | medium_32K 490884 ns/iter (+/- 2344) = 66 MB/s 31 | hard_32 1306 ns/iter (+/- 30) = 24 MB/s 32 | hard_1K 33060 ns/iter (+/- 245) = 30 MB/s 33 | hard_32K 1048745 ns/iter (+/- 5576) = 31 MB/s 34 | no_exponential 286117 ns/iter (+/- 2050) 35 | ``` 36 | 37 | Golang 38 | ------ 39 | Benchmarks are taken from the `regexp` package included in the Go distribution. 40 | 41 | ``` 42 | cd go/src/pkg/regexp 43 | go test -run ' ' -bench . 44 | 45 | Literal 10000000 229 ns/op 46 | NotLiteral 500000 3354 ns/op 47 | MatchClass 500000 5092 ns/op 48 | MatchClass_InRange 500000 4200 ns/op 49 | ReplaceAll 500000 3548 ns/op 50 | AnchoredLiteralShortNonMatch 20000000 145 ns/op 51 | AnchoredLiteralLongNonMatch 20000000 142 ns/op 52 | AnchoredShortMatch 5000000 381 ns/op 53 | AnchoredLongMatch 5000000 383 ns/op 54 | OnePassShortA 1000000 1045 ns/op 55 | NotOnePassShortA 1000000 2478 ns/op 56 | OnePassShortB 2000000 766 ns/op 57 | NotOnePassShortB 1000000 2216 ns/op 58 | OnePassLongPrefix 10000000 156 ns/op 59 | OnePassLongNotPrefix 5000000 614 ns/op 60 | MatchEasy0_32 20000000 114 ns/op 279.35 MB/s 61 | MatchEasy0_1K 5000000 653 ns/op 1566.63 MB/s 62 | MatchEasy0_32K 200000 12624 ns/op 2595.57 MB/s 63 | MatchEasy0_1M 5000 458608 ns/op 2286.43 MB/s 64 | MatchEasy1_32 20000000 96.7 ns/op 330.99 MB/s 65 | MatchEasy1_1K 1000000 2647 ns/op 386.74 MB/s 66 | MatchEasy1_32K 50000 57848 ns/op 566.45 MB/s 67 | MatchEasy1_1M 1000 1991274 ns/op 526.59 MB/s 68 | MatchMedium_32 1000000 1746 ns/op 18.33 MB/s 69 | MatchMedium_1K 50000 58501 ns/op 17.50 MB/s 70 | MatchMedium_32K 1000 1914850 ns/op 17.11 MB/s 71 | MatchMedium_1M 50 61487227 ns/op 17.05 MB/s 72 | MatchHard_32 500000 2918 ns/op 10.97 MB/s 73 | MatchHard_1K 20000 92338 ns/op 11.09 MB/s 74 | MatchHard_32K 1000 2979930 ns/op 11.00 MB/s 75 | MatchHard_1M 20 95889705 ns/op 10.94 MB/s 76 | ``` 77 | 78 | 79 | NOW OUTDATED: Very rough benchmark analysis 80 | ------------------------------------------- 81 | All benchmarks were taken from RE2/Go and hopefully implemented correctly. 82 | Both RE2/Rust and RE2/Go are benchmarked with an implicit `.*?` prefixing all 83 | regular expressions. 
(i.e., They are unachored unless there is an explicit 84 | '^'.) 85 | 86 | RE2/Rust gets absolutely clobbered by RE2/Go in the Easy{0,1} benchmarks. 87 | Interestingly, Rust does the same or better on the Medium/Hard benchmarks. My 88 | suspicion is that RE2/Go is performing some optimizations on the easy 89 | benchmarks to make the throughput very high. This gives me hope. 90 | 91 | For example, the EASY{0,1} benchmarks are subject to optimization. RE2/Rust 92 | does do some optimization with literal prefix strings (explaining the higher 93 | throughput when compared to the MEDIUM/HARD benchmarks). 94 | 95 | It's promising that RE2/Rust is beating RE2/Go on the MEDIUM/HARD benchmarks, 96 | which I think suggests that the core VM implementation is probably decent. 97 | 98 | Also note that RE2/Rust is performing much worse on the small Medium/Hard 99 | benchmarks (searching 32 bytes of text). My suspicion is that there are some 100 | big constant factors lurking somewhere that need to be fixed in RE2/Rust. 101 | This may also explain some of the performance difference in other benchmarks 102 | (NOT easy/medium/hard) since they mostly work with shortish search strings. 103 | (Although this is not true for all, since some specifically target the presence 104 | of optimizations in RE2/Go.) 105 | 106 | -------------------------------------------------------------------------------- /benchmark/golang: -------------------------------------------------------------------------------- 1 | Golang 2 | ------ 3 | cd go/src/pkg/regexp 4 | go test -run ' ' -bench . 5 | 6 | Literal 10000000 229 ns/op 7 | NotLiteral 500000 3354 ns/op 8 | MatchClass 500000 5092 ns/op 9 | MatchClass_InRange 500000 4200 ns/op 10 | ReplaceAll 500000 3548 ns/op 11 | AnchoredLiteralShortNonMatch 20000000 145 ns/op 12 | AnchoredLiteralLongNonMatch 20000000 142 ns/op 13 | AnchoredShortMatch 5000000 381 ns/op 14 | AnchoredLongMatch 5000000 383 ns/op 15 | OnePassShortA 1000000 1045 ns/op 16 | NotOnePassShortA 1000000 2478 ns/op 17 | OnePassShortB 2000000 766 ns/op 18 | NotOnePassShortB 1000000 2216 ns/op 19 | OnePassLongPrefix 10000000 156 ns/op 20 | OnePassLongNotPrefix 5000000 614 ns/op 21 | MatchEasy0_32 20000000 114 ns/op 279.35 MB/s 22 | MatchEasy0_1K 5000000 653 ns/op 1566.63 MB/s 23 | MatchEasy0_32K 200000 12624 ns/op 2595.57 MB/s 24 | MatchEasy0_1M 5000 458608 ns/op 2286.43 MB/s 25 | MatchEasy1_32 20000000 96.7 ns/op 330.99 MB/s 26 | MatchEasy1_1K 1000000 2647 ns/op 386.74 MB/s 27 | MatchEasy1_32K 50000 57848 ns/op 566.45 MB/s 28 | MatchEasy1_1M 1000 1991274 ns/op 526.59 MB/s 29 | MatchMedium_32 1000000 1746 ns/op 18.33 MB/s 30 | MatchMedium_1K 50000 58501 ns/op 17.50 MB/s 31 | MatchMedium_32K 1000 1914850 ns/op 17.11 MB/s 32 | MatchMedium_1M 50 61487227 ns/op 17.05 MB/s 33 | MatchHard_32 500000 2918 ns/op 10.97 MB/s 34 | MatchHard_1K 20000 92338 ns/op 11.09 MB/s 35 | MatchHard_32K 1000 2979930 ns/op 11.00 MB/s 36 | MatchHard_1M 20 95889705 ns/op 10.94 MB/s 37 | 38 | -------------------------------------------------------------------------------- /benchmark/regex-dna/Makefile: -------------------------------------------------------------------------------- 1 | RUSTC ?= rustc 2 | RUSTFILE ?= regex-dna.rs 3 | 4 | bench-rust: run-rust big.fasta 5 | time ./run-rust < big.fasta 6 | 7 | bench-rust-perf: run-rust big.fasta 8 | time perf record --call-graph dwarf ./run-rust < big.fasta 9 | 10 | bench-golang: run-golang big.fasta 11 | time ./run-golang < big.fasta 12 | 13 | bench-python: regex-dna.py big.fasta 14 | time python3 
./regex-dna.py < big.fasta 15 | 16 | bench-c: run-c big.fasta 17 | time ./run-c < big.fasta 18 | 19 | big.fasta: generator 20 | ./generator 5000000 > big.fasta 21 | 22 | generator: shootout-fasta.rs 23 | $(RUSTC) --opt-level=3 shootout-fasta.rs -o generator 24 | 25 | run-rust: $(RUSTFILE) 26 | (cd ../.. && make RUSTC=$(RUSTC)) 27 | $(RUSTC) --opt-level=3 -Z lto -g -L ../../build $(RUSTFILE) -o run-rust 28 | 29 | run-golang: regex-dna.go 30 | go build -o run-golang regex-dna.go 31 | 32 | run-c: regex-dna.c 33 | gcc -pipe -Wall -O3 -fomit-frame-pointer -march=native -pthread `pkg-config --cflags --libs glib-2.0` regex-dna.c -o run-c -ltcl -lglib-2.0 34 | 35 | check: check.fasta check.output run-rust run-golang run-c 36 | bash -c 'diff check.output <(./run-golang < check.fasta)' 37 | bash -c 'diff check.output <(./run-rust < check.fasta)' 38 | bash -c 'diff check.output <(python3 ./regex-dna.py < check.fasta)' 39 | bash -c 'diff check.output <(./run-c < check.fasta)' 40 | 41 | check.fasta: 42 | curl 'http://benchmarksgame.alioth.debian.org/download/regexdna-input.txt' > check.fasta 43 | 44 | check.output: 45 | curl 'http://benchmarksgame.alioth.debian.org/download/regexdna-output.txt' > check.output 46 | 47 | clean: 48 | rm -rf big.fasta check.fasta check.output run-golang run-rust run-c generator 49 | rm -f perf.data* 50 | 51 | -------------------------------------------------------------------------------- /benchmark/regex-dna/README.md: -------------------------------------------------------------------------------- 1 | This compares RE2/Rust with RE2/Go on the 2 | [regex-dna](http://benchmarksgame.alioth.debian.org/u32/performance.php?test=regexdna) 3 | benchmark. The Python and C benchmarks are also provided for additional 4 | context. 5 | 6 | To run, first make sure all benchmarks are correct: 7 | 8 | ``` 9 | [andrew@Liger regex-dna] make check 10 | bash -c 'diff check.output <(./run-golang < check.fasta)' 11 | bash -c 'diff check.output <(./run-rust < check.fasta)' 12 | bash -c 'diff check.output <(python3 ./regex-dna.py < check.fasta)' 13 | bash -c 'diff check.output <(./run-c < check.fasta)' 14 | ``` 15 | 16 | If there's something wrong, an error will be reported along with a non-empty 17 | diff. 18 | 19 | Then run the Rust benchmark: 20 | 21 | ``` 22 | [andrew@Liger regex-dna] make bench-rust 23 | ... 24 | real 0m5.235s 25 | user 0m28.940s 26 | sys 0m0.623s 27 | ``` 28 | 29 | And the Go benchmark: 30 | 31 | ``` 32 | [andrew@Liger regex-dna] make bench-golang 33 | time ./run-golang < big.fasta 34 | ... 35 | real 0m18.654s 36 | user 1m44.733s 37 | sys 0m0.420s 38 | ``` 39 | 40 | And the Python benchmark: 41 | 42 | ``` 43 | [andrew@Liger regex-dna] make bench-python 44 | time python3 ./regex-dna.py < big.fasta 45 | ... 46 | real 0m4.174s 47 | user 0m13.757s 48 | sys 0m0.407s 49 | ``` 50 | 51 | And the C (Tcl) benchmark: 52 | 53 | ``` 54 | [andrew@Liger regex-dna] make bench-c 55 | time ./run-c < big.fasta 56 | real 0m0.970s 57 | user 0m3.793s 58 | sys 0m0.380s 59 | ``` 60 | 61 | Note that all benchmarks are multithreaded and were run on an Intel i7 3930K 62 | (12 threads). 63 | 64 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna-single.rs: -------------------------------------------------------------------------------- 1 | // Originally written by JustAPerson (https://github.com/JustAPerson). 2 | // Modified by Andrew Gallant (https://github.com/BurntSushi). 
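//
// Single-threaded variant of the regex-dna benchmark: it reads a FASTA
// sequence from stdin, strips the section headers and newlines, counts the
// matches for each of the nine variant patterns, replaces each IUB ambiguity
// code with its expansion, and prints the original, cleaned, and substituted
// sequence lengths.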
3 | 4 | #![feature(macro_rules, phase)] 5 | 6 | extern crate regex; 7 | #[phase(syntax)]extern crate regex_macros; 8 | 9 | use regex::{NoExpand, Regex}; 10 | 11 | fn replace(re: &Regex, text: &str, rep: &str) -> ~str { 12 | re.replace_all(text, NoExpand(rep)) 13 | } 14 | 15 | fn count_matches(seq: &str, variant: &Regex) -> int { 16 | let mut n = 0; 17 | for _ in variant.find_iter(seq) { 18 | n += 1; 19 | } 20 | n 21 | } 22 | 23 | fn main() { 24 | let mut stdin = std::io::stdio::stdin(); 25 | let mut seq = stdin.read_to_str().unwrap(); 26 | let ilen = seq.len(); 27 | 28 | seq = regex!(">[^\n]*\n|\n").replace_all(seq, NoExpand("")); 29 | let clen = seq.len(); 30 | 31 | let variants = ~[ 32 | regex!("agggtaaa|tttaccct"), 33 | regex!("[cgt]gggtaaa|tttaccc[acg]"), 34 | regex!("a[act]ggtaaa|tttacc[agt]t"), 35 | regex!("ag[act]gtaaa|tttac[agt]ct"), 36 | regex!("agg[act]taaa|ttta[agt]cct"), 37 | regex!("aggg[acg]aaa|ttt[cgt]ccct"), 38 | regex!("agggt[cgt]aa|tt[acg]accct"), 39 | regex!("agggta[cgt]a|t[acg]taccct"), 40 | regex!("agggtaa[cgt]|[acg]ttaccct"), 41 | ]; 42 | let (mut variant_strs, mut counts) = (vec!(), vec!()); 43 | for variant in variants.move_iter() { 44 | variant_strs.push(variant.to_str().to_owned()); 45 | counts.push(count_matches(seq, &variant)); 46 | } 47 | for (i, variant) in variant_strs.iter().enumerate() { 48 | println!("{} {}", variant, *counts.get(i)); 49 | } 50 | 51 | let substs = ~[ 52 | (regex!("B"), "(c|g|t)"), 53 | (regex!("D"), "(a|g|t)"), 54 | (regex!("H"), "(a|c|t)"), 55 | (regex!("K"), "(g|t)"), 56 | (regex!("M"), "(a|c)"), 57 | (regex!("N"), "(a|c|g|t)"), 58 | (regex!("R"), "(a|g)"), 59 | (regex!("S"), "(c|g)"), 60 | (regex!("V"), "(a|c|g)"), 61 | (regex!("W"), "(a|t)"), 62 | (regex!("Y"), "(c|t)"), 63 | ]; 64 | for (re, replacement) in substs.move_iter() { 65 | seq = replace(&re, seq, replacement) 66 | } 67 | println!(""); 68 | println!("{}", ilen); 69 | println!("{}", clen); 70 | println!("{}", seq.len()); 71 | } 72 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna.c: -------------------------------------------------------------------------------- 1 | /* The Computer Language Benchmarks Game 2 | * http://benchmarksgame.alioth.debian.org/ 3 | contributed by Paul Serice 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | /************************************************************************* 16 | * Data Structures and Typedefs 17 | *************************************************************************/ 18 | 19 | /* Mapping of a nucleic acid code to its meaning. This is used with 20 | * regsub() to substitute each occurrence of "code" in the main input 21 | * string with its "meaning." */ 22 | static struct nucleic_acid_code { 23 | char* code; 24 | char* meaning; 25 | } nacodes[] = {{"B", "(c|g|t)"}, 26 | {"D", "(a|g|t)"}, 27 | {"H", "(a|c|t)"}, 28 | {"K", "(g|t)"}, 29 | {"M", "(a|c)"}, 30 | {"N", "(a|c|g|t)"}, 31 | {"R", "(a|g)"}, 32 | {"S", "(c|g)"}, 33 | {"V", "(a|c|g)"}, 34 | {"W", "(a|t)"}, 35 | {"Y", "(c|t)"}, 36 | {NULL, NULL} 37 | }; 38 | 39 | /* The variants are used with regcount() to count the number of times 40 | * each variant appears in the main input string. 
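 * Each variant is an alternation of a pattern and its reverse complement.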
*/ 41 | static const char* variants[] = { 42 | "agggtaaa|tttaccct", 43 | "[cgt]gggtaaa|tttaccc[acg]", 44 | "a[act]ggtaaa|tttacc[agt]t", 45 | "ag[act]gtaaa|tttac[agt]ct", 46 | "agg[act]taaa|ttta[agt]cct", 47 | "aggg[acg]aaa|ttt[cgt]ccct", 48 | "agggt[cgt]aa|tt[acg]accct", 49 | "agggta[cgt]a|t[acg]taccct", 50 | "agggtaa[cgt]|[acg]ttaccct", 51 | NULL 52 | }; 53 | 54 | 55 | /* To process the variants, a small thread pool is created. Each 56 | * thread is passed an array of these tasks. The threads combine to 57 | * perform the tasks. When there are no more tasks, the threads exit 58 | * and the parent joins with them before continuing. */ 59 | typedef struct variant_worker_task { 60 | 61 | /* input: which variant to process */ 62 | const char* variant; 63 | 64 | /* input: string against which "variant" will be matched */ 65 | Tcl_Obj* s; 66 | 67 | /* output: number of times "variant" matched against "s" */ 68 | unsigned long int count; 69 | 70 | } *variant_worker_task_t; 71 | 72 | 73 | /* Data passed into each thread that process the variants. All the 74 | * threads in the pool share one copy of this data structure and must 75 | * use "lock" to synchronize access to it. */ 76 | typedef struct variant_worker_data { 77 | 78 | /* shared: lock that protects this structure */ 79 | pthread_mutex_t lock; 80 | 81 | /* shared: array of tasks that the threads are trying to complete */ 82 | variant_worker_task_t tasks; 83 | 84 | /* shared: pointer to shared index into "tasks" */ 85 | volatile int next_task; 86 | 87 | /* shared: total number of tasks in the "tasks" array */ 88 | int total_tasks; 89 | 90 | } *variant_worker_data_t; 91 | 92 | 93 | /* Data passed into each thread that substitutes nucleic acid codes. */ 94 | typedef struct nacodes_worker_data { 95 | 96 | /* input/output: String object that is input to the thread as a 97 | * copy of the range of characters from the main input string over 98 | * which the thread should work. The thread should call 99 | * Tcl_SetStringObj() to set "range" to hold the result of the 100 | * substitutions. */ 101 | Tcl_Obj* range; 102 | 103 | } *nacodes_worker_data_t; 104 | 105 | 106 | /* Create an explicit typedef for the pthread start functions. */ 107 | typedef void* (*thread_start_t)(void*); 108 | 109 | /************************************************************************* 110 | * regcount() 111 | *************************************************************************/ 112 | 113 | /* Return the number of times the regular expression "regexp_cstr" 114 | * uniquely matches against the input string "s". */ 115 | static unsigned long 116 | regcount(const char* regexp_cstr, 117 | Tcl_Obj* s) 118 | { 119 | int regexec_rv = 0; 120 | int index = 0; 121 | int index_max = 0; 122 | unsigned long rv = 0; 123 | Tcl_Obj* regexp_cstr_obj = NULL; 124 | Tcl_RegExp regexp = NULL; 125 | struct Tcl_RegExpInfo info = {0}; 126 | 127 | /* Get "regexp_cstr" as a Tcl string object. */ 128 | regexp_cstr_obj = Tcl_NewStringObj(regexp_cstr, strlen(regexp_cstr)); 129 | Tcl_IncrRefCount(regexp_cstr_obj); 130 | 131 | /* Compile the regular expression. */ 132 | regexp = Tcl_GetRegExpFromObj(NULL, regexp_cstr_obj, 133 | TCL_REG_ADVANCED | TCL_REG_NOCASE | TCL_REG_NEWLINE); 134 | if (!regexp) { 135 | fprintf(stderr, "*** Error: Tcl_GetRegExpFromObj: failed"); 136 | exit(1); 137 | } 138 | 139 | /* Iterate over each match. */ 140 | index = 0; 141 | index_max = Tcl_GetCharLength(s); 142 | while (index < index_max) { 143 | 144 | /* Test for a match. 
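         * Tcl_RegExpExecObj returns 1 for a match, 0 for no match, and -1 on
         * error; the checks below follow that convention.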
*/ 145 | regexec_rv = Tcl_RegExpExecObj(NULL, regexp, s, index, 1, 0); 146 | if (regexec_rv == -1) { 147 | fprintf(stderr, "*** Error: Tcl_RegExpExecObj: failed"); 148 | exit(1); 149 | } 150 | if (regexec_rv == 0) { 151 | /* No matches. */ 152 | break; 153 | } 154 | 155 | /* Get the match information. */ 156 | Tcl_RegExpGetInfo(regexp, &info); 157 | 158 | /* Advance curr. */ 159 | index += info.matches[0].end; 160 | 161 | /* Increment the match count. */ 162 | ++rv; 163 | } 164 | 165 | /* Clean up. Note that "regexp" is owned by "regexp_cstr_obj" so 166 | * it does not need explicit clean up. */ 167 | Tcl_DecrRefCount(regexp_cstr_obj); 168 | 169 | return rv; 170 | } 171 | 172 | /************************************************************************* 173 | * regsub() 174 | *************************************************************************/ 175 | 176 | /* Substitute each occurrence of the regular expression "regex" in "s" 177 | * with "subst". The result is returned in a newly allocate string 178 | * that must be freed with g_free(). */ 179 | static char* 180 | regsub(const char* regex, 181 | const char* s, 182 | const char* subst, 183 | GError** err) 184 | { 185 | char* rv = NULL; 186 | GRegex* prog = NULL; 187 | 188 | /* How glib propagates exceptions. */ 189 | if (err && *err) { 190 | goto out; 191 | } 192 | 193 | /* Compile regex. */ 194 | prog = g_regex_new(regex, 195 | G_REGEX_CASELESS | 196 | G_REGEX_RAW | 197 | G_REGEX_NO_AUTO_CAPTURE | 198 | G_REGEX_OPTIMIZE, 199 | 0, 200 | err); 201 | if (err && *err) { 202 | goto out; 203 | } 204 | 205 | /* Substitute. */ 206 | rv = g_regex_replace_literal(prog, s, -1, 0, subst, 0, err); 207 | if (err && *err) { 208 | goto out; 209 | } 210 | 211 | out: 212 | 213 | /* Clean up. */ 214 | if (prog) { 215 | g_regex_unref(prog); 216 | } 217 | 218 | return rv; 219 | } 220 | 221 | /************************************************************************* 222 | * load_file() 223 | *************************************************************************/ 224 | 225 | /* Load the file f into the string s. */ 226 | static void 227 | load_file(FILE* f, 228 | Tcl_Obj* s) 229 | { 230 | char* block = NULL; 231 | size_t block_size = 16384; 232 | size_t rcount = 0; 233 | 234 | /* Allocate a block for I/O. */ 235 | block = malloc(block_size); 236 | if (!block) { 237 | perror("malloc"); 238 | exit(1); 239 | } 240 | 241 | /* Iterate over each block of input. */ 242 | for (;;) { 243 | 244 | /* Read a block. */ 245 | rcount = fread(block, 1, block_size, f); 246 | if (rcount == 0) { 247 | /* Check for errors. */ 248 | if (ferror(f)) { 249 | perror("fread"); 250 | exit(1); 251 | } 252 | /* EOF */ 253 | break; 254 | } 255 | 256 | /* Append a block. */ 257 | Tcl_AppendToObj(s, block, rcount); 258 | } 259 | 260 | /* Free block. */ 261 | free(block); 262 | } 263 | 264 | /************************************************************************* 265 | * process_variant_worker() and process_variants() 266 | *************************************************************************/ 267 | 268 | /* This is a helper function for process_variant_worker() which is the 269 | * start routine for the threads that count how many times a variant 270 | * matches the main input string. This routing locks "data" and 271 | * attempts to get the index of the next task. If successful, it 272 | * takes ownership of that index by incrementing "data->next_task" so 273 | * that the next thread that comes along will get the next task. 
274 | * Before returning, this routine releases the lock. This routine 275 | * returns true if successful and false otherwise. */ 276 | static int 277 | get_variant_index(variant_worker_data_t data, 278 | int* index) 279 | { 280 | int rv = 0; 281 | 282 | /* Lock "data". */ 283 | pthread_mutex_lock(&data->lock); 284 | 285 | /* Get the index for the next task if any remain. */ 286 | if (data->next_task < data->total_tasks) { 287 | *index = data->next_task++; 288 | rv = 1; 289 | } 290 | 291 | /* Unlock "data". */ 292 | pthread_mutex_unlock(&data->lock); 293 | 294 | return rv; 295 | } 296 | 297 | /* This is the worker routine for the thread pool that processes the 298 | * variants. This routine atomically gets the next task which holds 299 | * all the information needed to count the number of times the task's 300 | * "variant" value matches the main input string and stores the result 301 | * in the task's "count" value. The main input string is passed in as 302 | * the task's read-only "s" value. */ 303 | static void* 304 | process_variant_worker(variant_worker_data_t data) 305 | { 306 | int index = 0; 307 | 308 | /* Carefully get the index for the next task. */ 309 | while (get_variant_index(data, &index)) { 310 | /* Perform the task of counting regex matches. */ 311 | data->tasks[index].count 312 | = regcount(data->tasks[index].variant, 313 | data->tasks[index].s); 314 | } 315 | 316 | return NULL; 317 | } 318 | 319 | /* Process the list of variants by counting the frequency of each 320 | * regexp in the main input string "s" and printing the results. */ 321 | static void 322 | process_variants(int cpu_count, 323 | Tcl_Obj* s) 324 | { 325 | int i = 0; 326 | int s_length = 0; 327 | int thread_rv = 0; 328 | int thread_count = 0; 329 | int task_count = 0; 330 | pthread_t* threads = NULL; 331 | variant_worker_task_t tasks = NULL; 332 | struct variant_worker_data data = {PTHREAD_MUTEX_INITIALIZER,}; 333 | 334 | /* WARNING: Tcl_RegExpExecObj() always does an internal conversion 335 | * of "s" to a UCS-2 Unicode string if "s" is in UTF-8 format. 336 | * Normally, this is a nice feature, but as of tcl-8.5, it doesn't 337 | * appear to be thread-safe. As a work-around, force the 338 | * conversion now before starting the threads. */ 339 | Tcl_GetUnicodeFromObj(s, &s_length); 340 | 341 | /* Determine the total number of variants (minus the NULL sentinel). */ 342 | task_count = (int)(sizeof(variants) / sizeof(variants[0]) - 1); 343 | 344 | /* Determine the number of threads to start. */ 345 | thread_count = cpu_count * 2; 346 | if (thread_count > task_count) { 347 | thread_count = task_count; 348 | } 349 | 350 | /* Allocate the "threads" array which holds the thread IDs. */ 351 | threads = calloc(thread_count, sizeof(*threads)); 352 | if (!threads) { 353 | perror("calloc"); 354 | exit(1); 355 | } 356 | 357 | /* Allocate the "tasks" array which holds one unit of work per 358 | * element in the array. */ 359 | tasks = calloc(task_count, sizeof(*tasks)); 360 | if (!tasks) { 361 | perror("calloc"); 362 | exit(1); 363 | } 364 | 365 | /* Initialize the task array. */ 366 | for (i = 0 ; i < task_count ; ++i) { 367 | tasks[i].variant = variants[i]; 368 | tasks[i].s = s; 369 | tasks[i].count = 0; 370 | } 371 | 372 | /* Initialize the data shared by the threads. */ 373 | data.tasks = tasks; 374 | data.next_task = 0; 375 | data.total_tasks = task_count; 376 | 377 | /* Start the threads. 
*/ 378 | for (i = 0 ; i < thread_count ; ++i) { 379 | thread_rv = pthread_create(&threads[i], 380 | NULL, 381 | (thread_start_t)process_variant_worker, 382 | &data); 383 | if (thread_rv) { 384 | fprintf(stderr, "*** Error: pthread_create: failed"); 385 | exit(1); 386 | } 387 | } 388 | 389 | /* Wait for each thread to finish. */ 390 | for (i = 0 ; i < thread_count ; ++i) { 391 | thread_rv = pthread_join(threads[i], NULL); 392 | if (thread_rv) { 393 | fprintf(stderr, "*** Error: pthread_join: failed"); 394 | exit(1); 395 | } 396 | } 397 | 398 | /* Print results. */ 399 | for (i = 0 ; i < task_count ; ++i) { 400 | printf("%s %lu\n", variants[i], tasks[i].count); 401 | } 402 | 403 | /* Clean up. */ 404 | free(tasks); 405 | free(threads); 406 | } 407 | 408 | /************************************************************************* 409 | * process_nacodes_worker() and process_nacodes() 410 | *************************************************************************/ 411 | 412 | /* This is the worker routing for the threads that process the 413 | * substitution of the nucleic acid codes with their meanings. These 414 | * threads are not in a thread pool because the work can be divided 415 | * exactly into one thread per cpu. So the parent just starts each 416 | * thread and waits for them all to finish. 417 | * 418 | * Each worker gets a range of characters from the main input string 419 | * and is responsible for calling regsub() once for each nucleic acid 420 | * code. Thus, if there are 11 nucleic acid codes, each thread calls 421 | * regsub() 11 times but the scope of the regsub() call is limited to 422 | * just the range of characters it has been assigned. */ 423 | static void* 424 | process_nacodes_worker(nacodes_worker_data_t data) 425 | { 426 | char* s_in = NULL; 427 | char* s_out = NULL; 428 | struct nucleic_acid_code* nacode = NULL; 429 | 430 | /* Get the character range as a C-style string. */ 431 | s_in = Tcl_GetString(data->range); 432 | 433 | /* Iterate over the nucleic acid codes. */ 434 | for (nacode = nacodes ; nacode->code ; ++nacode) { 435 | 436 | /* Perform the substitution. */ 437 | s_out = regsub(nacode->code, s_in, nacode->meaning, NULL); 438 | 439 | /* Free s_in on all but the first pass because s_in 440 | * belongs to Tcl on the first pass. */ 441 | if (nacode != nacodes) { 442 | g_free(s_in); 443 | s_in = NULL; 444 | } 445 | /* If this is the last pass, save the result and clean up. */ 446 | if ((nacode + 1)->code == NULL) { 447 | Tcl_SetStringObj(data->range, s_out, strlen(s_out)); 448 | g_free(s_out); 449 | s_out = NULL; 450 | } else { 451 | /* Otherwise, prepare for the next iteration. */ 452 | s_in = s_out; 453 | s_out = NULL; 454 | } 455 | } 456 | 457 | return NULL; 458 | } 459 | 460 | /* Process the nucleic acid codes by substituting each nucleic acid 461 | * code in "s" with its meaning as defined in the static "nacodes" 462 | * structure (see top of file). On return, "s" will hold the 463 | * substituted string. */ 464 | static void 465 | process_nacodes(int cpu_count, 466 | Tcl_Obj* s) 467 | { 468 | int i = 0; 469 | int first = 0; 470 | int last = 0; 471 | int s_length = 0; 472 | int range_length = 0; 473 | int thread_rv = 0; 474 | nacodes_worker_data_t data = NULL; 475 | pthread_t* threads = NULL; 476 | 477 | /* Sanity check to make sure we don't divide by zero. */ 478 | if (cpu_count == 0) { 479 | return; 480 | } 481 | 482 | /* Get the total length of s. 
*/ 483 | s_length = Tcl_GetCharLength(s); 484 | if (s_length == 0) { 485 | return; 486 | } 487 | 488 | /* Allocate the "data" array which is used to pass data to and 489 | * from the threads. */ 490 | data = calloc(cpu_count, sizeof(*data)); 491 | 492 | /* Allocate the "threads" array which holds the thread IDs. */ 493 | threads = calloc(cpu_count, sizeof(*threads)); 494 | 495 | /* Calculate the number of characters to feed each thread. Note 496 | * that we checked above to make sure cpu_count is not zero. */ 497 | range_length = s_length / cpu_count; 498 | 499 | /* Start one thread for each cpu. */ 500 | for (i = 0 ; i < cpu_count ; ++i) { 501 | 502 | /* First, initialize the thread's client data. */ 503 | 504 | /* Calculate the first and last index for the range. Both 505 | * "first" and "last" indexes are inclusive because that is 506 | * what Tcl_GetRange() requires. We also need to make sure 507 | * the very last range has all the characters in case 508 | * range_length does not divide s_length evenly. */ 509 | first = range_length * i; 510 | last = range_length * (i + 1) - 1; 511 | if (i + 1 == cpu_count) { 512 | last = s_length - 1; 513 | } 514 | 515 | /* Pack the data for the worker thread. */ 516 | data[i].range = Tcl_GetRange(s, first, last); 517 | Tcl_IncrRefCount(data[i].range); 518 | 519 | /* Second, start the thread. */ 520 | thread_rv = pthread_create(&threads[i], 521 | NULL, 522 | (thread_start_t)process_nacodes_worker, 523 | &data[i]); 524 | if (thread_rv) { 525 | fprintf(stderr, "*** Error: pthread_create: failed"); 526 | exit(1); 527 | } 528 | } 529 | 530 | /* Wait for each thread to finish. */ 531 | for (i = 0 ; i < cpu_count ; ++i) { 532 | thread_rv = pthread_join(threads[i], NULL); 533 | if (thread_rv) { 534 | fprintf(stderr, "*** Error: pthread_join: failed"); 535 | exit(1); 536 | } 537 | } 538 | 539 | /* Merge results. */ 540 | Tcl_SetObjLength(s, 0); 541 | for (i = 0 ; i < cpu_count ; ++i) { 542 | Tcl_AppendObjToObj(s, data[i].range); 543 | } 544 | 545 | /* Clean up. */ 546 | for (i = 0 ; i < cpu_count ; ++i) { 547 | Tcl_DecrRefCount(data[i].range); 548 | } 549 | free(threads); 550 | free(data); 551 | } 552 | 553 | /************************************************************************* 554 | * get_cpu_count() 555 | *************************************************************************/ 556 | 557 | /* Return the number of cpus. If an error occurs, 0 cpus will be 558 | * reported. There are other ways to do this, but this is a program 559 | * to test regexp processing so ... */ 560 | static int 561 | get_cpu_count(void) 562 | { 563 | int rv = 0; 564 | FILE* f = NULL; 565 | Tcl_Obj* s = NULL; 566 | 567 | /* Allocate a string. */ 568 | s = Tcl_NewStringObj("", 0); 569 | Tcl_IncrRefCount(s); 570 | 571 | /* Open /proc/cpuinfo. */ 572 | f = fopen("/proc/cpuinfo", "r"); 573 | if (!f) { 574 | goto out; 575 | } 576 | 577 | /* Load file into s. */ 578 | load_file(f, s); 579 | 580 | /* Count the number of cpus. "\M" matches at the end of a word. */ 581 | rv = regcount("^processor\\M", s); 582 | 583 | out: 584 | 585 | /* Clean up. 
*/ 586 | if (f) { 587 | fclose(f); 588 | } 589 | if (s) { 590 | Tcl_DecrRefCount(s); 591 | } 592 | 593 | return rv; 594 | } 595 | 596 | /************************************************************************* 597 | * main() 598 | *************************************************************************/ 599 | 600 | int 601 | main(int argc, 602 | char* argv[]) 603 | { 604 | int rv = 0; 605 | int cpu_count = 0; 606 | int init_length = 0; 607 | int code_length = 0; 608 | int seq_length = 0; 609 | char* s_cstr = NULL; 610 | Tcl_Interp *tcl = NULL; 611 | Tcl_Obj* s = NULL; 612 | 613 | /* Initialize Tcl. */ 614 | Tcl_FindExecutable(argv[0]); 615 | tcl = Tcl_CreateInterp(); 616 | Tcl_Preserve((ClientData)tcl); 617 | 618 | /* Count the number of cpus. If the cpu count could not be 619 | * determined, assume 4 cpus. */ 620 | cpu_count = get_cpu_count(); 621 | if (!cpu_count) { 622 | cpu_count = 4; 623 | } 624 | 625 | /* Allocate s. */ 626 | s = Tcl_NewStringObj("", 0); 627 | Tcl_IncrRefCount(s); 628 | 629 | /* Load stdin into s. */ 630 | load_file(stdin, s); 631 | 632 | /* Get the length of s. */ 633 | init_length = Tcl_GetCharLength(s); 634 | 635 | /* Strip off section headers and EOLs from s. This is a little 636 | * messy because we have to go from Tcl-string to C-string and 637 | * back to Tcl-string. */ 638 | s_cstr = regsub("(>.*)|\n", Tcl_GetString(s), "", NULL); 639 | Tcl_SetStringObj(s, s_cstr, strlen(s_cstr)); 640 | g_free(s_cstr); 641 | s_cstr = NULL; 642 | 643 | /* Get the length of s. */ 644 | code_length = Tcl_GetCharLength(s); 645 | 646 | /* Process the variants by counting them and printing the results. */ 647 | process_variants(cpu_count, s); 648 | 649 | /* Substitute nucleic acid codes in s with their meanings. */ 650 | process_nacodes(cpu_count, s); 651 | 652 | /* Get the length of s. */ 653 | seq_length = Tcl_GetCharLength(s); 654 | 655 | /* Print the lengths. */ 656 | printf("\n%d\n%d\n%d\n", init_length, code_length, seq_length); 657 | 658 | /* Clean up. */ 659 | Tcl_DecrRefCount(s); 660 | 661 | /* Finalize Tcl. */ 662 | Tcl_Release((ClientData)tcl); 663 | Tcl_Exit(rv); 664 | 665 | /* Not reached. */ 666 | return rv; 667 | } 668 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna.go: -------------------------------------------------------------------------------- 1 | /* The Computer Language Benchmarks Game 2 | * http://benchmarksgame.alioth.debian.org/ 3 | * 4 | * contributed by The Go Authors. 
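 *
 * Each of the nine variant counts runs in its own goroutine, and the IUB-code
 * substitutions run in another; the results are collected over channels.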
5 | */ 6 | 7 | package main 8 | 9 | import ( 10 | "fmt" 11 | "io/ioutil" 12 | "os" 13 | "regexp" 14 | "runtime" 15 | ) 16 | 17 | var variants = []string{ 18 | "agggtaaa|tttaccct", 19 | "[cgt]gggtaaa|tttaccc[acg]", 20 | "a[act]ggtaaa|tttacc[agt]t", 21 | "ag[act]gtaaa|tttac[agt]ct", 22 | "agg[act]taaa|ttta[agt]cct", 23 | "aggg[acg]aaa|ttt[cgt]ccct", 24 | "agggt[cgt]aa|tt[acg]accct", 25 | "agggta[cgt]a|t[acg]taccct", 26 | "agggtaa[cgt]|[acg]ttaccct", 27 | } 28 | 29 | type Subst struct { 30 | pat, repl string 31 | } 32 | 33 | var substs = []Subst{ 34 | Subst{"B", "(c|g|t)"}, 35 | Subst{"D", "(a|g|t)"}, 36 | Subst{"H", "(a|c|t)"}, 37 | Subst{"K", "(g|t)"}, 38 | Subst{"M", "(a|c)"}, 39 | Subst{"N", "(a|c|g|t)"}, 40 | Subst{"R", "(a|g)"}, 41 | Subst{"S", "(c|g)"}, 42 | Subst{"V", "(a|c|g)"}, 43 | Subst{"W", "(a|t)"}, 44 | Subst{"Y", "(c|t)"}, 45 | } 46 | 47 | func countMatches(pat string, bytes []byte) int { 48 | re := regexp.MustCompile(pat) 49 | n := 0 50 | for { 51 | e := re.FindIndex(bytes) 52 | if e == nil { 53 | break 54 | } 55 | n++ 56 | bytes = bytes[e[1]:] 57 | } 58 | return n 59 | } 60 | 61 | func main() { 62 | runtime.GOMAXPROCS(runtime.NumCPU()) 63 | 64 | bytes, err := ioutil.ReadFile("/dev/stdin") 65 | if err != nil { 66 | fmt.Fprintf(os.Stderr, "can't read input: %s\n", err) 67 | os.Exit(2) 68 | } 69 | ilen := len(bytes) 70 | // Delete the comment lines and newlines 71 | bytes = regexp.MustCompile("(>[^\n]+)?\n").ReplaceAll(bytes, []byte{}) 72 | clen := len(bytes) 73 | 74 | mresults := make([]chan int, len(variants)) 75 | for i, s := range variants { 76 | ch := make(chan int) 77 | mresults[i] = ch 78 | go func(ss string) { 79 | ch <- countMatches(ss, bytes) 80 | }(s) 81 | } 82 | 83 | lenresult := make(chan int) 84 | bb := bytes 85 | go func() { 86 | for _, sub := range substs { 87 | bb = regexp.MustCompile(sub.pat).ReplaceAll(bb, []byte(sub.repl)) 88 | } 89 | lenresult <- len(bb) 90 | }() 91 | 92 | for i, s := range variants { 93 | fmt.Printf("%s %d\n", s, <-mresults[i]) 94 | } 95 | fmt.Printf("\n%d\n%d\n%d\n", ilen, clen, <-lenresult) 96 | } 97 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna.py: -------------------------------------------------------------------------------- 1 | # The Computer Language Benchmarks Game 2 | # http://shootout.alioth.debian.org/ 3 | # contributed by Dominique Wahli 4 | # 2to3 5 | # mp by Ahmad Syukri 6 | # modified by Justin Peel 7 | 8 | from sys import stdin 9 | from re import sub, findall 10 | from multiprocessing import Pool 11 | 12 | def init(arg): 13 | global seq 14 | seq = arg 15 | 16 | def var_find(f): 17 | return len(findall(f, seq)) 18 | 19 | def main(): 20 | seq = stdin.read() 21 | ilen = len(seq) 22 | 23 | seq = sub('>.*\n|\n', '', seq) 24 | clen = len(seq) 25 | 26 | pool = Pool(initializer = init, initargs = (seq,)) 27 | 28 | variants = ( 29 | 'agggtaaa|tttaccct', 30 | '[cgt]gggtaaa|tttaccc[acg]', 31 | 'a[act]ggtaaa|tttacc[agt]t', 32 | 'ag[act]gtaaa|tttac[agt]ct', 33 | 'agg[act]taaa|ttta[agt]cct', 34 | 'aggg[acg]aaa|ttt[cgt]ccct', 35 | 'agggt[cgt]aa|tt[acg]accct', 36 | 'agggta[cgt]a|t[acg]taccct', 37 | 'agggtaa[cgt]|[acg]ttaccct') 38 | for f in zip(variants, pool.imap(var_find, variants)): 39 | print(f[0], f[1]) 40 | 41 | subst = { 42 | 'B' : '(c|g|t)', 'D' : '(a|g|t)', 'H' : '(a|c|t)', 'K' : '(g|t)', 43 | 'M' : '(a|c)', 'N' : '(a|c|g|t)', 'R' : '(a|g)', 'S' : '(c|g)', 44 | 'V' : '(a|c|g)', 'W' : '(a|t)', 'Y' : '(c|t)'} 45 | for f, r in list(subst.items()): 46 | seq = sub(f, 
r, seq) 47 | 48 | print() 49 | print(ilen) 50 | print(clen) 51 | print(len(seq)) 52 | 53 | if __name__=="__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // FIXME(#13725) windows needs fixing. 12 | // ignore-win32 13 | // ignore-stage1 14 | // ignore-cross-compile #12102 15 | 16 | #![feature(macro_rules, phase)] 17 | 18 | extern crate regex; 19 | #[phase(syntax)]extern crate regex_macros; 20 | extern crate sync; 21 | 22 | use std::io; 23 | use regex::{NoExpand, Regex}; 24 | use sync::Arc; 25 | 26 | fn count_matches(seq: &str, variant: &Regex) -> int { 27 | let mut n = 0; 28 | for _ in variant.find_iter(seq) { 29 | n += 1; 30 | } 31 | n 32 | } 33 | 34 | fn main() { 35 | let mut rdr = if std::os::getenv("RUST_BENCH").is_some() { 36 | let fd = io::File::open(&Path::new("shootout-k-nucleotide.data")); 37 | ~io::BufferedReader::new(fd) as ~io::Reader 38 | } else { 39 | ~io::stdin() as ~io::Reader 40 | }; 41 | let mut seq = StrBuf::from_str(rdr.read_to_str().unwrap()); 42 | let ilen = seq.len(); 43 | 44 | seq = regex!(">[^\n]*\n|\n").replace_all(seq.as_slice(), NoExpand("")); 45 | let seq_arc = Arc::new(seq.clone()); // copy before it moves 46 | let clen = seq.len(); 47 | 48 | let mut seqlen = sync::Future::spawn(proc() { 49 | let substs = ~[ 50 | (regex!("B"), "(c|g|t)"), 51 | (regex!("D"), "(a|g|t)"), 52 | (regex!("H"), "(a|c|t)"), 53 | (regex!("K"), "(g|t)"), 54 | (regex!("M"), "(a|c)"), 55 | (regex!("N"), "(a|c|g|t)"), 56 | (regex!("R"), "(a|g)"), 57 | (regex!("S"), "(c|g)"), 58 | (regex!("V"), "(a|c|g)"), 59 | (regex!("W"), "(a|t)"), 60 | (regex!("Y"), "(c|t)"), 61 | ]; 62 | let mut seq = seq; 63 | for (re, replacement) in substs.move_iter() { 64 | seq = re.replace_all(seq.as_slice(), NoExpand(replacement)); 65 | } 66 | seq.len() 67 | }); 68 | 69 | let variants = ~[ 70 | regex!("agggtaaa|tttaccct"), 71 | regex!("[cgt]gggtaaa|tttaccc[acg]"), 72 | regex!("a[act]ggtaaa|tttacc[agt]t"), 73 | regex!("ag[act]gtaaa|tttac[agt]ct"), 74 | regex!("agg[act]taaa|ttta[agt]cct"), 75 | regex!("aggg[acg]aaa|ttt[cgt]ccct"), 76 | regex!("agggt[cgt]aa|tt[acg]accct"), 77 | regex!("agggta[cgt]a|t[acg]taccct"), 78 | regex!("agggtaa[cgt]|[acg]ttaccct"), 79 | ]; 80 | let (mut variant_strs, mut counts) = (vec!(), vec!()); 81 | for variant in variants.move_iter() { 82 | let seq_arc_copy = seq_arc.clone(); 83 | variant_strs.push(variant.to_str().to_owned()); 84 | counts.push(sync::Future::spawn(proc() { 85 | count_matches(seq_arc_copy.as_slice(), &variant) 86 | })); 87 | } 88 | 89 | for (i, variant) in variant_strs.iter().enumerate() { 90 | println!("{} {}", variant, counts.get_mut(i).get()); 91 | } 92 | println!(""); 93 | println!("{}", ilen); 94 | println!("{}", clen); 95 | println!("{}", seqlen.get()); 96 | } 97 | -------------------------------------------------------------------------------- /benchmark/regex-dna/shootout-fasta.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2012-2013 The Rust Project 
Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | /* -*- mode: rust; indent-tabs-mode: nil -*- 12 | * Implementation of 'fasta' benchmark from 13 | * Computer Language Benchmarks Game 14 | * http://shootout.alioth.debian.org/ 15 | */ 16 | 17 | #![allow(unused_must_use)] 18 | 19 | use std::io; 20 | use std::io::{BufferedWriter, File}; 21 | use std::cmp::min; 22 | use std::os; 23 | 24 | static LINE_LENGTH: uint = 60; 25 | static IM: u32 = 139968; 26 | 27 | struct MyRandom { 28 | last: u32 29 | } 30 | impl MyRandom { 31 | fn new() -> MyRandom { MyRandom { last: 42 } } 32 | fn normalize(p: f32) -> u32 {(p * IM as f32).floor() as u32} 33 | fn gen(&mut self) -> u32 { 34 | self.last = (self.last * 3877 + 29573) % IM; 35 | self.last 36 | } 37 | } 38 | 39 | struct AAGen<'a> { 40 | rng: &'a mut MyRandom, 41 | data: Vec<(u32, u8)> } 42 | impl<'a> AAGen<'a> { 43 | fn new<'b>(rng: &'b mut MyRandom, aa: &[(char, f32)]) -> AAGen<'b> { 44 | let mut cum = 0.; 45 | let data = aa.iter() 46 | .map(|&(ch, p)| { cum += p; (MyRandom::normalize(cum), ch as u8) }) 47 | .collect(); 48 | AAGen { rng: rng, data: data } 49 | } 50 | } 51 | impl<'a> Iterator for AAGen<'a> { 52 | fn next(&mut self) -> Option { 53 | let r = self.rng.gen(); 54 | self.data.iter() 55 | .skip_while(|pc| pc.val0() < r) 56 | .map(|&(_, c)| c) 57 | .next() 58 | } 59 | } 60 | 61 | fn make_fasta>( 62 | wr: &mut W, header: &str, mut it: I, mut n: uint) 63 | { 64 | wr.write(header.as_bytes()); 65 | let mut line = [0u8, .. 
LINE_LENGTH + 1]; 66 | while n > 0 { 67 | let nb = min(LINE_LENGTH, n); 68 | for i in range(0, nb) { 69 | line[i] = it.next().unwrap(); 70 | } 71 | n -= nb; 72 | line[nb] = '\n' as u8; 73 | wr.write(line.slice_to(nb + 1)); 74 | } 75 | } 76 | 77 | fn run(writer: &mut W) { 78 | let args = os::args(); 79 | let n = if os::getenv("RUST_BENCH").is_some() { 80 | 25000000 81 | } else if args.len() <= 1u { 82 | 1000 83 | } else { 84 | from_str(args[1]).unwrap() 85 | }; 86 | 87 | let rng = &mut MyRandom::new(); 88 | let alu = 89 | "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ 90 | GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ 91 | CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ 92 | ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ 93 | GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ 94 | AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ 95 | AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"; 96 | let iub = &[('a', 0.27), ('c', 0.12), ('g', 0.12), 97 | ('t', 0.27), ('B', 0.02), ('D', 0.02), 98 | ('H', 0.02), ('K', 0.02), ('M', 0.02), 99 | ('N', 0.02), ('R', 0.02), ('S', 0.02), 100 | ('V', 0.02), ('W', 0.02), ('Y', 0.02)]; 101 | let homosapiens = &[('a', 0.3029549426680), 102 | ('c', 0.1979883004921), 103 | ('g', 0.1975473066391), 104 | ('t', 0.3015094502008)]; 105 | 106 | make_fasta(writer, ">ONE Homo sapiens alu\n", 107 | alu.as_bytes().iter().cycle().map(|c| *c), n * 2); 108 | make_fasta(writer, ">TWO IUB ambiguity codes\n", 109 | AAGen::new(rng, iub), n * 3); 110 | make_fasta(writer, ">THREE Homo sapiens frequency\n", 111 | AAGen::new(rng, homosapiens), n * 5); 112 | 113 | writer.flush(); 114 | } 115 | 116 | fn main() { 117 | if os::getenv("RUST_BENCH").is_some() { 118 | let mut file = BufferedWriter::new(File::create(&Path::new("./shootout-fasta.data"))); 119 | run(&mut file); 120 | } else { 121 | run(&mut io::stdout()); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /benchmark/rust: -------------------------------------------------------------------------------- 1 | literal 435 ns/iter (+/- 2) 2 | not_literal 1967 ns/iter (+/- 10) 3 | match_class 2545 ns/iter (+/- 17) 4 | match_class_in_range 2644 ns/iter (+/- 34) 5 | replace_all 6224 ns/iter (+/- 398) 6 | anchored_literal_short_non_match 991 ns/iter (+/- 4) 7 | anchored_literal_long_non_match 9119 ns/iter (+/- 20) 8 | anchored_literal_short_match 571 ns/iter (+/- 4) 9 | anchored_literal_long_match 565 ns/iter (+/- 2) 10 | one_pass_short_a 2149 ns/iter (+/- 17) 11 | one_pass_short_a_not 2644 ns/iter (+/- 27) 12 | one_pass_short_b 1565 ns/iter (+/- 7) 13 | one_pass_short_b_not 2157 ns/iter (+/- 10) 14 | one_pass_long_prefix 1281 ns/iter (+/- 11) 15 | one_pass_long_prefix_not 1234 ns/iter (+/- 6) 16 | easy0_32 651 ns/iter (+/- 4) = 49 MB/s 17 | easy0_1K 2123 ns/iter (+/- 115) = 482 MB/s 18 | easy0_32K 48763 ns/iter (+/- 896) = 671 MB/s 19 | easy0_1M 1545978 ns/iter (+/- 5075) = 677 MB/s 20 | easy1_32 609 ns/iter (+/- 154) = 52 MB/s 21 | easy1_1K 3091 ns/iter (+/- 815) = 331 MB/s 22 | easy1_32K 83045 ns/iter (+/- 4995) = 394 MB/s 23 | easy1_1M 2654424 ns/iter (+/- 34276) = 394 MB/s 24 | medium_32 1648 ns/iter (+/- 63) = 19 MB/s 25 | medium_1K 33882 ns/iter (+/- 838) = 30 MB/s 26 | medium_32K 1072079 ns/iter (+/- 5921) = 30 MB/s 27 | medium_1M 34140609 ns/iter (+/- 51115) = 30 MB/s 28 | hard_32 2479 ns/iter (+/- 40) = 12 MB/s 29 | hard_1K 54950 ns/iter (+/- 255) = 18 MB/s 30 | hard_32K 1738851 ns/iter (+/- 3483) = 18 MB/s 31 | hard_1M 55405512 ns/iter (+/- 40061) = 18 MB/s 32 | no_exponential 269850 ns/iter (+/- 380) 33 | 
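The random stream driving the IUB and homo sapiens sections in shootout-fasta.rs above is a small linear congruential generator: `last = (last * 3877 + 29573) % 139968`, seeded with 42, so the generated sequences are fully deterministic. A minimal standalone sketch of just that generator (the `Lcg` name is illustrative, not from the benchmark source):

```rust
// Same modulus, multiplier, increment and seed as MyRandom in shootout-fasta.rs.
static IM: u32 = 139968;

struct Lcg { last: u32 }

impl Lcg {
    fn new() -> Lcg { Lcg { last: 42 } }   // fixed seed => reproducible output
    fn gen(&mut self) -> u32 {
        self.last = (self.last * 3877 + 29573) % IM;
        self.last
    }
}

fn main() {
    let mut rng = Lcg::new();
    // Print the start of the stream used to pick weighted nucleotides.
    for _ in range(0u, 5u) {
        println!("{}", rng.gen());
    }
}
```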
-------------------------------------------------------------------------------- /cargo-lite.conf: -------------------------------------------------------------------------------- 1 | [build] 2 | crate_root = "src/lib.rs" 3 | crate_type = "library" 4 | 5 | -------------------------------------------------------------------------------- /ctags.rust: -------------------------------------------------------------------------------- 1 | --langdef=Rust 2 | --langmap=Rust:.rs 3 | --regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/ 4 | --regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/ 5 | --regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/ 6 | --regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/ 7 | --regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/ 8 | --regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/ 9 | --regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/ 10 | --regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/ 11 | --regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/ 12 | -------------------------------------------------------------------------------- /regex-match-tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2014 The Rust Project Developers. See the COPYRIGHT 4 | # file at the top-level directory of this distribution and at 5 | # http://rust-lang.org/COPYRIGHT. 6 | # 7 | # Licensed under the Apache License, Version 2.0 or the MIT license 9 | # , at your 10 | # option. This file may not be copied, modified, or distributed 11 | # except according to those terms. 12 | 13 | from __future__ import absolute_import, division, print_function 14 | import argparse 15 | import datetime 16 | import os.path as path 17 | 18 | 19 | def print_tests(tests): 20 | print('\n'.join([test_tostr(t) for t in tests])) 21 | 22 | 23 | def read_tests(f): 24 | basename, _ = path.splitext(path.basename(f)) 25 | tests = [] 26 | for lineno, line in enumerate(open(f), 1): 27 | fields = filter(None, map(str.strip, line.split('\t'))) 28 | if not (4 <= len(fields) <= 5) \ 29 | or 'E' not in fields[0] or fields[0][0] == '#': 30 | continue 31 | 32 | opts, pat, text, sgroups = fields[0:4] 33 | groups = [] # groups as integer ranges 34 | if sgroups == 'NOMATCH': 35 | groups = [None] 36 | elif ',' in sgroups: 37 | noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) 38 | for g in noparen: 39 | s, e = map(str.strip, g.split(',')) 40 | if s == '?' and e == '?': 41 | groups.append(None) 42 | else: 43 | groups.append((int(s), int(e))) 44 | else: 45 | # This skips tests that should result in an error. 46 | # There aren't many, so I think we can just capture those 47 | # manually. Possibly fix this in future. 
48 | continue 49 | 50 | if pat == 'SAME': 51 | pat = tests[-1][1] 52 | if '$' in opts: 53 | pat = pat.decode('string_escape') 54 | text = text.decode('string_escape') 55 | if 'i' in opts: 56 | pat = '(?i)%s' % pat 57 | 58 | name = '%s_%d' % (basename, lineno) 59 | tests.append((name, pat, text, groups)) 60 | return tests 61 | 62 | 63 | def test_tostr(t): 64 | lineno, pat, text, groups = t 65 | options = map(group_tostr, groups) 66 | return 'mat!(match_%s, r"%s", r"%s", %s)' \ 67 | % (lineno, pat, '' if text == "NULL" else text, ', '.join(options)) 68 | 69 | 70 | def group_tostr(g): 71 | if g is None: 72 | return 'None' 73 | else: 74 | return 'Some((%d, %d))' % (g[0], g[1]) 75 | 76 | 77 | if __name__ == '__main__': 78 | parser = argparse.ArgumentParser( 79 | description='Generate match tests from an AT&T POSIX test file.') 80 | aa = parser.add_argument 81 | aa('files', nargs='+', 82 | help='A list of dat AT&T POSIX test files. See src/libregexp/testdata') 83 | args = parser.parse_args() 84 | 85 | tests = [] 86 | for f in args.files: 87 | tests += read_tests(f) 88 | 89 | tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT 90 | // file at the top-level directory of this distribution and at 91 | // http://rust-lang.org/COPYRIGHT. 92 | // 93 | // Licensed under the Apache License, Version 2.0 or the MIT license 95 | // , at your 96 | // option. This file may not be copied, modified, or distributed 97 | // except according to those terms. 98 | 99 | // ignore-tidy-linelength 100 | 101 | // DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests' 102 | // on {date}. 103 | ''' 104 | print(tpl.format(date=str(datetime.datetime.now()))) 105 | 106 | for f in args.files: 107 | print('// Tests from %s' % path.basename(f)) 108 | print_tests(read_tests(f)) 109 | print('') 110 | -------------------------------------------------------------------------------- /regex-unicode-tables.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2014 The Rust Project Developers. See the COPYRIGHT 4 | # file at the top-level directory of this distribution and at 5 | # http://rust-lang.org/COPYRIGHT. 6 | # 7 | # Licensed under the Apache License, Version 2.0 or the MIT license 9 | # , at your 10 | # option. This file may not be copied, modified, or distributed 11 | # except according to those terms. 
12 | 13 | from __future__ import absolute_import, division, print_function 14 | import argparse 15 | from collections import defaultdict 16 | import csv 17 | import datetime 18 | import urllib2 19 | 20 | BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/' 21 | DATA = 'UnicodeData.txt' 22 | SCRIPTS = 'Scripts.txt' 23 | 24 | # Mapping taken from Table 12 from: 25 | # http://www.unicode.org/reports/tr44/#General_Category_Values 26 | expanded_categories = { 27 | 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], 28 | 'Lm': ['L'], 'Lo': ['L'], 29 | 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], 30 | 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'], 31 | 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], 32 | 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], 33 | 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], 34 | 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], 35 | 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], 36 | } 37 | 38 | 39 | def as_4byte_uni(n): 40 | s = hex(n)[2:] 41 | return '\\U%s%s' % ('0' * (8 - len(s)), s) 42 | 43 | 44 | def expand_cat(c): 45 | return expanded_categories.get(c, []) + [c] 46 | 47 | 48 | def is_valid_unicode(n): 49 | return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF 50 | 51 | 52 | def read_cats(f): 53 | assigned = defaultdict(list) 54 | for row in csv.reader(f, delimiter=';'): 55 | (hex, cats) = (int(row[0], 16), expand_cat(row[2])) 56 | if not is_valid_unicode(hex): 57 | continue 58 | for cat in cats: 59 | assigned[cat].append(hex) 60 | return assigned 61 | 62 | 63 | def read_scripts(f): 64 | assigned = defaultdict(list) 65 | for line in f: 66 | line = line.strip() 67 | if not line or line.startswith('#'): 68 | continue 69 | hexes, name = map(str.strip, line.split(';'))[:2] 70 | name = name[:name.index('#')].strip() 71 | if '..' not in hexes: 72 | hex = int(hexes, 16) 73 | if is_valid_unicode(hex): 74 | assigned[name].append(hex) 75 | else: 76 | hex1, hex2 = map(lambda s: int(s, 16), hexes.split('..')) 77 | for hex in xrange(hex1, hex2 + 1): 78 | if is_valid_unicode(hex): 79 | assigned[name].append(hex) 80 | return assigned 81 | 82 | 83 | def group(letters): 84 | letters = sorted(set(letters)) 85 | grouped = [] 86 | cur_start = letters.pop(0) 87 | cur_end = cur_start 88 | for letter in letters: 89 | assert letter > cur_end, \ 90 | 'cur_end: %s, letter: %s' % (hex(cur_end), hex(letter)) 91 | 92 | if letter == cur_end + 1: 93 | cur_end = letter 94 | else: 95 | grouped.append((cur_start, cur_end)) 96 | cur_start, cur_end = letter, letter 97 | grouped.append((cur_start, cur_end)) 98 | return grouped 99 | 100 | 101 | def ranges_to_rust(rs): 102 | rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs) 103 | return ',\n '.join(rs) 104 | 105 | 106 | def groups_to_rust(groups): 107 | rust_groups = [] 108 | for group_name in sorted(groups): 109 | rust_groups.append('("%s", &[\n %s\n ]),' 110 | % (group_name, ranges_to_rust(groups[group_name]))) 111 | return '\n'.join(rust_groups) 112 | 113 | 114 | if __name__ == '__main__': 115 | parser = argparse.ArgumentParser( 116 | description='Generate Unicode character class tables.') 117 | aa = parser.add_argument 118 | aa('--local', action='store_true', 119 | help='When set, Scripts.txt and UnicodeData.txt will be read from ' 120 | 'the CWD.') 121 | aa('--base-url', type=str, default=BASE_URL, 122 | help='The base URL to use for downloading Unicode data files.') 123 | args = parser.parse_args() 124 | 125 | if args.local: 126 | cats = read_cats(open(DATA)) 127 | scripts = read_scripts(open(SCRIPTS)) 128 | else: 129 | 
cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA)) 130 | scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS)) 131 | 132 | # Get Rust code for all Unicode general categories and scripts. 133 | combined = dict(cats, **scripts) 134 | unigroups = groups_to_rust({k: group(letters) 135 | for k, letters in combined.items()}) 136 | 137 | # Now get Perl character classes that are Unicode friendly. 138 | perld = range(ord('0'), ord('9') + 1) 139 | dgroups = ranges_to_rust(group(perld + cats['Nd'][:])) 140 | 141 | perls = map(ord, ['\t', '\n', '\x0C', '\r', ' ']) 142 | sgroups = ranges_to_rust(group(perls + cats['Z'][:])) 143 | 144 | low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1)) 145 | perlw = [ord('_')] + perld + low + up 146 | wgroups = ranges_to_rust(group(perlw + cats['L'][:])) 147 | 148 | tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT 149 | // file at the top-level directory of this distribution and at 150 | // http://rust-lang.org/COPYRIGHT. 151 | // 152 | // Licensed under the Apache License, Version 2.0 or the MIT license 154 | // , at your 155 | // option. This file may not be copied, modified, or distributed 156 | // except according to those terms. 157 | 158 | // DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables' 159 | // on {date}. 160 | 161 | use parse::{{Class, NamedClasses}}; 162 | 163 | pub static UNICODE_CLASSES: NamedClasses = &[ 164 | 165 | {groups} 166 | 167 | ]; 168 | 169 | pub static PERLD: Class = &[ 170 | {dgroups} 171 | ]; 172 | 173 | pub static PERLS: Class = &[ 174 | {sgroups} 175 | ]; 176 | 177 | pub static PERLW: Class = &[ 178 | {wgroups} 179 | ]; 180 | ''' 181 | now = datetime.datetime.now() 182 | print(tpl.format(date=str(now), groups=unigroups, 183 | dgroups=dgroups, sgroups=sgroups, wgroups=wgroups)) 184 | -------------------------------------------------------------------------------- /session.vim: -------------------------------------------------------------------------------- 1 | au BufWritePost *.rs silent!make ctags > /dev/null 2>&1 2 | -------------------------------------------------------------------------------- /src/compile.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // Enable this to squash warnings due to exporting pieces of the representation 12 | // for use with the regex! macro. See lib.rs for explanation. 13 | #![allow(visible_private_types)] 14 | 15 | use std::cmp; 16 | use std::iter; 17 | use parse; 18 | use parse::{ 19 | Flags, FLAG_EMPTY, 20 | Nothing, Literal, Dot, Class, Begin, End, WordBoundary, Capture, Cat, Alt, 21 | Rep, 22 | ZeroOne, ZeroMore, OneMore, 23 | }; 24 | 25 | type InstIdx = uint; 26 | 27 | #[deriving(Show, Clone)] 28 | pub enum Inst { 29 | // When a Match instruction is executed, the current thread is successful. 30 | Match, 31 | 32 | // The OneChar instruction matches a literal character. 33 | // The flags indicate whether to do a case insensitive match. 34 | OneChar(char, Flags), 35 | 36 | // The CharClass instruction tries to match one input character against 37 | // the range of characters given. 
38 | // The flags indicate whether to do a case insentivie match and whether 39 | // the character class is negated or not. 40 | CharClass(Vec<(char, char)>, Flags), 41 | 42 | // Matches any character except new lines. 43 | // The flags indicate whether to include the '\n' character. 44 | Any(Flags), 45 | 46 | // Matches the beginning of the string, consumes no characters. 47 | // The flags indicate whether it matches if the preceding character 48 | // is a new line. 49 | EmptyBegin(Flags), 50 | 51 | // Matches the end of the string, consumes no characters. 52 | // The flags indicate whether it matches if the proceding character 53 | // is a new line. 54 | EmptyEnd(Flags), 55 | 56 | // Matches a word boundary (\w on one side and \W \A or \z on the other), 57 | // and consumes no character. 58 | // The flags indicate whether this matches a word boundary or something 59 | // that isn't a word boundary. 60 | EmptyWordBoundary(Flags), 61 | 62 | // Saves the current position in the input string to the Nth save slot. 63 | Save(uint), 64 | 65 | // Jumps to the instruction at the index given. 66 | Jump(InstIdx), 67 | 68 | // Jumps to the instruction at the first index given. If that leads to 69 | // a failing state, then the instruction at the second index given is 70 | // tried. 71 | Split(InstIdx, InstIdx), 72 | } 73 | 74 | /// Program represents a compiled regular expression. Once an expression is 75 | /// compiled, its representation is immutable and will never change. 76 | /// 77 | /// All of the data in a compiled expression is wrapped in "MaybeStatic" or 78 | /// "MaybeOwned" types so that a `Program` can be represented as static data. 79 | /// (This makes it convenient and efficient for use with the `regex!` macro.) 80 | #[deriving(Clone)] 81 | pub struct Program { 82 | /// A sequence of instructions. 83 | pub insts: Vec, 84 | /// If the regular expression requires a literal prefix in order to have a 85 | /// match, that prefix is stored here. (It's used in the VM to implement 86 | /// an optimization.) 87 | pub prefix: ~str, 88 | } 89 | 90 | impl Program { 91 | /// Compiles a Regex given its AST. 92 | pub fn new(ast: ~parse::Ast) -> (Program, ~[Option<~str>]) { 93 | let mut c = Compiler { 94 | insts: Vec::with_capacity(100), 95 | names: Vec::with_capacity(10), 96 | }; 97 | 98 | c.insts.push(Save(0)); 99 | c.compile(ast); 100 | c.insts.push(Save(1)); 101 | c.insts.push(Match); 102 | 103 | // Try to discover a literal string prefix. 104 | // This is a bit hacky since we have to skip over the initial 105 | // 'Save' instruction. 106 | let mut pre = StrBuf::with_capacity(5); 107 | for i in iter::range(1, c.insts.len()) { 108 | match *c.insts.get(i) { 109 | OneChar(c, FLAG_EMPTY) => pre.push_char(c), 110 | _ => break 111 | } 112 | } 113 | 114 | let names = c.names.as_slice().into_owned(); 115 | let prog = Program { 116 | insts: c.insts, 117 | prefix: pre.into_owned(), 118 | }; 119 | (prog, names) 120 | } 121 | 122 | /// Returns the total number of capture groups in the regular expression. 123 | /// This includes the zeroth capture. 124 | pub fn num_captures(&self) -> uint { 125 | let mut n = 0; 126 | for inst in self.insts.iter() { 127 | match *inst { 128 | Save(c) => n = cmp::max(n, c+1), 129 | _ => {} 130 | } 131 | } 132 | // There's exactly 2 Save slots for every capture. 133 | n / 2 134 | } 135 | } 136 | 137 | struct Compiler<'r> { 138 | insts: Vec, 139 | names: Vec>, 140 | } 141 | 142 | // The compiler implemented here is extremely simple. 
Most of the complexity 143 | // in this crate is in the parser or the VM. 144 | // The only tricky thing here is patching jump/split instructions to point to 145 | // the right instruction. 146 | impl<'r> Compiler<'r> { 147 | fn compile(&mut self, ast: ~parse::Ast) { 148 | match ast { 149 | ~Nothing => {}, 150 | ~Literal(c, flags) => self.push(OneChar(c, flags)), 151 | ~Dot(nl) => self.push(Any(nl)), 152 | ~Class(ranges, flags) => 153 | self.push(CharClass(ranges, flags)), 154 | ~Begin(flags) => self.push(EmptyBegin(flags)), 155 | ~End(flags) => self.push(EmptyEnd(flags)), 156 | ~WordBoundary(flags) => self.push(EmptyWordBoundary(flags)), 157 | ~Capture(cap, name, x) => { 158 | let len = self.names.len(); 159 | if cap >= len { 160 | self.names.grow(10 + cap - len, &None) 161 | } 162 | *self.names.get_mut(cap) = name; 163 | 164 | self.push(Save(2 * cap)); 165 | self.compile(x); 166 | self.push(Save(2 * cap + 1)); 167 | } 168 | ~Cat(xs) => { 169 | for x in xs.move_iter() { 170 | self.compile(x) 171 | } 172 | } 173 | ~Alt(x, y) => { 174 | let split = self.empty_split(); // push: split 0, 0 175 | let j1 = self.insts.len(); 176 | self.compile(x); // push: insts for x 177 | let jmp = self.empty_jump(); // push: jmp 0 178 | let j2 = self.insts.len(); 179 | self.compile(y); // push: insts for y 180 | let j3 = self.insts.len(); 181 | 182 | self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2 183 | self.set_jump(jmp, j3); // jmp 0 -> jmp j3 184 | } 185 | ~Rep(x, ZeroOne, g) => { 186 | let split = self.empty_split(); 187 | let j1 = self.insts.len(); 188 | self.compile(x); 189 | let j2 = self.insts.len(); 190 | 191 | if g.is_greedy() { 192 | self.set_split(split, j1, j2); 193 | } else { 194 | self.set_split(split, j2, j1); 195 | } 196 | } 197 | ~Rep(x, ZeroMore, g) => { 198 | let j1 = self.insts.len(); 199 | let split = self.empty_split(); 200 | let j2 = self.insts.len(); 201 | self.compile(x); 202 | let jmp = self.empty_jump(); 203 | let j3 = self.insts.len(); 204 | 205 | self.set_jump(jmp, j1); 206 | if g.is_greedy() { 207 | self.set_split(split, j2, j3); 208 | } else { 209 | self.set_split(split, j3, j2); 210 | } 211 | } 212 | ~Rep(x, OneMore, g) => { 213 | let j1 = self.insts.len(); 214 | self.compile(x); 215 | let split = self.empty_split(); 216 | let j2 = self.insts.len(); 217 | 218 | if g.is_greedy() { 219 | self.set_split(split, j1, j2); 220 | } else { 221 | self.set_split(split, j2, j1); 222 | } 223 | } 224 | } 225 | } 226 | 227 | /// Appends the given instruction to the program. 228 | #[inline] 229 | fn push(&mut self, x: Inst) { 230 | self.insts.push(x) 231 | } 232 | 233 | /// Appends an *empty* `Split` instruction to the program and returns 234 | /// the index of that instruction. (The index can then be used to "patch" 235 | /// the actual locations of the split in later.) 236 | #[inline] 237 | fn empty_split(&mut self) -> InstIdx { 238 | self.insts.push(Split(0, 0)); 239 | self.insts.len() - 1 240 | } 241 | 242 | /// Sets the left and right locations of a `Split` instruction at index 243 | /// `i` to `pc1` and `pc2`, respectively. 244 | /// If the instruction at index `i` isn't a `Split` instruction, then 245 | /// `fail!` is called. 
246 | #[inline] 247 | fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) { 248 | let split = self.insts.get_mut(i); 249 | match *split { 250 | Split(_, _) => *split = Split(pc1, pc2), 251 | _ => fail!("BUG: Invalid split index."), 252 | } 253 | } 254 | 255 | /// Appends an *empty* `Jump` instruction to the program and returns the 256 | /// index of that instruction. 257 | #[inline] 258 | fn empty_jump(&mut self) -> InstIdx { 259 | self.insts.push(Jump(0)); 260 | self.insts.len() - 1 261 | } 262 | 263 | /// Sets the location of a `Jump` instruction at index `i` to `pc`. 264 | /// If the instruction at index `i` isn't a `Jump` instruction, then 265 | /// `fail!` is called. 266 | #[inline] 267 | fn set_jump(&mut self, i: InstIdx, pc: InstIdx) { 268 | let jmp = self.insts.get_mut(i); 269 | match *jmp { 270 | Jump(_) => *jmp = Jump(pc), 271 | _ => fail!("BUG: Invalid jump index."), 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | //! This crate provides a native implementation of regular expressions that is 12 | //! heavily based on RE2 both in syntax and in implementation. Notably, 13 | //! backreferences and arbitrary lookahead/lookbehind assertions are not 14 | //! provided. In return, regular expression searching provided by this package 15 | //! has excellent worst case performance. The specific syntax supported is 16 | //! documented further down. 17 | //! 18 | //! This crate's documentation provides some simple examples, describes Unicode 19 | //! support and exhaustively lists the supported syntax. For more specific 20 | //! details on the API, please see the documentation for the `Regex` type. 21 | //! 22 | //! # First example: find a date 23 | //! 24 | //! General use of regular expressions in this package involves compiling an 25 | //! expression and then using it to search, split or replace text. For example, 26 | //! to confirm that some text resembles a date: 27 | //! 28 | //! ```rust 29 | //! use regex::Regex; 30 | //! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") { 31 | //! Ok(re) => re, 32 | //! Err(err) => fail!("{}", err), 33 | //! }; 34 | //! assert_eq!(re.is_match("2014-01-01"), true); 35 | //! ``` 36 | //! 37 | //! Notice the use of the `^` and `$` anchors. In this crate, every expression 38 | //! is executed with an implicit `.*?` at the beginning and end, which allows 39 | //! it to match anywhere in the text. Anchors can be used to ensure that the 40 | //! full text matches an expression. 41 | //! 42 | //! This example also demonstrates the utility of raw strings in Rust, which 43 | //! are just like regular strings except they are prefixed with an `r` and do 44 | //! not process any escape sequences. For example, `"\\d"` is the same 45 | //! expression as `r"\d"`. 46 | //! 47 | //! # The `regex!` macro 48 | //! 49 | //! Rust's compile time meta-programming facilities provide a way to write a 50 | //! `regex!` macro which compiles regular expressions *when your program 51 | //! compiles*. 
Said differently, if you only use `regex!` to build regular 52 | //! expressions in your program, then your program cannot compile with an 53 | //! invalid regular expression. Moreover, the `regex!` macro compiles the 54 | //! given expression to native Rust code, which makes it much faster for 55 | //! searching text. 56 | //! 57 | //! Since `regex!` provides compiled regular expressions that are both safer 58 | //! and faster to use, you should use them whenever possible. The only 59 | //! requirement for using them is that you have a string literal corresponding 60 | //! to your expression. Otherwise, it is indistinguishable from an expression 61 | //! compiled at runtime with `Regex::new`. 62 | //! 63 | //! To use the `regex!` macro, you must enable the `phase` feature and import 64 | //! the `regex_macros` crate as a syntax extension: 65 | //! 66 | //! ```rust 67 | //! #![feature(phase)] 68 | //! #[phase(syntax)] 69 | //! extern crate regex_macros; 70 | //! extern crate regex; 71 | //! 72 | //! fn main() { 73 | //! let re = regex!(r"^\d{4}-\d{2}-\d{2}$"); 74 | //! assert_eq!(re.is_match("2014-01-01"), true); 75 | //! } 76 | //! ``` 77 | //! 78 | //! There are a few things worth mentioning about using the `regex!` macro. 79 | //! Firstly, the `regex!` macro *only* accepts string *literals*. 80 | //! Secondly, the `regex` crate *must* be linked with the name `regex` since 81 | //! the generated code depends on finding symbols in the `regex` crate. 82 | //! 83 | //! The only downside of using the `regex!` macro is that it can increase the 84 | //! size of your program's binary since it generates specialized Rust code. 85 | //! The extra size probably won't be significant for a small number of 86 | //! expressions, but 100+ calls to `regex!` will probably result in a 87 | //! noticeably bigger binary. 88 | //! 89 | //! # Example: iterating over capture groups 90 | //! 91 | //! This crate provides convenient iterators for matching an expression 92 | //! repeatedly against a search string to find successive non-overlapping 93 | //! matches. For example, to find all dates in a string and be able to access 94 | //! them by their component pieces: 95 | //! 96 | //! ```rust 97 | //! # #![feature(phase)] 98 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 99 | //! # fn main() { 100 | //! let re = regex!(r"(\d{4})-(\d{2})-(\d{2})"); 101 | //! let text = "2012-03-14, 2013-01-01 and 2014-07-05"; 102 | //! for cap in re.captures_iter(text) { 103 | //! println!("Month: {} Day: {} Year: {}", cap.at(2), cap.at(3), cap.at(1)); 104 | //! } 105 | //! // Output: 106 | //! // Month: 03 Day: 14 Year: 2012 107 | //! // Month: 01 Day: 01 Year: 2013 108 | //! // Month: 07 Day: 05 Year: 2014 109 | //! # } 110 | //! ``` 111 | //! 112 | //! Notice that the year is in the capture group indexed at `1`. This is 113 | //! because the *entire match* is stored in the capture group at index `0`. 114 | //! 115 | //! # Example: replacement with named capture groups 116 | //! 117 | //! Building on the previous example, perhaps we'd like to rearrange the date 118 | //! formats. This can be done with text replacement. But to make the code 119 | //! clearer, we can *name* our capture groups and use those names as variables 120 | //! in our replacement text: 121 | //! 122 | //! ```rust 123 | //! # #![feature(phase)] 124 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 125 | //! # fn main() { 126 | //! let re = regex!(r"(?P\d{4})-(?P\d{2})-(?P\d{2})"); 127 | //! 
let before = "2012-03-14, 2013-01-01 and 2014-07-05"; 128 | //! let after = re.replace_all(before, "$m/$d/$y"); 129 | //! assert_eq!(after.as_slice(), "03/14/2012, 01/01/2013 and 07/05/2014"); 130 | //! # } 131 | //! ``` 132 | //! 133 | //! The `replace` methods are actually polymorphic in the replacement, which 134 | //! provides more flexibility than is seen here. (See the documentation for 135 | //! `Regex::replace` for more details.) 136 | //! 137 | //! # Pay for what you use 138 | //! 139 | //! With respect to searching text with a regular expression, there are three 140 | //! questions that can be asked: 141 | //! 142 | //! 1. Does the text match this expression? 143 | //! 2. If so, where does it match? 144 | //! 3. Where are the submatches? 145 | //! 146 | //! Generally speaking, this crate could provide a function to answer only #3, 147 | //! which would subsume #1 and #2 automatically. However, it can be 148 | //! significantly more expensive to compute the location of submatches, so it's 149 | //! best not to do it if you don't need to. 150 | //! 151 | //! Therefore, only use what you need. For example, don't use `find` if you 152 | //! only need to test if an expression matches a string. (Use `is_match` 153 | //! instead.) 154 | //! 155 | //! # Unicode 156 | //! 157 | //! This implementation executes regular expressions **only** on sequences of 158 | //! UTF8 codepoints while exposing match locations as byte indices. 159 | //! 160 | //! Currently, only naive case folding is supported. Namely, when matching 161 | //! case insensitively, the characters are first converted to their uppercase 162 | //! forms and then compared. 163 | //! 164 | //! Regular expressions themselves are also **only** interpreted as a sequence 165 | //! of UTF8 codepoints. This means you can embed Unicode characters directly 166 | //! into your expression: 167 | //! 168 | //! ```rust 169 | //! # #![feature(phase)] 170 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 171 | //! # fn main() { 172 | //! let re = regex!(r"(?i)Δ+"); 173 | //! assert_eq!(re.find("ΔδΔ"), Some((0, 6))); 174 | //! # } 175 | //! ``` 176 | //! 177 | //! Finally, Unicode general categories and scripts are available as character 178 | //! classes. For example, you can match a sequence of numerals, Greek or 179 | //! Cherokee letters: 180 | //! 181 | //! ```rust 182 | //! # #![feature(phase)] 183 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 184 | //! # fn main() { 185 | //! let re = regex!(r"[\pN\p{Greek}\p{Cherokee}]+"); 186 | //! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23))); 187 | //! # } 188 | //! ``` 189 | //! 190 | //! # Syntax 191 | //! 192 | //! The syntax supported in this crate is almost in an exact correspondence 193 | //! with the syntax supported by RE2. 194 | //! 195 | //! ## Matching one character 196 | //! 197 | //!
198 | //! .           any character except new line (includes new line with s flag)
199 | //! [xyz]       A character class matching either x, y or z.
200 | //! [^xyz]      A character class matching any character except x, y and z.
201 | //! [a-z]       A character class matching any character in range a-z.
202 | //! \d          Perl character class ([0-9])
203 | //! \D          Negated Perl character class ([^0-9])
204 | //! [:alpha:]   ASCII character class ([A-Za-z])
205 | //! [:^alpha:]  Negated ASCII character class ([^A-Za-z])
206 | //! \pN         One-letter name Unicode character class
207 | //! \p{Greek}   Unicode character class (general category or script)
208 | //! \PN         Negated one-letter name Unicode character class
209 | //! \P{Greek}   Negated Unicode character class (general category or script)
210 | //! 
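//!
//! For illustration, a short sketch using one of the classes above with the
//! `find` method (match positions are byte offsets, as in the examples
//! earlier in these docs):
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"[0-9a-f]+");
//! // Only the hex digits match; `z` is outside the class.
//! assert_eq!(re.find("zzz3f9zzz"), Some((3, 6)));
//! # }
//! ```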
211 | //! 212 | //! Any named character class may appear inside a bracketed `[...]` character 213 | //! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral 214 | //! character. 215 | //! 216 | //! ## Composites 217 | //! 218 | //!
219 | //! xy    concatenation (x followed by y)
220 | //! x|y   alternation (x or y, prefer x)
221 | //! 
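//!
//! Since the left alternative is preferred, an alternation can report a
//! shorter match than the longest possible one. A small sketch:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"foo|foobar");
//! // `foo` is tried first, so only the first three bytes are reported.
//! assert_eq!(re.find("foobar"), Some((0, 3)));
//! # }
//! ```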
222 | //! 223 | //! ## Repetitions 224 | //! 225 | //!
226 | //! x*        zero or more of x (greedy)
227 | //! x+        one or more of x (greedy)
228 | //! x?        zero or one of x (greedy)
229 | //! x*?       zero or more of x (ungreedy)
230 | //! x+?       one or more of x (ungreedy)
231 | //! x??       zero or one of x (ungreedy)
232 | //! x{n,m}    at least n x and at most m x (greedy)
233 | //! x{n,}     at least n x (greedy)
234 | //! x{n}      exactly n x
235 | //! x{n,m}?   at least n x and at most m x (ungreedy)
236 | //! x{n,}?    at least n x (ungreedy)
237 | //! x{n}?     exactly n x
238 | //! 
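//!
//! Greedy repetitions match as much as possible, ungreedy ones as little as
//! possible. A quick sketch of the difference:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! assert_eq!(regex!(r"a+").find("aaa"), Some((0, 3)));
//! assert_eq!(regex!(r"a+?").find("aaa"), Some((0, 1)));
//! # }
//! ```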
239 | //! 240 | //! ## Empty matches 241 | //! 242 | //!
243 | //! ^     the beginning of text (or start-of-line with multi-line mode)
244 | //! $     the end of text (or end-of-line with multi-line mode)
245 | //! \A    only the beginning of text (even with multi-line mode enabled)
246 | //! \z    only the end of text (even with multi-line mode enabled)
247 | //! \b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
248 | //! \B    not a Unicode word boundary
249 | //! 
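//!
//! For example, `\b` restricts a match to whole words:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"\bcat\b");
//! assert_eq!(re.is_match("the cat sat"), true);
//! // `cat` occurs inside a word here, so there is no boundary on either side.
//! assert_eq!(re.is_match("concatenate"), false);
//! # }
//! ```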
250 | //! 251 | //! ## Grouping and flags 252 | //! 253 | //!
254 | //! (exp)          numbered capture group (indexed by opening parenthesis)
255 | //! (?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
256 | //! (?:exp)        non-capturing group
257 | //! (?flags)       set flags within current group
258 | //! (?flags:exp)   set flags for exp (non-capturing)
259 | //! 
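//!
//! Only capturing groups are numbered, so `(?:exp)` groups without affecting
//! capture indices. A short sketch:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"(\d{4})-(?:\d{2})-(\d{2})");
//! let cap = re.captures("2014-07-05").unwrap();
//! // The non-capturing month group does not consume an index.
//! assert_eq!(cap.at(1), "2014");
//! assert_eq!(cap.at(2), "05");
//! # }
//! ```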
260 | //! 261 | //! Flags are each a single character. For example, `(?x)` sets the flag `x` 262 | //! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at 263 | //! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets 264 | //! the `x` flag and clears the `y` flag. 265 | //! 266 | //! All flags are by default disabled. They are: 267 | //! 268 | //!
269 | //! i     case insensitive
270 | //! m     multi-line mode: ^ and $ match begin/end of line
271 | //! s     allow . to match \n
272 | //! U     swap the meaning of x* and x*?
273 | //! 
274 | //! 275 | //! Here's an example that matches case insensitively for only part of the 276 | //! expression: 277 | //! 278 | //! ```rust 279 | //! # #![feature(phase)] 280 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 281 | //! # fn main() { 282 | //! let re = regex!(r"(?i)a+(?-i)b+"); 283 | //! let cap = re.captures("AaAaAbbBBBb").unwrap(); 284 | //! assert_eq!(cap.at(0), "AaAaAbb"); 285 | //! # } 286 | //! ``` 287 | //! 288 | //! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches 289 | //! `b`. 290 | //! 291 | //! ## Escape sequences 292 | //! 293 | //!
294 | //! \*         literal *, works for any punctuation character: \.+*?()|[]{}^$
295 | //! \a         bell (\x07)
296 | //! \f         form feed (\x0C)
297 | //! \t         horizontal tab
298 | //! \n         new line
299 | //! \r         carriage return
300 | //! \v         vertical tab (\x0B)
301 | //! \123       octal character code (up to three digits)
302 | //! \x7F       hex character code (exactly two digits)
303 | //! \x{10FFFF} any hex character code corresponding to a valid UTF8 codepoint
304 | //! 
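//!
//! For instance, `\x{394}` denotes the Greek capital delta (Δ), which is two
//! bytes in UTF8, so the reported offsets mirror the Unicode example above:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"\x{394}");
//! assert_eq!(re.find("Δδ"), Some((0, 2)));
//! # }
//! ```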
305 | //! 306 | //! ## Perl character classes (Unicode friendly) 307 | //! 308 | //!
309 | //! \d     digit ([0-9] + \p{Nd})
310 | //! \D     not digit
311 | //! \s     whitespace ([\t\n\f\r ] + \p{Z})
312 | //! \S     not whitespace
313 | //! \w     word character ([0-9A-Za-z_] + \p{L})
314 | //! \W     not word character
315 | //! 
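//!
//! Because these classes are Unicode friendly, `\w` accepts more than the
//! ASCII word characters. A small sketch:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! assert_eq!(regex!(r"^\w+$").is_match("über"), true);
//! assert_eq!(regex!(r"^[0-9A-Za-z_]+$").is_match("über"), false);
//! # }
//! ```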
316 | //! 317 | //! ## ASCII character classes 318 | //! 319 | //!
320 | //! [:alnum:]    alphanumeric ([0-9A-Za-z])
321 | //! [:alpha:]    alphabetic ([A-Za-z])
322 | //! [:ascii:]    ASCII ([\x00-\x7F])
323 | //! [:blank:]    blank ([\t ])
324 | //! [:cntrl:]    control ([\x00-\x1F\x7F])
325 | //! [:digit:]    digits ([0-9])
326 | //! [:graph:]    graphical ([!-~])
327 | //! [:lower:]    lower case ([a-z])
328 | //! [:print:]    printable ([ -~])
329 | //! [:punct:]    punctuation ([!-/:-@[-`{-~])
330 | //! [:space:]    whitespace ([\t\n\v\f\r ])
331 | //! [:upper:]    upper case ([A-Z])
332 | //! [:word:]     word characters ([0-9A-Za-z_])
333 | //! [:xdigit:]   hex digit ([0-9A-Fa-f])
334 | //! 
335 | //! 336 | //! # Untrusted input 337 | //! 338 | //! There are two factors to consider here: untrusted regular expressions and 339 | //! untrusted search text. 340 | //! 341 | //! Currently, there are no counter-measures in place to prevent a malicious 342 | //! user from writing an expression that may use a lot of resources. One such 343 | //! example is to repeat counted repetitions: `((a{100}){100}){100}` will try 344 | //! to repeat the `a` instruction `100^3` times. Essentially, this means it's 345 | //! very easy for an attacker to exhaust your system's memory if they are 346 | //! allowed to execute arbitrary regular expressions. A possible solution to 347 | //! this is to impose a hard limit on the size of a compiled expression, but it 348 | //! does not yet exist. 349 | //! 350 | //! The story is a bit better with untrusted search text, since this crate's 351 | //! implementation provides `O(nm)` search where `n` is the number of 352 | //! characters in the search text and `m` is the number of instructions in a 353 | //! compiled expression. 354 | 355 | #![crate_id = "regex#0.11-pre"] 356 | #![crate_type = "rlib"] 357 | #![crate_type = "dylib"] 358 | #![experimental] 359 | #![license = "MIT/ASL2"] 360 | #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", 361 | html_favicon_url = "http://www.rust-lang.org/favicon.ico", 362 | html_root_url = "http://static.rust-lang.org/doc/master")] 363 | 364 | #![feature(macro_rules, phase)] 365 | #![deny(missing_doc)] 366 | 367 | extern crate collections; 368 | #[cfg(test)] 369 | extern crate stdtest = "test"; 370 | #[cfg(test)] 371 | extern crate rand; 372 | 373 | // During tests, this links with the `regex` crate so that the `regex!` macro 374 | // can be tested. 375 | #[cfg(test)] 376 | extern crate regex; 377 | 378 | pub use parse::Error; 379 | pub use re::{Regex, Captures, SubCaptures, SubCapturesPos}; 380 | pub use re::{FindCaptures, FindMatches}; 381 | pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN}; 382 | pub use re::{quote, is_match}; 383 | 384 | mod compile; 385 | mod parse; 386 | mod re; 387 | mod vm; 388 | 389 | // FIXME(#13725) windows needs fixing. 390 | #[cfg(test, not(windows))] 391 | mod test; 392 | 393 | /// The `program` module exists to support the `regex!` macro. Do not use. 394 | #[doc(hidden)] 395 | pub mod native { 396 | // Exporting this stuff is bad form, but it's necessary for two reasons. 397 | // Firstly, the `regex!` syntax extension is in a different crate and 398 | // requires access to the representation of a regex (particularly the 399 | // instruction set) in order to compile to native Rust. This could be 400 | // mitigated if `regex!` was defined in the same crate, but this has 401 | // undesirable consequences (such as requiring a dependency on 402 | // `libsyntax`). 403 | // 404 | // Secondly, the code generated generated by `regex!` must *also* be able 405 | // to access various functions in this crate to reduce code duplication 406 | // and to provide a value with precisely the same `Regex` type in this 407 | // crate. This, AFAIK, is impossible to mitigate. 408 | // 409 | // On the bright side, `rustdoc` lets us hide this from the public API 410 | // documentation. 
411 | pub use compile::{ 412 | Program, 413 | OneChar, CharClass, Any, Save, Jump, Split, 414 | Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, 415 | }; 416 | pub use parse::{ 417 | FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, 418 | FLAG_SWAP_GREED, FLAG_NEGATED, 419 | }; 420 | pub use re::{Dynamic, Native}; 421 | pub use vm::{ 422 | MatchKind, Exists, Location, Submatches, 423 | StepState, StepMatchEarlyReturn, StepMatch, StepContinue, 424 | CharReader, find_prefix, 425 | }; 426 | } 427 | -------------------------------------------------------------------------------- /src/macro.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | //! This crate provides the `regex!` macro. Its use is documented in the 12 | //! `regex` crate. 13 | 14 | #![crate_id = "regex_macros#0.11-pre"] 15 | #![crate_type = "dylib"] 16 | #![experimental] 17 | #![license = "MIT/ASL2"] 18 | #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", 19 | html_favicon_url = "http://www.rust-lang.org/favicon.ico", 20 | html_root_url = "http://static.rust-lang.org/doc/master")] 21 | 22 | #![feature(macro_registrar, managed_boxes, quote)] 23 | 24 | extern crate regex; 25 | extern crate syntax; 26 | 27 | use syntax::ast; 28 | use syntax::codemap; 29 | use syntax::ext::base::{ 30 | SyntaxExtension, ExtCtxt, MacResult, MacExpr, DummyResult, 31 | NormalTT, BasicMacroExpander, 32 | }; 33 | use syntax::parse; 34 | use syntax::parse::token; 35 | use syntax::print::pprust; 36 | 37 | use regex::Regex; 38 | use regex::native::{ 39 | OneChar, CharClass, Any, Save, Jump, Split, 40 | Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, 41 | Program, Dynamic, Native, 42 | FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED, 43 | }; 44 | 45 | /// For the `regex!` syntax extension. Do not use. 46 | #[macro_registrar] 47 | #[doc(hidden)] 48 | pub fn macro_registrar(register: |ast::Name, SyntaxExtension|) { 49 | let expander = ~BasicMacroExpander { expander: native, span: None }; 50 | register(token::intern("regex"), NormalTT(expander, None)) 51 | } 52 | 53 | /// Generates specialized code for the Pike VM for a particular regular 54 | /// expression. 55 | /// 56 | /// There are two primary differences between the code generated here and the 57 | /// general code in vm.rs. 58 | /// 59 | /// 1. All heap allocation is removed. Sized vector types are used instead. 60 | /// Care must be taken to make sure that these vectors are not copied 61 | /// gratuitously. (If you're not sure, run the benchmarks. They will yell 62 | /// at you if you do.) 63 | /// 2. The main `match instruction { ... }` expressions are replaced with more 64 | /// direct `match pc { ... }`. The generators can be found in 65 | /// `step_insts` and `add_insts`. 66 | /// 67 | /// Other more minor changes include eliding code when possible (although this 68 | /// isn't completely thorough at the moment), and translating character class 69 | /// matching from using a binary search to a simple `match` expression (see 70 | /// `match_class`). 
71 | /// 72 | /// It is strongly recommended to read the dynamic implementation in vm.rs 73 | /// first before trying to understand the code generator. The implementation 74 | /// strategy is identical and vm.rs has comments and will be easier to follow. 75 | fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) 76 | -> ~MacResult { 77 | let regex = match parse(cx, tts) { 78 | Some(r) => r, 79 | // error is logged in 'parse' with cx.span_err 80 | None => return DummyResult::any(sp), 81 | }; 82 | let re = match Regex::new(regex.to_owned()) { 83 | Ok(re) => re, 84 | Err(err) => { 85 | cx.span_err(sp, err.to_str()); 86 | return DummyResult::any(sp) 87 | } 88 | }; 89 | let prog = match re.p { 90 | Dynamic(ref prog) => prog.clone(), 91 | Native(_) => unreachable!(), 92 | }; 93 | 94 | let mut gen = NfaGen { 95 | cx: &*cx, sp: sp, prog: prog, 96 | names: re.names.clone(), original: re.original.clone(), 97 | }; 98 | MacExpr::new(gen.code()) 99 | } 100 | 101 | struct NfaGen<'a> { 102 | cx: &'a ExtCtxt<'a>, 103 | sp: codemap::Span, 104 | prog: Program, 105 | names: ~[Option<~str>], 106 | original: ~str, 107 | } 108 | 109 | impl<'a> NfaGen<'a> { 110 | fn code(&mut self) -> @ast::Expr { 111 | // Most or all of the following things are used in the quasiquoted 112 | // expression returned. 113 | let num_cap_locs = 2 * self.prog.num_captures(); 114 | let num_insts = self.prog.insts.len(); 115 | let cap_names = self.vec_expr(self.names, 116 | |cx, name| match name { 117 | &Some(ref name) => { 118 | let name = name.as_slice(); 119 | quote_expr!(cx, Some(~$name)) 120 | } 121 | &None => quote_expr!(cx, None), 122 | } 123 | ); 124 | let prefix_anchor = 125 | match self.prog.insts.as_slice()[1] { 126 | EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, 127 | _ => false, 128 | }; 129 | let init_groups = self.vec_from_fn(num_cap_locs, 130 | |cx| quote_expr!(cx, None)); 131 | let prefix_bytes = self.vec_expr(self.prog.prefix.as_slice().as_bytes(), 132 | |cx, b| quote_expr!(cx, $b)); 133 | let check_prefix = self.check_prefix(); 134 | let step_insts = self.step_insts(); 135 | let add_insts = self.add_insts(); 136 | let regex = self.original.as_slice(); 137 | 138 | quote_expr!(self.cx, { 139 | fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, 140 | start: uint, end: uint) -> Vec> { 141 | #![allow(unused_imports)] 142 | use regex::native::{ 143 | MatchKind, Exists, Location, Submatches, 144 | StepState, StepMatchEarlyReturn, StepMatch, StepContinue, 145 | CharReader, find_prefix, 146 | }; 147 | 148 | return Nfa { 149 | which: which, 150 | input: input, 151 | ic: 0, 152 | chars: CharReader::new(input), 153 | }.run(start, end); 154 | 155 | type Captures = [Option, ..$num_cap_locs]; 156 | 157 | struct Nfa<'t> { 158 | which: MatchKind, 159 | input: &'t str, 160 | ic: uint, 161 | chars: CharReader<'t>, 162 | } 163 | 164 | impl<'t> Nfa<'t> { 165 | #[allow(unused_variable)] 166 | fn run(&mut self, start: uint, end: uint) -> Vec> { 167 | let mut matched = false; 168 | let prefix_bytes: &[u8] = &$prefix_bytes; 169 | let mut clist = &mut Threads::new(self.which); 170 | let mut nlist = &mut Threads::new(self.which); 171 | 172 | let mut groups = $init_groups; 173 | 174 | self.ic = start; 175 | let mut next_ic = self.chars.set(start); 176 | while self.ic <= end { 177 | if clist.size == 0 { 178 | if matched { 179 | break 180 | } 181 | $check_prefix 182 | } 183 | if clist.size == 0 || (!$prefix_anchor && !matched) { 184 | self.add(clist, 0, &mut groups) 185 | } 186 | 187 | self.ic = next_ic; 188 | 
next_ic = self.chars.advance(); 189 | 190 | let mut i = 0; 191 | while i < clist.size { 192 | let pc = clist.pc(i); 193 | let step_state = self.step(&mut groups, nlist, 194 | clist.groups(i), pc); 195 | match step_state { 196 | StepMatchEarlyReturn => 197 | return vec![Some(0u), Some(0u)], 198 | StepMatch => { matched = true; clist.empty() }, 199 | StepContinue => {}, 200 | } 201 | i += 1; 202 | } 203 | ::std::mem::swap(&mut clist, &mut nlist); 204 | nlist.empty(); 205 | } 206 | match self.which { 207 | Exists if matched => vec![Some(0u), Some(0u)], 208 | Exists => vec![None, None], 209 | Location | Submatches => groups.iter().map(|x| *x).collect(), 210 | } 211 | } 212 | 213 | // Sometimes `nlist` is never used (for empty regexes). 214 | #[allow(unused_variable)] 215 | #[inline] 216 | fn step(&self, groups: &mut Captures, nlist: &mut Threads, 217 | caps: &mut Captures, pc: uint) -> StepState { 218 | $step_insts 219 | StepContinue 220 | } 221 | 222 | fn add(&self, nlist: &mut Threads, pc: uint, 223 | groups: &mut Captures) { 224 | if nlist.contains(pc) { 225 | return 226 | } 227 | $add_insts 228 | } 229 | } 230 | 231 | struct Thread { 232 | pc: uint, 233 | groups: Captures, 234 | } 235 | 236 | struct Threads { 237 | which: MatchKind, 238 | queue: [Thread, ..$num_insts], 239 | sparse: [uint, ..$num_insts], 240 | size: uint, 241 | } 242 | 243 | impl Threads { 244 | fn new(which: MatchKind) -> Threads { 245 | Threads { 246 | which: which, 247 | // These unsafe blocks are used for performance reasons, as it 248 | // gives us a zero-cost initialization of a sparse set. The 249 | // trick is described in more detail here: 250 | // http://research.swtch.com/sparse 251 | // The idea here is to avoid initializing threads that never 252 | // need to be initialized, particularly for larger regexs with 253 | // a lot of instructions. 254 | queue: unsafe { ::std::mem::uninit() }, 255 | sparse: unsafe { ::std::mem::uninit() }, 256 | size: 0, 257 | } 258 | } 259 | 260 | #[inline] 261 | fn add(&mut self, pc: uint, groups: &Captures) { 262 | let t = &mut self.queue[self.size]; 263 | t.pc = pc; 264 | match self.which { 265 | Exists => {}, 266 | Location => { 267 | t.groups[0] = groups[0]; 268 | t.groups[1] = groups[1]; 269 | } 270 | Submatches => { 271 | for (slot, val) in t.groups.mut_iter().zip(groups.iter()) { 272 | *slot = *val; 273 | } 274 | } 275 | } 276 | self.sparse[pc] = self.size; 277 | self.size += 1; 278 | } 279 | 280 | #[inline] 281 | fn add_empty(&mut self, pc: uint) { 282 | self.queue[self.size].pc = pc; 283 | self.sparse[pc] = self.size; 284 | self.size += 1; 285 | } 286 | 287 | #[inline] 288 | fn contains(&self, pc: uint) -> bool { 289 | let s = self.sparse[pc]; 290 | s < self.size && self.queue[s].pc == pc 291 | } 292 | 293 | #[inline] 294 | fn empty(&mut self) { 295 | self.size = 0; 296 | } 297 | 298 | #[inline] 299 | fn pc(&self, i: uint) -> uint { 300 | self.queue[i].pc 301 | } 302 | 303 | #[inline] 304 | fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures { 305 | &'r mut self.queue[i].groups 306 | } 307 | } 308 | } 309 | 310 | ::regex::Regex { 311 | original: ~$regex, 312 | names: ~$cap_names, 313 | p: ::regex::native::Native(exec), 314 | } 315 | }) 316 | } 317 | 318 | // Generates code for the `add` method, which is responsible for adding 319 | // zero-width states to the next queue of states to visit. 
320 | fn add_insts(&self) -> @ast::Expr { 321 | let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { 322 | let nextpc = pc + 1; 323 | let body = match *inst { 324 | EmptyBegin(flags) => { 325 | let nl = '\n'; 326 | let cond = 327 | if flags & FLAG_MULTI > 0 { 328 | quote_expr!(self.cx, 329 | self.chars.is_begin() 330 | || self.chars.prev == Some($nl) 331 | ) 332 | } else { 333 | quote_expr!(self.cx, self.chars.is_begin()) 334 | }; 335 | quote_expr!(self.cx, { 336 | nlist.add_empty($pc); 337 | if $cond { self.add(nlist, $nextpc, &mut *groups) } 338 | }) 339 | } 340 | EmptyEnd(flags) => { 341 | let nl = '\n'; 342 | let cond = 343 | if flags & FLAG_MULTI > 0 { 344 | quote_expr!(self.cx, 345 | self.chars.is_end() 346 | || self.chars.cur == Some($nl) 347 | ) 348 | } else { 349 | quote_expr!(self.cx, self.chars.is_end()) 350 | }; 351 | quote_expr!(self.cx, { 352 | nlist.add_empty($pc); 353 | if $cond { self.add(nlist, $nextpc, &mut *groups) } 354 | }) 355 | } 356 | EmptyWordBoundary(flags) => { 357 | let cond = 358 | if flags & FLAG_NEGATED > 0 { 359 | quote_expr!(self.cx, !self.chars.is_word_boundary()) 360 | } else { 361 | quote_expr!(self.cx, self.chars.is_word_boundary()) 362 | }; 363 | quote_expr!(self.cx, { 364 | nlist.add_empty($pc); 365 | if $cond { self.add(nlist, $nextpc, &mut *groups) } 366 | }) 367 | } 368 | Save(slot) => { 369 | let save = quote_expr!(self.cx, { 370 | let old = groups[$slot]; 371 | groups[$slot] = Some(self.ic); 372 | self.add(nlist, $nextpc, &mut *groups); 373 | groups[$slot] = old; 374 | }); 375 | let add = quote_expr!(self.cx, { 376 | self.add(nlist, $nextpc, &mut *groups); 377 | }); 378 | // If this is saving a submatch location but we request 379 | // existence or only full match location, then we can skip 380 | // right over it every time. 381 | if slot > 1 { 382 | quote_expr!(self.cx, { 383 | nlist.add_empty($pc); 384 | match self.which { 385 | Submatches => $save, 386 | Exists | Location => $add, 387 | } 388 | }) 389 | } else { 390 | quote_expr!(self.cx, { 391 | nlist.add_empty($pc); 392 | match self.which { 393 | Submatches | Location => $save, 394 | Exists => $add, 395 | } 396 | }) 397 | } 398 | } 399 | Jump(to) => { 400 | quote_expr!(self.cx, { 401 | nlist.add_empty($pc); 402 | self.add(nlist, $to, &mut *groups); 403 | }) 404 | } 405 | Split(x, y) => { 406 | quote_expr!(self.cx, { 407 | nlist.add_empty($pc); 408 | self.add(nlist, $x, &mut *groups); 409 | self.add(nlist, $y, &mut *groups); 410 | }) 411 | } 412 | // For Match, OneChar, CharClass, Any 413 | _ => quote_expr!(self.cx, nlist.add($pc, &*groups)), 414 | }; 415 | self.arm_inst(pc, body) 416 | }).collect::>(); 417 | 418 | self.match_insts(arms) 419 | } 420 | 421 | // Generates the code for the `step` method, which processes all states 422 | // in the current queue that consume a single character. 
423 | fn step_insts(&self) -> @ast::Expr { 424 | let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { 425 | let nextpc = pc + 1; 426 | let body = match *inst { 427 | Match => { 428 | quote_expr!(self.cx, { 429 | match self.which { 430 | Exists => { 431 | return StepMatchEarlyReturn 432 | } 433 | Location => { 434 | groups[0] = caps[0]; 435 | groups[1] = caps[1]; 436 | return StepMatch 437 | } 438 | Submatches => { 439 | for (slot, val) in groups.mut_iter().zip(caps.iter()) { 440 | *slot = *val; 441 | } 442 | return StepMatch 443 | } 444 | } 445 | }) 446 | } 447 | OneChar(c, flags) => { 448 | if flags & FLAG_NOCASE > 0 { 449 | let upc = c.to_uppercase(); 450 | quote_expr!(self.cx, { 451 | let upc = self.chars.prev.map(|c| c.to_uppercase()); 452 | if upc == Some($upc) { 453 | self.add(nlist, $nextpc, caps); 454 | } 455 | }) 456 | } else { 457 | quote_expr!(self.cx, { 458 | if self.chars.prev == Some($c) { 459 | self.add(nlist, $nextpc, caps); 460 | } 461 | }) 462 | } 463 | } 464 | CharClass(ref ranges, flags) => { 465 | let negate = flags & FLAG_NEGATED > 0; 466 | let casei = flags & FLAG_NOCASE > 0; 467 | let get_char = 468 | if casei { 469 | quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase()) 470 | } else { 471 | quote_expr!(self.cx, self.chars.prev.unwrap()) 472 | }; 473 | let negcond = 474 | if negate { 475 | quote_expr!(self.cx, !found) 476 | } else { 477 | quote_expr!(self.cx, found) 478 | }; 479 | let mranges = self.match_class(casei, ranges.as_slice()); 480 | quote_expr!(self.cx, { 481 | if self.chars.prev.is_some() { 482 | let c = $get_char; 483 | let found = $mranges; 484 | if $negcond { 485 | self.add(nlist, $nextpc, caps); 486 | } 487 | } 488 | }) 489 | } 490 | Any(flags) => { 491 | if flags & FLAG_DOTNL > 0 { 492 | quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) 493 | } else { 494 | let nl = '\n'; // no char lits allowed? wtf? 495 | quote_expr!(self.cx, { 496 | if self.chars.prev != Some($nl) { 497 | self.add(nlist, $nextpc, caps) 498 | } 499 | }) 500 | } 501 | } 502 | // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split 503 | _ => quote_expr!(self.cx, {}), 504 | }; 505 | self.arm_inst(pc, body) 506 | }).collect::>(); 507 | 508 | self.match_insts(arms) 509 | } 510 | 511 | // Translates a character class into a match expression. 512 | // This avoids a binary search (and is hopefully replaced by a jump 513 | // table). 514 | fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> @ast::Expr { 515 | let mut arms = ranges.iter().map(|&(mut start, mut end)| { 516 | if casei { 517 | start = start.to_uppercase(); 518 | end = end.to_uppercase(); 519 | } 520 | ast::Arm { 521 | attrs: vec!(), 522 | pats: vec!(@ast::Pat{ 523 | id: ast::DUMMY_NODE_ID, 524 | span: self.sp, 525 | node: ast::PatRange(quote_expr!(self.cx, $start), 526 | quote_expr!(self.cx, $end)), 527 | }), 528 | guard: None, 529 | body: quote_expr!(self.cx, true), 530 | } 531 | }).collect::>(); 532 | 533 | arms.push(self.wild_arm_expr(quote_expr!(self.cx, false))); 534 | 535 | let match_on = quote_expr!(self.cx, c); 536 | self.dummy_expr(ast::ExprMatch(match_on, arms)) 537 | } 538 | 539 | // Generates code for checking a literal prefix of the search string. 540 | // The code is only generated if the regex *has* a literal prefix. 541 | // Otherwise, a no-op is returned. 
542 | fn check_prefix(&self) -> @ast::Expr { 543 | if self.prog.prefix.len() == 0 { 544 | quote_expr!(self.cx, {}) 545 | } else { 546 | quote_expr!(self.cx, 547 | if clist.size == 0 { 548 | let haystack = self.input.as_bytes().slice_from(self.ic); 549 | match find_prefix(prefix_bytes, haystack) { 550 | None => break, 551 | Some(i) => { 552 | self.ic += i; 553 | next_ic = self.chars.set(self.ic); 554 | } 555 | } 556 | } 557 | ) 558 | } 559 | } 560 | 561 | // Builds a `match pc { ... }` expression from a list of arms, specifically 562 | // for matching the current program counter with an instruction. 563 | // A wild-card arm is automatically added that executes a no-op. It will 564 | // never be used, but is added to satisfy the compiler complaining about 565 | // non-exhaustive patterns. 566 | fn match_insts(&self, mut arms: Vec) -> @ast::Expr { 567 | let mat_pc = quote_expr!(self.cx, pc); 568 | arms.push(self.wild_arm_expr(quote_expr!(self.cx, {}))); 569 | self.dummy_expr(ast::ExprMatch(mat_pc, arms)) 570 | } 571 | 572 | // Creates a match arm for the instruction at `pc` with the expression 573 | // `body`. 574 | fn arm_inst(&self, pc: uint, body: @ast::Expr) -> ast::Arm { 575 | ast::Arm { 576 | attrs: vec!(), 577 | pats: vec!(@ast::Pat{ 578 | id: ast::DUMMY_NODE_ID, 579 | span: self.sp, 580 | node: ast::PatLit(quote_expr!(self.cx, $pc)), 581 | }), 582 | guard: None, 583 | body: body, 584 | } 585 | } 586 | 587 | // Creates a wild-card match arm with the expression `body`. 588 | fn wild_arm_expr(&self, body: @ast::Expr) -> ast::Arm { 589 | ast::Arm { 590 | attrs: vec!(), 591 | pats: vec!(@ast::Pat{ 592 | id: ast::DUMMY_NODE_ID, 593 | span: self.sp, 594 | node: ast::PatWild, 595 | }), 596 | guard: None, 597 | body: body, 598 | } 599 | } 600 | 601 | // Builds a `[a, b, .., len]` expression where each element is the result 602 | // of executing `to_expr`. 603 | fn vec_from_fn(&self, len: uint, to_expr: |&ExtCtxt| -> @ast::Expr) 604 | -> @ast::Expr { 605 | self.vec_expr(Vec::from_elem(len, ()).as_slice(), 606 | |cx, _| to_expr(cx)) 607 | } 608 | 609 | // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr` 610 | // on each element in `xs`. 611 | fn vec_expr(&self, xs: &[T], to_expr: |&ExtCtxt, &T| -> @ast::Expr) 612 | -> @ast::Expr { 613 | let mut exprs = vec!(); 614 | for x in xs.iter() { 615 | exprs.push(to_expr(self.cx, x)) 616 | } 617 | let vec_exprs = self.dummy_expr(ast::ExprVec(exprs)); 618 | quote_expr!(self.cx, $vec_exprs) 619 | } 620 | 621 | // Creates an expression with a dummy node ID given an underlying 622 | // `ast::Expr_`. 623 | fn dummy_expr(&self, e: ast::Expr_) -> @ast::Expr { 624 | @ast::Expr { 625 | id: ast::DUMMY_NODE_ID, 626 | node: e, 627 | span: self.sp, 628 | } 629 | } 630 | } 631 | 632 | // This trait is defined in the quote module in the syntax crate, but I 633 | // don't think it's exported. 634 | // Interestingly, quote_expr! only requires that a 'to_tokens' method be 635 | // defined rather than satisfying a particular trait. 
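// (These to_tokens impls are what allow plain `char` values such as `$nl`,
// `$c` and the `$start`/`$end` class bounds above to be interpolated into
// `quote_expr!` fragments as literal tokens; `bool` is given the same
// treatment just below.)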
636 | #[doc(hidden)] 637 | trait ToTokens { 638 | fn to_tokens(&self, cx: &ExtCtxt) -> Vec; 639 | } 640 | 641 | impl ToTokens for char { 642 | fn to_tokens(&self, _: &ExtCtxt) -> Vec { 643 | vec!(ast::TTTok(codemap::DUMMY_SP, token::LIT_CHAR((*self) as u32))) 644 | } 645 | } 646 | 647 | impl ToTokens for bool { 648 | fn to_tokens(&self, _: &ExtCtxt) -> Vec { 649 | let ident = token::IDENT(token::str_to_ident(self.to_str()), false); 650 | vec!(ast::TTTok(codemap::DUMMY_SP, ident)) 651 | } 652 | } 653 | 654 | /// Looks for a single string literal and returns it. 655 | /// Otherwise, logs an error with cx.span_err and returns None. 656 | fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<~str> { 657 | let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), 658 | Vec::from_slice(tts)); 659 | let entry = cx.expand_expr(parser.parse_expr()); 660 | let regex = match entry.node { 661 | ast::ExprLit(lit) => { 662 | match lit.node { 663 | ast::LitStr(ref s, _) => s.to_str(), 664 | _ => { 665 | cx.span_err(entry.span, format!( 666 | "expected string literal but got `{}`", 667 | pprust::lit_to_str(lit))); 668 | return None 669 | } 670 | } 671 | } 672 | _ => { 673 | cx.span_err(entry.span, format!( 674 | "expected string literal but got `{}`", 675 | pprust::expr_to_str(entry))); 676 | return None 677 | } 678 | }; 679 | if !parser.eat(&token::EOF) { 680 | cx.span_err(parser.span, "only one string literal allowed"); 681 | return None; 682 | } 683 | Some(regex) 684 | } 685 | -------------------------------------------------------------------------------- /src/test/bench.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | use rand::{Rng, task_rng}; 12 | use stdtest::Bencher; 13 | use std::str; 14 | use regex::{Regex, NoExpand}; 15 | 16 | fn bench_assert_match(b: &mut Bencher, re: Regex, text: &str) { 17 | b.iter(|| if !re.is_match(text) { fail!("no match") }); 18 | } 19 | 20 | #[bench] 21 | fn no_exponential(b: &mut Bencher) { 22 | let n = 100; 23 | let re = Regex::new("a?".repeat(n) + "a".repeat(n)).unwrap(); 24 | let text = "a".repeat(n); 25 | bench_assert_match(b, re, text); 26 | } 27 | 28 | #[bench] 29 | fn literal(b: &mut Bencher) { 30 | let re = regex!("y"); 31 | let text = "x".repeat(50) + "y"; 32 | bench_assert_match(b, re, text); 33 | } 34 | 35 | #[bench] 36 | fn not_literal(b: &mut Bencher) { 37 | let re = regex!(".y"); 38 | let text = "x".repeat(50) + "y"; 39 | bench_assert_match(b, re, text); 40 | } 41 | 42 | #[bench] 43 | fn match_class(b: &mut Bencher) { 44 | let re = regex!("[abcdw]"); 45 | let text = "xxxx".repeat(20) + "w"; 46 | bench_assert_match(b, re, text); 47 | } 48 | 49 | #[bench] 50 | fn match_class_in_range(b: &mut Bencher) { 51 | // 'b' is between 'a' and 'c', so the class range checking doesn't help. 52 | let re = regex!("[ac]"); 53 | let text = "bbbb".repeat(20) + "c"; 54 | bench_assert_match(b, re, text); 55 | } 56 | 57 | #[bench] 58 | fn replace_all(b: &mut Bencher) { 59 | let re = regex!("[cjrw]"); 60 | let text = "abcdefghijklmnopqrstuvwxyz"; 61 | // FIXME: This isn't using the $name expand stuff. 
62 | // It's possible RE2/Go is using it, but currently, the expand in this 63 | // crate is actually compiling a regex, so it's incredibly slow. 64 | b.iter(|| re.replace_all(text, NoExpand(""))); 65 | } 66 | 67 | #[bench] 68 | fn anchored_literal_short_non_match(b: &mut Bencher) { 69 | let re = regex!("^zbc(d|e)"); 70 | let text = "abcdefghijklmnopqrstuvwxyz"; 71 | b.iter(|| re.is_match(text)); 72 | } 73 | 74 | #[bench] 75 | fn anchored_literal_long_non_match(b: &mut Bencher) { 76 | let re = regex!("^zbc(d|e)"); 77 | let text = "abcdefghijklmnopqrstuvwxyz".repeat(15); 78 | b.iter(|| re.is_match(text)); 79 | } 80 | 81 | #[bench] 82 | fn anchored_literal_short_match(b: &mut Bencher) { 83 | let re = regex!("^.bc(d|e)"); 84 | let text = "abcdefghijklmnopqrstuvwxyz"; 85 | b.iter(|| re.is_match(text)); 86 | } 87 | 88 | #[bench] 89 | fn anchored_literal_long_match(b: &mut Bencher) { 90 | let re = regex!("^.bc(d|e)"); 91 | let text = "abcdefghijklmnopqrstuvwxyz".repeat(15); 92 | b.iter(|| re.is_match(text)); 93 | } 94 | 95 | #[bench] 96 | fn one_pass_short_a(b: &mut Bencher) { 97 | let re = regex!("^.bc(d|e)*$"); 98 | let text = "abcddddddeeeededd"; 99 | b.iter(|| re.is_match(text)); 100 | } 101 | 102 | #[bench] 103 | fn one_pass_short_a_not(b: &mut Bencher) { 104 | let re = regex!(".bc(d|e)*$"); 105 | let text = "abcddddddeeeededd"; 106 | b.iter(|| re.is_match(text)); 107 | } 108 | 109 | #[bench] 110 | fn one_pass_short_b(b: &mut Bencher) { 111 | let re = regex!("^.bc(?:d|e)*$"); 112 | let text = "abcddddddeeeededd"; 113 | b.iter(|| re.is_match(text)); 114 | } 115 | 116 | #[bench] 117 | fn one_pass_short_b_not(b: &mut Bencher) { 118 | let re = regex!(".bc(?:d|e)*$"); 119 | let text = "abcddddddeeeededd"; 120 | b.iter(|| re.is_match(text)); 121 | } 122 | 123 | #[bench] 124 | fn one_pass_long_prefix(b: &mut Bencher) { 125 | let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$"); 126 | let text = "abcdefghijklmnopqrstuvwxyz"; 127 | b.iter(|| re.is_match(text)); 128 | } 129 | 130 | #[bench] 131 | fn one_pass_long_prefix_not(b: &mut Bencher) { 132 | let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$"); 133 | let text = "abcdefghijklmnopqrstuvwxyz"; 134 | b.iter(|| re.is_match(text)); 135 | } 136 | 137 | macro_rules! 
throughput( 138 | ($name:ident, $regex:expr, $size:expr) => ( 139 | #[bench] 140 | fn $name(b: &mut Bencher) { 141 | let text = gen_text($size); 142 | b.bytes = $size; 143 | b.iter(|| if $regex.is_match(text) { fail!("match") }); 144 | } 145 | ); 146 | ) 147 | 148 | fn easy0() -> Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 149 | fn easy1() -> Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") } 150 | fn medium() -> Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 151 | fn hard() -> Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 152 | 153 | fn gen_text(n: uint) -> ~str { 154 | let mut rng = task_rng(); 155 | let mut bytes = rng.gen_ascii_str(n).into_bytes(); 156 | for (i, b) in bytes.mut_iter().enumerate() { 157 | if i % 20 == 0 { 158 | *b = '\n' as u8 159 | } 160 | } 161 | str::from_utf8(bytes).unwrap().to_owned() 162 | } 163 | 164 | throughput!(easy0_32, easy0(), 32) 165 | throughput!(easy0_1K, easy0(), 1<<10) 166 | throughput!(easy0_32K, easy0(), 32<<10) 167 | 168 | throughput!(easy1_32, easy1(), 32) 169 | throughput!(easy1_1K, easy1(), 1<<10) 170 | throughput!(easy1_32K, easy1(), 32<<10) 171 | 172 | throughput!(medium_32, medium(), 32) 173 | throughput!(medium_1K, medium(), 1<<10) 174 | throughput!(medium_32K,medium(), 32<<10) 175 | 176 | throughput!(hard_32, hard(), 32) 177 | throughput!(hard_1K, hard(), 1<<10) 178 | throughput!(hard_32K,hard(), 32<<10) 179 | 180 | -------------------------------------------------------------------------------- /src/test/matches.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // ignore-tidy-linelength 12 | 13 | // DO NOT EDIT. Automatically generated by 'src/etc/regex-match-tests' 14 | // on 2014-04-23 01:33:36.539280. 
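// Each `mat!` line below gives a test name, a pattern, a haystack and the
// expected capture group spans as byte offsets, with `None` marking a group
// that did not take part in the match. The macro itself lives in
// src/test/tests.rs; roughly, an entry such as
//     mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18)))
// expands to a #[test] that compiles the pattern with regex!, collects
// captures.iter_pos() and compares the leading spans against the expected
// values.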
15 | 16 | // Tests from basic.dat 17 | mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18))) 18 | mat!(match_basic_4, r"a...b", r"abababbb", Some((2, 7))) 19 | mat!(match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8))) 20 | mat!(match_basic_6, r"\)", r"()", Some((1, 2))) 21 | mat!(match_basic_7, r"a]", r"a]a", Some((0, 2))) 22 | mat!(match_basic_9, r"\}", r"}", Some((0, 1))) 23 | mat!(match_basic_10, r"\]", r"]", Some((0, 1))) 24 | mat!(match_basic_12, r"]", r"]", Some((0, 1))) 25 | mat!(match_basic_15, r"^a", r"ax", Some((0, 1))) 26 | mat!(match_basic_16, r"\^a", r"a^a", Some((1, 3))) 27 | mat!(match_basic_17, r"a\^", r"a^", Some((0, 2))) 28 | mat!(match_basic_18, r"a$", r"aa", Some((1, 2))) 29 | mat!(match_basic_19, r"a\$", r"a$", Some((0, 2))) 30 | mat!(match_basic_20, r"^$", r"", Some((0, 0))) 31 | mat!(match_basic_21, r"$^", r"", Some((0, 0))) 32 | mat!(match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2))) 33 | mat!(match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1))) 34 | mat!(match_basic_24, r"(..)*(...)*", r"a", Some((0, 0))) 35 | mat!(match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4))) 36 | mat!(match_basic_26, r"(ab|a)(bc|c)", r"abc", Some((0, 3)), Some((0, 2)), Some((2, 3))) 37 | mat!(match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2))) 38 | mat!(match_basic_28, r"a{0}b", r"ab", Some((1, 2))) 39 | mat!(match_basic_29, r"(a*)(b?)(b+)b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7))) 40 | mat!(match_basic_30, r"(a*)(b{0,1})(b{1,})b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7))) 41 | mat!(match_basic_32, r"((a|a)|a)", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1))) 42 | mat!(match_basic_33, r"(a*)(a|aa)", r"aaaa", Some((0, 4)), Some((0, 3)), Some((3, 4))) 43 | mat!(match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4))) 44 | mat!(match_basic_35, r"a(b)|c(d)|a(e)f", r"aef", Some((0, 3)), None, None, Some((1, 2))) 45 | mat!(match_basic_36, r"(a|b)?.*", r"b", Some((0, 1)), Some((0, 1))) 46 | mat!(match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1))) 47 | mat!(match_basic_38, r"(a|b)c|a(b|c)", r"ab", Some((0, 2)), None, Some((1, 2))) 48 | mat!(match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2))) 49 | mat!(match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2))) 50 | mat!(match_basic_41, r"(.a|.b).*|.*(.a|.b)", r"xa", Some((0, 2)), Some((0, 2))) 51 | mat!(match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2))) 52 | mat!(match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2))) 53 | mat!(match_basic_44, r"ab|abab", r"abbabab", Some((0, 2))) 54 | mat!(match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8))) 55 | mat!(match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9))) 56 | mat!(match_basic_47, r"(aa|aaa)*|(a|aaaaa)", r"aa", Some((0, 2)), Some((0, 2))) 57 | mat!(match_basic_48, r"(a.|.a.)*|(a|.a...)", r"aa", Some((0, 2)), Some((0, 2))) 58 | mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3))) 59 | mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4))) 60 | mat!(match_basic_51, r"(?i)(Ab|cD)*", r"aBcD", Some((0, 4)), Some((2, 4))) 61 | mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3))) 62 | mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3))) 63 | mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4))) 64 | mat!(match_basic_55, r":::1:::0:|:::1:1:0:", r":::0:::1:::1:::0:", Some((8, 17))) 65 | mat!(match_basic_56, r":::1:::0:|:::1:1:1:", r":::0:::1:::1:::0:", Some((8, 17))) 66 | 
mat!(match_basic_57, r"[[:upper:]]", r"A", Some((0, 1))) 67 | mat!(match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3))) 68 | mat!(match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3))) 69 | mat!(match_basic_65, r" 70 | ", r" 71 | ", Some((0, 1))) 72 | mat!(match_basic_66, r" 73 | ", r" 74 | ", Some((0, 1))) 75 | mat!(match_basic_67, r"[^a]", r" 76 | ", Some((0, 1))) 77 | mat!(match_basic_68, r" 78 | a", r" 79 | a", Some((0, 2))) 80 | mat!(match_basic_69, r"(a)(b)(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((2, 3))) 81 | mat!(match_basic_70, r"xxx", r"xxx", Some((0, 3))) 82 | mat!(match_basic_71, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 6,", Some((0, 6))) 83 | mat!(match_basic_72, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"2/7", Some((0, 3))) 84 | mat!(match_basic_73, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 1,Feb 6", Some((5, 11))) 85 | mat!(match_basic_74, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", r"x", Some((0, 1)), Some((0, 1)), Some((0, 1))) 86 | mat!(match_basic_75, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", r"xx", Some((0, 2)), Some((1, 2)), Some((1, 2))) 87 | mat!(match_basic_76, r"a?(ab|ba)*", r"ababababababababababababababababababababababababababababababababababababababababa", Some((0, 81)), Some((79, 81))) 88 | mat!(match_basic_77, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabbbbaa", Some((18, 25))) 89 | mat!(match_basic_78, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabaa", Some((18, 22))) 90 | mat!(match_basic_79, r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", r"baaabbbabac", Some((7, 11))) 91 | mat!(match_basic_80, r".*", r"", Some((0, 2))) 92 | mat!(match_basic_81, r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", Some((53, 57))) 93 | mat!(match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10))) 94 | mat!(match_basic_84, r"^", r"", Some((0, 0))) 95 | mat!(match_basic_85, r"$", r"", Some((0, 0))) 96 | mat!(match_basic_86, r"^$", r"", Some((0, 0))) 97 | mat!(match_basic_87, r"^a$", r"a", Some((0, 1))) 98 | mat!(match_basic_88, r"abc", r"abc", Some((0, 3))) 99 | mat!(match_basic_89, r"abc", r"xabcy", Some((1, 4))) 100 | mat!(match_basic_90, r"abc", r"ababc", Some((2, 5))) 101 | mat!(match_basic_91, r"ab*c", r"abc", Some((0, 3))) 102 | mat!(match_basic_92, r"ab*bc", r"abc", Some((0, 3))) 103 | mat!(match_basic_93, r"ab*bc", r"abbc", Some((0, 4))) 104 | mat!(match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6))) 105 | mat!(match_basic_95, r"ab+bc", r"abbc", Some((0, 4))) 106 | mat!(match_basic_96, r"ab+bc", r"abbbbc", Some((0, 6))) 107 | mat!(match_basic_97, r"ab?bc", r"abbc", Some((0, 4))) 108 | mat!(match_basic_98, r"ab?bc", r"abc", Some((0, 3))) 109 | mat!(match_basic_99, r"ab?c", r"abc", Some((0, 3))) 110 | mat!(match_basic_100, r"^abc$", r"abc", Some((0, 3))) 111 | mat!(match_basic_101, r"^abc", r"abcc", Some((0, 3))) 112 | mat!(match_basic_102, r"abc$", r"aabc", Some((1, 4))) 113 | mat!(match_basic_103, r"^", r"abc", Some((0, 0))) 114 | mat!(match_basic_104, r"$", r"abc", Some((3, 3))) 115 | mat!(match_basic_105, r"a.c", r"abc", Some((0, 3))) 116 | mat!(match_basic_106, r"a.c", r"axc", Some((0, 3))) 117 | mat!(match_basic_107, r"a.*c", r"axyzc", Some((0, 5))) 118 | mat!(match_basic_108, r"a[bc]d", r"abd", Some((0, 3))) 119 | mat!(match_basic_109, r"a[b-d]e", r"ace", Some((0, 3))) 120 | mat!(match_basic_110, r"a[b-d]", r"aac", Some((1, 
3))) 121 | mat!(match_basic_111, r"a[-b]", r"a-", Some((0, 2))) 122 | mat!(match_basic_112, r"a[b-]", r"a-", Some((0, 2))) 123 | mat!(match_basic_113, r"a]", r"a]", Some((0, 2))) 124 | mat!(match_basic_114, r"a[]]b", r"a]b", Some((0, 3))) 125 | mat!(match_basic_115, r"a[^bc]d", r"aed", Some((0, 3))) 126 | mat!(match_basic_116, r"a[^-b]c", r"adc", Some((0, 3))) 127 | mat!(match_basic_117, r"a[^]b]c", r"adc", Some((0, 3))) 128 | mat!(match_basic_118, r"ab|cd", r"abc", Some((0, 2))) 129 | mat!(match_basic_119, r"ab|cd", r"abcd", Some((0, 2))) 130 | mat!(match_basic_120, r"a\(b", r"a(b", Some((0, 3))) 131 | mat!(match_basic_121, r"a\(*b", r"ab", Some((0, 2))) 132 | mat!(match_basic_122, r"a\(*b", r"a((b", Some((0, 4))) 133 | mat!(match_basic_123, r"((a))", r"abc", Some((0, 1)), Some((0, 1)), Some((0, 1))) 134 | mat!(match_basic_124, r"(a)b(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((2, 3))) 135 | mat!(match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7))) 136 | mat!(match_basic_126, r"a*", r"aaa", Some((0, 3))) 137 | mat!(match_basic_128, r"(a*)*", r"-", Some((0, 0)), None) 138 | mat!(match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0))) 139 | mat!(match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None) 140 | mat!(match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2))) 141 | mat!(match_basic_133, r"(a+|b)+", r"ab", Some((0, 2)), Some((1, 2))) 142 | mat!(match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1))) 143 | mat!(match_basic_135, r"[^ab]*", r"cde", Some((0, 3))) 144 | mat!(match_basic_137, r"(^)*", r"-", Some((0, 0)), None) 145 | mat!(match_basic_138, r"a*", r"", Some((0, 0))) 146 | mat!(match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5))) 147 | mat!(match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1))) 148 | mat!(match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1))) 149 | mat!(match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1))) 150 | mat!(match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None) 151 | mat!(match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7))) 152 | mat!(match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3))) 153 | mat!(match_basic_147, r"ab*", r"xayabbbz", Some((1, 2))) 154 | mat!(match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4))) 155 | mat!(match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3))) 156 | mat!(match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2))) 157 | mat!(match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1))) 158 | mat!(match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3))) 159 | mat!(match_basic_153, r"a([bc]*)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4))) 160 | mat!(match_basic_154, r"a([bc]+)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4))) 161 | mat!(match_basic_155, r"a([bc]*)(c+d)", r"abcd", Some((0, 4)), Some((1, 2)), Some((2, 4))) 162 | mat!(match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7))) 163 | mat!(match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2))) 164 | mat!(match_basic_158, r"((a)(b)c)(d)", r"abcd", Some((0, 4)), Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((3, 4))) 165 | mat!(match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5))) 166 | mat!(match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3))) 167 | mat!(match_basic_161, r"(bc+d$|ef*g.|h?i(j|k))", r"effgz", Some((0, 5)), Some((0, 5))) 168 | mat!(match_basic_162, r"(bc+d$|ef*g.|h?i(j|k))", r"ij", Some((0, 2)), Some((0, 2)), Some((1, 2))) 169 | mat!(match_basic_163, r"(bc+d$|ef*g.|h?i(j|k))", 
r"reffgz", Some((1, 6)), Some((1, 6))) 170 | mat!(match_basic_164, r"(((((((((a)))))))))", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1))) 171 | mat!(match_basic_165, r"multiple words", r"multiple words yeah", Some((0, 14))) 172 | mat!(match_basic_166, r"(.*)c(.*)", r"abcde", Some((0, 5)), Some((0, 2)), Some((3, 5))) 173 | mat!(match_basic_167, r"abcd", r"abcd", Some((0, 4))) 174 | mat!(match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3))) 175 | mat!(match_basic_169, r"a[-]?c", r"ac", Some((0, 3))) 176 | mat!(match_basic_170, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qaddafi", Some((0, 15)), None, Some((10, 12))) 177 | mat!(match_basic_171, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mo'ammar Gadhafi", Some((0, 16)), None, Some((11, 13))) 178 | mat!(match_basic_172, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Kaddafi", Some((0, 15)), None, Some((10, 12))) 179 | mat!(match_basic_173, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qadhafi", Some((0, 15)), None, Some((10, 12))) 180 | mat!(match_basic_174, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gadafi", Some((0, 14)), None, Some((10, 11))) 181 | mat!(match_basic_175, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadafi", Some((0, 15)), None, Some((11, 12))) 182 | mat!(match_basic_176, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moamar Gaddafi", Some((0, 14)), None, Some((9, 11))) 183 | mat!(match_basic_177, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadhdhafi", Some((0, 18)), None, Some((13, 15))) 184 | mat!(match_basic_178, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Khaddafi", Some((0, 16)), None, Some((11, 13))) 185 | mat!(match_basic_179, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafy", Some((0, 16)), None, Some((11, 13))) 186 | mat!(match_basic_180, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghadafi", Some((0, 15)), None, Some((11, 12))) 187 | mat!(match_basic_181, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafi", Some((0, 16)), None, Some((11, 13))) 188 | mat!(match_basic_182, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muamar Kaddafi", Some((0, 14)), None, Some((9, 11))) 189 | mat!(match_basic_183, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Quathafi", Some((0, 16)), None, Some((11, 13))) 190 | mat!(match_basic_184, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gheddafi", Some((0, 16)), None, Some((11, 13))) 191 | mat!(match_basic_185, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Khadafy", Some((0, 15)), None, Some((11, 12))) 192 | mat!(match_basic_186, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Qudhafi", Some((0, 15)), None, Some((10, 12))) 193 | mat!(match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4))) 194 | mat!(match_basic_188, r"^.+$", r"vivi", Some((0, 4))) 195 | mat!(match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4))) 196 | mat!(match_basic_190, r"^([^!.]+).att.com!(.+)$", r"gryphon.att.com!eby", 
Some((0, 19)), Some((0, 7)), Some((16, 19))) 197 | mat!(match_basic_191, r"^([^!]+!)?([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3))) 198 | mat!(match_basic_192, r"^([^!]+!)?([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) 199 | mat!(match_basic_193, r"^([^!]+!)?([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) 200 | mat!(match_basic_194, r"^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), Some((4, 8)), Some((8, 11))) 201 | mat!(match_basic_195, r"((foo)|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), None, Some((0, 3))) 202 | mat!(match_basic_196, r"((foo)|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), None, Some((4, 7))) 203 | mat!(match_basic_197, r"((foo)|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) 204 | mat!(match_basic_198, r"((foo)|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3))) 205 | mat!(match_basic_199, r"((foo)|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7))) 206 | mat!(match_basic_200, r"((foo)|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) 207 | mat!(match_basic_201, r"(foo|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) 208 | mat!(match_basic_202, r"(foo|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), Some((4, 7))) 209 | mat!(match_basic_203, r"(foo|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3))) 210 | mat!(match_basic_204, r"(foo|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3))) 211 | mat!(match_basic_205, r"(foo|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7))) 212 | mat!(match_basic_206, r"(foo|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3))) 213 | mat!(match_basic_207, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) 214 | mat!(match_basic_208, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3))) 215 | mat!(match_basic_209, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) 216 | mat!(match_basic_210, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) 217 | mat!(match_basic_211, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) 218 | mat!(match_basic_212, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bas", Some((0, 3)), Some((0, 3)), None, Some((0, 3))) 219 | mat!(match_basic_213, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bar!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7))) 220 | mat!(match_basic_214, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) 221 | mat!(match_basic_215, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7))) 222 | mat!(match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4))) 223 | mat!(match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4))) 224 | mat!(match_basic_218, r"\\XXX", r"\XXX", Some((0, 4))) 225 | mat!(match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4))) 226 | mat!(match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4))) 227 | mat!(match_basic_221, r"\\000", r"\000", Some((0, 4))) 228 | 229 | // Tests from nullsubexpr.dat 230 | mat!(match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1))) 231 | mat!(match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None) 232 | mat!(match_nullsubexpr_6, r"(a*)*", r"aaaaaa", 
Some((0, 6)), Some((0, 6))) 233 | mat!(match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) 234 | mat!(match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1))) 235 | mat!(match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0))) 236 | mat!(match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) 237 | mat!(match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) 238 | mat!(match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1))) 239 | mat!(match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0))) 240 | mat!(match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) 241 | mat!(match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) 242 | mat!(match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1))) 243 | mat!(match_nullsubexpr_17, r"(a+)+", r"x", None) 244 | mat!(match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) 245 | mat!(match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) 246 | mat!(match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1))) 247 | mat!(match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None) 248 | mat!(match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) 249 | mat!(match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) 250 | mat!(match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1))) 251 | mat!(match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0))) 252 | mat!(match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) 253 | mat!(match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) 254 | mat!(match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1))) 255 | mat!(match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None) 256 | mat!(match_nullsubexpr_33, r"([^b]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) 257 | mat!(match_nullsubexpr_34, r"([^b]*)*", r"aaaaaab", Some((0, 6)), Some((0, 6))) 258 | mat!(match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1))) 259 | mat!(match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) 260 | mat!(match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6))) 261 | mat!(match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6))) 262 | mat!(match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1))) 263 | mat!(match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))) 264 | mat!(match_nullsubexpr_41, r"([ab]*)*", r"aaaabcde", Some((0, 5)), Some((0, 5))) 265 | mat!(match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1))) 266 | mat!(match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))) 267 | mat!(match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None) 268 | mat!(match_nullsubexpr_46, r"([^ab]*)*", r"ccccxx", Some((0, 6)), Some((0, 6))) 269 | mat!(match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None) 270 | mat!(match_nullsubexpr_50, r"((z)+|a)*", r"zabcde", Some((0, 2)), Some((1, 2))) 271 | mat!(match_nullsubexpr_69, r"(a*)*(x)", r"x", Some((0, 1)), None, Some((0, 1))) 272 | mat!(match_nullsubexpr_70, r"(a*)*(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2))) 273 | mat!(match_nullsubexpr_71, r"(a*)*(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2))) 274 | mat!(match_nullsubexpr_73, r"(a*)+(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1))) 275 | mat!(match_nullsubexpr_74, r"(a*)+(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2))) 276 | 
mat!(match_nullsubexpr_75, r"(a*)+(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2))) 277 | mat!(match_nullsubexpr_77, r"(a*){2}(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1))) 278 | mat!(match_nullsubexpr_78, r"(a*){2}(x)", r"ax", Some((0, 2)), Some((1, 1)), Some((1, 2))) 279 | mat!(match_nullsubexpr_79, r"(a*){2}(x)", r"axa", Some((0, 2)), Some((1, 1)), Some((1, 2))) 280 | 281 | // Tests from repetition.dat 282 | mat!(match_repetition_10, r"((..)|(.))", r"", None) 283 | mat!(match_repetition_11, r"((..)|(.))((..)|(.))", r"", None) 284 | mat!(match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None) 285 | mat!(match_repetition_14, r"((..)|(.)){1}", r"", None) 286 | mat!(match_repetition_15, r"((..)|(.)){2}", r"", None) 287 | mat!(match_repetition_16, r"((..)|(.)){3}", r"", None) 288 | mat!(match_repetition_18, r"((..)|(.))*", r"", Some((0, 0))) 289 | mat!(match_repetition_20, r"((..)|(.))", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) 290 | mat!(match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None) 291 | mat!(match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None) 292 | mat!(match_repetition_24, r"((..)|(.)){1}", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) 293 | mat!(match_repetition_25, r"((..)|(.)){2}", r"a", None) 294 | mat!(match_repetition_26, r"((..)|(.)){3}", r"a", None) 295 | mat!(match_repetition_28, r"((..)|(.))*", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) 296 | mat!(match_repetition_30, r"((..)|(.))", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 297 | mat!(match_repetition_31, r"((..)|(.))((..)|(.))", r"aa", Some((0, 2)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2))) 298 | mat!(match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None) 299 | mat!(match_repetition_34, r"((..)|(.)){1}", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 300 | mat!(match_repetition_35, r"((..)|(.)){2}", r"aa", Some((0, 2)), Some((1, 2)), None, Some((1, 2))) 301 | mat!(match_repetition_36, r"((..)|(.)){3}", r"aa", None) 302 | mat!(match_repetition_38, r"((..)|(.))*", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 303 | mat!(match_repetition_40, r"((..)|(.))", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 304 | mat!(match_repetition_41, r"((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3))) 305 | mat!(match_repetition_42, r"((..)|(.))((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2)), Some((2, 3)), None, Some((2, 3))) 306 | mat!(match_repetition_44, r"((..)|(.)){1}", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 307 | mat!(match_repetition_46, r"((..)|(.)){2}", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3))) 308 | mat!(match_repetition_47, r"((..)|(.)){3}", r"aaa", Some((0, 3)), Some((2, 3)), None, Some((2, 3))) 309 | mat!(match_repetition_50, r"((..)|(.))*", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3))) 310 | mat!(match_repetition_52, r"((..)|(.))", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 311 | mat!(match_repetition_53, r"((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) 312 | mat!(match_repetition_54, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3)), Some((3, 4)), None, Some((3, 4))) 313 | mat!(match_repetition_56, r"((..)|(.)){1}", r"aaaa", Some((0, 2)), 
Some((0, 2)), Some((0, 2)), None) 314 | mat!(match_repetition_57, r"((..)|(.)){2}", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) 315 | mat!(match_repetition_59, r"((..)|(.)){3}", r"aaaa", Some((0, 4)), Some((3, 4)), Some((0, 2)), Some((3, 4))) 316 | mat!(match_repetition_61, r"((..)|(.))*", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) 317 | mat!(match_repetition_63, r"((..)|(.))", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 318 | mat!(match_repetition_64, r"((..)|(.))((..)|(.))", r"aaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) 319 | mat!(match_repetition_65, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaa", Some((0, 5)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 5)), None, Some((4, 5))) 320 | mat!(match_repetition_67, r"((..)|(.)){1}", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 321 | mat!(match_repetition_68, r"((..)|(.)){2}", r"aaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) 322 | mat!(match_repetition_70, r"((..)|(.)){3}", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5))) 323 | mat!(match_repetition_73, r"((..)|(.))*", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5))) 324 | mat!(match_repetition_75, r"((..)|(.))", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 325 | mat!(match_repetition_76, r"((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) 326 | mat!(match_repetition_77, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 6)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 6)), Some((4, 6)), None) 327 | mat!(match_repetition_79, r"((..)|(.)){1}", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 328 | mat!(match_repetition_80, r"((..)|(.)){2}", r"aaaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) 329 | mat!(match_repetition_81, r"((..)|(.)){3}", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None) 330 | mat!(match_repetition_83, r"((..)|(.))*", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None) 331 | mat!(match_repetition_90, r"X(.?){0,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 332 | mat!(match_repetition_91, r"X(.?){1,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 333 | mat!(match_repetition_92, r"X(.?){2,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 334 | mat!(match_repetition_93, r"X(.?){3,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 335 | mat!(match_repetition_94, r"X(.?){4,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 336 | mat!(match_repetition_95, r"X(.?){5,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 337 | mat!(match_repetition_96, r"X(.?){6,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 338 | mat!(match_repetition_97, r"X(.?){7,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 339 | mat!(match_repetition_98, r"X(.?){8,}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 340 | mat!(match_repetition_100, r"X(.?){0,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 341 | mat!(match_repetition_102, r"X(.?){1,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 342 | mat!(match_repetition_104, r"X(.?){2,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 343 | mat!(match_repetition_106, r"X(.?){3,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 344 | mat!(match_repetition_108, r"X(.?){4,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 345 | mat!(match_repetition_110, r"X(.?){5,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 346 | mat!(match_repetition_112, r"X(.?){6,8}Y", r"X1234567Y", 
Some((0, 9)), Some((8, 8))) 347 | mat!(match_repetition_114, r"X(.?){7,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 348 | mat!(match_repetition_115, r"X(.?){8,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 349 | mat!(match_repetition_126, r"(a|ab|c|bcd){0,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 350 | mat!(match_repetition_127, r"(a|ab|c|bcd){1,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 351 | mat!(match_repetition_128, r"(a|ab|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) 352 | mat!(match_repetition_129, r"(a|ab|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) 353 | mat!(match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None) 354 | mat!(match_repetition_131, r"(a|ab|c|bcd){0,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 355 | mat!(match_repetition_132, r"(a|ab|c|bcd){1,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 356 | mat!(match_repetition_133, r"(a|ab|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) 357 | mat!(match_repetition_134, r"(a|ab|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) 358 | mat!(match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None) 359 | mat!(match_repetition_136, r"(a|ab|c|bcd)*(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 360 | mat!(match_repetition_137, r"(a|ab|c|bcd)+(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 361 | mat!(match_repetition_143, r"(ab|a|c|bcd){0,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 362 | mat!(match_repetition_145, r"(ab|a|c|bcd){1,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 363 | mat!(match_repetition_147, r"(ab|a|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 364 | mat!(match_repetition_149, r"(ab|a|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 365 | mat!(match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None) 366 | mat!(match_repetition_152, r"(ab|a|c|bcd){0,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 367 | mat!(match_repetition_154, r"(ab|a|c|bcd){1,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 368 | mat!(match_repetition_156, r"(ab|a|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 369 | mat!(match_repetition_158, r"(ab|a|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 370 | mat!(match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None) 371 | mat!(match_repetition_161, r"(ab|a|c|bcd)*(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 372 | mat!(match_repetition_163, r"(ab|a|c|bcd)+(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 373 | 374 | -------------------------------------------------------------------------------- /src/test/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 
10 | 11 | #[cfg(not(stage1))] 12 | #[phase(syntax)] 13 | extern crate regex_macros; 14 | 15 | #[cfg(not(stage1))] 16 | #[path = "bench.rs"] 17 | mod native_bench; 18 | 19 | #[cfg(not(stage1))] 20 | #[path = "tests.rs"] 21 | mod native_tests; 22 | 23 | // Due to macro scoping rules, this definition only applies for the modules 24 | // defined below. Effectively, it allows us to use the same tests for both 25 | // native and dynamic regexes. 26 | macro_rules! regex( 27 | ($re:expr) => ( 28 | match ::regex::Regex::new($re) { 29 | Ok(re) => re, 30 | Err(err) => fail!("{}", err), 31 | } 32 | ); 33 | ) 34 | 35 | #[path = "bench.rs"] 36 | mod dynamic_bench; 37 | #[path = "tests.rs"] 38 | mod dynamic_tests; 39 | 40 | -------------------------------------------------------------------------------- /src/test/tests.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // ignore-tidy-linelength 12 | 13 | use regex::{Regex, NoExpand}; 14 | 15 | #[test] 16 | fn splitn() { 17 | let re = regex!(r"\d+"); 18 | let text = "cauchy123plato456tyler789binx"; 19 | let subs: Vec<&str> = re.splitn(text, 2).collect(); 20 | assert_eq!(subs, vec!("cauchy", "plato456tyler789binx")); 21 | } 22 | 23 | #[test] 24 | fn split() { 25 | let re = regex!(r"\d+"); 26 | let text = "cauchy123plato456tyler789binx"; 27 | let subs: Vec<&str> = re.split(text).collect(); 28 | assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx")); 29 | } 30 | 31 | macro_rules! replace( 32 | ($name:ident, $which:ident, $re:expr, 33 | $search:expr, $replace:expr, $result:expr) => ( 34 | #[test] 35 | fn $name() { 36 | let re = regex!($re); 37 | assert_eq!(re.$which($search, $replace), StrBuf::from_str($result)); 38 | } 39 | ); 40 | ) 41 | 42 | replace!(rep_first, replace, r"\d", "age: 26", "Z", "age: Z6") 43 | replace!(rep_plus, replace, r"\d+", "age: 26", "Z", "age: Z") 44 | replace!(rep_all, replace_all, r"\d", "age: 26", "Z", "age: ZZ") 45 | replace!(rep_groups, replace, r"(\S+)\s+(\S+)", "w1 w2", "$2 $1", "w2 w1") 46 | replace!(rep_double_dollar, replace, 47 | r"(\S+)\s+(\S+)", "w1 w2", "$2 $$1", "w2 $1") 48 | replace!(rep_no_expand, replace, 49 | r"(\S+)\s+(\S+)", "w1 w2", NoExpand("$2 $1"), "$2 $1") 50 | replace!(rep_named, replace_all, 51 | r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)", 52 | "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3") 53 | replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t", 54 | "", "trim me") 55 | 56 | macro_rules!
noparse( 57 | ($name:ident, $re:expr) => ( 58 | #[test] 59 | fn $name() { 60 | let re = $re; 61 | match Regex::new(re) { 62 | Err(_) => {}, 63 | Ok(_) => fail!("Regex '{}' should cause a parse error.", re), 64 | } 65 | } 66 | ); 67 | ) 68 | 69 | noparse!(fail_double_repeat, "a**") 70 | noparse!(fail_no_repeat_arg, "*") 71 | noparse!(fail_no_repeat_arg_begin, "^*") 72 | noparse!(fail_incomplete_escape, "\\") 73 | noparse!(fail_class_incomplete, "[A-") 74 | noparse!(fail_class_not_closed, "[A") 75 | noparse!(fail_class_no_begin, r"[\A]") 76 | noparse!(fail_class_no_end, r"[\z]") 77 | noparse!(fail_class_no_boundary, r"[\b]") 78 | noparse!(fail_open_paren, "(") 79 | noparse!(fail_close_paren, ")") 80 | noparse!(fail_invalid_range, "[a-Z]") 81 | noparse!(fail_empty_capture_name, "(?P<>a)") 82 | noparse!(fail_empty_capture_exp, "(?P<name>)") 83 | noparse!(fail_bad_capture_name, "(?P<na-me>)") 84 | noparse!(fail_bad_flag, "(?a)a") 85 | noparse!(fail_empty_alt_before, "|a") 86 | noparse!(fail_empty_alt_after, "a|") 87 | noparse!(fail_counted_big_exact, "a{1001}") 88 | noparse!(fail_counted_big_min, "a{1001,}") 89 | noparse!(fail_counted_no_close, "a{1001") 90 | noparse!(fail_unfinished_cap, "(?") 91 | noparse!(fail_unfinished_escape, "\\") 92 | noparse!(fail_octal_digit, r"\8") 93 | noparse!(fail_hex_digit, r"\xG0") 94 | noparse!(fail_hex_short, r"\xF") 95 | noparse!(fail_hex_long_digits, r"\x{fffg}") 96 | noparse!(fail_flag_bad, "(?a)") 97 | noparse!(fail_flag_empty, "(?)") 98 | noparse!(fail_double_neg, "(?-i-i)") 99 | noparse!(fail_neg_empty, "(?i-)") 100 | noparse!(fail_empty_group, "()") 101 | noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)") 102 | 103 | macro_rules! mat( 104 | ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( 105 | #[test] 106 | fn $name() { 107 | let text = $text; 108 | let expected: Vec<Option<(uint, uint)>> = vec!($($loc)+); 109 | let r = regex!($re); 110 | let got = match r.captures(text) { 111 | Some(c) => c.iter_pos().collect::<Vec<Option<(uint, uint)>>>(), 112 | None => vec!(None), 113 | }; 114 | // The test set sometimes leaves out capture groups, so truncate 115 | // actual capture groups to match the test set. 116 | let (sexpect, mut sgot) = (expected.as_slice(), got.as_slice()); 117 | if sgot.len() > sexpect.len() { 118 | sgot = sgot.slice(0, sexpect.len()) 119 | } 120 | if sexpect != sgot { 121 | fail!("For RE '{}' against '{}', expected '{}' but got '{}'", 122 | $re, text, sexpect, sgot); 123 | } 124 | } 125 | ); 126 | ) 127 | 128 | // Some crazy expressions from regular-expressions.info.
129 | mat!(match_ranges, 130 | r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", 131 | "num: 255", Some((5, 8))) 132 | mat!(match_ranges_not, 133 | r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", 134 | "num: 256", None) 135 | mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3))) 136 | mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3))) 137 | mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))) 138 | mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None) 139 | mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", 140 | "mine is jam.slam@gmail.com ", Some((8, 26))) 141 | mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", 142 | "mine is jam.slam@gmail ", None) 143 | mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", 144 | "mine is jam.slam@gmail.com ", Some((8, 26))) 145 | mat!(match_date1, 146 | r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", 147 | "1900-01-01", Some((0, 10))) 148 | mat!(match_date2, 149 | r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", 150 | "1900-00-01", None) 151 | mat!(match_date3, 152 | r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", 153 | "1900-13-01", None) 154 | 155 | // Exercise the flags. 156 | mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3))) 157 | mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3))) 158 | mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None) 159 | mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2))) 160 | mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4))) 161 | mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None) 162 | mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2))) 163 | mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11))) 164 | mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))) 165 | mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))) 166 | mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))) 167 | 168 | // Some Unicode tests. 169 | mat!(uni_literal, r"Ⅰ", "Ⅰ", Some((0, 3))) 170 | mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))) 171 | mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))) 172 | mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))) 173 | mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))) 174 | mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))) 175 | mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))) 176 | mat!(uni_case_not, r"Δ", "δ", None) 177 | mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))) 178 | mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))) 179 | mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))) 180 | mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))) 181 | 182 | // Test the Unicode friendliness of Perl character classes. 183 | mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))) 184 | mat!(uni_perl_w_not, r"\w+", "Ⅱ", None) 185 | mat!(uni_perl_w_neg, r"\W+", "Ⅱ", Some((0, 3))) 186 | mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))) 187 | mat!(uni_perl_d_not, r"\d+", "Ⅱ", None) 188 | mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))) 189 | mat!(uni_perl_s, r"\s+", " ", Some((0, 3))) 190 | mat!(uni_perl_s_not, r"\s+", "☃", None) 191 | mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))) 192 | 193 | // And do the same for word boundaries. 
194 | mat!(uni_boundary_none, r"\d\b", "6δ", None) 195 | mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))) 196 | 197 | // A whole mess of tests from Glenn Fowler's regex test suite. 198 | // Generated by the 'src/etc/regex-match-tests' program. 199 | mod matches; 200 | -------------------------------------------------------------------------------- /src/testdata/LICENSE: -------------------------------------------------------------------------------- 1 | The following license covers testregex.c and all associated test data. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, 6 | copy, modify, merge, publish, distribute, and/or sell copies of the 7 | Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following disclaimer: 9 | 10 | THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED 11 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 12 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 13 | IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 14 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 15 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 16 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 17 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 18 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 19 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 20 | -------------------------------------------------------------------------------- /src/testdata/README: -------------------------------------------------------------------------------- 1 | Test data was taken from the Go distribution, which was in turn taken from the 2 | testregex test suite: 3 | 4 | http://www2.research.att.com/~astopen/testregex/testregex.html 5 | 6 | The LICENSE in this directory corresponds to the LICENSE that the data was 7 | released under. 8 | 9 | The tests themselves were modified for RE2/Go. A couple were modified further 10 | by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. 11 | (Yes, it seems like RE2/Go includes failing test cases.) This may or may not 12 | have been a bad idea, but I think being consistent with an established Regex 13 | library is worth something. 14 | 15 | Note that these files are read by 'src/etc/regexp-match-tests' and turned into 16 | Rust tests found in 'src/libregexp/tests/matches.rs'. 
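For example, the basic.dat entry "BE abracadabra$ abracadabracadabra (7,18)" is what becomes the generated test "mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18)))", and spans recorded as (?,?) in the data files become None arguments in the corresponding mat! call.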
17 | 18 | -------------------------------------------------------------------------------- /src/testdata/basic.dat: -------------------------------------------------------------------------------- 1 | NOTE all standard compliant implementations should pass these : 2002-05-31 2 | 3 | BE abracadabra$ abracadabracadabra (7,18) 4 | BE a...b abababbb (2,7) 5 | BE XXXXXX ..XXXXXX (2,8) 6 | E \) () (1,2) 7 | BE a] a]a (0,2) 8 | B } } (0,1) 9 | E \} } (0,1) 10 | BE \] ] (0,1) 11 | B ] ] (0,1) 12 | E ] ] (0,1) 13 | B { { (0,1) 14 | B } } (0,1) 15 | BE ^a ax (0,1) 16 | BE \^a a^a (1,3) 17 | BE a\^ a^ (0,2) 18 | BE a$ aa (1,2) 19 | BE a\$ a$ (0,2) 20 | BE ^$ NULL (0,0) 21 | E $^ NULL (0,0) 22 | E a($) aa (1,2)(2,2) 23 | E a*(^a) aa (0,1)(0,1) 24 | E (..)*(...)* a (0,0) 25 | E (..)*(...)* abcd (0,4)(2,4) 26 | E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) 27 | E (ab)c|abc abc (0,3)(0,2) 28 | E a{0}b ab (1,2) 29 | E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 30 | E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 31 | E a{9876543210} NULL BADBR 32 | E ((a|a)|a) a (0,1)(0,1)(0,1) 33 | E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) 34 | E a*(a.|aa) aaaa (0,4)(2,4) 35 | E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) 36 | E (a|b)?.* b (0,1)(0,1) 37 | E (a|b)c|a(b|c) ac (0,2)(0,1) 38 | E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) 39 | E (a|b)*c|(a|ab)*c abc (0,3)(1,2) 40 | E (a|b)*c|(a|ab)*c xc (1,2) 41 | E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) 42 | E a?(ab|ba)ab abab (0,4)(0,2) 43 | E a?(ac{0}b|ba)ab abab (0,4)(0,2) 44 | E ab|abab abbabab (0,2) 45 | E aba|bab|bba baaabbbaba (5,8) 46 | E aba|bab baaabbbaba (6,9) 47 | E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) 48 | E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) 49 | E ab|a xabc (1,3) 50 | E ab|a xxabc (2,4) 51 | Ei (Ab|cD)* aBcD (0,4)(2,4) 52 | BE [^-] --a (2,3) 53 | BE [a-]* --a (0,3) 54 | BE [a-m-]* --amoma-- (0,4) 55 | E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) 56 | E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) 57 | {E [[:upper:]] A (0,1) [[]] not supported 58 | E [[:lower:]]+ `az{ (1,3) 59 | E [[:upper:]]+ @AZ[ (1,3) 60 | # No collation in Go 61 | #BE [[-]] [[-]] (2,4) 62 | #BE [[.NIL.]] NULL ECOLLATE 63 | #BE [[=aleph=]] NULL ECOLLATE 64 | } 65 | BE$ \n \n (0,1) 66 | BEn$ \n \n (0,1) 67 | BE$ [^a] \n (0,1) 68 | BE$ \na \na (0,2) 69 | E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) 70 | BE xxx xxx (0,3) 71 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) 72 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) 73 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) 74 | E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) 75 | E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) 76 | E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) 77 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) 78 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) 79 | E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) 80 | BE$ .* \x01\x7f (0,2) 81 | E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) 82 | L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH 83 | E a*a*a*a*a*b aaaaaaaaab (0,10) 84 | BE ^ NULL (0,0) 85 | BE $ NULL (0,0) 86 | BE ^$ NULL (0,0) 87 | BE ^a$ a (0,1) 88 | BE abc abc (0,3) 89 | BE abc xabcy (1,4) 90 | 
BE abc ababc (2,5) 91 | BE ab*c abc (0,3) 92 | BE ab*bc abc (0,3) 93 | BE ab*bc abbc (0,4) 94 | BE ab*bc abbbbc (0,6) 95 | E ab+bc abbc (0,4) 96 | E ab+bc abbbbc (0,6) 97 | E ab?bc abbc (0,4) 98 | E ab?bc abc (0,3) 99 | E ab?c abc (0,3) 100 | BE ^abc$ abc (0,3) 101 | BE ^abc abcc (0,3) 102 | BE abc$ aabc (1,4) 103 | BE ^ abc (0,0) 104 | BE $ abc (3,3) 105 | BE a.c abc (0,3) 106 | BE a.c axc (0,3) 107 | BE a.*c axyzc (0,5) 108 | BE a[bc]d abd (0,3) 109 | BE a[b-d]e ace (0,3) 110 | BE a[b-d] aac (1,3) 111 | BE a[-b] a- (0,2) 112 | BE a[b-] a- (0,2) 113 | BE a] a] (0,2) 114 | BE a[]]b a]b (0,3) 115 | BE a[^bc]d aed (0,3) 116 | BE a[^-b]c adc (0,3) 117 | BE a[^]b]c adc (0,3) 118 | E ab|cd abc (0,2) 119 | E ab|cd abcd (0,2) 120 | E a\(b a(b (0,3) 121 | E a\(*b ab (0,2) 122 | E a\(*b a((b (0,4) 123 | E ((a)) abc (0,1)(0,1)(0,1) 124 | E (a)b(c) abc (0,3)(0,1)(2,3) 125 | E a+b+c aabbabc (4,7) 126 | E a* aaa (0,3) 127 | #E (a*)* - (0,0)(0,0) 128 | E (a*)* - (0,0)(?,?) RE2/Go 129 | E (a*)+ - (0,0)(0,0) 130 | #E (a*|b)* - (0,0)(0,0) 131 | E (a*|b)* - (0,0)(?,?) RE2/Go 132 | E (a+|b)* ab (0,2)(1,2) 133 | E (a+|b)+ ab (0,2)(1,2) 134 | E (a+|b)? ab (0,1)(0,1) 135 | BE [^ab]* cde (0,3) 136 | #E (^)* - (0,0)(0,0) 137 | E (^)* - (0,0)(?,?) RE2/Go 138 | BE a* NULL (0,0) 139 | E ([abc])*d abbbcd (0,6)(4,5) 140 | E ([abc])*bcd abcd (0,4)(0,1) 141 | E a|b|c|d|e e (0,1) 142 | E (a|b|c|d|e)f ef (0,2)(0,1) 143 | #E ((a*|b))* - (0,0)(0,0)(0,0) 144 | E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go 145 | BE abcd*efg abcdefg (0,7) 146 | BE ab* xabyabbbz (1,3) 147 | BE ab* xayabbbz (1,2) 148 | E (ab|cd)e abcde (2,5)(2,4) 149 | BE [abhgefdc]ij hij (0,3) 150 | E (a|b)c*d abcd (1,4)(1,2) 151 | E (ab|ab*)bc abc (0,3)(0,1) 152 | E a([bc]*)c* abc (0,3)(1,3) 153 | E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) 154 | E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) 155 | E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) 156 | E a[bcd]*dcdcde adcdcde (0,7) 157 | E (ab|a)b*c abc (0,3)(0,2) 158 | E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) 159 | BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) 160 | E ^a(bc+|b[eh])g|.h$ abh (1,3) 161 | E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) 162 | E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) 163 | E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) 164 | E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) 165 | BE multiple words multiple words yeah (0,14) 166 | E (.*)c(.*) abcde (0,5)(0,2)(3,5) 167 | BE abcd abcd (0,4) 168 | E a(bc)d abcd (0,4)(1,3) 169 | E a[-]?c ac (0,3) 170 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) 171 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) 172 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) 173 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) 174 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) 175 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) 176 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) 177 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) 178 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) 179 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy 
(0,16)(?,?)(11,13) 180 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) 181 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) 182 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) 183 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) 184 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) 185 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) 186 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) 187 | E a+(b|c)*d+ aabcdd (0,6)(3,4) 188 | E ^.+$ vivi (0,4) 189 | E ^(.+)$ vivi (0,4)(0,4) 190 | E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) 191 | E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) 192 | E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) 193 | E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) 194 | E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) 195 | E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) 196 | E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) 197 | E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) 198 | E ((foo)|bar)!bas bar!bas (0,7)(0,3) 199 | E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) 200 | E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) 201 | E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) 202 | E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) 203 | E (foo|(bar))!bas foo!bas (0,7)(0,3) 204 | E (foo|bar)!bas bar!bas (0,7)(0,3) 205 | E (foo|bar)!bas foo!bar!bas (4,11)(4,7) 206 | E (foo|bar)!bas foo!bas (0,7)(0,3) 207 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 208 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) 209 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) 210 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) 211 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) 212 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) 213 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) 214 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 215 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) 216 | E .*(/XXX).* /XXX (0,4)(0,4) 217 | E .*(\\XXX).* \XXX (0,4)(0,4) 218 | E \\XXX \XXX (0,4) 219 | E .*(/000).* /000 (0,4)(0,4) 220 | E .*(\\000).* \000 (0,4)(0,4) 221 | E \\000 \000 (0,4) 222 | -------------------------------------------------------------------------------- /src/testdata/nullsubexpr.dat: -------------------------------------------------------------------------------- 1 | NOTE null subexpression matches : 2002-06-06 2 | 3 | E (a*)* a (0,1)(0,1) 4 | #E SAME x (0,0)(0,0) 5 | E SAME x (0,0)(?,?) RE2/Go 6 | E SAME aaaaaa (0,6)(0,6) 7 | E SAME aaaaaax (0,6)(0,6) 8 | E (a*)+ a (0,1)(0,1) 9 | E SAME x (0,0)(0,0) 10 | E SAME aaaaaa (0,6)(0,6) 11 | E SAME aaaaaax (0,6)(0,6) 12 | E (a+)* a (0,1)(0,1) 13 | E SAME x (0,0) 14 | E SAME aaaaaa (0,6)(0,6) 15 | E SAME aaaaaax (0,6)(0,6) 16 | E (a+)+ a (0,1)(0,1) 17 | E SAME x NOMATCH 18 | E SAME aaaaaa (0,6)(0,6) 19 | E SAME aaaaaax (0,6)(0,6) 20 | 21 | E ([a]*)* a (0,1)(0,1) 22 | #E SAME x (0,0)(0,0) 23 | E SAME x (0,0)(?,?) 
RE2/Go 24 | E SAME aaaaaa (0,6)(0,6) 25 | E SAME aaaaaax (0,6)(0,6) 26 | E ([a]*)+ a (0,1)(0,1) 27 | E SAME x (0,0)(0,0) 28 | E SAME aaaaaa (0,6)(0,6) 29 | E SAME aaaaaax (0,6)(0,6) 30 | E ([^b]*)* a (0,1)(0,1) 31 | #E SAME b (0,0)(0,0) 32 | E SAME b (0,0)(?,?) RE2/Go 33 | E SAME aaaaaa (0,6)(0,6) 34 | E SAME aaaaaab (0,6)(0,6) 35 | E ([ab]*)* a (0,1)(0,1) 36 | E SAME aaaaaa (0,6)(0,6) 37 | E SAME ababab (0,6)(0,6) 38 | E SAME bababa (0,6)(0,6) 39 | E SAME b (0,1)(0,1) 40 | E SAME bbbbbb (0,6)(0,6) 41 | E SAME aaaabcde (0,5)(0,5) 42 | E ([^a]*)* b (0,1)(0,1) 43 | E SAME bbbbbb (0,6)(0,6) 44 | #E SAME aaaaaa (0,0)(0,0) 45 | E SAME aaaaaa (0,0)(?,?) RE2/Go 46 | E ([^ab]*)* ccccxx (0,6)(0,6) 47 | #E SAME ababab (0,0)(0,0) 48 | E SAME ababab (0,0)(?,?) RE2/Go 49 | 50 | E ((z)+|a)* zabcde (0,2)(1,2) 51 | 52 | #{E a+? aaaaaa (0,1) no *? +? mimimal match ops 53 | #E (a) aaa (0,1)(0,1) 54 | #E (a*?) aaa (0,0)(0,0) 55 | #E (a)*? aaa (0,0) 56 | #E (a*?)*? aaa (0,0) 57 | #} 58 | 59 | B \(a*\)*\(x\) x (0,1)(0,0)(0,1) 60 | B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) 61 | B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) 62 | B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) 63 | B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) 64 | B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) 65 | B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) 66 | B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) 67 | 68 | #E (a*)*(x) x (0,1)(0,0)(0,1) 69 | E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go 70 | E (a*)*(x) ax (0,2)(0,1)(1,2) 71 | E (a*)*(x) axa (0,2)(0,1)(1,2) 72 | 73 | E (a*)+(x) x (0,1)(0,0)(0,1) 74 | E (a*)+(x) ax (0,2)(0,1)(1,2) 75 | E (a*)+(x) axa (0,2)(0,1)(1,2) 76 | 77 | E (a*){2}(x) x (0,1)(0,0)(0,1) 78 | E (a*){2}(x) ax (0,2)(1,1)(1,2) 79 | E (a*){2}(x) axa (0,2)(1,1)(1,2) 80 | -------------------------------------------------------------------------------- /src/testdata/repetition.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | E ((..)|(.)) NULL NOMATCH 11 | E ((..)|(.))((..)|(.)) NULL NOMATCH 12 | E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH 13 | 14 | E ((..)|(.)){1} NULL NOMATCH 15 | E ((..)|(.)){2} NULL NOMATCH 16 | E ((..)|(.)){3} NULL NOMATCH 17 | 18 | E ((..)|(.))* NULL (0,0) 19 | 20 | E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) 21 | E ((..)|(.))((..)|(.)) a NOMATCH 22 | E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH 23 | 24 | E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) 25 | E ((..)|(.)){2} a NOMATCH 26 | E ((..)|(.)){3} a NOMATCH 27 | 28 | E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) 29 | 30 | E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) 31 | E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) 32 | E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH 33 | 34 | E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 35 | E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) 36 | E ((..)|(.)){3} aa NOMATCH 37 | 38 | E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) 39 | 40 | E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) 41 | E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) 42 | E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) 43 | 44 | E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) 
45 | #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) 46 | E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 47 | E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) 48 | 49 | #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) 50 | E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 51 | 52 | E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) 53 | E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 54 | E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) 55 | 56 | E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) 57 | E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) 58 | #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) 59 | E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go 60 | 61 | E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 62 | 63 | E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) 64 | E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 65 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) 66 | 67 | E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) 68 | E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) 69 | #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) 70 | E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 71 | 72 | #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) 73 | E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 74 | 75 | E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) 76 | E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 77 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) 78 | 79 | E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) 80 | E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) 81 | E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) 82 | 83 | E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 84 | 85 | NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 86 | 87 | # These test a bug in OS X / FreeBSD / NetBSD, and libtree. 88 | # Linux/GLIBC gets the {8,} and {8,8} wrong. 89 | 90 | :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) 91 | :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) 92 | :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) 93 | :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) 94 | :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) 95 | :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) 96 | :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) 97 | :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) 98 | :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) 99 | #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) 100 | :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go 101 | #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) 102 | :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go 103 | #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) 104 | :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go 105 | #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) 106 | :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go 107 | #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) 108 | :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go 109 | #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) 110 | :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go 111 | #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) 112 | :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go 113 | #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) 114 | :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go 115 | :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) 116 | 117 | # These test a fixed bug in my regex-tdfa that did not keep the expanded 118 | # form properly grouped, so right association did the wrong thing with 119 | # these ambiguous patterns (crafted just to test my code when I became 120 | # suspicious of my implementation). 
The first subexpression should use 121 | # "ab" then "a" then "bcd". 122 | 123 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible 124 | # results like (0,6)(4,5)(6,6). 125 | 126 | :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) 127 | :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) 128 | :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 129 | :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 130 | :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH 131 | :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) 132 | :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) 133 | :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 134 | :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 135 | :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH 136 | :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) 137 | :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) 138 | 139 | # The above worked on Linux/GLIBC but the following often fail. 140 | # They also trip up OS X / FreeBSD / NetBSD: 141 | 142 | #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 143 | :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 144 | #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 145 | :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 146 | #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 147 | :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 148 | #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 149 | :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 150 | :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH 151 | #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 152 | :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 153 | #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 154 | :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 155 | #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 156 | :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 157 | #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 158 | :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 159 | :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH 160 | #:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 161 | :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 162 | #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 163 | :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 164 | -------------------------------------------------------------------------------- /src/vm.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // FIXME: Currently, the VM simulates an NFA. It would be nice to have another 12 | // VM that simulates a DFA. 13 | // 14 | // According to Russ Cox[1], a DFA performs better than an NFA, principally 15 | // because it reuses states previously computed by the machine *and* doesn't 16 | // keep track of capture groups. The drawback of a DFA (aside from its 17 | // complexity) is that it can't accurately return the locations of submatches. 18 | // The NFA *can* do that. (This is my understanding anyway.) 
19 | // 20 | // Cox suggests that a DFA ought to be used to answer "does this match" and 21 | // "where does it match" questions. (In the latter, the starting position of 22 | // the match is computed by executing the regex backwards.) Cox also suggests 23 | // that a DFA should be run when asking "where are the submatches", which can 24 | // 1) quickly answer "no" is there's no match and 2) discover the substring 25 | // that matches, which means running the NFA on smaller input. 26 | // 27 | // Currently, the NFA simulation implemented below does some dirty tricks to 28 | // avoid tracking capture groups when they aren't needed (which only works 29 | // for 'is_match', not 'find'). This is a half-measure, but does provide some 30 | // perf improvement. 31 | // 32 | // AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go. 33 | // 34 | // [1] - http://swtch.com/~rsc/regex/regex3.html 35 | 36 | use std::cmp; 37 | use std::mem; 38 | use std::slice::MutableVector; 39 | use compile::{ 40 | Program, 41 | Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary, 42 | Save, Jump, Split, 43 | }; 44 | use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED}; 45 | use parse::unicode::PERLW; 46 | 47 | pub type CaptureLocs = Vec>; 48 | 49 | /// Indicates the type of match to be performed by the VM. 50 | pub enum MatchKind { 51 | /// Only checks if a match exists or not. Does not return location. 52 | Exists, 53 | /// Returns the start and end indices of the entire match in the input 54 | /// given. 55 | Location, 56 | /// Returns the start and end indices of each submatch in the input given. 57 | Submatches, 58 | } 59 | 60 | /// Runs an NFA simulation on the compiled expression given on the search text 61 | /// `input`. The search begins at byte index `start` and ends at byte index 62 | /// `end`. (The range is specified here so that zero-width assertions will work 63 | /// correctly when searching for successive non-overlapping matches.) 64 | /// 65 | /// The `which` parameter indicates what kind of capture information the caller 66 | /// wants. There are three choices: match existence only, the location of the 67 | /// entire match or the locations of the entire match in addition to the 68 | /// locations of each submatch. 69 | pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str, 70 | start: uint, end: uint) -> CaptureLocs { 71 | Nfa { 72 | which: which, 73 | prog: prog, 74 | input: input, 75 | start: start, 76 | end: end, 77 | ic: 0, 78 | chars: CharReader::new(input), 79 | }.run() 80 | } 81 | 82 | struct Nfa<'r, 't> { 83 | which: MatchKind, 84 | prog: &'r Program, 85 | input: &'t str, 86 | start: uint, 87 | end: uint, 88 | ic: uint, 89 | chars: CharReader<'t>, 90 | } 91 | 92 | /// Indicates the next action to take after a single non-empty instruction 93 | /// is processed. 94 | pub enum StepState { 95 | /// This is returned if and only if a Match instruction is reached and 96 | /// we only care about the existence of a match. It instructs the VM to 97 | /// quit early. 98 | StepMatchEarlyReturn, 99 | /// Indicates that a match was found. Thus, the rest of the states in the 100 | /// *current* queue should be dropped (i.e., leftmost-first semantics). 101 | /// States in the "next" queue can still be processed. 102 | StepMatch, 103 | /// No match was found. Continue with the next state in the queue. 
104 | StepContinue, 105 | } 106 | 107 | impl<'r, 't> Nfa<'r, 't> { 108 | fn run(&mut self) -> CaptureLocs { 109 | let ncaps = match self.which { 110 | Exists => 0, 111 | Location => 1, 112 | Submatches => self.prog.num_captures(), 113 | }; 114 | let mut matched = false; 115 | let ninsts = self.prog.insts.len(); 116 | let mut clist = &mut Threads::new(self.which, ninsts, ncaps); 117 | let mut nlist = &mut Threads::new(self.which, ninsts, ncaps); 118 | 119 | let mut groups = Vec::from_elem(ncaps * 2, None); 120 | 121 | // Determine if the expression starts with a '^' so we can avoid 122 | // simulating .*? 123 | // Make sure multi-line mode isn't enabled for it, otherwise we can't 124 | // drop the initial .*? 125 | let prefix_anchor = 126 | match *self.prog.insts.get(1) { 127 | EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, 128 | _ => false, 129 | }; 130 | 131 | self.ic = self.start; 132 | let mut next_ic = self.chars.set(self.start); 133 | while self.ic <= self.end { 134 | if clist.size == 0 { 135 | // We have a match and we're done exploring alternatives. 136 | // Time to quit. 137 | if matched { 138 | break 139 | } 140 | 141 | // If there are no threads to try, then we'll have to start 142 | // over at the beginning of the regex. 143 | // BUT, if there's a literal prefix for the program, try to 144 | // jump ahead quickly. If it can't be found, then we can bail 145 | // out early. 146 | if self.prog.prefix.len() > 0 && clist.size == 0 { 147 | let needle = self.prog.prefix.as_slice().as_bytes(); 148 | let haystack = self.input.as_bytes().slice_from(self.ic); 149 | match find_prefix(needle, haystack) { 150 | None => break, 151 | Some(i) => { 152 | self.ic += i; 153 | next_ic = self.chars.set(self.ic); 154 | } 155 | } 156 | } 157 | } 158 | 159 | // This simulates a preceding '.*?' for every regex by adding 160 | // a state starting at the current position in the input for the 161 | // beginning of the program only if we don't already have a match. 162 | if clist.size == 0 || (!prefix_anchor && !matched) { 163 | self.add(clist, 0, groups.as_mut_slice()) 164 | } 165 | 166 | // Now we try to read the next character. 167 | // As a result, the 'step' method will look at the previous 168 | // character. 
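// (Both `self.ic` and `next_ic` are byte offsets into `input`; `CharReader` advances them across whole UTF-8 code points, so `prev` and `cur` always hold complete characters.)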
169 | self.ic = next_ic; 170 | next_ic = self.chars.advance(); 171 | 172 | let mut i = 0; 173 | while i < clist.size { 174 | let pc = clist.pc(i); 175 | let step_state = self.step(groups.as_mut_slice(), nlist, 176 | clist.groups(i), pc); 177 | match step_state { 178 | StepMatchEarlyReturn => return vec![Some(0), Some(0)], 179 | StepMatch => { matched = true; clist.empty() }, 180 | StepContinue => {}, 181 | } 182 | i += 1; 183 | } 184 | mem::swap(&mut clist, &mut nlist); 185 | nlist.empty(); 186 | } 187 | match self.which { 188 | Exists if matched => vec![Some(0), Some(0)], 189 | Exists => vec![None, None], 190 | Location | Submatches => groups, 191 | } 192 | } 193 | 194 | fn step(&self, groups: &mut [Option], nlist: &mut Threads, 195 | caps: &mut [Option], pc: uint) 196 | -> StepState { 197 | match *self.prog.insts.get(pc) { 198 | Match => { 199 | match self.which { 200 | Exists => { 201 | return StepMatchEarlyReturn 202 | } 203 | Location => { 204 | groups[0] = caps[0]; 205 | groups[1] = caps[1]; 206 | return StepMatch 207 | } 208 | Submatches => { 209 | for (slot, val) in groups.mut_iter().zip(caps.iter()) { 210 | *slot = *val; 211 | } 212 | return StepMatch 213 | } 214 | } 215 | } 216 | OneChar(c, flags) => { 217 | if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) { 218 | self.add(nlist, pc+1, caps); 219 | } 220 | } 221 | CharClass(ref ranges, flags) => { 222 | if self.chars.prev.is_some() { 223 | let c = self.chars.prev.unwrap(); 224 | let negate = flags & FLAG_NEGATED > 0; 225 | let casei = flags & FLAG_NOCASE > 0; 226 | let found = ranges.as_slice(); 227 | let found = found.bsearch(|&rc| class_cmp(casei, c, rc)); 228 | let found = found.is_some(); 229 | if (found && !negate) || (!found && negate) { 230 | self.add(nlist, pc+1, caps); 231 | } 232 | } 233 | } 234 | Any(flags) => { 235 | if flags & FLAG_DOTNL > 0 236 | || !self.char_eq(false, self.chars.prev, '\n') { 237 | self.add(nlist, pc+1, caps) 238 | } 239 | } 240 | EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_) 241 | | Save(_) | Jump(_) | Split(_, _) => {}, 242 | } 243 | StepContinue 244 | } 245 | 246 | fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option]) { 247 | if nlist.contains(pc) { 248 | return 249 | } 250 | // We have to add states to the threads list even if their empty. 251 | // TL;DR - It prevents cycles. 252 | // If we didn't care about cycles, we'd *only* add threads that 253 | // correspond to non-jumping instructions (OneChar, Any, Match, etc.). 254 | // But, it's possible for valid regexs (like '(a*)*') to result in 255 | // a cycle in the instruction list. e.g., We'll keep chasing the Split 256 | // instructions forever. 257 | // So we add these instructions to our thread queue, but in the main 258 | // VM loop, we look for them but simply ignore them. 259 | // Adding them to the queue prevents them from being revisited so we 260 | // can avoid cycles (and the inevitable stack overflow). 261 | // 262 | // We make a minor optimization by indicating that the state is "empty" 263 | // so that its capture groups are not filled in. 
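// (The "empty" flag is consumed by `Threads::add` below: for empty states it skips copying the capture slots into the thread, since those states are ignored by the main loop and never produce a match on their own.)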
264 | match *self.prog.insts.get(pc) { 265 | EmptyBegin(flags) => { 266 | let multi = flags & FLAG_MULTI > 0; 267 | nlist.add(pc, groups, true); 268 | if self.chars.is_begin() 269 | || (multi && self.char_is(self.chars.prev, '\n')) { 270 | self.add(nlist, pc + 1, groups) 271 | } 272 | } 273 | EmptyEnd(flags) => { 274 | let multi = flags & FLAG_MULTI > 0; 275 | nlist.add(pc, groups, true); 276 | if self.chars.is_end() 277 | || (multi && self.char_is(self.chars.cur, '\n')) { 278 | self.add(nlist, pc + 1, groups) 279 | } 280 | } 281 | EmptyWordBoundary(flags) => { 282 | nlist.add(pc, groups, true); 283 | if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) { 284 | self.add(nlist, pc + 1, groups) 285 | } 286 | } 287 | Save(slot) => { 288 | nlist.add(pc, groups, true); 289 | match self.which { 290 | Location if slot <= 1 => { 291 | let old = groups[slot]; 292 | groups[slot] = Some(self.ic); 293 | self.add(nlist, pc + 1, groups); 294 | groups[slot] = old; 295 | } 296 | Submatches => { 297 | let old = groups[slot]; 298 | groups[slot] = Some(self.ic); 299 | self.add(nlist, pc + 1, groups); 300 | groups[slot] = old; 301 | } 302 | Exists | Location => self.add(nlist, pc + 1, groups), 303 | } 304 | } 305 | Jump(to) => { 306 | nlist.add(pc, groups, true); 307 | self.add(nlist, to, groups) 308 | } 309 | Split(x, y) => { 310 | nlist.add(pc, groups, true); 311 | self.add(nlist, x, groups); 312 | self.add(nlist, y, groups); 313 | } 314 | Match | OneChar(_, _) | CharClass(_, _) | Any(_) => { 315 | nlist.add(pc, groups, false); 316 | } 317 | } 318 | } 319 | 320 | // FIXME: For case insensitive comparisons, it uses the uppercase 321 | // character and tests for equality. IIUC, this does not generalize to 322 | // all of Unicode. I believe we need to check the entire fold for each 323 | // character. This will be easy to add if and when it gets added to Rust's 324 | // standard library. 325 | #[inline] 326 | fn char_eq(&self, casei: bool, textc: Option, regc: char) -> bool { 327 | match textc { 328 | None => false, 329 | Some(textc) => { 330 | regc == textc 331 | || (casei && regc.to_uppercase() == textc.to_uppercase()) 332 | } 333 | } 334 | } 335 | 336 | #[inline] 337 | fn char_is(&self, textc: Option, regc: char) -> bool { 338 | textc == Some(regc) 339 | } 340 | } 341 | 342 | /// CharReader is responsible for maintaining a "previous" and a "current" 343 | /// character. This one-character lookahead is necessary for assertions that 344 | /// look one character before or after the current position. 345 | pub struct CharReader<'t> { 346 | /// The previous character read. It is None only when processing the first 347 | /// character of the input. 348 | pub prev: Option, 349 | /// The current character. 350 | pub cur: Option, 351 | input: &'t str, 352 | next: uint, 353 | } 354 | 355 | impl<'t> CharReader<'t> { 356 | /// Returns a new CharReader that advances through the input given. 357 | /// Note that a CharReader has no knowledge of the range in which to search 358 | /// the input. 359 | pub fn new(input: &'t str) -> CharReader<'t> { 360 | CharReader { 361 | prev: None, 362 | cur: None, 363 | input: input, 364 | next: 0, 365 | } 366 | } 367 | 368 | /// Sets the previous and current character given any arbitrary byte 369 | /// index (at a unicode codepoint boundary). 
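/// Returns the byte index at which the character after the new current one begins (`input.len() + 1` when the position is at or past the end of the input).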
370 | #[inline] 371 | pub fn set(&mut self, ic: uint) -> uint { 372 | self.prev = None; 373 | self.cur = None; 374 | self.next = 0; 375 | 376 | if self.input.len() == 0 { 377 | return 1 378 | } 379 | if ic > 0 { 380 | let i = cmp::min(ic, self.input.len()); 381 | let prev = self.input.char_range_at_reverse(i); 382 | self.prev = Some(prev.ch); 383 | } 384 | if ic < self.input.len() { 385 | let cur = self.input.char_range_at(ic); 386 | self.cur = Some(cur.ch); 387 | self.next = cur.next; 388 | self.next 389 | } else { 390 | self.input.len() + 1 391 | } 392 | } 393 | 394 | /// Does the same as `set`, except it always advances to the next 395 | /// character in the input (and therefore does half as many UTF8 decodings). 396 | #[inline] 397 | pub fn advance(&mut self) -> uint { 398 | self.prev = self.cur; 399 | if self.next < self.input.len() { 400 | let cur = self.input.char_range_at(self.next); 401 | self.cur = Some(cur.ch); 402 | self.next = cur.next; 403 | } else { 404 | self.cur = None; 405 | self.next = self.input.len() + 1; 406 | } 407 | self.next 408 | } 409 | 410 | /// Returns true if and only if this is the beginning of the input 411 | /// (ignoring the range of the input to search). 412 | #[inline] 413 | pub fn is_begin(&self) -> bool { self.prev.is_none() } 414 | 415 | /// Returns true if and only if this is the end of the input 416 | /// (ignoring the range of the input to search). 417 | #[inline] 418 | pub fn is_end(&self) -> bool { self.cur.is_none() } 419 | 420 | /// Returns true if and only if the current position is a word boundary. 421 | /// (Ignoring the range of the input to search.) 422 | pub fn is_word_boundary(&self) -> bool { 423 | if self.is_begin() { 424 | return is_word(self.cur) 425 | } 426 | if self.is_end() { 427 | return is_word(self.prev) 428 | } 429 | (is_word(self.cur) && !is_word(self.prev)) 430 | || (is_word(self.prev) && !is_word(self.cur)) 431 | } 432 | } 433 | 434 | struct Thread { 435 | pc: uint, 436 | groups: Vec>, 437 | } 438 | 439 | struct Threads { 440 | which: MatchKind, 441 | queue: Vec, 442 | sparse: Vec, 443 | size: uint, 444 | } 445 | 446 | impl Threads { 447 | // This is using a wicked neat trick to provide constant time lookup 448 | // for threads in the queue using a sparse set. A queue of threads is 449 | // allocated once with maximal size when the VM initializes and is reused 450 | // throughout execution. That is, there should be zero allocation during 451 | // the execution of a VM. 452 | // 453 | // See http://research.swtch.com/sparse for the deets. 
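// Concretely: `sparse[pc]` stores an index into `queue`, and `pc` counts as present only when that index is below `size` *and* `queue[sparse[pc]].pc == pc`. Clearing the set is just `size = 0`; stale entries left behind in `sparse` are harmless because the back-pointer check fails for them.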
454 | fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads { 455 | Threads { 456 | which: which, 457 | queue: Vec::from_fn(num_insts, |_| { 458 | Thread { pc: 0, groups: Vec::from_elem(ncaps * 2, None) } 459 | }), 460 | sparse: Vec::from_elem(num_insts, 0u), 461 | size: 0, 462 | } 463 | } 464 | 465 | fn add(&mut self, pc: uint, groups: &[Option], empty: bool) { 466 | let t = self.queue.get_mut(self.size); 467 | t.pc = pc; 468 | match (empty, self.which) { 469 | (_, Exists) | (true, _) => {}, 470 | (false, Location) => { 471 | *t.groups.get_mut(0) = groups[0]; 472 | *t.groups.get_mut(1) = groups[1]; 473 | } 474 | (false, Submatches) => { 475 | for (slot, val) in t.groups.mut_iter().zip(groups.iter()) { 476 | *slot = *val; 477 | } 478 | } 479 | } 480 | *self.sparse.get_mut(pc) = self.size; 481 | self.size += 1; 482 | } 483 | 484 | #[inline] 485 | fn contains(&self, pc: uint) -> bool { 486 | let s = *self.sparse.get(pc); 487 | s < self.size && self.queue.get(s).pc == pc 488 | } 489 | 490 | #[inline] 491 | fn empty(&mut self) { 492 | self.size = 0; 493 | } 494 | 495 | #[inline] 496 | fn pc(&self, i: uint) -> uint { 497 | self.queue.get(i).pc 498 | } 499 | 500 | #[inline] 501 | fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option] { 502 | self.queue.get_mut(i).groups.as_mut_slice() 503 | } 504 | } 505 | 506 | /// Returns true if the character is a word character, according to the 507 | /// (Unicode friendly) Perl character class '\w'. 508 | /// Note that this is only use for testing word boundaries. The actual '\w' 509 | /// is encoded as a CharClass instruction. 510 | pub fn is_word(c: Option) -> bool { 511 | let c = match c { 512 | None => return false, 513 | Some(c) => c, 514 | }; 515 | // Try the common ASCII case before invoking binary search. 516 | match c { 517 | '_' | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' => true, 518 | _ => PERLW.bsearch(|&(start, end)| { 519 | if c >= start && c <= end { 520 | Equal 521 | } else if start > c { 522 | Greater 523 | } else { 524 | Less 525 | } 526 | }).is_some() 527 | } 528 | } 529 | 530 | /// Given a character and a single character class range, return an ordering 531 | /// indicating whether the character is less than the start of the range, 532 | /// in the range (inclusive) or greater than the end of the range. 533 | /// 534 | /// If `casei` is `true`, then this ordering is computed case insensitively. 535 | /// 536 | /// This function is meant to be used with a binary search. 537 | #[inline] 538 | fn class_cmp(casei: bool, mut textc: char, 539 | (mut start, mut end): (char, char)) -> Ordering { 540 | if casei { 541 | // FIXME: This is pretty ridiculous. All of this case conversion 542 | // can be moved outside this function: 543 | // 1) textc should be uppercased outside the bsearch. 544 | // 2) the character class itself should be uppercased either in the 545 | // parser or the compiler. 546 | // FIXME: This is too simplistic for correct Unicode support. 547 | // See also: char_eq 548 | textc = textc.to_uppercase(); 549 | start = start.to_uppercase(); 550 | end = end.to_uppercase(); 551 | } 552 | if textc >= start && textc <= end { 553 | Equal 554 | } else if start > textc { 555 | Greater 556 | } else { 557 | Less 558 | } 559 | } 560 | 561 | /// Returns the starting location of `needle` in `haystack`. 562 | /// If `needle` is not in `haystack`, then `None` is returned. 563 | /// 564 | /// Note that this is using a naive substring algorithm. 
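/// For example, searching for the prefix `bc` in the haystack `xabc` yields `Some(2)`, while searching for `zz` in the same haystack yields `None`.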
565 | #[inline] 566 | pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> { 567 | let (hlen, nlen) = (haystack.len(), needle.len()); 568 | if nlen > hlen || nlen == 0 { 569 | return None 570 | } 571 | let mut hayi = 0u; 572 | 'HAYSTACK: loop { 573 | if hayi > hlen - nlen { 574 | break 575 | } 576 | let mut nedi = 0; 577 | while nedi < nlen { 578 | if haystack[hayi+nedi] != needle[nedi] { 579 | hayi += 1; 580 | continue 'HAYSTACK 581 | } 582 | nedi += 1; 583 | } 584 | return Some(hayi) 585 | } 586 | None 587 | } 588 | --------------------------------------------------------------------------------
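Note on the `Threads` queue in vm.rs: it relies on the sparse-set technique described at http://research.swtch.com/sparse. Below is a minimal standalone sketch of that technique, written in present-day Rust rather than the 2014 dialect used in vm.rs; the names (`SparseSet`, `dense`, `insert`) are illustrative only and do not appear in the crate.

struct SparseSet {
    dense: Vec<usize>,  // members, in insertion order; only dense[..size] is meaningful
    sparse: Vec<usize>, // sparse[v] = position of v in `dense`, if v is a member
    size: usize,
}

impl SparseSet {
    fn new(capacity: usize) -> SparseSet {
        SparseSet { dense: vec![0; capacity], sparse: vec![0; capacity], size: 0 }
    }

    fn contains(&self, v: usize) -> bool {
        let i = self.sparse[v];
        // Stale garbage in `sparse` is harmless: the back-pointer check below fails for it.
        i < self.size && self.dense[i] == v
    }

    fn insert(&mut self, v: usize) {
        if !self.contains(v) {
            self.dense[self.size] = v;
            self.sparse[v] = self.size;
            self.size += 1;
        }
    }

    fn clear(&mut self) {
        // O(1) clear: old members become unreachable because `size` shrinks to zero.
        self.size = 0;
    }
}

fn main() {
    let mut set = SparseSet::new(8);
    set.insert(3);
    set.insert(5);
    assert!(set.contains(3) && set.contains(5) && !set.contains(4));
    set.clear();
    assert!(!set.contains(3) && !set.contains(5));
}

The value of the design is that `insert`, `contains`, and `clear` are all constant time and allocation-free after construction, which is what lets the VM allocate its two thread queues once and reuse them for the entire search.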