├── .gitignore ├── .travis.yml ├── LICENSE-APACHE ├── LICENSE-MIT ├── Makefile ├── README.md ├── benchmark ├── README.md ├── golang ├── regex-dna │ ├── Makefile │ ├── README.md │ ├── regex-dna-single.rs │ ├── regex-dna.c │ ├── regex-dna.go │ ├── regex-dna.py │ ├── regex-dna.rs │ └── shootout-fasta.rs └── rust ├── cargo-lite.conf ├── ctags.rust ├── regex-match-tests.py ├── regex-unicode-tables.py ├── session.vim └── src ├── compile.rs ├── lib.rs ├── macro.rs ├── parse.rs ├── re.rs ├── test ├── bench.rs ├── matches.rs ├── mod.rs └── tests.rs ├── testdata ├── LICENSE ├── README ├── basic.dat ├── nullsubexpr.dat └── repetition.dat ├── unicode.rs └── vm.rs /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | doc 3 | tags 4 | build 5 | scratch.rs 6 | expanded.rs 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | before_install: 3 | - yes | sudo add-apt-repository ppa:hansjorg/rust 4 | - sudo apt-get update 5 | install: 6 | - sudo apt-get install rust-nightly 7 | script: 8 | - rustc -L . --crate-type lib ./src/lib.rs 9 | 10 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2006-2009 Graydon Hoare 2 | Copyright (c) 2009-2014 Mozilla Foundation 3 | 4 | Permission is hereby granted, free of charge, to any 5 | person obtaining a copy of this software and associated 6 | documentation files (the "Software"), to deal in the 7 | Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software 11 | is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice 15 | shall be included in all copies or substantial portions 16 | of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 19 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 20 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 21 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 22 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 23 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 25 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 | DEALINGS IN THE SOFTWARE. 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | RUSTC ?= rustc 2 | RUSTDOC ?= rustdoc 3 | BUILD_DIR ?= ./build 4 | RUST_PATH ?= $(BUILD_DIR) 5 | RUSTFLAGS ?= --opt-level=3 6 | RUSTTESTFLAGS ?= 7 | REGEXP_LIB ?= $(BUILD_DIR)/.libregex.timestamp 8 | REGEXP_LIB_FILES = src/compile.rs src/lib.rs src/parse.rs src/re.rs \ 9 | src/unicode.rs src/vm.rs 10 | REGEXP_MACRO_LIB ?= $(BUILD_DIR)/.libregex_macros.timestamp 11 | REGEXP_MACRO_LIB_FILES = src/macro.rs 12 | REGEXP_TEST_FILES = src/test/bench.rs src/test/matches.rs \ 13 | src/test/mod.rs src/test/tests.rs 14 | MOZILLA_RUST ?= $(HOME)/clones/rust 15 | REGEXP_DYN_FLAGS = 16 | 17 | ifdef REGEXP_DYNAMIC 18 | REGEXP_DYN_FLAGS = --cfg dynamic 19 | endif 20 | 21 | all: $(REGEXP_LIB) $(REGEXP_MACRO_LIB) 22 | 23 | install: 24 | cargo-lite install 25 | 26 | $(REGEXP_LIB): $(REGEXP_LIB_FILES) 27 | @mkdir -p $(BUILD_DIR) 28 | $(RUSTC) $(RUSTFLAGS) ./src/lib.rs --out-dir=$(BUILD_DIR) 29 | @touch $(REGEXP_LIB) 30 | 31 | $(REGEXP_MACRO_LIB): $(REGEXP_LIB) $(REGEXP_MACRO_LIB_FILES) 32 | @mkdir -p $(BUILD_DIR) 33 | $(RUSTC) -L $(BUILD_DIR) $(RUSTFLAGS) ./src/macro.rs --out-dir=$(BUILD_DIR) 34 | @touch $(REGEXP_MACRO_LIB) 35 | 36 | match-tests: 37 | ./regex-match-tests.py ./src/testdata/*.dat > ./src/test/matches.rs 38 | 39 | unicode-tables: 40 | ./regex-unicode-tables.py > ./src/unicode.rs 41 | 42 | docs: $(REGEXP_LIB_FILES) $(REGEXP_MACRO_LIB_FILES) 43 | rm -rf doc 44 | $(RUSTDOC) -L $(RUST_PATH) --test ./src/lib.rs 45 | $(RUSTDOC) -L $(RUST_PATH) ./src/lib.rs 46 | $(RUSTDOC) -L $(RUST_PATH) ./src/macro.rs 47 | # WTF is rustdoc doing? 
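# Fix up permissions on the generated doc/ tree before uploading it below.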
48 | chmod 755 doc 49 | in-dir doc fix-perms 50 | rscp ./doc/* gopher:~/www/burntsushi.net/rustdoc/ 51 | 52 | test: build/tests 53 | RUST_TEST_TASKS=1 RUST_LOG=regex ./build/tests 54 | 55 | build/tests: $(REGEXP_LIB) $(REGEXP_MACRO_LIB) $(REGEXP_TEST_FILES) 56 | $(RUSTC) $(RUSTTESTFLAGS) -L $(RUST_PATH) --test $(REGEXP_DYN_FLAGS) src/lib.rs -o ./build/tests 57 | 58 | bench: build/bench 59 | RUST_TEST_TASKS=1 RUST_LOG=regex ./build/bench --bench 60 | 61 | bench-perf: build/bench 62 | RUST_TEST_TASKS=1 RUST_LOG=regex perf record -g --call-graph dwarf -s ./build/bench --bench 63 | 64 | build/bench: $(REGEXP_LIB) $(REGEXP_MACRO_LIB) $(REGEXP_TEST_FILES) 65 | $(RUSTC) $(RUSTFLAGS) -g -Z lto -L $(RUST_PATH) --test --cfg bench $(REGEXP_DYN_FLAGS) src/lib.rs -o ./build/bench 66 | 67 | scratch: build/scratch 68 | RUST_TEST_TASKS=1 RUST_LOG=regex ./build/scratch 69 | 70 | build/scratch: $(REGEXP_MACRO_LIB) scratch.rs 71 | $(RUSTC) -L $(BUILD_DIR) $(RUSTTESTFLAGS) scratch.rs -o ./build/scratch 72 | 73 | ctags: 74 | ctags --recurse --options=ctags.rust --languages=Rust 75 | 76 | clean: 77 | rm -f $(BUILD_DIR)/.*.timestamp $(BUILD_DIR)/* 78 | 79 | push: 80 | git push origin master 81 | git push github master 82 | 83 | mozilla: 84 | mkdir -p $(MOZILLA_RUST)/src/libregex 85 | mkdir -p $(MOZILLA_RUST)/src/libregex_macros 86 | rm -rf $(MOZILLA_RUST)/src/libregex/* 87 | cp -a ./src/* $(MOZILLA_RUST)/src/libregex/ 88 | rm $(MOZILLA_RUST)/src/libregex/macro.rs 89 | cp ./src/macro.rs $(MOZILLA_RUST)/src/libregex_macros/lib.rs 90 | cp *.py $(MOZILLA_RUST)/src/etc/ 91 | cp ./benchmark/regex-dna/regex-dna.rs $(MOZILLA_RUST)/src/test/bench/shootout-regex-dna.rs 92 | 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Initial regexp code for [Rust's regex crate](https://github.com/rust-lang-nursery/regex). 2 | 3 | Do not use. 
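
For a sense of what the crate provides, here is a minimal usage sketch in the pre-1.0 Rust dialect this repository targets; it only exercises the `regex!` macro, `find_iter`, and `replace_all`/`NoExpand` calls that the programs under `benchmark/regex-dna/` already use, and the pattern strings are taken from those programs.

```
#![feature(macro_rules, phase)]

extern crate regex;
#[phase(syntax)] extern crate regex_macros;

use regex::NoExpand;

fn main() {
    // Compiled at build time by the regex! syntax extension.
    let re = regex!("agg[act]taaa|ttta[agt]cct");

    // Count matches, the same way count_matches() does in the benchmarks.
    let mut n = 0;
    for _ in re.find_iter("aggctaaa tttatcct") { n += 1; }
    println!("{} matches", n);

    // Literal (non-expanding) replacement, as the FASTA cleanup step does.
    let cleaned = regex!(">[^\n]*\n|\n").replace_all(">header\nacgt\n", NoExpand(""));
    println!("{}", cleaned);
}
```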
4 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | Rust 2 | ---- 3 | ``` 4 | rustc --opt-level=3 -Z lto -g --test --cfg bench src/lib.rs -o ./build/bench 5 | ./build/bench --bench 6 | 7 | literal 125 ns/iter (+/- 0) 8 | not_literal 944 ns/iter (+/- 29) 9 | match_class 1259 ns/iter (+/- 25) 10 | match_class_in_range 1342 ns/iter (+/- 6) 11 | replace_all 1130 ns/iter (+/- 18) 12 | anchored_literal_short_non_match 432 ns/iter (+/- 4) 13 | anchored_literal_long_non_match 5825 ns/iter (+/- 157) 14 | anchored_literal_short_match 147 ns/iter (+/- 3) 15 | anchored_literal_long_match 137 ns/iter (+/- 2) 16 | one_pass_short_a 1002 ns/iter (+/- 14) 17 | one_pass_short_a_not 1500 ns/iter (+/- 32) 18 | one_pass_short_b 734 ns/iter (+/- 10) 19 | one_pass_short_b_not 974 ns/iter (+/- 10) 20 | one_pass_long_prefix 508 ns/iter (+/- 4) 21 | one_pass_long_prefix_not 510 ns/iter (+/- 6) 22 | easy0_32 263 ns/iter (+/- 15) = 121 MB/s 23 | easy0_1K 1477 ns/iter (+/- 139) = 693 MB/s 24 | easy0_32K 40140 ns/iter (+/- 917) = 816 MB/s 25 | easy1_32 328 ns/iter (+/- 71) = 97 MB/s 26 | easy1_1K 1774 ns/iter (+/- 524) = 577 MB/s 27 | easy1_32K 48362 ns/iter (+/- 3161) = 677 MB/s 28 | medium_32 774 ns/iter (+/- 34) = 41 MB/s 29 | medium_1K 15623 ns/iter (+/- 293) = 65 MB/s 30 | medium_32K 490884 ns/iter (+/- 2344) = 66 MB/s 31 | hard_32 1306 ns/iter (+/- 30) = 24 MB/s 32 | hard_1K 33060 ns/iter (+/- 245) = 30 MB/s 33 | hard_32K 1048745 ns/iter (+/- 5576) = 31 MB/s 34 | no_exponential 286117 ns/iter (+/- 2050) 35 | ``` 36 | 37 | Golang 38 | ------ 39 | Benchmarks are taken from the `regexp` package included in the Go distribution. 40 | 41 | ``` 42 | cd go/src/pkg/regexp 43 | go test -run ' ' -bench . 44 | 45 | Literal 10000000 229 ns/op 46 | NotLiteral 500000 3354 ns/op 47 | MatchClass 500000 5092 ns/op 48 | MatchClass_InRange 500000 4200 ns/op 49 | ReplaceAll 500000 3548 ns/op 50 | AnchoredLiteralShortNonMatch 20000000 145 ns/op 51 | AnchoredLiteralLongNonMatch 20000000 142 ns/op 52 | AnchoredShortMatch 5000000 381 ns/op 53 | AnchoredLongMatch 5000000 383 ns/op 54 | OnePassShortA 1000000 1045 ns/op 55 | NotOnePassShortA 1000000 2478 ns/op 56 | OnePassShortB 2000000 766 ns/op 57 | NotOnePassShortB 1000000 2216 ns/op 58 | OnePassLongPrefix 10000000 156 ns/op 59 | OnePassLongNotPrefix 5000000 614 ns/op 60 | MatchEasy0_32 20000000 114 ns/op 279.35 MB/s 61 | MatchEasy0_1K 5000000 653 ns/op 1566.63 MB/s 62 | MatchEasy0_32K 200000 12624 ns/op 2595.57 MB/s 63 | MatchEasy0_1M 5000 458608 ns/op 2286.43 MB/s 64 | MatchEasy1_32 20000000 96.7 ns/op 330.99 MB/s 65 | MatchEasy1_1K 1000000 2647 ns/op 386.74 MB/s 66 | MatchEasy1_32K 50000 57848 ns/op 566.45 MB/s 67 | MatchEasy1_1M 1000 1991274 ns/op 526.59 MB/s 68 | MatchMedium_32 1000000 1746 ns/op 18.33 MB/s 69 | MatchMedium_1K 50000 58501 ns/op 17.50 MB/s 70 | MatchMedium_32K 1000 1914850 ns/op 17.11 MB/s 71 | MatchMedium_1M 50 61487227 ns/op 17.05 MB/s 72 | MatchHard_32 500000 2918 ns/op 10.97 MB/s 73 | MatchHard_1K 20000 92338 ns/op 11.09 MB/s 74 | MatchHard_32K 1000 2979930 ns/op 11.00 MB/s 75 | MatchHard_1M 20 95889705 ns/op 10.94 MB/s 76 | ``` 77 | 78 | 79 | NOW OUTDATED: Very rough benchmark analysis 80 | ------------------------------------------- 81 | All benchmarks were taken from RE2/Go and hopefully implemented correctly. 82 | Both RE2/Rust and RE2/Go are benchmarked with an implicit `.*?` prefixing all 83 | regular expressions. 
(i.e., They are unachored unless there is an explicit 84 | '^'.) 85 | 86 | RE2/Rust gets absolutely clobbered by RE2/Go in the Easy{0,1} benchmarks. 87 | Interestingly, Rust does the same or better on the Medium/Hard benchmarks. My 88 | suspicion is that RE2/Go is performing some optimizations on the easy 89 | benchmarks to make the throughput very high. This gives me hope. 90 | 91 | For example, the EASY{0,1} benchmarks are subject to optimization. RE2/Rust 92 | does do some optimization with literal prefix strings (explaining the higher 93 | throughput when compared to the MEDIUM/HARD benchmarks). 94 | 95 | It's promising that RE2/Rust is beating RE2/Go on the MEDIUM/HARD benchmarks, 96 | which I think suggests that the core VM implementation is probably decent. 97 | 98 | Also note that RE2/Rust is performing much worse on the small Medium/Hard 99 | benchmarks (searching 32 bytes of text). My suspicion is that there are some 100 | big constant factors lurking somewhere that need to be fixed in RE2/Rust. 101 | This may also explain some of the performance difference in other benchmarks 102 | (NOT easy/medium/hard) since they mostly work with shortish search strings. 103 | (Although this is not true for all, since some specifically target the presence 104 | of optimizations in RE2/Go.) 105 | 106 | -------------------------------------------------------------------------------- /benchmark/golang: -------------------------------------------------------------------------------- 1 | Golang 2 | ------ 3 | cd go/src/pkg/regexp 4 | go test -run ' ' -bench . 5 | 6 | Literal 10000000 229 ns/op 7 | NotLiteral 500000 3354 ns/op 8 | MatchClass 500000 5092 ns/op 9 | MatchClass_InRange 500000 4200 ns/op 10 | ReplaceAll 500000 3548 ns/op 11 | AnchoredLiteralShortNonMatch 20000000 145 ns/op 12 | AnchoredLiteralLongNonMatch 20000000 142 ns/op 13 | AnchoredShortMatch 5000000 381 ns/op 14 | AnchoredLongMatch 5000000 383 ns/op 15 | OnePassShortA 1000000 1045 ns/op 16 | NotOnePassShortA 1000000 2478 ns/op 17 | OnePassShortB 2000000 766 ns/op 18 | NotOnePassShortB 1000000 2216 ns/op 19 | OnePassLongPrefix 10000000 156 ns/op 20 | OnePassLongNotPrefix 5000000 614 ns/op 21 | MatchEasy0_32 20000000 114 ns/op 279.35 MB/s 22 | MatchEasy0_1K 5000000 653 ns/op 1566.63 MB/s 23 | MatchEasy0_32K 200000 12624 ns/op 2595.57 MB/s 24 | MatchEasy0_1M 5000 458608 ns/op 2286.43 MB/s 25 | MatchEasy1_32 20000000 96.7 ns/op 330.99 MB/s 26 | MatchEasy1_1K 1000000 2647 ns/op 386.74 MB/s 27 | MatchEasy1_32K 50000 57848 ns/op 566.45 MB/s 28 | MatchEasy1_1M 1000 1991274 ns/op 526.59 MB/s 29 | MatchMedium_32 1000000 1746 ns/op 18.33 MB/s 30 | MatchMedium_1K 50000 58501 ns/op 17.50 MB/s 31 | MatchMedium_32K 1000 1914850 ns/op 17.11 MB/s 32 | MatchMedium_1M 50 61487227 ns/op 17.05 MB/s 33 | MatchHard_32 500000 2918 ns/op 10.97 MB/s 34 | MatchHard_1K 20000 92338 ns/op 11.09 MB/s 35 | MatchHard_32K 1000 2979930 ns/op 11.00 MB/s 36 | MatchHard_1M 20 95889705 ns/op 10.94 MB/s 37 | 38 | -------------------------------------------------------------------------------- /benchmark/regex-dna/Makefile: -------------------------------------------------------------------------------- 1 | RUSTC ?= rustc 2 | RUSTFILE ?= regex-dna.rs 3 | 4 | bench-rust: run-rust big.fasta 5 | time ./run-rust < big.fasta 6 | 7 | bench-rust-perf: run-rust big.fasta 8 | time perf record --call-graph dwarf ./run-rust < big.fasta 9 | 10 | bench-golang: run-golang big.fasta 11 | time ./run-golang < big.fasta 12 | 13 | bench-python: regex-dna.py big.fasta 14 | time python3 
./regex-dna.py < big.fasta 15 | 16 | bench-c: run-c big.fasta 17 | time ./run-c < big.fasta 18 | 19 | big.fasta: generator 20 | ./generator 5000000 > big.fasta 21 | 22 | generator: shootout-fasta.rs 23 | $(RUSTC) --opt-level=3 shootout-fasta.rs -o generator 24 | 25 | run-rust: $(RUSTFILE) 26 | (cd ../.. && make RUSTC=$(RUSTC)) 27 | $(RUSTC) --opt-level=3 -Z lto -g -L ../../build $(RUSTFILE) -o run-rust 28 | 29 | run-golang: regex-dna.go 30 | go build -o run-golang regex-dna.go 31 | 32 | run-c: regex-dna.c 33 | gcc -pipe -Wall -O3 -fomit-frame-pointer -march=native -pthread `pkg-config --cflags --libs glib-2.0` regex-dna.c -o run-c -ltcl -lglib-2.0 34 | 35 | check: check.fasta check.output run-rust run-golang run-c 36 | bash -c 'diff check.output <(./run-golang < check.fasta)' 37 | bash -c 'diff check.output <(./run-rust < check.fasta)' 38 | bash -c 'diff check.output <(python3 ./regex-dna.py < check.fasta)' 39 | bash -c 'diff check.output <(./run-c < check.fasta)' 40 | 41 | check.fasta: 42 | curl 'http://benchmarksgame.alioth.debian.org/download/regexdna-input.txt' > check.fasta 43 | 44 | check.output: 45 | curl 'http://benchmarksgame.alioth.debian.org/download/regexdna-output.txt' > check.output 46 | 47 | clean: 48 | rm -rf big.fasta check.fasta check.output run-golang run-rust run-c generator 49 | rm -f perf.data* 50 | 51 | -------------------------------------------------------------------------------- /benchmark/regex-dna/README.md: -------------------------------------------------------------------------------- 1 | This compares RE2/Rust with RE2/Go on the 2 | [regex-dna](http://benchmarksgame.alioth.debian.org/u32/performance.php?test=regexdna) 3 | benchmark. The Python and C benchmarks are also provided for additional 4 | context. 5 | 6 | To run, first make sure all benchmarks are correct: 7 | 8 | ``` 9 | [andrew@Liger regex-dna] make check 10 | bash -c 'diff check.output <(./run-golang < check.fasta)' 11 | bash -c 'diff check.output <(./run-rust < check.fasta)' 12 | bash -c 'diff check.output <(python3 ./regex-dna.py < check.fasta)' 13 | bash -c 'diff check.output <(./run-c < check.fasta)' 14 | ``` 15 | 16 | If there's something wrong, an error will be reported along with a non-empty 17 | diff. 18 | 19 | Then run the Rust benchmark: 20 | 21 | ``` 22 | [andrew@Liger regex-dna] make bench-rust 23 | ... 24 | real 0m5.235s 25 | user 0m28.940s 26 | sys 0m0.623s 27 | ``` 28 | 29 | And the Go benchmark: 30 | 31 | ``` 32 | [andrew@Liger regex-dna] make bench-golang 33 | time ./run-golang < big.fasta 34 | ... 35 | real 0m18.654s 36 | user 1m44.733s 37 | sys 0m0.420s 38 | ``` 39 | 40 | And the Python benchmark: 41 | 42 | ``` 43 | [andrew@Liger regex-dna] make bench-python 44 | time python3 ./regex-dna.py < big.fasta 45 | ... 46 | real 0m4.174s 47 | user 0m13.757s 48 | sys 0m0.407s 49 | ``` 50 | 51 | And the C (Tcl) benchmark: 52 | 53 | ``` 54 | [andrew@Liger regex-dna] make bench-c 55 | time ./run-c < big.fasta 56 | real 0m0.970s 57 | user 0m3.793s 58 | sys 0m0.380s 59 | ``` 60 | 61 | Note that all benchmarks are multithreaded and were run on an Intel i7 3930K 62 | (12 threads). 63 | 64 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna-single.rs: -------------------------------------------------------------------------------- 1 | // Originally written by JustAPerson (https://github.com/JustAPerson). 2 | // Modified by Andrew Gallant (https://github.com/BurntSushi). 
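//
// Single-threaded variant of the regex-dna benchmark: it reads a FASTA
// sequence from stdin, strips the section headers and newlines, counts the
// matches for each of the nine variant patterns, replaces each IUB ambiguity
// code with its expansion, and prints the original, cleaned, and substituted
// sequence lengths.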
3 | 4 | #![feature(macro_rules, phase)] 5 | 6 | extern crate regex; 7 | #[phase(syntax)]extern crate regex_macros; 8 | 9 | use regex::{NoExpand, Regex}; 10 | 11 | fn replace(re: &Regex, text: &str, rep: &str) -> ~str { 12 | re.replace_all(text, NoExpand(rep)) 13 | } 14 | 15 | fn count_matches(seq: &str, variant: &Regex) -> int { 16 | let mut n = 0; 17 | for _ in variant.find_iter(seq) { 18 | n += 1; 19 | } 20 | n 21 | } 22 | 23 | fn main() { 24 | let mut stdin = std::io::stdio::stdin(); 25 | let mut seq = stdin.read_to_str().unwrap(); 26 | let ilen = seq.len(); 27 | 28 | seq = regex!(">[^\n]*\n|\n").replace_all(seq, NoExpand("")); 29 | let clen = seq.len(); 30 | 31 | let variants = ~[ 32 | regex!("agggtaaa|tttaccct"), 33 | regex!("[cgt]gggtaaa|tttaccc[acg]"), 34 | regex!("a[act]ggtaaa|tttacc[agt]t"), 35 | regex!("ag[act]gtaaa|tttac[agt]ct"), 36 | regex!("agg[act]taaa|ttta[agt]cct"), 37 | regex!("aggg[acg]aaa|ttt[cgt]ccct"), 38 | regex!("agggt[cgt]aa|tt[acg]accct"), 39 | regex!("agggta[cgt]a|t[acg]taccct"), 40 | regex!("agggtaa[cgt]|[acg]ttaccct"), 41 | ]; 42 | let (mut variant_strs, mut counts) = (vec!(), vec!()); 43 | for variant in variants.move_iter() { 44 | variant_strs.push(variant.to_str().to_owned()); 45 | counts.push(count_matches(seq, &variant)); 46 | } 47 | for (i, variant) in variant_strs.iter().enumerate() { 48 | println!("{} {}", variant, *counts.get(i)); 49 | } 50 | 51 | let substs = ~[ 52 | (regex!("B"), "(c|g|t)"), 53 | (regex!("D"), "(a|g|t)"), 54 | (regex!("H"), "(a|c|t)"), 55 | (regex!("K"), "(g|t)"), 56 | (regex!("M"), "(a|c)"), 57 | (regex!("N"), "(a|c|g|t)"), 58 | (regex!("R"), "(a|g)"), 59 | (regex!("S"), "(c|g)"), 60 | (regex!("V"), "(a|c|g)"), 61 | (regex!("W"), "(a|t)"), 62 | (regex!("Y"), "(c|t)"), 63 | ]; 64 | for (re, replacement) in substs.move_iter() { 65 | seq = replace(&re, seq, replacement) 66 | } 67 | println!(""); 68 | println!("{}", ilen); 69 | println!("{}", clen); 70 | println!("{}", seq.len()); 71 | } 72 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna.c: -------------------------------------------------------------------------------- 1 | /* The Computer Language Benchmarks Game 2 | * http://benchmarksgame.alioth.debian.org/ 3 | contributed by Paul Serice 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | /************************************************************************* 16 | * Data Structures and Typedefs 17 | *************************************************************************/ 18 | 19 | /* Mapping of a nucleic acid code to its meaning. This is used with 20 | * regsub() to substitute each occurrence of "code" in the main input 21 | * string with its "meaning." */ 22 | static struct nucleic_acid_code { 23 | char* code; 24 | char* meaning; 25 | } nacodes[] = {{"B", "(c|g|t)"}, 26 | {"D", "(a|g|t)"}, 27 | {"H", "(a|c|t)"}, 28 | {"K", "(g|t)"}, 29 | {"M", "(a|c)"}, 30 | {"N", "(a|c|g|t)"}, 31 | {"R", "(a|g)"}, 32 | {"S", "(c|g)"}, 33 | {"V", "(a|c|g)"}, 34 | {"W", "(a|t)"}, 35 | {"Y", "(c|t)"}, 36 | {NULL, NULL} 37 | }; 38 | 39 | /* The variants are used with regcount() to count the number of times 40 | * each variant appears in the main input string. 
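 * Each variant is an alternation of a pattern and its reverse complement.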
*/ 41 | static const char* variants[] = { 42 | "agggtaaa|tttaccct", 43 | "[cgt]gggtaaa|tttaccc[acg]", 44 | "a[act]ggtaaa|tttacc[agt]t", 45 | "ag[act]gtaaa|tttac[agt]ct", 46 | "agg[act]taaa|ttta[agt]cct", 47 | "aggg[acg]aaa|ttt[cgt]ccct", 48 | "agggt[cgt]aa|tt[acg]accct", 49 | "agggta[cgt]a|t[acg]taccct", 50 | "agggtaa[cgt]|[acg]ttaccct", 51 | NULL 52 | }; 53 | 54 | 55 | /* To process the variants, a small thread pool is created. Each 56 | * thread is passed an array of these tasks. The threads combine to 57 | * perform the tasks. When there are no more tasks, the threads exit 58 | * and the parent joins with them before continuing. */ 59 | typedef struct variant_worker_task { 60 | 61 | /* input: which variant to process */ 62 | const char* variant; 63 | 64 | /* input: string against which "variant" will be matched */ 65 | Tcl_Obj* s; 66 | 67 | /* output: number of times "variant" matched against "s" */ 68 | unsigned long int count; 69 | 70 | } *variant_worker_task_t; 71 | 72 | 73 | /* Data passed into each thread that process the variants. All the 74 | * threads in the pool share one copy of this data structure and must 75 | * use "lock" to synchronize access to it. */ 76 | typedef struct variant_worker_data { 77 | 78 | /* shared: lock that protects this structure */ 79 | pthread_mutex_t lock; 80 | 81 | /* shared: array of tasks that the threads are trying to complete */ 82 | variant_worker_task_t tasks; 83 | 84 | /* shared: pointer to shared index into "tasks" */ 85 | volatile int next_task; 86 | 87 | /* shared: total number of tasks in the "tasks" array */ 88 | int total_tasks; 89 | 90 | } *variant_worker_data_t; 91 | 92 | 93 | /* Data passed into each thread that substitutes nucleic acid codes. */ 94 | typedef struct nacodes_worker_data { 95 | 96 | /* input/output: String object that is input to the thread as a 97 | * copy of the range of characters from the main input string over 98 | * which the thread should work. The thread should call 99 | * Tcl_SetStringObj() to set "range" to hold the result of the 100 | * substitutions. */ 101 | Tcl_Obj* range; 102 | 103 | } *nacodes_worker_data_t; 104 | 105 | 106 | /* Create an explicit typedef for the pthread start functions. */ 107 | typedef void* (*thread_start_t)(void*); 108 | 109 | /************************************************************************* 110 | * regcount() 111 | *************************************************************************/ 112 | 113 | /* Return the number of times the regular expression "regexp_cstr" 114 | * uniquely matches against the input string "s". */ 115 | static unsigned long 116 | regcount(const char* regexp_cstr, 117 | Tcl_Obj* s) 118 | { 119 | int regexec_rv = 0; 120 | int index = 0; 121 | int index_max = 0; 122 | unsigned long rv = 0; 123 | Tcl_Obj* regexp_cstr_obj = NULL; 124 | Tcl_RegExp regexp = NULL; 125 | struct Tcl_RegExpInfo info = {0}; 126 | 127 | /* Get "regexp_cstr" as a Tcl string object. */ 128 | regexp_cstr_obj = Tcl_NewStringObj(regexp_cstr, strlen(regexp_cstr)); 129 | Tcl_IncrRefCount(regexp_cstr_obj); 130 | 131 | /* Compile the regular expression. */ 132 | regexp = Tcl_GetRegExpFromObj(NULL, regexp_cstr_obj, 133 | TCL_REG_ADVANCED | TCL_REG_NOCASE | TCL_REG_NEWLINE); 134 | if (!regexp) { 135 | fprintf(stderr, "*** Error: Tcl_GetRegExpFromObj: failed"); 136 | exit(1); 137 | } 138 | 139 | /* Iterate over each match. */ 140 | index = 0; 141 | index_max = Tcl_GetCharLength(s); 142 | while (index < index_max) { 143 | 144 | /* Test for a match. 
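         * Tcl_RegExpExecObj returns 1 for a match, 0 for no match, and -1 on
         * error; the checks below follow that convention.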
*/ 145 | regexec_rv = Tcl_RegExpExecObj(NULL, regexp, s, index, 1, 0); 146 | if (regexec_rv == -1) { 147 | fprintf(stderr, "*** Error: Tcl_RegExpExecObj: failed"); 148 | exit(1); 149 | } 150 | if (regexec_rv == 0) { 151 | /* No matches. */ 152 | break; 153 | } 154 | 155 | /* Get the match information. */ 156 | Tcl_RegExpGetInfo(regexp, &info); 157 | 158 | /* Advance curr. */ 159 | index += info.matches[0].end; 160 | 161 | /* Increment the match count. */ 162 | ++rv; 163 | } 164 | 165 | /* Clean up. Note that "regexp" is owned by "regexp_cstr_obj" so 166 | * it does not need explicit clean up. */ 167 | Tcl_DecrRefCount(regexp_cstr_obj); 168 | 169 | return rv; 170 | } 171 | 172 | /************************************************************************* 173 | * regsub() 174 | *************************************************************************/ 175 | 176 | /* Substitute each occurrence of the regular expression "regex" in "s" 177 | * with "subst". The result is returned in a newly allocate string 178 | * that must be freed with g_free(). */ 179 | static char* 180 | regsub(const char* regex, 181 | const char* s, 182 | const char* subst, 183 | GError** err) 184 | { 185 | char* rv = NULL; 186 | GRegex* prog = NULL; 187 | 188 | /* How glib propagates exceptions. */ 189 | if (err && *err) { 190 | goto out; 191 | } 192 | 193 | /* Compile regex. */ 194 | prog = g_regex_new(regex, 195 | G_REGEX_CASELESS | 196 | G_REGEX_RAW | 197 | G_REGEX_NO_AUTO_CAPTURE | 198 | G_REGEX_OPTIMIZE, 199 | 0, 200 | err); 201 | if (err && *err) { 202 | goto out; 203 | } 204 | 205 | /* Substitute. */ 206 | rv = g_regex_replace_literal(prog, s, -1, 0, subst, 0, err); 207 | if (err && *err) { 208 | goto out; 209 | } 210 | 211 | out: 212 | 213 | /* Clean up. */ 214 | if (prog) { 215 | g_regex_unref(prog); 216 | } 217 | 218 | return rv; 219 | } 220 | 221 | /************************************************************************* 222 | * load_file() 223 | *************************************************************************/ 224 | 225 | /* Load the file f into the string s. */ 226 | static void 227 | load_file(FILE* f, 228 | Tcl_Obj* s) 229 | { 230 | char* block = NULL; 231 | size_t block_size = 16384; 232 | size_t rcount = 0; 233 | 234 | /* Allocate a block for I/O. */ 235 | block = malloc(block_size); 236 | if (!block) { 237 | perror("malloc"); 238 | exit(1); 239 | } 240 | 241 | /* Iterate over each block of input. */ 242 | for (;;) { 243 | 244 | /* Read a block. */ 245 | rcount = fread(block, 1, block_size, f); 246 | if (rcount == 0) { 247 | /* Check for errors. */ 248 | if (ferror(f)) { 249 | perror("fread"); 250 | exit(1); 251 | } 252 | /* EOF */ 253 | break; 254 | } 255 | 256 | /* Append a block. */ 257 | Tcl_AppendToObj(s, block, rcount); 258 | } 259 | 260 | /* Free block. */ 261 | free(block); 262 | } 263 | 264 | /************************************************************************* 265 | * process_variant_worker() and process_variants() 266 | *************************************************************************/ 267 | 268 | /* This is a helper function for process_variant_worker() which is the 269 | * start routine for the threads that count how many times a variant 270 | * matches the main input string. This routing locks "data" and 271 | * attempts to get the index of the next task. If successful, it 272 | * takes ownership of that index by incrementing "data->next_task" so 273 | * that the next thread that comes along will get the next task. 
274 | * Before returning, this routine releases the lock. This routine 275 | * returns true if successful and false otherwise. */ 276 | static int 277 | get_variant_index(variant_worker_data_t data, 278 | int* index) 279 | { 280 | int rv = 0; 281 | 282 | /* Lock "data". */ 283 | pthread_mutex_lock(&data->lock); 284 | 285 | /* Get the index for the next task if any remain. */ 286 | if (data->next_task < data->total_tasks) { 287 | *index = data->next_task++; 288 | rv = 1; 289 | } 290 | 291 | /* Unlock "data". */ 292 | pthread_mutex_unlock(&data->lock); 293 | 294 | return rv; 295 | } 296 | 297 | /* This is the worker routine for the thread pool that processes the 298 | * variants. This routine atomically gets the next task which holds 299 | * all the information needed to count the number of times the task's 300 | * "variant" value matches the main input string and stores the result 301 | * in the task's "count" value. The main input string is passed in as 302 | * the task's read-only "s" value. */ 303 | static void* 304 | process_variant_worker(variant_worker_data_t data) 305 | { 306 | int index = 0; 307 | 308 | /* Carefully get the index for the next task. */ 309 | while (get_variant_index(data, &index)) { 310 | /* Perform the task of counting regex matches. */ 311 | data->tasks[index].count 312 | = regcount(data->tasks[index].variant, 313 | data->tasks[index].s); 314 | } 315 | 316 | return NULL; 317 | } 318 | 319 | /* Process the list of variants by counting the frequency of each 320 | * regexp in the main input string "s" and printing the results. */ 321 | static void 322 | process_variants(int cpu_count, 323 | Tcl_Obj* s) 324 | { 325 | int i = 0; 326 | int s_length = 0; 327 | int thread_rv = 0; 328 | int thread_count = 0; 329 | int task_count = 0; 330 | pthread_t* threads = NULL; 331 | variant_worker_task_t tasks = NULL; 332 | struct variant_worker_data data = {PTHREAD_MUTEX_INITIALIZER,}; 333 | 334 | /* WARNING: Tcl_RegExpExecObj() always does an internal conversion 335 | * of "s" to a UCS-2 Unicode string if "s" is in UTF-8 format. 336 | * Normally, this is a nice feature, but as of tcl-8.5, it doesn't 337 | * appear to be thread-safe. As a work-around, force the 338 | * conversion now before starting the threads. */ 339 | Tcl_GetUnicodeFromObj(s, &s_length); 340 | 341 | /* Determine the total number of variants (minus the NULL sentinel). */ 342 | task_count = (int)(sizeof(variants) / sizeof(variants[0]) - 1); 343 | 344 | /* Determine the number of threads to start. */ 345 | thread_count = cpu_count * 2; 346 | if (thread_count > task_count) { 347 | thread_count = task_count; 348 | } 349 | 350 | /* Allocate the "threads" array which holds the thread IDs. */ 351 | threads = calloc(thread_count, sizeof(*threads)); 352 | if (!threads) { 353 | perror("calloc"); 354 | exit(1); 355 | } 356 | 357 | /* Allocate the "tasks" array which holds one unit of work per 358 | * element in the array. */ 359 | tasks = calloc(task_count, sizeof(*tasks)); 360 | if (!tasks) { 361 | perror("calloc"); 362 | exit(1); 363 | } 364 | 365 | /* Initialize the task array. */ 366 | for (i = 0 ; i < task_count ; ++i) { 367 | tasks[i].variant = variants[i]; 368 | tasks[i].s = s; 369 | tasks[i].count = 0; 370 | } 371 | 372 | /* Initialize the data shared by the threads. */ 373 | data.tasks = tasks; 374 | data.next_task = 0; 375 | data.total_tasks = task_count; 376 | 377 | /* Start the threads. 
*/ 378 | for (i = 0 ; i < thread_count ; ++i) { 379 | thread_rv = pthread_create(&threads[i], 380 | NULL, 381 | (thread_start_t)process_variant_worker, 382 | &data); 383 | if (thread_rv) { 384 | fprintf(stderr, "*** Error: pthread_create: failed"); 385 | exit(1); 386 | } 387 | } 388 | 389 | /* Wait for each thread to finish. */ 390 | for (i = 0 ; i < thread_count ; ++i) { 391 | thread_rv = pthread_join(threads[i], NULL); 392 | if (thread_rv) { 393 | fprintf(stderr, "*** Error: pthread_join: failed"); 394 | exit(1); 395 | } 396 | } 397 | 398 | /* Print results. */ 399 | for (i = 0 ; i < task_count ; ++i) { 400 | printf("%s %lu\n", variants[i], tasks[i].count); 401 | } 402 | 403 | /* Clean up. */ 404 | free(tasks); 405 | free(threads); 406 | } 407 | 408 | /************************************************************************* 409 | * process_nacodes_worker() and process_nacodes() 410 | *************************************************************************/ 411 | 412 | /* This is the worker routing for the threads that process the 413 | * substitution of the nucleic acid codes with their meanings. These 414 | * threads are not in a thread pool because the work can be divided 415 | * exactly into one thread per cpu. So the parent just starts each 416 | * thread and waits for them all to finish. 417 | * 418 | * Each worker gets a range of characters from the main input string 419 | * and is responsible for calling regsub() once for each nucleic acid 420 | * code. Thus, if there are 11 nucleic acid codes, each thread calls 421 | * regsub() 11 times but the scope of the regsub() call is limited to 422 | * just the range of characters it has been assigned. */ 423 | static void* 424 | process_nacodes_worker(nacodes_worker_data_t data) 425 | { 426 | char* s_in = NULL; 427 | char* s_out = NULL; 428 | struct nucleic_acid_code* nacode = NULL; 429 | 430 | /* Get the character range as a C-style string. */ 431 | s_in = Tcl_GetString(data->range); 432 | 433 | /* Iterate over the nucleic acid codes. */ 434 | for (nacode = nacodes ; nacode->code ; ++nacode) { 435 | 436 | /* Perform the substitution. */ 437 | s_out = regsub(nacode->code, s_in, nacode->meaning, NULL); 438 | 439 | /* Free s_in on all but the first pass because s_in 440 | * belongs to Tcl on the first pass. */ 441 | if (nacode != nacodes) { 442 | g_free(s_in); 443 | s_in = NULL; 444 | } 445 | /* If this is the last pass, save the result and clean up. */ 446 | if ((nacode + 1)->code == NULL) { 447 | Tcl_SetStringObj(data->range, s_out, strlen(s_out)); 448 | g_free(s_out); 449 | s_out = NULL; 450 | } else { 451 | /* Otherwise, prepare for the next iteration. */ 452 | s_in = s_out; 453 | s_out = NULL; 454 | } 455 | } 456 | 457 | return NULL; 458 | } 459 | 460 | /* Process the nucleic acid codes by substituting each nucleic acid 461 | * code in "s" with its meaning as defined in the static "nacodes" 462 | * structure (see top of file). On return, "s" will hold the 463 | * substituted string. */ 464 | static void 465 | process_nacodes(int cpu_count, 466 | Tcl_Obj* s) 467 | { 468 | int i = 0; 469 | int first = 0; 470 | int last = 0; 471 | int s_length = 0; 472 | int range_length = 0; 473 | int thread_rv = 0; 474 | nacodes_worker_data_t data = NULL; 475 | pthread_t* threads = NULL; 476 | 477 | /* Sanity check to make sure we don't divide by zero. */ 478 | if (cpu_count == 0) { 479 | return; 480 | } 481 | 482 | /* Get the total length of s. 
*/ 483 | s_length = Tcl_GetCharLength(s); 484 | if (s_length == 0) { 485 | return; 486 | } 487 | 488 | /* Allocate the "data" array which is used to pass data to and 489 | * from the threads. */ 490 | data = calloc(cpu_count, sizeof(*data)); 491 | 492 | /* Allocate the "threads" array which holds the thread IDs. */ 493 | threads = calloc(cpu_count, sizeof(*threads)); 494 | 495 | /* Calculate the number of characters to feed each thread. Note 496 | * that we checked above to make sure cpu_count is not zero. */ 497 | range_length = s_length / cpu_count; 498 | 499 | /* Start one thread for each cpu. */ 500 | for (i = 0 ; i < cpu_count ; ++i) { 501 | 502 | /* First, initialize the thread's client data. */ 503 | 504 | /* Calculate the first and last index for the range. Both 505 | * "first" and "last" indexes are inclusive because that is 506 | * what Tcl_GetRange() requires. We also need to make sure 507 | * the very last range has all the characters in case 508 | * range_length does not divide s_length evenly. */ 509 | first = range_length * i; 510 | last = range_length * (i + 1) - 1; 511 | if (i + 1 == cpu_count) { 512 | last = s_length - 1; 513 | } 514 | 515 | /* Pack the data for the worker thread. */ 516 | data[i].range = Tcl_GetRange(s, first, last); 517 | Tcl_IncrRefCount(data[i].range); 518 | 519 | /* Second, start the thread. */ 520 | thread_rv = pthread_create(&threads[i], 521 | NULL, 522 | (thread_start_t)process_nacodes_worker, 523 | &data[i]); 524 | if (thread_rv) { 525 | fprintf(stderr, "*** Error: pthread_create: failed"); 526 | exit(1); 527 | } 528 | } 529 | 530 | /* Wait for each thread to finish. */ 531 | for (i = 0 ; i < cpu_count ; ++i) { 532 | thread_rv = pthread_join(threads[i], NULL); 533 | if (thread_rv) { 534 | fprintf(stderr, "*** Error: pthread_join: failed"); 535 | exit(1); 536 | } 537 | } 538 | 539 | /* Merge results. */ 540 | Tcl_SetObjLength(s, 0); 541 | for (i = 0 ; i < cpu_count ; ++i) { 542 | Tcl_AppendObjToObj(s, data[i].range); 543 | } 544 | 545 | /* Clean up. */ 546 | for (i = 0 ; i < cpu_count ; ++i) { 547 | Tcl_DecrRefCount(data[i].range); 548 | } 549 | free(threads); 550 | free(data); 551 | } 552 | 553 | /************************************************************************* 554 | * get_cpu_count() 555 | *************************************************************************/ 556 | 557 | /* Return the number of cpus. If an error occurs, 0 cpus will be 558 | * reported. There are other ways to do this, but this is a program 559 | * to test regexp processing so ... */ 560 | static int 561 | get_cpu_count(void) 562 | { 563 | int rv = 0; 564 | FILE* f = NULL; 565 | Tcl_Obj* s = NULL; 566 | 567 | /* Allocate a string. */ 568 | s = Tcl_NewStringObj("", 0); 569 | Tcl_IncrRefCount(s); 570 | 571 | /* Open /proc/cpuinfo. */ 572 | f = fopen("/proc/cpuinfo", "r"); 573 | if (!f) { 574 | goto out; 575 | } 576 | 577 | /* Load file into s. */ 578 | load_file(f, s); 579 | 580 | /* Count the number of cpus. "\M" matches at the end of a word. */ 581 | rv = regcount("^processor\\M", s); 582 | 583 | out: 584 | 585 | /* Clean up. 
*/ 586 | if (f) { 587 | fclose(f); 588 | } 589 | if (s) { 590 | Tcl_DecrRefCount(s); 591 | } 592 | 593 | return rv; 594 | } 595 | 596 | /************************************************************************* 597 | * main() 598 | *************************************************************************/ 599 | 600 | int 601 | main(int argc, 602 | char* argv[]) 603 | { 604 | int rv = 0; 605 | int cpu_count = 0; 606 | int init_length = 0; 607 | int code_length = 0; 608 | int seq_length = 0; 609 | char* s_cstr = NULL; 610 | Tcl_Interp *tcl = NULL; 611 | Tcl_Obj* s = NULL; 612 | 613 | /* Initialize Tcl. */ 614 | Tcl_FindExecutable(argv[0]); 615 | tcl = Tcl_CreateInterp(); 616 | Tcl_Preserve((ClientData)tcl); 617 | 618 | /* Count the number of cpus. If the cpu count could not be 619 | * determined, assume 4 cpus. */ 620 | cpu_count = get_cpu_count(); 621 | if (!cpu_count) { 622 | cpu_count = 4; 623 | } 624 | 625 | /* Allocate s. */ 626 | s = Tcl_NewStringObj("", 0); 627 | Tcl_IncrRefCount(s); 628 | 629 | /* Load stdin into s. */ 630 | load_file(stdin, s); 631 | 632 | /* Get the length of s. */ 633 | init_length = Tcl_GetCharLength(s); 634 | 635 | /* Strip off section headers and EOLs from s. This is a little 636 | * messy because we have to go from Tcl-string to C-string and 637 | * back to Tcl-string. */ 638 | s_cstr = regsub("(>.*)|\n", Tcl_GetString(s), "", NULL); 639 | Tcl_SetStringObj(s, s_cstr, strlen(s_cstr)); 640 | g_free(s_cstr); 641 | s_cstr = NULL; 642 | 643 | /* Get the length of s. */ 644 | code_length = Tcl_GetCharLength(s); 645 | 646 | /* Process the variants by counting them and printing the results. */ 647 | process_variants(cpu_count, s); 648 | 649 | /* Substitute nucleic acid codes in s with their meanings. */ 650 | process_nacodes(cpu_count, s); 651 | 652 | /* Get the length of s. */ 653 | seq_length = Tcl_GetCharLength(s); 654 | 655 | /* Print the lengths. */ 656 | printf("\n%d\n%d\n%d\n", init_length, code_length, seq_length); 657 | 658 | /* Clean up. */ 659 | Tcl_DecrRefCount(s); 660 | 661 | /* Finalize Tcl. */ 662 | Tcl_Release((ClientData)tcl); 663 | Tcl_Exit(rv); 664 | 665 | /* Not reached. */ 666 | return rv; 667 | } 668 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna.go: -------------------------------------------------------------------------------- 1 | /* The Computer Language Benchmarks Game 2 | * http://benchmarksgame.alioth.debian.org/ 3 | * 4 | * contributed by The Go Authors. 
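 *
 * Each of the nine variant counts runs in its own goroutine, and the IUB-code
 * substitutions run in another; the results are collected over channels.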
5 | */ 6 | 7 | package main 8 | 9 | import ( 10 | "fmt" 11 | "io/ioutil" 12 | "os" 13 | "regexp" 14 | "runtime" 15 | ) 16 | 17 | var variants = []string{ 18 | "agggtaaa|tttaccct", 19 | "[cgt]gggtaaa|tttaccc[acg]", 20 | "a[act]ggtaaa|tttacc[agt]t", 21 | "ag[act]gtaaa|tttac[agt]ct", 22 | "agg[act]taaa|ttta[agt]cct", 23 | "aggg[acg]aaa|ttt[cgt]ccct", 24 | "agggt[cgt]aa|tt[acg]accct", 25 | "agggta[cgt]a|t[acg]taccct", 26 | "agggtaa[cgt]|[acg]ttaccct", 27 | } 28 | 29 | type Subst struct { 30 | pat, repl string 31 | } 32 | 33 | var substs = []Subst{ 34 | Subst{"B", "(c|g|t)"}, 35 | Subst{"D", "(a|g|t)"}, 36 | Subst{"H", "(a|c|t)"}, 37 | Subst{"K", "(g|t)"}, 38 | Subst{"M", "(a|c)"}, 39 | Subst{"N", "(a|c|g|t)"}, 40 | Subst{"R", "(a|g)"}, 41 | Subst{"S", "(c|g)"}, 42 | Subst{"V", "(a|c|g)"}, 43 | Subst{"W", "(a|t)"}, 44 | Subst{"Y", "(c|t)"}, 45 | } 46 | 47 | func countMatches(pat string, bytes []byte) int { 48 | re := regexp.MustCompile(pat) 49 | n := 0 50 | for { 51 | e := re.FindIndex(bytes) 52 | if e == nil { 53 | break 54 | } 55 | n++ 56 | bytes = bytes[e[1]:] 57 | } 58 | return n 59 | } 60 | 61 | func main() { 62 | runtime.GOMAXPROCS(runtime.NumCPU()) 63 | 64 | bytes, err := ioutil.ReadFile("/dev/stdin") 65 | if err != nil { 66 | fmt.Fprintf(os.Stderr, "can't read input: %s\n", err) 67 | os.Exit(2) 68 | } 69 | ilen := len(bytes) 70 | // Delete the comment lines and newlines 71 | bytes = regexp.MustCompile("(>[^\n]+)?\n").ReplaceAll(bytes, []byte{}) 72 | clen := len(bytes) 73 | 74 | mresults := make([]chan int, len(variants)) 75 | for i, s := range variants { 76 | ch := make(chan int) 77 | mresults[i] = ch 78 | go func(ss string) { 79 | ch <- countMatches(ss, bytes) 80 | }(s) 81 | } 82 | 83 | lenresult := make(chan int) 84 | bb := bytes 85 | go func() { 86 | for _, sub := range substs { 87 | bb = regexp.MustCompile(sub.pat).ReplaceAll(bb, []byte(sub.repl)) 88 | } 89 | lenresult <- len(bb) 90 | }() 91 | 92 | for i, s := range variants { 93 | fmt.Printf("%s %d\n", s, <-mresults[i]) 94 | } 95 | fmt.Printf("\n%d\n%d\n%d\n", ilen, clen, <-lenresult) 96 | } 97 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna.py: -------------------------------------------------------------------------------- 1 | # The Computer Language Benchmarks Game 2 | # http://shootout.alioth.debian.org/ 3 | # contributed by Dominique Wahli 4 | # 2to3 5 | # mp by Ahmad Syukri 6 | # modified by Justin Peel 7 | 8 | from sys import stdin 9 | from re import sub, findall 10 | from multiprocessing import Pool 11 | 12 | def init(arg): 13 | global seq 14 | seq = arg 15 | 16 | def var_find(f): 17 | return len(findall(f, seq)) 18 | 19 | def main(): 20 | seq = stdin.read() 21 | ilen = len(seq) 22 | 23 | seq = sub('>.*\n|\n', '', seq) 24 | clen = len(seq) 25 | 26 | pool = Pool(initializer = init, initargs = (seq,)) 27 | 28 | variants = ( 29 | 'agggtaaa|tttaccct', 30 | '[cgt]gggtaaa|tttaccc[acg]', 31 | 'a[act]ggtaaa|tttacc[agt]t', 32 | 'ag[act]gtaaa|tttac[agt]ct', 33 | 'agg[act]taaa|ttta[agt]cct', 34 | 'aggg[acg]aaa|ttt[cgt]ccct', 35 | 'agggt[cgt]aa|tt[acg]accct', 36 | 'agggta[cgt]a|t[acg]taccct', 37 | 'agggtaa[cgt]|[acg]ttaccct') 38 | for f in zip(variants, pool.imap(var_find, variants)): 39 | print(f[0], f[1]) 40 | 41 | subst = { 42 | 'B' : '(c|g|t)', 'D' : '(a|g|t)', 'H' : '(a|c|t)', 'K' : '(g|t)', 43 | 'M' : '(a|c)', 'N' : '(a|c|g|t)', 'R' : '(a|g)', 'S' : '(c|g)', 44 | 'V' : '(a|c|g)', 'W' : '(a|t)', 'Y' : '(c|t)'} 45 | for f, r in list(subst.items()): 46 | seq = sub(f, 
r, seq) 47 | 48 | print() 49 | print(ilen) 50 | print(clen) 51 | print(len(seq)) 52 | 53 | if __name__=="__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /benchmark/regex-dna/regex-dna.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // FIXME(#13725) windows needs fixing. 12 | // ignore-win32 13 | // ignore-stage1 14 | // ignore-cross-compile #12102 15 | 16 | #![feature(macro_rules, phase)] 17 | 18 | extern crate regex; 19 | #[phase(syntax)]extern crate regex_macros; 20 | extern crate sync; 21 | 22 | use std::io; 23 | use regex::{NoExpand, Regex}; 24 | use sync::Arc; 25 | 26 | fn count_matches(seq: &str, variant: &Regex) -> int { 27 | let mut n = 0; 28 | for _ in variant.find_iter(seq) { 29 | n += 1; 30 | } 31 | n 32 | } 33 | 34 | fn main() { 35 | let mut rdr = if std::os::getenv("RUST_BENCH").is_some() { 36 | let fd = io::File::open(&Path::new("shootout-k-nucleotide.data")); 37 | ~io::BufferedReader::new(fd) as ~io::Reader 38 | } else { 39 | ~io::stdin() as ~io::Reader 40 | }; 41 | let mut seq = StrBuf::from_str(rdr.read_to_str().unwrap()); 42 | let ilen = seq.len(); 43 | 44 | seq = regex!(">[^\n]*\n|\n").replace_all(seq.as_slice(), NoExpand("")); 45 | let seq_arc = Arc::new(seq.clone()); // copy before it moves 46 | let clen = seq.len(); 47 | 48 | let mut seqlen = sync::Future::spawn(proc() { 49 | let substs = ~[ 50 | (regex!("B"), "(c|g|t)"), 51 | (regex!("D"), "(a|g|t)"), 52 | (regex!("H"), "(a|c|t)"), 53 | (regex!("K"), "(g|t)"), 54 | (regex!("M"), "(a|c)"), 55 | (regex!("N"), "(a|c|g|t)"), 56 | (regex!("R"), "(a|g)"), 57 | (regex!("S"), "(c|g)"), 58 | (regex!("V"), "(a|c|g)"), 59 | (regex!("W"), "(a|t)"), 60 | (regex!("Y"), "(c|t)"), 61 | ]; 62 | let mut seq = seq; 63 | for (re, replacement) in substs.move_iter() { 64 | seq = re.replace_all(seq.as_slice(), NoExpand(replacement)); 65 | } 66 | seq.len() 67 | }); 68 | 69 | let variants = ~[ 70 | regex!("agggtaaa|tttaccct"), 71 | regex!("[cgt]gggtaaa|tttaccc[acg]"), 72 | regex!("a[act]ggtaaa|tttacc[agt]t"), 73 | regex!("ag[act]gtaaa|tttac[agt]ct"), 74 | regex!("agg[act]taaa|ttta[agt]cct"), 75 | regex!("aggg[acg]aaa|ttt[cgt]ccct"), 76 | regex!("agggt[cgt]aa|tt[acg]accct"), 77 | regex!("agggta[cgt]a|t[acg]taccct"), 78 | regex!("agggtaa[cgt]|[acg]ttaccct"), 79 | ]; 80 | let (mut variant_strs, mut counts) = (vec!(), vec!()); 81 | for variant in variants.move_iter() { 82 | let seq_arc_copy = seq_arc.clone(); 83 | variant_strs.push(variant.to_str().to_owned()); 84 | counts.push(sync::Future::spawn(proc() { 85 | count_matches(seq_arc_copy.as_slice(), &variant) 86 | })); 87 | } 88 | 89 | for (i, variant) in variant_strs.iter().enumerate() { 90 | println!("{} {}", variant, counts.get_mut(i).get()); 91 | } 92 | println!(""); 93 | println!("{}", ilen); 94 | println!("{}", clen); 95 | println!("{}", seqlen.get()); 96 | } 97 | -------------------------------------------------------------------------------- /benchmark/regex-dna/shootout-fasta.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2012-2013 The Rust Project 
Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | /* -*- mode: rust; indent-tabs-mode: nil -*- 12 | * Implementation of 'fasta' benchmark from 13 | * Computer Language Benchmarks Game 14 | * http://shootout.alioth.debian.org/ 15 | */ 16 | 17 | #![allow(unused_must_use)] 18 | 19 | use std::io; 20 | use std::io::{BufferedWriter, File}; 21 | use std::cmp::min; 22 | use std::os; 23 | 24 | static LINE_LENGTH: uint = 60; 25 | static IM: u32 = 139968; 26 | 27 | struct MyRandom { 28 | last: u32 29 | } 30 | impl MyRandom { 31 | fn new() -> MyRandom { MyRandom { last: 42 } } 32 | fn normalize(p: f32) -> u32 {(p * IM as f32).floor() as u32} 33 | fn gen(&mut self) -> u32 { 34 | self.last = (self.last * 3877 + 29573) % IM; 35 | self.last 36 | } 37 | } 38 | 39 | struct AAGen<'a> { 40 | rng: &'a mut MyRandom, 41 | data: Vec<(u32, u8)> } 42 | impl<'a> AAGen<'a> { 43 | fn new<'b>(rng: &'b mut MyRandom, aa: &[(char, f32)]) -> AAGen<'b> { 44 | let mut cum = 0.; 45 | let data = aa.iter() 46 | .map(|&(ch, p)| { cum += p; (MyRandom::normalize(cum), ch as u8) }) 47 | .collect(); 48 | AAGen { rng: rng, data: data } 49 | } 50 | } 51 | impl<'a> Iterator for AAGen<'a> { 52 | fn next(&mut self) -> Option { 53 | let r = self.rng.gen(); 54 | self.data.iter() 55 | .skip_while(|pc| pc.val0() < r) 56 | .map(|&(_, c)| c) 57 | .next() 58 | } 59 | } 60 | 61 | fn make_fasta>( 62 | wr: &mut W, header: &str, mut it: I, mut n: uint) 63 | { 64 | wr.write(header.as_bytes()); 65 | let mut line = [0u8, .. 
LINE_LENGTH + 1]; 66 | while n > 0 { 67 | let nb = min(LINE_LENGTH, n); 68 | for i in range(0, nb) { 69 | line[i] = it.next().unwrap(); 70 | } 71 | n -= nb; 72 | line[nb] = '\n' as u8; 73 | wr.write(line.slice_to(nb + 1)); 74 | } 75 | } 76 | 77 | fn run(writer: &mut W) { 78 | let args = os::args(); 79 | let n = if os::getenv("RUST_BENCH").is_some() { 80 | 25000000 81 | } else if args.len() <= 1u { 82 | 1000 83 | } else { 84 | from_str(args[1]).unwrap() 85 | }; 86 | 87 | let rng = &mut MyRandom::new(); 88 | let alu = 89 | "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG\ 90 | GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA\ 91 | CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT\ 92 | ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA\ 93 | GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG\ 94 | AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC\ 95 | AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"; 96 | let iub = &[('a', 0.27), ('c', 0.12), ('g', 0.12), 97 | ('t', 0.27), ('B', 0.02), ('D', 0.02), 98 | ('H', 0.02), ('K', 0.02), ('M', 0.02), 99 | ('N', 0.02), ('R', 0.02), ('S', 0.02), 100 | ('V', 0.02), ('W', 0.02), ('Y', 0.02)]; 101 | let homosapiens = &[('a', 0.3029549426680), 102 | ('c', 0.1979883004921), 103 | ('g', 0.1975473066391), 104 | ('t', 0.3015094502008)]; 105 | 106 | make_fasta(writer, ">ONE Homo sapiens alu\n", 107 | alu.as_bytes().iter().cycle().map(|c| *c), n * 2); 108 | make_fasta(writer, ">TWO IUB ambiguity codes\n", 109 | AAGen::new(rng, iub), n * 3); 110 | make_fasta(writer, ">THREE Homo sapiens frequency\n", 111 | AAGen::new(rng, homosapiens), n * 5); 112 | 113 | writer.flush(); 114 | } 115 | 116 | fn main() { 117 | if os::getenv("RUST_BENCH").is_some() { 118 | let mut file = BufferedWriter::new(File::create(&Path::new("./shootout-fasta.data"))); 119 | run(&mut file); 120 | } else { 121 | run(&mut io::stdout()); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /benchmark/rust: -------------------------------------------------------------------------------- 1 | literal 435 ns/iter (+/- 2) 2 | not_literal 1967 ns/iter (+/- 10) 3 | match_class 2545 ns/iter (+/- 17) 4 | match_class_in_range 2644 ns/iter (+/- 34) 5 | replace_all 6224 ns/iter (+/- 398) 6 | anchored_literal_short_non_match 991 ns/iter (+/- 4) 7 | anchored_literal_long_non_match 9119 ns/iter (+/- 20) 8 | anchored_literal_short_match 571 ns/iter (+/- 4) 9 | anchored_literal_long_match 565 ns/iter (+/- 2) 10 | one_pass_short_a 2149 ns/iter (+/- 17) 11 | one_pass_short_a_not 2644 ns/iter (+/- 27) 12 | one_pass_short_b 1565 ns/iter (+/- 7) 13 | one_pass_short_b_not 2157 ns/iter (+/- 10) 14 | one_pass_long_prefix 1281 ns/iter (+/- 11) 15 | one_pass_long_prefix_not 1234 ns/iter (+/- 6) 16 | easy0_32 651 ns/iter (+/- 4) = 49 MB/s 17 | easy0_1K 2123 ns/iter (+/- 115) = 482 MB/s 18 | easy0_32K 48763 ns/iter (+/- 896) = 671 MB/s 19 | easy0_1M 1545978 ns/iter (+/- 5075) = 677 MB/s 20 | easy1_32 609 ns/iter (+/- 154) = 52 MB/s 21 | easy1_1K 3091 ns/iter (+/- 815) = 331 MB/s 22 | easy1_32K 83045 ns/iter (+/- 4995) = 394 MB/s 23 | easy1_1M 2654424 ns/iter (+/- 34276) = 394 MB/s 24 | medium_32 1648 ns/iter (+/- 63) = 19 MB/s 25 | medium_1K 33882 ns/iter (+/- 838) = 30 MB/s 26 | medium_32K 1072079 ns/iter (+/- 5921) = 30 MB/s 27 | medium_1M 34140609 ns/iter (+/- 51115) = 30 MB/s 28 | hard_32 2479 ns/iter (+/- 40) = 12 MB/s 29 | hard_1K 54950 ns/iter (+/- 255) = 18 MB/s 30 | hard_32K 1738851 ns/iter (+/- 3483) = 18 MB/s 31 | hard_1M 55405512 ns/iter (+/- 40061) = 18 MB/s 32 | no_exponential 269850 ns/iter (+/- 380) 33 | 
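The random stream driving the IUB and homo sapiens sections in shootout-fasta.rs above is a small linear congruential generator: `last = (last * 3877 + 29573) % 139968`, seeded with 42, so the generated sequences are fully deterministic. A minimal standalone sketch of just that generator (the `Lcg` name is illustrative, not from the benchmark source):

```rust
// Same modulus, multiplier, increment and seed as MyRandom in shootout-fasta.rs.
static IM: u32 = 139968;

struct Lcg { last: u32 }

impl Lcg {
    fn new() -> Lcg { Lcg { last: 42 } }   // fixed seed => reproducible output
    fn gen(&mut self) -> u32 {
        self.last = (self.last * 3877 + 29573) % IM;
        self.last
    }
}

fn main() {
    let mut rng = Lcg::new();
    // Print the start of the stream used to pick weighted nucleotides.
    for _ in range(0u, 5u) {
        println!("{}", rng.gen());
    }
}
```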
-------------------------------------------------------------------------------- /cargo-lite.conf: -------------------------------------------------------------------------------- 1 | [build] 2 | crate_root = "src/lib.rs" 3 | crate_type = "library" 4 | 5 | -------------------------------------------------------------------------------- /ctags.rust: -------------------------------------------------------------------------------- 1 | --langdef=Rust 2 | --langmap=Rust:.rs 3 | --regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/ 4 | --regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/ 5 | --regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/ 6 | --regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/ 7 | --regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/ 8 | --regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/ 9 | --regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/ 10 | --regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/ 11 | --regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/ 12 | -------------------------------------------------------------------------------- /regex-match-tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2014 The Rust Project Developers. See the COPYRIGHT 4 | # file at the top-level directory of this distribution and at 5 | # http://rust-lang.org/COPYRIGHT. 6 | # 7 | # Licensed under the Apache License, Version 2.0 or the MIT license 9 | # , at your 10 | # option. This file may not be copied, modified, or distributed 11 | # except according to those terms. 12 | 13 | from __future__ import absolute_import, division, print_function 14 | import argparse 15 | import datetime 16 | import os.path as path 17 | 18 | 19 | def print_tests(tests): 20 | print('\n'.join([test_tostr(t) for t in tests])) 21 | 22 | 23 | def read_tests(f): 24 | basename, _ = path.splitext(path.basename(f)) 25 | tests = [] 26 | for lineno, line in enumerate(open(f), 1): 27 | fields = filter(None, map(str.strip, line.split('\t'))) 28 | if not (4 <= len(fields) <= 5) \ 29 | or 'E' not in fields[0] or fields[0][0] == '#': 30 | continue 31 | 32 | opts, pat, text, sgroups = fields[0:4] 33 | groups = [] # groups as integer ranges 34 | if sgroups == 'NOMATCH': 35 | groups = [None] 36 | elif ',' in sgroups: 37 | noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) 38 | for g in noparen: 39 | s, e = map(str.strip, g.split(',')) 40 | if s == '?' and e == '?': 41 | groups.append(None) 42 | else: 43 | groups.append((int(s), int(e))) 44 | else: 45 | # This skips tests that should result in an error. 46 | # There aren't many, so I think we can just capture those 47 | # manually. Possibly fix this in future. 
48 | continue 49 | 50 | if pat == 'SAME': 51 | pat = tests[-1][1] 52 | if '$' in opts: 53 | pat = pat.decode('string_escape') 54 | text = text.decode('string_escape') 55 | if 'i' in opts: 56 | pat = '(?i)%s' % pat 57 | 58 | name = '%s_%d' % (basename, lineno) 59 | tests.append((name, pat, text, groups)) 60 | return tests 61 | 62 | 63 | def test_tostr(t): 64 | lineno, pat, text, groups = t 65 | options = map(group_tostr, groups) 66 | return 'mat!(match_%s, r"%s", r"%s", %s)' \ 67 | % (lineno, pat, '' if text == "NULL" else text, ', '.join(options)) 68 | 69 | 70 | def group_tostr(g): 71 | if g is None: 72 | return 'None' 73 | else: 74 | return 'Some((%d, %d))' % (g[0], g[1]) 75 | 76 | 77 | if __name__ == '__main__': 78 | parser = argparse.ArgumentParser( 79 | description='Generate match tests from an AT&T POSIX test file.') 80 | aa = parser.add_argument 81 | aa('files', nargs='+', 82 | help='A list of dat AT&T POSIX test files. See src/libregexp/testdata') 83 | args = parser.parse_args() 84 | 85 | tests = [] 86 | for f in args.files: 87 | tests += read_tests(f) 88 | 89 | tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT 90 | // file at the top-level directory of this distribution and at 91 | // http://rust-lang.org/COPYRIGHT. 92 | // 93 | // Licensed under the Apache License, Version 2.0 or the MIT license 95 | // , at your 96 | // option. This file may not be copied, modified, or distributed 97 | // except according to those terms. 98 | 99 | // ignore-tidy-linelength 100 | 101 | // DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests' 102 | // on {date}. 103 | ''' 104 | print(tpl.format(date=str(datetime.datetime.now()))) 105 | 106 | for f in args.files: 107 | print('// Tests from %s' % path.basename(f)) 108 | print_tests(read_tests(f)) 109 | print('') 110 | -------------------------------------------------------------------------------- /regex-unicode-tables.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # Copyright 2014 The Rust Project Developers. See the COPYRIGHT 4 | # file at the top-level directory of this distribution and at 5 | # http://rust-lang.org/COPYRIGHT. 6 | # 7 | # Licensed under the Apache License, Version 2.0 or the MIT license 9 | # , at your 10 | # option. This file may not be copied, modified, or distributed 11 | # except according to those terms. 
12 | 13 | from __future__ import absolute_import, division, print_function 14 | import argparse 15 | from collections import defaultdict 16 | import csv 17 | import datetime 18 | import urllib2 19 | 20 | BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/' 21 | DATA = 'UnicodeData.txt' 22 | SCRIPTS = 'Scripts.txt' 23 | 24 | # Mapping taken from Table 12 from: 25 | # http://www.unicode.org/reports/tr44/#General_Category_Values 26 | expanded_categories = { 27 | 'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'], 28 | 'Lm': ['L'], 'Lo': ['L'], 29 | 'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'], 30 | 'Nd': ['N'], 'Nl': ['N'], 'No': ['No'], 31 | 'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'], 32 | 'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'], 33 | 'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'], 34 | 'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'], 35 | 'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'], 36 | } 37 | 38 | 39 | def as_4byte_uni(n): 40 | s = hex(n)[2:] 41 | return '\\U%s%s' % ('0' * (8 - len(s)), s) 42 | 43 | 44 | def expand_cat(c): 45 | return expanded_categories.get(c, []) + [c] 46 | 47 | 48 | def is_valid_unicode(n): 49 | return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF 50 | 51 | 52 | def read_cats(f): 53 | assigned = defaultdict(list) 54 | for row in csv.reader(f, delimiter=';'): 55 | (hex, cats) = (int(row[0], 16), expand_cat(row[2])) 56 | if not is_valid_unicode(hex): 57 | continue 58 | for cat in cats: 59 | assigned[cat].append(hex) 60 | return assigned 61 | 62 | 63 | def read_scripts(f): 64 | assigned = defaultdict(list) 65 | for line in f: 66 | line = line.strip() 67 | if not line or line.startswith('#'): 68 | continue 69 | hexes, name = map(str.strip, line.split(';'))[:2] 70 | name = name[:name.index('#')].strip() 71 | if '..' not in hexes: 72 | hex = int(hexes, 16) 73 | if is_valid_unicode(hex): 74 | assigned[name].append(hex) 75 | else: 76 | hex1, hex2 = map(lambda s: int(s, 16), hexes.split('..')) 77 | for hex in xrange(hex1, hex2 + 1): 78 | if is_valid_unicode(hex): 79 | assigned[name].append(hex) 80 | return assigned 81 | 82 | 83 | def group(letters): 84 | letters = sorted(set(letters)) 85 | grouped = [] 86 | cur_start = letters.pop(0) 87 | cur_end = cur_start 88 | for letter in letters: 89 | assert letter > cur_end, \ 90 | 'cur_end: %s, letter: %s' % (hex(cur_end), hex(letter)) 91 | 92 | if letter == cur_end + 1: 93 | cur_end = letter 94 | else: 95 | grouped.append((cur_start, cur_end)) 96 | cur_start, cur_end = letter, letter 97 | grouped.append((cur_start, cur_end)) 98 | return grouped 99 | 100 | 101 | def ranges_to_rust(rs): 102 | rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs) 103 | return ',\n '.join(rs) 104 | 105 | 106 | def groups_to_rust(groups): 107 | rust_groups = [] 108 | for group_name in sorted(groups): 109 | rust_groups.append('("%s", &[\n %s\n ]),' 110 | % (group_name, ranges_to_rust(groups[group_name]))) 111 | return '\n'.join(rust_groups) 112 | 113 | 114 | if __name__ == '__main__': 115 | parser = argparse.ArgumentParser( 116 | description='Generate Unicode character class tables.') 117 | aa = parser.add_argument 118 | aa('--local', action='store_true', 119 | help='When set, Scripts.txt and UnicodeData.txt will be read from ' 120 | 'the CWD.') 121 | aa('--base-url', type=str, default=BASE_URL, 122 | help='The base URL to use for downloading Unicode data files.') 123 | args = parser.parse_args() 124 | 125 | if args.local: 126 | cats = read_cats(open(DATA)) 127 | scripts = read_scripts(open(SCRIPTS)) 128 | else: 129 | 
cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA)) 130 | scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS)) 131 | 132 | # Get Rust code for all Unicode general categories and scripts. 133 | combined = dict(cats, **scripts) 134 | unigroups = groups_to_rust({k: group(letters) 135 | for k, letters in combined.items()}) 136 | 137 | # Now get Perl character classes that are Unicode friendly. 138 | perld = range(ord('0'), ord('9') + 1) 139 | dgroups = ranges_to_rust(group(perld + cats['Nd'][:])) 140 | 141 | perls = map(ord, ['\t', '\n', '\x0C', '\r', ' ']) 142 | sgroups = ranges_to_rust(group(perls + cats['Z'][:])) 143 | 144 | low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1)) 145 | perlw = [ord('_')] + perld + low + up 146 | wgroups = ranges_to_rust(group(perlw + cats['L'][:])) 147 | 148 | tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT 149 | // file at the top-level directory of this distribution and at 150 | // http://rust-lang.org/COPYRIGHT. 151 | // 152 | // Licensed under the Apache License, Version 2.0 or the MIT license 154 | // , at your 155 | // option. This file may not be copied, modified, or distributed 156 | // except according to those terms. 157 | 158 | // DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables' 159 | // on {date}. 160 | 161 | use parse::{{Class, NamedClasses}}; 162 | 163 | pub static UNICODE_CLASSES: NamedClasses = &[ 164 | 165 | {groups} 166 | 167 | ]; 168 | 169 | pub static PERLD: Class = &[ 170 | {dgroups} 171 | ]; 172 | 173 | pub static PERLS: Class = &[ 174 | {sgroups} 175 | ]; 176 | 177 | pub static PERLW: Class = &[ 178 | {wgroups} 179 | ]; 180 | ''' 181 | now = datetime.datetime.now() 182 | print(tpl.format(date=str(now), groups=unigroups, 183 | dgroups=dgroups, sgroups=sgroups, wgroups=wgroups)) 184 | -------------------------------------------------------------------------------- /session.vim: -------------------------------------------------------------------------------- 1 | au BufWritePost *.rs silent!make ctags > /dev/null 2>&1 2 | -------------------------------------------------------------------------------- /src/compile.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // Enable this to squash warnings due to exporting pieces of the representation 12 | // for use with the regex! macro. See lib.rs for explanation. 13 | #![allow(visible_private_types)] 14 | 15 | use std::cmp; 16 | use std::iter; 17 | use parse; 18 | use parse::{ 19 | Flags, FLAG_EMPTY, 20 | Nothing, Literal, Dot, Class, Begin, End, WordBoundary, Capture, Cat, Alt, 21 | Rep, 22 | ZeroOne, ZeroMore, OneMore, 23 | }; 24 | 25 | type InstIdx = uint; 26 | 27 | #[deriving(Show, Clone)] 28 | pub enum Inst { 29 | // When a Match instruction is executed, the current thread is successful. 30 | Match, 31 | 32 | // The OneChar instruction matches a literal character. 33 | // The flags indicate whether to do a case insensitive match. 34 | OneChar(char, Flags), 35 | 36 | // The CharClass instruction tries to match one input character against 37 | // the range of characters given. 
38 | // The flags indicate whether to do a case insentivie match and whether 39 | // the character class is negated or not. 40 | CharClass(Vec<(char, char)>, Flags), 41 | 42 | // Matches any character except new lines. 43 | // The flags indicate whether to include the '\n' character. 44 | Any(Flags), 45 | 46 | // Matches the beginning of the string, consumes no characters. 47 | // The flags indicate whether it matches if the preceding character 48 | // is a new line. 49 | EmptyBegin(Flags), 50 | 51 | // Matches the end of the string, consumes no characters. 52 | // The flags indicate whether it matches if the proceding character 53 | // is a new line. 54 | EmptyEnd(Flags), 55 | 56 | // Matches a word boundary (\w on one side and \W \A or \z on the other), 57 | // and consumes no character. 58 | // The flags indicate whether this matches a word boundary or something 59 | // that isn't a word boundary. 60 | EmptyWordBoundary(Flags), 61 | 62 | // Saves the current position in the input string to the Nth save slot. 63 | Save(uint), 64 | 65 | // Jumps to the instruction at the index given. 66 | Jump(InstIdx), 67 | 68 | // Jumps to the instruction at the first index given. If that leads to 69 | // a failing state, then the instruction at the second index given is 70 | // tried. 71 | Split(InstIdx, InstIdx), 72 | } 73 | 74 | /// Program represents a compiled regular expression. Once an expression is 75 | /// compiled, its representation is immutable and will never change. 76 | /// 77 | /// All of the data in a compiled expression is wrapped in "MaybeStatic" or 78 | /// "MaybeOwned" types so that a `Program` can be represented as static data. 79 | /// (This makes it convenient and efficient for use with the `regex!` macro.) 80 | #[deriving(Clone)] 81 | pub struct Program { 82 | /// A sequence of instructions. 83 | pub insts: Vec, 84 | /// If the regular expression requires a literal prefix in order to have a 85 | /// match, that prefix is stored here. (It's used in the VM to implement 86 | /// an optimization.) 87 | pub prefix: ~str, 88 | } 89 | 90 | impl Program { 91 | /// Compiles a Regex given its AST. 92 | pub fn new(ast: ~parse::Ast) -> (Program, ~[Option<~str>]) { 93 | let mut c = Compiler { 94 | insts: Vec::with_capacity(100), 95 | names: Vec::with_capacity(10), 96 | }; 97 | 98 | c.insts.push(Save(0)); 99 | c.compile(ast); 100 | c.insts.push(Save(1)); 101 | c.insts.push(Match); 102 | 103 | // Try to discover a literal string prefix. 104 | // This is a bit hacky since we have to skip over the initial 105 | // 'Save' instruction. 106 | let mut pre = StrBuf::with_capacity(5); 107 | for i in iter::range(1, c.insts.len()) { 108 | match *c.insts.get(i) { 109 | OneChar(c, FLAG_EMPTY) => pre.push_char(c), 110 | _ => break 111 | } 112 | } 113 | 114 | let names = c.names.as_slice().into_owned(); 115 | let prog = Program { 116 | insts: c.insts, 117 | prefix: pre.into_owned(), 118 | }; 119 | (prog, names) 120 | } 121 | 122 | /// Returns the total number of capture groups in the regular expression. 123 | /// This includes the zeroth capture. 124 | pub fn num_captures(&self) -> uint { 125 | let mut n = 0; 126 | for inst in self.insts.iter() { 127 | match *inst { 128 | Save(c) => n = cmp::max(n, c+1), 129 | _ => {} 130 | } 131 | } 132 | // There's exactly 2 Save slots for every capture. 133 | n / 2 134 | } 135 | } 136 | 137 | struct Compiler<'r> { 138 | insts: Vec, 139 | names: Vec>, 140 | } 141 | 142 | // The compiler implemented here is extremely simple. 
Most of the complexity 143 | // in this crate is in the parser or the VM. 144 | // The only tricky thing here is patching jump/split instructions to point to 145 | // the right instruction. 146 | impl<'r> Compiler<'r> { 147 | fn compile(&mut self, ast: ~parse::Ast) { 148 | match ast { 149 | ~Nothing => {}, 150 | ~Literal(c, flags) => self.push(OneChar(c, flags)), 151 | ~Dot(nl) => self.push(Any(nl)), 152 | ~Class(ranges, flags) => 153 | self.push(CharClass(ranges, flags)), 154 | ~Begin(flags) => self.push(EmptyBegin(flags)), 155 | ~End(flags) => self.push(EmptyEnd(flags)), 156 | ~WordBoundary(flags) => self.push(EmptyWordBoundary(flags)), 157 | ~Capture(cap, name, x) => { 158 | let len = self.names.len(); 159 | if cap >= len { 160 | self.names.grow(10 + cap - len, &None) 161 | } 162 | *self.names.get_mut(cap) = name; 163 | 164 | self.push(Save(2 * cap)); 165 | self.compile(x); 166 | self.push(Save(2 * cap + 1)); 167 | } 168 | ~Cat(xs) => { 169 | for x in xs.move_iter() { 170 | self.compile(x) 171 | } 172 | } 173 | ~Alt(x, y) => { 174 | let split = self.empty_split(); // push: split 0, 0 175 | let j1 = self.insts.len(); 176 | self.compile(x); // push: insts for x 177 | let jmp = self.empty_jump(); // push: jmp 0 178 | let j2 = self.insts.len(); 179 | self.compile(y); // push: insts for y 180 | let j3 = self.insts.len(); 181 | 182 | self.set_split(split, j1, j2); // split 0, 0 -> split j1, j2 183 | self.set_jump(jmp, j3); // jmp 0 -> jmp j3 184 | } 185 | ~Rep(x, ZeroOne, g) => { 186 | let split = self.empty_split(); 187 | let j1 = self.insts.len(); 188 | self.compile(x); 189 | let j2 = self.insts.len(); 190 | 191 | if g.is_greedy() { 192 | self.set_split(split, j1, j2); 193 | } else { 194 | self.set_split(split, j2, j1); 195 | } 196 | } 197 | ~Rep(x, ZeroMore, g) => { 198 | let j1 = self.insts.len(); 199 | let split = self.empty_split(); 200 | let j2 = self.insts.len(); 201 | self.compile(x); 202 | let jmp = self.empty_jump(); 203 | let j3 = self.insts.len(); 204 | 205 | self.set_jump(jmp, j1); 206 | if g.is_greedy() { 207 | self.set_split(split, j2, j3); 208 | } else { 209 | self.set_split(split, j3, j2); 210 | } 211 | } 212 | ~Rep(x, OneMore, g) => { 213 | let j1 = self.insts.len(); 214 | self.compile(x); 215 | let split = self.empty_split(); 216 | let j2 = self.insts.len(); 217 | 218 | if g.is_greedy() { 219 | self.set_split(split, j1, j2); 220 | } else { 221 | self.set_split(split, j2, j1); 222 | } 223 | } 224 | } 225 | } 226 | 227 | /// Appends the given instruction to the program. 228 | #[inline] 229 | fn push(&mut self, x: Inst) { 230 | self.insts.push(x) 231 | } 232 | 233 | /// Appends an *empty* `Split` instruction to the program and returns 234 | /// the index of that instruction. (The index can then be used to "patch" 235 | /// the actual locations of the split in later.) 236 | #[inline] 237 | fn empty_split(&mut self) -> InstIdx { 238 | self.insts.push(Split(0, 0)); 239 | self.insts.len() - 1 240 | } 241 | 242 | /// Sets the left and right locations of a `Split` instruction at index 243 | /// `i` to `pc1` and `pc2`, respectively. 244 | /// If the instruction at index `i` isn't a `Split` instruction, then 245 | /// `fail!` is called. 
246 | #[inline] 247 | fn set_split(&mut self, i: InstIdx, pc1: InstIdx, pc2: InstIdx) { 248 | let split = self.insts.get_mut(i); 249 | match *split { 250 | Split(_, _) => *split = Split(pc1, pc2), 251 | _ => fail!("BUG: Invalid split index."), 252 | } 253 | } 254 | 255 | /// Appends an *empty* `Jump` instruction to the program and returns the 256 | /// index of that instruction. 257 | #[inline] 258 | fn empty_jump(&mut self) -> InstIdx { 259 | self.insts.push(Jump(0)); 260 | self.insts.len() - 1 261 | } 262 | 263 | /// Sets the location of a `Jump` instruction at index `i` to `pc`. 264 | /// If the instruction at index `i` isn't a `Jump` instruction, then 265 | /// `fail!` is called. 266 | #[inline] 267 | fn set_jump(&mut self, i: InstIdx, pc: InstIdx) { 268 | let jmp = self.insts.get_mut(i); 269 | match *jmp { 270 | Jump(_) => *jmp = Jump(pc), 271 | _ => fail!("BUG: Invalid jump index."), 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | //! This crate provides a native implementation of regular expressions that is 12 | //! heavily based on RE2 both in syntax and in implementation. Notably, 13 | //! backreferences and arbitrary lookahead/lookbehind assertions are not 14 | //! provided. In return, regular expression searching provided by this package 15 | //! has excellent worst case performance. The specific syntax supported is 16 | //! documented further down. 17 | //! 18 | //! This crate's documentation provides some simple examples, describes Unicode 19 | //! support and exhaustively lists the supported syntax. For more specific 20 | //! details on the API, please see the documentation for the `Regex` type. 21 | //! 22 | //! # First example: find a date 23 | //! 24 | //! General use of regular expressions in this package involves compiling an 25 | //! expression and then using it to search, split or replace text. For example, 26 | //! to confirm that some text resembles a date: 27 | //! 28 | //! ```rust 29 | //! use regex::Regex; 30 | //! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") { 31 | //! Ok(re) => re, 32 | //! Err(err) => fail!("{}", err), 33 | //! }; 34 | //! assert_eq!(re.is_match("2014-01-01"), true); 35 | //! ``` 36 | //! 37 | //! Notice the use of the `^` and `$` anchors. In this crate, every expression 38 | //! is executed with an implicit `.*?` at the beginning and end, which allows 39 | //! it to match anywhere in the text. Anchors can be used to ensure that the 40 | //! full text matches an expression. 41 | //! 42 | //! This example also demonstrates the utility of raw strings in Rust, which 43 | //! are just like regular strings except they are prefixed with an `r` and do 44 | //! not process any escape sequences. For example, `"\\d"` is the same 45 | //! expression as `r"\d"`. 46 | //! 47 | //! # The `regex!` macro 48 | //! 49 | //! Rust's compile time meta-programming facilities provide a way to write a 50 | //! `regex!` macro which compiles regular expressions *when your program 51 | //! compiles*. 
Said differently, if you only use `regex!` to build regular 52 | //! expressions in your program, then your program cannot compile with an 53 | //! invalid regular expression. Moreover, the `regex!` macro compiles the 54 | //! given expression to native Rust code, which makes it much faster for 55 | //! searching text. 56 | //! 57 | //! Since `regex!` provides compiled regular expressions that are both safer 58 | //! and faster to use, you should use them whenever possible. The only 59 | //! requirement for using them is that you have a string literal corresponding 60 | //! to your expression. Otherwise, it is indistinguishable from an expression 61 | //! compiled at runtime with `Regex::new`. 62 | //! 63 | //! To use the `regex!` macro, you must enable the `phase` feature and import 64 | //! the `regex_macros` crate as a syntax extension: 65 | //! 66 | //! ```rust 67 | //! #![feature(phase)] 68 | //! #[phase(syntax)] 69 | //! extern crate regex_macros; 70 | //! extern crate regex; 71 | //! 72 | //! fn main() { 73 | //! let re = regex!(r"^\d{4}-\d{2}-\d{2}$"); 74 | //! assert_eq!(re.is_match("2014-01-01"), true); 75 | //! } 76 | //! ``` 77 | //! 78 | //! There are a few things worth mentioning about using the `regex!` macro. 79 | //! Firstly, the `regex!` macro *only* accepts string *literals*. 80 | //! Secondly, the `regex` crate *must* be linked with the name `regex` since 81 | //! the generated code depends on finding symbols in the `regex` crate. 82 | //! 83 | //! The only downside of using the `regex!` macro is that it can increase the 84 | //! size of your program's binary since it generates specialized Rust code. 85 | //! The extra size probably won't be significant for a small number of 86 | //! expressions, but 100+ calls to `regex!` will probably result in a 87 | //! noticeably bigger binary. 88 | //! 89 | //! # Example: iterating over capture groups 90 | //! 91 | //! This crate provides convenient iterators for matching an expression 92 | //! repeatedly against a search string to find successive non-overlapping 93 | //! matches. For example, to find all dates in a string and be able to access 94 | //! them by their component pieces: 95 | //! 96 | //! ```rust 97 | //! # #![feature(phase)] 98 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 99 | //! # fn main() { 100 | //! let re = regex!(r"(\d{4})-(\d{2})-(\d{2})"); 101 | //! let text = "2012-03-14, 2013-01-01 and 2014-07-05"; 102 | //! for cap in re.captures_iter(text) { 103 | //! println!("Month: {} Day: {} Year: {}", cap.at(2), cap.at(3), cap.at(1)); 104 | //! } 105 | //! // Output: 106 | //! // Month: 03 Day: 14 Year: 2012 107 | //! // Month: 01 Day: 01 Year: 2013 108 | //! // Month: 07 Day: 05 Year: 2014 109 | //! # } 110 | //! ``` 111 | //! 112 | //! Notice that the year is in the capture group indexed at `1`. This is 113 | //! because the *entire match* is stored in the capture group at index `0`. 114 | //! 115 | //! # Example: replacement with named capture groups 116 | //! 117 | //! Building on the previous example, perhaps we'd like to rearrange the date 118 | //! formats. This can be done with text replacement. But to make the code 119 | //! clearer, we can *name* our capture groups and use those names as variables 120 | //! in our replacement text: 121 | //! 122 | //! ```rust 123 | //! # #![feature(phase)] 124 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 125 | //! # fn main() { 126 | //! let re = regex!(r"(?P\d{4})-(?P\d{2})-(?P\d{2})"); 127 | //! 
let before = "2012-03-14, 2013-01-01 and 2014-07-05"; 128 | //! let after = re.replace_all(before, "$m/$d/$y"); 129 | //! assert_eq!(after.as_slice(), "03/14/2012, 01/01/2013 and 07/05/2014"); 130 | //! # } 131 | //! ``` 132 | //! 133 | //! The `replace` methods are actually polymorphic in the replacement, which 134 | //! provides more flexibility than is seen here. (See the documentation for 135 | //! `Regex::replace` for more details.) 136 | //! 137 | //! # Pay for what you use 138 | //! 139 | //! With respect to searching text with a regular expression, there are three 140 | //! questions that can be asked: 141 | //! 142 | //! 1. Does the text match this expression? 143 | //! 2. If so, where does it match? 144 | //! 3. Where are the submatches? 145 | //! 146 | //! Generally speaking, this crate could provide a function to answer only #3, 147 | //! which would subsume #1 and #2 automatically. However, it can be 148 | //! significantly more expensive to compute the location of submatches, so it's 149 | //! best not to do it if you don't need to. 150 | //! 151 | //! Therefore, only use what you need. For example, don't use `find` if you 152 | //! only need to test if an expression matches a string. (Use `is_match` 153 | //! instead.) 154 | //! 155 | //! # Unicode 156 | //! 157 | //! This implementation executes regular expressions **only** on sequences of 158 | //! UTF8 codepoints while exposing match locations as byte indices. 159 | //! 160 | //! Currently, only naive case folding is supported. Namely, when matching 161 | //! case insensitively, the characters are first converted to their uppercase 162 | //! forms and then compared. 163 | //! 164 | //! Regular expressions themselves are also **only** interpreted as a sequence 165 | //! of UTF8 codepoints. This means you can embed Unicode characters directly 166 | //! into your expression: 167 | //! 168 | //! ```rust 169 | //! # #![feature(phase)] 170 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 171 | //! # fn main() { 172 | //! let re = regex!(r"(?i)Δ+"); 173 | //! assert_eq!(re.find("ΔδΔ"), Some((0, 6))); 174 | //! # } 175 | //! ``` 176 | //! 177 | //! Finally, Unicode general categories and scripts are available as character 178 | //! classes. For example, you can match a sequence of numerals, Greek or 179 | //! Cherokee letters: 180 | //! 181 | //! ```rust 182 | //! # #![feature(phase)] 183 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 184 | //! # fn main() { 185 | //! let re = regex!(r"[\pN\p{Greek}\p{Cherokee}]+"); 186 | //! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23))); 187 | //! # } 188 | //! ``` 189 | //! 190 | //! # Syntax 191 | //! 192 | //! The syntax supported in this crate is almost in an exact correspondence 193 | //! with the syntax supported by RE2. 194 | //! 195 | //! ## Matching one character 196 | //! 197 | //!
198 | //! .           any character except new line (includes new line with s flag)
199 | //! [xyz]       A character class matching either x, y or z.
200 | //! [^xyz]      A character class matching any character except x, y and z.
201 | //! [a-z]       A character class matching any character in range a-z.
202 | //! \d          Perl character class ([0-9])
203 | //! \D          Negated Perl character class ([^0-9])
204 | //! [:alpha:]   ASCII character class ([A-Za-z])
205 | //! [:^alpha:]  Negated ASCII character class ([^A-Za-z])
206 | //! \pN         One-letter name Unicode character class
207 | //! \p{Greek}   Unicode character class (general category or script)
208 | //! \PN         Negated one-letter name Unicode character class
209 | //! \P{Greek}   Negated Unicode character class (general category or script)
210 | //! 
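//!
//! For illustration, a short sketch using one of the classes above with the
//! `find` method (match positions are byte offsets, as in the examples
//! earlier in these docs):
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"[0-9a-f]+");
//! // Only the hex digits match; `z` is outside the class.
//! assert_eq!(re.find("zzz3f9zzz"), Some((3, 6)));
//! # }
//! ```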
211 | //! 212 | //! Any named character class may appear inside a bracketed `[...]` character 213 | //! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral 214 | //! character. 215 | //! 216 | //! ## Composites 217 | //! 218 | //!
219 | //! xy    concatenation (x followed by y)
220 | //! x|y   alternation (x or y, prefer x)
221 | //! 
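//!
//! Since the left alternative is preferred, an alternation can report a
//! shorter match than the longest possible one. A small sketch:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"foo|foobar");
//! // `foo` is tried first, so only the first three bytes are reported.
//! assert_eq!(re.find("foobar"), Some((0, 3)));
//! # }
//! ```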
222 | //! 223 | //! ## Repetitions 224 | //! 225 | //!
226 | //! x*        zero or more of x (greedy)
227 | //! x+        one or more of x (greedy)
228 | //! x?        zero or one of x (greedy)
229 | //! x*?       zero or more of x (ungreedy)
230 | //! x+?       one or more of x (ungreedy)
231 | //! x??       zero or one of x (ungreedy)
232 | //! x{n,m}    at least n x and at most m x (greedy)
233 | //! x{n,}     at least n x (greedy)
234 | //! x{n}      exactly n x
235 | //! x{n,m}?   at least n x and at most m x (ungreedy)
236 | //! x{n,}?    at least n x (ungreedy)
237 | //! x{n}?     exactly n x
238 | //! 
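//!
//! Greedy repetitions match as much as possible, ungreedy ones as little as
//! possible. A quick sketch of the difference:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! assert_eq!(regex!(r"a+").find("aaa"), Some((0, 3)));
//! assert_eq!(regex!(r"a+?").find("aaa"), Some((0, 1)));
//! # }
//! ```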
239 | //! 240 | //! ## Empty matches 241 | //! 242 | //!
243 | //! ^     the beginning of text (or start-of-line with multi-line mode)
244 | //! $     the end of text (or end-of-line with multi-line mode)
245 | //! \A    only the beginning of text (even with multi-line mode enabled)
246 | //! \z    only the end of text (even with multi-line mode enabled)
247 | //! \b    a Unicode word boundary (\w on one side and \W, \A, or \z on other)
248 | //! \B    not a Unicode word boundary
249 | //! 
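//!
//! For example, `\b` restricts a match to whole words:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"\bcat\b");
//! assert_eq!(re.is_match("the cat sat"), true);
//! // `cat` occurs inside a word here, so there is no boundary on either side.
//! assert_eq!(re.is_match("concatenate"), false);
//! # }
//! ```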
250 | //! 251 | //! ## Grouping and flags 252 | //! 253 | //!
254 | //! (exp)          numbered capture group (indexed by opening parenthesis)
255 | //! (?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
256 | //! (?:exp)        non-capturing group
257 | //! (?flags)       set flags within current group
258 | //! (?flags:exp)   set flags for exp (non-capturing)
259 | //! 
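//!
//! Only capturing groups are numbered, so `(?:exp)` groups without affecting
//! capture indices. A short sketch:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"(\d{4})-(?:\d{2})-(\d{2})");
//! let cap = re.captures("2014-07-05").unwrap();
//! // The non-capturing month group does not consume an index.
//! assert_eq!(cap.at(1), "2014");
//! assert_eq!(cap.at(2), "05");
//! # }
//! ```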
260 | //! 261 | //! Flags are each a single character. For example, `(?x)` sets the flag `x` 262 | //! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at 263 | //! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets 264 | //! the `x` flag and clears the `y` flag. 265 | //! 266 | //! All flags are by default disabled. They are: 267 | //! 268 | //!
269 | //! i     case insensitive
270 | //! m     multi-line mode: ^ and $ match begin/end of line
271 | //! s     allow . to match \n
272 | //! U     swap the meaning of x* and x*?
273 | //! 
274 | //! 275 | //! Here's an example that matches case insensitively for only part of the 276 | //! expression: 277 | //! 278 | //! ```rust 279 | //! # #![feature(phase)] 280 | //! # extern crate regex; #[phase(syntax)] extern crate regex_macros; 281 | //! # fn main() { 282 | //! let re = regex!(r"(?i)a+(?-i)b+"); 283 | //! let cap = re.captures("AaAaAbbBBBb").unwrap(); 284 | //! assert_eq!(cap.at(0), "AaAaAbb"); 285 | //! # } 286 | //! ``` 287 | //! 288 | //! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches 289 | //! `b`. 290 | //! 291 | //! ## Escape sequences 292 | //! 293 | //!
294 | //! \*         literal *, works for any punctuation character: \.+*?()|[]{}^$
295 | //! \a         bell (\x07)
296 | //! \f         form feed (\x0C)
297 | //! \t         horizontal tab
298 | //! \n         new line
299 | //! \r         carriage return
300 | //! \v         vertical tab (\x0B)
301 | //! \123       octal character code (up to three digits)
302 | //! \x7F       hex character code (exactly two digits)
303 | //! \x{10FFFF} any hex character code corresponding to a valid UTF8 codepoint
304 | //! 
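//!
//! For instance, `\x{394}` denotes the Greek capital delta (Δ), which is two
//! bytes in UTF8, so the reported offsets mirror the Unicode example above:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! let re = regex!(r"\x{394}");
//! assert_eq!(re.find("Δδ"), Some((0, 2)));
//! # }
//! ```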
305 | //! 306 | //! ## Perl character classes (Unicode friendly) 307 | //! 308 | //!
309 | //! \d     digit ([0-9] + \p{Nd})
310 | //! \D     not digit
311 | //! \s     whitespace ([\t\n\f\r ] + \p{Z})
312 | //! \S     not whitespace
313 | //! \w     word character ([0-9A-Za-z_] + \p{L})
314 | //! \W     not word character
315 | //! 
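//!
//! Because these classes are Unicode friendly, `\w` accepts more than the
//! ASCII word characters. A small sketch:
//!
//! ```rust
//! # #![feature(phase)]
//! # extern crate regex; #[phase(syntax)] extern crate regex_macros;
//! # fn main() {
//! assert_eq!(regex!(r"^\w+$").is_match("über"), true);
//! assert_eq!(regex!(r"^[0-9A-Za-z_]+$").is_match("über"), false);
//! # }
//! ```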
316 | //! 317 | //! ## ASCII character classes 318 | //! 319 | //!
320 | //! [:alnum:]    alphanumeric ([0-9A-Za-z])
321 | //! [:alpha:]    alphabetic ([A-Za-z])
322 | //! [:ascii:]    ASCII ([\x00-\x7F])
323 | //! [:blank:]    blank ([\t ])
324 | //! [:cntrl:]    control ([\x00-\x1F\x7F])
325 | //! [:digit:]    digits ([0-9])
326 | //! [:graph:]    graphical ([!-~])
327 | //! [:lower:]    lower case ([a-z])
328 | //! [:print:]    printable ([ -~])
329 | //! [:punct:]    punctuation ([!-/:-@[-`{-~])
330 | //! [:space:]    whitespace ([\t\n\v\f\r ])
331 | //! [:upper:]    upper case ([A-Z])
332 | //! [:word:]     word characters ([0-9A-Za-z_])
333 | //! [:xdigit:]   hex digit ([0-9A-Fa-f])
334 | //! 
335 | //! 336 | //! # Untrusted input 337 | //! 338 | //! There are two factors to consider here: untrusted regular expressions and 339 | //! untrusted search text. 340 | //! 341 | //! Currently, there are no counter-measures in place to prevent a malicious 342 | //! user from writing an expression that may use a lot of resources. One such 343 | //! example is to repeat counted repetitions: `((a{100}){100}){100}` will try 344 | //! to repeat the `a` instruction `100^3` times. Essentially, this means it's 345 | //! very easy for an attacker to exhaust your system's memory if they are 346 | //! allowed to execute arbitrary regular expressions. A possible solution to 347 | //! this is to impose a hard limit on the size of a compiled expression, but it 348 | //! does not yet exist. 349 | //! 350 | //! The story is a bit better with untrusted search text, since this crate's 351 | //! implementation provides `O(nm)` search where `n` is the number of 352 | //! characters in the search text and `m` is the number of instructions in a 353 | //! compiled expression. 354 | 355 | #![crate_id = "regex#0.11-pre"] 356 | #![crate_type = "rlib"] 357 | #![crate_type = "dylib"] 358 | #![experimental] 359 | #![license = "MIT/ASL2"] 360 | #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", 361 | html_favicon_url = "http://www.rust-lang.org/favicon.ico", 362 | html_root_url = "http://static.rust-lang.org/doc/master")] 363 | 364 | #![feature(macro_rules, phase)] 365 | #![deny(missing_doc)] 366 | 367 | extern crate collections; 368 | #[cfg(test)] 369 | extern crate stdtest = "test"; 370 | #[cfg(test)] 371 | extern crate rand; 372 | 373 | // During tests, this links with the `regex` crate so that the `regex!` macro 374 | // can be tested. 375 | #[cfg(test)] 376 | extern crate regex; 377 | 378 | pub use parse::Error; 379 | pub use re::{Regex, Captures, SubCaptures, SubCapturesPos}; 380 | pub use re::{FindCaptures, FindMatches}; 381 | pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN}; 382 | pub use re::{quote, is_match}; 383 | 384 | mod compile; 385 | mod parse; 386 | mod re; 387 | mod vm; 388 | 389 | // FIXME(#13725) windows needs fixing. 390 | #[cfg(test, not(windows))] 391 | mod test; 392 | 393 | /// The `program` module exists to support the `regex!` macro. Do not use. 394 | #[doc(hidden)] 395 | pub mod native { 396 | // Exporting this stuff is bad form, but it's necessary for two reasons. 397 | // Firstly, the `regex!` syntax extension is in a different crate and 398 | // requires access to the representation of a regex (particularly the 399 | // instruction set) in order to compile to native Rust. This could be 400 | // mitigated if `regex!` was defined in the same crate, but this has 401 | // undesirable consequences (such as requiring a dependency on 402 | // `libsyntax`). 403 | // 404 | // Secondly, the code generated generated by `regex!` must *also* be able 405 | // to access various functions in this crate to reduce code duplication 406 | // and to provide a value with precisely the same `Regex` type in this 407 | // crate. This, AFAIK, is impossible to mitigate. 408 | // 409 | // On the bright side, `rustdoc` lets us hide this from the public API 410 | // documentation. 
411 | pub use compile::{ 412 | Program, 413 | OneChar, CharClass, Any, Save, Jump, Split, 414 | Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, 415 | }; 416 | pub use parse::{ 417 | FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, 418 | FLAG_SWAP_GREED, FLAG_NEGATED, 419 | }; 420 | pub use re::{Dynamic, Native}; 421 | pub use vm::{ 422 | MatchKind, Exists, Location, Submatches, 423 | StepState, StepMatchEarlyReturn, StepMatch, StepContinue, 424 | CharReader, find_prefix, 425 | }; 426 | } 427 | -------------------------------------------------------------------------------- /src/macro.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | //! This crate provides the `regex!` macro. Its use is documented in the 12 | //! `regex` crate. 13 | 14 | #![crate_id = "regex_macros#0.11-pre"] 15 | #![crate_type = "dylib"] 16 | #![experimental] 17 | #![license = "MIT/ASL2"] 18 | #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", 19 | html_favicon_url = "http://www.rust-lang.org/favicon.ico", 20 | html_root_url = "http://static.rust-lang.org/doc/master")] 21 | 22 | #![feature(macro_registrar, managed_boxes, quote)] 23 | 24 | extern crate regex; 25 | extern crate syntax; 26 | 27 | use syntax::ast; 28 | use syntax::codemap; 29 | use syntax::ext::base::{ 30 | SyntaxExtension, ExtCtxt, MacResult, MacExpr, DummyResult, 31 | NormalTT, BasicMacroExpander, 32 | }; 33 | use syntax::parse; 34 | use syntax::parse::token; 35 | use syntax::print::pprust; 36 | 37 | use regex::Regex; 38 | use regex::native::{ 39 | OneChar, CharClass, Any, Save, Jump, Split, 40 | Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, 41 | Program, Dynamic, Native, 42 | FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED, 43 | }; 44 | 45 | /// For the `regex!` syntax extension. Do not use. 46 | #[macro_registrar] 47 | #[doc(hidden)] 48 | pub fn macro_registrar(register: |ast::Name, SyntaxExtension|) { 49 | let expander = ~BasicMacroExpander { expander: native, span: None }; 50 | register(token::intern("regex"), NormalTT(expander, None)) 51 | } 52 | 53 | /// Generates specialized code for the Pike VM for a particular regular 54 | /// expression. 55 | /// 56 | /// There are two primary differences between the code generated here and the 57 | /// general code in vm.rs. 58 | /// 59 | /// 1. All heap allocation is removed. Sized vector types are used instead. 60 | /// Care must be taken to make sure that these vectors are not copied 61 | /// gratuitously. (If you're not sure, run the benchmarks. They will yell 62 | /// at you if you do.) 63 | /// 2. The main `match instruction { ... }` expressions are replaced with more 64 | /// direct `match pc { ... }`. The generators can be found in 65 | /// `step_insts` and `add_insts`. 66 | /// 67 | /// Other more minor changes include eliding code when possible (although this 68 | /// isn't completely thorough at the moment), and translating character class 69 | /// matching from using a binary search to a simple `match` expression (see 70 | /// `match_class`). 
71 | /// 72 | /// It is strongly recommended to read the dynamic implementation in vm.rs 73 | /// first before trying to understand the code generator. The implementation 74 | /// strategy is identical and vm.rs has comments and will be easier to follow. 75 | fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) 76 | -> ~MacResult { 77 | let regex = match parse(cx, tts) { 78 | Some(r) => r, 79 | // error is logged in 'parse' with cx.span_err 80 | None => return DummyResult::any(sp), 81 | }; 82 | let re = match Regex::new(regex.to_owned()) { 83 | Ok(re) => re, 84 | Err(err) => { 85 | cx.span_err(sp, err.to_str()); 86 | return DummyResult::any(sp) 87 | } 88 | }; 89 | let prog = match re.p { 90 | Dynamic(ref prog) => prog.clone(), 91 | Native(_) => unreachable!(), 92 | }; 93 | 94 | let mut gen = NfaGen { 95 | cx: &*cx, sp: sp, prog: prog, 96 | names: re.names.clone(), original: re.original.clone(), 97 | }; 98 | MacExpr::new(gen.code()) 99 | } 100 | 101 | struct NfaGen<'a> { 102 | cx: &'a ExtCtxt<'a>, 103 | sp: codemap::Span, 104 | prog: Program, 105 | names: ~[Option<~str>], 106 | original: ~str, 107 | } 108 | 109 | impl<'a> NfaGen<'a> { 110 | fn code(&mut self) -> @ast::Expr { 111 | // Most or all of the following things are used in the quasiquoted 112 | // expression returned. 113 | let num_cap_locs = 2 * self.prog.num_captures(); 114 | let num_insts = self.prog.insts.len(); 115 | let cap_names = self.vec_expr(self.names, 116 | |cx, name| match name { 117 | &Some(ref name) => { 118 | let name = name.as_slice(); 119 | quote_expr!(cx, Some(~$name)) 120 | } 121 | &None => quote_expr!(cx, None), 122 | } 123 | ); 124 | let prefix_anchor = 125 | match self.prog.insts.as_slice()[1] { 126 | EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, 127 | _ => false, 128 | }; 129 | let init_groups = self.vec_from_fn(num_cap_locs, 130 | |cx| quote_expr!(cx, None)); 131 | let prefix_bytes = self.vec_expr(self.prog.prefix.as_slice().as_bytes(), 132 | |cx, b| quote_expr!(cx, $b)); 133 | let check_prefix = self.check_prefix(); 134 | let step_insts = self.step_insts(); 135 | let add_insts = self.add_insts(); 136 | let regex = self.original.as_slice(); 137 | 138 | quote_expr!(self.cx, { 139 | fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, 140 | start: uint, end: uint) -> Vec> { 141 | #![allow(unused_imports)] 142 | use regex::native::{ 143 | MatchKind, Exists, Location, Submatches, 144 | StepState, StepMatchEarlyReturn, StepMatch, StepContinue, 145 | CharReader, find_prefix, 146 | }; 147 | 148 | return Nfa { 149 | which: which, 150 | input: input, 151 | ic: 0, 152 | chars: CharReader::new(input), 153 | }.run(start, end); 154 | 155 | type Captures = [Option, ..$num_cap_locs]; 156 | 157 | struct Nfa<'t> { 158 | which: MatchKind, 159 | input: &'t str, 160 | ic: uint, 161 | chars: CharReader<'t>, 162 | } 163 | 164 | impl<'t> Nfa<'t> { 165 | #[allow(unused_variable)] 166 | fn run(&mut self, start: uint, end: uint) -> Vec> { 167 | let mut matched = false; 168 | let prefix_bytes: &[u8] = &$prefix_bytes; 169 | let mut clist = &mut Threads::new(self.which); 170 | let mut nlist = &mut Threads::new(self.which); 171 | 172 | let mut groups = $init_groups; 173 | 174 | self.ic = start; 175 | let mut next_ic = self.chars.set(start); 176 | while self.ic <= end { 177 | if clist.size == 0 { 178 | if matched { 179 | break 180 | } 181 | $check_prefix 182 | } 183 | if clist.size == 0 || (!$prefix_anchor && !matched) { 184 | self.add(clist, 0, &mut groups) 185 | } 186 | 187 | self.ic = next_ic; 188 | 
next_ic = self.chars.advance(); 189 | 190 | let mut i = 0; 191 | while i < clist.size { 192 | let pc = clist.pc(i); 193 | let step_state = self.step(&mut groups, nlist, 194 | clist.groups(i), pc); 195 | match step_state { 196 | StepMatchEarlyReturn => 197 | return vec![Some(0u), Some(0u)], 198 | StepMatch => { matched = true; clist.empty() }, 199 | StepContinue => {}, 200 | } 201 | i += 1; 202 | } 203 | ::std::mem::swap(&mut clist, &mut nlist); 204 | nlist.empty(); 205 | } 206 | match self.which { 207 | Exists if matched => vec![Some(0u), Some(0u)], 208 | Exists => vec![None, None], 209 | Location | Submatches => groups.iter().map(|x| *x).collect(), 210 | } 211 | } 212 | 213 | // Sometimes `nlist` is never used (for empty regexes). 214 | #[allow(unused_variable)] 215 | #[inline] 216 | fn step(&self, groups: &mut Captures, nlist: &mut Threads, 217 | caps: &mut Captures, pc: uint) -> StepState { 218 | $step_insts 219 | StepContinue 220 | } 221 | 222 | fn add(&self, nlist: &mut Threads, pc: uint, 223 | groups: &mut Captures) { 224 | if nlist.contains(pc) { 225 | return 226 | } 227 | $add_insts 228 | } 229 | } 230 | 231 | struct Thread { 232 | pc: uint, 233 | groups: Captures, 234 | } 235 | 236 | struct Threads { 237 | which: MatchKind, 238 | queue: [Thread, ..$num_insts], 239 | sparse: [uint, ..$num_insts], 240 | size: uint, 241 | } 242 | 243 | impl Threads { 244 | fn new(which: MatchKind) -> Threads { 245 | Threads { 246 | which: which, 247 | // These unsafe blocks are used for performance reasons, as it 248 | // gives us a zero-cost initialization of a sparse set. The 249 | // trick is described in more detail here: 250 | // http://research.swtch.com/sparse 251 | // The idea here is to avoid initializing threads that never 252 | // need to be initialized, particularly for larger regexs with 253 | // a lot of instructions. 254 | queue: unsafe { ::std::mem::uninit() }, 255 | sparse: unsafe { ::std::mem::uninit() }, 256 | size: 0, 257 | } 258 | } 259 | 260 | #[inline] 261 | fn add(&mut self, pc: uint, groups: &Captures) { 262 | let t = &mut self.queue[self.size]; 263 | t.pc = pc; 264 | match self.which { 265 | Exists => {}, 266 | Location => { 267 | t.groups[0] = groups[0]; 268 | t.groups[1] = groups[1]; 269 | } 270 | Submatches => { 271 | for (slot, val) in t.groups.mut_iter().zip(groups.iter()) { 272 | *slot = *val; 273 | } 274 | } 275 | } 276 | self.sparse[pc] = self.size; 277 | self.size += 1; 278 | } 279 | 280 | #[inline] 281 | fn add_empty(&mut self, pc: uint) { 282 | self.queue[self.size].pc = pc; 283 | self.sparse[pc] = self.size; 284 | self.size += 1; 285 | } 286 | 287 | #[inline] 288 | fn contains(&self, pc: uint) -> bool { 289 | let s = self.sparse[pc]; 290 | s < self.size && self.queue[s].pc == pc 291 | } 292 | 293 | #[inline] 294 | fn empty(&mut self) { 295 | self.size = 0; 296 | } 297 | 298 | #[inline] 299 | fn pc(&self, i: uint) -> uint { 300 | self.queue[i].pc 301 | } 302 | 303 | #[inline] 304 | fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures { 305 | &'r mut self.queue[i].groups 306 | } 307 | } 308 | } 309 | 310 | ::regex::Regex { 311 | original: ~$regex, 312 | names: ~$cap_names, 313 | p: ::regex::native::Native(exec), 314 | } 315 | }) 316 | } 317 | 318 | // Generates code for the `add` method, which is responsible for adding 319 | // zero-width states to the next queue of states to visit. 
320 | fn add_insts(&self) -> @ast::Expr { 321 | let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { 322 | let nextpc = pc + 1; 323 | let body = match *inst { 324 | EmptyBegin(flags) => { 325 | let nl = '\n'; 326 | let cond = 327 | if flags & FLAG_MULTI > 0 { 328 | quote_expr!(self.cx, 329 | self.chars.is_begin() 330 | || self.chars.prev == Some($nl) 331 | ) 332 | } else { 333 | quote_expr!(self.cx, self.chars.is_begin()) 334 | }; 335 | quote_expr!(self.cx, { 336 | nlist.add_empty($pc); 337 | if $cond { self.add(nlist, $nextpc, &mut *groups) } 338 | }) 339 | } 340 | EmptyEnd(flags) => { 341 | let nl = '\n'; 342 | let cond = 343 | if flags & FLAG_MULTI > 0 { 344 | quote_expr!(self.cx, 345 | self.chars.is_end() 346 | || self.chars.cur == Some($nl) 347 | ) 348 | } else { 349 | quote_expr!(self.cx, self.chars.is_end()) 350 | }; 351 | quote_expr!(self.cx, { 352 | nlist.add_empty($pc); 353 | if $cond { self.add(nlist, $nextpc, &mut *groups) } 354 | }) 355 | } 356 | EmptyWordBoundary(flags) => { 357 | let cond = 358 | if flags & FLAG_NEGATED > 0 { 359 | quote_expr!(self.cx, !self.chars.is_word_boundary()) 360 | } else { 361 | quote_expr!(self.cx, self.chars.is_word_boundary()) 362 | }; 363 | quote_expr!(self.cx, { 364 | nlist.add_empty($pc); 365 | if $cond { self.add(nlist, $nextpc, &mut *groups) } 366 | }) 367 | } 368 | Save(slot) => { 369 | let save = quote_expr!(self.cx, { 370 | let old = groups[$slot]; 371 | groups[$slot] = Some(self.ic); 372 | self.add(nlist, $nextpc, &mut *groups); 373 | groups[$slot] = old; 374 | }); 375 | let add = quote_expr!(self.cx, { 376 | self.add(nlist, $nextpc, &mut *groups); 377 | }); 378 | // If this is saving a submatch location but we request 379 | // existence or only full match location, then we can skip 380 | // right over it every time. 381 | if slot > 1 { 382 | quote_expr!(self.cx, { 383 | nlist.add_empty($pc); 384 | match self.which { 385 | Submatches => $save, 386 | Exists | Location => $add, 387 | } 388 | }) 389 | } else { 390 | quote_expr!(self.cx, { 391 | nlist.add_empty($pc); 392 | match self.which { 393 | Submatches | Location => $save, 394 | Exists => $add, 395 | } 396 | }) 397 | } 398 | } 399 | Jump(to) => { 400 | quote_expr!(self.cx, { 401 | nlist.add_empty($pc); 402 | self.add(nlist, $to, &mut *groups); 403 | }) 404 | } 405 | Split(x, y) => { 406 | quote_expr!(self.cx, { 407 | nlist.add_empty($pc); 408 | self.add(nlist, $x, &mut *groups); 409 | self.add(nlist, $y, &mut *groups); 410 | }) 411 | } 412 | // For Match, OneChar, CharClass, Any 413 | _ => quote_expr!(self.cx, nlist.add($pc, &*groups)), 414 | }; 415 | self.arm_inst(pc, body) 416 | }).collect::>(); 417 | 418 | self.match_insts(arms) 419 | } 420 | 421 | // Generates the code for the `step` method, which processes all states 422 | // in the current queue that consume a single character. 
423 | fn step_insts(&self) -> @ast::Expr { 424 | let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { 425 | let nextpc = pc + 1; 426 | let body = match *inst { 427 | Match => { 428 | quote_expr!(self.cx, { 429 | match self.which { 430 | Exists => { 431 | return StepMatchEarlyReturn 432 | } 433 | Location => { 434 | groups[0] = caps[0]; 435 | groups[1] = caps[1]; 436 | return StepMatch 437 | } 438 | Submatches => { 439 | for (slot, val) in groups.mut_iter().zip(caps.iter()) { 440 | *slot = *val; 441 | } 442 | return StepMatch 443 | } 444 | } 445 | }) 446 | } 447 | OneChar(c, flags) => { 448 | if flags & FLAG_NOCASE > 0 { 449 | let upc = c.to_uppercase(); 450 | quote_expr!(self.cx, { 451 | let upc = self.chars.prev.map(|c| c.to_uppercase()); 452 | if upc == Some($upc) { 453 | self.add(nlist, $nextpc, caps); 454 | } 455 | }) 456 | } else { 457 | quote_expr!(self.cx, { 458 | if self.chars.prev == Some($c) { 459 | self.add(nlist, $nextpc, caps); 460 | } 461 | }) 462 | } 463 | } 464 | CharClass(ref ranges, flags) => { 465 | let negate = flags & FLAG_NEGATED > 0; 466 | let casei = flags & FLAG_NOCASE > 0; 467 | let get_char = 468 | if casei { 469 | quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase()) 470 | } else { 471 | quote_expr!(self.cx, self.chars.prev.unwrap()) 472 | }; 473 | let negcond = 474 | if negate { 475 | quote_expr!(self.cx, !found) 476 | } else { 477 | quote_expr!(self.cx, found) 478 | }; 479 | let mranges = self.match_class(casei, ranges.as_slice()); 480 | quote_expr!(self.cx, { 481 | if self.chars.prev.is_some() { 482 | let c = $get_char; 483 | let found = $mranges; 484 | if $negcond { 485 | self.add(nlist, $nextpc, caps); 486 | } 487 | } 488 | }) 489 | } 490 | Any(flags) => { 491 | if flags & FLAG_DOTNL > 0 { 492 | quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) 493 | } else { 494 | let nl = '\n'; // no char lits allowed? wtf? 495 | quote_expr!(self.cx, { 496 | if self.chars.prev != Some($nl) { 497 | self.add(nlist, $nextpc, caps) 498 | } 499 | }) 500 | } 501 | } 502 | // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split 503 | _ => quote_expr!(self.cx, {}), 504 | }; 505 | self.arm_inst(pc, body) 506 | }).collect::>(); 507 | 508 | self.match_insts(arms) 509 | } 510 | 511 | // Translates a character class into a match expression. 512 | // This avoids a binary search (and is hopefully replaced by a jump 513 | // table). 514 | fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> @ast::Expr { 515 | let mut arms = ranges.iter().map(|&(mut start, mut end)| { 516 | if casei { 517 | start = start.to_uppercase(); 518 | end = end.to_uppercase(); 519 | } 520 | ast::Arm { 521 | attrs: vec!(), 522 | pats: vec!(@ast::Pat{ 523 | id: ast::DUMMY_NODE_ID, 524 | span: self.sp, 525 | node: ast::PatRange(quote_expr!(self.cx, $start), 526 | quote_expr!(self.cx, $end)), 527 | }), 528 | guard: None, 529 | body: quote_expr!(self.cx, true), 530 | } 531 | }).collect::>(); 532 | 533 | arms.push(self.wild_arm_expr(quote_expr!(self.cx, false))); 534 | 535 | let match_on = quote_expr!(self.cx, c); 536 | self.dummy_expr(ast::ExprMatch(match_on, arms)) 537 | } 538 | 539 | // Generates code for checking a literal prefix of the search string. 540 | // The code is only generated if the regex *has* a literal prefix. 541 | // Otherwise, a no-op is returned. 
542 | fn check_prefix(&self) -> @ast::Expr { 543 | if self.prog.prefix.len() == 0 { 544 | quote_expr!(self.cx, {}) 545 | } else { 546 | quote_expr!(self.cx, 547 | if clist.size == 0 { 548 | let haystack = self.input.as_bytes().slice_from(self.ic); 549 | match find_prefix(prefix_bytes, haystack) { 550 | None => break, 551 | Some(i) => { 552 | self.ic += i; 553 | next_ic = self.chars.set(self.ic); 554 | } 555 | } 556 | } 557 | ) 558 | } 559 | } 560 | 561 | // Builds a `match pc { ... }` expression from a list of arms, specifically 562 | // for matching the current program counter with an instruction. 563 | // A wild-card arm is automatically added that executes a no-op. It will 564 | // never be used, but is added to satisfy the compiler complaining about 565 | // non-exhaustive patterns. 566 | fn match_insts(&self, mut arms: Vec) -> @ast::Expr { 567 | let mat_pc = quote_expr!(self.cx, pc); 568 | arms.push(self.wild_arm_expr(quote_expr!(self.cx, {}))); 569 | self.dummy_expr(ast::ExprMatch(mat_pc, arms)) 570 | } 571 | 572 | // Creates a match arm for the instruction at `pc` with the expression 573 | // `body`. 574 | fn arm_inst(&self, pc: uint, body: @ast::Expr) -> ast::Arm { 575 | ast::Arm { 576 | attrs: vec!(), 577 | pats: vec!(@ast::Pat{ 578 | id: ast::DUMMY_NODE_ID, 579 | span: self.sp, 580 | node: ast::PatLit(quote_expr!(self.cx, $pc)), 581 | }), 582 | guard: None, 583 | body: body, 584 | } 585 | } 586 | 587 | // Creates a wild-card match arm with the expression `body`. 588 | fn wild_arm_expr(&self, body: @ast::Expr) -> ast::Arm { 589 | ast::Arm { 590 | attrs: vec!(), 591 | pats: vec!(@ast::Pat{ 592 | id: ast::DUMMY_NODE_ID, 593 | span: self.sp, 594 | node: ast::PatWild, 595 | }), 596 | guard: None, 597 | body: body, 598 | } 599 | } 600 | 601 | // Builds a `[a, b, .., len]` expression where each element is the result 602 | // of executing `to_expr`. 603 | fn vec_from_fn(&self, len: uint, to_expr: |&ExtCtxt| -> @ast::Expr) 604 | -> @ast::Expr { 605 | self.vec_expr(Vec::from_elem(len, ()).as_slice(), 606 | |cx, _| to_expr(cx)) 607 | } 608 | 609 | // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr` 610 | // on each element in `xs`. 611 | fn vec_expr(&self, xs: &[T], to_expr: |&ExtCtxt, &T| -> @ast::Expr) 612 | -> @ast::Expr { 613 | let mut exprs = vec!(); 614 | for x in xs.iter() { 615 | exprs.push(to_expr(self.cx, x)) 616 | } 617 | let vec_exprs = self.dummy_expr(ast::ExprVec(exprs)); 618 | quote_expr!(self.cx, $vec_exprs) 619 | } 620 | 621 | // Creates an expression with a dummy node ID given an underlying 622 | // `ast::Expr_`. 623 | fn dummy_expr(&self, e: ast::Expr_) -> @ast::Expr { 624 | @ast::Expr { 625 | id: ast::DUMMY_NODE_ID, 626 | node: e, 627 | span: self.sp, 628 | } 629 | } 630 | } 631 | 632 | // This trait is defined in the quote module in the syntax crate, but I 633 | // don't think it's exported. 634 | // Interestingly, quote_expr! only requires that a 'to_tokens' method be 635 | // defined rather than satisfying a particular trait. 
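// (These to_tokens impls are what allow plain `char` values such as `$nl`,
// `$c` and the `$start`/`$end` class bounds above to be interpolated into
// `quote_expr!` fragments as literal tokens; `bool` is given the same
// treatment just below.)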
636 | #[doc(hidden)] 637 | trait ToTokens { 638 | fn to_tokens(&self, cx: &ExtCtxt) -> Vec; 639 | } 640 | 641 | impl ToTokens for char { 642 | fn to_tokens(&self, _: &ExtCtxt) -> Vec { 643 | vec!(ast::TTTok(codemap::DUMMY_SP, token::LIT_CHAR((*self) as u32))) 644 | } 645 | } 646 | 647 | impl ToTokens for bool { 648 | fn to_tokens(&self, _: &ExtCtxt) -> Vec { 649 | let ident = token::IDENT(token::str_to_ident(self.to_str()), false); 650 | vec!(ast::TTTok(codemap::DUMMY_SP, ident)) 651 | } 652 | } 653 | 654 | /// Looks for a single string literal and returns it. 655 | /// Otherwise, logs an error with cx.span_err and returns None. 656 | fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<~str> { 657 | let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), 658 | Vec::from_slice(tts)); 659 | let entry = cx.expand_expr(parser.parse_expr()); 660 | let regex = match entry.node { 661 | ast::ExprLit(lit) => { 662 | match lit.node { 663 | ast::LitStr(ref s, _) => s.to_str(), 664 | _ => { 665 | cx.span_err(entry.span, format!( 666 | "expected string literal but got `{}`", 667 | pprust::lit_to_str(lit))); 668 | return None 669 | } 670 | } 671 | } 672 | _ => { 673 | cx.span_err(entry.span, format!( 674 | "expected string literal but got `{}`", 675 | pprust::expr_to_str(entry))); 676 | return None 677 | } 678 | }; 679 | if !parser.eat(&token::EOF) { 680 | cx.span_err(parser.span, "only one string literal allowed"); 681 | return None; 682 | } 683 | Some(regex) 684 | } 685 | -------------------------------------------------------------------------------- /src/test/bench.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | use rand::{Rng, task_rng}; 12 | use stdtest::Bencher; 13 | use std::str; 14 | use regex::{Regex, NoExpand}; 15 | 16 | fn bench_assert_match(b: &mut Bencher, re: Regex, text: &str) { 17 | b.iter(|| if !re.is_match(text) { fail!("no match") }); 18 | } 19 | 20 | #[bench] 21 | fn no_exponential(b: &mut Bencher) { 22 | let n = 100; 23 | let re = Regex::new("a?".repeat(n) + "a".repeat(n)).unwrap(); 24 | let text = "a".repeat(n); 25 | bench_assert_match(b, re, text); 26 | } 27 | 28 | #[bench] 29 | fn literal(b: &mut Bencher) { 30 | let re = regex!("y"); 31 | let text = "x".repeat(50) + "y"; 32 | bench_assert_match(b, re, text); 33 | } 34 | 35 | #[bench] 36 | fn not_literal(b: &mut Bencher) { 37 | let re = regex!(".y"); 38 | let text = "x".repeat(50) + "y"; 39 | bench_assert_match(b, re, text); 40 | } 41 | 42 | #[bench] 43 | fn match_class(b: &mut Bencher) { 44 | let re = regex!("[abcdw]"); 45 | let text = "xxxx".repeat(20) + "w"; 46 | bench_assert_match(b, re, text); 47 | } 48 | 49 | #[bench] 50 | fn match_class_in_range(b: &mut Bencher) { 51 | // 'b' is between 'a' and 'c', so the class range checking doesn't help. 52 | let re = regex!("[ac]"); 53 | let text = "bbbb".repeat(20) + "c"; 54 | bench_assert_match(b, re, text); 55 | } 56 | 57 | #[bench] 58 | fn replace_all(b: &mut Bencher) { 59 | let re = regex!("[cjrw]"); 60 | let text = "abcdefghijklmnopqrstuvwxyz"; 61 | // FIXME: This isn't using the $name expand stuff. 
62 | // It's possible RE2/Go is using it, but currently, the expand in this 63 | // crate is actually compiling a regex, so it's incredibly slow. 64 | b.iter(|| re.replace_all(text, NoExpand(""))); 65 | } 66 | 67 | #[bench] 68 | fn anchored_literal_short_non_match(b: &mut Bencher) { 69 | let re = regex!("^zbc(d|e)"); 70 | let text = "abcdefghijklmnopqrstuvwxyz"; 71 | b.iter(|| re.is_match(text)); 72 | } 73 | 74 | #[bench] 75 | fn anchored_literal_long_non_match(b: &mut Bencher) { 76 | let re = regex!("^zbc(d|e)"); 77 | let text = "abcdefghijklmnopqrstuvwxyz".repeat(15); 78 | b.iter(|| re.is_match(text)); 79 | } 80 | 81 | #[bench] 82 | fn anchored_literal_short_match(b: &mut Bencher) { 83 | let re = regex!("^.bc(d|e)"); 84 | let text = "abcdefghijklmnopqrstuvwxyz"; 85 | b.iter(|| re.is_match(text)); 86 | } 87 | 88 | #[bench] 89 | fn anchored_literal_long_match(b: &mut Bencher) { 90 | let re = regex!("^.bc(d|e)"); 91 | let text = "abcdefghijklmnopqrstuvwxyz".repeat(15); 92 | b.iter(|| re.is_match(text)); 93 | } 94 | 95 | #[bench] 96 | fn one_pass_short_a(b: &mut Bencher) { 97 | let re = regex!("^.bc(d|e)*$"); 98 | let text = "abcddddddeeeededd"; 99 | b.iter(|| re.is_match(text)); 100 | } 101 | 102 | #[bench] 103 | fn one_pass_short_a_not(b: &mut Bencher) { 104 | let re = regex!(".bc(d|e)*$"); 105 | let text = "abcddddddeeeededd"; 106 | b.iter(|| re.is_match(text)); 107 | } 108 | 109 | #[bench] 110 | fn one_pass_short_b(b: &mut Bencher) { 111 | let re = regex!("^.bc(?:d|e)*$"); 112 | let text = "abcddddddeeeededd"; 113 | b.iter(|| re.is_match(text)); 114 | } 115 | 116 | #[bench] 117 | fn one_pass_short_b_not(b: &mut Bencher) { 118 | let re = regex!(".bc(?:d|e)*$"); 119 | let text = "abcddddddeeeededd"; 120 | b.iter(|| re.is_match(text)); 121 | } 122 | 123 | #[bench] 124 | fn one_pass_long_prefix(b: &mut Bencher) { 125 | let re = regex!("^abcdefghijklmnopqrstuvwxyz.*$"); 126 | let text = "abcdefghijklmnopqrstuvwxyz"; 127 | b.iter(|| re.is_match(text)); 128 | } 129 | 130 | #[bench] 131 | fn one_pass_long_prefix_not(b: &mut Bencher) { 132 | let re = regex!("^.bcdefghijklmnopqrstuvwxyz.*$"); 133 | let text = "abcdefghijklmnopqrstuvwxyz"; 134 | b.iter(|| re.is_match(text)); 135 | } 136 | 137 | macro_rules! 
throughput( 138 | ($name:ident, $regex:expr, $size:expr) => ( 139 | #[bench] 140 | fn $name(b: &mut Bencher) { 141 | let text = gen_text($size); 142 | b.bytes = $size; 143 | b.iter(|| if $regex.is_match(text) { fail!("match") }); 144 | } 145 | ); 146 | ) 147 | 148 | fn easy0() -> Regex { regex!("ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 149 | fn easy1() -> Regex { regex!("A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$") } 150 | fn medium() -> Regex { regex!("[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 151 | fn hard() -> Regex { regex!("[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$") } 152 | 153 | fn gen_text(n: uint) -> ~str { 154 | let mut rng = task_rng(); 155 | let mut bytes = rng.gen_ascii_str(n).into_bytes(); 156 | for (i, b) in bytes.mut_iter().enumerate() { 157 | if i % 20 == 0 { 158 | *b = '\n' as u8 159 | } 160 | } 161 | str::from_utf8(bytes).unwrap().to_owned() 162 | } 163 | 164 | throughput!(easy0_32, easy0(), 32) 165 | throughput!(easy0_1K, easy0(), 1<<10) 166 | throughput!(easy0_32K, easy0(), 32<<10) 167 | 168 | throughput!(easy1_32, easy1(), 32) 169 | throughput!(easy1_1K, easy1(), 1<<10) 170 | throughput!(easy1_32K, easy1(), 32<<10) 171 | 172 | throughput!(medium_32, medium(), 32) 173 | throughput!(medium_1K, medium(), 1<<10) 174 | throughput!(medium_32K,medium(), 32<<10) 175 | 176 | throughput!(hard_32, hard(), 32) 177 | throughput!(hard_1K, hard(), 1<<10) 178 | throughput!(hard_32K,hard(), 32<<10) 179 | 180 | -------------------------------------------------------------------------------- /src/test/matches.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // ignore-tidy-linelength 12 | 13 | // DO NOT EDIT. Automatically generated by 'src/etc/regex-match-tests' 14 | // on 2014-04-23 01:33:36.539280. 
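// Each `mat!` line below gives a test name, a pattern, a haystack and the
// expected capture group spans as byte offsets, with `None` marking a group
// that did not take part in the match. The macro itself lives in
// src/test/tests.rs; roughly, an entry such as
//     mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18)))
// expands to a #[test] that compiles the pattern with regex!, collects
// captures.iter_pos() and compares the leading spans against the expected
// values.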
15 | 16 | // Tests from basic.dat 17 | mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18))) 18 | mat!(match_basic_4, r"a...b", r"abababbb", Some((2, 7))) 19 | mat!(match_basic_5, r"XXXXXX", r"..XXXXXX", Some((2, 8))) 20 | mat!(match_basic_6, r"\)", r"()", Some((1, 2))) 21 | mat!(match_basic_7, r"a]", r"a]a", Some((0, 2))) 22 | mat!(match_basic_9, r"\}", r"}", Some((0, 1))) 23 | mat!(match_basic_10, r"\]", r"]", Some((0, 1))) 24 | mat!(match_basic_12, r"]", r"]", Some((0, 1))) 25 | mat!(match_basic_15, r"^a", r"ax", Some((0, 1))) 26 | mat!(match_basic_16, r"\^a", r"a^a", Some((1, 3))) 27 | mat!(match_basic_17, r"a\^", r"a^", Some((0, 2))) 28 | mat!(match_basic_18, r"a$", r"aa", Some((1, 2))) 29 | mat!(match_basic_19, r"a\$", r"a$", Some((0, 2))) 30 | mat!(match_basic_20, r"^$", r"", Some((0, 0))) 31 | mat!(match_basic_21, r"$^", r"", Some((0, 0))) 32 | mat!(match_basic_22, r"a($)", r"aa", Some((1, 2)), Some((2, 2))) 33 | mat!(match_basic_23, r"a*(^a)", r"aa", Some((0, 1)), Some((0, 1))) 34 | mat!(match_basic_24, r"(..)*(...)*", r"a", Some((0, 0))) 35 | mat!(match_basic_25, r"(..)*(...)*", r"abcd", Some((0, 4)), Some((2, 4))) 36 | mat!(match_basic_26, r"(ab|a)(bc|c)", r"abc", Some((0, 3)), Some((0, 2)), Some((2, 3))) 37 | mat!(match_basic_27, r"(ab)c|abc", r"abc", Some((0, 3)), Some((0, 2))) 38 | mat!(match_basic_28, r"a{0}b", r"ab", Some((1, 2))) 39 | mat!(match_basic_29, r"(a*)(b?)(b+)b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7))) 40 | mat!(match_basic_30, r"(a*)(b{0,1})(b{1,})b{3}", r"aaabbbbbbb", Some((0, 10)), Some((0, 3)), Some((3, 4)), Some((4, 7))) 41 | mat!(match_basic_32, r"((a|a)|a)", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1))) 42 | mat!(match_basic_33, r"(a*)(a|aa)", r"aaaa", Some((0, 4)), Some((0, 3)), Some((3, 4))) 43 | mat!(match_basic_34, r"a*(a.|aa)", r"aaaa", Some((0, 4)), Some((2, 4))) 44 | mat!(match_basic_35, r"a(b)|c(d)|a(e)f", r"aef", Some((0, 3)), None, None, Some((1, 2))) 45 | mat!(match_basic_36, r"(a|b)?.*", r"b", Some((0, 1)), Some((0, 1))) 46 | mat!(match_basic_37, r"(a|b)c|a(b|c)", r"ac", Some((0, 2)), Some((0, 1))) 47 | mat!(match_basic_38, r"(a|b)c|a(b|c)", r"ab", Some((0, 2)), None, Some((1, 2))) 48 | mat!(match_basic_39, r"(a|b)*c|(a|ab)*c", r"abc", Some((0, 3)), Some((1, 2))) 49 | mat!(match_basic_40, r"(a|b)*c|(a|ab)*c", r"xc", Some((1, 2))) 50 | mat!(match_basic_41, r"(.a|.b).*|.*(.a|.b)", r"xa", Some((0, 2)), Some((0, 2))) 51 | mat!(match_basic_42, r"a?(ab|ba)ab", r"abab", Some((0, 4)), Some((0, 2))) 52 | mat!(match_basic_43, r"a?(ac{0}b|ba)ab", r"abab", Some((0, 4)), Some((0, 2))) 53 | mat!(match_basic_44, r"ab|abab", r"abbabab", Some((0, 2))) 54 | mat!(match_basic_45, r"aba|bab|bba", r"baaabbbaba", Some((5, 8))) 55 | mat!(match_basic_46, r"aba|bab", r"baaabbbaba", Some((6, 9))) 56 | mat!(match_basic_47, r"(aa|aaa)*|(a|aaaaa)", r"aa", Some((0, 2)), Some((0, 2))) 57 | mat!(match_basic_48, r"(a.|.a.)*|(a|.a...)", r"aa", Some((0, 2)), Some((0, 2))) 58 | mat!(match_basic_49, r"ab|a", r"xabc", Some((1, 3))) 59 | mat!(match_basic_50, r"ab|a", r"xxabc", Some((2, 4))) 60 | mat!(match_basic_51, r"(?i)(Ab|cD)*", r"aBcD", Some((0, 4)), Some((2, 4))) 61 | mat!(match_basic_52, r"[^-]", r"--a", Some((2, 3))) 62 | mat!(match_basic_53, r"[a-]*", r"--a", Some((0, 3))) 63 | mat!(match_basic_54, r"[a-m-]*", r"--amoma--", Some((0, 4))) 64 | mat!(match_basic_55, r":::1:::0:|:::1:1:0:", r":::0:::1:::1:::0:", Some((8, 17))) 65 | mat!(match_basic_56, r":::1:::0:|:::1:1:1:", r":::0:::1:::1:::0:", Some((8, 17))) 66 | 
mat!(match_basic_57, r"[[:upper:]]", r"A", Some((0, 1))) 67 | mat!(match_basic_58, r"[[:lower:]]+", r"`az{", Some((1, 3))) 68 | mat!(match_basic_59, r"[[:upper:]]+", r"@AZ[", Some((1, 3))) 69 | mat!(match_basic_65, r" 70 | ", r" 71 | ", Some((0, 1))) 72 | mat!(match_basic_66, r" 73 | ", r" 74 | ", Some((0, 1))) 75 | mat!(match_basic_67, r"[^a]", r" 76 | ", Some((0, 1))) 77 | mat!(match_basic_68, r" 78 | a", r" 79 | a", Some((0, 2))) 80 | mat!(match_basic_69, r"(a)(b)(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((2, 3))) 81 | mat!(match_basic_70, r"xxx", r"xxx", Some((0, 3))) 82 | mat!(match_basic_71, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 6,", Some((0, 6))) 83 | mat!(match_basic_72, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"2/7", Some((0, 3))) 84 | mat!(match_basic_73, r"(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)", r"feb 1,Feb 6", Some((5, 11))) 85 | mat!(match_basic_74, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))", r"x", Some((0, 1)), Some((0, 1)), Some((0, 1))) 86 | mat!(match_basic_75, r"((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*", r"xx", Some((0, 2)), Some((1, 2)), Some((1, 2))) 87 | mat!(match_basic_76, r"a?(ab|ba)*", r"ababababababababababababababababababababababababababababababababababababababababa", Some((0, 81)), Some((79, 81))) 88 | mat!(match_basic_77, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabbbbaa", Some((18, 25))) 89 | mat!(match_basic_78, r"abaa|abbaa|abbbaa|abbbbaa", r"ababbabbbabbbabbbbabaa", Some((18, 22))) 90 | mat!(match_basic_79, r"aaac|aabc|abac|abbc|baac|babc|bbac|bbbc", r"baaabbbabac", Some((7, 11))) 91 | mat!(match_basic_80, r".*", r"", Some((0, 2))) 92 | mat!(match_basic_81, r"aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll", r"XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa", Some((53, 57))) 93 | mat!(match_basic_83, r"a*a*a*a*a*b", r"aaaaaaaaab", Some((0, 10))) 94 | mat!(match_basic_84, r"^", r"", Some((0, 0))) 95 | mat!(match_basic_85, r"$", r"", Some((0, 0))) 96 | mat!(match_basic_86, r"^$", r"", Some((0, 0))) 97 | mat!(match_basic_87, r"^a$", r"a", Some((0, 1))) 98 | mat!(match_basic_88, r"abc", r"abc", Some((0, 3))) 99 | mat!(match_basic_89, r"abc", r"xabcy", Some((1, 4))) 100 | mat!(match_basic_90, r"abc", r"ababc", Some((2, 5))) 101 | mat!(match_basic_91, r"ab*c", r"abc", Some((0, 3))) 102 | mat!(match_basic_92, r"ab*bc", r"abc", Some((0, 3))) 103 | mat!(match_basic_93, r"ab*bc", r"abbc", Some((0, 4))) 104 | mat!(match_basic_94, r"ab*bc", r"abbbbc", Some((0, 6))) 105 | mat!(match_basic_95, r"ab+bc", r"abbc", Some((0, 4))) 106 | mat!(match_basic_96, r"ab+bc", r"abbbbc", Some((0, 6))) 107 | mat!(match_basic_97, r"ab?bc", r"abbc", Some((0, 4))) 108 | mat!(match_basic_98, r"ab?bc", r"abc", Some((0, 3))) 109 | mat!(match_basic_99, r"ab?c", r"abc", Some((0, 3))) 110 | mat!(match_basic_100, r"^abc$", r"abc", Some((0, 3))) 111 | mat!(match_basic_101, r"^abc", r"abcc", Some((0, 3))) 112 | mat!(match_basic_102, r"abc$", r"aabc", Some((1, 4))) 113 | mat!(match_basic_103, r"^", r"abc", Some((0, 0))) 114 | mat!(match_basic_104, r"$", r"abc", Some((3, 3))) 115 | mat!(match_basic_105, r"a.c", r"abc", Some((0, 3))) 116 | mat!(match_basic_106, r"a.c", r"axc", Some((0, 3))) 117 | mat!(match_basic_107, r"a.*c", r"axyzc", Some((0, 5))) 118 | mat!(match_basic_108, r"a[bc]d", r"abd", Some((0, 3))) 119 | mat!(match_basic_109, r"a[b-d]e", r"ace", Some((0, 3))) 120 | mat!(match_basic_110, r"a[b-d]", r"aac", Some((1, 
3))) 121 | mat!(match_basic_111, r"a[-b]", r"a-", Some((0, 2))) 122 | mat!(match_basic_112, r"a[b-]", r"a-", Some((0, 2))) 123 | mat!(match_basic_113, r"a]", r"a]", Some((0, 2))) 124 | mat!(match_basic_114, r"a[]]b", r"a]b", Some((0, 3))) 125 | mat!(match_basic_115, r"a[^bc]d", r"aed", Some((0, 3))) 126 | mat!(match_basic_116, r"a[^-b]c", r"adc", Some((0, 3))) 127 | mat!(match_basic_117, r"a[^]b]c", r"adc", Some((0, 3))) 128 | mat!(match_basic_118, r"ab|cd", r"abc", Some((0, 2))) 129 | mat!(match_basic_119, r"ab|cd", r"abcd", Some((0, 2))) 130 | mat!(match_basic_120, r"a\(b", r"a(b", Some((0, 3))) 131 | mat!(match_basic_121, r"a\(*b", r"ab", Some((0, 2))) 132 | mat!(match_basic_122, r"a\(*b", r"a((b", Some((0, 4))) 133 | mat!(match_basic_123, r"((a))", r"abc", Some((0, 1)), Some((0, 1)), Some((0, 1))) 134 | mat!(match_basic_124, r"(a)b(c)", r"abc", Some((0, 3)), Some((0, 1)), Some((2, 3))) 135 | mat!(match_basic_125, r"a+b+c", r"aabbabc", Some((4, 7))) 136 | mat!(match_basic_126, r"a*", r"aaa", Some((0, 3))) 137 | mat!(match_basic_128, r"(a*)*", r"-", Some((0, 0)), None) 138 | mat!(match_basic_129, r"(a*)+", r"-", Some((0, 0)), Some((0, 0))) 139 | mat!(match_basic_131, r"(a*|b)*", r"-", Some((0, 0)), None) 140 | mat!(match_basic_132, r"(a+|b)*", r"ab", Some((0, 2)), Some((1, 2))) 141 | mat!(match_basic_133, r"(a+|b)+", r"ab", Some((0, 2)), Some((1, 2))) 142 | mat!(match_basic_134, r"(a+|b)?", r"ab", Some((0, 1)), Some((0, 1))) 143 | mat!(match_basic_135, r"[^ab]*", r"cde", Some((0, 3))) 144 | mat!(match_basic_137, r"(^)*", r"-", Some((0, 0)), None) 145 | mat!(match_basic_138, r"a*", r"", Some((0, 0))) 146 | mat!(match_basic_139, r"([abc])*d", r"abbbcd", Some((0, 6)), Some((4, 5))) 147 | mat!(match_basic_140, r"([abc])*bcd", r"abcd", Some((0, 4)), Some((0, 1))) 148 | mat!(match_basic_141, r"a|b|c|d|e", r"e", Some((0, 1))) 149 | mat!(match_basic_142, r"(a|b|c|d|e)f", r"ef", Some((0, 2)), Some((0, 1))) 150 | mat!(match_basic_144, r"((a*|b))*", r"-", Some((0, 0)), None, None) 151 | mat!(match_basic_145, r"abcd*efg", r"abcdefg", Some((0, 7))) 152 | mat!(match_basic_146, r"ab*", r"xabyabbbz", Some((1, 3))) 153 | mat!(match_basic_147, r"ab*", r"xayabbbz", Some((1, 2))) 154 | mat!(match_basic_148, r"(ab|cd)e", r"abcde", Some((2, 5)), Some((2, 4))) 155 | mat!(match_basic_149, r"[abhgefdc]ij", r"hij", Some((0, 3))) 156 | mat!(match_basic_150, r"(a|b)c*d", r"abcd", Some((1, 4)), Some((1, 2))) 157 | mat!(match_basic_151, r"(ab|ab*)bc", r"abc", Some((0, 3)), Some((0, 1))) 158 | mat!(match_basic_152, r"a([bc]*)c*", r"abc", Some((0, 3)), Some((1, 3))) 159 | mat!(match_basic_153, r"a([bc]*)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4))) 160 | mat!(match_basic_154, r"a([bc]+)(c*d)", r"abcd", Some((0, 4)), Some((1, 3)), Some((3, 4))) 161 | mat!(match_basic_155, r"a([bc]*)(c+d)", r"abcd", Some((0, 4)), Some((1, 2)), Some((2, 4))) 162 | mat!(match_basic_156, r"a[bcd]*dcdcde", r"adcdcde", Some((0, 7))) 163 | mat!(match_basic_157, r"(ab|a)b*c", r"abc", Some((0, 3)), Some((0, 2))) 164 | mat!(match_basic_158, r"((a)(b)c)(d)", r"abcd", Some((0, 4)), Some((0, 3)), Some((0, 1)), Some((1, 2)), Some((3, 4))) 165 | mat!(match_basic_159, r"[A-Za-z_][A-Za-z0-9_]*", r"alpha", Some((0, 5))) 166 | mat!(match_basic_160, r"^a(bc+|b[eh])g|.h$", r"abh", Some((1, 3))) 167 | mat!(match_basic_161, r"(bc+d$|ef*g.|h?i(j|k))", r"effgz", Some((0, 5)), Some((0, 5))) 168 | mat!(match_basic_162, r"(bc+d$|ef*g.|h?i(j|k))", r"ij", Some((0, 2)), Some((0, 2)), Some((1, 2))) 169 | mat!(match_basic_163, r"(bc+d$|ef*g.|h?i(j|k))", 
r"reffgz", Some((1, 6)), Some((1, 6))) 170 | mat!(match_basic_164, r"(((((((((a)))))))))", r"a", Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1)), Some((0, 1))) 171 | mat!(match_basic_165, r"multiple words", r"multiple words yeah", Some((0, 14))) 172 | mat!(match_basic_166, r"(.*)c(.*)", r"abcde", Some((0, 5)), Some((0, 2)), Some((3, 5))) 173 | mat!(match_basic_167, r"abcd", r"abcd", Some((0, 4))) 174 | mat!(match_basic_168, r"a(bc)d", r"abcd", Some((0, 4)), Some((1, 3))) 175 | mat!(match_basic_169, r"a[-]?c", r"ac", Some((0, 3))) 176 | mat!(match_basic_170, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qaddafi", Some((0, 15)), None, Some((10, 12))) 177 | mat!(match_basic_171, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mo'ammar Gadhafi", Some((0, 16)), None, Some((11, 13))) 178 | mat!(match_basic_172, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Kaddafi", Some((0, 15)), None, Some((10, 12))) 179 | mat!(match_basic_173, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Qadhafi", Some((0, 15)), None, Some((10, 12))) 180 | mat!(match_basic_174, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gadafi", Some((0, 14)), None, Some((10, 11))) 181 | mat!(match_basic_175, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadafi", Some((0, 15)), None, Some((11, 12))) 182 | mat!(match_basic_176, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moamar Gaddafi", Some((0, 14)), None, Some((9, 11))) 183 | mat!(match_basic_177, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Mu'ammar Qadhdhafi", Some((0, 18)), None, Some((13, 15))) 184 | mat!(match_basic_178, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Khaddafi", Some((0, 16)), None, Some((11, 13))) 185 | mat!(match_basic_179, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafy", Some((0, 16)), None, Some((11, 13))) 186 | mat!(match_basic_180, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghadafi", Some((0, 15)), None, Some((11, 12))) 187 | mat!(match_basic_181, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Ghaddafi", Some((0, 16)), None, Some((11, 13))) 188 | mat!(match_basic_182, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muamar Kaddafi", Some((0, 14)), None, Some((9, 11))) 189 | mat!(match_basic_183, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Quathafi", Some((0, 16)), None, Some((11, 13))) 190 | mat!(match_basic_184, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Muammar Gheddafi", Some((0, 16)), None, Some((11, 13))) 191 | mat!(match_basic_185, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Khadafy", Some((0, 15)), None, Some((11, 12))) 192 | mat!(match_basic_186, r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", r"Moammar Qudhafi", Some((0, 15)), None, Some((10, 12))) 193 | mat!(match_basic_187, r"a+(b|c)*d+", r"aabcdd", Some((0, 6)), Some((3, 4))) 194 | mat!(match_basic_188, r"^.+$", r"vivi", Some((0, 4))) 195 | mat!(match_basic_189, r"^(.+)$", r"vivi", Some((0, 4)), Some((0, 4))) 196 | mat!(match_basic_190, r"^([^!.]+).att.com!(.+)$", r"gryphon.att.com!eby", 
Some((0, 19)), Some((0, 7)), Some((16, 19))) 197 | mat!(match_basic_191, r"^([^!]+!)?([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3))) 198 | mat!(match_basic_192, r"^([^!]+!)?([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) 199 | mat!(match_basic_193, r"^([^!]+!)?([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) 200 | mat!(match_basic_194, r"^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), Some((4, 8)), Some((8, 11))) 201 | mat!(match_basic_195, r"((foo)|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), None, Some((0, 3))) 202 | mat!(match_basic_196, r"((foo)|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), None, Some((4, 7))) 203 | mat!(match_basic_197, r"((foo)|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) 204 | mat!(match_basic_198, r"((foo)|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3))) 205 | mat!(match_basic_199, r"((foo)|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7))) 206 | mat!(match_basic_200, r"((foo)|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) 207 | mat!(match_basic_201, r"(foo|(bar))!bas", r"bar!bas", Some((0, 7)), Some((0, 3)), Some((0, 3))) 208 | mat!(match_basic_202, r"(foo|(bar))!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7)), Some((4, 7))) 209 | mat!(match_basic_203, r"(foo|(bar))!bas", r"foo!bas", Some((0, 7)), Some((0, 3))) 210 | mat!(match_basic_204, r"(foo|bar)!bas", r"bar!bas", Some((0, 7)), Some((0, 3))) 211 | mat!(match_basic_205, r"(foo|bar)!bas", r"foo!bar!bas", Some((4, 11)), Some((4, 7))) 212 | mat!(match_basic_206, r"(foo|bar)!bas", r"foo!bas", Some((0, 7)), Some((0, 3))) 213 | mat!(match_basic_207, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) 214 | mat!(match_basic_208, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bas", Some((0, 3)), None, Some((0, 3))) 215 | mat!(match_basic_209, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"bar!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) 216 | mat!(match_basic_210, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bar!bas", Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) 217 | mat!(match_basic_211, r"^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$", r"foo!bas", Some((0, 7)), Some((0, 4)), Some((4, 7))) 218 | mat!(match_basic_212, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bas", Some((0, 3)), Some((0, 3)), None, Some((0, 3))) 219 | mat!(match_basic_213, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"bar!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7))) 220 | mat!(match_basic_214, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bar!bas", Some((0, 11)), Some((0, 11)), None, None, Some((4, 8)), Some((8, 11))) 221 | mat!(match_basic_215, r"^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$", r"foo!bas", Some((0, 7)), Some((0, 7)), Some((0, 4)), Some((4, 7))) 222 | mat!(match_basic_216, r".*(/XXX).*", r"/XXX", Some((0, 4)), Some((0, 4))) 223 | mat!(match_basic_217, r".*(\\XXX).*", r"\XXX", Some((0, 4)), Some((0, 4))) 224 | mat!(match_basic_218, r"\\XXX", r"\XXX", Some((0, 4))) 225 | mat!(match_basic_219, r".*(/000).*", r"/000", Some((0, 4)), Some((0, 4))) 226 | mat!(match_basic_220, r".*(\\000).*", r"\000", Some((0, 4)), Some((0, 4))) 227 | mat!(match_basic_221, r"\\000", r"\000", Some((0, 4))) 228 | 229 | // Tests from nullsubexpr.dat 230 | mat!(match_nullsubexpr_3, r"(a*)*", r"a", Some((0, 1)), Some((0, 1))) 231 | mat!(match_nullsubexpr_5, r"(a*)*", r"x", Some((0, 0)), None) 232 | mat!(match_nullsubexpr_6, r"(a*)*", r"aaaaaa", 
Some((0, 6)), Some((0, 6))) 233 | mat!(match_nullsubexpr_7, r"(a*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) 234 | mat!(match_nullsubexpr_8, r"(a*)+", r"a", Some((0, 1)), Some((0, 1))) 235 | mat!(match_nullsubexpr_9, r"(a*)+", r"x", Some((0, 0)), Some((0, 0))) 236 | mat!(match_nullsubexpr_10, r"(a*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) 237 | mat!(match_nullsubexpr_11, r"(a*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) 238 | mat!(match_nullsubexpr_12, r"(a+)*", r"a", Some((0, 1)), Some((0, 1))) 239 | mat!(match_nullsubexpr_13, r"(a+)*", r"x", Some((0, 0))) 240 | mat!(match_nullsubexpr_14, r"(a+)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) 241 | mat!(match_nullsubexpr_15, r"(a+)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) 242 | mat!(match_nullsubexpr_16, r"(a+)+", r"a", Some((0, 1)), Some((0, 1))) 243 | mat!(match_nullsubexpr_17, r"(a+)+", r"x", None) 244 | mat!(match_nullsubexpr_18, r"(a+)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) 245 | mat!(match_nullsubexpr_19, r"(a+)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) 246 | mat!(match_nullsubexpr_21, r"([a]*)*", r"a", Some((0, 1)), Some((0, 1))) 247 | mat!(match_nullsubexpr_23, r"([a]*)*", r"x", Some((0, 0)), None) 248 | mat!(match_nullsubexpr_24, r"([a]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) 249 | mat!(match_nullsubexpr_25, r"([a]*)*", r"aaaaaax", Some((0, 6)), Some((0, 6))) 250 | mat!(match_nullsubexpr_26, r"([a]*)+", r"a", Some((0, 1)), Some((0, 1))) 251 | mat!(match_nullsubexpr_27, r"([a]*)+", r"x", Some((0, 0)), Some((0, 0))) 252 | mat!(match_nullsubexpr_28, r"([a]*)+", r"aaaaaa", Some((0, 6)), Some((0, 6))) 253 | mat!(match_nullsubexpr_29, r"([a]*)+", r"aaaaaax", Some((0, 6)), Some((0, 6))) 254 | mat!(match_nullsubexpr_30, r"([^b]*)*", r"a", Some((0, 1)), Some((0, 1))) 255 | mat!(match_nullsubexpr_32, r"([^b]*)*", r"b", Some((0, 0)), None) 256 | mat!(match_nullsubexpr_33, r"([^b]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) 257 | mat!(match_nullsubexpr_34, r"([^b]*)*", r"aaaaaab", Some((0, 6)), Some((0, 6))) 258 | mat!(match_nullsubexpr_35, r"([ab]*)*", r"a", Some((0, 1)), Some((0, 1))) 259 | mat!(match_nullsubexpr_36, r"([ab]*)*", r"aaaaaa", Some((0, 6)), Some((0, 6))) 260 | mat!(match_nullsubexpr_37, r"([ab]*)*", r"ababab", Some((0, 6)), Some((0, 6))) 261 | mat!(match_nullsubexpr_38, r"([ab]*)*", r"bababa", Some((0, 6)), Some((0, 6))) 262 | mat!(match_nullsubexpr_39, r"([ab]*)*", r"b", Some((0, 1)), Some((0, 1))) 263 | mat!(match_nullsubexpr_40, r"([ab]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))) 264 | mat!(match_nullsubexpr_41, r"([ab]*)*", r"aaaabcde", Some((0, 5)), Some((0, 5))) 265 | mat!(match_nullsubexpr_42, r"([^a]*)*", r"b", Some((0, 1)), Some((0, 1))) 266 | mat!(match_nullsubexpr_43, r"([^a]*)*", r"bbbbbb", Some((0, 6)), Some((0, 6))) 267 | mat!(match_nullsubexpr_45, r"([^a]*)*", r"aaaaaa", Some((0, 0)), None) 268 | mat!(match_nullsubexpr_46, r"([^ab]*)*", r"ccccxx", Some((0, 6)), Some((0, 6))) 269 | mat!(match_nullsubexpr_48, r"([^ab]*)*", r"ababab", Some((0, 0)), None) 270 | mat!(match_nullsubexpr_50, r"((z)+|a)*", r"zabcde", Some((0, 2)), Some((1, 2))) 271 | mat!(match_nullsubexpr_69, r"(a*)*(x)", r"x", Some((0, 1)), None, Some((0, 1))) 272 | mat!(match_nullsubexpr_70, r"(a*)*(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2))) 273 | mat!(match_nullsubexpr_71, r"(a*)*(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2))) 274 | mat!(match_nullsubexpr_73, r"(a*)+(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1))) 275 | mat!(match_nullsubexpr_74, r"(a*)+(x)", r"ax", Some((0, 2)), Some((0, 1)), Some((1, 2))) 276 | 
mat!(match_nullsubexpr_75, r"(a*)+(x)", r"axa", Some((0, 2)), Some((0, 1)), Some((1, 2))) 277 | mat!(match_nullsubexpr_77, r"(a*){2}(x)", r"x", Some((0, 1)), Some((0, 0)), Some((0, 1))) 278 | mat!(match_nullsubexpr_78, r"(a*){2}(x)", r"ax", Some((0, 2)), Some((1, 1)), Some((1, 2))) 279 | mat!(match_nullsubexpr_79, r"(a*){2}(x)", r"axa", Some((0, 2)), Some((1, 1)), Some((1, 2))) 280 | 281 | // Tests from repetition.dat 282 | mat!(match_repetition_10, r"((..)|(.))", r"", None) 283 | mat!(match_repetition_11, r"((..)|(.))((..)|(.))", r"", None) 284 | mat!(match_repetition_12, r"((..)|(.))((..)|(.))((..)|(.))", r"", None) 285 | mat!(match_repetition_14, r"((..)|(.)){1}", r"", None) 286 | mat!(match_repetition_15, r"((..)|(.)){2}", r"", None) 287 | mat!(match_repetition_16, r"((..)|(.)){3}", r"", None) 288 | mat!(match_repetition_18, r"((..)|(.))*", r"", Some((0, 0))) 289 | mat!(match_repetition_20, r"((..)|(.))", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) 290 | mat!(match_repetition_21, r"((..)|(.))((..)|(.))", r"a", None) 291 | mat!(match_repetition_22, r"((..)|(.))((..)|(.))((..)|(.))", r"a", None) 292 | mat!(match_repetition_24, r"((..)|(.)){1}", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) 293 | mat!(match_repetition_25, r"((..)|(.)){2}", r"a", None) 294 | mat!(match_repetition_26, r"((..)|(.)){3}", r"a", None) 295 | mat!(match_repetition_28, r"((..)|(.))*", r"a", Some((0, 1)), Some((0, 1)), None, Some((0, 1))) 296 | mat!(match_repetition_30, r"((..)|(.))", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 297 | mat!(match_repetition_31, r"((..)|(.))((..)|(.))", r"aa", Some((0, 2)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2))) 298 | mat!(match_repetition_32, r"((..)|(.))((..)|(.))((..)|(.))", r"aa", None) 299 | mat!(match_repetition_34, r"((..)|(.)){1}", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 300 | mat!(match_repetition_35, r"((..)|(.)){2}", r"aa", Some((0, 2)), Some((1, 2)), None, Some((1, 2))) 301 | mat!(match_repetition_36, r"((..)|(.)){3}", r"aa", None) 302 | mat!(match_repetition_38, r"((..)|(.))*", r"aa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 303 | mat!(match_repetition_40, r"((..)|(.))", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 304 | mat!(match_repetition_41, r"((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3))) 305 | mat!(match_repetition_42, r"((..)|(.))((..)|(.))((..)|(.))", r"aaa", Some((0, 3)), Some((0, 1)), None, Some((0, 1)), Some((1, 2)), None, Some((1, 2)), Some((2, 3)), None, Some((2, 3))) 306 | mat!(match_repetition_44, r"((..)|(.)){1}", r"aaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 307 | mat!(match_repetition_46, r"((..)|(.)){2}", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3))) 308 | mat!(match_repetition_47, r"((..)|(.)){3}", r"aaa", Some((0, 3)), Some((2, 3)), None, Some((2, 3))) 309 | mat!(match_repetition_50, r"((..)|(.))*", r"aaa", Some((0, 3)), Some((2, 3)), Some((0, 2)), Some((2, 3))) 310 | mat!(match_repetition_52, r"((..)|(.))", r"aaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 311 | mat!(match_repetition_53, r"((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) 312 | mat!(match_repetition_54, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 3)), None, Some((2, 3)), Some((3, 4)), None, Some((3, 4))) 313 | mat!(match_repetition_56, r"((..)|(.)){1}", r"aaaa", Some((0, 2)), 
Some((0, 2)), Some((0, 2)), None) 314 | mat!(match_repetition_57, r"((..)|(.)){2}", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) 315 | mat!(match_repetition_59, r"((..)|(.)){3}", r"aaaa", Some((0, 4)), Some((3, 4)), Some((0, 2)), Some((3, 4))) 316 | mat!(match_repetition_61, r"((..)|(.))*", r"aaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) 317 | mat!(match_repetition_63, r"((..)|(.))", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 318 | mat!(match_repetition_64, r"((..)|(.))((..)|(.))", r"aaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) 319 | mat!(match_repetition_65, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaa", Some((0, 5)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 5)), None, Some((4, 5))) 320 | mat!(match_repetition_67, r"((..)|(.)){1}", r"aaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 321 | mat!(match_repetition_68, r"((..)|(.)){2}", r"aaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) 322 | mat!(match_repetition_70, r"((..)|(.)){3}", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5))) 323 | mat!(match_repetition_73, r"((..)|(.))*", r"aaaaa", Some((0, 5)), Some((4, 5)), Some((2, 4)), Some((4, 5))) 324 | mat!(match_repetition_75, r"((..)|(.))", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 325 | mat!(match_repetition_76, r"((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 4)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None) 326 | mat!(match_repetition_77, r"((..)|(.))((..)|(.))((..)|(.))", r"aaaaaa", Some((0, 6)), Some((0, 2)), Some((0, 2)), None, Some((2, 4)), Some((2, 4)), None, Some((4, 6)), Some((4, 6)), None) 327 | mat!(match_repetition_79, r"((..)|(.)){1}", r"aaaaaa", Some((0, 2)), Some((0, 2)), Some((0, 2)), None) 328 | mat!(match_repetition_80, r"((..)|(.)){2}", r"aaaaaa", Some((0, 4)), Some((2, 4)), Some((2, 4)), None) 329 | mat!(match_repetition_81, r"((..)|(.)){3}", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None) 330 | mat!(match_repetition_83, r"((..)|(.))*", r"aaaaaa", Some((0, 6)), Some((4, 6)), Some((4, 6)), None) 331 | mat!(match_repetition_90, r"X(.?){0,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 332 | mat!(match_repetition_91, r"X(.?){1,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 333 | mat!(match_repetition_92, r"X(.?){2,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 334 | mat!(match_repetition_93, r"X(.?){3,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 335 | mat!(match_repetition_94, r"X(.?){4,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 336 | mat!(match_repetition_95, r"X(.?){5,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 337 | mat!(match_repetition_96, r"X(.?){6,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 338 | mat!(match_repetition_97, r"X(.?){7,}Y", r"X1234567Y", Some((0, 9)), Some((7, 8))) 339 | mat!(match_repetition_98, r"X(.?){8,}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 340 | mat!(match_repetition_100, r"X(.?){0,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 341 | mat!(match_repetition_102, r"X(.?){1,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 342 | mat!(match_repetition_104, r"X(.?){2,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 343 | mat!(match_repetition_106, r"X(.?){3,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 344 | mat!(match_repetition_108, r"X(.?){4,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 345 | mat!(match_repetition_110, r"X(.?){5,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 346 | mat!(match_repetition_112, r"X(.?){6,8}Y", r"X1234567Y", 
Some((0, 9)), Some((8, 8))) 347 | mat!(match_repetition_114, r"X(.?){7,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 348 | mat!(match_repetition_115, r"X(.?){8,8}Y", r"X1234567Y", Some((0, 9)), Some((8, 8))) 349 | mat!(match_repetition_126, r"(a|ab|c|bcd){0,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 350 | mat!(match_repetition_127, r"(a|ab|c|bcd){1,}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 351 | mat!(match_repetition_128, r"(a|ab|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) 352 | mat!(match_repetition_129, r"(a|ab|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) 353 | mat!(match_repetition_130, r"(a|ab|c|bcd){4,}(d*)", r"ababcd", None) 354 | mat!(match_repetition_131, r"(a|ab|c|bcd){0,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 355 | mat!(match_repetition_132, r"(a|ab|c|bcd){1,10}(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 356 | mat!(match_repetition_133, r"(a|ab|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) 357 | mat!(match_repetition_134, r"(a|ab|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((3, 6)), Some((6, 6))) 358 | mat!(match_repetition_135, r"(a|ab|c|bcd){4,10}(d*)", r"ababcd", None) 359 | mat!(match_repetition_136, r"(a|ab|c|bcd)*(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 360 | mat!(match_repetition_137, r"(a|ab|c|bcd)+(d*)", r"ababcd", Some((0, 1)), Some((0, 1)), Some((1, 1))) 361 | mat!(match_repetition_143, r"(ab|a|c|bcd){0,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 362 | mat!(match_repetition_145, r"(ab|a|c|bcd){1,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 363 | mat!(match_repetition_147, r"(ab|a|c|bcd){2,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 364 | mat!(match_repetition_149, r"(ab|a|c|bcd){3,}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 365 | mat!(match_repetition_150, r"(ab|a|c|bcd){4,}(d*)", r"ababcd", None) 366 | mat!(match_repetition_152, r"(ab|a|c|bcd){0,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 367 | mat!(match_repetition_154, r"(ab|a|c|bcd){1,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 368 | mat!(match_repetition_156, r"(ab|a|c|bcd){2,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 369 | mat!(match_repetition_158, r"(ab|a|c|bcd){3,10}(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 370 | mat!(match_repetition_159, r"(ab|a|c|bcd){4,10}(d*)", r"ababcd", None) 371 | mat!(match_repetition_161, r"(ab|a|c|bcd)*(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 372 | mat!(match_repetition_163, r"(ab|a|c|bcd)+(d*)", r"ababcd", Some((0, 6)), Some((4, 5)), Some((5, 6))) 373 | 374 | -------------------------------------------------------------------------------- /src/test/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 
10 | 11 | #[cfg(not(stage1))] 12 | #[phase(syntax)] 13 | extern crate regex_macros; 14 | 15 | #[cfg(not(stage1))] 16 | #[path = "bench.rs"] 17 | mod native_bench; 18 | 19 | #[cfg(not(stage1))] 20 | #[path = "tests.rs"] 21 | mod native_tests; 22 | 23 | // Due to macro scoping rules, this definition only applies for the modules 24 | // defined below. Effectively, it allows us to use the same tests for both 25 | // native and dynamic regexes. 26 | macro_rules! regex( 27 | ($re:expr) => ( 28 | match ::regex::Regex::new($re) { 29 | Ok(re) => re, 30 | Err(err) => fail!("{}", err), 31 | } 32 | ); 33 | ) 34 | 35 | #[path = "bench.rs"] 36 | mod dynamic_bench; 37 | #[path = "tests.rs"] 38 | mod dynamic_tests; 39 | 40 | -------------------------------------------------------------------------------- /src/test/tests.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // ignore-tidy-linelength 12 | 13 | use regex::{Regex, NoExpand}; 14 | 15 | #[test] 16 | fn splitn() { 17 | let re = regex!(r"\d+"); 18 | let text = "cauchy123plato456tyler789binx"; 19 | let subs: Vec<&str> = re.splitn(text, 2).collect(); 20 | assert_eq!(subs, vec!("cauchy", "plato456tyler789binx")); 21 | } 22 | 23 | #[test] 24 | fn split() { 25 | let re = regex!(r"\d+"); 26 | let text = "cauchy123plato456tyler789binx"; 27 | let subs: Vec<&str> = re.split(text).collect(); 28 | assert_eq!(subs, vec!("cauchy", "plato", "tyler", "binx")); 29 | } 30 | 31 | macro_rules! replace( 32 | ($name:ident, $which:ident, $re:expr, 33 | $search:expr, $replace:expr, $result:expr) => ( 34 | #[test] 35 | fn $name() { 36 | let re = regex!($re); 37 | assert_eq!(re.$which($search, $replace), StrBuf::from_str($result)); 38 | } 39 | ); 40 | ) 41 | 42 | replace!(rep_first, replace, r"\d", "age: 26", "Z", "age: Z6") 43 | replace!(rep_plus, replace, r"\d+", "age: 26", "Z", "age: Z") 44 | replace!(rep_all, replace_all, r"\d", "age: 26", "Z", "age: ZZ") 45 | replace!(rep_groups, replace, r"(\S+)\s+(\S+)", "w1 w2", "$2 $1", "w2 w1") 46 | replace!(rep_double_dollar, replace, 47 | r"(\S+)\s+(\S+)", "w1 w2", "$2 $$1", "w2 $1") 48 | replace!(rep_no_expand, replace, 49 | r"(\S+)\s+(\S+)", "w1 w2", NoExpand("$2 $1"), "$2 $1") 50 | replace!(rep_named, replace_all, 51 | r"(?P<first>\S+)\s+(?P<last>\S+)(?P<space>\s*)", 52 | "w1 w2 w3 w4", "$last $first$space", "w2 w1 w4 w3") 53 | replace!(rep_trim, replace_all, "^[ \t]+|[ \t]+$", " \t trim me\t \t", 54 | "", "trim me") 55 | 56 | macro_rules!
noparse( 57 | ($name:ident, $re:expr) => ( 58 | #[test] 59 | fn $name() { 60 | let re = $re; 61 | match Regex::new(re) { 62 | Err(_) => {}, 63 | Ok(_) => fail!("Regex '{}' should cause a parse error.", re), 64 | } 65 | } 66 | ); 67 | ) 68 | 69 | noparse!(fail_double_repeat, "a**") 70 | noparse!(fail_no_repeat_arg, "*") 71 | noparse!(fail_no_repeat_arg_begin, "^*") 72 | noparse!(fail_incomplete_escape, "\\") 73 | noparse!(fail_class_incomplete, "[A-") 74 | noparse!(fail_class_not_closed, "[A") 75 | noparse!(fail_class_no_begin, r"[\A]") 76 | noparse!(fail_class_no_end, r"[\z]") 77 | noparse!(fail_class_no_boundary, r"[\b]") 78 | noparse!(fail_open_paren, "(") 79 | noparse!(fail_close_paren, ")") 80 | noparse!(fail_invalid_range, "[a-Z]") 81 | noparse!(fail_empty_capture_name, "(?P<>a)") 82 | noparse!(fail_empty_capture_exp, "(?P<name>)") 83 | noparse!(fail_bad_capture_name, "(?P<na-me>)") 84 | noparse!(fail_bad_flag, "(?a)a") 85 | noparse!(fail_empty_alt_before, "|a") 86 | noparse!(fail_empty_alt_after, "a|") 87 | noparse!(fail_counted_big_exact, "a{1001}") 88 | noparse!(fail_counted_big_min, "a{1001,}") 89 | noparse!(fail_counted_no_close, "a{1001") 90 | noparse!(fail_unfinished_cap, "(?") 91 | noparse!(fail_unfinished_escape, "\\") 92 | noparse!(fail_octal_digit, r"\8") 93 | noparse!(fail_hex_digit, r"\xG0") 94 | noparse!(fail_hex_short, r"\xF") 95 | noparse!(fail_hex_long_digits, r"\x{fffg}") 96 | noparse!(fail_flag_bad, "(?a)") 97 | noparse!(fail_flag_empty, "(?)") 98 | noparse!(fail_double_neg, "(?-i-i)") 99 | noparse!(fail_neg_empty, "(?i-)") 100 | noparse!(fail_empty_group, "()") 101 | noparse!(fail_dupe_named, "(?P<a>.)(?P<a>.)") 102 | 103 | macro_rules! mat( 104 | ($name:ident, $re:expr, $text:expr, $($loc:tt)+) => ( 105 | #[test] 106 | fn $name() { 107 | let text = $text; 108 | let expected: Vec<Option<(uint, uint)>> = vec!($($loc)+); 109 | let r = regex!($re); 110 | let got = match r.captures(text) { 111 | Some(c) => c.iter_pos().collect::<Vec<Option<(uint, uint)>>>(), 112 | None => vec!(None), 113 | }; 114 | // The test set sometimes leaves out capture groups, so truncate 115 | // actual capture groups to match the test set. 116 | let (sexpect, mut sgot) = (expected.as_slice(), got.as_slice()); 117 | if sgot.len() > sexpect.len() { 118 | sgot = sgot.slice(0, sexpect.len()) 119 | } 120 | if sexpect != sgot { 121 | fail!("For RE '{}' against '{}', expected '{}' but got '{}'", 122 | $re, text, sexpect, sgot); 123 | } 124 | } 125 | ); 126 | ) 127 | 128 | // Some crazy expressions from regular-expressions.info.
129 | mat!(match_ranges, 130 | r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", 131 | "num: 255", Some((5, 8))) 132 | mat!(match_ranges_not, 133 | r"\b(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])\b", 134 | "num: 256", None) 135 | mat!(match_float1, r"[-+]?[0-9]*\.?[0-9]+", "0.1", Some((0, 3))) 136 | mat!(match_float2, r"[-+]?[0-9]*\.?[0-9]+", "0.1.2", Some((0, 3))) 137 | mat!(match_float3, r"[-+]?[0-9]*\.?[0-9]+", "a1.2", Some((1, 4))) 138 | mat!(match_float4, r"^[-+]?[0-9]*\.?[0-9]+$", "1.a", None) 139 | mat!(match_email, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", 140 | "mine is jam.slam@gmail.com ", Some((8, 26))) 141 | mat!(match_email_not, r"(?i)\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b", 142 | "mine is jam.slam@gmail ", None) 143 | mat!(match_email_big, r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", 144 | "mine is jam.slam@gmail.com ", Some((8, 26))) 145 | mat!(match_date1, 146 | r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", 147 | "1900-01-01", Some((0, 10))) 148 | mat!(match_date2, 149 | r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", 150 | "1900-00-01", None) 151 | mat!(match_date3, 152 | r"^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])$", 153 | "1900-13-01", None) 154 | 155 | // Exercise the flags. 156 | mat!(match_flag_case, "(?i)abc", "ABC", Some((0, 3))) 157 | mat!(match_flag_weird_case, "(?i)a(?-i)bc", "Abc", Some((0, 3))) 158 | mat!(match_flag_weird_case_not, "(?i)a(?-i)bc", "ABC", None) 159 | mat!(match_flag_case_dotnl, "(?is)a.", "A\n", Some((0, 2))) 160 | mat!(match_flag_case_dotnl_toggle, "(?is)a.(?-is)a.", "A\nab", Some((0, 4))) 161 | mat!(match_flag_case_dotnl_toggle_not, "(?is)a.(?-is)a.", "A\na\n", None) 162 | mat!(match_flag_case_dotnl_toggle_ok, "(?is)a.(?-is:a.)?", "A\na\n", Some((0, 2))) 163 | mat!(match_flag_multi, "(?m)(?:^\\d+$\n?)+", "123\n456\n789", Some((0, 11))) 164 | mat!(match_flag_ungreedy, "(?U)a+", "aa", Some((0, 1))) 165 | mat!(match_flag_ungreedy_greedy, "(?U)a+?", "aa", Some((0, 2))) 166 | mat!(match_flag_ungreedy_noop, "(?U)(?-U)a+", "aa", Some((0, 2))) 167 | 168 | // Some Unicode tests. 169 | mat!(uni_literal, r"Ⅰ", "Ⅰ", Some((0, 3))) 170 | mat!(uni_one, r"\pN", "Ⅰ", Some((0, 3))) 171 | mat!(uni_mixed, r"\pN+", "Ⅰ1Ⅱ2", Some((0, 8))) 172 | mat!(uni_not, r"\PN+", "abⅠ", Some((0, 2))) 173 | mat!(uni_not_class, r"[\PN]+", "abⅠ", Some((0, 2))) 174 | mat!(uni_not_class_neg, r"[^\PN]+", "abⅠ", Some((2, 5))) 175 | mat!(uni_case, r"(?i)Δ", "δ", Some((0, 2))) 176 | mat!(uni_case_not, r"Δ", "δ", None) 177 | mat!(uni_case_upper, r"\p{Lu}+", "ΛΘΓΔα", Some((0, 8))) 178 | mat!(uni_case_upper_nocase_flag, r"(?i)\p{Lu}+", "ΛΘΓΔα", Some((0, 10))) 179 | mat!(uni_case_upper_nocase, r"\p{L}+", "ΛΘΓΔα", Some((0, 10))) 180 | mat!(uni_case_lower, r"\p{Ll}+", "ΛΘΓΔα", Some((8, 10))) 181 | 182 | // Test the Unicode friendliness of Perl character classes. 183 | mat!(uni_perl_w, r"\w+", "dδd", Some((0, 4))) 184 | mat!(uni_perl_w_not, r"\w+", "Ⅱ", None) 185 | mat!(uni_perl_w_neg, r"\W+", "Ⅱ", Some((0, 3))) 186 | mat!(uni_perl_d, r"\d+", "1२३9", Some((0, 8))) 187 | mat!(uni_perl_d_not, r"\d+", "Ⅱ", None) 188 | mat!(uni_perl_d_neg, r"\D+", "Ⅱ", Some((0, 3))) 189 | mat!(uni_perl_s, r"\s+", " ", Some((0, 3))) 190 | mat!(uni_perl_s_not, r"\s+", "☃", None) 191 | mat!(uni_perl_s_neg, r"\S+", "☃", Some((0, 3))) 192 | 193 | // And do the same for word boundaries. 
194 | mat!(uni_boundary_none, r"\d\b", "6δ", None) 195 | mat!(uni_boundary_ogham, r"\d\b", "6 ", Some((0, 1))) 196 | 197 | // A whole mess of tests from Glenn Fowler's regex test suite. 198 | // Generated by the 'src/etc/regex-match-tests' program. 199 | mod matches; 200 | -------------------------------------------------------------------------------- /src/testdata/LICENSE: -------------------------------------------------------------------------------- 1 | The following license covers testregex.c and all associated test data. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, 6 | copy, modify, merge, publish, distribute, and/or sell copies of the 7 | Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following disclaimer: 9 | 10 | THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED 11 | WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 12 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 13 | IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 14 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 15 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 16 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 17 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 18 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 19 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 20 | -------------------------------------------------------------------------------- /src/testdata/README: -------------------------------------------------------------------------------- 1 | Test data was taken from the Go distribution, which was in turn taken from the 2 | testregex test suite: 3 | 4 | http://www2.research.att.com/~astopen/testregex/testregex.html 5 | 6 | The LICENSE in this directory corresponds to the LICENSE that the data was 7 | released under. 8 | 9 | The tests themselves were modified for RE2/Go. A couple were modified further 10 | by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. 11 | (Yes, it seems like RE2/Go includes failing test cases.) This may or may not 12 | have been a bad idea, but I think being consistent with an established Regex 13 | library is worth something. 14 | 15 | Note that these files are read by 'src/etc/regexp-match-tests' and turned into 16 | Rust tests found in 'src/libregexp/tests/matches.rs'. 
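For example, the basic.dat entry "BE abracadabra$ abracadabracadabra (7,18)" is what becomes the generated test "mat!(match_basic_3, r"abracadabra$", r"abracadabracadabra", Some((7, 18)))", and spans recorded as (?,?) in the data files become None arguments in the corresponding mat! call.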
17 | 18 | -------------------------------------------------------------------------------- /src/testdata/basic.dat: -------------------------------------------------------------------------------- 1 | NOTE all standard compliant implementations should pass these : 2002-05-31 2 | 3 | BE abracadabra$ abracadabracadabra (7,18) 4 | BE a...b abababbb (2,7) 5 | BE XXXXXX ..XXXXXX (2,8) 6 | E \) () (1,2) 7 | BE a] a]a (0,2) 8 | B } } (0,1) 9 | E \} } (0,1) 10 | BE \] ] (0,1) 11 | B ] ] (0,1) 12 | E ] ] (0,1) 13 | B { { (0,1) 14 | B } } (0,1) 15 | BE ^a ax (0,1) 16 | BE \^a a^a (1,3) 17 | BE a\^ a^ (0,2) 18 | BE a$ aa (1,2) 19 | BE a\$ a$ (0,2) 20 | BE ^$ NULL (0,0) 21 | E $^ NULL (0,0) 22 | E a($) aa (1,2)(2,2) 23 | E a*(^a) aa (0,1)(0,1) 24 | E (..)*(...)* a (0,0) 25 | E (..)*(...)* abcd (0,4)(2,4) 26 | E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) 27 | E (ab)c|abc abc (0,3)(0,2) 28 | E a{0}b ab (1,2) 29 | E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 30 | E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) 31 | E a{9876543210} NULL BADBR 32 | E ((a|a)|a) a (0,1)(0,1)(0,1) 33 | E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) 34 | E a*(a.|aa) aaaa (0,4)(2,4) 35 | E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) 36 | E (a|b)?.* b (0,1)(0,1) 37 | E (a|b)c|a(b|c) ac (0,2)(0,1) 38 | E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) 39 | E (a|b)*c|(a|ab)*c abc (0,3)(1,2) 40 | E (a|b)*c|(a|ab)*c xc (1,2) 41 | E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) 42 | E a?(ab|ba)ab abab (0,4)(0,2) 43 | E a?(ac{0}b|ba)ab abab (0,4)(0,2) 44 | E ab|abab abbabab (0,2) 45 | E aba|bab|bba baaabbbaba (5,8) 46 | E aba|bab baaabbbaba (6,9) 47 | E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) 48 | E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) 49 | E ab|a xabc (1,3) 50 | E ab|a xxabc (2,4) 51 | Ei (Ab|cD)* aBcD (0,4)(2,4) 52 | BE [^-] --a (2,3) 53 | BE [a-]* --a (0,3) 54 | BE [a-m-]* --amoma-- (0,4) 55 | E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) 56 | E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) 57 | {E [[:upper:]] A (0,1) [[]] not supported 58 | E [[:lower:]]+ `az{ (1,3) 59 | E [[:upper:]]+ @AZ[ (1,3) 60 | # No collation in Go 61 | #BE [[-]] [[-]] (2,4) 62 | #BE [[.NIL.]] NULL ECOLLATE 63 | #BE [[=aleph=]] NULL ECOLLATE 64 | } 65 | BE$ \n \n (0,1) 66 | BEn$ \n \n (0,1) 67 | BE$ [^a] \n (0,1) 68 | BE$ \na \na (0,2) 69 | E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) 70 | BE xxx xxx (0,3) 71 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) 72 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) 73 | E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) 74 | E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) 75 | E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) 76 | E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) 77 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) 78 | E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) 79 | E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) 80 | BE$ .* \x01\x7f (0,2) 81 | E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) 82 | L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH 83 | E a*a*a*a*a*b aaaaaaaaab (0,10) 84 | BE ^ NULL (0,0) 85 | BE $ NULL (0,0) 86 | BE ^$ NULL (0,0) 87 | BE ^a$ a (0,1) 88 | BE abc abc (0,3) 89 | BE abc xabcy (1,4) 90 | 
BE abc ababc (2,5) 91 | BE ab*c abc (0,3) 92 | BE ab*bc abc (0,3) 93 | BE ab*bc abbc (0,4) 94 | BE ab*bc abbbbc (0,6) 95 | E ab+bc abbc (0,4) 96 | E ab+bc abbbbc (0,6) 97 | E ab?bc abbc (0,4) 98 | E ab?bc abc (0,3) 99 | E ab?c abc (0,3) 100 | BE ^abc$ abc (0,3) 101 | BE ^abc abcc (0,3) 102 | BE abc$ aabc (1,4) 103 | BE ^ abc (0,0) 104 | BE $ abc (3,3) 105 | BE a.c abc (0,3) 106 | BE a.c axc (0,3) 107 | BE a.*c axyzc (0,5) 108 | BE a[bc]d abd (0,3) 109 | BE a[b-d]e ace (0,3) 110 | BE a[b-d] aac (1,3) 111 | BE a[-b] a- (0,2) 112 | BE a[b-] a- (0,2) 113 | BE a] a] (0,2) 114 | BE a[]]b a]b (0,3) 115 | BE a[^bc]d aed (0,3) 116 | BE a[^-b]c adc (0,3) 117 | BE a[^]b]c adc (0,3) 118 | E ab|cd abc (0,2) 119 | E ab|cd abcd (0,2) 120 | E a\(b a(b (0,3) 121 | E a\(*b ab (0,2) 122 | E a\(*b a((b (0,4) 123 | E ((a)) abc (0,1)(0,1)(0,1) 124 | E (a)b(c) abc (0,3)(0,1)(2,3) 125 | E a+b+c aabbabc (4,7) 126 | E a* aaa (0,3) 127 | #E (a*)* - (0,0)(0,0) 128 | E (a*)* - (0,0)(?,?) RE2/Go 129 | E (a*)+ - (0,0)(0,0) 130 | #E (a*|b)* - (0,0)(0,0) 131 | E (a*|b)* - (0,0)(?,?) RE2/Go 132 | E (a+|b)* ab (0,2)(1,2) 133 | E (a+|b)+ ab (0,2)(1,2) 134 | E (a+|b)? ab (0,1)(0,1) 135 | BE [^ab]* cde (0,3) 136 | #E (^)* - (0,0)(0,0) 137 | E (^)* - (0,0)(?,?) RE2/Go 138 | BE a* NULL (0,0) 139 | E ([abc])*d abbbcd (0,6)(4,5) 140 | E ([abc])*bcd abcd (0,4)(0,1) 141 | E a|b|c|d|e e (0,1) 142 | E (a|b|c|d|e)f ef (0,2)(0,1) 143 | #E ((a*|b))* - (0,0)(0,0)(0,0) 144 | E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go 145 | BE abcd*efg abcdefg (0,7) 146 | BE ab* xabyabbbz (1,3) 147 | BE ab* xayabbbz (1,2) 148 | E (ab|cd)e abcde (2,5)(2,4) 149 | BE [abhgefdc]ij hij (0,3) 150 | E (a|b)c*d abcd (1,4)(1,2) 151 | E (ab|ab*)bc abc (0,3)(0,1) 152 | E a([bc]*)c* abc (0,3)(1,3) 153 | E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) 154 | E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) 155 | E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) 156 | E a[bcd]*dcdcde adcdcde (0,7) 157 | E (ab|a)b*c abc (0,3)(0,2) 158 | E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) 159 | BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) 160 | E ^a(bc+|b[eh])g|.h$ abh (1,3) 161 | E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) 162 | E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) 163 | E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) 164 | E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) 165 | BE multiple words multiple words yeah (0,14) 166 | E (.*)c(.*) abcde (0,5)(0,2)(3,5) 167 | BE abcd abcd (0,4) 168 | E a(bc)d abcd (0,4)(1,3) 169 | E a[-]?c ac (0,3) 170 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) 171 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) 172 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) 173 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) 174 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) 175 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) 176 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) 177 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) 178 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) 179 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy 
(0,16)(?,?)(11,13) 180 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) 181 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) 182 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) 183 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) 184 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) 185 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) 186 | E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) 187 | E a+(b|c)*d+ aabcdd (0,6)(3,4) 188 | E ^.+$ vivi (0,4) 189 | E ^(.+)$ vivi (0,4)(0,4) 190 | E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) 191 | E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) 192 | E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) 193 | E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) 194 | E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) 195 | E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) 196 | E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) 197 | E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) 198 | E ((foo)|bar)!bas bar!bas (0,7)(0,3) 199 | E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) 200 | E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) 201 | E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) 202 | E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) 203 | E (foo|(bar))!bas foo!bas (0,7)(0,3) 204 | E (foo|bar)!bas bar!bas (0,7)(0,3) 205 | E (foo|bar)!bas foo!bar!bas (4,11)(4,7) 206 | E (foo|bar)!bas foo!bas (0,7)(0,3) 207 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 208 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) 209 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) 210 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) 211 | E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) 212 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) 213 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) 214 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) 215 | E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) 216 | E .*(/XXX).* /XXX (0,4)(0,4) 217 | E .*(\\XXX).* \XXX (0,4)(0,4) 218 | E \\XXX \XXX (0,4) 219 | E .*(/000).* /000 (0,4)(0,4) 220 | E .*(\\000).* \000 (0,4)(0,4) 221 | E \\000 \000 (0,4) 222 | -------------------------------------------------------------------------------- /src/testdata/nullsubexpr.dat: -------------------------------------------------------------------------------- 1 | NOTE null subexpression matches : 2002-06-06 2 | 3 | E (a*)* a (0,1)(0,1) 4 | #E SAME x (0,0)(0,0) 5 | E SAME x (0,0)(?,?) RE2/Go 6 | E SAME aaaaaa (0,6)(0,6) 7 | E SAME aaaaaax (0,6)(0,6) 8 | E (a*)+ a (0,1)(0,1) 9 | E SAME x (0,0)(0,0) 10 | E SAME aaaaaa (0,6)(0,6) 11 | E SAME aaaaaax (0,6)(0,6) 12 | E (a+)* a (0,1)(0,1) 13 | E SAME x (0,0) 14 | E SAME aaaaaa (0,6)(0,6) 15 | E SAME aaaaaax (0,6)(0,6) 16 | E (a+)+ a (0,1)(0,1) 17 | E SAME x NOMATCH 18 | E SAME aaaaaa (0,6)(0,6) 19 | E SAME aaaaaax (0,6)(0,6) 20 | 21 | E ([a]*)* a (0,1)(0,1) 22 | #E SAME x (0,0)(0,0) 23 | E SAME x (0,0)(?,?) 
RE2/Go 24 | E SAME aaaaaa (0,6)(0,6) 25 | E SAME aaaaaax (0,6)(0,6) 26 | E ([a]*)+ a (0,1)(0,1) 27 | E SAME x (0,0)(0,0) 28 | E SAME aaaaaa (0,6)(0,6) 29 | E SAME aaaaaax (0,6)(0,6) 30 | E ([^b]*)* a (0,1)(0,1) 31 | #E SAME b (0,0)(0,0) 32 | E SAME b (0,0)(?,?) RE2/Go 33 | E SAME aaaaaa (0,6)(0,6) 34 | E SAME aaaaaab (0,6)(0,6) 35 | E ([ab]*)* a (0,1)(0,1) 36 | E SAME aaaaaa (0,6)(0,6) 37 | E SAME ababab (0,6)(0,6) 38 | E SAME bababa (0,6)(0,6) 39 | E SAME b (0,1)(0,1) 40 | E SAME bbbbbb (0,6)(0,6) 41 | E SAME aaaabcde (0,5)(0,5) 42 | E ([^a]*)* b (0,1)(0,1) 43 | E SAME bbbbbb (0,6)(0,6) 44 | #E SAME aaaaaa (0,0)(0,0) 45 | E SAME aaaaaa (0,0)(?,?) RE2/Go 46 | E ([^ab]*)* ccccxx (0,6)(0,6) 47 | #E SAME ababab (0,0)(0,0) 48 | E SAME ababab (0,0)(?,?) RE2/Go 49 | 50 | E ((z)+|a)* zabcde (0,2)(1,2) 51 | 52 | #{E a+? aaaaaa (0,1) no *? +? mimimal match ops 53 | #E (a) aaa (0,1)(0,1) 54 | #E (a*?) aaa (0,0)(0,0) 55 | #E (a)*? aaa (0,0) 56 | #E (a*?)*? aaa (0,0) 57 | #} 58 | 59 | B \(a*\)*\(x\) x (0,1)(0,0)(0,1) 60 | B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) 61 | B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) 62 | B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) 63 | B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) 64 | B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) 65 | B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) 66 | B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) 67 | 68 | #E (a*)*(x) x (0,1)(0,0)(0,1) 69 | E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go 70 | E (a*)*(x) ax (0,2)(0,1)(1,2) 71 | E (a*)*(x) axa (0,2)(0,1)(1,2) 72 | 73 | E (a*)+(x) x (0,1)(0,0)(0,1) 74 | E (a*)+(x) ax (0,2)(0,1)(1,2) 75 | E (a*)+(x) axa (0,2)(0,1)(1,2) 76 | 77 | E (a*){2}(x) x (0,1)(0,0)(0,1) 78 | E (a*){2}(x) ax (0,2)(1,1)(1,2) 79 | E (a*){2}(x) axa (0,2)(1,1)(1,2) 80 | -------------------------------------------------------------------------------- /src/testdata/repetition.dat: -------------------------------------------------------------------------------- 1 | NOTE implicit vs. explicit repetitions : 2009-02-02 2 | 3 | # Glenn Fowler 4 | # conforming matches (column 4) must match one of the following BREs 5 | # NOMATCH 6 | # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* 7 | # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* 8 | # i.e., each 3-tuple has two identical elements and one (?,?) 9 | 10 | E ((..)|(.)) NULL NOMATCH 11 | E ((..)|(.))((..)|(.)) NULL NOMATCH 12 | E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH 13 | 14 | E ((..)|(.)){1} NULL NOMATCH 15 | E ((..)|(.)){2} NULL NOMATCH 16 | E ((..)|(.)){3} NULL NOMATCH 17 | 18 | E ((..)|(.))* NULL (0,0) 19 | 20 | E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) 21 | E ((..)|(.))((..)|(.)) a NOMATCH 22 | E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH 23 | 24 | E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) 25 | E ((..)|(.)){2} a NOMATCH 26 | E ((..)|(.)){3} a NOMATCH 27 | 28 | E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) 29 | 30 | E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) 31 | E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) 32 | E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH 33 | 34 | E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) 35 | E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) 36 | E ((..)|(.)){3} aa NOMATCH 37 | 38 | E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) 39 | 40 | E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) 41 | E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) 42 | E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) 43 | 44 | E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) 
45 | #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) 46 | E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 47 | E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) 48 | 49 | #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) 50 | E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go 51 | 52 | E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) 53 | E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 54 | E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) 55 | 56 | E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) 57 | E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) 58 | #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) 59 | E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go 60 | 61 | E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) 62 | 63 | E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) 64 | E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 65 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) 66 | 67 | E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) 68 | E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) 69 | #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) 70 | E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 71 | 72 | #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) 73 | E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go 74 | 75 | E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) 76 | E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) 77 | E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) 78 | 79 | E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) 80 | E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) 81 | E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) 82 | 83 | E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 84 | 85 | NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 86 | 87 | # These test a bug in OS X / FreeBSD / NetBSD, and libtree. 88 | # Linux/GLIBC gets the {8,} and {8,8} wrong. 89 | 90 | :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) 91 | :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) 92 | :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) 93 | :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) 94 | :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) 95 | :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) 96 | :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) 97 | :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) 98 | :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) 99 | #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) 100 | :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go 101 | #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) 102 | :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go 103 | #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) 104 | :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go 105 | #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) 106 | :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go 107 | #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) 108 | :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go 109 | #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) 110 | :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go 111 | #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) 112 | :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go 113 | #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) 114 | :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go 115 | :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) 116 | 117 | # These test a fixed bug in my regex-tdfa that did not keep the expanded 118 | # form properly grouped, so right association did the wrong thing with 119 | # these ambiguous patterns (crafted just to test my code when I became 120 | # suspicious of my implementation). 
The first subexpression should use 121 | # "ab" then "a" then "bcd". 122 | 123 | # OS X / FreeBSD / NetBSD badly fail many of these, with impossible 124 | # results like (0,6)(4,5)(6,6). 125 | 126 | :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) 127 | :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) 128 | :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 129 | :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 130 | :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH 131 | :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) 132 | :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) 133 | :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 134 | :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 135 | :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH 136 | :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) 137 | :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) 138 | 139 | # The above worked on Linux/GLIBC but the following often fail. 140 | # They also trip up OS X / FreeBSD / NetBSD: 141 | 142 | #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) 143 | :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 144 | #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) 145 | :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 146 | #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) 147 | :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 148 | #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) 149 | :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 150 | :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH 151 | #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) 152 | :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 153 | #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) 154 | :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 155 | #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) 156 | :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 157 | #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) 158 | :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 159 | :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH 160 | #:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) 161 | :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 162 | #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) 163 | :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go 164 | -------------------------------------------------------------------------------- /src/vm.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2014 The Rust Project Developers. See the COPYRIGHT 2 | // file at the top-level directory of this distribution and at 3 | // http://rust-lang.org/COPYRIGHT. 4 | // 5 | // Licensed under the Apache License, Version 2.0 or the MIT license 7 | // , at your 8 | // option. This file may not be copied, modified, or distributed 9 | // except according to those terms. 10 | 11 | // FIXME: Currently, the VM simulates an NFA. It would be nice to have another 12 | // VM that simulates a DFA. 13 | // 14 | // According to Russ Cox[1], a DFA performs better than an NFA, principally 15 | // because it reuses states previously computed by the machine *and* doesn't 16 | // keep track of capture groups. The drawback of a DFA (aside from its 17 | // complexity) is that it can't accurately return the locations of submatches. 18 | // The NFA *can* do that. (This is my understanding anyway.) 
19 | // 20 | // Cox suggests that a DFA ought to be used to answer "does this match" and 21 | // "where does it match" questions. (In the latter, the starting position of 22 | // the match is computed by executing the regex backwards.) Cox also suggests 23 | // that a DFA should be run when asking "where are the submatches", which can 24 | // 1) quickly answer "no" is there's no match and 2) discover the substring 25 | // that matches, which means running the NFA on smaller input. 26 | // 27 | // Currently, the NFA simulation implemented below does some dirty tricks to 28 | // avoid tracking capture groups when they aren't needed (which only works 29 | // for 'is_match', not 'find'). This is a half-measure, but does provide some 30 | // perf improvement. 31 | // 32 | // AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go. 33 | // 34 | // [1] - http://swtch.com/~rsc/regex/regex3.html 35 | 36 | use std::cmp; 37 | use std::mem; 38 | use std::slice::MutableVector; 39 | use compile::{ 40 | Program, 41 | Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary, 42 | Save, Jump, Split, 43 | }; 44 | use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED}; 45 | use parse::unicode::PERLW; 46 | 47 | pub type CaptureLocs = Vec>; 48 | 49 | /// Indicates the type of match to be performed by the VM. 50 | pub enum MatchKind { 51 | /// Only checks if a match exists or not. Does not return location. 52 | Exists, 53 | /// Returns the start and end indices of the entire match in the input 54 | /// given. 55 | Location, 56 | /// Returns the start and end indices of each submatch in the input given. 57 | Submatches, 58 | } 59 | 60 | /// Runs an NFA simulation on the compiled expression given on the search text 61 | /// `input`. The search begins at byte index `start` and ends at byte index 62 | /// `end`. (The range is specified here so that zero-width assertions will work 63 | /// correctly when searching for successive non-overlapping matches.) 64 | /// 65 | /// The `which` parameter indicates what kind of capture information the caller 66 | /// wants. There are three choices: match existence only, the location of the 67 | /// entire match or the locations of the entire match in addition to the 68 | /// locations of each submatch. 69 | pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str, 70 | start: uint, end: uint) -> CaptureLocs { 71 | Nfa { 72 | which: which, 73 | prog: prog, 74 | input: input, 75 | start: start, 76 | end: end, 77 | ic: 0, 78 | chars: CharReader::new(input), 79 | }.run() 80 | } 81 | 82 | struct Nfa<'r, 't> { 83 | which: MatchKind, 84 | prog: &'r Program, 85 | input: &'t str, 86 | start: uint, 87 | end: uint, 88 | ic: uint, 89 | chars: CharReader<'t>, 90 | } 91 | 92 | /// Indicates the next action to take after a single non-empty instruction 93 | /// is processed. 94 | pub enum StepState { 95 | /// This is returned if and only if a Match instruction is reached and 96 | /// we only care about the existence of a match. It instructs the VM to 97 | /// quit early. 98 | StepMatchEarlyReturn, 99 | /// Indicates that a match was found. Thus, the rest of the states in the 100 | /// *current* queue should be dropped (i.e., leftmost-first semantics). 101 | /// States in the "next" queue can still be processed. 102 | StepMatch, 103 | /// No match was found. Continue with the next state in the queue. 
104 | StepContinue, 105 | } 106 | 107 | impl<'r, 't> Nfa<'r, 't> { 108 | fn run(&mut self) -> CaptureLocs { 109 | let ncaps = match self.which { 110 | Exists => 0, 111 | Location => 1, 112 | Submatches => self.prog.num_captures(), 113 | }; 114 | let mut matched = false; 115 | let ninsts = self.prog.insts.len(); 116 | let mut clist = &mut Threads::new(self.which, ninsts, ncaps); 117 | let mut nlist = &mut Threads::new(self.which, ninsts, ncaps); 118 | 119 | let mut groups = Vec::from_elem(ncaps * 2, None); 120 | 121 | // Determine if the expression starts with a '^' so we can avoid 122 | // simulating .*? 123 | // Make sure multi-line mode isn't enabled for it, otherwise we can't 124 | // drop the initial .*? 125 | let prefix_anchor = 126 | match *self.prog.insts.get(1) { 127 | EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, 128 | _ => false, 129 | }; 130 | 131 | self.ic = self.start; 132 | let mut next_ic = self.chars.set(self.start); 133 | while self.ic <= self.end { 134 | if clist.size == 0 { 135 | // We have a match and we're done exploring alternatives. 136 | // Time to quit. 137 | if matched { 138 | break 139 | } 140 | 141 | // If there are no threads to try, then we'll have to start 142 | // over at the beginning of the regex. 143 | // BUT, if there's a literal prefix for the program, try to 144 | // jump ahead quickly. If it can't be found, then we can bail 145 | // out early. 146 | if self.prog.prefix.len() > 0 && clist.size == 0 { 147 | let needle = self.prog.prefix.as_slice().as_bytes(); 148 | let haystack = self.input.as_bytes().slice_from(self.ic); 149 | match find_prefix(needle, haystack) { 150 | None => break, 151 | Some(i) => { 152 | self.ic += i; 153 | next_ic = self.chars.set(self.ic); 154 | } 155 | } 156 | } 157 | } 158 | 159 | // This simulates a preceding '.*?' for every regex by adding 160 | // a state starting at the current position in the input for the 161 | // beginning of the program only if we don't already have a match. 162 | if clist.size == 0 || (!prefix_anchor && !matched) { 163 | self.add(clist, 0, groups.as_mut_slice()) 164 | } 165 | 166 | // Now we try to read the next character. 167 | // As a result, the 'step' method will look at the previous 168 | // character. 
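// (Both `self.ic` and `next_ic` are byte offsets into `input`; `CharReader` advances them across whole UTF-8 code points, so `prev` and `cur` always hold complete characters.)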
169 | self.ic = next_ic; 170 | next_ic = self.chars.advance(); 171 | 172 | let mut i = 0; 173 | while i < clist.size { 174 | let pc = clist.pc(i); 175 | let step_state = self.step(groups.as_mut_slice(), nlist, 176 | clist.groups(i), pc); 177 | match step_state { 178 | StepMatchEarlyReturn => return vec![Some(0), Some(0)], 179 | StepMatch => { matched = true; clist.empty() }, 180 | StepContinue => {}, 181 | } 182 | i += 1; 183 | } 184 | mem::swap(&mut clist, &mut nlist); 185 | nlist.empty(); 186 | } 187 | match self.which { 188 | Exists if matched => vec![Some(0), Some(0)], 189 | Exists => vec![None, None], 190 | Location | Submatches => groups, 191 | } 192 | } 193 | 194 | fn step(&self, groups: &mut [Option], nlist: &mut Threads, 195 | caps: &mut [Option], pc: uint) 196 | -> StepState { 197 | match *self.prog.insts.get(pc) { 198 | Match => { 199 | match self.which { 200 | Exists => { 201 | return StepMatchEarlyReturn 202 | } 203 | Location => { 204 | groups[0] = caps[0]; 205 | groups[1] = caps[1]; 206 | return StepMatch 207 | } 208 | Submatches => { 209 | for (slot, val) in groups.mut_iter().zip(caps.iter()) { 210 | *slot = *val; 211 | } 212 | return StepMatch 213 | } 214 | } 215 | } 216 | OneChar(c, flags) => { 217 | if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) { 218 | self.add(nlist, pc+1, caps); 219 | } 220 | } 221 | CharClass(ref ranges, flags) => { 222 | if self.chars.prev.is_some() { 223 | let c = self.chars.prev.unwrap(); 224 | let negate = flags & FLAG_NEGATED > 0; 225 | let casei = flags & FLAG_NOCASE > 0; 226 | let found = ranges.as_slice(); 227 | let found = found.bsearch(|&rc| class_cmp(casei, c, rc)); 228 | let found = found.is_some(); 229 | if (found && !negate) || (!found && negate) { 230 | self.add(nlist, pc+1, caps); 231 | } 232 | } 233 | } 234 | Any(flags) => { 235 | if flags & FLAG_DOTNL > 0 236 | || !self.char_eq(false, self.chars.prev, '\n') { 237 | self.add(nlist, pc+1, caps) 238 | } 239 | } 240 | EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_) 241 | | Save(_) | Jump(_) | Split(_, _) => {}, 242 | } 243 | StepContinue 244 | } 245 | 246 | fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option]) { 247 | if nlist.contains(pc) { 248 | return 249 | } 250 | // We have to add states to the threads list even if their empty. 251 | // TL;DR - It prevents cycles. 252 | // If we didn't care about cycles, we'd *only* add threads that 253 | // correspond to non-jumping instructions (OneChar, Any, Match, etc.). 254 | // But, it's possible for valid regexs (like '(a*)*') to result in 255 | // a cycle in the instruction list. e.g., We'll keep chasing the Split 256 | // instructions forever. 257 | // So we add these instructions to our thread queue, but in the main 258 | // VM loop, we look for them but simply ignore them. 259 | // Adding them to the queue prevents them from being revisited so we 260 | // can avoid cycles (and the inevitable stack overflow). 261 | // 262 | // We make a minor optimization by indicating that the state is "empty" 263 | // so that its capture groups are not filled in. 
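// (The "empty" flag is consumed by `Threads::add` below: for empty states it skips copying the capture slots into the thread, since those states are ignored by the main loop and never produce a match on their own.)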
264 | match *self.prog.insts.get(pc) { 265 | EmptyBegin(flags) => { 266 | let multi = flags & FLAG_MULTI > 0; 267 | nlist.add(pc, groups, true); 268 | if self.chars.is_begin() 269 | || (multi && self.char_is(self.chars.prev, '\n')) { 270 | self.add(nlist, pc + 1, groups) 271 | } 272 | } 273 | EmptyEnd(flags) => { 274 | let multi = flags & FLAG_MULTI > 0; 275 | nlist.add(pc, groups, true); 276 | if self.chars.is_end() 277 | || (multi && self.char_is(self.chars.cur, '\n')) { 278 | self.add(nlist, pc + 1, groups) 279 | } 280 | } 281 | EmptyWordBoundary(flags) => { 282 | nlist.add(pc, groups, true); 283 | if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) { 284 | self.add(nlist, pc + 1, groups) 285 | } 286 | } 287 | Save(slot) => { 288 | nlist.add(pc, groups, true); 289 | match self.which { 290 | Location if slot <= 1 => { 291 | let old = groups[slot]; 292 | groups[slot] = Some(self.ic); 293 | self.add(nlist, pc + 1, groups); 294 | groups[slot] = old; 295 | } 296 | Submatches => { 297 | let old = groups[slot]; 298 | groups[slot] = Some(self.ic); 299 | self.add(nlist, pc + 1, groups); 300 | groups[slot] = old; 301 | } 302 | Exists | Location => self.add(nlist, pc + 1, groups), 303 | } 304 | } 305 | Jump(to) => { 306 | nlist.add(pc, groups, true); 307 | self.add(nlist, to, groups) 308 | } 309 | Split(x, y) => { 310 | nlist.add(pc, groups, true); 311 | self.add(nlist, x, groups); 312 | self.add(nlist, y, groups); 313 | } 314 | Match | OneChar(_, _) | CharClass(_, _) | Any(_) => { 315 | nlist.add(pc, groups, false); 316 | } 317 | } 318 | } 319 | 320 | // FIXME: For case insensitive comparisons, it uses the uppercase 321 | // character and tests for equality. IIUC, this does not generalize to 322 | // all of Unicode. I believe we need to check the entire fold for each 323 | // character. This will be easy to add if and when it gets added to Rust's 324 | // standard library. 325 | #[inline] 326 | fn char_eq(&self, casei: bool, textc: Option, regc: char) -> bool { 327 | match textc { 328 | None => false, 329 | Some(textc) => { 330 | regc == textc 331 | || (casei && regc.to_uppercase() == textc.to_uppercase()) 332 | } 333 | } 334 | } 335 | 336 | #[inline] 337 | fn char_is(&self, textc: Option, regc: char) -> bool { 338 | textc == Some(regc) 339 | } 340 | } 341 | 342 | /// CharReader is responsible for maintaining a "previous" and a "current" 343 | /// character. This one-character lookahead is necessary for assertions that 344 | /// look one character before or after the current position. 345 | pub struct CharReader<'t> { 346 | /// The previous character read. It is None only when processing the first 347 | /// character of the input. 348 | pub prev: Option, 349 | /// The current character. 350 | pub cur: Option, 351 | input: &'t str, 352 | next: uint, 353 | } 354 | 355 | impl<'t> CharReader<'t> { 356 | /// Returns a new CharReader that advances through the input given. 357 | /// Note that a CharReader has no knowledge of the range in which to search 358 | /// the input. 359 | pub fn new(input: &'t str) -> CharReader<'t> { 360 | CharReader { 361 | prev: None, 362 | cur: None, 363 | input: input, 364 | next: 0, 365 | } 366 | } 367 | 368 | /// Sets the previous and current character given any arbitrary byte 369 | /// index (at a unicode codepoint boundary). 
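/// Returns the byte index at which the character after the new current one begins (`input.len() + 1` when the position is at or past the end of the input).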
370 | #[inline] 371 | pub fn set(&mut self, ic: uint) -> uint { 372 | self.prev = None; 373 | self.cur = None; 374 | self.next = 0; 375 | 376 | if self.input.len() == 0 { 377 | return 1 378 | } 379 | if ic > 0 { 380 | let i = cmp::min(ic, self.input.len()); 381 | let prev = self.input.char_range_at_reverse(i); 382 | self.prev = Some(prev.ch); 383 | } 384 | if ic < self.input.len() { 385 | let cur = self.input.char_range_at(ic); 386 | self.cur = Some(cur.ch); 387 | self.next = cur.next; 388 | self.next 389 | } else { 390 | self.input.len() + 1 391 | } 392 | } 393 | 394 | /// Does the same as `set`, except it always advances to the next 395 | /// character in the input (and therefore does half as many UTF8 decodings). 396 | #[inline] 397 | pub fn advance(&mut self) -> uint { 398 | self.prev = self.cur; 399 | if self.next < self.input.len() { 400 | let cur = self.input.char_range_at(self.next); 401 | self.cur = Some(cur.ch); 402 | self.next = cur.next; 403 | } else { 404 | self.cur = None; 405 | self.next = self.input.len() + 1; 406 | } 407 | self.next 408 | } 409 | 410 | /// Returns true if and only if this is the beginning of the input 411 | /// (ignoring the range of the input to search). 412 | #[inline] 413 | pub fn is_begin(&self) -> bool { self.prev.is_none() } 414 | 415 | /// Returns true if and only if this is the end of the input 416 | /// (ignoring the range of the input to search). 417 | #[inline] 418 | pub fn is_end(&self) -> bool { self.cur.is_none() } 419 | 420 | /// Returns true if and only if the current position is a word boundary. 421 | /// (Ignoring the range of the input to search.) 422 | pub fn is_word_boundary(&self) -> bool { 423 | if self.is_begin() { 424 | return is_word(self.cur) 425 | } 426 | if self.is_end() { 427 | return is_word(self.prev) 428 | } 429 | (is_word(self.cur) && !is_word(self.prev)) 430 | || (is_word(self.prev) && !is_word(self.cur)) 431 | } 432 | } 433 | 434 | struct Thread { 435 | pc: uint, 436 | groups: Vec>, 437 | } 438 | 439 | struct Threads { 440 | which: MatchKind, 441 | queue: Vec, 442 | sparse: Vec, 443 | size: uint, 444 | } 445 | 446 | impl Threads { 447 | // This is using a wicked neat trick to provide constant time lookup 448 | // for threads in the queue using a sparse set. A queue of threads is 449 | // allocated once with maximal size when the VM initializes and is reused 450 | // throughout execution. That is, there should be zero allocation during 451 | // the execution of a VM. 452 | // 453 | // See http://research.swtch.com/sparse for the deets. 
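// Concretely: `sparse[pc]` stores an index into `queue`, and `pc` counts as present only when that index is below `size` *and* `queue[sparse[pc]].pc == pc`. Clearing the set is just `size = 0`; stale entries left behind in `sparse` are harmless because the back-pointer check fails for them.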
454 | fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads { 455 | Threads { 456 | which: which, 457 | queue: Vec::from_fn(num_insts, |_| { 458 | Thread { pc: 0, groups: Vec::from_elem(ncaps * 2, None) } 459 | }), 460 | sparse: Vec::from_elem(num_insts, 0u), 461 | size: 0, 462 | } 463 | } 464 | 465 | fn add(&mut self, pc: uint, groups: &[Option], empty: bool) { 466 | let t = self.queue.get_mut(self.size); 467 | t.pc = pc; 468 | match (empty, self.which) { 469 | (_, Exists) | (true, _) => {}, 470 | (false, Location) => { 471 | *t.groups.get_mut(0) = groups[0]; 472 | *t.groups.get_mut(1) = groups[1]; 473 | } 474 | (false, Submatches) => { 475 | for (slot, val) in t.groups.mut_iter().zip(groups.iter()) { 476 | *slot = *val; 477 | } 478 | } 479 | } 480 | *self.sparse.get_mut(pc) = self.size; 481 | self.size += 1; 482 | } 483 | 484 | #[inline] 485 | fn contains(&self, pc: uint) -> bool { 486 | let s = *self.sparse.get(pc); 487 | s < self.size && self.queue.get(s).pc == pc 488 | } 489 | 490 | #[inline] 491 | fn empty(&mut self) { 492 | self.size = 0; 493 | } 494 | 495 | #[inline] 496 | fn pc(&self, i: uint) -> uint { 497 | self.queue.get(i).pc 498 | } 499 | 500 | #[inline] 501 | fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option] { 502 | self.queue.get_mut(i).groups.as_mut_slice() 503 | } 504 | } 505 | 506 | /// Returns true if the character is a word character, according to the 507 | /// (Unicode friendly) Perl character class '\w'. 508 | /// Note that this is only use for testing word boundaries. The actual '\w' 509 | /// is encoded as a CharClass instruction. 510 | pub fn is_word(c: Option) -> bool { 511 | let c = match c { 512 | None => return false, 513 | Some(c) => c, 514 | }; 515 | // Try the common ASCII case before invoking binary search. 516 | match c { 517 | '_' | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' => true, 518 | _ => PERLW.bsearch(|&(start, end)| { 519 | if c >= start && c <= end { 520 | Equal 521 | } else if start > c { 522 | Greater 523 | } else { 524 | Less 525 | } 526 | }).is_some() 527 | } 528 | } 529 | 530 | /// Given a character and a single character class range, return an ordering 531 | /// indicating whether the character is less than the start of the range, 532 | /// in the range (inclusive) or greater than the end of the range. 533 | /// 534 | /// If `casei` is `true`, then this ordering is computed case insensitively. 535 | /// 536 | /// This function is meant to be used with a binary search. 537 | #[inline] 538 | fn class_cmp(casei: bool, mut textc: char, 539 | (mut start, mut end): (char, char)) -> Ordering { 540 | if casei { 541 | // FIXME: This is pretty ridiculous. All of this case conversion 542 | // can be moved outside this function: 543 | // 1) textc should be uppercased outside the bsearch. 544 | // 2) the character class itself should be uppercased either in the 545 | // parser or the compiler. 546 | // FIXME: This is too simplistic for correct Unicode support. 547 | // See also: char_eq 548 | textc = textc.to_uppercase(); 549 | start = start.to_uppercase(); 550 | end = end.to_uppercase(); 551 | } 552 | if textc >= start && textc <= end { 553 | Equal 554 | } else if start > textc { 555 | Greater 556 | } else { 557 | Less 558 | } 559 | } 560 | 561 | /// Returns the starting location of `needle` in `haystack`. 562 | /// If `needle` is not in `haystack`, then `None` is returned. 563 | /// 564 | /// Note that this is using a naive substring algorithm. 
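/// For example, searching for the prefix `bc` in the haystack `xabc` yields `Some(2)`, while searching for `zz` in the same haystack yields `None`.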
565 | #[inline] 566 | pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> { 567 | let (hlen, nlen) = (haystack.len(), needle.len()); 568 | if nlen > hlen || nlen == 0 { 569 | return None 570 | } 571 | let mut hayi = 0u; 572 | 'HAYSTACK: loop { 573 | if hayi > hlen - nlen { 574 | break 575 | } 576 | let mut nedi = 0; 577 | while nedi < nlen { 578 | if haystack[hayi+nedi] != needle[nedi] { 579 | hayi += 1; 580 | continue 'HAYSTACK 581 | } 582 | nedi += 1; 583 | } 584 | return Some(hayi) 585 | } 586 | None 587 | } 588 | --------------------------------------------------------------------------------
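Note on the `Threads` queue in vm.rs: it relies on the sparse-set technique described at http://research.swtch.com/sparse. Below is a minimal standalone sketch of that technique, written in present-day Rust rather than the 2014 dialect used in vm.rs; the names (`SparseSet`, `dense`, `insert`) are illustrative only and do not appear in the crate.

struct SparseSet {
    dense: Vec<usize>,  // members, in insertion order; only dense[..size] is meaningful
    sparse: Vec<usize>, // sparse[v] = position of v in `dense`, if v is a member
    size: usize,
}

impl SparseSet {
    fn new(capacity: usize) -> SparseSet {
        SparseSet { dense: vec![0; capacity], sparse: vec![0; capacity], size: 0 }
    }

    fn contains(&self, v: usize) -> bool {
        let i = self.sparse[v];
        // Stale garbage in `sparse` is harmless: the back-pointer check below fails for it.
        i < self.size && self.dense[i] == v
    }

    fn insert(&mut self, v: usize) {
        if !self.contains(v) {
            self.dense[self.size] = v;
            self.sparse[v] = self.size;
            self.size += 1;
        }
    }

    fn clear(&mut self) {
        // O(1) clear: old members become unreachable because `size` shrinks to zero.
        self.size = 0;
    }
}

fn main() {
    let mut set = SparseSet::new(8);
    set.insert(3);
    set.insert(5);
    assert!(set.contains(3) && set.contains(5) && !set.contains(4));
    set.clear();
    assert!(!set.contains(3) && !set.contains(5));
}

The value of the design is that `insert`, `contains`, and `clear` are all constant time and allocation-free after construction, which is what lets the VM allocate its two thread queues once and reuse them for the entire search.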