├── .bazelrc
├── BUILD.bazel
├── LICENSE
├── MODULE.bazel
├── README.md
├── WORKSPACE.bazel
├── WORKSPACE.bzlmod
├── internal_configure.bzl
├── libutf-BUILD.bazel
├── parser.yy
├── redasm.cc
├── reddot.cc
├── redgrep.cc
├── redgrep.h
├── redgrep_main.cc
├── regexp.cc
├── regexp.h
└── regexp_test.cc


/.bazelrc:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Enable the layering_check feature. Useful on Clang only.
16 | build --features=layering_check
17 | # Enable the parse_headers feature, which enforces that headers are self-contained.
18 | build --features=parse_headers
19 | 
20 | # LLVM requires C++17 at minimum.
21 | build --enable_platform_specific_config
22 | build:linux --cxxopt=-std=c++17
23 | build:macos --cxxopt=-std=c++17
24 | build:windows --cxxopt=/std:c++17
25 | 


--------------------------------------------------------------------------------
/BUILD.bazel:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | licenses(["notice"])
16 | 
17 | exports_files(["LICENSE"])
18 | 
19 | genrule(  # Generates the C++ parser sources from the bison grammar.
20 |     name = "parser",
21 |     srcs = ["parser.yy"],
22 |     outs = [
23 |         "parser.tab.cc",
24 |         "parser.tab.hh",  # Presumably written next to the .cc because parser.yy declares %header.
25 |     ],
26 |     cmd = "bison -o $(location parser.tab.cc) $<",  # Invokes whatever `bison` the host provides.
27 | )
28 | 
29 | cc_library(  # Core redgrep code shared by the test and the three binaries in this file.
30 |     name = "library",
31 |     srcs = [
32 |         "redgrep.cc",
33 |         "regexp.cc",
34 |         ":parser",  # Outputs of the genrule above (parser.tab.cc / parser.tab.hh).
35 |     ],
36 |     hdrs = [
37 |         "redgrep.h",
38 |         "regexp.h",
39 |     ],
40 |     deps = [
41 |         "@libutf//:utf",  # Target defined by libutf-BUILD.bazel.
42 |         "@local_config_llvm//:llvm",  # Target generated by internal_configure.bzl.
43 |     ],
44 | )
45 | 
46 | cc_test(  # Tests in regexp_test.cc, linked against :library.
47 |     name = "regexp_test",
48 |     srcs = ["regexp_test.cc"],
49 |     deps = [
50 |         ":library",
51 |         "@googletest//:gtest",
52 |         "@googletest//:gtest_main",  # Supplies main(), so the test file needs none.
53 |     ],
54 | )
55 | 
56 | cc_binary(  # Executable built from reddot.cc on top of :library.
57 |     name = "reddot",
58 |     srcs = ["reddot.cc"],
59 |     deps = [":library"],
60 | )
61 | 
62 | cc_binary(  # Executable built from redasm.cc on top of :library.
63 |     name = "redasm",
64 |     srcs = ["redasm.cc"],
65 |     deps = [":library"],
66 | )
67 | 
68 | cc_binary(  # The redgrep executable itself; its entry point lives in redgrep_main.cc.
69 |     name = "redgrep",
70 |     srcs = ["redgrep_main.cc"],
71 |     deps = [":library"],
72 | )
73 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/MODULE.bazel:
--------------------------------------------------------------------------------
 1 | # Copyright 2024 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | module(
16 |     name = "redgrep",
17 |     version = "0.0.0",
18 | )
19 | 
20 | internal_configure = use_extension("//:internal_configure.bzl", "internal_configure_extension")  # Extension that defines the non-registry dependencies.
21 | use_repo(internal_configure, "libutf", "local_config_llvm")  # The two repositories that extension creates.
22 | 
23 | bazel_dep(name = "googletest", version = "1.14.0.bcr.1", dev_dependency = True)  # Used only by //:regexp_test.
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # redgrep
 2 | 
 3 | ## About
 4 | 
 5 | redgrep is a grep based on regular expression derivatives. That is, it uses
 6 | regular expression derivatives to construct the DFA. It then uses LLVM to JIT
 7 | the DFA.
 8 | 
 9 | Since regular expression derivatives permit the three basic Boolean operations
10 | of disjunction (`|`), conjunction (`&`) and complement (`!`), redgrep enables
11 | you to write very powerful regular expressions very easily and guarantees to
12 | match them in linear time.
13 | 
14 | ## Building
15 | 
16 | You must have Bazel, GNU bison and either GCC or Clang.
17 | 
18 | redgrep attempts to keep up with LLVM development, so you should
19 | [get the source code and build LLVM](https://llvm.org/docs/GettingStarted.html#getting-the-source-code-and-building-llvm).
20 | (Debian and Ubuntu users might prefer to install the
21 | [nightly packages](https://apt.llvm.org/) instead.)
22 | 
23 | `llvm-config-17` must be on your `PATH`.
24 | 
25 | ## Contact
26 | 
27 | [redgrep@googlegroups.com](mailto:redgrep@googlegroups.com)
28 | 
29 | ## Disclaimer
30 | 
31 | This is not an official Google product.
32 | 


--------------------------------------------------------------------------------
/WORKSPACE.bazel:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | workspace(name = "com_github_google_redgrep")  # Only names the workspace; dependencies are declared in MODULE.bazel.
16 | 


--------------------------------------------------------------------------------
/WORKSPACE.bzlmod:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | workspace(name = "com_github_google_redgrep")  # Bazel reads this file instead of WORKSPACE.bazel when Bzlmod is enabled.
16 | 


--------------------------------------------------------------------------------
/internal_configure.bzl:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
16 | 
17 | def _which(repository_ctx, program):
18 |     """Returns the path of `program` per repository_ctx.which(), failing if it is absent."""
19 |     located = repository_ctx.which(program)
20 |     # fail() aborts evaluation, so the else-branch never actually yields a value.
21 |     return located if located else fail("Finding %r failed" % (program,))
22 | 
23 | def _execute(repository_ctx, arguments):
24 |     """Runs `arguments`; returns its stripped stdout, or fails on a nonzero exit code."""
25 |     outcome = repository_ctx.execute(arguments)
26 |     failure = "Executing %r failed: %r" % (arguments, outcome.stderr)
27 |     return outcome.stdout.strip() if outcome.return_code == 0 else fail(failure)
28 | 
29 | def _llvm_repository_impl(repository_ctx):
30 |     """Exposes the LLVM installation reported by llvm-config-17 as the `llvm` cc_library."""
31 |     llvm_config = _which(repository_ctx, "llvm-config-17")
32 |     # --libfiles prints whitespace-separated paths; an LLVM build with several
33 |     # libraries lists many, so each path must become its own `srcs` entry.
34 |     libfiles = _execute(repository_ctx, [llvm_config, "--libfiles"]).split()
35 |     includedir = _execute(repository_ctx, [llvm_config, "--includedir"])
36 |     repository_ctx.symlink("/", "ROOT")  # Lets the absolute paths above be named as ROOT/... files.
37 |     repository_ctx.file(
38 |         "BUILD.bazel",
39 |         content = """\
40 | cc_library(
41 |     name = "llvm",
42 |     srcs = {libfiles},
43 |     hdrs = glob(["ROOT" + {includedir} + "/**/*.*"]),
44 |     includes = ["ROOT" + {includedir}],
45 |     visibility = ["//visibility:public"],
46 | )
47 | """.format(libfiles = repr(["ROOT" + path for path in libfiles]), includedir = repr(includedir)),
48 |     )
49 | 
50 | _llvm_repository = repository_rule(implementation = _llvm_repository_impl)  # Takes no attributes; the implementation probes the host.
51 | 
52 | def _internal_configure_extension_impl(module_ctx):  # Declares the two repositories that MODULE.bazel imports with use_repo().
53 |     http_archive(  # NOTE(review): follows the master branch and sets no sha256, so the download is unpinned.
54 |         name = "libutf",
55 |         build_file = "//:libutf-BUILD.bazel",
56 |         strip_prefix = "libutf-master",
57 |         urls = ["https://github.com/cls/libutf/archive/master.zip"],
58 |     )
59 |     _llvm_repository(  # Wraps the host LLVM installation; see _llvm_repository_impl.
60 |         name = "local_config_llvm",
61 |     )
62 | 
63 | internal_configure_extension = module_extension(implementation = _internal_configure_extension_impl)  # Loaded by MODULE.bazel via use_extension().
64 | 


--------------------------------------------------------------------------------
/libutf-BUILD.bazel:
--------------------------------------------------------------------------------
 1 | # Copyright 2021 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | licenses(["notice"])
16 | 
17 | exports_files(["LICENSE"])
18 | 
19 | cc_library(  # Build rule overlaid on the libutf archive fetched in internal_configure.bzl.
20 |     name = "utf",
21 |     srcs = glob(["utf/*.c"]) + ["runetype/isvalidrune.c"],
22 |     hdrs = ["include/utf.h"],
23 |     includes = ["include"],  # Lets dependents write #include "utf.h", as parser.yy does.
24 |     visibility = ["//visibility:public"],
25 | )
26 | 


--------------------------------------------------------------------------------
/parser.yy:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 Google Inc. All Rights Reserved.
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //     http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | %require "3.8"  // %header (below) is only understood by Bison 3.8 and later.
 16 | %language "c++"
 17 | %define api.value.type {redgrep::Exp}  // Every semantic value is a redgrep::Exp.
 18 | %header  // Also emit the parser header next to the generated source.
 19 | %lex-param   {llvm::StringRef* str}  // Extra argument handed to yylex(): the unparsed input.
 20 | %parse-param {llvm::StringRef* str} {redgrep::Exp* exp}  // Parser arguments: the input and the result slot.
 21 | 
 22 | %code requires {  // Bison copies this block into the generated header as well as the source.
 23 | #include "llvm/ADT/StringRef.h"
 24 | #include "regexp.h"
 25 | }
 26 | 
 27 | %code {  // Unqualified %code: emitted into the generated parser source.
 28 | #include "utf.h"
 29 | namespace yy {
 30 | int yylex(redgrep::Exp* exp, llvm::StringRef* str);  // Defined in the epilogue of this file.
 31 | }  // namespace yy
 32 | }
 33 | 
 34 | %left DISJUNCTION  // Precedence rises with each declaration, so this binds loosest.
 35 | %left CONJUNCTION
 36 | %left COMPLEMENT
 37 | %right CONCATENATION  // Never returned by yylex(); referenced only through %prec.
 38 | %left QUANTIFIER
 39 | %nonassoc LEFT_PARENTHESIS RIGHT_PARENTHESIS
 40 | %nonassoc FUNDAMENTAL
 41 | %token ERROR  // Returned by yylex() for malformed input; no rule accepts it.
 42 | 
 43 | %%
 44 | 
 45 | start:  // Top-level rule: stores the whole parsed expression through the exp parse-param.
 46 |   expression
 47 |   { *exp = $1; }
 48 | 
 49 | expression:  // One alternative per operator, plus grouping and single-token operands.
 50 |   expression DISJUNCTION expression
 51 |   { $$ = redgrep::Disjunction($1, $3); }
 52 | | expression CONJUNCTION expression
 53 |   { $$ = redgrep::Conjunction($1, $3); }
 54 | | COMPLEMENT expression
 55 |   { $$ = redgrep::Complement($2); }
 56 | | expression expression %prec CONCATENATION  // Juxtaposition: there is no operator token.
 57 |   { $$ = redgrep::Concatenation($1, $2); }
 58 | | expression QUANTIFIER  // The token's value is the Quantifier-wrapped Group built by yylex().
 59 |   { redgrep::Exp sub; int min; int max;
 60 |     std::tie(sub, min, max) = $2->quantifier();
 61 |     redgrep::Mode mode; bool capture;
 62 |     std::tie(std::ignore, std::ignore, mode, capture) = sub->group();
 63 |     $$ = redgrep::Quantifier($1, min, max);
 64 |     $$ = redgrep::Group(-1, $$, mode, capture); }
 65 | | LEFT_PARENTHESIS expression RIGHT_PARENTHESIS  // Mode and capture come from the Group carried by the "(" token.
 66 |   { redgrep::Mode mode; bool capture;
 67 |     std::tie(std::ignore, std::ignore, mode, capture) = $1->group();
 68 |     $$ = redgrep::Group(-1, $2, mode, capture); }
 69 | | FUNDAMENTAL
 70 |   { $$ = $1; }
 71 | 
 72 | %%
 73 | 
 74 | // Reads one rune off the front of *input; consumes its bytes on success.
 75 | static bool Character(llvm::StringRef* input, Rune* character) {
 76 |   const int consumed = charntorune(character, input->data(), input->size());
 77 |   if (consumed <= 0) {
 78 |     return false;
 79 |   }
 80 |   *input = input->drop_front(consumed);
 81 |   return true;
 82 | }
 83 | 
 84 | // Parses the inside of a character class; the caller has consumed the '['.
 85 | // Returns true once the closing ']' is consumed, false if input runs out first.
 86 | static bool CharacterClass(llvm::StringRef* input,
 87 |                            std::set<Rune>* characters,
 88 |                            bool* complement) {
 89 |   *complement = input->startswith("^");  // A leading '^' sets *complement.
 90 |   if (*complement) {
 91 |     *input = input->drop_front(1);
 92 |   }
 93 |   Rune character;
 94 |   while (Character(input, &character)) {
 95 |     switch (character) {
 96 |       case ']':
 97 |         return true;
 98 |       case '\\':
 99 |         if (!Character(input, &character)) {  // Nothing follows the backslash.
100 |           return false;
101 |         }
102 |         switch (character) {
103 |           case 'f':
104 |             character = '\f';
105 |             break;
106 |           case 'n':
107 |             character = '\n';
108 |             break;
109 |           case 'r':
110 |             character = '\r';
111 |             break;
112 |           case 't':
113 |             character = '\t';
114 |             break;
115 |           default:  // Any other escaped rune is taken literally.
116 |             break;
117 |         }
118 |         [[fallthrough]];
119 |       default:
120 |         characters->insert(character);
121 |         break;
122 |     }
123 |   }
124 |   return false;
125 | }
126 | 
127 | // Parses the rest of a quantifier whose first rune, `character`, was already
128 | // consumed. On success stores the bounds; *max is -1 where no upper bound is given.
129 | static bool Quantifier(Rune character,
130 |                        llvm::StringRef* input,
131 |                        int* min,
132 |                        int* max) {
133 |   static constexpr char kDigits[] = "0123456789";
134 |   // Reads 1-9 leading digits, provided some non-digit byte follows them.
135 |   auto Number = [&input](int* output) -> bool {
136 |     if (input->find_first_of(kDigits) != 0) {
137 |       return false;
138 |     }
139 |     size_t digits = input->find_first_not_of(kDigits);
140 |     if (digits == llvm::StringRef::npos || digits > 9) {
141 |       return false;
142 |     }
143 |     sscanf(input->data(), "%d", output);
144 |     *input = input->drop_front(digits);
145 |     return true;
146 |   };
147 |   switch (character) {
148 |     case '*':
149 |     case '+':
150 |       *min = (character == '+') ? 1 : 0;
151 |       *max = -1;
152 |       return true;
153 |     case '?':
154 |       *min = 0;
155 |       *max = 1;
156 |       return true;
157 |     case '{': {
158 |       if (!Number(min) || *min < 0) {
159 |         return false;
160 |       }
161 |       if (input->startswith("}")) {  // {n}
162 |         *input = input->drop_front(1);
163 |         *max = *min;
164 |         return true;
165 |       }
166 |       if (!input->startswith(",")) {
167 |         return false;
168 |       }
169 |       *input = input->drop_front(1);
170 |       if (input->startswith("}")) {  // {n,}
171 |         *input = input->drop_front(1);
172 |         *max = -1;
173 |         return true;
174 |       }
175 |       if (!Number(max) || *max < *min || !input->startswith("}")) {
176 |         return false;
177 |       }
178 |       *input = input->drop_front(1);  // {n,m}
179 |       return true;
180 |     }
181 |     default:
182 |       abort();
183 |   }
184 | }
185 | 
186 | namespace yy {
187 | 
188 | // Lexer for the grammar above. Consumes one token from the front of *str and
189 | // returns its kind: 0 when no rune can be read, ERROR for malformed input.
190 | // Tokens that carry a value store it in *exp.
191 | int yylex(redgrep::Exp* exp, llvm::StringRef* str) {
192 |   Rune character;
193 |   if (!Character(str, &character)) {
194 |     return 0;
195 |   }
196 |   using TokenType = parser::token_type;
197 |   switch (character) {
198 |     case '|':
199 |       return TokenType::DISJUNCTION;
200 |     case '&':
201 |       return TokenType::CONJUNCTION;
202 |     case '!':
203 |       return TokenType::COMPLEMENT;
204 |     case '*':
205 |     case '+':
206 |     case '?':
207 |     case '{': {
208 |       int min, max;
209 |       if (!Quantifier(character, str, &min, &max)) {
210 |         return TokenType::ERROR;
211 |       }
212 |       // A '?' right after the quantifier selects kMinimal instead of kMaximal.
213 |       redgrep::Mode mode = redgrep::kMaximal;
214 |       if (str->startswith("?")) {
215 |         *str = str->drop_front(1);
216 |         mode = redgrep::kMinimal;
217 |       }
218 |       const bool capture = false;
219 |       // Somewhat perversely, we bundle the Group into the Quantifier and then
220 |       // rebundle them back in the parser action.
221 |       *exp = redgrep::Group(-1, redgrep::Byte(-1), mode, capture);
222 |       *exp = redgrep::Quantifier(*exp, min, max);
223 |       return TokenType::QUANTIFIER;
224 |     }
225 |     case '(': {
226 |       redgrep::Mode mode = redgrep::kPassive;
227 |       // "(?:" clears the capture flag; a bare "(" sets it.
228 |       const bool capture = !str->startswith("?:");
229 |       if (!capture) {
230 |         *str = str->drop_front(2);
231 |       }
232 |       *exp = redgrep::Group(-1, redgrep::Byte(-1), mode, capture);
233 |       return TokenType::LEFT_PARENTHESIS;
234 |     }
235 |     case ')':
236 |       return TokenType::RIGHT_PARENTHESIS;
237 |     case '[': {
238 |       std::set<Rune> characters;
239 |       bool complement;
240 |       if (!CharacterClass(str, &characters, &complement) ||
241 |           characters.empty()) {
242 |         return TokenType::ERROR;
243 |       }
244 |       *exp = redgrep::CharacterClass(characters, complement);
245 |       return TokenType::FUNDAMENTAL;
246 |     }
247 |     case '\\':
248 |       if (!Character(str, &character)) {
249 |         return TokenType::ERROR;
250 |       }
251 |       switch (character) {
252 |         case 'C':
253 |           *exp = redgrep::AnyByte();
254 |           return TokenType::FUNDAMENTAL;
255 |         case 'f':
256 |           character = '\f';
257 |           break;
258 |         case 'n':
259 |           character = '\n';
260 |           break;
261 |         case 'r':
262 |           character = '\r';
263 |           break;
264 |         case 't':
265 |           character = '\t';
266 |           break;
267 |         default:
268 |           break;
269 |       }
270 |       [[fallthrough]];
271 |     default:
272 |       *exp = redgrep::Character(character);
273 |       return TokenType::FUNDAMENTAL;
274 |     case '.':
275 |       *exp = redgrep::AnyCharacter();
276 |       return TokenType::FUNDAMENTAL;
277 |   }
278 | }
279 | 
// Error callback of the generated parser. The message is currently discarded,
// so this function has no effect.
void parser::error(const std::string&) {
  // TODO(junyer): Do something?
}
283 | 
284 | }  // namespace yy
285 | 


--------------------------------------------------------------------------------
/redasm.cc:
--------------------------------------------------------------------------------
 1 | // Copyright 2012 Google Inc. All Rights Reserved.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <err.h>
16 | #include <stddef.h>
17 | #include <stdint.h>
18 | #include <stdio.h>
19 | 
20 | #include <string>
21 | 
22 | #include "llvm-c/Disassembler.h"
23 | #include "llvm/ExecutionEngine/ExecutionEngine.h"
24 | #include "llvm/Support/TargetSelect.h"
25 | #include "llvm/Target/TargetMachine.h"
26 | #include "regexp.h"
27 | 
28 | int main(int argc, char** argv) {
29 |   const char* argv1 = argv[1];
30 |   if (argv1 == nullptr) {
31 |     errx(1, "regular expression not specified");
32 |   }
33 |   redgrep::Exp exp;
34 |   if (!redgrep::Parse(argv1, &exp)) {
35 |     errx(1, "parse error");
36 |   }
37 |   redgrep::DFA dfa;
38 |   int nstates = redgrep::Compile(exp, &dfa);
39 |   printf("; dfa is %d states\n", nstates);
40 |   redgrep::Fun fun;
41 |   int nbytes = redgrep::Compile(dfa, &fun);
42 |   printf("; fun is %d bytes\n", nbytes);
43 | 
44 |   std::string triple = fun.engine_->getTargetMachine()->getTargetTriple().str();
45 |   std::string cpu(fun.engine_->getTargetMachine()->getTargetCPU());
46 |   printf("; target is %s (%s)\n", triple.c_str(), cpu.c_str());
47 | 
48 |   // We need these for the disassembler.
49 |   llvm::InitializeAllTargetInfos();
50 |   llvm::InitializeAllTargets();
51 |   llvm::InitializeAllTargetMCs();
52 |   llvm::InitializeAllAsmPrinters();
53 |   llvm::InitializeAllAsmParsers();
54 |   llvm::InitializeAllDisassemblers();
55 | 
56 |   LLVMDisasmContextRef disasm = LLVMCreateDisasmCPU(
57 |       triple.c_str(), cpu.c_str(), nullptr, 0, nullptr, nullptr);
58 |   // These are increased and decreased, respectively, as we iterate.
59 |   uint8_t* addr = reinterpret_cast<uint8_t*>(fun.machine_code_addr_);
60 |   uint64_t size = fun.machine_code_size_;
61 |   // These are the bounds.
62 |   uint8_t* base = addr;
63 |   uint8_t* limit = addr + size;
64 |   while (addr < limit) {
65 |     char buf[128];
66 |     size_t len = LLVMDisasmInstruction(disasm, addr, size, 0, buf, sizeof buf);
67 |     if (len == 0) {
68 |       errx(1, "bad machine code at %td (%p)", addr - base, addr);
69 |     }
70 |     printf("%8td%s\n", addr - base, buf);
71 |     addr += len;
72 |     size -= len;
73 |   }
74 |   LLVMDisasmDispose(disasm);
75 |   return 0;
76 | }
77 | 


--------------------------------------------------------------------------------
/reddot.cc:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 Google Inc. All Rights Reserved.
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //     http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | #include <err.h>
 16 | #include <stdio.h>
 17 | #include <string.h>
 18 | #include <unistd.h>
 19 | 
 20 | #include <list>
 21 | #include <map>
 22 | #include <set>
 23 | #include <tuple>
 24 | #include <utility>
 25 | 
 26 | #include "regexp.h"
 27 | 
 28 | static void EmitHeader(const char* str) {
 29 |   printf("digraph reddot {\n");
 30 |   // TODO(junyer): Escape double quotes?
 31 |   printf("label=\"%s\"\n", str);
 32 |   printf("labelloc=\"t\"\n");
 33 | }
 34 | 
 35 | static void EmitState(int curr, const char* fillcolor) {
 36 |   printf("s%d", curr);
 37 |   printf(" [style=filled fillcolor=%s]", fillcolor);
 38 |   printf("\n");
 39 | }
 40 | 
 41 | static void EmitTransition(int curr, int next, int byte) {
 42 |   printf("s%d -> s%d [label=\"", curr, next);
 43 |   if (byte == -1) {
 44 |     printf("\" style=dashed]");
 45 |   } else {
 46 |     printf("%02X\"]", byte);
 47 |   }
 48 |   printf("\n");
 49 | }
 50 | 
 51 | static void EmitTransition(int curr, int next, int begin, int end) {
 52 |   printf("s%d -> s%d [label=\"", curr, next);
 53 |   printf("%02X-%02X\"]", begin, end);
 54 |   printf("\n");
 55 | }
 56 | 
 57 | static void EmitFooter() {
 58 |   printf("}\n");
 59 | }
 60 | 
 61 | inline void HandleImpl(const char* str, int nstates, const redgrep::FA& fa,
 62 |                        const std::set<std::tuple<int, int, int>>& transition_set) {
 63 |   EmitHeader(str);
 64 |   for (int i = 0; i < nstates; ++i) {
 65 |     int curr = i;
 66 |     if (fa.IsError(curr)) {
 67 |       // This is the error state.
 68 |       EmitState(curr, "red");
 69 |     } else if (fa.IsAccepting(curr)) {
 70 |       // This is an accepting state.
 71 |       EmitState(curr, "green");
 72 |     } else {
 73 |       // This is a normal state.
 74 |       EmitState(curr, "white");
 75 |     }
 76 |   }
 77 |   std::map<std::pair<int, int>, std::list<std::pair<int, int>>> transition_map;
 78 |   for (const auto& i : transition_set) {
 79 |     int curr; int next; int byte;
 80 |     std::tie(curr, next, byte) = i;
 81 |     if (byte == -1) {
 82 |       EmitTransition(curr, next, byte);
 83 |     } else {
 84 |       auto& range_list = transition_map[std::make_pair(curr, next)];
 85 |       if (range_list.empty() ||
 86 |           range_list.back().second + 1 != byte) {
 87 |         range_list.push_back(std::make_pair(byte, byte));
 88 |       } else {
 89 |         range_list.back().second = byte;
 90 |       }
 91 |     }
 92 |   }
 93 |   for (const auto& i : transition_map) {
 94 |     int curr = i.first.first;
 95 |     int next = i.first.second;
 96 |     const auto& range_list = i.second;
 97 |     for (const auto& j : range_list) {
 98 |       int begin = j.first;
 99 |       int end = j.second;
100 |       if (begin == end) {
101 |         EmitTransition(curr, next, begin);
102 |       } else {
103 |         EmitTransition(curr, next, begin, end);
104 |       }
105 |     }
106 |   }
107 |   EmitFooter();
108 | }
109 | 
110 | static void HandleDFA(const char* str) {
111 |   redgrep::Exp exp;
112 |   redgrep::DFA dfa;
113 |   if (!redgrep::Parse(str, &exp)) {
114 |     errx(1, "parse error");
115 |   }
116 |   int nstates = redgrep::Compile(exp, &dfa);
117 |   std::set<std::tuple<int, int, int>> transition_set;
118 |   for (const auto& i : dfa.transition_) {
119 |     int curr = i.first.first;
120 |     int byte = i.first.second;
121 |     int next = i.second;
122 |     if (!dfa.IsError(next) || byte != -1) {
123 |       transition_set.insert(std::make_tuple(curr, next, byte));
124 |     }
125 |   }
126 |   HandleImpl(str, nstates, dfa, transition_set);
127 | }
128 | 
129 | static void HandleTNFA(const char* str) {
130 |   redgrep::Exp exp;
131 |   redgrep::TNFA tnfa;
132 |   if (!redgrep::Parse(str, &exp, &tnfa.modes_, &tnfa.captures_)) {
133 |     errx(1, "parse error");
134 |   }
135 |   int nstates = redgrep::Compile(exp, &tnfa);
136 |   std::set<std::tuple<int, int, int>> transition_set;
137 |   for (const auto& i : tnfa.transition_) {
138 |     int curr = i.first.first;
139 |     int byte = i.first.second;
140 |     int next = i.second.first;
141 |     // TODO(junyer): Bindings?
142 |     if (!tnfa.IsError(next) || byte != -1) {
143 |       transition_set.insert(std::make_tuple(curr, next, byte));
144 |     }
145 |   }
146 |   HandleImpl(str, nstates, tnfa, transition_set);
147 | }
148 | 
149 | int main(int argc, char** argv) {
150 |   // Parse options.
151 |   enum {
152 |     kDFA, kTNFA, kTDFA,
153 |   } mode = kDFA;
154 |   for (;;) {
155 |     int opt = getopt(argc, argv, "m:");
156 |     if (opt == -1) {
157 |       break;
158 |     }
159 |     switch (opt) {
160 |       case 'm':
161 |         if (strcmp(optarg, "dfa") == 0) {
162 |           mode = kDFA;
163 |         } else if (strcmp(optarg, "tnfa") == 0) {
164 |           mode = kTNFA;
165 |         } else if (strcmp(optarg, "tdfa") == 0) {
166 |           mode = kTDFA;
167 |         } else {
168 |           errx(1, "invalid mode");
169 |         }
170 |         break;
171 |       default:
172 |         errx(1, "Usage: %s [OPTION]... REGEXP", argv[0]);
173 |     }
174 |   }
175 | 
176 |   if (optind == argc) {
177 |     errx(1, "regular expression not specified");
178 |   }
179 | 
180 |   switch (mode) {
181 |     case kDFA:
182 |       HandleDFA(argv[optind++]);
183 |       break;
184 |     case kTNFA:
185 |       HandleTNFA(argv[optind++]);
186 |       break;
187 |     case kTDFA:
188 |     default:
189 |       errx(1, "not implemented");
190 |   }
191 | 
192 |   return 0;
193 | }
194 | 


--------------------------------------------------------------------------------
/redgrep.cc:
--------------------------------------------------------------------------------
 1 | // Copyright 2012 Google Inc. All Rights Reserved.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include "redgrep.h"
16 | 
17 | RED::RED(llvm::StringRef str) {
18 |   redgrep::Exp exp;
19 |   ok_ = redgrep::Parse(str, &exp);
20 |   if (ok()) {
21 |     redgrep::DFA dfa;
22 |     redgrep::Compile(exp, &dfa);
23 |     redgrep::Compile(dfa, &fun_);
24 |   }
25 | }
26 | 
27 | RED::~RED() {}
28 | 
29 | bool RED::FullMatch(llvm::StringRef str, const RED& re) {
30 |   return redgrep::Match(re.fun_, str);
31 | }
32 | 


--------------------------------------------------------------------------------
/redgrep.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2012 Google Inc. All Rights Reserved.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | //     http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef REDGREP_REDGREP_H_
16 | #define REDGREP_REDGREP_H_
17 | 
18 | #include "llvm/ADT/StringRef.h"
19 | #include "regexp.h"
20 | 
21 | // Represents a regular expression.
22 | // The interface is intended to resemble that of RE and RE2.
23 | class RED {
24 |  public:
25 |   explicit RED(llvm::StringRef str);
26 |   ~RED();
27 | 
28 |   // Returns true if the RED object is usable, false otherwise.
29 |   // TODO(junyer): Plumb and expose errors from the parser.
30 |   bool ok() const { return ok_; }
31 | 
32 |   // Returns the result of matching str using re.
33 |   static bool FullMatch(llvm::StringRef str, const RED& re);
34 | 
35 |  private:
36 |   bool ok_;
37 |   redgrep::Fun fun_;
38 | 
39 |   RED(const RED&) = delete;
40 |   RED& operator=(const RED&) = delete;
41 | };
42 | 
43 | #endif  // REDGREP_REDGREP_H_
44 | 


--------------------------------------------------------------------------------
/redgrep_main.cc:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 Google Inc. All Rights Reserved.
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //     http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | #include <err.h>
 16 | #include <errno.h>
 17 | #include <stddef.h>
 18 | #include <stdio.h>
 19 | #include <stdlib.h>
 20 | #include <unistd.h>
 21 | 
 22 | #include <string>
 23 | 
 24 | #include "llvm/ADT/StringRef.h"
 25 | #include "redgrep.h"
 26 | 
 27 | static constexpr char kUsage[] =
 28 |   "Usage: %s [OPTION]... REGEXP [FILE]...\n"
 29 |   "\n"
 30 |   "Options:\n"
 31 |   "\n"
 32 |   "  -v  select non-matching lines\n"
 33 |   "  -n  print line number with output lines\n"
 34 |   "  -H  print the file name for each match\n"
 35 |   "  -h  suppress the file name prefix on output\n"
 36 |   "\n"
 37 |   "Similar to the way in which find(1) lets you construct expressions,\n"
 38 |   "REGEXP may comprise multiple subexpressions as separate arguments:\n"
 39 |   "\n"
 40 |   "  [-e] EXPR       regular expression\n"
 41 |   "  ( EXPR )        grouping\n"
 42 |   "  ! EXPR          complement\n"
 43 |   "  -not EXPR\n"
 44 |   "  EXPR & EXPR     conjunction\n"
 45 |   "  EXPR -a EXPR\n"
 46 |   "  EXPR -and EXPR\n"
 47 |   "  EXPR | EXPR     disjunction\n"
 48 |   "  EXPR -o EXPR\n"
 49 |   "  EXPR -or EXPR\n"
 50 |   "\n"
 51 |   "EXPR may begin with `^' in order to anchor it to the beginning of the\n"
 52 |   "line and may end with `$' in order to anchor it to the end of the line.\n"
 53 |   "\n";
 54 | 
 55 | int main(int argc, char** argv) {
 56 |   // Parse options.
 57 |   bool opt_invert_match = false;
 58 |   bool opt_line_number = false;
 59 |   enum {
 60 |     kAlways, kMaybe, kNever,
 61 |   } opt_with_filename = kMaybe;
 62 |   bool escape = false;
 63 |   while (!escape) {
 64 |     int opt = getopt(argc, argv, "+vnHhe:");
 65 |     if (opt == -1) {
 66 |       break;
 67 |     }
 68 |     switch (opt) {
 69 |       case 'v':
 70 |         opt_invert_match = true;
 71 |         break;
 72 |       case 'n':
 73 |         opt_line_number = true;
 74 |         break;
 75 |       case 'H':
 76 |         opt_with_filename = kAlways;
 77 |         break;
 78 |       case 'h':
 79 |         opt_with_filename = kNever;
 80 |         break;
 81 |       case 'e':
 82 |         argv[--optind] = optarg;
 83 |         escape = true;
 84 |         break;
 85 |       default:
 86 |         // TODO(junyer): Move most of the usage text to `--help'.
 87 |         fprintf(stderr, kUsage, program_invocation_short_name);
 88 |         return 2;
 89 |     }
 90 |   }
 91 | 
 92 |   // Shift off parsed options.
 93 |   argc -= optind;
 94 |   argv += optind;
 95 | 
 96 |   // Build regular expression string.
 97 |   // TODO(junyer): Factor out for testing.
 98 |   std::string re_str;
 99 |   int parens = 0;
100 |   bool complete = false;
101 |   while (argc > 0) {
102 |     std::string arg(*argv);
103 |     if (!escape && arg == "-e") {
104 |       if (complete) {
105 |         re_str += "|";
106 |       }
107 |       escape = true;
108 |       complete = false;
109 |     } else if (!escape && arg == "(") {
110 |       re_str += arg;
111 |       ++parens;
112 |     } else if (!escape && arg == ")") {
113 |       re_str += arg;
114 |       --parens;
115 |       if (parens < 0) {
116 |         errx(2, "unmatched right parenthesis");
117 |       }
118 |     } else if (!escape && (arg == "!" || arg == "-not")) {
119 |       re_str += "!";
120 |       complete = false;
121 |     } else if (!escape && (arg == "&" || arg == "-a" || arg == "-and")) {
122 |       re_str += "&";
123 |       complete = false;
124 |     } else if (!escape && (arg == "|" || arg == "-o" || arg == "-or")) {
125 |       re_str += "|";
126 |       complete = false;
127 |     } else if (escape || !complete) {
128 |       if (!arg.empty()) {
129 |         if (arg.front() == '^') {
130 |           arg = arg.substr(1);
131 |         } else {
132 |           arg = ".*" + arg;
133 |         }
134 |         if (arg.back() == '$') {
135 |           arg.back() = '\n';
136 |         } else {
137 |           arg += ".*";
138 |         }
139 |         re_str += arg;
140 |       }
141 |       escape = false;
142 |       complete = true;
143 |     } else {
144 |       break;
145 |     }
146 |     --argc;
147 |     ++argv;
148 |   }
149 | 
150 |   if (re_str.empty()) {
151 |     errx(2, "regular expression not specified");
152 |   }
153 | 
154 |   if (parens > 0) {
155 |     errx(2, "unmatched left parenthesis");
156 |   }
157 | 
158 |   if (!complete) {
159 |     errx(2, "incomplete arguments");
160 |   }
161 | 
162 |   if (opt_invert_match) {
163 |     re_str = "!(" + re_str + ")";
164 |   }
165 | 
166 |   RED re(re_str);
167 |   if (!re.ok()) {
168 |     errx(2, "parse error");
169 |   }
170 | 
171 |   // Parse files.
172 |   char const *const *files = argv;
173 |   int nfiles = argc;
174 |   if (nfiles == 0) {
175 |     static char const *const kFiles[] = { "-", nullptr, };
176 |     files = kFiles;
177 |     nfiles = 1;
178 |   }
179 | 
180 |   // Grep!
181 |   bool matched = false;
182 |   char* data = nullptr;
183 |   size_t size = 0;
184 |   for (int i = 0; i < nfiles; ++i) {
185 |     bool file_is_stdin = (files[i][0] == '-' &&
186 |                           files[i][1] == '\0');
187 |     FILE* file = (file_is_stdin
188 |                   // GNU grep lets you specify "-" more than once. To emulate
189 |                   // this, we dup stdin here so that we don't close it later.
190 |                   ? fdopen(dup(fileno(stdin)), "r")
191 |                   : fopen(files[i], "r"));
192 |     if (file == nullptr) {
193 |       warn("%s", files[i]);
194 |       continue;
195 |     }
196 |     for (int n = 1;; ++n) {
197 |       ssize_t len = getline(&data, &size, file);
198 |       if (len == -1) {
199 |         break;
200 |       }
201 |       llvm::StringRef str(data, len);
202 |       if (RED::FullMatch(str, re)) {
203 |         matched = true;
204 |         if (opt_with_filename == kAlways ||
205 |             (opt_with_filename == kMaybe && nfiles > 1)) {
206 |           printf("%s:", (file_is_stdin
207 |                          ? "(standard input)"
208 |                          : files[i]));
209 |         }
210 |         if (opt_line_number) {
211 |           printf("%d:", n);
212 |         }
213 |         printf("%.*s", static_cast<int>(len), data);
214 |       }
215 |     }
216 |     fclose(file);
217 |   }
218 |   free(data);
219 | 
220 |   // As per GNU grep, "The exit status is 0 if selected lines are found, and 1
221 |   // if not found. If an error occurred the exit status is 2."
222 |   return matched ? 0 : 1;
223 | }
224 | 


--------------------------------------------------------------------------------
/regexp.cc:
--------------------------------------------------------------------------------
   1 | // Copyright 2012 Google Inc. All Rights Reserved.
   2 | //
   3 | // Licensed under the Apache License, Version 2.0 (the "License");
   4 | // you may not use this file except in compliance with the License.
   5 | // You may obtain a copy of the License at
   6 | //
   7 | //     http://www.apache.org/licenses/LICENSE-2.0
   8 | //
   9 | // Unless required by applicable law or agreed to in writing, software
  10 | // distributed under the License is distributed on an "AS IS" BASIS,
  11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 | // See the License for the specific language governing permissions and
  13 | // limitations under the License.
  14 | 
  15 | #include "regexp.h"
  16 | 
  17 | #include <stdlib.h>
  18 | #include <string.h>
  19 | 
  20 | #include <bitset>
  21 | #include <list>
  22 | #include <map>
  23 | #include <mutex>
  24 | #include <set>
  25 | #include <tuple>
  26 | #include <utility>
  27 | #include <vector>
  28 | 
  29 | #include "llvm/ADT/StringRef.h"
  30 | #include "llvm/ExecutionEngine/ExecutionEngine.h"
  31 | #include "llvm/ExecutionEngine/JITEventListener.h"
  32 | #include "llvm/ExecutionEngine/MCJIT.h"
  33 | #include "llvm/ExecutionEngine/RuntimeDyld.h"
  34 | #include "llvm/IR/BasicBlock.h"
  35 | #include "llvm/IR/Constants.h"
  36 | #include "llvm/IR/DerivedTypes.h"
  37 | #include "llvm/IR/Function.h"
  38 | #include "llvm/IR/GlobalValue.h"
  39 | #include "llvm/IR/IRBuilder.h"
  40 | #include "llvm/IR/InstrTypes.h"
  41 | #include "llvm/IR/Instructions.h"
  42 | #include "llvm/IR/LLVMContext.h"
  43 | #include "llvm/IR/Module.h"
  44 | #include "llvm/Object/Binary.h"
  45 | #include "llvm/Object/ObjectFile.h"
  46 | #include "llvm/Object/SymbolSize.h"
  47 | #include "llvm/Object/SymbolicFile.h"
  48 | #include "llvm/Passes/PassBuilder.h"
  49 | #include "llvm/Support/Casting.h"
  50 | #include "llvm/Support/ErrorOr.h"
  51 | #include "llvm/Support/Host.h"
  52 | #include "llvm/Support/TargetSelect.h"
  53 | #include "llvm/Target/TargetMachine.h"
  54 | #include "parser.tab.hh"
  55 | #include "utf.h"
  56 | 
  57 | namespace redgrep {
  58 | 
// Converts a pointer to intptr_t. Callers wrap the argument in an extra pair
// of parentheses when it contains commas (e.g. template argument lists), since
// the preprocessor would otherwise split it into several macro arguments.
#define CAST_TO_INTPTR_T(ptr) reinterpret_cast<intptr_t>(ptr)
  60 | 
  61 | Expression::Expression(Kind kind)
  62 |     : kind_(kind),
  63 |       data_(0),
  64 |       norm_(true) {}
  65 | 
  66 | Expression::Expression(Kind kind, const std::tuple<int, Exp, Mode, bool>& group)
  67 |     : kind_(kind),
  68 |       data_(CAST_TO_INTPTR_T((new std::tuple<int, Exp, Mode, bool>(group)))),
  69 |       norm_(false) {}
  70 | 
  71 | Expression::Expression(Kind kind, int byte)
  72 |     : kind_(kind),
  73 |       data_(byte),
  74 |       norm_(true) {}
  75 | 
  76 | Expression::Expression(Kind kind, const std::pair<int, int>& byte_range)
  77 |     : kind_(kind),
  78 |       data_(CAST_TO_INTPTR_T((new std::pair<int, int>(byte_range)))),
  79 |       norm_(true) {}
  80 | 
  81 | Expression::Expression(Kind kind, const std::list<Exp>& subexpressions, bool norm)
  82 |     : kind_(kind),
  83 |       data_(CAST_TO_INTPTR_T((new std::list<Exp>(subexpressions)))),
  84 |       norm_(norm) {}
  85 | 
  86 | Expression::Expression(Kind kind, const std::pair<std::set<Rune>, bool>& character_class)
  87 |     : kind_(kind),
  88 |       data_(CAST_TO_INTPTR_T((new std::pair<std::set<Rune>, bool>(character_class)))),
  89 |       norm_(false) {}
  90 | 
  91 | Expression::Expression(Kind kind, const std::tuple<Exp, int, int>& quantifier)
  92 |     : kind_(kind),
  93 |       data_(CAST_TO_INTPTR_T((new std::tuple<Exp, int, int>(quantifier)))),
  94 |       norm_(false) {}
  95 | 
  96 | Expression::~Expression() {
  97 |   switch (kind()) {
  98 |     case kEmptySet:
  99 |     case kEmptyString:
 100 |       break;
 101 | 
 102 |     case kGroup:
 103 |       delete reinterpret_cast<std::tuple<int, Exp, Mode, bool>*>(data());
 104 |       break;
 105 | 
 106 |     case kAnyByte:
 107 |       break;
 108 | 
 109 |     case kByte:
 110 |       break;
 111 | 
 112 |     case kByteRange:
 113 |       delete reinterpret_cast<std::pair<int, int>*>(data());
 114 |       break;
 115 | 
 116 |     case kKleeneClosure:
 117 |     case kConcatenation:
 118 |     case kComplement:
 119 |     case kConjunction:
 120 |     case kDisjunction:
 121 |       delete reinterpret_cast<std::list<Exp>*>(data());
 122 |       break;
 123 | 
 124 |     case kCharacterClass:
 125 |       delete reinterpret_cast<std::pair<std::set<Rune>, bool>*>(data());
 126 |       break;
 127 | 
 128 |     case kQuantifier:
 129 |       delete reinterpret_cast<std::tuple<Exp, int, int>*>(data());
 130 |       break;
 131 |   }
 132 | }
 133 | 
 134 | const std::tuple<int, Exp, Mode, bool>& Expression::group() const {
 135 |   return *reinterpret_cast<std::tuple<int, Exp, Mode, bool>*>(data());
 136 | }
 137 | 
 138 | int Expression::byte() const {
 139 |   return data();
 140 | }
 141 | 
 142 | const std::pair<int, int>& Expression::byte_range() const {
 143 |   return *reinterpret_cast<std::pair<int, int>*>(data());
 144 | }
 145 | 
 146 | const std::list<Exp>& Expression::subexpressions() const {
 147 |   return *reinterpret_cast<std::list<Exp>*>(data());
 148 | }
 149 | 
 150 | const std::pair<std::set<Rune>, bool>& Expression::character_class() const {
 151 |   return *reinterpret_cast<std::pair<std::set<Rune>, bool>*>(data());
 152 | }
 153 | 
 154 | const std::tuple<Exp, int, int>& Expression::quantifier() const {
 155 |   return *reinterpret_cast<std::tuple<Exp, int, int>*>(data());
 156 | }
 157 | 
 158 | int Expression::Compare(Exp x, Exp y) {
 159 |   if (x->kind() < y->kind()) {
 160 |     return -1;
 161 |   }
 162 |   if (x->kind() > y->kind()) {
 163 |     return +1;
 164 |   }
 165 |   switch (x->kind()) {
 166 |     case kEmptySet:
 167 |     case kEmptyString:
 168 |       return 0;
 169 | 
 170 |     case kGroup:
 171 |       if (x->group() < y->group()) {
 172 |         return -1;
 173 |       }
 174 |       if (x->group() > y->group()) {
 175 |         return +1;
 176 |       }
 177 |       return 0;
 178 | 
 179 |     case kAnyByte:
 180 |       return 0;
 181 | 
 182 |     case kByte:
 183 |       if (x->byte() < y->byte()) {
 184 |         return -1;
 185 |       }
 186 |       if (x->byte() > y->byte()) {
 187 |         return +1;
 188 |       }
 189 |       return 0;
 190 | 
 191 |     case kByteRange:
 192 |       if (x->byte_range() < y->byte_range()) {
 193 |         return -1;
 194 |       }
 195 |       if (x->byte_range() > y->byte_range()) {
 196 |         return +1;
 197 |       }
 198 |       return 0;
 199 | 
 200 |     case kKleeneClosure:
 201 |     case kConcatenation:
 202 |     case kComplement:
 203 |     case kConjunction:
 204 |     case kDisjunction: {
 205 |       // Perform a lexicographical compare.
 206 |       std::list<Exp>::const_iterator xi = x->subexpressions().begin();
 207 |       std::list<Exp>::const_iterator yi = y->subexpressions().begin();
 208 |       while (xi != x->subexpressions().end() &&
 209 |              yi != y->subexpressions().end()) {
 210 |         int compare = Compare(*xi, *yi);
 211 |         if (compare != 0) {
 212 |           return compare;
 213 |         }
 214 |         ++xi;
 215 |         ++yi;
 216 |       }
 217 |       if (xi == x->subexpressions().end() &&
 218 |           yi != y->subexpressions().end()) {
 219 |         return -1;
 220 |       }
 221 |       if (xi != x->subexpressions().end() &&
 222 |           yi == y->subexpressions().end()) {
 223 |         return +1;
 224 |       }
 225 |       return 0;
 226 |     }
 227 | 
 228 |     case kCharacterClass:
 229 |     case kQuantifier:
 230 |       break;
 231 |   }
 232 |   abort();
 233 | }
 234 | 
 235 | Exp EmptySet() {
 236 |   Exp exp(new Expression(kEmptySet));
 237 |   return exp;
 238 | }
 239 | 
 240 | Exp EmptyString() {
 241 |   Exp exp(new Expression(kEmptyString));
 242 |   return exp;
 243 | }
 244 | 
 245 | Exp Group(const std::tuple<int, Exp, Mode, bool>& group) {
 246 |   Exp exp(new Expression(kGroup, group));
 247 |   return exp;
 248 | }
 249 | 
 250 | Exp AnyByte() {
 251 |   Exp exp(new Expression(kAnyByte));
 252 |   return exp;
 253 | }
 254 | 
 255 | Exp Byte(int byte) {
 256 |   Exp exp(new Expression(kByte, byte));
 257 |   return exp;
 258 | }
 259 | 
 260 | Exp ByteRange(const std::pair<int, int>& byte_range) {
 261 |   Exp exp(new Expression(kByteRange, byte_range));
 262 |   return exp;
 263 | }
 264 | 
 265 | Exp KleeneClosure(const std::list<Exp>& subexpressions, bool norm) {
 266 |   Exp exp(new Expression(kKleeneClosure, subexpressions, norm));
 267 |   return exp;
 268 | }
 269 | 
 270 | Exp Concatenation(const std::list<Exp>& subexpressions, bool norm) {
 271 |   Exp exp(new Expression(kConcatenation, subexpressions, norm));
 272 |   return exp;
 273 | }
 274 | 
 275 | Exp Complement(const std::list<Exp>& subexpressions, bool norm) {
 276 |   Exp exp(new Expression(kComplement, subexpressions, norm));
 277 |   return exp;
 278 | }
 279 | 
 280 | Exp Conjunction(const std::list<Exp>& subexpressions, bool norm) {
 281 |   Exp exp(new Expression(kConjunction, subexpressions, norm));
 282 |   return exp;
 283 | }
 284 | 
 285 | Exp Disjunction(const std::list<Exp>& subexpressions, bool norm) {
 286 |   Exp exp(new Expression(kDisjunction, subexpressions, norm));
 287 |   return exp;
 288 | }
 289 | 
 290 | Exp CharacterClass(const std::pair<std::set<Rune>, bool>& character_class) {
 291 |   Exp exp(new Expression(kCharacterClass, character_class));
 292 |   return exp;
 293 | }
 294 | 
 295 | Exp Quantifier(const std::tuple<Exp, int, int>& quantifier) {
 296 |   Exp exp(new Expression(kQuantifier, quantifier));
 297 |   return exp;
 298 | }
 299 | 
 300 | Exp AnyCharacter() {
 301 |   Exp b1 = ByteRange(0x00, 0x7F);  // 0xxxxxxx
 302 |   Exp bx = ByteRange(0x80, 0xBF);  // 10xxxxxx
 303 |   Exp b2 = ByteRange(0xC2, 0xDF);  // 110xxxxx
 304 |   Exp b3 = ByteRange(0xE0, 0xEF);  // 1110xxxx
 305 |   Exp b4 = ByteRange(0xF0, 0xF4);  // 11110xxx
 306 |   return Disjunction(b1,
 307 |                      Concatenation(b2, bx),
 308 |                      Concatenation(b3, bx, bx),
 309 |                      Concatenation(b4, bx, bx, bx));
 310 | }
 311 | 
 312 | Exp Character(Rune character) {
 313 |   char buf[4];
 314 |   int len = runetochar(buf, &character);
 315 |   switch (len) {
 316 |     case 1:
 317 |       return Byte(static_cast<unsigned char>(buf[0]));
 318 |     case 2:
 319 |       return Concatenation(Byte(static_cast<unsigned char>(buf[0])),
 320 |                            Byte(static_cast<unsigned char>(buf[1])));
 321 |     case 3:
 322 |       return Concatenation(Byte(static_cast<unsigned char>(buf[0])),
 323 |                            Byte(static_cast<unsigned char>(buf[1])),
 324 |                            Byte(static_cast<unsigned char>(buf[2])));
 325 |     case 4:
 326 |       return Concatenation(Byte(static_cast<unsigned char>(buf[0])),
 327 |                            Byte(static_cast<unsigned char>(buf[1])),
 328 |                            Byte(static_cast<unsigned char>(buf[2])),
 329 |                            Byte(static_cast<unsigned char>(buf[3])));
 330 |     default:
 331 |       break;
 332 |   }
 333 |   abort();
 334 | }
 335 | 
 336 | Exp Normalised(Exp exp) {
 337 |   if (exp->norm()) {
 338 |     return exp;
 339 |   }
 340 |   switch (exp->kind()) {
 341 |     case kEmptySet:
 342 |     case kEmptyString:
 343 |       return exp;
 344 | 
 345 |     case kGroup: {
 346 |       int num; Exp sub; Mode mode; bool capture;
 347 |       std::tie(num, sub, mode, capture) = exp->group();
 348 |       sub = Normalised(sub);
 349 |       if (sub->kind() == kEmptySet) {
 350 |         return EmptySet();
 351 |       }
 352 |       if (sub->kind() == kEmptyString) {
 353 |         return EmptyString();
 354 |       }
 355 |       return Group(num, sub, mode, capture);
 356 |     }
 357 | 
 358 |     case kAnyByte:
 359 |     case kByte:
 360 |     case kByteRange:
 361 |       return exp;
 362 | 
 363 |     case kKleeneClosure: {
 364 |       Exp sub = Normalised(exp->sub());
 365 |       // (r∗)∗ ≈ r∗
 366 |       if (sub->kind() == kKleeneClosure) {
 367 |         return sub;
 368 |       }
 369 |       // ∅∗ ≈ ε
 370 |       if (sub->kind() == kEmptySet) {
 371 |         return EmptyString();
 372 |       }
 373 |       // ε∗ ≈ ε
 374 |       if (sub->kind() == kEmptyString) {
 375 |         return EmptyString();
 376 |       }
 377 |       // \C∗ ≈ ¬∅
 378 |       if (sub->kind() == kAnyByte) {
 379 |         return Complement({EmptySet()}, true);
 380 |       }
 381 |       return KleeneClosure({sub}, true);
 382 |     }
 383 | 
 384 |     case kConcatenation: {
 385 |       Exp head = exp->head();
 386 |       Exp tail = exp->tail();
 387 |       // (r · s) · t ≈ r · (s · t)
 388 |       head = Normalised(head);
 389 |       while (head->kind() == kConcatenation) {
 390 |         tail = Concatenation(head->tail(), tail);
 391 |         head = head->head();
 392 |       }
 393 |       tail = Normalised(tail);
 394 |       // ∅ · r ≈ ∅
 395 |       if (head->kind() == kEmptySet) {
 396 |         return head;
 397 |       }
 398 |       // r · ∅ ≈ ∅
 399 |       if (tail->kind() == kEmptySet) {
 400 |         return tail;
 401 |       }
 402 |       // ε · r ≈ r
 403 |       if (head->kind() == kEmptyString) {
 404 |         return tail;
 405 |       }
 406 |       // r · ε ≈ r
 407 |       if (tail->kind() == kEmptyString) {
 408 |         return head;
 409 |       }
 410 |       return Concatenation({head, tail}, true);
 411 |     }
 412 | 
 413 |     case kComplement: {
 414 |       Exp sub = Normalised(exp->sub());
 415 |       // ¬(¬r) ≈ r
 416 |       if (sub->kind() == kComplement) {
 417 |         return sub->sub();
 418 |       }
 419 |       return Complement({sub}, true);
 420 |     }
 421 | 
 422 |     case kConjunction: {
 423 |       std::list<Exp> subs;
 424 |       for (Exp sub : exp->subexpressions()) {
 425 |         sub = Normalised(sub);
 426 |         // ∅ & r ≈ ∅
 427 |         // r & ∅ ≈ ∅
 428 |         if (sub->kind() == kEmptySet) {
 429 |           return sub;
 430 |         }
 431 |         // (r & s) & t ≈ r & (s & t)
 432 |         if (sub->kind() == kConjunction) {
 433 |           std::list<Exp> copy = sub->subexpressions();
 434 |           subs.splice(subs.end(), copy);
 435 |         } else {
 436 |           subs.push_back(sub);
 437 |         }
 438 |       }
 439 |       // r & s ≈ s & r
 440 |       subs.sort();
 441 |       // r & r ≈ r
 442 |       subs.unique();
 443 |       // ¬∅ & r ≈ r
 444 |       // r & ¬∅ ≈ r
 445 |       subs.remove_if([&subs](Exp sub) -> bool {
 446 |         return (sub->kind() == kComplement &&
 447 |                 sub->sub()->kind() == kEmptySet &&
 448 |                 subs.size() > 1);
 449 |       });
 450 |       if (subs.size() == 1) {
 451 |         return subs.front();
 452 |       }
 453 |       return Conjunction(subs, true);
 454 |     }
 455 | 
 456 |     case kDisjunction: {
 457 |       std::list<Exp> subs;
 458 |       for (Exp sub : exp->subexpressions()) {
 459 |         sub = Normalised(sub);
 460 |         // ¬∅ + r ≈ ¬∅
 461 |         // r + ¬∅ ≈ ¬∅
 462 |         if (sub->kind() == kComplement &&
 463 |             sub->sub()->kind() == kEmptySet) {
 464 |           return sub;
 465 |         }
 466 |         // (r + s) + t ≈ r + (s + t)
 467 |         if (sub->kind() == kDisjunction) {
 468 |           std::list<Exp> copy = sub->subexpressions();
 469 |           subs.splice(subs.end(), copy);
 470 |         } else {
 471 |           subs.push_back(sub);
 472 |         }
 473 |       }
 474 |       // r + s ≈ s + r
 475 |       subs.sort();
 476 |       // r + r ≈ r
 477 |       subs.unique();
 478 |       // ∅ + r ≈ r
 479 |       // r + ∅ ≈ r
 480 |       subs.remove_if([&subs](Exp sub) -> bool {
 481 |         return (sub->kind() == kEmptySet &&
 482 |                 subs.size() > 1);
 483 |       });
 484 |       if (subs.size() == 1) {
 485 |         return subs.front();
 486 |       }
 487 |       return Disjunction(subs, true);
 488 |     }
 489 | 
 490 |     case kCharacterClass:
 491 |     case kQuantifier:
 492 |       break;
 493 |   }
 494 |   abort();
 495 | }
 496 | 
 497 | bool IsNullable(Exp exp) {
 498 |   switch (exp->kind()) {
 499 |     case kEmptySet:
 500 |       // ν(∅) = ∅
 501 |       return false;
 502 | 
 503 |     case kEmptyString:
 504 |       // ν(ε) = ε
 505 |       return true;
 506 | 
 507 |     case kGroup:
 508 |       return IsNullable(std::get<1>(exp->group()));
 509 | 
 510 |     case kAnyByte:
 511 |       // ν(\C) = ∅
 512 |       return false;
 513 | 
 514 |     case kByte:
 515 |       // ν(a) = ∅
 516 |       return false;
 517 | 
 518 |     case kByteRange:
 519 |       // ν(S) = ∅
 520 |       return false;
 521 | 
 522 |     case kKleeneClosure:
 523 |       // ν(r∗) = ε
 524 |       return true;
 525 | 
 526 |     case kConcatenation:
 527 |       // ν(r · s) = ν(r) & ν(s)
 528 |       return IsNullable(exp->head()) && IsNullable(exp->tail());
 529 | 
 530 |     case kComplement:
 531 |       // ν(¬r) = ∅ if ν(r) = ε
 532 |       //         ε if ν(r) = ∅
 533 |       return !IsNullable(exp->sub());
 534 | 
 535 |     case kConjunction:
 536 |       // ν(r & s) = ν(r) & ν(s)
 537 |       for (Exp sub : exp->subexpressions()) {
 538 |         if (!IsNullable(sub)) {
 539 |           return false;
 540 |         }
 541 |       }
 542 |       return true;
 543 | 
 544 |     case kDisjunction:
 545 |       // ν(r + s) = ν(r) + ν(s)
 546 |       for (Exp sub : exp->subexpressions()) {
 547 |         if (IsNullable(sub)) {
 548 |           return true;
 549 |         }
 550 |       }
 551 |       return false;
 552 | 
 553 |     case kCharacterClass:
 554 |     case kQuantifier:
 555 |       break;
 556 |   }
 557 |   abort();
 558 | }
 559 | 
 560 | Exp Derivative(Exp exp, int byte) {
 561 |   switch (exp->kind()) {
 562 |     case kEmptySet:
 563 |       // ∂a∅ = ∅
 564 |       return EmptySet();
 565 | 
 566 |     case kEmptyString:
 567 |       // ∂aε = ∅
 568 |       return EmptySet();
 569 | 
 570 |     case kGroup:
 571 |       // This should never happen.
 572 |       break;
 573 | 
 574 |     case kAnyByte:
 575 |       // ∂a\C = ε
 576 |       return EmptyString();
 577 | 
 578 |     case kByte:
 579 |       // ∂aa = ε
 580 |       // ∂ab = ∅ for b ≠ a
 581 |       if (exp->byte() == byte) {
 582 |         return EmptyString();
 583 |       } else {
 584 |         return EmptySet();
 585 |       }
 586 | 
 587 |     case kByteRange:
 588 |       // ∂aS = ε if a ∈ S
 589 |       //       ∅ if a ∉ S
 590 |       if (exp->byte_range().first <= byte &&
 591 |           byte <= exp->byte_range().second) {
 592 |         return EmptyString();
 593 |       } else {
 594 |         return EmptySet();
 595 |       }
 596 | 
 597 |     case kKleeneClosure:
 598 |       // ∂a(r∗) = ∂ar · r∗
 599 |       return Concatenation(Derivative(exp->sub(), byte),
 600 |                            exp);
 601 | 
 602 |     case kConcatenation:
 603 |       // ∂a(r · s) = ∂ar · s + ν(r) · ∂as
 604 |       if (IsNullable(exp->head())) {
 605 |         return Disjunction(Concatenation(Derivative(exp->head(), byte),
 606 |                                          exp->tail()),
 607 |                            Derivative(exp->tail(), byte));
 608 |       } else {
 609 |         return Concatenation(Derivative(exp->head(), byte),
 610 |                              exp->tail());
 611 |       }
 612 | 
 613 |     case kComplement:
 614 |       // ∂a(¬r) = ¬(∂ar)
 615 |       return Complement(Derivative(exp->sub(), byte));
 616 | 
 617 |     case kConjunction: {
 618 |       // ∂a(r & s) = ∂ar & ∂as
 619 |       std::list<Exp> subs;
 620 |       for (Exp sub : exp->subexpressions()) {
 621 |         sub = Derivative(sub, byte);
 622 |         subs.push_back(sub);
 623 |       }
 624 |       return Conjunction(subs, false);
 625 |     }
 626 | 
 627 |     case kDisjunction: {
 628 |       // ∂a(r + s) = ∂ar + ∂as
 629 |       std::list<Exp> subs;
 630 |       for (Exp sub : exp->subexpressions()) {
 631 |         sub = Derivative(sub, byte);
 632 |         subs.push_back(sub);
 633 |       }
 634 |       return Disjunction(subs, false);
 635 |     }
 636 | 
 637 |     case kCharacterClass:
 638 |     case kQuantifier:
 639 |       break;
 640 |   }
 641 |   abort();
 642 | }
 643 | 
 644 | Outer Denormalised(Exp exp) {
 645 |   Outer outer(new OuterSet);
 646 |   exp = Normalised(exp);
 647 |   if (exp->kind() != kDisjunction) {
 648 |     exp = Disjunction({exp}, false);
 649 |   }
 650 |   for (Exp sub : exp->subexpressions()) {
 651 |     if (sub->kind() != kConjunction) {
 652 |       sub = Conjunction({sub}, false);
 653 |     }
 654 |     outer->push_back(std::make_pair(sub, Bindings({})));
 655 |   }
 656 |   return outer;
 657 | }
 658 | 
 659 | Outer PartialConcatenation(Outer x, Exp y, const Bindings& initial) {
 660 |   // We mutate x as an optimisation.
 661 |   for (auto& xi : *x) {
 662 |     std::list<Exp> subs;
 663 |     for (Exp sub : xi.first->subexpressions()) {
 664 |       sub = Concatenation(sub, y);
 665 |       subs.push_back(sub);
 666 |     }
 667 |     xi.first = Conjunction(subs, false);
 668 |     xi.second.insert(xi.second.begin(), initial.begin(), initial.end());
 669 |   }
 670 |   return x;
 671 | }
 672 | 
 673 | Outer PartialComplement(Outer x) {
 674 |   Outer outer(nullptr);
 675 |   for (const auto& xi : *x) {
 676 |     Outer tmp(new OuterSet);
 677 |     for (Exp sub : xi.first->subexpressions()) {
 678 |       sub = Complement(sub);
 679 |       sub = Conjunction({sub}, false);
 680 |       tmp->push_back(std::make_pair(sub, Bindings({})));
 681 |     }
 682 |     if (outer == nullptr) {
 683 |       outer = std::move(tmp);
 684 |     } else {
 685 |       outer = PartialConjunction(std::move(outer), std::move(tmp));
 686 |     }
 687 |   }
 688 |   return outer;
 689 | }
 690 | 
 691 | Outer PartialConjunction(Outer x, Outer y) {
 692 |   Outer outer(new OuterSet);
 693 |   for (const auto& xi : *x) {
 694 |     for (const auto& yi : *y) {
 695 |       Exp sub = Conjunction(xi.first, yi.first);
 696 |       Bindings bindings;
 697 |       bindings.insert(bindings.end(), xi.second.begin(), xi.second.end());
 698 |       bindings.insert(bindings.end(), yi.second.begin(), yi.second.end());
 699 |       outer->push_back(std::make_pair(sub, bindings));
 700 |     }
 701 |   }
 702 |   return outer;
 703 | }
 704 | 
 705 | Outer PartialDisjunction(Outer x, Outer y) {
 706 |   // We mutate x as an optimisation.
 707 |   x->insert(x->end(), y->begin(), y->end());
 708 |   return x;
 709 | }
 710 | 
 711 | // Computes the cancel Bindings for exp.
 712 | static void CancelBindings(Exp exp, Bindings* bindings) {
 713 |   switch (exp->kind()) {
 714 |     case kEmptySet:
 715 |     case kEmptyString:
 716 |       return;
 717 | 
 718 |     case kGroup: {
 719 |       int num; Exp sub;
 720 |       std::tie(num, sub, std::ignore, std::ignore) = exp->group();
 721 |       bindings->push_back(std::make_pair(num, kCancel));
 722 |       CancelBindings(sub, bindings);
 723 |       return;
 724 |     }
 725 | 
 726 |     case kAnyByte:
 727 |     case kByte:
 728 |     case kByteRange:
 729 |       return;
 730 | 
 731 |     case kKleeneClosure:
 732 |       CancelBindings(exp->sub(), bindings);
 733 |       return;
 734 | 
 735 |     case kConcatenation:
 736 |       CancelBindings(exp->head(), bindings);
 737 |       CancelBindings(exp->tail(), bindings);
 738 |       return;
 739 | 
 740 |     case kComplement:
 741 |       return;
 742 | 
 743 |     case kConjunction:
 744 |     case kDisjunction:
 745 |       for (Exp sub : exp->subexpressions()) {
 746 |         CancelBindings(sub, bindings);
 747 |       }
 748 |       return;
 749 | 
 750 |     case kCharacterClass:
 751 |     case kQuantifier:
 752 |       break;
 753 |   }
 754 |   abort();
 755 | }
 756 | 
 757 | // Computes the epsilon Bindings for exp.
 758 | static void EpsilonBindings(Exp exp, Bindings* bindings) {
 759 |   switch (exp->kind()) {
 760 |     case kEmptySet:
 761 |     case kEmptyString:
 762 |       return;
 763 | 
 764 |     case kGroup: {
 765 |       int num; Exp sub;
 766 |       std::tie(num, sub, std::ignore, std::ignore) = exp->group();
 767 |       bindings->push_back(std::make_pair(num, kEpsilon));
 768 |       EpsilonBindings(sub, bindings);
 769 |       return;
 770 |     }
 771 | 
 772 |     case kAnyByte:
 773 |     case kByte:
 774 |     case kByteRange:
 775 |       return;
 776 | 
 777 |     case kKleeneClosure:
 778 |       if (IsNullable(exp->sub())) {
 779 |         EpsilonBindings(exp->sub(), bindings);
 780 |       }
 781 |       return;
 782 | 
 783 |     case kConcatenation:
 784 |       EpsilonBindings(exp->head(), bindings);
 785 |       EpsilonBindings(exp->tail(), bindings);
 786 |       return;
 787 | 
 788 |     case kComplement:
 789 |       return;
 790 | 
 791 |     case kConjunction:
 792 |       for (Exp sub : exp->subexpressions()) {
 793 |         EpsilonBindings(sub, bindings);
 794 |       }
 795 |       return;
 796 | 
 797 |     case kDisjunction:
 798 |       for (Exp sub : exp->subexpressions()) {
 799 |         if (IsNullable(sub)) {
 800 |           EpsilonBindings(sub, bindings);
 801 |           return;
 802 |         }
 803 |       }
 804 |       return;
 805 | 
 806 |     case kCharacterClass:
 807 |     case kQuantifier:
 808 |       break;
 809 |   }
 810 |   abort();
 811 | }
 812 | 
 813 | Outer Partial(Exp exp, int byte) {
 814 |   switch (exp->kind()) {
 815 |     case kEmptySet:
 816 |       // ∂a∅ = ∅
 817 |       return Denormalised(EmptySet());
 818 | 
 819 |     case kEmptyString:
 820 |       // ∂aε = ∅
 821 |       return Denormalised(EmptySet());
 822 | 
 823 |     case kGroup: {
 824 |       int num; Exp sub; Mode mode; bool capture;
 825 |       std::tie(num, sub, mode, capture) = exp->group();
 826 |       Outer outer = Partial(sub, byte);
 827 |       for (auto& i : *outer) {
 828 |         i.first = Group(num, i.first, mode, capture);
 829 |         i.first = Conjunction({i.first}, false);
 830 |         i.second.push_back(std::make_pair(num, kAppend));
 831 |       }
 832 |       return outer;
 833 |     }
 834 | 
 835 |     case kAnyByte:
 836 |       // ∂a\C = ε
 837 |       return Denormalised(EmptyString());
 838 | 
 839 |     case kByte:
 840 |       // ∂aa = ε
 841 |       // ∂ab = ∅ for b ≠ a
 842 |       if (exp->byte() == byte) {
 843 |         return Denormalised(EmptyString());
 844 |       } else {
 845 |         return Denormalised(EmptySet());
 846 |       }
 847 | 
 848 |     case kByteRange:
 849 |       // ∂aS = ε if a ∈ S
 850 |       //       ∅ if a ∉ S
 851 |       if (exp->byte_range().first <= byte &&
 852 |           byte <= exp->byte_range().second) {
 853 |         return Denormalised(EmptyString());
 854 |       } else {
 855 |         return Denormalised(EmptySet());
 856 |       }
 857 | 
 858 |     case kKleeneClosure: {
 859 |       // ∂a(r∗) = ∂ar · r∗
 860 |       Bindings cancel;
 861 |       CancelBindings(exp->sub(), &cancel);
 862 |       return PartialConcatenation(Partial(exp->sub(), byte),
 863 |                                   exp,
 864 |                                   cancel);
 865 |     }
 866 | 
 867 |     case kConcatenation:
 868 |       // ∂a(r · s) = ∂ar · s + ν(r) · ∂as
 869 |       if (IsNullable(exp->head())) {
 870 |         Bindings epsilon;
 871 |         EpsilonBindings(exp->head(), &epsilon);
 872 |         return PartialDisjunction(
 873 |             PartialConcatenation(Partial(exp->head(), byte),
 874 |                                  exp->tail(),
 875 |                                  Bindings({})),
 876 |             PartialConcatenation(Partial(exp->tail(), byte),
 877 |                                  EmptyString(),
 878 |                                  epsilon));
 879 |       } else {
 880 |         return PartialConcatenation(Partial(exp->head(), byte),
 881 |                                     exp->tail(),
 882 |                                     Bindings({}));
 883 |       }
 884 | 
 885 |     case kComplement:
 886 |       // ∂a(¬r) = ¬(∂ar)
 887 |       return PartialComplement(Partial(exp->sub(), byte));
 888 | 
 889 |     case kConjunction: {
 890 |       // ∂a(r & s) = ∂ar & ∂as
 891 |       Outer outer(nullptr);
 892 |       for (Exp sub : exp->subexpressions()) {
 893 |         Outer tmp = Partial(sub, byte);
 894 |         if (outer == nullptr) {
 895 |           outer = std::move(tmp);
 896 |         } else {
 897 |           outer = PartialConjunction(std::move(outer), std::move(tmp));
 898 |         }
 899 |       }
 900 |       return outer;
 901 |     }
 902 | 
 903 |     case kDisjunction: {
 904 |       // ∂a(r + s) = ∂ar + ∂as
 905 |       Outer outer(nullptr);
 906 |       for (Exp sub : exp->subexpressions()) {
 907 |         Outer tmp = Partial(sub, byte);
 908 |         if (outer == nullptr) {
 909 |           outer = std::move(tmp);
 910 |         } else {
 911 |           outer = PartialDisjunction(std::move(outer), std::move(tmp));
 912 |         }
 913 |       }
 914 |       return outer;
 915 |     }
 916 | 
 917 |     case kCharacterClass:
 918 |     case kQuantifier:
 919 |       break;
 920 |   }
 921 |   abort();
 922 | }
 923 | 
 924 | // Outputs the partitions obtained by intersecting the partitions in x and y.
 925 | // The first partition should be Σ-based. Any others should be ∅-based.
 926 | static void Intersection(const std::list<std::bitset<256>>& x,
 927 |                          const std::list<std::bitset<256>>& y,
 928 |                          std::list<std::bitset<256>>* z) {
 929 |   for (std::list<std::bitset<256>>::const_iterator xi = x.begin();
 930 |        xi != x.end();
 931 |        ++xi) {
 932 |     for (std::list<std::bitset<256>>::const_iterator yi = y.begin();
 933 |          yi != y.end();
 934 |          ++yi) {
 935 |       std::bitset<256> bs;
 936 |       if (xi == x.begin()) {
 937 |         if (yi == y.begin()) {
 938 |           // Perform set union: *xi is Σ-based, *yi is Σ-based.
 939 |           bs = *xi | *yi;
 940 |           // bs is Σ-based, so it can be empty.
 941 |           z->push_back(bs);
 942 |         } else {
 943 |           // Perform set difference: *xi is Σ-based, *yi is ∅-based.
 944 |           bs = *yi & ~*xi;
 945 |           if (bs.any()) {
 946 |             z->push_back(bs);
 947 |           }
 948 |         }
 949 |       } else {
 950 |         if (yi == y.begin()) {
 951 |           // Perform set difference: *xi is ∅-based, *yi is Σ-based.
 952 |           bs = *xi & ~*yi;
 953 |           if (bs.any()) {
 954 |             z->push_back(bs);
 955 |           }
 956 |         } else {
 957 |           // Perform set intersection: *xi is ∅-based, *yi is ∅-based.
 958 |           bs = *yi & *xi;
 959 |           if (bs.any()) {
 960 |             z->push_back(bs);
 961 |           }
 962 |         }
 963 |       }
 964 |     }
 965 |   }
 966 | }
 967 | 
 968 | void Partitions(Exp exp, std::list<std::bitset<256>>* partitions) {
 969 |   switch (exp->kind()) {
 970 |     case kEmptySet:
 971 |       // C(∅) = {Σ}
 972 |       partitions->push_back({});
 973 |       return;
 974 | 
 975 |     case kEmptyString:
 976 |       // C(ε) = {Σ}
 977 |       partitions->push_back({});
 978 |       return;
 979 | 
 980 |     case kGroup:
 981 |       Partitions(std::get<1>(exp->group()), partitions);
 982 |       return;
 983 | 
 984 |     case kAnyByte:
 985 |       // C(\C) = {Σ}
 986 |       partitions->push_back({});
 987 |       return;
 988 | 
 989 |     case kByte: {
 990 |       // C(a) = {Σ \ a, a}
 991 |       std::bitset<256> bs;
 992 |       bs.set(exp->byte());
 993 |       partitions->push_back(bs);
 994 |       partitions->push_back(bs);
 995 |       return;
 996 |     }
 997 | 
 998 |     case kByteRange: {
 999 |       // C(S) = {Σ \ S, S}
1000 |       std::bitset<256> bs;
1001 |       for (int i = exp->byte_range().first;
1002 |            i <= exp->byte_range().second;
1003 |            ++i) {
1004 |         bs.set(i);
1005 |       }
1006 |       partitions->push_back(bs);
1007 |       partitions->push_back(bs);
1008 |       return;
1009 |     }
1010 | 
1011 |     case kKleeneClosure:
1012 |       // C(r∗) = C(r)
1013 |       Partitions(exp->sub(), partitions);
1014 |       return;
1015 | 
1016 |     case kConcatenation:
1017 |       // C(r · s) = C(r) ∧ C(s) if ν(r) = ε
1018 |       //            C(r)        if ν(r) = ∅
1019 |       if (IsNullable(exp->head())) {
1020 |         std::list<std::bitset<256>> x, y;
1021 |         Partitions(exp->head(), &x);
1022 |         Partitions(exp->tail(), &y);
1023 |         Intersection(x, y, partitions);
1024 |         return;
1025 |       } else {
1026 |         Partitions(exp->head(), partitions);
1027 |         return;
1028 |       }
1029 | 
1030 |     case kComplement:
1031 |       // C(¬r) = C(r)
1032 |       Partitions(exp->sub(), partitions);
1033 |       return;
1034 | 
1035 |     case kConjunction:
1036 |       // C(r & s) = C(r) ∧ C(s)
1037 |       for (Exp sub : exp->subexpressions()) {
1038 |         if (partitions->empty()) {
1039 |           Partitions(sub, partitions);
1040 |         } else {
1041 |           std::list<std::bitset<256>> x, y;
1042 |           partitions->swap(x);
1043 |           Partitions(sub, &y);
1044 |           Intersection(x, y, partitions);
1045 |         }
1046 |       }
1047 |       return;
1048 | 
1049 |     case kDisjunction:
1050 |       // C(r + s) = C(r) ∧ C(s)
1051 |       for (Exp sub : exp->subexpressions()) {
1052 |         if (partitions->empty()) {
1053 |           Partitions(sub, partitions);
1054 |         } else {
1055 |           std::list<std::bitset<256>> x, y;
1056 |           partitions->swap(x);
1057 |           Partitions(sub, &y);
1058 |           Intersection(x, y, partitions);
1059 |         }
1060 |       }
1061 |       return;
1062 | 
1063 |     case kCharacterClass:
1064 |     case kQuantifier:
1065 |       break;
1066 |   }
1067 |   abort();
1068 | }
1069 | 
1070 | // A simple framework for implementing the post-parse rewrites.
1071 | class Walker {
1072 |  public:
1073 |   Walker() {}
1074 |   virtual ~Walker() {}
1075 | 
1076 |   virtual Exp WalkGroup(Exp exp) {
1077 |     int num; Exp sub; Mode mode; bool capture;
1078 |     std::tie(num, sub, mode, capture) = exp->group();
1079 |     sub = Walk(sub);
1080 |     return Group(num, sub, mode, capture);
1081 |   }
1082 | 
1083 |   virtual Exp WalkKleeneClosure(Exp exp) {
1084 |     Exp sub = Walk(exp->sub());
1085 |     return KleeneClosure(sub);
1086 |   }
1087 | 
1088 |   virtual Exp WalkConcatenation(Exp exp) {
1089 |     Exp head = Walk(exp->head());
1090 |     Exp tail = Walk(exp->tail());
1091 |     return Concatenation(head, tail);
1092 |   }
1093 | 
1094 |   virtual Exp WalkComplement(Exp exp) {
1095 |     Exp sub = Walk(exp->sub());
1096 |     return Complement(sub);
1097 |   }
1098 | 
1099 |   virtual Exp WalkConjunction(Exp exp) {
1100 |     std::list<Exp> subs;
1101 |     for (Exp sub : exp->subexpressions()) {
1102 |       sub = Walk(sub);
1103 |       subs.push_back(sub);
1104 |     }
1105 |     return Conjunction(subs, false);
1106 |   }
1107 | 
1108 |   virtual Exp WalkDisjunction(Exp exp) {
1109 |     std::list<Exp> subs;
1110 |     for (Exp sub : exp->subexpressions()) {
1111 |       sub = Walk(sub);
1112 |       subs.push_back(sub);
1113 |     }
1114 |     return Disjunction(subs, false);
1115 |   }
1116 | 
1117 |   virtual Exp WalkCharacterClass(Exp exp) {
1118 |     return exp;
1119 |   }
1120 | 
1121 |   virtual Exp WalkQuantifier(Exp exp) {
1122 |     Exp sub; int min; int max;
1123 |     std::tie(sub, min, max) = exp->quantifier();
1124 |     sub = Walk(sub);
1125 |     return Quantifier(sub, min, max);
1126 |   }
1127 | 
1128 |   Exp Walk(Exp exp) {
1129 |     switch (exp->kind()) {
1130 |       case kEmptySet:
1131 |       case kEmptyString:
1132 |         return exp;
1133 | 
1134 |       case kGroup:
1135 |         return WalkGroup(exp);
1136 | 
1137 |       case kAnyByte:
1138 |       case kByte:
1139 |       case kByteRange:
1140 |         return exp;
1141 | 
1142 |       case kKleeneClosure:
1143 |         return WalkKleeneClosure(exp);
1144 | 
1145 |       case kConcatenation:
1146 |         return WalkConcatenation(exp);
1147 | 
1148 |       case kComplement:
1149 |         return WalkComplement(exp);
1150 | 
1151 |       case kConjunction:
1152 |         return WalkConjunction(exp);
1153 | 
1154 |       case kDisjunction:
1155 |         return WalkDisjunction(exp);
1156 | 
1157 |       case kCharacterClass:
1158 |         return WalkCharacterClass(exp);
1159 | 
1160 |       case kQuantifier:
1161 |         return WalkQuantifier(exp);
1162 |     }
1163 |     abort();
1164 |   }
1165 | 
1166 |  private:
1167 |   Walker(const Walker&) = delete;
1168 |   Walker& operator=(const Walker&) = delete;
1169 | };
1170 | 
1171 | class FlattenConjunctionsAndDisjunctions : public Walker {
1172 |  public:
1173 |   FlattenConjunctionsAndDisjunctions() {}
1174 |   ~FlattenConjunctionsAndDisjunctions() override {}
1175 | 
1176 |   inline void FlattenImpl(Exp exp, std::list<Exp>* subs) {
1177 |     Kind kind = exp->kind();
1178 |     // In most cases, exp is a left-skewed binary tree.
1179 |     while (exp->kind() == kind &&
1180 |            exp->subexpressions().size() == 2) {
1181 |       subs->push_front(exp->tail());
1182 |       exp = exp->head();
1183 |     }
1184 |     if (exp->kind() == kind) {
1185 |       std::list<Exp> copy = exp->subexpressions();
1186 |       subs->splice(subs->begin(), copy);
1187 |     } else {
1188 |       subs->push_front(exp);
1189 |     }
1190 |     std::list<Exp>::iterator i = subs->begin();
1191 |     while (i != subs->end()) {
1192 |       Exp sub = *i;
1193 |       sub = Walk(sub);
1194 |       if (sub->kind() == kind) {
1195 |         std::list<Exp> copy = sub->subexpressions();
1196 |         subs->splice(i, copy);
1197 |         i = subs->erase(i);
1198 |       } else {
1199 |         *i = sub;
1200 |         ++i;
1201 |       }
1202 |     }
1203 |   }
1204 | 
1205 |   Exp WalkConjunction(Exp exp) override {
1206 |     std::list<Exp> subs;
1207 |     FlattenImpl(exp, &subs);
1208 |     return Conjunction(subs, false);
1209 |   }
1210 | 
1211 |   Exp WalkDisjunction(Exp exp) override {
1212 |     std::list<Exp> subs;
1213 |     FlattenImpl(exp, &subs);
1214 |     return Disjunction(subs, false);
1215 |   }
1216 | 
1217 |  private:
1218 |   FlattenConjunctionsAndDisjunctions(const FlattenConjunctionsAndDisjunctions&) = delete;
1219 |   FlattenConjunctionsAndDisjunctions& operator=(const FlattenConjunctionsAndDisjunctions&) = delete;
1220 | };
1221 | 
1222 | class StripGroups : public Walker {
1223 |  public:
1224 |   StripGroups() {}
1225 |   ~StripGroups() override {}
1226 | 
1227 |   Exp WalkGroup(Exp exp) override {
1228 |     Exp sub = Walk(std::get<1>(exp->group()));
1229 |     return sub;
1230 |   }
1231 | 
1232 |  private:
1233 |   StripGroups(const StripGroups&) = delete;
1234 |   StripGroups& operator=(const StripGroups&) = delete;
1235 | };
1236 | 
1237 | class ApplyGroups : public Walker {
1238 |  public:
1239 |   ApplyGroups() {}
1240 |   ~ApplyGroups() override {}
1241 | 
1242 |   Exp WalkComplement(Exp exp) override {
1243 |     Exp sub = Walk(exp->sub());
1244 |     sub = Complement(sub);
1245 |     return Group(-1, sub, kMaximal, false);
1246 |   }
1247 | 
1248 |   Exp WalkDisjunction(Exp exp) override {
1249 |     // Applying Groups to AnyCharacter would break the .∗ ≈ ¬∅ rewrite.
1250 |     if (exp == AnyCharacter()) {
1251 |       return exp;
1252 |     }
1253 |     // Applying Groups to the subexpressions will identify the leftmost.
1254 |     std::list<Exp> subs;
1255 |     for (Exp sub : exp->subexpressions()) {
1256 |       sub = Walk(sub);
1257 |       sub = Group(-1, sub, kPassive, false);
1258 |       subs.push_back(sub);
1259 |     }
1260 |     return Disjunction(subs, false);
1261 |   }
1262 | 
1263 |  private:
1264 |   ApplyGroups(const ApplyGroups&) = delete;
1265 |   ApplyGroups& operator=(const ApplyGroups&) = delete;
1266 | };
1267 | 
1268 | class NumberGroups : public Walker {
1269 |  public:
1270 |   NumberGroups(std::vector<Mode>* modes, std::vector<int>* captures)
1271 |       : num_(0), modes_(modes), captures_(captures) {}
1272 |   ~NumberGroups() override {}
1273 | 
1274 |   Exp WalkGroup(Exp exp) override {
1275 |     Exp sub; Mode mode; bool capture;
1276 |     std::tie(std::ignore, sub, mode, capture) = exp->group();
1277 |     int num = num_++;
1278 |     modes_->push_back(mode);
1279 |     if (capture) {
1280 |       captures_->push_back(num);
1281 |     }
1282 |     sub = Walk(sub);
1283 |     return Group(num, sub, mode, capture);
1284 |   }
1285 | 
1286 |  private:
1287 |   int num_;
1288 |   std::vector<Mode>* modes_;
1289 |   std::vector<int>* captures_;
1290 | 
1291 |   NumberGroups(const NumberGroups&) = delete;
1292 |   NumberGroups& operator=(const NumberGroups&) = delete;
1293 | };
1294 | 
1295 | class ExpandCharacterClasses : public Walker {
1296 |  public:
1297 |   ExpandCharacterClasses() {}
1298 |   ~ExpandCharacterClasses() override {}
1299 | 
1300 |   Exp WalkCharacterClass(Exp exp) override {
1301 |     std::list<Exp> subs;
1302 |     for (Rune character : exp->character_class().first) {
1303 |       subs.push_back(Character(character));
1304 |     }
1305 |     Exp tmp = Disjunction(subs, false);
1306 |     if (exp->character_class().second) {
1307 |       tmp = Conjunction(Complement(tmp), AnyCharacter());
1308 |     }
1309 |     return tmp;
1310 |   }
1311 | 
1312 |  private:
1313 |   ExpandCharacterClasses(const ExpandCharacterClasses&) = delete;
1314 |   ExpandCharacterClasses& operator=(const ExpandCharacterClasses&) = delete;
1315 | };
1316 | 
1317 | class ExpandQuantifiers : public Walker {
1318 |  public:
1319 |   ExpandQuantifiers(bool* exceeded)
1320 |       : exceeded_(exceeded), stack_({1000}) {}
1321 |   ~ExpandQuantifiers() override {}
1322 | 
1323 |   Exp WalkQuantifier(Exp exp) override {
1324 |     Exp sub; int min; int max;
1325 |     std::tie(sub, min, max) = exp->quantifier();
1326 |     // Validate the repetition.
1327 |     int limit = stack_.back();
1328 |     int rep = max;
1329 |     if (rep == -1) {
1330 |       rep = min;
1331 |     }
1332 |     if (rep > 0) {
1333 |       limit /= rep;
1334 |     }
1335 |     if (limit == 0) {
1336 |       *exceeded_ = true;
1337 |       return exp;
1338 |     }
1339 |     stack_.push_back(limit);
1340 |     sub = Walk(sub);
1341 |     stack_.pop_back();
1342 |     if (*exceeded_) {
1343 |       return exp;
1344 |     }
1345 |     // Perform the repetition.
1346 |     Exp tmp;
1347 |     if (max == -1) {
1348 |       tmp = KleeneClosure(sub);
1349 |     }
1350 |     while (max > min) {
1351 |       tmp = tmp == nullptr ? sub : Concatenation(sub, tmp);
1352 |       tmp = Disjunction(EmptyString(), tmp);
1353 |       --max;
1354 |     }
1355 |     while (min > 0) {
1356 |       tmp = tmp == nullptr ? sub : Concatenation(sub, tmp);
1357 |       --min;
1358 |     }
1359 |     tmp = tmp == nullptr ? EmptyString() : tmp;
1360 |     return tmp;
1361 |   }
1362 | 
1363 |  private:
1364 |   bool* exceeded_;
1365 |   std::vector<int> stack_;
1366 | 
1367 |   ExpandQuantifiers(const ExpandQuantifiers&) = delete;
1368 |   ExpandQuantifiers& operator=(const ExpandQuantifiers&) = delete;
1369 | };
1370 | 
1371 | bool Parse(llvm::StringRef str, Exp* exp) {
1372 |   yy::parser parser(&str, exp);
1373 |   if (parser.parse() != 0) {
1374 |     return false;
1375 |   }
1376 |   *exp = FlattenConjunctionsAndDisjunctions().Walk(*exp);
1377 |   *exp = StripGroups().Walk(*exp);
1378 |   *exp = ExpandCharacterClasses().Walk(*exp);
1379 |   bool exceeded = false;
1380 |   *exp = ExpandQuantifiers(&exceeded).Walk(*exp);
1381 |   return !exceeded;
1382 | }
1383 | 
1384 | bool Parse(llvm::StringRef str, Exp* exp,
1385 |            std::vector<Mode>* modes, std::vector<int>* captures) {
1386 |   yy::parser parser(&str, exp);
1387 |   if (parser.parse() != 0) {
1388 |     return false;
1389 |   }
1390 |   *exp = FlattenConjunctionsAndDisjunctions().Walk(*exp);
1391 |   *exp = ApplyGroups().Walk(*exp);
1392 |   *exp = NumberGroups(modes, captures).Walk(*exp);
1393 |   *exp = ExpandCharacterClasses().Walk(*exp);
1394 |   bool exceeded = false;
1395 |   *exp = ExpandQuantifiers(&exceeded).Walk(*exp);
1396 |   return !exceeded;
1397 | }
1398 | 
1399 | bool Match(Exp exp, llvm::StringRef str) {
1400 |   while (!str.empty()) {
1401 |     int byte = static_cast<unsigned char>(str[0]);
1402 |     str = str.drop_front(1);
1403 |     Exp der = Derivative(exp, byte);
1404 |     der = Normalised(der);
1405 |     exp = der;
1406 |   }
1407 |   bool match = IsNullable(exp);
1408 |   return match;
1409 | }
1410 | 
1411 | // Outputs the FA compiled from exp.
1412 | // If tagged is true, uses Antimirov partial derivatives to construct a TNFA.
1413 | // Otherwise, uses Brzozowski derivatives to construct a DFA.
1414 | inline size_t CompileImpl(Exp exp, bool tagged, FA* fa) {
1415 |   std::map<Exp, int> states;
1416 |   std::list<Exp> queue;
1417 |   auto LookupOrInsert = [&states, &queue](Exp exp) -> int {
1418 |     auto state = states.insert(std::make_pair(exp, states.size()));
1419 |     if (state.first->second > 0 &&
1420 |         state.second) {
1421 |       queue.push_back(exp);
1422 |     }
1423 |     return state.first->second;
1424 |   };
1425 |   queue.push_back(exp);
1426 |   while (!queue.empty()) {
1427 |     exp = queue.front();
1428 |     queue.pop_front();
1429 |     exp = Normalised(exp);
1430 |     int curr = LookupOrInsert(exp);
1431 |     if (exp->kind() == kEmptySet) {
1432 |       fa->error_ = curr;
1433 |     }
1434 |     if (exp->kind() == kEmptyString) {
1435 |       fa->empty_ = curr;
1436 |     }
1437 |     if (IsNullable(exp)) {
1438 |       fa->accepting_[curr] = true;
1439 |       if (tagged) {
1440 |         TNFA* tnfa = reinterpret_cast<TNFA*>(fa);
1441 |         EpsilonBindings(exp, &tnfa->final_[curr]);
1442 |       }
1443 |     } else {
1444 |       fa->accepting_[curr] = false;
1445 |     }
1446 |     std::list<std::bitset<256>>* partitions = &fa->partitions_[curr];
1447 |     Partitions(exp, partitions);
1448 |     for (std::list<std::bitset<256>>::const_iterator i = partitions->begin();
1449 |          i != partitions->end();
1450 |          ++i) {
1451 |       int byte;
1452 |       if (i == partitions->begin()) {
1453 |         // *i is Σ-based. Use a byte that it doesn't contain.
1454 |         byte = -1;
1455 |       } else {
1456 |         // *i is ∅-based. Use the first byte that it contains.
1457 |         for (byte = 0; !i->test(byte); ++byte) {}
1458 |       }
1459 |       if (tagged) {
1460 |         TNFA* tnfa = reinterpret_cast<TNFA*>(fa);
1461 |         Outer outer = Partial(exp, byte);
1462 |         std::set<std::pair<int, Bindings>> seen;
1463 |         for (const auto& j : *outer) {
1464 |           Exp par = Normalised(j.first);
1465 |           int next = LookupOrInsert(par);
1466 |           if (seen.count(std::make_pair(next, j.second)) == 0) {
1467 |             seen.insert(std::make_pair(next, j.second));
1468 |             if (i == partitions->begin()) {
1469 |               // Set the "default" transition.
1470 |               tnfa->transition_.insert(std::make_pair(
1471 |                   std::make_pair(curr, byte), std::make_pair(next, j.second)));
1472 |             } else {
1473 |               for (int byte = 0; byte < 256; ++byte) {
1474 |                 if (i->test(byte)) {
1475 |                   tnfa->transition_.insert(std::make_pair(
1476 |                       std::make_pair(curr, byte), std::make_pair(next, j.second)));
1477 |                 }
1478 |               }
1479 |             }
1480 |           }
1481 |         }
1482 |       } else {
1483 |         DFA* dfa = reinterpret_cast<DFA*>(fa);
1484 |         Exp der = Derivative(exp, byte);
1485 |         der = Normalised(der);
1486 |         int next = LookupOrInsert(der);
1487 |         if (i == partitions->begin()) {
1488 |           // Set the "default" transition.
1489 |           dfa->transition_[std::make_pair(curr, byte)] = next;
1490 |         } else {
1491 |           for (int byte = 0; byte < 256; ++byte) {
1492 |             if (i->test(byte)) {
1493 |               dfa->transition_[std::make_pair(curr, byte)] = next;
1494 |             }
1495 |           }
1496 |         }
1497 |       }
1498 |     }
1499 |   }
1500 |   return states.size();
1501 | }
1502 | 
1503 | size_t Compile(Exp exp, DFA* dfa) {
1504 |   return CompileImpl(exp, false, dfa);
1505 | }
1506 | 
1507 | size_t Compile(Exp exp, TNFA* tnfa) {
1508 |   return CompileImpl(exp, true, tnfa);
1509 | }
1510 | 
1511 | bool Match(const DFA& dfa, llvm::StringRef str) {
1512 |   int curr = 0;
1513 |   while (!str.empty()) {
1514 |     int byte = static_cast<unsigned char>(str[0]);
1515 |     str = str.drop_front(1);
1516 |     auto transition = dfa.transition_.find(std::make_pair(curr, byte));
1517 |     if (transition == dfa.transition_.end()) {
1518 |       // Get the "default" transition.
1519 |       transition = dfa.transition_.find(std::make_pair(curr, -1));
1520 |     }
1521 |     int next = transition->second;
1522 |     curr = next;
1523 |   }
1524 |   return dfa.IsAccepting(curr);
1525 | }
1526 | 
1527 | // Applies the Bindings to offsets using pos.
1528 | static void ApplyBindings(const Bindings& bindings,
1529 |                           int pos,
1530 |                           std::vector<int>* offsets) {
1531 |   for (const auto& i : bindings) {
1532 |     int l = 2 * i.first + 0;
1533 |     int r = 2 * i.first + 1;
1534 |     switch (i.second) {
1535 |       case kCancel:
1536 |         if ((*offsets)[l] != -1) {
1537 |           (*offsets)[l] = -1;
1538 |           (*offsets)[r] = -1;
1539 |         }
1540 |         continue;
1541 |       case kEpsilon:
1542 |       case kAppend:
1543 |         if ((*offsets)[l] == -1) {
1544 |           (*offsets)[l] = pos;
1545 |           (*offsets)[r] = pos;
1546 |         }
1547 |         if (i.second == kAppend) {
1548 |           ++(*offsets)[r];
1549 |         }
1550 |         continue;
1551 |     }
1552 |     abort();
1553 |   }
1554 | }
1555 | 
1556 | // Returns true iff x precedes y in the total order specified by modes.
1557 | static bool Precedes(const std::vector<int>& x,
1558 |                      const std::vector<int>& y,
1559 |                      const std::vector<Mode>& modes) {
1560 |   for (size_t i = 0; i < modes.size(); ++i) {
1561 |     int l = 2 * i + 0;
1562 |     int r = 2 * i + 1;
1563 |     if (x[l] == -1 && y[l] == -1) {
1564 |       continue;
1565 |     } else if (x[l] == -1) {
1566 |       return false;
1567 |     } else if (y[l] == -1) {
1568 |       return true;
1569 |     } else if (modes[i] == kPassive) {
1570 |       continue;
1571 |     } else if (x[l] < y[l]) {
1572 |       return true;
1573 |     } else if (x[l] > y[l]) {
1574 |       return false;
1575 |     } else if (x[r] < y[r]) {
1576 |       return modes[i] == kMinimal;
1577 |     } else if (x[r] > y[r]) {
1578 |       return modes[i] == kMaximal;
1579 |     } else {
1580 |       continue;
1581 |     }
1582 |   }
1583 |   return false;
1584 | }
1585 | 
1586 | bool Match(const TNFA& tnfa, llvm::StringRef str,
1587 |            std::vector<int>* offsets) {
1588 |   auto CompareOffsets = [&tnfa](const std::pair<int, std::vector<int>>& x,
1589 |                                 const std::pair<int, std::vector<int>>& y) -> bool {
1590 |     return Precedes(x.second, y.second, tnfa.modes_);
1591 |   };
1592 |   std::list<std::pair<int, std::vector<int>>> curr_states;
1593 |   curr_states.push_back(std::make_pair(0, std::vector<int>(2 * tnfa.modes_.size(), -1)));
1594 |   int pos = 0;
1595 |   while (!str.empty()) {
1596 |     int byte = static_cast<unsigned char>(str[0]);
1597 |     str = str.drop_front(1);
1598 |     // For each current state, determine the next states - applying Bindings -
1599 |     // and then sort them by comparing offsets. Doing this repeatedly from the
1600 |     // initial state and discarding next states that have been seen already in
1601 |     // the current round is intended to simulate a VM implementation.
1602 |     std::list<std::pair<int, std::vector<int>>> next_states;
1603 |     std::set<int> seen;
1604 |     for (const auto& i : curr_states) {
1605 |       int curr = i.first;
1606 |       std::pair<int, int> key = std::make_pair(curr, byte);
1607 |       auto transition = tnfa.transition_.lower_bound(key);
1608 |       if (transition == tnfa.transition_.upper_bound(key)) {
1609 |         // Get the "default" transition.
1610 |         key = std::make_pair(curr, -1);
1611 |         transition = tnfa.transition_.lower_bound(key);
1612 |       }
1613 |       std::list<std::pair<int, std::vector<int>>> tmp;
1614 |       while (transition != tnfa.transition_.upper_bound(key)) {
1615 |         int next = transition->second.first;
1616 |         if (seen.count(next) == 0 &&
1617 |             !tnfa.IsError(next)) {
1618 |           seen.insert(next);
1619 |           std::vector<int> copy = i.second;
1620 |           ApplyBindings(transition->second.second, pos, &copy);
1621 |           tmp.push_back(std::make_pair(next, copy));
1622 |         }
1623 |         ++transition;
1624 |       }
1625 |       tmp.sort(CompareOffsets);
1626 |       next_states.insert(next_states.end(), tmp.begin(), tmp.end());
1627 |     }
1628 |     curr_states.swap(next_states);
1629 |     ++pos;
1630 |   }
1631 |   for (const auto& i : curr_states) {
1632 |     int curr = i.first;
1633 |     if (tnfa.IsAccepting(curr)) {
1634 |       std::vector<int> copy = i.second;
1635 |       ApplyBindings(tnfa.final_.find(curr)->second, pos, &copy);
1636 |       offsets->resize(2 * tnfa.captures_.size());
1637 |       for (size_t j = 0; j < tnfa.captures_.size(); ++j) {
1638 |         (*offsets)[2 * j + 0] = copy[2 * tnfa.captures_[j] + 0];
1639 |         (*offsets)[2 * j + 1] = copy[2 * tnfa.captures_[j] + 1];
1640 |       }
1641 |       return true;
1642 |     }
1643 |   }
1644 |   return false;
1645 | }
1646 | 
1647 | typedef bool NativeMatch(const char*, size_t);
1648 | 
1649 | static llvm::FunctionType* getNativeMatchFnTy(llvm::LLVMContext& context) {
1650 |   return llvm::FunctionType::get(llvm::Type::getInt1Ty(context),
1651 |                                  {llvm::PointerType::getUnqual(context),
1652 |                                   llvm::Type::getScalarTy<size_t>(context)},
1653 |                                  false);
1654 | }
1655 | 
1656 | Fun::Fun() {
1657 |   static std::once_flag once_flag;
1658 |   std::call_once(once_flag, []() {
1659 |     llvm::InitializeNativeTarget();
1660 |     llvm::InitializeNativeTargetAsmPrinter();
1661 |     llvm::InitializeNativeTargetAsmParser();
1662 |   });
1663 |   context_.reset(new llvm::LLVMContext);
1664 |   module_ = new llvm::Module("M", *context_);
1665 |   engine_.reset(llvm::EngineBuilder(std::unique_ptr<llvm::Module>(module_))
1666 |                     .setMCPU(llvm::sys::getHostCPUName())
1667 |                     .create());
1668 |   function_ =
1669 |       llvm::Function::Create(getNativeMatchFnTy(*context_),
1670 |                              llvm::GlobalValue::ExternalLinkage, "F", module_);
1671 | }
1672 | 
1673 | Fun::~Fun() {}
1674 | 
1675 | // Generates the function for the DFA.
1676 | static void GenerateFunction(const DFA& dfa, Fun* fun) {
1677 |   llvm::LLVMContext& context = *fun->context_;  // for convenience
1678 |   llvm::IRBuilder<> bb(context);
1679 | 
1680 |   // Create the entry BasicBlock and two automatic variables, then store the
1681 |   // Function Arguments in the automatic variables.
1682 |   llvm::BasicBlock* entry =
1683 |       llvm::BasicBlock::Create(context, "entry", fun->function_);
1684 |   bb.SetInsertPoint(entry);
1685 |   llvm::AllocaInst* data = bb.CreateAlloca(
1686 |       llvm::PointerType::getUnqual(context), nullptr, "data");
1687 |   llvm::AllocaInst* size = bb.CreateAlloca(
1688 |       llvm::Type::getScalarTy<size_t>(context), nullptr, "size");
1689 |   llvm::Function::arg_iterator arg = fun->function_->arg_begin();
1690 |   bb.CreateStore(&*arg++, data);
1691 |   bb.CreateStore(&*arg++, size);
1692 | 
1693 |   // Create a BasicBlock that returns true.
1694 |   llvm::BasicBlock* return_true =
1695 |       llvm::BasicBlock::Create(context, "return_true", fun->function_);
1696 |   bb.SetInsertPoint(return_true);
1697 |   bb.CreateRet(bb.getTrue());
1698 | 
1699 |   // Create a BasicBlock that returns false.
1700 |   llvm::BasicBlock* return_false =
1701 |       llvm::BasicBlock::Create(context, "return_false", fun->function_);
1702 |   bb.SetInsertPoint(return_false);
1703 |   bb.CreateRet(bb.getFalse());
1704 | 
1705 |   // Create two BasicBlocks per DFA state: the first branches if we have hit
1706 |   // the end of the string; the second switches to the next DFA state after
1707 |   // updating the automatic variables.
1708 |   std::vector<std::pair<llvm::BasicBlock*, llvm::BasicBlock*>> states;
1709 |   states.reserve(dfa.accepting_.size());
1710 |   for (const auto& i : dfa.accepting_) {
1711 |     llvm::BasicBlock* bb0 =
1712 |         llvm::BasicBlock::Create(context, "", fun->function_);
1713 |     llvm::BasicBlock* bb1 =
1714 |         llvm::BasicBlock::Create(context, "", fun->function_);
1715 | 
1716 |     auto sizeTy = llvm::Type::getScalarTy<size_t>(context);
1717 |     auto int8PtrTy = llvm::PointerType::getUnqual(context);
1718 |     auto int8Ty = llvm::Type::getInt8Ty(context);
1719 | 
1720 |     bb.SetInsertPoint(bb0);
1721 |     bb.CreateCondBr(
1722 |         bb.CreateIsNull(bb.CreateLoad(sizeTy, size)),
1723 |         i.second ? return_true : return_false,
1724 |         bb1);
1725 | 
1726 |     bb.SetInsertPoint(bb1);
1727 |     llvm::LoadInst* bytep = bb.CreateLoad(int8PtrTy, data);
1728 |     llvm::LoadInst* byte = bb.CreateLoad(int8Ty, bytep);
1729 |     bb.CreateStore(
1730 |         bb.CreateGEP(int8Ty, bytep, bb.getInt64(1)),
1731 |         data);
1732 |     bb.CreateStore(
1733 |         bb.CreateSub(bb.CreateLoad(sizeTy, size), bb.getInt64(1)),
1734 |         size);
1735 |     // Set the "default" transition to ourselves for now. We could look it up,
1736 |     // but its BasicBlock might not exist yet, so we will just fix it up later.
1737 |     bb.CreateSwitch(byte, bb0);
1738 | 
1739 |     states.push_back(std::make_pair(bb0, bb1));
1740 |   }
1741 | 
1742 |   // Wire up the BasicBlocks.
1743 |   for (const auto& i : dfa.transition_) {
1744 |     // Get the current DFA state.
1745 |     llvm::BasicBlock* bb1 = states[i.first.first].second;
1746 |     llvm::SwitchInst* swi = llvm::cast<llvm::SwitchInst>(bb1->getTerminator());
1747 |     // Get the next DFA state.
1748 |     llvm::BasicBlock* bb0 = states[i.second].first;
1749 |     if (i.first.second == -1) {
1750 |       // Set the "default" transition.
1751 |       swi->setDefaultDest(bb0);
1752 |     } else {
1753 |       swi->addCase(llvm::ConstantInt::get(llvm::Type::getInt8Ty(context),
1754 |                                           i.first.second),
1755 |                    bb0);
1756 |     }
1757 |   }
1758 | 
1759 |   // Plug in the entry BasicBlock.
1760 |   bb.SetInsertPoint(entry);
1761 |   bb.CreateBr(states[0].first);
1762 | 
1763 |   // Do we begin by scanning memory for a byte? If so, we can make memchr(3) do
1764 |   // that for us. It will almost certainly be vectorised and thus much faster.
1765 |   {
1766 |     llvm::BasicBlock* bb0 = states[0].first;
1767 |     llvm::BasicBlock* bb1 = states[0].second;
1768 |     llvm::BranchInst* bra = llvm::cast<llvm::BranchInst>(bb0->getTerminator());
1769 |     llvm::SwitchInst* swi = llvm::cast<llvm::SwitchInst>(bb1->getTerminator());
1770 |     if (swi->getDefaultDest() == bb0 &&
1771 |         swi->getNumCases() == 1) {
1772 |       // What is the byte that we are trying to find?
1773 |       fun->memchr_byte_ = swi->case_begin()->getCaseValue()->getZExtValue();
1774 |       // What should we return if we fail to find it?
1775 |       fun->memchr_fail_ = bra->getSuccessor(0) == return_true;
1776 |     } else {
1777 |       fun->memchr_byte_ = -1;
1778 |     }
1779 |   }
1780 | 
1781 |   // Optimise the module.
1782 |   // NOTE(junyer): This was cargo-culted from Clang. Ordering matters!
1783 |   llvm::LoopAnalysisManager lam;
1784 |   llvm::FunctionAnalysisManager fam;
1785 |   llvm::CGSCCAnalysisManager cam;
1786 |   llvm::ModuleAnalysisManager mam;
1787 | 
1788 |   llvm::PassBuilder pb(fun->engine_->getTargetMachine());
1789 |   pb.registerModuleAnalyses(mam);
1790 |   pb.registerCGSCCAnalyses(cam);
1791 |   pb.registerFunctionAnalyses(fam);
1792 |   pb.registerLoopAnalyses(lam);
1793 |   pb.registerModuleAnalyses(mam);
1794 |   pb.crossRegisterProxies(lam, fam, cam, mam);
1795 | 
1796 |   llvm::ModulePassManager mpm =
1797 |       pb.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O2);
1798 |   mpm.run(*fun->module_, mam);
1799 | }
1800 | 
1801 | // This seems to be the only way to discover the machine code size.
1802 | class DiscoverMachineCodeSize : public llvm::JITEventListener {
1803 |  public:
1804 |   explicit DiscoverMachineCodeSize(Fun* fun) : fun_(fun) {}
1805 |   ~DiscoverMachineCodeSize() override {}
1806 | 
1807 |   void
1808 |   notifyObjectLoaded(ObjectKey, const llvm::object::ObjectFile &object,
1809 |                      const llvm::RuntimeDyld::LoadedObjectInfo &info) override {
1810 |     // We need this in order to obtain the addresses as well as the sizes.
1811 |     llvm::object::OwningBinary<llvm::object::ObjectFile> debug =
1812 |         info.getObjectForDebug(object);
1813 |     std::vector<std::pair<llvm::object::SymbolRef, uint64_t>> symbol_sizes =
1814 |         llvm::object::computeSymbolSizes(*debug.getBinary());
1815 |     for (const auto& i : symbol_sizes) {
1816 |       llvm::Expected<llvm::StringRef> name = i.first.getName();
1817 |       llvm::Expected<uint64_t> addr = i.first.getAddress();
1818 |       if (name && addr && *name == "F") {
1819 |         fun_->machine_code_addr_ = *addr;
1820 |         fun_->machine_code_size_ = i.second;
1821 |         return;
1822 |       }
1823 |     }
1824 |     abort();
1825 |   }
1826 | 
1827 |  private:
1828 |   Fun* fun_;
1829 | 
1830 |   DiscoverMachineCodeSize(const DiscoverMachineCodeSize&) = delete;
1831 |   DiscoverMachineCodeSize& operator=(const DiscoverMachineCodeSize&) = delete;
1832 | };
1833 | 
1834 | // Generates the machine code for the function.
1835 | static void GenerateMachineCode(Fun* fun) {
1836 |   DiscoverMachineCodeSize dmcs(fun);
1837 |   fun->engine_->RegisterJITEventListener(&dmcs);
1838 |   fun->engine_->finalizeObject();
1839 |   fun->engine_->UnregisterJITEventListener(&dmcs);
1840 | }
1841 | 
1842 | size_t Compile(const DFA& dfa, Fun* fun) {
1843 |   GenerateFunction(dfa, fun);
1844 |   GenerateMachineCode(fun);
1845 |   return fun->machine_code_size_;
1846 | }
1847 | 
1848 | bool Match(const Fun& fun, llvm::StringRef str) {
1849 |   if (fun.memchr_byte_ != -1) {
1850 |     const void* ptr = memchr(str.data(), fun.memchr_byte_, str.size());
1851 |     if (ptr == nullptr) {
1852 |       return fun.memchr_fail_;
1853 |     }
1854 |     str = str.drop_front(reinterpret_cast<const char*>(ptr) - str.data());
1855 |   }
1856 |   NativeMatch* match = reinterpret_cast<NativeMatch*>(fun.machine_code_addr_);
1857 |   return (*match)(str.data(), str.size());
1858 | }
1859 | 
1860 | }  // namespace redgrep
1861 | 


--------------------------------------------------------------------------------
/regexp.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 Google Inc. All Rights Reserved.
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | //     http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | #ifndef REDGREP_REGEXP_H_
 16 | #define REDGREP_REGEXP_H_
 17 | 
 18 | #include <stddef.h>
 19 | #include <stdint.h>
 20 | 
 21 | #include <bitset>
 22 | #include <functional>
 23 | #include <list>
 24 | #include <map>
 25 | #include <memory>
 26 | #include <set>
 27 | #include <tuple>
 28 | #include <utility>
 29 | #include <vector>
 30 | 
 31 | #include "llvm/ADT/StringRef.h"
 32 | #include "utf.h"
 33 | 
 34 | namespace llvm {
 35 | class ExecutionEngine;
 36 | class Function;
 37 | class LLVMContext;
 38 | class Module;
 39 | }  // namespace llvm
 40 | 
 41 | namespace redgrep {
 42 | 
 43 | // Implements regular expressions using Brzozowski derivatives, Antimirov
 44 | // partial derivatives, Sulzmann submatches and Laurikari tagged transitions.
 45 | //
 46 | // References
 47 | // ----------
 48 | //
 49 | // "Derivatives of Regular Expressions"
 50 | // Janusz Brzozowski
 51 | // Journal of the ACM, vol. 11 iss. 4, pp. 481-494, October 1964
 52 | // http://dl.acm.org/citation.cfm?id=321249
 53 | //
 54 | // "Regular-expression derivatives re-examined"
 55 | // Scott Owens, John Reppy, Aaron Turon
 56 | // Journal of Functional Programming, vol. 19 iss. 2, pp. 173-190, March 2009
 57 | // http://dl.acm.org/citation.cfm?id=1520288
 58 | //
 59 | // "Partial Derivatives of Regular Expressions and Finite Automaton Constructions"
 60 | // Valentin Antimirov
 61 | // Theoretical Computer Science, vol. 155 iss. 2, pp. 291-319, March 1996
 62 | // http://dl.acm.org/citation.cfm?id=231848
 63 | //
 64 | // "Partial Derivatives of an Extended Regular Expression"
 65 | // Pascal Caron, Jean-Marc Champarnaud, Ludovic Mignot
 66 | // Language and Automata Theory and Applications 2011, pp. 179-191, May 2011
 67 | // http://dl.acm.org/citation.cfm?id=2022911
 68 | //
 69 | // "A Flexible and Efficient ML Lexer Tool Based on Extended Regular Expression Submatching"
 70 | // Martin Sulzmann, Pippijn van Steenhoven
 71 | // Compiler Construction 2014, pp. 174-191, April 2014
 72 | // http://dx.doi.org/10.1007/978-3-642-54807-9_10
 73 | //
 74 | // "Efficient submatch addressing for regular expressions"
 75 | // Ville Laurikari
 76 | // Master's Thesis, November 2001
 77 | // http://laurikari.net/ville/regex-submatch.pdf
 78 | 
// Identifies the kind of an Expression.
// The two kinds marked "ephemeral" are rewritten by Parse() before it returns
// successfully: character classes by the ExpandCharacterClasses walker and
// quantifiers by the ExpandQuantifiers walker (both in regexp.cc).
enum Kind {
  kEmptySet,
  kEmptyString,
  kGroup,
  kAnyByte,
  kByte,
  kByteRange,
  kKleeneClosure,
  kConcatenation,
  kComplement,
  kConjunction,
  kDisjunction,
  kCharacterClass,  // ephemeral
  kQuantifier,      // ephemeral
};
 94 | 
// The mode of a Group.
// When the TNFA matcher orders candidate matches (see Precedes() in
// regexp.cc) and a Group is set with the same left offset in both candidates,
// kMinimal prefers the smaller right offset and kMaximal prefers the larger
// right offset. The offsets of a kPassive Group are not compared at all.
enum Mode {
  kMinimal,
  kPassive,
  kMaximal,
};
100 | 
class Expression;
// Expressions are always handled through Exp, a shared pointer.
typedef std::shared_ptr<Expression> Exp;

// Represents a regular expression.
// Note that the data members are const in order to guarantee immutability,
// which will matter later when we use expressions as STL container keys.
class Expression {
 public:
  // One constructor per shape of expression data. The builder functions
  // declared after this class are the intended way to create expressions.
  explicit Expression(Kind kind);
  Expression(Kind kind, const std::tuple<int, Exp, Mode, bool>& group);
  Expression(Kind kind, int byte);
  Expression(Kind kind, const std::pair<int, int>& byte_range);
  Expression(Kind kind, const std::list<Exp>& subexpressions, bool norm);
  Expression(Kind kind, const std::pair<std::set<Rune>, bool>& character_class);
  Expression(Kind kind, const std::tuple<Exp, int, int>& quantifier);
  ~Expression();

  Kind kind() const { return kind_; }
  intptr_t data() const { return data_; }
  bool norm() const { return norm_; }

  // Accessors for the expression data. Of course, if you call the wrong
  // function for the expression kind, you're gonna have a bad time.
  // group() is (number, subexpression, mode, capture); character_class() is
  // (characters, complement); quantifier() is (subexpression, min, max).
  const std::tuple<int, Exp, Mode, bool>& group() const;
  int byte() const;
  const std::pair<int, int>& byte_range() const;
  const std::list<Exp>& subexpressions() const;
  const std::pair<std::set<Rune>, bool>& character_class() const;
  const std::tuple<Exp, int, int>& quantifier() const;

  // A KleeneClosure or Complement expression has one subexpression.
  // Use sub() for convenience.
  Exp sub() const { return subexpressions().front(); }

  // A Concatenation expression has two subexpressions, the second typically
  // being another Concatenation. Thus, the concept of "head" and "tail".
  // Use head() and tail() for convenience.
  Exp head() const { return subexpressions().front(); }
  Exp tail() const { return subexpressions().back(); }

  // Comparison operators for Exp. Note that these take shared pointers and
  // delegate to Compare() on the expressions.
  friend bool operator<(Exp x, Exp y) { return Compare(x, y) < 0; }
  friend bool operator<=(Exp x, Exp y) { return Compare(x, y) <= 0; }
  friend bool operator==(Exp x, Exp y) { return Compare(x, y) == 0; }
  friend bool operator!=(Exp x, Exp y) { return Compare(x, y) != 0; }
  friend bool operator>(Exp x, Exp y) { return Compare(x, y) > 0; }
  friend bool operator>=(Exp x, Exp y) { return Compare(x, y) >= 0; }

 private:
  // Returns -1, 0 or +1 when x is less than, equal to or greater than y,
  // respectively, so that we can define operators above for convenience.
  static int Compare(Exp x, Exp y);

  const Kind kind_;
  // Opaque storage for the expression data that the accessors above expose.
  const intptr_t data_;
  const bool norm_;

  Expression(const Expression&) = delete;
  Expression& operator=(const Expression&) = delete;
};
160 | 
// Builders for the various expression kinds.
// Use the inline functions for convenience when building up expressions in
// parser code, test code et cetera.
// Each builder here takes the expression data in the same form as the
// corresponding Expression constructor.

Exp EmptySet();
Exp EmptyString();
Exp Group(const std::tuple<int, Exp, Mode, bool>& group);
Exp AnyByte();
Exp Byte(int byte);
Exp ByteRange(const std::pair<int, int>& byte_range);
Exp KleeneClosure(const std::list<Exp>& subexpressions, bool norm);
Exp Concatenation(const std::list<Exp>& subexpressions, bool norm);
Exp Complement(const std::list<Exp>& subexpressions, bool norm);
Exp Conjunction(const std::list<Exp>& subexpressions, bool norm);
Exp Disjunction(const std::list<Exp>& subexpressions, bool norm);
Exp CharacterClass(const std::pair<std::set<Rune>, bool>& character_class);
Exp Quantifier(const std::tuple<Exp, int, int>& quantifier);
178 | 
179 | inline Exp Group(int num, Exp sub, Mode mode, bool capture) {
180 |   return Group(std::make_tuple(num, sub, mode, capture));
181 | }
182 | 
183 | inline Exp ByteRange(int min, int max) {
184 |   return ByteRange(std::make_pair(min, max));
185 | }
186 | 
187 | inline Exp KleeneClosure(Exp x) {
188 |   return KleeneClosure({x}, false);
189 | }
190 | 
191 | inline Exp Concatenation(Exp x, Exp y) {
192 |   return Concatenation({x, y}, false);
193 | }
194 | 
195 | template <typename... Variadic>
196 | inline Exp Concatenation(Exp x, Exp y, Variadic... z) {
197 |   return Concatenation({x, Concatenation(y, z...)}, false);
198 | }
199 | 
200 | inline Exp Complement(Exp x) {
201 |   return Complement({x}, false);
202 | }
203 | 
204 | template <typename... Variadic>
205 | inline Exp Conjunction(Exp x, Exp y, Variadic... z) {
206 |   return Conjunction({x, y, z...}, false);
207 | }
208 | 
209 | template <typename... Variadic>
210 | inline Exp Disjunction(Exp x, Exp y, Variadic... z) {
211 |   return Disjunction({x, y, z...}, false);
212 | }
213 | 
214 | inline Exp CharacterClass(const std::set<Rune>& characters, bool complement) {
215 |   return CharacterClass(std::make_pair(characters, complement));
216 | }
217 | 
218 | inline Exp Quantifier(Exp sub, int min, int max) {
219 |   return Quantifier(std::make_tuple(sub, min, max));
220 | }
221 | 
// Builders that work in terms of characters (Runes). ExpandCharacterClasses in
// regexp.cc uses them when it rewrites a CharacterClass.
// NOTE(review): presumably these expand to byte-level expressions for the
// UTF-8 encoding - confirm against the definitions in regexp.cc.
Exp AnyCharacter();
Exp Character(Rune character);

// Returns the normalised form of exp.
Exp Normalised(Exp exp);

// Returns the nullability of exp as a bool.
// EmptySet and EmptyString map to false and true, respectively.
bool IsNullable(Exp exp);

// Returns the derivative of exp with respect to byte.
// CompileImpl() in regexp.cc also passes -1 as byte in order to obtain the
// derivative for a state's "default" transition.
Exp Derivative(Exp exp, int byte);
234 | 
// How a Binding updates the pair of offsets of its Group when the TNFA matcher
// applies it at a position (see ApplyBindings() in regexp.cc):
// kCancel unsets both offsets; kEpsilon sets both offsets to the position if
// they were unset; kAppend does the same and then advances the second offset
// by one.
enum BindingType {
  kCancel,
  kEpsilon,
  kAppend,
};
240 | 
// Each element pairs the number of a Group with the BindingType to apply to
// the offsets of that Group. Elements are applied in list order.
typedef std::list<std::pair<int, BindingType>> Bindings;

// Conceptually, an OuterSet is a Disjunction and an InnerSet is a Conjunction.
// For simplicity, we don't introduce a new type for the latter, but the former
// needs to associate each InnerSet with its Bindings.
typedef std::list<std::pair<Exp, Bindings>> OuterSet;
typedef std::unique_ptr<OuterSet> Outer;

// Returns the denormalised form of exp.
Outer Denormalised(Exp exp);

// Partial() helpers for building OuterSets. Exposed for ease of testing.
Outer PartialConcatenation(Outer x, Exp y, const Bindings& initial);
Outer PartialComplement(Outer x);
Outer PartialConjunction(Outer x, Outer y);
Outer PartialDisjunction(Outer x, Outer y);

// Returns the partial derivative of exp with respect to byte.
// CompileImpl() in regexp.cc also passes -1 as byte in order to obtain the
// partial derivative for a state's "default" transitions.
Outer Partial(Exp exp, int byte);

// Outputs the partitions computed for exp.
// The first partition should be Σ-based. Any others should be ∅-based.
void Partitions(Exp exp, std::list<std::bitset<256>>* partitions);
264 | 
265 | // Outputs the expression parsed from str.
266 | // Returns true on success, false on failure.
267 | bool Parse(llvm::StringRef str, Exp* exp);  // The parsed expression is written through exp.
268 | 
269 | // Outputs the expression parsed from str as well as the mode of each Group and
270 | // which Groups capture.
271 | // Returns true on success, false on failure.
272 | bool Parse(llvm::StringRef str, Exp* exp,
273 |            std::vector<Mode>* modes, std::vector<int>* captures);  // modes and captures are output parameters too.
274 | 
275 | // Returns the result of matching str using exp.
276 | bool Match(Exp exp, llvm::StringRef str);  // Overload that takes the expression itself rather than a compiled automaton.
277 | 
278 | // Represents a finite automaton.
279 | class FA {  // Common base of DFA and TNFA; holds the per-state bookkeeping.
280 |  public:
281 |   FA() : error_(-1), empty_(-1) {}  // Both special-state ids start out as -1.
282 |   virtual ~FA() {}
283 | 
284 |   bool IsError(int state) const {  // True iff state equals error_.
285 |     return state == error_;
286 |   }
287 | 
288 |   bool IsEmpty(int state) const {  // True iff state equals empty_.
289 |     return state == empty_;
290 |   }
291 | 
292 |   bool IsAccepting(int state) const {  // A state with no accepting_ entry is reported as not accepting.
293 |     return accepting_.count(state) != 0 && accepting_.find(state)->second;
294 |   }
295 | 
296 |   int error_;
297 |   int empty_;
298 |   std::map<int, bool> accepting_;  // State id -> whether that state accepts.
299 |   std::map<int, std::list<std::bitset<256>>> partitions_;  // Int key -> list of 256-bit sets.
300 | 
301 |  private:
302 |   FA(const FA&) = delete;  // Non-copyable.
303 |   FA& operator=(const FA&) = delete;
304 | };
305 | 
306 | // Represents a deterministic finite automaton.
307 | class DFA : public FA {
308 |  public:
309 |   DFA() {}
310 |   ~DFA() override {}
311 | 
312 |   std::map<std::pair<int, int>, int> transition_;  // NOTE(review): looks like (state, byte) -> next state; confirm against Compile().
313 | 
314 |  private:
315 |   DFA(const DFA&) = delete;  // Non-copyable.
316 |   DFA& operator=(const DFA&) = delete;
317 | };
318 | 
319 | // Represents a tagged nondeterministic finite automaton.
320 | class TNFA : public FA {
321 |  public:
322 |   TNFA() {}
323 |   ~TNFA() override {}
324 | 
325 |   std::vector<Mode> modes_;  // NOTE(review): presumably the per-Group modes produced by Parse(); confirm.
326 |   std::vector<int> captures_;  // NOTE(review): presumably the capturing Groups produced by Parse(); confirm.
327 | 
328 |   std::multimap<std::pair<int, int>, std::pair<int, Bindings>> transition_;  // A multimap: one key may map to several (int, Bindings) targets.
329 |   std::map<int, Bindings> final_;  // Bindings keyed by an int; NOTE(review): presumably a state id, confirm.
330 | 
331 |  private:
332 |   TNFA(const TNFA&) = delete;  // Non-copyable.
333 |   TNFA& operator=(const TNFA&) = delete;
334 | };
335 | 
336 | // Outputs the DFA compiled from exp.
337 | // Returns the number of DFA states.
338 | size_t Compile(Exp exp, DFA* dfa);  // The automaton is written through dfa.
339 | 
340 | // Outputs the TNFA compiled from exp.
341 | // Returns the number of TNFA states.
342 | size_t Compile(Exp exp, TNFA* tnfa);  // The automaton is written through tnfa.
343 | 
344 | // Returns the result of matching str using dfa.
345 | bool Match(const DFA& dfa, llvm::StringRef str);
346 | 
347 | // Returns the result of matching str using tnfa.
348 | // Outputs the offsets of the beginning and ending of each Group that captures.
349 | // Thus, the nth Group begins at offsets[2*n+0] and ends at offsets[2*n+1].
350 | bool Match(const TNFA& tnfa, llvm::StringRef str,
351 |            std::vector<int>* offsets);  // offsets is an output parameter.
352 | 
353 | // Represents a function and its machine code.
354 | struct Fun {
355 |   Fun();  // Constructor and destructor are only declared here.
356 |   ~Fun();
357 | 
358 |   std::unique_ptr<llvm::LLVMContext> context_;
359 |   llvm::Module* module_;  // Not owned.
360 |   std::unique_ptr<llvm::ExecutionEngine> engine_;
361 |   llvm::Function* function_;  // Not owned.
362 | 
363 |   int memchr_byte_;  // NOTE(review): presumably the byte for a memchr-based fast path; confirm in regexp.cc.
364 |   bool memchr_fail_;  // NOTE(review): semantics not visible in this header; confirm in regexp.cc.
365 | 
366 |   uint64_t machine_code_addr_;  // Address of the machine code.
367 |   uint64_t machine_code_size_;  // Size of the machine code; NOTE(review): presumably in bytes, confirm.
368 | };
369 | 
370 | // Outputs the function compiled from dfa.
371 | // Returns the number of bytes of machine code.
372 | size_t Compile(const DFA& dfa, Fun* fun);  // The result is written through fun.
373 | 
374 | // Returns the result of matching str using fun.
375 | bool Match(const Fun& fun, llvm::StringRef str);
376 | 
377 | }  // namespace redgrep
378 | 
379 | #endif  // REDGREP_REGEXP_H_
380 | 


--------------------------------------------------------------------------------
/regexp_test.cc:
--------------------------------------------------------------------------------
   1 | // Copyright 2012 Google Inc. All Rights Reserved.
   2 | //
   3 | // Licensed under the Apache License, Version 2.0 (the "License");
   4 | // you may not use this file except in compliance with the License.
   5 | // You may obtain a copy of the License at
   6 | //
   7 | //     http://www.apache.org/licenses/LICENSE-2.0
   8 | //
   9 | // Unless required by applicable law or agreed to in writing, software
  10 | // distributed under the License is distributed on an "AS IS" BASIS,
  11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 | // See the License for the specific language governing permissions and
  13 | // limitations under the License.
  14 | 
  15 | #include "gtest/gtest.h"
  16 | #include "regexp.h"
  17 | 
  18 | namespace redgrep {
  19 | 
  20 | TEST(Compare, EmptySet) {  // Two EmptySet() expressions compare equal.
  21 |   EXPECT_EQ(
  22 |       EmptySet(),
  23 |       EmptySet());
  24 | }
  25 | 
  26 | TEST(Compare, EmptyString) {  // Two EmptyString() expressions compare equal.
  27 |   EXPECT_EQ(
  28 |       EmptyString(),
  29 |       EmptyString());
  30 | }
  31 | 
  32 | TEST(Compare, Group) {  // Groups built from identical arguments are equal; ordering is checked below.
  33 |   EXPECT_EQ(
  34 |       Group(0, Byte('a'), kPassive, true),
  35 |       Group(0, Byte('a'), kPassive, true));
  36 |   EXPECT_LT(
  37 |       Group(0, Byte('a'), kPassive, true),
  38 |       Group(1, Byte('a'), kPassive, true));  // Only the first argument differs (0 vs 1).
  39 | }
  40 | 
  41 | TEST(Compare, AnyByte) {  // Two AnyByte() expressions compare equal.
  42 |   EXPECT_EQ(
  43 |       AnyByte(),
  44 |       AnyByte());
  45 | }
  46 | 
  47 | TEST(Compare, Byte) {  // Same byte compares equal; 'a' orders before 'b'.
  48 |   EXPECT_EQ(
  49 |       Byte('a'),
  50 |       Byte('a'));
  51 |   EXPECT_LT(
  52 |       Byte('a'),
  53 |       Byte('b'));
  54 | }
  55 | 
  56 | TEST(Compare, ByteRange) {  // Same range compares equal; ['a','c'] orders before ['b','d'].
  57 |   EXPECT_EQ(
  58 |       ByteRange('a', 'c'),
  59 |       ByteRange('a', 'c'));
  60 |   EXPECT_LT(
  61 |       ByteRange('a', 'c'),
  62 |       ByteRange('b', 'd'));
  63 | }
  64 | 
  65 | TEST(Compare, KleeneClosure) {  // Closures of the same byte are equal; closure of 'a' orders before closure of 'b'.
  66 |   EXPECT_EQ(
  67 |       KleeneClosure(Byte('a')),
  68 |       KleeneClosure(Byte('a')));
  69 |   EXPECT_LT(
  70 |       KleeneClosure(Byte('a')),
  71 |       KleeneClosure(Byte('b')));
  72 | }
  73 | 
  74 | TEST(Compare, Concatenation) {  // Identical operand sequences are equal; "abc" orders before "bcd".
  75 |   EXPECT_EQ(
  76 |       Concatenation(Byte('a'), Byte('b'), Byte('c')),
  77 |       Concatenation(Byte('a'), Byte('b'), Byte('c')));
  78 |   EXPECT_LT(
  79 |       Concatenation(Byte('a'), Byte('b'), Byte('c')),
  80 |       Concatenation(Byte('b'), Byte('c'), Byte('d')));
  81 | }
  82 | 
  83 | TEST(Compare, Complement) {  // Complements of the same byte are equal; complement of 'a' orders before complement of 'b'.
  84 |   EXPECT_EQ(
  85 |       Complement(Byte('a')),
  86 |       Complement(Byte('a')));
  87 |   EXPECT_LT(
  88 |       Complement(Byte('a')),
  89 |       Complement(Byte('b')));
  90 | }
  91 | 
  92 | TEST(Compare, Conjunction) {  // Identical operand lists are equal; {a,b,c} orders before {b,c,d}.
  93 |   EXPECT_EQ(
  94 |       Conjunction(Byte('a'), Byte('b'), Byte('c')),
  95 |       Conjunction(Byte('a'), Byte('b'), Byte('c')));
  96 |   EXPECT_LT(
  97 |       Conjunction(Byte('a'), Byte('b'), Byte('c')),
  98 |       Conjunction(Byte('b'), Byte('c'), Byte('d')));
  99 | }
 100 | 
 101 | TEST(Compare, Disjunction) {  // Identical operand lists are equal; {a,b,c} orders before {b,c,d}.
 102 |   EXPECT_EQ(
 103 |       Disjunction(Byte('a'), Byte('b'), Byte('c')),
 104 |       Disjunction(Byte('a'), Byte('b'), Byte('c')));
 105 |   EXPECT_LT(
 106 |       Disjunction(Byte('a'), Byte('b'), Byte('c')),
 107 |       Disjunction(Byte('b'), Byte('c'), Byte('d')));
 108 | }
 109 | // Expects Normalised(exp) to compare equal to expected.
 110 | #define EXPECT_NORMALISED(expected, exp)  \
 111 |   do {                                    \
 112 |     EXPECT_EQ(expected, Normalised(exp)); \
 113 |   } while (0)
 114 | 
 115 | TEST(Normalised, EmptySet) {  // EmptySet() is unchanged by normalisation.
 116 |   EXPECT_NORMALISED(
 117 |       EmptySet(),
 118 |       EmptySet());
 119 | }
 120 | 
 121 | TEST(Normalised, EmptyString) {  // EmptyString() is unchanged by normalisation.
 122 |   EXPECT_NORMALISED(
 123 |       EmptyString(),
 124 |       EmptyString());
 125 | }
 126 | 
 127 | TEST(Normalised, Group) {  // Groups around EmptySet/EmptyString disappear; other Groups are kept.
 128 |   EXPECT_NORMALISED(
 129 |       EmptySet(),
 130 |       Group(0, EmptySet(), kPassive, true));  // Group of EmptySet becomes EmptySet.
 131 |   EXPECT_NORMALISED(
 132 |       EmptyString(),
 133 |       Group(0, EmptyString(), kPassive, true));  // Group of EmptyString becomes EmptyString.
 134 |   EXPECT_NORMALISED(
 135 |       Group(0, Byte('a'), kPassive, true),
 136 |       Group(0, Byte('a'), kPassive, true));  // A Group around a byte is unchanged.
 137 | }
 138 | 
 139 | TEST(Normalised, AnyByte) {  // AnyByte() is unchanged by normalisation.
 140 |   EXPECT_NORMALISED(
 141 |       AnyByte(),
 142 |       AnyByte());
 143 | }
 144 | 
 145 | TEST(Normalised, Byte) {  // A single Byte is unchanged by normalisation.
 146 |   EXPECT_NORMALISED(
 147 |       Byte('a'),
 148 |       Byte('a'));
 149 | }
 150 | 
 151 | TEST(Normalised, ByteRange) {  // A ByteRange is unchanged by normalisation.
 152 |   EXPECT_NORMALISED(
 153 |       ByteRange('a', 'c'),
 154 |       ByteRange('a', 'c'));
 155 | }
 156 | 
 157 | TEST(Normalised, KleeneClosure) {  // Simplifications applied to closures.
 158 |   EXPECT_NORMALISED(
 159 |       KleeneClosure(Byte('a')),
 160 |       KleeneClosure(KleeneClosure(Byte('a'))));  // A closure of a closure collapses to one closure.
 161 |   EXPECT_NORMALISED(
 162 |       EmptyString(),
 163 |       KleeneClosure(EmptySet()));  // Closure of EmptySet becomes EmptyString.
 164 |   EXPECT_NORMALISED(
 165 |       EmptyString(),
 166 |       KleeneClosure(EmptyString()));  // Closure of EmptyString becomes EmptyString.
 167 |   EXPECT_NORMALISED(
 168 |       Complement(EmptySet()),
 169 |       KleeneClosure(AnyByte()));  // Closure of AnyByte becomes Complement(EmptySet()).
 170 | }
 171 | 
 172 | TEST(Normalised, Concatenation) {  // Simplifications applied to concatenations.
 173 |   EXPECT_NORMALISED(
 174 |       Concatenation(
 175 |           Byte('a'),
 176 |           Concatenation(
 177 |               Byte('b'),
 178 |               Byte('c'))),
 179 |       Concatenation(
 180 |           Concatenation(
 181 |               Byte('a'),
 182 |               Byte('b')),
 183 |           Byte('c')));  // Left-nested input becomes right-nested.
 184 |   EXPECT_NORMALISED(
 185 |       EmptySet(),
 186 |       Concatenation(EmptySet(), Byte('a')));  // EmptySet on the left yields EmptySet.
 187 |   EXPECT_NORMALISED(
 188 |       EmptySet(),
 189 |       Concatenation(Byte('a'), EmptySet()));  // EmptySet on the right yields EmptySet.
 190 |   EXPECT_NORMALISED(
 191 |       Byte('a'),
 192 |       Concatenation(EmptyString(), Byte('a')));  // EmptyString on the left is dropped.
 193 |   EXPECT_NORMALISED(
 194 |       Byte('a'),
 195 |       Concatenation(Byte('a'), EmptyString()));  // EmptyString on the right is dropped.
 196 | }
 197 | 
 198 | TEST(Normalised, Complement) {  // A double complement cancels out.
 199 |   EXPECT_NORMALISED(
 200 |       Byte('a'),
 201 |       Complement(Complement(Byte('a'))));
 202 | }
 203 | 
 204 | TEST(Normalised, Conjunction) {  // Simplifications applied to conjunctions.
 205 |   EXPECT_NORMALISED(
 206 |       Conjunction(
 207 |           Byte('a'),
 208 |           Byte('b'),
 209 |           Byte('c')),
 210 |       Conjunction(
 211 |           Conjunction(
 212 |               Byte('a'),
 213 |               Byte('b')),
 214 |           Byte('c')));  // A nested conjunction is flattened.
 215 |   EXPECT_NORMALISED(
 216 |       Conjunction(Byte('a'), Byte('b')),
 217 |       Conjunction(Byte('b'), Byte('a')));  // Operands come out in a fixed order.
 218 |   EXPECT_NORMALISED(
 219 |       Byte('a'),
 220 |       Conjunction(Byte('a'), Byte('a')));  // Duplicate operands collapse to one.
 221 |   EXPECT_NORMALISED(
 222 |       EmptySet(),
 223 |       Conjunction(Byte('a'), EmptySet()));  // An EmptySet operand yields EmptySet.
 224 |   EXPECT_NORMALISED(
 225 |       Byte('a'),
 226 |       Conjunction(Byte('a'), Complement(EmptySet())));  // A Complement(EmptySet()) operand is dropped.
 227 | }
 228 | 
 229 | TEST(Normalised, Disjunction) {  // Simplifications applied to disjunctions.
 230 |   EXPECT_NORMALISED(
 231 |       Disjunction(
 232 |           Byte('a'),
 233 |           Byte('b'),
 234 |           Byte('c')),
 235 |       Disjunction(
 236 |           Disjunction(
 237 |               Byte('a'),
 238 |               Byte('b')),
 239 |           Byte('c')));  // A nested disjunction is flattened.
 240 |   EXPECT_NORMALISED(
 241 |       Disjunction(Byte('a'), Byte('b')),
 242 |       Disjunction(Byte('b'), Byte('a')));  // Operands come out in a fixed order.
 243 |   EXPECT_NORMALISED(
 244 |       Byte('a'),
 245 |       Disjunction(Byte('a'), Byte('a')));  // Duplicate operands collapse to one.
 246 |   EXPECT_NORMALISED(
 247 |       Byte('a'),
 248 |       Disjunction(Byte('a'), EmptySet()));  // An EmptySet operand is dropped.
 249 |   EXPECT_NORMALISED(
 250 |       Complement(EmptySet()),
 251 |       Disjunction(Byte('a'), Complement(EmptySet())));  // A Complement(EmptySet()) operand absorbs the rest.
 252 | }
 253 | // Expects IsNullable(exp) to be true when expected is true, false otherwise.
 254 | #define EXPECT_ISNULLABLE(expected, exp)  \
 255 |   do {                                    \
 256 |     if (expected) {                       \
 257 |       EXPECT_TRUE(IsNullable(exp));       \
 258 |     } else {                              \
 259 |       EXPECT_FALSE(IsNullable(exp));      \
 260 |     }                                     \
 261 |   } while (0)
 262 | 
 263 | TEST(IsNullable, EmptySet) {  // EmptySet() is not nullable.
 264 |   EXPECT_ISNULLABLE(
 265 |       false,
 266 |       EmptySet());
 267 | }
 268 | 
 269 | TEST(IsNullable, EmptyString) {  // EmptyString() is nullable.
 270 |   EXPECT_ISNULLABLE(
 271 |       true,
 272 |       EmptyString());
 273 | }
 274 | 
 275 | TEST(IsNullable, Group) {  // A Group around a single byte is not nullable.
 276 |   EXPECT_ISNULLABLE(
 277 |       false,
 278 |       Group(0, Byte('a'), kPassive, true));
 279 | }
 280 | 
 281 | TEST(IsNullable, AnyByte) {  // AnyByte() is not nullable.
 282 |   EXPECT_ISNULLABLE(
 283 |       false,
 284 |       AnyByte());
 285 | }
 286 | 
 287 | TEST(IsNullable, Byte) {  // A single Byte is not nullable.
 288 |   EXPECT_ISNULLABLE(
 289 |       false,
 290 |       Byte('a'));
 291 | }
 292 | 
 293 | TEST(IsNullable, ByteRange) {  // A ByteRange is not nullable.
 294 |   EXPECT_ISNULLABLE(
 295 |       false,
 296 |       ByteRange('a', 'c'));
 297 | }
 298 | 
 299 | TEST(IsNullable, KleeneClosure) {  // A KleeneClosure is nullable.
 300 |   EXPECT_ISNULLABLE(
 301 |       true,
 302 |       KleeneClosure(Byte('a')));
 303 | }
 304 | 
 305 | TEST(IsNullable, Concatenation) {  // A concatenation of two bytes is not nullable.
 306 |   EXPECT_ISNULLABLE(
 307 |       false,
 308 |       Concatenation(Byte('a'), Byte('b')));
 309 | }
 310 | 
 311 | TEST(IsNullable, Complement) {  // The complement of a non-nullable byte is nullable.
 312 |   EXPECT_ISNULLABLE(
 313 |       true,
 314 |       Complement(Byte('a')));
 315 | }
 316 | 
 317 | TEST(IsNullable, Conjunction) {  // A conjunction of two bytes is not nullable.
 318 |   EXPECT_ISNULLABLE(
 319 |       false,
 320 |       Conjunction(Byte('a'), Byte('b')));
 321 | }
 322 | 
 323 | TEST(IsNullable, Disjunction) {  // A disjunction of two bytes is not nullable.
 324 |   EXPECT_ISNULLABLE(
 325 |       false,
 326 |       Disjunction(Byte('a'), Byte('b')));
 327 | }
 328 | // Expects the normalised derivative of exp with respect to 'a' to equal expected.
 329 | #define EXPECT_DERIVATIVE(expected, exp)                    \
 330 |   do {                                                      \
 331 |     EXPECT_EQ(expected, Normalised(Derivative(exp, 'a')));  \
 332 |   } while (0)
 333 | 
 334 | TEST(Derivative, EmptySet) {  // The derivative of EmptySet() is EmptySet().
 335 |   EXPECT_DERIVATIVE(
 336 |       EmptySet(),
 337 |       EmptySet());
 338 | }
 339 | 
 340 | TEST(Derivative, EmptyString) {  // The derivative of EmptyString() is EmptySet().
 341 |   EXPECT_DERIVATIVE(
 342 |       EmptySet(),
 343 |       EmptyString());
 344 | }
 345 | 
 346 | TEST(Derivative, Group) {  // Intentionally empty placeholder: nothing is asserted for Groups.
 347 |   // This should never happen.
 348 | }
 349 | 
 350 | TEST(Derivative, AnyByte) {  // The derivative of AnyByte() is EmptyString().
 351 |   EXPECT_DERIVATIVE(
 352 |       EmptyString(),
 353 |       AnyByte());
 354 | }
 355 | 
 356 | TEST(Derivative, Byte) {  // With respect to 'a': the matching byte gives EmptyString, another byte gives EmptySet.
 357 |   EXPECT_DERIVATIVE(
 358 |       EmptyString(),
 359 |       Byte('a'));
 360 |   EXPECT_DERIVATIVE(
 361 |       EmptySet(),
 362 |       Byte('b'));
 363 | }
 364 | 
 365 | TEST(Derivative, ByteRange) {  // A range containing 'a' gives EmptyString; a range without it gives EmptySet.
 366 |   EXPECT_DERIVATIVE(
 367 |       EmptyString(),
 368 |       ByteRange('a', 'c'));
 369 |   EXPECT_DERIVATIVE(
 370 |       EmptySet(),
 371 |       ByteRange('b', 'd'));
 372 | }
 373 | 
 374 | TEST(Derivative, KleeneClosure) {  // The derivative of a* with respect to 'a' normalises back to a*.
 375 |   EXPECT_DERIVATIVE(
 376 |       KleeneClosure(Byte('a')),
 377 |       KleeneClosure(Byte('a')));
 378 | }
 379 | 
 380 | TEST(Derivative, Concatenation) {  // Derivatives of concatenations with respect to 'a'.
 381 |   EXPECT_DERIVATIVE(
 382 |       Byte('b'),
 383 |       Concatenation(Byte('a'), Byte('b')));  // "ab" leaves "b".
 384 |   EXPECT_DERIVATIVE(
 385 |       Concatenation(KleeneClosure(Byte('a')), Byte('b')),
 386 |       Concatenation(KleeneClosure(Byte('a')), Byte('b')));  // "a*b" normalises back to "a*b".
 387 | }
 388 | 
 389 | TEST(Derivative, Complement) {  // The derivative of !a is the complement of a's derivative, i.e. Complement(EmptyString()).
 390 |   EXPECT_DERIVATIVE(
 391 |       Complement(EmptyString()),
 392 |       Complement(Byte('a')));
 393 | }
 394 | 
 395 | TEST(Derivative, Conjunction) {  // The derivative of a&b with respect to 'a' normalises to EmptySet().
 396 |   EXPECT_DERIVATIVE(
 397 |       EmptySet(),
 398 |       Conjunction(Byte('a'), Byte('b')));
 399 | }
 400 | 
 401 | TEST(Derivative, Disjunction) {  // The derivative of a|b with respect to 'a' normalises to EmptyString().
 402 |   EXPECT_DERIVATIVE(
 403 |       EmptyString(),
 404 |       Disjunction(Byte('a'), Byte('b')));
 405 | }
 406 | // Collects the Exp of every entry in *outer, builds a Disjunction from them (Bindings are ignored) and expects its normalised form to equal expected.
 407 | #define EXPECT_OUTERSET(expected, outer)  \
 408 |   do {                                    \
 409 |     std::list<Exp> subs;                  \
 410 |     for (const auto& i : *outer) {        \
 411 |       subs.push_back(i.first);            \
 412 |     }                                     \
 413 |     Exp exp = Disjunction(subs, false);   \
 414 |     EXPECT_EQ(expected, Normalised(exp)); \
 415 |   } while (0)
 416 | 
 417 | TEST(OuterSet, PartialConcatenation) {  // Appending '4' to (1&2)|3 is expected to give 34|(14&24).
 418 |   Outer outer = PartialConcatenation(
 419 |       Denormalised(
 420 |           Disjunction(
 421 |               Conjunction(Byte('1'), Byte('2')),
 422 |               Byte('3'))),
 423 |       Byte('4'),
 424 |       Bindings({}));  // No initial bindings.
 425 |   EXPECT_OUTERSET(
 426 |       Disjunction(
 427 |           Concatenation(Byte('3'), Byte('4')),
 428 |           Conjunction(
 429 |               Concatenation(Byte('1'), Byte('4')),
 430 |               Concatenation(Byte('2'), Byte('4')))),
 431 |       outer);
 432 | }
 433 | 
 434 | TEST(OuterSet, PartialComplement) {  // Complementing (1&2)|3 is expected to give (!1&!3)|(!2&!3).
 435 |   Outer outer = PartialComplement(
 436 |       Denormalised(
 437 |           Disjunction(
 438 |               Conjunction(Byte('1'), Byte('2')),
 439 |               Byte('3'))));
 440 |   EXPECT_OUTERSET(
 441 |       Disjunction(
 442 |           Conjunction(
 443 |               Complement(Byte('1')),
 444 |               Complement(Byte('3'))),
 445 |           Conjunction(
 446 |               Complement(Byte('2')),
 447 |               Complement(Byte('3')))),
 448 |       outer);
 449 | }
 450 | 
 451 | TEST(OuterSet, PartialConjunction) {  // (1|2) and (3|4) are expected to give all four pairwise conjunctions.
 452 |   Outer outer = PartialConjunction(
 453 |       Denormalised(
 454 |           Disjunction(Byte('1'), Byte('2'))),
 455 |       Denormalised(
 456 |           Disjunction(Byte('3'), Byte('4'))));
 457 |   EXPECT_OUTERSET(
 458 |       Disjunction(
 459 |           Conjunction(Byte('1'), Byte('3')),
 460 |           Conjunction(Byte('1'), Byte('4')),
 461 |           Conjunction(Byte('2'), Byte('3')),
 462 |           Conjunction(Byte('2'), Byte('4'))),
 463 |       outer);
 464 | }
 465 | 
 466 | TEST(OuterSet, PartialDisjunction) {  // (1|2) or (3|4) is expected to give 1|2|3|4.
 467 |   Outer outer = PartialDisjunction(
 468 |       Denormalised(
 469 |           Disjunction(Byte('1'), Byte('2'))),
 470 |       Denormalised(
 471 |           Disjunction(Byte('3'), Byte('4'))));
 472 |   EXPECT_OUTERSET(
 473 |       Disjunction(Byte('1'), Byte('2'), Byte('3'), Byte('4')),
 474 |       outer);
 475 | }
 476 | // Takes the partial derivative of exp with respect to 'a' and checks it with EXPECT_OUTERSET.
 477 | #define EXPECT_PARTIAL(expected, exp) \
 478 |   do {                                \
 479 |     Outer outer = Partial(exp, 'a');  \
 480 |     EXPECT_OUTERSET(expected, outer); \
 481 |   } while (0)
 482 | 
 483 | TEST(Partial, EmptySet) {  // The partial derivative of EmptySet() is EmptySet().
 484 |   EXPECT_PARTIAL(
 485 |       EmptySet(),
 486 |       EmptySet());
 487 | }
 488 | 
 489 | TEST(Partial, EmptyString) {  // The partial derivative of EmptyString() is EmptySet().
 490 |   EXPECT_PARTIAL(
 491 |       EmptySet(),
 492 |       EmptyString());
 493 | }
 494 | 
 495 | TEST(Partial, Group) {  // A Group around 'a' gives EmptyString() once the Bindings are ignored.
 496 |   EXPECT_PARTIAL(
 497 |       EmptyString(),
 498 |       Group(0, Byte('a'), kPassive, true));
 499 | }
 500 | 
 501 | TEST(Partial, AnyByte) {  // The partial derivative of AnyByte() is EmptyString().
 502 |   EXPECT_PARTIAL(
 503 |       EmptyString(),
 504 |       AnyByte());
 505 | }
 506 | 
 507 | TEST(Partial, Byte) {  // With respect to 'a': the matching byte gives EmptyString, another byte gives EmptySet.
 508 |   EXPECT_PARTIAL(
 509 |       EmptyString(),
 510 |       Byte('a'));
 511 |   EXPECT_PARTIAL(
 512 |       EmptySet(),
 513 |       Byte('b'));
 514 | }
 515 | 
 516 | TEST(Partial, ByteRange) {  // A range containing 'a' gives EmptyString; a range without it gives EmptySet.
 517 |   EXPECT_PARTIAL(
 518 |       EmptyString(),
 519 |       ByteRange('a', 'c'));
 520 |   EXPECT_PARTIAL(
 521 |       EmptySet(),
 522 |       ByteRange('b', 'd'));
 523 | }
 524 | 
 525 | TEST(Partial, KleeneClosure) {  // The partial derivative of a* with respect to 'a' normalises back to a*.
 526 |   EXPECT_PARTIAL(
 527 |       KleeneClosure(Byte('a')),
 528 |       KleeneClosure(Byte('a')));
 529 | }
 530 | 
 531 | TEST(Partial, Concatenation) {  // Partial derivatives of concatenations with respect to 'a'.
 532 |   EXPECT_PARTIAL(
 533 |       Byte('b'),
 534 |       Concatenation(Byte('a'), Byte('b')));  // "ab" leaves "b".
 535 |   EXPECT_PARTIAL(
 536 |       Concatenation(KleeneClosure(Byte('a')), Byte('b')),
 537 |       Concatenation(KleeneClosure(Byte('a')), Byte('b')));  // "a*b" normalises back to "a*b".
 538 | }
 539 | 
 540 | TEST(Partial, Complement) {  // The partial derivative of !a with respect to 'a' is Complement(EmptyString()).
 541 |   EXPECT_PARTIAL(
 542 |       Complement(EmptyString()),
 543 |       Complement(Byte('a')));
 544 | }
 545 | 
 546 | TEST(Partial, Conjunction) {  // The partial derivative of a&b with respect to 'a' normalises to EmptySet().
 547 |   EXPECT_PARTIAL(
 548 |       EmptySet(),
 549 |       Conjunction(Byte('a'), Byte('b')));
 550 | }
 551 | 
 552 | TEST(Partial, Disjunction) {  // The partial derivative of a|b with respect to 'a' normalises to EmptyString().
 553 |   EXPECT_PARTIAL(
 554 |       EmptyString(),
 555 |       Disjunction(Byte('a'), Byte('b')));
 556 | }
 557 | // Runs Partitions() on exp and expects the resulting list to equal expected.
 558 | #define EXPECT_PARTITIONS(expected, exp)    \
 559 |   do {                                      \
 560 |     std::list<std::bitset<256>> partitions; \
 561 |     Partitions(exp, &partitions);           \
 562 |     EXPECT_EQ(expected, partitions);        \
 563 |   } while (0)
 564 | 
 565 | template <typename... Variadic>  // Test helper: builds a 256-bit set with exactly the given positions set.
 566 | inline std::bitset<256> BitSet(Variadic... bits) {
 567 |   std::set<int> s({bits...});  // Arguments (typically char literals) are converted to int; duplicates collapse.
 568 |   std::bitset<256> bs;
 569 |   for (int bit : s) {
 570 |     bs.set(bit);
 571 |   }
 572 |   return bs;  // With no arguments this is the empty set.
 573 | }
 574 | 
 575 | TEST(Partitions, EmptySet) {  // Expects a single, empty bitset.
 576 |   EXPECT_PARTITIONS(
 577 |       std::list<std::bitset<256>>({BitSet()}),
 578 |       EmptySet());
 579 | }
 580 | 
 581 | TEST(Partitions, EmptyString) {  // Expects a single, empty bitset.
 582 |   EXPECT_PARTITIONS(
 583 |       std::list<std::bitset<256>>({BitSet()}),
 584 |       EmptyString());
 585 | }
 586 | 
 587 | TEST(Partitions, Group) {  // A Group around 'a' gives the same partitions as Byte('a').
 588 |   EXPECT_PARTITIONS(
 589 |       std::list<std::bitset<256>>({BitSet('a'),
 590 |                                    BitSet('a')}),
 591 |       Group(0, Byte('a'), kPassive, true));
 592 | }
 593 | 
 594 | TEST(Partitions, AnyByte) {  // Expects a single, empty bitset.
 595 |   EXPECT_PARTITIONS(
 596 |       std::list<std::bitset<256>>({BitSet()}),
 597 |       AnyByte());
 598 | }
 599 | 
 600 | TEST(Partitions, Byte) {  // Expects two entries, each containing only 'a'.
 601 |   EXPECT_PARTITIONS(
 602 |       std::list<std::bitset<256>>({BitSet('a'),
 603 |                                    BitSet('a')}),
 604 |       Byte('a'));
 605 | }
 606 | 
 607 | TEST(Partitions, ByteRange) {  // Expects two entries, each containing 'a', 'b' and 'c'.
 608 |   EXPECT_PARTITIONS(
 609 |       std::list<std::bitset<256>>({BitSet('a', 'b', 'c'),
 610 |                                    BitSet('a', 'b', 'c')}),
 611 |       ByteRange('a', 'c'));
 612 | }
 613 | 
 614 | TEST(Partitions, KleeneClosure) {  // a* gives the same partitions as Byte('a').
 615 |   EXPECT_PARTITIONS(
 616 |       std::list<std::bitset<256>>({BitSet('a'),
 617 |                                    BitSet('a')}),
 618 |       KleeneClosure(Byte('a')));
 619 | }
 620 | 
 621 | TEST(Partitions, Concatenation) {  // Partitions of concatenations.
 622 |   EXPECT_PARTITIONS(
 623 |       std::list<std::bitset<256>>({BitSet('a'),
 624 |                                    BitSet('a')}),
 625 |       Concatenation(Byte('a'), Byte('b')));  // For "ab" only 'a' appears in the partitions.
 626 |   EXPECT_PARTITIONS(
 627 |       std::list<std::bitset<256>>({BitSet('a', 'b'),
 628 |                                    BitSet('b'),
 629 |                                    BitSet('a')}),
 630 |       Concatenation(KleeneClosure(Byte('a')), Byte('b')));  // For "a*b" both 'a' and 'b' appear.
 631 | }
 632 | 
 633 | TEST(Partitions, Complement) {  // !a gives the same partitions as Byte('a').
 634 |   EXPECT_PARTITIONS(
 635 |       std::list<std::bitset<256>>({BitSet('a'),
 636 |                                    BitSet('a')}),
 637 |       Complement(Byte('a')));
 638 | }
 639 | 
 640 | TEST(Partitions, Conjunction) {  // a&b: expects {a,b} first, then {b} and {a}.
 641 |   EXPECT_PARTITIONS(
 642 |       std::list<std::bitset<256>>({BitSet('a', 'b'),
 643 |                                    BitSet('b'),
 644 |                                    BitSet('a')}),
 645 |       Conjunction(Byte('a'), Byte('b')));
 646 | }
 647 | 
 648 | TEST(Partitions, Disjunction) {  // a|b: expects {a,b} first, then {b} and {a}.
 649 |   EXPECT_PARTITIONS(
 650 |       std::list<std::bitset<256>>({BitSet('a', 'b'),
 651 |                                    BitSet('b'),
 652 |                                    BitSet('a')}),
 653 |       Disjunction(Byte('a'), Byte('b')));
 654 | }
 655 | // Asserts that str parses, then expects the parsed (not normalised) expression to equal expected.
 656 | #define EXPECT_PARSE(expected, str) \
 657 |   do {                              \
 658 |     Exp exp;                        \
 659 |     ASSERT_TRUE(Parse(str, &exp));  \
 660 |     EXPECT_EQ(expected, exp);       \
 661 |   } while (0)
 662 | 
 663 | TEST(Parse, EscapeSequences) {  // Backslash escapes recognised by the parser.
 664 |   EXPECT_PARSE(
 665 |       AnyByte(),
 666 |       "\\C");  // \C parses to AnyByte().
 667 |   EXPECT_PARSE(
 668 |       Concatenation(
 669 |           Byte('\f'),
 670 |           Byte('\n'),
 671 |           Byte('\r'),
 672 |           Byte('\t')),
 673 |       "\\f\\n\\r\\t");  // \f, \n, \r and \t parse to the corresponding control bytes.
 674 | }
 675 | 
 676 | TEST(Parse, AnyCharacter) {  // "." parses to a disjunction of one- to four-byte sequences.
 677 |   EXPECT_PARSE(
 678 |       Disjunction(
 679 |           ByteRange(0x00, 0x7F),
 680 |           Concatenation(
 681 |               ByteRange(0xC2, 0xDF),
 682 |               ByteRange(0x80, 0xBF)),
 683 |           Concatenation(
 684 |               ByteRange(0xE0, 0xEF),
 685 |               ByteRange(0x80, 0xBF),
 686 |               ByteRange(0x80, 0xBF)),
 687 |           Concatenation(
 688 |               ByteRange(0xF0, 0xF4),
 689 |               ByteRange(0x80, 0xBF),
 690 |               ByteRange(0x80, 0xBF),
 691 |               ByteRange(0x80, 0xBF))),
 692 |       ".");
 693 | }
 694 | 
 695 | TEST(Parse, Character) {  // A literal character parses to its bytes, concatenated when there are several.
 696 |   EXPECT_PARSE(
 697 |       Byte(0x61),
 698 |       "a");  // One byte.
 699 |   EXPECT_PARSE(
 700 |       Concatenation(
 701 |           Byte(0xC2),
 702 |           Byte(0xAC)),
 703 |       "¬");  // Two bytes.
 704 |   EXPECT_PARSE(
 705 |       Concatenation(
 706 |           Byte(0xE5),
 707 |           Byte(0x85),
 708 |           Byte(0x94)),
 709 |       "兔");  // Three bytes.
 710 |   EXPECT_PARSE(
 711 |       Concatenation(
 712 |           Byte(0xF0),
 713 |           Byte(0x9F),
 714 |           Byte(0x92),
 715 |           Byte(0xA9)),
 716 |       "💩");  // Four bytes.
 717 | }
 718 | 
 719 | TEST(Parse, CharacterClass) {  // Bracket expressions, plain and negated.
 720 |   EXPECT_PARSE(
 721 |       Disjunction(
 722 |           Byte(0x61),
 723 |           Concatenation(
 724 |               Byte(0xC2),
 725 |               Byte(0xAC)),
 726 |           Concatenation(
 727 |               Byte(0xE5),
 728 |               Byte(0x85),
 729 |               Byte(0x94)),
 730 |           Concatenation(
 731 |               Byte(0xF0),
 732 |               Byte(0x9F),
 733 |               Byte(0x92),
 734 |               Byte(0xA9))),
 735 |       "[a¬兔💩]");  // A class parses to the disjunction of its members.
 736 |   EXPECT_PARSE(
 737 |       Conjunction(
 738 |           Complement(
 739 |               Disjunction(
 740 |                   Byte(0x61),
 741 |                   Concatenation(
 742 |                       Byte(0xC2),
 743 |                       Byte(0xAC)),
 744 |                   Concatenation(
 745 |                       Byte(0xE5),
 746 |                       Byte(0x85),
 747 |                       Byte(0x94)),
 748 |                   Concatenation(
 749 |                       Byte(0xF0),
 750 |                       Byte(0x9F),
 751 |                       Byte(0x92),
 752 |                       Byte(0xA9)))),
 753 |           AnyCharacter()),
 754 |       "[^a¬兔💩]");  // A negated class is the complement of that disjunction, intersected with AnyCharacter().
 755 | }
 756 | 
 757 | TEST(Parse, Quantifiers) {  // Each quantifier is checked with and without a trailing '?'; both forms parse alike.
 758 |   EXPECT_PARSE(
 759 |       KleeneClosure(
 760 |           Byte('a')),
 761 |       "a*");
 762 |   EXPECT_PARSE(
 763 |       KleeneClosure(
 764 |           Byte('a')),
 765 |       "a*?");
 766 |   EXPECT_PARSE(
 767 |       Concatenation(
 768 |           Byte('a'),
 769 |           KleeneClosure(
 770 |               Byte('a'))),
 771 |       "a+");  // a+ parses to a followed by a*.
 772 |   EXPECT_PARSE(
 773 |       Concatenation(
 774 |           Byte('a'),
 775 |           KleeneClosure(
 776 |               Byte('a'))),
 777 |       "a+?");
 778 |   EXPECT_PARSE(
 779 |       Disjunction(
 780 |           EmptyString(),
 781 |           Byte('a')),
 782 |       "a?");  // a? parses to EmptyString or a.
 783 |   EXPECT_PARSE(
 784 |       Disjunction(
 785 |           EmptyString(),
 786 |           Byte('a')),
 787 |       "a??");
 788 |   EXPECT_PARSE(
 789 |       Byte('a'),
 790 |       "a{1}");  // a{1} parses to a alone.
 791 |   EXPECT_PARSE(
 792 |       Byte('a'),
 793 |       "a{1}?");
 794 |   EXPECT_PARSE(
 795 |       Concatenation(
 796 |           Byte('a'),
 797 |           KleeneClosure(
 798 |               Byte('a'))),
 799 |       "a{1,}");  // a{1,} parses like a+.
 800 |   EXPECT_PARSE(
 801 |       Concatenation(
 802 |           Byte('a'),
 803 |           KleeneClosure(
 804 |               Byte('a'))),
 805 |       "a{1,}?");
 806 |   EXPECT_PARSE(
 807 |       Concatenation(
 808 |           Byte('a'),
 809 |           Disjunction(
 810 |               EmptyString(),
 811 |               Byte('a'))),
 812 |       "a{1,2}");  // a{1,2} parses to a followed by an optional a.
 813 |   EXPECT_PARSE(
 814 |       Concatenation(
 815 |           Byte('a'),
 816 |           Disjunction(
 817 |               EmptyString(),
 818 |               Byte('a'))),
 819 |       "a{1,2}?");
 820 | }
 821 | 
 822 | TEST(Parse, KleeneClosure) {  // How '*' binds relative to concatenation and parentheses.
 823 |   EXPECT_PARSE(
 824 |       Concatenation(
 825 |           Byte('a'),
 826 |           KleeneClosure(
 827 |               Byte('b'))),
 828 |       "ab*");  // '*' applies to the preceding byte only.
 829 |   EXPECT_PARSE(
 830 |       KleeneClosure(
 831 |           Concatenation(
 832 |               Byte('a'),
 833 |               Byte('b'))),
 834 |       "(ab)*");  // Parentheses make '*' apply to the whole concatenation (this overload yields no Group).
 835 |   EXPECT_PARSE(
 836 |       Concatenation(
 837 |           KleeneClosure(
 838 |               Byte('a')),
 839 |           Byte('b')),
 840 |       "a*b");
 841 |   EXPECT_PARSE(
 842 |       Concatenation(
 843 |           KleeneClosure(
 844 |               Byte('a')),
 845 |           Concatenation(
 846 |               KleeneClosure(
 847 |                   Byte('b')),
 848 |               Byte('c'))),
 849 |       "a*b*c");  // The concatenation nests to the right.
 850 | }
 851 | 
 852 | TEST(Parse, Concatenation) {  // Adjacent atoms parse to right-nested binary concatenations.
 853 |   EXPECT_PARSE(
 854 |       Concatenation(
 855 |           Byte('a'),
 856 |           Byte('b')),
 857 |       "ab");
 858 |   EXPECT_PARSE(
 859 |       Concatenation(
 860 |           Byte('a'),
 861 |           Concatenation(
 862 |               Byte('b'),
 863 |               Byte('c'))),
 864 |       "abc");
 865 | }
 866 | 
 867 | TEST(Parse, Complement) {  // How the prefix '!' binds relative to concatenation.
 868 |   EXPECT_PARSE(
 869 |       Complement(
 870 |           Byte('a')),
 871 |       "!a");
 872 |   EXPECT_PARSE(
 873 |       Complement(
 874 |           Concatenation(
 875 |               Byte('a'),
 876 |               Byte('b'))),
 877 |       "!ab");  // A leading '!' covers the whole following concatenation.
 878 |   EXPECT_PARSE(
 879 |       Complement(
 880 |           Concatenation(
 881 |               Byte('a'),
 882 |               Byte('b'))),
 883 |       "!(ab)");  // Same result as "!ab".
 884 |   EXPECT_PARSE(
 885 |       Concatenation(
 886 |           Byte('a'),
 887 |           Complement(
 888 |               Byte('b'))),
 889 |       "a!b");
 890 |   EXPECT_PARSE(
 891 |       Concatenation(
 892 |           Concatenation(
 893 |               Byte('a'),
 894 |               Complement(
 895 |                   Byte('b'))),
 896 |           Complement(
 897 |               Byte('c'))),
 898 |       "a!b!c");  // Here the concatenation nests to the left and each '!' covers one byte.
 899 | }
 900 | 
 901 | TEST(Parse, Conjunction) {  // '&' parses to a single flat Conjunction, even with three operands.
 902 |   EXPECT_PARSE(
 903 |       Conjunction(
 904 |           Byte('a'),
 905 |           Byte('b')),
 906 |       "a&b");
 907 |   EXPECT_PARSE(
 908 |       Conjunction(
 909 |           Byte('a'),
 910 |           Byte('b'),
 911 |           Byte('c')),
 912 |       "a&b&c");
 913 | }
 914 | 
 915 | TEST(Parse, Disjunction) {  // '|' parses to a single flat Disjunction, even with three operands.
 916 |   EXPECT_PARSE(
 917 |       Disjunction(
 918 |           Byte('a'),
 919 |           Byte('b')),
 920 |       "a|b");
 921 |   EXPECT_PARSE(
 922 |       Disjunction(
 923 |           Byte('a'),
 924 |           Byte('b'),
 925 |           Byte('c')),
 926 |       "a|b|c");
 927 | }
 928 | 
 929 | TEST(Parse, CountedRepetition) {  // Counted repetition: zero count, nested counts and rejected sizes.
 930 |   Exp exp1;
 931 |   EXPECT_TRUE(Parse("a{0}", &exp1));
 932 |   EXPECT_EQ(EmptyString(), exp1);  // A count of zero parses to EmptyString().
 933 | 
 934 |   Exp exp2;
 935 |   EXPECT_TRUE(Parse("a{1000}", &exp2));
 936 |   Exp exp3;
 937 |   EXPECT_TRUE(Parse("a{2}{2}{2}{5}{5}{5}", &exp3));  // 2*2*2*5*5*5 = 1000 repetitions in total.
 938 |   // They are structured differently, so compare their normalised forms.
 939 |   EXPECT_EQ(Normalised(exp2), Normalised(exp3));
 940 | 
 941 |   Exp exp4;
 942 |   EXPECT_FALSE(Parse("a{1001}", &exp4));  // NOTE(review): together with a{1000} above this suggests a cap of 1000; confirm in parser.yy.
 943 |   EXPECT_FALSE(Parse("a{7}{11}{13}", &exp4));  // 7*11*13 = 1001, rejected as well.
 944 | 
 945 |   Exp exp5;
 946 |   EXPECT_FALSE(Parse("a{999999999}", &exp5));  // Very large counts are rejected.
 947 |   EXPECT_FALSE(Parse("a{10}{10}{10}{10}{10}{10}{10}{10}{10}{10}", &exp5));  // Ten nested {10} quantifiers are rejected too.
 948 | }
 949 | 
 950 | #define EXPECT_PARSE_M_C(expected, expected_modes, expected_captures, str)  \
 951 |   do {  /* Parses str and compares the expression, modes and captures. */   \
 952 |     Exp exp;                                                                \
 953 |     std::vector<Mode> modes;                                                \
 954 |     std::vector<int> captures;                                              \
 955 |     ASSERT_TRUE(Parse(str, &exp, &modes, &captures));  /* Fatal if parsing fails. */ \
 956 |     EXPECT_EQ(expected, exp);                                               \
 957 |     EXPECT_EQ(expected_modes, modes);                                       \
 958 |     EXPECT_EQ(expected_captures, captures);                                 \
 959 |   } while (0)  // do/while (0) lets the macro be used as a single statement.
 960 | 
 961 | TEST(Parse_M_C, Parentheses) {  // Checks group numbering and the captures list for parenthesised patterns.
 962 |   EXPECT_PARSE_M_C(  // "(?:...)": a kPassive group with flag false and an empty captures list.
 963 |       Group(0,
 964 |             Concatenation(
 965 |                 Byte('a'),
 966 |                 Byte('b')),
 967 |             kPassive, false),
 968 |       std::vector<Mode>({kPassive}),
 969 |       std::vector<int>({}),
 970 |       "(?:ab)");
 971 |   EXPECT_PARSE_M_C(  // "(...)": same shape, but flag true and captures == {0}.
 972 |       Group(0,
 973 |             Concatenation(
 974 |                 Byte('a'),
 975 |                 Byte('b')),
 976 |             kPassive, true),
 977 |       std::vector<Mode>({kPassive}),
 978 |       std::vector<int>({0}),
 979 |       "(ab)");
 980 |   EXPECT_PARSE_M_C(  // Nested, inner group first in the text: outer is 0, inner is 1.
 981 |       Group(0,
 982 |             Concatenation(
 983 |                 Group(1,
 984 |                       Byte('a'),
 985 |                       kPassive, true),
 986 |                 Byte('b')),
 987 |             kPassive, true),
 988 |       std::vector<Mode>({kPassive, kPassive}),
 989 |       std::vector<int>({0, 1}),
 990 |       "((a)b)");
 991 |   EXPECT_PARSE_M_C(  // Nested, inner group last in the text: still outer 0, inner 1.
 992 |       Group(0,
 993 |             Concatenation(
 994 |                 Byte('a'),
 995 |                 Group(1,
 996 |                       Byte('b'),
 997 |                       kPassive, true)),
 998 |             kPassive, true),
 999 |       std::vector<Mode>({kPassive, kPassive}),
1000 |       std::vector<int>({0, 1}),
1001 |       "(a(b))");
1002 |   EXPECT_PARSE_M_C(  // Sibling groups are numbered left to right.
1003 |       Concatenation(
1004 |           Group(0,
1005 |                 Byte('a'),
1006 |                 kPassive, true),
1007 |           Group(1,
1008 |                 Byte('b'),
1009 |                 kPassive, true)),
1010 |       std::vector<Mode>({kPassive, kPassive}),
1011 |       std::vector<int>({0, 1}),
1012 |       "(a)(b)");
1013 | }
1014 | 
1015 | TEST(Parse_M_C, Quantifiers) {  // Each quantifier is expected inside a non-capturing group: kMaximal by default, kMinimal with a trailing '?'.
1016 |   EXPECT_PARSE_M_C(  // a*: KleeneClosure, kMaximal.
1017 |       Group(0,
1018 |             KleeneClosure(Byte('a')),
1019 |             kMaximal, false),
1020 |       std::vector<Mode>({kMaximal}),
1021 |       std::vector<int>({}),
1022 |       "a*");
1023 |   EXPECT_PARSE_M_C(  // a*?: same expression, kMinimal.
1024 |       Group(0,
1025 |             KleeneClosure(Byte('a')),
1026 |             kMinimal, false),
1027 |       std::vector<Mode>({kMinimal}),
1028 |       std::vector<int>({}),
1029 |       "a*?");
1030 |   EXPECT_PARSE_M_C(  // a+: expected as 'a' followed by a*.
1031 |       Group(0,
1032 |             Concatenation(
1033 |                 Byte('a'),
1034 |                 KleeneClosure(Byte('a'))),
1035 |             kMaximal, false),
1036 |       std::vector<Mode>({kMaximal}),
1037 |       std::vector<int>({}),
1038 |       "a+");
1039 |   EXPECT_PARSE_M_C(  // a+?: same expression, kMinimal.
1040 |       Group(0,
1041 |             Concatenation(
1042 |                 Byte('a'),
1043 |                 KleeneClosure(Byte('a'))),
1044 |             kMinimal, false),
1045 |       std::vector<Mode>({kMinimal}),
1046 |       std::vector<int>({}),
1047 |       "a+?");
1048 |   EXPECT_PARSE_M_C(  // a?: expected as (empty string | 'a').
1049 |       Group(0,
1050 |             Disjunction(
1051 |                 EmptyString(),
1052 |                 Byte('a')),
1053 |             kMaximal, false),
1054 |       std::vector<Mode>({kMaximal}),
1055 |       std::vector<int>({}),
1056 |       "a?");
1057 |   EXPECT_PARSE_M_C(  // a??: same expression, kMinimal.
1058 |       Group(0,
1059 |             Disjunction(
1060 |                 EmptyString(),
1061 |                 Byte('a')),
1062 |             kMinimal, false),
1063 |       std::vector<Mode>({kMinimal}),
1064 |       std::vector<int>({}),
1065 |       "a??");
1066 |   EXPECT_PARSE_M_C(  // a{1}: just the byte, still wrapped in a kMaximal group.
1067 |       Group(0,
1068 |             Byte('a'),
1069 |             kMaximal, false),
1070 |       std::vector<Mode>({kMaximal}),
1071 |       std::vector<int>({}),
1072 |       "a{1}");
1073 |   EXPECT_PARSE_M_C(  // a{1}?: same expression, kMinimal.
1074 |       Group(0,
1075 |             Byte('a'),
1076 |             kMinimal, false),
1077 |       std::vector<Mode>({kMinimal}),
1078 |       std::vector<int>({}),
1079 |       "a{1}?");
1080 |   EXPECT_PARSE_M_C(  // a{1,}: same expected shape as a+.
1081 |       Group(0,
1082 |             Concatenation(
1083 |                 Byte('a'),
1084 |                 KleeneClosure(Byte('a'))),
1085 |             kMaximal, false),
1086 |       std::vector<Mode>({kMaximal}),
1087 |       std::vector<int>({}),
1088 |       "a{1,}");
1089 |   EXPECT_PARSE_M_C(  // a{1,}?: same expression, kMinimal.
1090 |       Group(0,
1091 |             Concatenation(
1092 |                 Byte('a'),
1093 |                 KleeneClosure(Byte('a'))),
1094 |             kMinimal, false),
1095 |       std::vector<Mode>({kMinimal}),
1096 |       std::vector<int>({}),
1097 |       "a{1,}?");
1098 |   EXPECT_PARSE_M_C(  // a{1,2}: expected as 'a' followed by (empty string | 'a').
1099 |       Group(0,
1100 |             Concatenation(
1101 |                 Byte('a'),
1102 |                 Disjunction(
1103 |                     EmptyString(),
1104 |                     Byte('a'))),
1105 |             kMaximal, false),
1106 |       std::vector<Mode>({kMaximal}),
1107 |       std::vector<int>({}),
1108 |       "a{1,2}");
1109 |   EXPECT_PARSE_M_C(  // a{1,2}?: same expression, kMinimal.
1110 |       Group(0,
1111 |             Concatenation(
1112 |                 Byte('a'),
1113 |                 Disjunction(
1114 |                     EmptyString(),
1115 |                     Byte('a'))),
1116 |             kMinimal, false),
1117 |       std::vector<Mode>({kMinimal}),
1118 |       std::vector<int>({}),
1119 |       "a{1,2}?");
1120 | }
1121 | 
1122 | TEST(Parse_M_C, ApplyGroups) {  // Checks which patterns get implicit (non-capturing) groups and which get none.
1123 |   EXPECT_PARSE_M_C(  // A lone "." is expected with no groups at all.
1124 |       AnyCharacter(),
1125 |       std::vector<Mode>({}),
1126 |       std::vector<int>({}),
1127 |       ".");
1128 |   EXPECT_PARSE_M_C(  // A character class is a plain Disjunction, no group.
1129 |       Disjunction(
1130 |           Byte('a'),
1131 |           Byte('b'),
1132 |           Byte('c')),
1133 |       std::vector<Mode>({}),
1134 |       std::vector<int>({}),
1135 |       "[abc]");
1136 |   EXPECT_PARSE_M_C(  // A negated class: complement of the class, intersected with AnyCharacter().
1137 |       Conjunction(
1138 |           Complement(
1139 |               Disjunction(
1140 |                   Byte('a'),
1141 |                   Byte('b'),
1142 |                   Byte('c'))),
1143 |           AnyCharacter()),
1144 |       std::vector<Mode>({}),
1145 |       std::vector<int>({}),
1146 |       "[^abc]");
1147 |   EXPECT_PARSE_M_C(  // Each alternative gets its own kPassive, non-capturing group (0, 1, 2).
1148 |       Disjunction(
1149 |           Group(0,
1150 |                 Concatenation(Byte('a'), Byte('a'), Byte('a')),
1151 |                 kPassive, false),
1152 |           Group(1,
1153 |                 Concatenation(Byte('b'), Byte('b'), Byte('b')),
1154 |                 kPassive, false),
1155 |           Group(2,
1156 |                 Concatenation(Byte('c'), Byte('c'), Byte('c')),
1157 |                 kPassive, false)),
1158 |       std::vector<Mode>({kPassive, kPassive, kPassive}),
1159 |       std::vector<int>({}),
1160 |       "aaa|bbb|ccc");
1161 |   EXPECT_PARSE_M_C(  // The complemented concatenation is wrapped in a kMaximal, non-capturing group.
1162 |       Group(0,
1163 |             Complement(
1164 |                 Concatenation(
1165 |                     Byte('a'),
1166 |                     Byte('b'),
1167 |                     Byte('c'))),
1168 |             kMaximal, false),
1169 |       std::vector<Mode>({kMaximal}),
1170 |       std::vector<int>({}),
1171 |       "!abc");
1172 | }
1173 | 
1174 | #define EXPECT_MATCH(expected, expected_values, str)  \
1175 |   do {  /* Uses exp1_, dfa_, fun1_ and tnfa_ from the enclosing scope. */ \
1176 |     std::vector<int> values;                          \
1177 |     if (expected) {  /* All four Match() calls must accept str. */ \
1178 |       EXPECT_TRUE(Match(exp1_, str));                 \
1179 |       EXPECT_TRUE(Match(dfa_, str));                  \
1180 |       EXPECT_TRUE(Match(fun1_, str));                 \
1181 |       EXPECT_TRUE(Match(tnfa_, str, &values));        \
1182 |       EXPECT_EQ(expected_values, values);  /* values is only passed to the tnfa_ call. */ \
1183 |     } else {  /* All four must reject; expected_values is not compared here. */ \
1184 |       EXPECT_FALSE(Match(exp1_, str));                \
1185 |       EXPECT_FALSE(Match(dfa_, str));                 \
1186 |       EXPECT_FALSE(Match(fun1_, str));                \
1187 |       EXPECT_FALSE(Match(tnfa_, str, &values));       \
1188 |     }                                                 \
1189 |   } while (0)  // do/while (0) lets the macro be used as a single statement.
1190 | 
1191 | class MatchTest : public testing::Test {  // Fixture holding one pattern in every form that EXPECT_MATCH exercises.
1192 |  protected:
1193 |   void ParseAll(llvm::StringRef str) {  // Parses str twice; either failure is fatal for this call.
1194 |     ASSERT_TRUE(Parse(str, &exp1_));  // Plain parse, feeds the DFA path.
1195 |     ASSERT_TRUE(Parse(str, &exp2_, &tnfa_.modes_, &tnfa_.captures_));  // Parse that also fills the TNFA's modes and captures.
1196 |   }
1197 | 
1198 |   void CompileAll() {  // Builds every matcher; expects exp1_ and exp2_ to be set already.
1199 |     Compile(exp1_, &dfa_);  // Expression -> DFA.
1200 |     Compile(dfa_, &fun1_);  // DFA -> Fun.
1201 |     Compile(exp2_, &tnfa_);  // Expression -> TNFA.
1202 |   }
1203 | 
1204 |   Exp exp1_;  // Expression from the two-argument Parse() (or assigned directly by a test).
1205 |   DFA dfa_;  // Compiled from exp1_.
1206 |   Fun fun1_;  // Compiled from dfa_.
1207 | 
1208 |   Exp exp2_;  // Expression from the Parse() call that also reports modes and captures.
1209 |   TNFA tnfa_;  // Compiled from exp2_; its modes_ and captures_ are filled by ParseAll().
1210 | };
1211 | 
1212 | TEST_F(MatchTest, EmptySet) {  // The empty set must match nothing, not even the empty input.
1213 |   exp1_ = exp2_ = EmptySet();  // Built directly, bypassing ParseAll().
1214 |   CompileAll();
1215 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1216 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1217 | }
1218 | 
1219 | TEST_F(MatchTest, EmptyString) {  // The empty string expression must match only the empty input.
1220 |   exp1_ = exp2_ = EmptyString();  // Built directly, bypassing ParseAll().
1221 |   CompileAll();
1222 |   EXPECT_MATCH(true, std::vector<int>({}), "");  // No groups, so no values are expected.
1223 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1224 | }
1225 | 
1226 | TEST_F(MatchTest, EscapeSequences_1) {  // The pattern is (\C) after C++ unescaping; it must match the one-byte input "a" and reject "".
1227 |   ParseAll("(\\C)");
1228 |   CompileAll();
1229 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1230 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");  // {0, 1}: presumably the group's begin/end offsets.
1231 | }
1232 | 
1233 | TEST_F(MatchTest, EscapeSequences_2) {  // \f \n \r \t in a pattern must match the control characters themselves.
1234 |   ParseAll("(\\f\\n\\r\\t)");
1235 |   CompileAll();
1236 |   EXPECT_MATCH(false, std::vector<int>({}), "fnrt");  // Not the bare letters.
1237 |   EXPECT_MATCH(true, std::vector<int>({0, 4}), "\f\n\r\t");  // The four actual control bytes.
1238 |   EXPECT_MATCH(false, std::vector<int>({}), "\\f\\n\\r\\t");  // Not the literal backslash sequences.
1239 | }
1240 | 
1241 | TEST_F(MatchTest, AnyCharacter) {  // "." must consume exactly one character, whatever its encoded length.
1242 |   ParseAll("(.)");
1243 |   CompileAll();
1244 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1245 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");  // 1 byte.
1246 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "¬");  // 2 bytes in UTF-8.
1247 |   EXPECT_MATCH(true, std::vector<int>({0, 3}), "兔");  // 3 bytes in UTF-8.
1248 |   EXPECT_MATCH(true, std::vector<int>({0, 4}), "💩");  // 4 bytes in UTF-8.
1249 | }
1250 | 
1251 | TEST_F(MatchTest, Character_1) {  // A one-byte literal character matches only itself.
1252 |   ParseAll("(a)");
1253 |   CompileAll();
1254 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1255 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");  // Group spans the single byte.
1256 |   EXPECT_MATCH(false, std::vector<int>({}), "X");
1257 | }
1258 | 
1259 | TEST_F(MatchTest, Character_2) {  // A literal character that is two bytes in UTF-8 matches only itself.
1260 |   ParseAll("(¬)");
1261 |   CompileAll();
1262 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1263 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "¬");  // Expected end value 2 equals the UTF-8 byte length.
1264 |   EXPECT_MATCH(false, std::vector<int>({}), "X");
1265 | }
1266 | 
1267 | TEST_F(MatchTest, Character_3) {  // A literal character that is three bytes in UTF-8 matches only itself.
1268 |   ParseAll("(兔)");
1269 |   CompileAll();
1270 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1271 |   EXPECT_MATCH(true, std::vector<int>({0, 3}), "兔");  // Expected end value 3 equals the UTF-8 byte length.
1272 |   EXPECT_MATCH(false, std::vector<int>({}), "X");
1273 | }
1274 | 
1275 | TEST_F(MatchTest, Character_4) {  // A literal character that is four bytes in UTF-8 matches only itself.
1276 |   ParseAll("(💩)");
1277 |   CompileAll();
1278 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1279 |   EXPECT_MATCH(true, std::vector<int>({0, 4}), "💩");  // Expected end value 4 equals the UTF-8 byte length.
1280 |   EXPECT_MATCH(false, std::vector<int>({}), "X");
1281 | }
1282 | 
1283 | TEST_F(MatchTest, CharacterClass_1) {  // A class mixing 1-, 2-, 3- and 4-byte characters matches exactly one member.
1284 |   ParseAll("([a¬兔💩])");
1285 |   CompileAll();
1286 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1287 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");
1288 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "¬");
1289 |   EXPECT_MATCH(true, std::vector<int>({0, 3}), "兔");
1290 |   EXPECT_MATCH(true, std::vector<int>({0, 4}), "💩");
1291 |   EXPECT_MATCH(false, std::vector<int>({}), "X");  // Not a member of the class.
1292 | }
1293 | 
1294 | TEST_F(MatchTest, CharacterClass_2) {  // The negated class rejects every listed member and the empty input.
1295 |   ParseAll("([^a¬兔💩])");
1296 |   CompileAll();
1297 |   EXPECT_MATCH(false, std::vector<int>({}), "");  // Still needs one character to consume.
1298 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1299 |   EXPECT_MATCH(false, std::vector<int>({}), "¬");
1300 |   EXPECT_MATCH(false, std::vector<int>({}), "兔");
1301 |   EXPECT_MATCH(false, std::vector<int>({}), "💩");
1302 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "X");  // Any single non-member character matches.
1303 | }
1304 | 
1305 | TEST_F(MatchTest, Quantifiers_1) {  // a* matches zero or more 'a'; the group is expected to span the whole input.
1306 |   ParseAll("(a*)");
1307 |   CompileAll();
1308 |   EXPECT_MATCH(true, std::vector<int>({0, 0}), "");  // Empty match: begin == end == 0.
1309 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");
1310 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "aa");
1311 |   EXPECT_MATCH(true, std::vector<int>({0, 3}), "aaa");
1312 | }
1313 | 
1314 | TEST_F(MatchTest, Quantifiers_2) {  // Two a* groups: the first is expected to take everything, the second to be empty at the end.
1315 |   ParseAll("(a*)(a*)");
1316 |   CompileAll();
1317 |   EXPECT_MATCH(true, std::vector<int>({0, 0, 0, 0}), "");  // Values read as begin1, end1, begin2, end2 (presumably offsets).
1318 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 1}), "a");
1319 |   EXPECT_MATCH(true, std::vector<int>({0, 2, 2, 2}), "aa");
1320 |   EXPECT_MATCH(true, std::vector<int>({0, 3, 3, 3}), "aaa");
1321 | }
1322 | 
1323 | TEST_F(MatchTest, Quantifiers_3) {  // Minimal a*? first: it is expected to take nothing, leaving everything to the second group.
1324 |   ParseAll("(a*?)(a*)");
1325 |   CompileAll();
1326 |   EXPECT_MATCH(true, std::vector<int>({0, 0, 0, 0}), "");
1327 |   EXPECT_MATCH(true, std::vector<int>({0, 0, 0, 1}), "a");  // First group empty at 0; second spans the input.
1328 |   EXPECT_MATCH(true, std::vector<int>({0, 0, 0, 2}), "aa");
1329 |   EXPECT_MATCH(true, std::vector<int>({0, 0, 0, 3}), "aaa");
1330 | }
1331 | 
1332 | TEST_F(MatchTest, Quantifiers_4) {  // a+ needs at least one 'a'.
1333 |   ParseAll("(a+)");
1334 |   CompileAll();
1335 |   EXPECT_MATCH(false, std::vector<int>({}), "");  // Zero occurrences are rejected.
1336 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");
1337 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "aa");
1338 |   EXPECT_MATCH(true, std::vector<int>({0, 3}), "aaa");
1339 | }
1340 | 
1341 | TEST_F(MatchTest, Quantifiers_5) {  // Two a+ groups need two 'a's; the first is expected to leave exactly one for the second.
1342 |   ParseAll("(a+)(a+)");
1343 |   CompileAll();
1344 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1345 |   EXPECT_MATCH(false, std::vector<int>({}), "a");  // Only one 'a' for two mandatory groups.
1346 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1347 |   EXPECT_MATCH(true, std::vector<int>({0, 2, 2, 3}), "aaa");  // First group takes two, second takes the last.
1348 | }
1349 | 
1350 | TEST_F(MatchTest, Quantifiers_6) {  // Minimal a+? first: it is expected to take a single 'a', the second group the rest.
1351 |   ParseAll("(a+?)(a+)");
1352 |   CompileAll();
1353 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1354 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1355 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1356 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 3}), "aaa");  // First group takes one, second takes two.
1357 | }
1358 | 
1359 | TEST_F(MatchTest, Quantifiers_7) {  // a? matches zero or one 'a' and nothing longer.
1360 |   ParseAll("(a?)");
1361 |   CompileAll();
1362 |   EXPECT_MATCH(true, std::vector<int>({0, 0}), "");
1363 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");
1364 |   EXPECT_MATCH(false, std::vector<int>({}), "aa");  // The whole input must match, so extra 'a's fail.
1365 |   EXPECT_MATCH(false, std::vector<int>({}), "aaa");
1366 | }
1367 | 
1368 | TEST_F(MatchTest, Quantifiers_8) {  // Two a? groups accept up to two 'a's; with one 'a' the first group is expected to take it.
1369 |   ParseAll("(a?)(a?)");
1370 |   CompileAll();
1371 |   EXPECT_MATCH(true, std::vector<int>({0, 0, 0, 0}), "");
1372 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 1}), "a");  // First group takes the 'a'; second is empty.
1373 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1374 |   EXPECT_MATCH(false, std::vector<int>({}), "aaa");
1375 | }
1376 | 
1377 | TEST_F(MatchTest, Quantifiers_9) {  // Minimal a?? first: with one 'a' the second group is expected to take it.
1378 |   ParseAll("(a?""?)(a?)");  // Avoid trigraph.
1379 |   CompileAll();
1380 |   EXPECT_MATCH(true, std::vector<int>({0, 0, 0, 0}), "");
1381 |   EXPECT_MATCH(true, std::vector<int>({0, 0, 0, 1}), "a");  // First group empty; second takes the 'a'.
1382 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");  // Both groups must take one for the whole input to match.
1383 |   EXPECT_MATCH(false, std::vector<int>({}), "aaa");
1384 | }
1385 | 
1386 | TEST_F(MatchTest, Quantifiers_10) {  // a{1} matches exactly one 'a'.
1387 |   ParseAll("(a{1})");
1388 |   CompileAll();
1389 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1390 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");
1391 |   EXPECT_MATCH(false, std::vector<int>({}), "aa");
1392 |   EXPECT_MATCH(false, std::vector<int>({}), "aaa");
1393 | }
1394 | 
1395 | TEST_F(MatchTest, Quantifiers_11) {  // Two a{1} groups match exactly "aa", one 'a' per group.
1396 |   ParseAll("(a{1})(a{1})");
1397 |   CompileAll();
1398 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1399 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1400 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1401 |   EXPECT_MATCH(false, std::vector<int>({}), "aaa");
1402 | }
1403 | 
1404 | TEST_F(MatchTest, Quantifiers_12) {  // With an exact count the minimal '?' changes nothing: same expectations as "(a{1})(a{1})".
1405 |   ParseAll("(a{1}?)(a{1})");
1406 |   CompileAll();
1407 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1408 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1409 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1410 |   EXPECT_MATCH(false, std::vector<int>({}), "aaa");
1411 | }
1412 | 
1413 | TEST_F(MatchTest, Quantifiers_13) {  // a{1,} behaves like a+: one or more 'a'.
1414 |   ParseAll("(a{1,})");
1415 |   CompileAll();
1416 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1417 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");
1418 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "aa");
1419 |   EXPECT_MATCH(true, std::vector<int>({0, 3}), "aaa");
1420 | }
1421 | 
1422 | TEST_F(MatchTest, Quantifiers_14) {  // Two a{1,} groups: same expectations as "(a+)(a+)".
1423 |   ParseAll("(a{1,})(a{1,})");
1424 |   CompileAll();
1425 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1426 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1427 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1428 |   EXPECT_MATCH(true, std::vector<int>({0, 2, 2, 3}), "aaa");  // First group takes two, leaving one for the second.
1429 | }
1430 | 
1431 | TEST_F(MatchTest, Quantifiers_15) {  // Minimal a{1,}? first: same expectations as "(a+?)(a+)".
1432 |   ParseAll("(a{1,}?)(a{1,})");
1433 |   CompileAll();
1434 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1435 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1436 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1437 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 3}), "aaa");  // First group takes one, second takes two.
1438 | }
1439 | 
1440 | TEST_F(MatchTest, Quantifiers_16) {  // a{1,2} matches one or two 'a's only.
1441 |   ParseAll("(a{1,2})");
1442 |   CompileAll();
1443 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1444 |   EXPECT_MATCH(true, std::vector<int>({0, 1}), "a");
1445 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "aa");
1446 |   EXPECT_MATCH(false, std::vector<int>({}), "aaa");  // Above the upper bound.
1447 | }
1448 | 
1449 | TEST_F(MatchTest, Quantifiers_17) {  // Two a{1,2} groups: the first is expected to take as many as it can while the second still gets one.
1450 |   ParseAll("(a{1,2})(a{1,2})");
1451 |   CompileAll();
1452 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1453 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1454 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1455 |   EXPECT_MATCH(true, std::vector<int>({0, 2, 2, 3}), "aaa");  // First group takes two, second takes one.
1456 | }
1457 | 
1458 | TEST_F(MatchTest, Quantifiers_18) {  // Minimal a{1,2}? first: it is expected to take one 'a', the second group the rest.
1459 |   ParseAll("(a{1,2}?)(a{1,2})");
1460 |   CompileAll();
1461 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1462 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1463 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 2}), "aa");
1464 |   EXPECT_MATCH(true, std::vector<int>({0, 1, 1, 3}), "aaa");  // First group takes one, second takes two.
1465 | }
1466 | 
1467 | TEST_F(MatchTest, Concatenation) {  // "aa" matches exactly two 'a's; shorter and longer inputs are rejected.
1468 |   ParseAll("(aa)");
1469 |   CompileAll();
1470 |   EXPECT_MATCH(false, std::vector<int>({}), "");
1471 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1472 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "aa");
1473 |   EXPECT_MATCH(false, std::vector<int>({}), "aaa");
1474 | }
1475 | 
1476 | TEST_F(MatchTest, Complement_1) {  // !a matches every tested input except "a" itself.
1477 |   ParseAll("(!a)");
1478 |   CompileAll();
1479 |   EXPECT_MATCH(true, std::vector<int>({0, 0}), "");  // The empty input is not "a", so it matches.
1480 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1481 |   EXPECT_MATCH(true, std::vector<int>({0, 2}), "aa");
1482 |   EXPECT_MATCH(true, std::vector<int>({0, 3}), "aaa");
1483 | }
1484 | 
1485 | TEST_F(MatchTest, Complement_2) {  // A capturing group inside a complement is expected to be reported as -1, -1.
1486 |   ParseAll("(!(a))");
1487 |   CompileAll();
1488 |   EXPECT_MATCH(true, std::vector<int>({0, 0, -1, -1}), "");  // Outer group spans the input; inner group has no value.
1489 |   EXPECT_MATCH(false, std::vector<int>({}), "a");
1490 |   EXPECT_MATCH(true, std::vector<int>({0, 2, -1, -1}), "aa");
1491 |   EXPECT_MATCH(true, std::vector<int>({0, 3, -1, -1}), "aaa");
1492 | }
1493 | 
1494 | TEST_F(MatchTest, Conjunction_1) {  // '&' requires both sides to match the same input: starts with 'a' and ends with 'b'.
1495 |   ParseAll("(a.)&(.b)");
1496 |   CompileAll();
1497 |   EXPECT_MATCH(false, std::vector<int>({}), "aa");  // Right side fails.
1498 |   EXPECT_MATCH(true, std::vector<int>({0, 2, 0, 2}), "ab");  // Both groups are expected to span the whole input.
1499 |   EXPECT_MATCH(false, std::vector<int>({}), "ba");  // Both sides fail.
1500 |   EXPECT_MATCH(false, std::vector<int>({}), "bb");  // Left side fails.
1501 | }
1502 | 
1503 | TEST_F(MatchTest, Conjunction_2) {  // Same idea with .* in the middle: any length, must start with 'a' and end with 'b'.
1504 |   ParseAll("(a.*)&(.*b)");
1505 |   CompileAll();
1506 |   EXPECT_MATCH(false, std::vector<int>({}), "aa");
1507 |   EXPECT_MATCH(true, std::vector<int>({0, 2, 0, 2}), "ab");  // Both groups are expected to span the whole input.
1508 |   EXPECT_MATCH(false, std::vector<int>({}), "ba");
1509 |   EXPECT_MATCH(false, std::vector<int>({}), "bb");
1510 |   EXPECT_MATCH(false, std::vector<int>({}), "aXa");
1511 |   EXPECT_MATCH(true, std::vector<int>({0, 3, 0, 3}), "aXb");  // Three-character input satisfying both sides.
1512 |   EXPECT_MATCH(false, std::vector<int>({}), "bXa");
1513 |   EXPECT_MATCH(false, std::vector<int>({}), "bXb");
1514 | }
1515 | 
1516 | TEST_F(MatchTest, Disjunction_1) {  // '|' needs either side to match; the group of the side not reported is expected as -1, -1.
1517 |   ParseAll("(a.)|(.b)");
1518 |   CompileAll();
1519 |   EXPECT_MATCH(true, std::vector<int>({0, 2, -1, -1}), "aa");  // Only the left side matches.
1520 |   EXPECT_MATCH(true, std::vector<int>({0, 2, -1, -1}), "ab");  // Both sides could match; the left one is expected to be reported.
1521 |   EXPECT_MATCH(false, std::vector<int>({}), "ba");  // Neither side matches.
1522 |   EXPECT_MATCH(true, std::vector<int>({-1, -1, 0, 2}), "bb");  // Only the right side matches.
1523 | }
1524 | 
1525 | TEST_F(MatchTest, Disjunction_2) {  // Same idea with .*: input starts with 'a' (left group) or ends with 'b' (right group).
1526 |   ParseAll("(a.*)|(.*b)");
1527 |   CompileAll();
1528 |   EXPECT_MATCH(true, std::vector<int>({0, 2, -1, -1}), "aa");
1529 |   EXPECT_MATCH(true, std::vector<int>({0, 2, -1, -1}), "ab");  // Both sides could match; the left one is expected to be reported.
1530 |   EXPECT_MATCH(false, std::vector<int>({}), "ba");
1531 |   EXPECT_MATCH(true, std::vector<int>({-1, -1, 0, 2}), "bb");
1532 |   EXPECT_MATCH(true, std::vector<int>({0, 3, -1, -1}), "aXa");
1533 |   EXPECT_MATCH(true, std::vector<int>({0, 3, -1, -1}), "aXb");  // Again the left side is expected to win.
1534 |   EXPECT_MATCH(false, std::vector<int>({}), "bXa");
1535 |   EXPECT_MATCH(true, std::vector<int>({-1, -1, 0, 3}), "bXb");
1536 | }
1537 | 
1538 | TEST_F(MatchTest, PerlSemantics_1) {  // The first alternative (minimal a*?) is expected to win and stay empty; the trailing (a*) takes everything.
1539 |   ParseAll("(?:(a*?)|(a*))(a*)");
1540 |   CompileAll();
1541 |   EXPECT_MATCH(true, std::vector<int>({0, 0, -1, -1, 0, 0}), "");  // Values read as three begin/end pairs; -1, -1 marks the unused alternative.
1542 |   EXPECT_MATCH(true, std::vector<int>({0, 0, -1, -1, 0, 1}), "a");
1543 |   EXPECT_MATCH(true, std::vector<int>({0, 0, -1, -1, 0, 2}), "aa");
1544 |   EXPECT_MATCH(true, std::vector<int>({0, 0, -1, -1, 0, 3}), "aaa");
1545 | }
1546 | 
1547 | TEST_F(MatchTest, PerlSemantics_2) {  // Alternatives swapped: the first alternative (maximal a*) is expected to win and take everything.
1548 |   ParseAll("(?:(a*)|(a*?))(a*)");
1549 |   CompileAll();
1550 |   EXPECT_MATCH(true, std::vector<int>({0, 0, -1, -1, 0, 0}), "");
1551 |   EXPECT_MATCH(true, std::vector<int>({0, 1, -1, -1, 1, 1}), "a");  // Trailing group is empty at the end of the input.
1552 |   EXPECT_MATCH(true, std::vector<int>({0, 2, -1, -1, 2, 2}), "aa");
1553 |   EXPECT_MATCH(true, std::vector<int>({0, 3, -1, -1, 3, 3}), "aaa");
1554 | }
1555 | 
1556 | // http://swtch.com/~rsc/regexp/regexp2.html#posix
1557 | TEST_F(MatchTest, PerlSemantics_3) {  // Repeated capturing group over an alternation with overlapping choices.
1558 |   ParseAll("(a|bcdef|g|ab|c|d|e|efg|fg)*");
1559 |   CompileAll();
1560 |   EXPECT_MATCH(true, std::vector<int>({6, 7}), "abcdefg");  // Expected {6, 7} covers the final "g", i.e. the last iteration when split as a, bcdef, g.
1561 | }
1562 | 
1563 | }  // namespace redgrep
1564 | 


--------------------------------------------------------------------------------