├── .bazelrc ├── BUILD.bazel ├── LICENSE ├── MODULE.bazel ├── README.md ├── WORKSPACE.bazel ├── WORKSPACE.bzlmod ├── internal_configure.bzl ├── libutf-BUILD.bazel ├── parser.yy ├── redasm.cc ├── reddot.cc ├── redgrep.cc ├── redgrep.h ├── redgrep_main.cc ├── regexp.cc ├── regexp.h └── regexp_test.cc /.bazelrc: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Enable layering check features. Useful on Clang only. 16 | build --features=layering_check 17 | # Enable parse headers features. Enforcing that headers are self-contained. 18 | build --features=parse_headers 19 | 20 | # LLVM requires C++17 at minimum. 21 | build --enable_platform_specific_config 22 | build:linux --cxxopt=-std=c++17 23 | build:macos --cxxopt=-std=c++17 24 | build:windows --cxxopt=/std:c++17 25 | -------------------------------------------------------------------------------- /BUILD.bazel: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | licenses(["notice"]) 16 | 17 | exports_files(["LICENSE"]) 18 | 19 | genrule( 20 | name = "parser", 21 | srcs = ["parser.yy"], 22 | outs = [ 23 | "parser.tab.cc", 24 | "parser.tab.hh", 25 | ], 26 | cmd = "bison -o $(location parser.tab.cc) $<", 27 | ) 28 | 29 | cc_library( 30 | name = "library", 31 | srcs = [ 32 | "redgrep.cc", 33 | "regexp.cc", 34 | ":parser", 35 | ], 36 | hdrs = [ 37 | "redgrep.h", 38 | "regexp.h", 39 | ], 40 | deps = [ 41 | "@libutf//:utf", 42 | "@local_config_llvm//:llvm", 43 | ], 44 | ) 45 | 46 | cc_test( 47 | name = "regexp_test", 48 | srcs = ["regexp_test.cc"], 49 | deps = [ 50 | ":library", 51 | "@googletest//:gtest", 52 | "@googletest//:gtest_main", 53 | ], 54 | ) 55 | 56 | cc_binary( 57 | name = "reddot", 58 | srcs = ["reddot.cc"], 59 | deps = [":library"], 60 | ) 61 | 62 | cc_binary( 63 | name = "redasm", 64 | srcs = ["redasm.cc"], 65 | deps = [":library"], 66 | ) 67 | 68 | cc_binary( 69 | name = "redgrep", 70 | srcs = ["redgrep_main.cc"], 71 | deps = [":library"], 72 | ) 73 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 
12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /MODULE.bazel: -------------------------------------------------------------------------------- 1 | # Copyright 2024 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | module( 16 | name = "redgrep", 17 | version = "0.0.0", 18 | ) 19 | 20 | internal_configure = use_extension("//:internal_configure.bzl", "internal_configure_extension") 21 | use_repo(internal_configure, "libutf", "local_config_llvm") 22 | 23 | bazel_dep(name = "googletest", version = "1.14.0.bcr.1", dev_dependency = True) 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # redgrep 2 | 3 | ## About 4 | 5 | redgrep is a grep based on regular expression derivatives. That is, it uses 6 | regular expression derivatives to construct the DFA. It then uses LLVM to JIT 7 | the DFA. 8 | 9 | Since regular expression derivatives permit the three basic Boolean operations 10 | of disjunction (`|`), conjunction (`&`) and complement (`!`), redgrep enables 11 | you to write very powerful regular expressions very easily and guarantees to 12 | match them in linear time. 13 | 14 | ## Building 15 | 16 | You must have Bazel, GNU bison and either GCC or Clang. 17 | 18 | redgrep attempts to keep up with LLVM development, so you should 19 | [get the source code and build LLVM](https://llvm.org/docs/GettingStarted.html#getting-the-source-code-and-building-llvm). 20 | (Debian and Ubuntu users might prefer to install the 21 | [nightly packages](https://apt.llvm.org/) instead.) 22 | 23 | `llvm-config-17` must be in your path. 24 | 25 | ## Contact 26 | 27 | [redgrep@googlegroups.com](mailto:redgrep@googlegroups.com) 28 | 29 | ## Disclaimer 30 | 31 | This is not an official Google product. 32 | -------------------------------------------------------------------------------- /WORKSPACE.bazel: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | workspace(name = "com_github_google_redgrep") 16 | -------------------------------------------------------------------------------- /WORKSPACE.bzlmod: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | workspace(name = "com_github_google_redgrep") 16 | -------------------------------------------------------------------------------- /internal_configure.bzl: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

def _which(repository_ctx, program):
    """Returns the path reported by `repository_ctx.which(program)`.

    Fails the build with a "Finding ... failed" message when the lookup
    returns nothing.
    """
    path = repository_ctx.which(program)
    if not path:
        fail("Finding %r failed" % (program,))
    return path

def _execute(repository_ctx, arguments):
    """Runs `arguments` and returns its stdout with surrounding whitespace stripped.

    Fails the build, quoting the command and its stderr, when the command
    exits with a non-zero return code.
    """
    result = repository_ctx.execute(arguments)
    if result.return_code:
        fail("Executing %r failed: %r" % (arguments, result.stderr))
    return result.stdout.strip()

def _llvm_repository_impl(repository_ctx):
    """Generates a repository exposing the local LLVM install as `:llvm`.

    Queries `llvm-config-17` for the library files and the include directory,
    symlinks the filesystem root into the repository as `ROOT`, and writes a
    BUILD.bazel whose cc_library refers to those absolute paths through `ROOT`.
    """
    llvm_config = _which(repository_ctx, "llvm-config-17")

    # `--libfiles` prints a whitespace-separated list. Each library must become
    # its own `srcs` entry; concatenating the raw string only works when exactly
    # one library is reported.
    libfiles = [
        libfile
        for libfile in _execute(repository_ctx, [llvm_config, "--libfiles"]).replace("\n", " ").split(" ")
        if libfile
    ]
    includedir = _execute(repository_ctx, [llvm_config, "--includedir"])
    repository_ctx.symlink("/", "ROOT")
    repository_ctx.file(
        "BUILD.bazel",
        content = """\
cc_library(
    name = "llvm",
    srcs = ["ROOT" + libfile for libfile in {libfiles}],
    hdrs = glob(["ROOT" + {includedir} + "/**/*.*"]),
    includes = ["ROOT" + {includedir}],
    visibility = ["//visibility:public"],
)
""".format(
            libfiles = repr(libfiles),
            includedir = repr(includedir),
        ),
    )

_llvm_repository = repository_rule(implementation = _llvm_repository_impl)

def _internal_configure_extension_impl(module_ctx):
    """Module extension body: declares the `libutf` and `local_config_llvm` repositories."""
    http_archive(
        name = "libutf",
        build_file = "//:libutf-BUILD.bazel",
        strip_prefix = "libutf-master",
        urls = ["https://github.com/cls/libutf/archive/master.zip"],
    )
    _llvm_repository(
        name = "local_config_llvm",
    )

internal_configure_extension = module_extension(implementation = _internal_configure_extension_impl)

-------------------------------------------------------------------------------- /libutf-BUILD.bazel: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | licenses(["notice"]) 16 | 17 | exports_files(["LICENSE"]) 18 | 19 | cc_library( 20 | name = "utf", 21 | srcs = glob(["utf/*.c"]) + ["runetype/isvalidrune.c"], 22 | hdrs = ["include/utf.h"], 23 | includes = ["include"], 24 | visibility = ["//visibility:public"], 25 | ) 26 | -------------------------------------------------------------------------------- /parser.yy: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/* NOTE(review): %header looks like a Bison 3.8 spelling of %defines; confirm
   whether the %require below should be raised to match. */
%require "3.2"
%language "c++"
%define api.value.type {redgrep::Exp}
%header
%lex-param {llvm::StringRef* str}
%parse-param {llvm::StringRef* str} {redgrep::Exp* exp}

%code requires {
#include "llvm/ADT/StringRef.h"
#include "regexp.h"
}

%code {
#include "utf.h"
namespace yy {
int yylex(redgrep::Exp* exp, llvm::StringRef* str);
}  // namespace yy
}

%left DISJUNCTION
%left CONJUNCTION
%left COMPLEMENT
%right CONCATENATION
%left QUANTIFIER
%nonassoc LEFT_PARENTHESIS RIGHT_PARENTHESIS
%nonassoc FUNDAMENTAL
%token ERROR

%%

/* The whole input is one expression; hand it back through the parse param. */
start:
  expression
  { *exp = $1; }

expression:
  expression DISJUNCTION expression
  { $$ = redgrep::Disjunction($1, $3); }
| expression CONJUNCTION expression
  { $$ = redgrep::Conjunction($1, $3); }
| COMPLEMENT expression
  { $$ = redgrep::Complement($2); }
| expression expression %prec CONCATENATION
  { $$ = redgrep::Concatenation($1, $2); }
| expression QUANTIFIER
  { /* yylex delivers the token as Quantifier(Group(...), min, max): unpack
       the bounds and the Group's mode/capture, then rebuild both around $1. */
    redgrep::Exp sub; int min; int max;
    std::tie(sub, min, max) = $2->quantifier();
    redgrep::Mode mode; bool capture;
    std::tie(std::ignore, std::ignore, mode, capture) = sub->group();
    $$ = redgrep::Quantifier($1, min, max);
    $$ = redgrep::Group(-1, $$, mode, capture); }
| LEFT_PARENTHESIS expression RIGHT_PARENTHESIS
  { /* The LEFT_PARENTHESIS token carries a placeholder Group holding the
       mode/capture flags chosen by yylex. */
    redgrep::Mode mode; bool capture;
    std::tie(std::ignore, std::ignore, mode, capture) = $1->group();
    $$ = redgrep::Group(-1, $2, mode, capture); }
| FUNDAMENTAL
  { $$ = $1; }

%%

// Decodes one rune from the front of `input` into `*character`.
// On success (charntorune reports a positive length) the decoded bytes are
// dropped from `input` and true is returned; otherwise `input` is left
// untouched and false is returned.
static bool Character(llvm::StringRef* input,
                      Rune* character) {
  int len = charntorune(character, input->data(), input->size());
  if (len > 0) {
    *input = input->drop_front(len);
    return true;
  }
  return false;
}

// Parses the remainder of a bracket expression; yylex calls this after it has
// consumed the '['. A leading '^' sets `*complement`. Runes are collected into
// `*characters` until an unescaped ']' is read, at which point true is
// returned. A backslash escapes the next rune, with \f \n \r \t mapped to the
// corresponding control characters. Returns false if the input runs out
// before the closing ']'.
static bool CharacterClass(llvm::StringRef* input,
                           std::set<Rune>* characters,
                           bool* complement) {
  if (input->startswith("^")) {
    *input = input->drop_front(1);
    *complement = true;
  } else {
    *complement = false;
  }
  Rune character;
  while (Character(input, &character)) {
    switch (character) {
      case '\\':
        if (!Character(input, &character)) {
          return false;
        }
        switch (character) {
          case 'f':
            character = '\f';
            break;
          case 'n':
            character = '\n';
            break;
          case 'r':
            character = '\r';
            break;
          case 't':
            character = '\t';
            break;
          default:
            break;
        }
        // The (possibly translated) escaped rune is inserted like any other.
        [[fallthrough]];
      default:
        characters->insert(character);
        break;
      case ']':
        return true;
    }
  }
  return false;
}

// Computes the bounds for the quantifier introduced by `character`, which must
// be one of '*', '+', '?' or '{' (anything else aborts). For '{', the rest of
// the `{n}`, `{n,}` or `{n,m}` form is consumed from `input`. On success
// `*min` and `*max` are set and true is returned; `*max` is -1 for the forms
// without an upper bound ('*', '+', `{n,}`). Returns false for a malformed
// `{...}`.
static bool Quantifier(Rune character,
                       llvm::StringRef* input,
                       int* min,
                       int* max) {
  static constexpr char kDigits[] = "0123456789";
  // Consumes a run of at most nine leading digits into `*output`. The run must
  // be followed by some non-digit: that is what stops sscanf inside the
  // buffer, and a bound that reaches the end of input cannot be valid anyway
  // because the closing '}' would be missing.
  auto Number = [&input](int* output) -> bool {
    if (input->find_first_of(kDigits) == 0) {
      size_t len = input->find_first_not_of(kDigits);
      if (len != llvm::StringRef::npos && len <= 9) {
        sscanf(input->data(), "%d", output);
        *input = input->drop_front(len);
        return true;
      }
    }
    return false;
  };
  switch (character) {
    case '*':
      *min = 0;
      *max = -1;
      return true;
    case '+':
      *min = 1;
      *max = -1;
      return true;
    case '?':
      *min = 0;
      *max = 1;
      return true;
    case '{': {
      if (Number(min) && *min >= 0) {
        if (input->startswith("}")) {  // {n}
          *input = input->drop_front(1);
          *max = *min;
          return true;
        }
        if (input->startswith(",")) {
          *input = input->drop_front(1);
          if (input->startswith("}")) {  // {n,}
            *input = input->drop_front(1);
            *max = -1;
            return true;
          }
          if (Number(max) && *max >= *min) {
            if (input->startswith("}")) {  // {n,m}
              *input = input->drop_front(1);
              return true;
            }
          }
        }
      }
      return false;
    }
    default:
      break;
  }
  // Only reachable when called with a character that is not a quantifier.
  abort();
}

namespace yy {

// Lexer: consumes one token from the front of `*str`, stores its semantic
// value (if any) in `*exp`, and returns the token type. Returns 0 when no
// further rune can be decoded and ERROR for malformed quantifiers, character
// classes and trailing backslashes.
int yylex(redgrep::Exp* exp, llvm::StringRef* str) {
  Rune character;
  if (!Character(str, &character)) {
    return 0;
  }
  typedef parser::token_type TokenType;
  switch (character) {
    case '|':
      return TokenType::DISJUNCTION;
    case '&':
      return TokenType::CONJUNCTION;
    case '!':
      return TokenType::COMPLEMENT;
    case '*':
    case '+':
    case '?':
    case '{': {
      int min, max;
      if (!Quantifier(character, str, &min, &max)) {
        return TokenType::ERROR;
      }
      redgrep::Mode mode;
      bool capture = false;
      // A '?' directly after the quantifier selects minimal mode.
      if (str->startswith("?")) {
        *str = str->drop_front(1);
        mode = redgrep::kMinimal;
      } else {
        mode = redgrep::kMaximal;
      }
      // Somewhat perversely, we bundle the Group into the Quantifier and then
      // rebundle them back in the parser action.
      *exp = redgrep::Group(-1, redgrep::Byte(-1), mode, capture);
      *exp = redgrep::Quantifier(*exp, min, max);
      return TokenType::QUANTIFIER;
    }
    case '(': {
      redgrep::Mode mode = redgrep::kPassive;
      bool capture;
      // "(?:" opens a non-capturing group.
      if (str->startswith("?:")) {
        *str = str->drop_front(2);
        capture = false;
      } else {
        capture = true;
      }
      // Placeholder Group: only mode/capture are read back by the parser.
      *exp = redgrep::Group(-1, redgrep::Byte(-1), mode, capture);
      return TokenType::LEFT_PARENTHESIS;
    }
    case ')':
      return TokenType::RIGHT_PARENTHESIS;
    case '[': {
      std::set<Rune> characters;
      bool complement;
      if (!CharacterClass(str, &characters, &complement) ||
          characters.empty()) {
        return TokenType::ERROR;
      }
      *exp = redgrep::CharacterClass(characters, complement);
      return TokenType::FUNDAMENTAL;
    }
    case '\\':
      if (!Character(str, &character)) {
        return TokenType::ERROR;
      }
      switch (character) {
        case 'C':
          *exp = redgrep::AnyByte();
          return TokenType::FUNDAMENTAL;
        case 'f':
          character = '\f';
          break;
        case 'n':
          character = '\n';
          break;
        case 'r':
          character = '\r';
          break;
        case 't':
          character = '\t';
          break;
        default:
          break;
      }
      // The (possibly translated) escaped rune becomes a literal character.
      [[fallthrough]];
    default:
      *exp = redgrep::Character(character);
      return TokenType::FUNDAMENTAL;
    case '.':
      *exp = redgrep::AnyCharacter();
      return TokenType::FUNDAMENTAL;
  }
}

// Error callback required by the generated parser; the message is currently
// discarded.
void parser::error(const std::string&) {
  // TODO(junyer): Do something?
}

}  // namespace yy

-------------------------------------------------------------------------------- /redasm.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #include "llvm-c/Disassembler.h" 23 | #include "llvm/ExecutionEngine/ExecutionEngine.h" 24 | #include "llvm/Support/TargetSelect.h" 25 | #include "llvm/Target/TargetMachine.h" 26 | #include "regexp.h" 27 | 28 | int main(int argc, char** argv) { 29 | const char* argv1 = argv[1]; 30 | if (argv1 == nullptr) { 31 | errx(1, "regular expression not specified"); 32 | } 33 | redgrep::Exp exp; 34 | if (!redgrep::Parse(argv1, &exp)) { 35 | errx(1, "parse error"); 36 | } 37 | redgrep::DFA dfa; 38 | int nstates = redgrep::Compile(exp, &dfa); 39 | printf("; dfa is %d states\n", nstates); 40 | redgrep::Fun fun; 41 | int nbytes = redgrep::Compile(dfa, &fun); 42 | printf("; fun is %d bytes\n", nbytes); 43 | 44 | std::string triple = fun.engine_->getTargetMachine()->getTargetTriple().str(); 45 | std::string cpu(fun.engine_->getTargetMachine()->getTargetCPU()); 46 | printf("; target is %s (%s)\n", triple.c_str(), cpu.c_str()); 47 | 48 | // We need these for the disassembler. 
49 | llvm::InitializeAllTargetInfos(); 50 | llvm::InitializeAllTargets(); 51 | llvm::InitializeAllTargetMCs(); 52 | llvm::InitializeAllAsmPrinters(); 53 | llvm::InitializeAllAsmParsers(); 54 | llvm::InitializeAllDisassemblers(); 55 | 56 | LLVMDisasmContextRef disasm = LLVMCreateDisasmCPU( 57 | triple.c_str(), cpu.c_str(), nullptr, 0, nullptr, nullptr); 58 | // These are increased and decreased, respectively, as we iterate. 59 | uint8_t* addr = reinterpret_cast(fun.machine_code_addr_); 60 | uint64_t size = fun.machine_code_size_; 61 | // These are the bounds. 62 | uint8_t* base = addr; 63 | uint8_t* limit = addr + size; 64 | while (addr < limit) { 65 | char buf[128]; 66 | size_t len = LLVMDisasmInstruction(disasm, addr, size, 0, buf, sizeof buf); 67 | if (len == 0) { 68 | errx(1, "bad machine code at %td (%p)", addr - base, addr); 69 | } 70 | printf("%8td%s\n", addr - base, buf); 71 | addr += len; 72 | size -= len; 73 | } 74 | LLVMDisasmDispose(disasm); 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /reddot.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "regexp.h" 27 | 28 | static void EmitHeader(const char* str) { 29 | printf("digraph reddot {\n"); 30 | // TODO(junyer): Escape double quotes? 31 | printf("label=\"%s\"\n", str); 32 | printf("labelloc=\"t\"\n"); 33 | } 34 | 35 | static void EmitState(int curr, const char* fillcolor) { 36 | printf("s%d", curr); 37 | printf(" [style=filled fillcolor=%s]", fillcolor); 38 | printf("\n"); 39 | } 40 | 41 | static void EmitTransition(int curr, int next, int byte) { 42 | printf("s%d -> s%d [label=\"", curr, next); 43 | if (byte == -1) { 44 | printf("\" style=dashed]"); 45 | } else { 46 | printf("%02X\"]", byte); 47 | } 48 | printf("\n"); 49 | } 50 | 51 | static void EmitTransition(int curr, int next, int begin, int end) { 52 | printf("s%d -> s%d [label=\"", curr, next); 53 | printf("%02X-%02X\"]", begin, end); 54 | printf("\n"); 55 | } 56 | 57 | static void EmitFooter() { 58 | printf("}\n"); 59 | } 60 | 61 | inline void HandleImpl(const char* str, int nstates, const redgrep::FA& fa, 62 | const std::set>& transition_set) { 63 | EmitHeader(str); 64 | for (int i = 0; i < nstates; ++i) { 65 | int curr = i; 66 | if (fa.IsError(curr)) { 67 | // This is the error state. 68 | EmitState(curr, "red"); 69 | } else if (fa.IsAccepting(curr)) { 70 | // This is an accepting state. 71 | EmitState(curr, "green"); 72 | } else { 73 | // This is a normal state. 
74 | EmitState(curr, "white"); 75 | } 76 | } 77 | std::map, std::list>> transition_map; 78 | for (const auto& i : transition_set) { 79 | int curr; int next; int byte; 80 | std::tie(curr, next, byte) = i; 81 | if (byte == -1) { 82 | EmitTransition(curr, next, byte); 83 | } else { 84 | auto& range_list = transition_map[std::make_pair(curr, next)]; 85 | if (range_list.empty() || 86 | range_list.back().second + 1 != byte) { 87 | range_list.push_back(std::make_pair(byte, byte)); 88 | } else { 89 | range_list.back().second = byte; 90 | } 91 | } 92 | } 93 | for (const auto& i : transition_map) { 94 | int curr = i.first.first; 95 | int next = i.first.second; 96 | const auto& range_list = i.second; 97 | for (const auto& j : range_list) { 98 | int begin = j.first; 99 | int end = j.second; 100 | if (begin == end) { 101 | EmitTransition(curr, next, begin); 102 | } else { 103 | EmitTransition(curr, next, begin, end); 104 | } 105 | } 106 | } 107 | EmitFooter(); 108 | } 109 | 110 | static void HandleDFA(const char* str) { 111 | redgrep::Exp exp; 112 | redgrep::DFA dfa; 113 | if (!redgrep::Parse(str, &exp)) { 114 | errx(1, "parse error"); 115 | } 116 | int nstates = redgrep::Compile(exp, &dfa); 117 | std::set> transition_set; 118 | for (const auto& i : dfa.transition_) { 119 | int curr = i.first.first; 120 | int byte = i.first.second; 121 | int next = i.second; 122 | if (!dfa.IsError(next) || byte != -1) { 123 | transition_set.insert(std::make_tuple(curr, next, byte)); 124 | } 125 | } 126 | HandleImpl(str, nstates, dfa, transition_set); 127 | } 128 | 129 | static void HandleTNFA(const char* str) { 130 | redgrep::Exp exp; 131 | redgrep::TNFA tnfa; 132 | if (!redgrep::Parse(str, &exp, &tnfa.modes_, &tnfa.captures_)) { 133 | errx(1, "parse error"); 134 | } 135 | int nstates = redgrep::Compile(exp, &tnfa); 136 | std::set> transition_set; 137 | for (const auto& i : tnfa.transition_) { 138 | int curr = i.first.first; 139 | int byte = i.first.second; 140 | int next = i.second.first; 141 
| // TODO(junyer): Bindings? 142 | if (!tnfa.IsError(next) || byte != -1) { 143 | transition_set.insert(std::make_tuple(curr, next, byte)); 144 | } 145 | } 146 | HandleImpl(str, nstates, tnfa, transition_set); 147 | } 148 | 149 | int main(int argc, char** argv) { 150 | // Parse options. 151 | enum { 152 | kDFA, kTNFA, kTDFA, 153 | } mode = kDFA; 154 | for (;;) { 155 | int opt = getopt(argc, argv, "m:"); 156 | if (opt == -1) { 157 | break; 158 | } 159 | switch (opt) { 160 | case 'm': 161 | if (strcmp(optarg, "dfa") == 0) { 162 | mode = kDFA; 163 | } else if (strcmp(optarg, "tnfa") == 0) { 164 | mode = kTNFA; 165 | } else if (strcmp(optarg, "tdfa") == 0) { 166 | mode = kTDFA; 167 | } else { 168 | errx(1, "invalid mode"); 169 | } 170 | break; 171 | default: 172 | errx(1, "Usage: %s [OPTION]... REGEXP", argv[0]); 173 | } 174 | } 175 | 176 | if (optind == argc) { 177 | errx(1, "regular expression not specified"); 178 | } 179 | 180 | switch (mode) { 181 | case kDFA: 182 | HandleDFA(argv[optind++]); 183 | break; 184 | case kTNFA: 185 | HandleTNFA(argv[optind++]); 186 | break; 187 | case kTDFA: 188 | default: 189 | errx(1, "not implemented"); 190 | } 191 | 192 | return 0; 193 | } 194 | -------------------------------------------------------------------------------- /redgrep.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "redgrep.h" 16 | 17 | RED::RED(llvm::StringRef str) { 18 | redgrep::Exp exp; 19 | ok_ = redgrep::Parse(str, &exp); 20 | if (ok()) { 21 | redgrep::DFA dfa; 22 | redgrep::Compile(exp, &dfa); 23 | redgrep::Compile(dfa, &fun_); 24 | } 25 | } 26 | 27 | RED::~RED() {} 28 | 29 | bool RED::FullMatch(llvm::StringRef str, const RED& re) { 30 | return redgrep::Match(re.fun_, str); 31 | } 32 | -------------------------------------------------------------------------------- /redgrep.h: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef REDGREP_REDGREP_H_ 16 | #define REDGREP_REDGREP_H_ 17 | 18 | #include "llvm/ADT/StringRef.h" 19 | #include "regexp.h" 20 | 21 | // Represents a regular expression. 22 | // The interface is intended to resemble that of RE and RE2. 23 | class RED { 24 | public: 25 | explicit RED(llvm::StringRef str); 26 | ~RED(); 27 | 28 | // Returns true if the RED object is usable, false otherwise. 29 | // TODO(junyer): Plumb and expose errors from the parser. 30 | bool ok() const { return ok_; } 31 | 32 | // Returns the result of matching str using re. 
33 | static bool FullMatch(llvm::StringRef str, const RED& re); 34 | 35 | private: 36 | bool ok_; 37 | redgrep::Fun fun_; 38 | 39 | RED(const RED&) = delete; 40 | RED& operator=(const RED&) = delete; 41 | }; 42 | 43 | #endif // REDGREP_REDGREP_H_ 44 | -------------------------------------------------------------------------------- /redgrep_main.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | 24 | #include "llvm/ADT/StringRef.h" 25 | #include "redgrep.h" 26 | 27 | static constexpr char kUsage[] = 28 | "Usage: %s [OPTION]... REGEXP [FILE]...\n" 29 | "\n" 30 | "Options:\n" 31 | "\n" 32 | " -v select non-matching lines\n" 33 | " -n print line number with output lines\n" 34 | " -H print the file name for each match\n" 35 | " -h suppress the file name prefix on output\n" 36 | "\n" 37 | "Similar to the way in which find(1) lets you construct expressions,\n" 38 | "REGEXP may comprise multiple subexpressions as separate arguments:\n" 39 | "\n" 40 | " [-e] EXPR regular expression\n" 41 | " ( EXPR ) grouping\n" 42 | " ! 
EXPR complement\n" 43 | " -not EXPR\n" 44 | " EXPR & EXPR conjunction\n" 45 | " EXPR -a EXPR\n" 46 | " EXPR -and EXPR\n" 47 | " EXPR | EXPR disjunction\n" 48 | " EXPR -o EXPR\n" 49 | " EXPR -or EXPR\n" 50 | "\n" 51 | "EXPR may begin with `^' in order to anchor it to the beginning of the\n" 52 | "line and may end with `$' in order to anchor it to the end of the line.\n" 53 | "\n"; 54 | 55 | int main(int argc, char** argv) { 56 | // Parse options. 57 | bool opt_invert_match = false; 58 | bool opt_line_number = false; 59 | enum { 60 | kAlways, kMaybe, kNever, 61 | } opt_with_filename = kMaybe; 62 | bool escape = false; 63 | while (!escape) { 64 | int opt = getopt(argc, argv, "+vnHhe:"); 65 | if (opt == -1) { 66 | break; 67 | } 68 | switch (opt) { 69 | case 'v': 70 | opt_invert_match = true; 71 | break; 72 | case 'n': 73 | opt_line_number = true; 74 | break; 75 | case 'H': 76 | opt_with_filename = kAlways; 77 | break; 78 | case 'h': 79 | opt_with_filename = kNever; 80 | break; 81 | case 'e': 82 | argv[--optind] = optarg; 83 | escape = true; 84 | break; 85 | default: 86 | // TODO(junyer): Move most of the usage text to `--help'. 87 | fprintf(stderr, kUsage, program_invocation_short_name); 88 | return 2; 89 | } 90 | } 91 | 92 | // Shift off parsed options. 93 | argc -= optind; 94 | argv += optind; 95 | 96 | // Build regular expression string. 97 | // TODO(junyer): Factor out for testing. 98 | std::string re_str; 99 | int parens = 0; 100 | bool complete = false; 101 | while (argc > 0) { 102 | std::string arg(*argv); 103 | if (!escape && arg == "-e") { 104 | if (complete) { 105 | re_str += "|"; 106 | } 107 | escape = true; 108 | complete = false; 109 | } else if (!escape && arg == "(") { 110 | re_str += arg; 111 | ++parens; 112 | } else if (!escape && arg == ")") { 113 | re_str += arg; 114 | --parens; 115 | if (parens < 0) { 116 | errx(2, "unmatched right parenthesis"); 117 | } 118 | } else if (!escape && (arg == "!" 
|| arg == "-not")) { 119 | re_str += "!"; 120 | complete = false; 121 | } else if (!escape && (arg == "&" || arg == "-a" || arg == "-and")) { 122 | re_str += "&"; 123 | complete = false; 124 | } else if (!escape && (arg == "|" || arg == "-o" || arg == "-or")) { 125 | re_str += "|"; 126 | complete = false; 127 | } else if (escape || !complete) { 128 | if (!arg.empty()) { 129 | if (arg.front() == '^') { 130 | arg = arg.substr(1); 131 | } else { 132 | arg = ".*" + arg; 133 | } 134 | if (arg.back() == '$') { 135 | arg.back() = '\n'; 136 | } else { 137 | arg += ".*"; 138 | } 139 | re_str += arg; 140 | } 141 | escape = false; 142 | complete = true; 143 | } else { 144 | break; 145 | } 146 | --argc; 147 | ++argv; 148 | } 149 | 150 | if (re_str.empty()) { 151 | errx(2, "regular expression not specified"); 152 | } 153 | 154 | if (parens > 0) { 155 | errx(2, "unmatched left parenthesis"); 156 | } 157 | 158 | if (!complete) { 159 | errx(2, "incomplete arguments"); 160 | } 161 | 162 | if (opt_invert_match) { 163 | re_str = "!(" + re_str + ")"; 164 | } 165 | 166 | RED re(re_str); 167 | if (!re.ok()) { 168 | errx(2, "parse error"); 169 | } 170 | 171 | // Parse files. 172 | char const *const *files = argv; 173 | int nfiles = argc; 174 | if (nfiles == 0) { 175 | static char const *const kFiles[] = { "-", nullptr, }; 176 | files = kFiles; 177 | nfiles = 1; 178 | } 179 | 180 | // Grep! 181 | bool matched = false; 182 | char* data = nullptr; 183 | size_t size = 0; 184 | for (int i = 0; i < nfiles; ++i) { 185 | bool file_is_stdin = (files[i][0] == '-' && 186 | files[i][1] == '\0'); 187 | FILE* file = (file_is_stdin 188 | // GNU grep lets you specify "-" more than once. To emulate 189 | // this, we dup stdin here so that we don't close it later. 190 | ? 
fdopen(dup(fileno(stdin)), "r") 191 | : fopen(files[i], "r")); 192 | if (file == nullptr) { 193 | warn("%s", files[i]); 194 | continue; 195 | } 196 | for (int n = 1;; ++n) { 197 | ssize_t len = getline(&data, &size, file); 198 | if (len == -1) { 199 | break; 200 | } 201 | llvm::StringRef str(data, len); 202 | if (RED::FullMatch(str, re)) { 203 | matched = true; 204 | if (opt_with_filename == kAlways || 205 | (opt_with_filename == kMaybe && nfiles > 1)) { 206 | printf("%s:", (file_is_stdin 207 | ? "(standard input)" 208 | : files[i])); 209 | } 210 | if (opt_line_number) { 211 | printf("%d:", n); 212 | } 213 | printf("%.*s", static_cast(len), data); 214 | } 215 | } 216 | fclose(file); 217 | } 218 | free(data); 219 | 220 | // As per GNU grep, "The exit status is 0 if selected lines are found, and 1 221 | // if not found. If an error occurred the exit status is 2." 222 | return matched ? 0 : 1; 223 | } 224 | -------------------------------------------------------------------------------- /regexp.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "regexp.h" 16 | 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "llvm/ADT/StringRef.h" 30 | #include "llvm/ExecutionEngine/ExecutionEngine.h" 31 | #include "llvm/ExecutionEngine/JITEventListener.h" 32 | #include "llvm/ExecutionEngine/MCJIT.h" 33 | #include "llvm/ExecutionEngine/RuntimeDyld.h" 34 | #include "llvm/IR/BasicBlock.h" 35 | #include "llvm/IR/Constants.h" 36 | #include "llvm/IR/DerivedTypes.h" 37 | #include "llvm/IR/Function.h" 38 | #include "llvm/IR/GlobalValue.h" 39 | #include "llvm/IR/IRBuilder.h" 40 | #include "llvm/IR/InstrTypes.h" 41 | #include "llvm/IR/Instructions.h" 42 | #include "llvm/IR/LLVMContext.h" 43 | #include "llvm/IR/Module.h" 44 | #include "llvm/Object/Binary.h" 45 | #include "llvm/Object/ObjectFile.h" 46 | #include "llvm/Object/SymbolSize.h" 47 | #include "llvm/Object/SymbolicFile.h" 48 | #include "llvm/Passes/PassBuilder.h" 49 | #include "llvm/Support/Casting.h" 50 | #include "llvm/Support/ErrorOr.h" 51 | #include "llvm/Support/Host.h" 52 | #include "llvm/Support/TargetSelect.h" 53 | #include "llvm/Target/TargetMachine.h" 54 | #include "parser.tab.hh" 55 | #include "utf.h" 56 | 57 | namespace redgrep { 58 | 59 | #define CAST_TO_INTPTR_T(ptr) reinterpret_cast(ptr) 60 | 61 | Expression::Expression(Kind kind) 62 | : kind_(kind), 63 | data_(0), 64 | norm_(true) {} 65 | 66 | Expression::Expression(Kind kind, const std::tuple& group) 67 | : kind_(kind), 68 | data_(CAST_TO_INTPTR_T((new std::tuple(group)))), 69 | norm_(false) {} 70 | 71 | Expression::Expression(Kind kind, int byte) 72 | : kind_(kind), 73 | data_(byte), 74 | norm_(true) {} 75 | 76 | Expression::Expression(Kind kind, const std::pair& byte_range) 77 | : kind_(kind), 78 | data_(CAST_TO_INTPTR_T((new std::pair(byte_range)))), 79 | norm_(true) {} 80 | 81 | Expression::Expression(Kind kind, const std::list& subexpressions, bool norm) 82 | : 
kind_(kind), 83 | data_(CAST_TO_INTPTR_T((new std::list(subexpressions)))), 84 | norm_(norm) {} 85 | 86 | Expression::Expression(Kind kind, const std::pair, bool>& character_class) 87 | : kind_(kind), 88 | data_(CAST_TO_INTPTR_T((new std::pair, bool>(character_class)))), 89 | norm_(false) {} 90 | 91 | Expression::Expression(Kind kind, const std::tuple& quantifier) 92 | : kind_(kind), 93 | data_(CAST_TO_INTPTR_T((new std::tuple(quantifier)))), 94 | norm_(false) {} 95 | 96 | Expression::~Expression() { 97 | switch (kind()) { 98 | case kEmptySet: 99 | case kEmptyString: 100 | break; 101 | 102 | case kGroup: 103 | delete reinterpret_cast*>(data()); 104 | break; 105 | 106 | case kAnyByte: 107 | break; 108 | 109 | case kByte: 110 | break; 111 | 112 | case kByteRange: 113 | delete reinterpret_cast*>(data()); 114 | break; 115 | 116 | case kKleeneClosure: 117 | case kConcatenation: 118 | case kComplement: 119 | case kConjunction: 120 | case kDisjunction: 121 | delete reinterpret_cast*>(data()); 122 | break; 123 | 124 | case kCharacterClass: 125 | delete reinterpret_cast, bool>*>(data()); 126 | break; 127 | 128 | case kQuantifier: 129 | delete reinterpret_cast*>(data()); 130 | break; 131 | } 132 | } 133 | 134 | const std::tuple& Expression::group() const { 135 | return *reinterpret_cast*>(data()); 136 | } 137 | 138 | int Expression::byte() const { 139 | return data(); 140 | } 141 | 142 | const std::pair& Expression::byte_range() const { 143 | return *reinterpret_cast*>(data()); 144 | } 145 | 146 | const std::list& Expression::subexpressions() const { 147 | return *reinterpret_cast*>(data()); 148 | } 149 | 150 | const std::pair, bool>& Expression::character_class() const { 151 | return *reinterpret_cast, bool>*>(data()); 152 | } 153 | 154 | const std::tuple& Expression::quantifier() const { 155 | return *reinterpret_cast*>(data()); 156 | } 157 | 158 | int Expression::Compare(Exp x, Exp y) { 159 | if (x->kind() < y->kind()) { 160 | return -1; 161 | } 162 | if (x->kind() > 
y->kind()) { 163 | return +1; 164 | } 165 | switch (x->kind()) { 166 | case kEmptySet: 167 | case kEmptyString: 168 | return 0; 169 | 170 | case kGroup: 171 | if (x->group() < y->group()) { 172 | return -1; 173 | } 174 | if (x->group() > y->group()) { 175 | return +1; 176 | } 177 | return 0; 178 | 179 | case kAnyByte: 180 | return 0; 181 | 182 | case kByte: 183 | if (x->byte() < y->byte()) { 184 | return -1; 185 | } 186 | if (x->byte() > y->byte()) { 187 | return +1; 188 | } 189 | return 0; 190 | 191 | case kByteRange: 192 | if (x->byte_range() < y->byte_range()) { 193 | return -1; 194 | } 195 | if (x->byte_range() > y->byte_range()) { 196 | return +1; 197 | } 198 | return 0; 199 | 200 | case kKleeneClosure: 201 | case kConcatenation: 202 | case kComplement: 203 | case kConjunction: 204 | case kDisjunction: { 205 | // Perform a lexicographical compare. 206 | std::list::const_iterator xi = x->subexpressions().begin(); 207 | std::list::const_iterator yi = y->subexpressions().begin(); 208 | while (xi != x->subexpressions().end() && 209 | yi != y->subexpressions().end()) { 210 | int compare = Compare(*xi, *yi); 211 | if (compare != 0) { 212 | return compare; 213 | } 214 | ++xi; 215 | ++yi; 216 | } 217 | if (xi == x->subexpressions().end() && 218 | yi != y->subexpressions().end()) { 219 | return -1; 220 | } 221 | if (xi != x->subexpressions().end() && 222 | yi == y->subexpressions().end()) { 223 | return +1; 224 | } 225 | return 0; 226 | } 227 | 228 | case kCharacterClass: 229 | case kQuantifier: 230 | break; 231 | } 232 | abort(); 233 | } 234 | 235 | Exp EmptySet() { 236 | Exp exp(new Expression(kEmptySet)); 237 | return exp; 238 | } 239 | 240 | Exp EmptyString() { 241 | Exp exp(new Expression(kEmptyString)); 242 | return exp; 243 | } 244 | 245 | Exp Group(const std::tuple& group) { 246 | Exp exp(new Expression(kGroup, group)); 247 | return exp; 248 | } 249 | 250 | Exp AnyByte() { 251 | Exp exp(new Expression(kAnyByte)); 252 | return exp; 253 | } 254 | 255 | Exp 
Byte(int byte) { 256 | Exp exp(new Expression(kByte, byte)); 257 | return exp; 258 | } 259 | 260 | Exp ByteRange(const std::pair& byte_range) { 261 | Exp exp(new Expression(kByteRange, byte_range)); 262 | return exp; 263 | } 264 | 265 | Exp KleeneClosure(const std::list& subexpressions, bool norm) { 266 | Exp exp(new Expression(kKleeneClosure, subexpressions, norm)); 267 | return exp; 268 | } 269 | 270 | Exp Concatenation(const std::list& subexpressions, bool norm) { 271 | Exp exp(new Expression(kConcatenation, subexpressions, norm)); 272 | return exp; 273 | } 274 | 275 | Exp Complement(const std::list& subexpressions, bool norm) { 276 | Exp exp(new Expression(kComplement, subexpressions, norm)); 277 | return exp; 278 | } 279 | 280 | Exp Conjunction(const std::list& subexpressions, bool norm) { 281 | Exp exp(new Expression(kConjunction, subexpressions, norm)); 282 | return exp; 283 | } 284 | 285 | Exp Disjunction(const std::list& subexpressions, bool norm) { 286 | Exp exp(new Expression(kDisjunction, subexpressions, norm)); 287 | return exp; 288 | } 289 | 290 | Exp CharacterClass(const std::pair, bool>& character_class) { 291 | Exp exp(new Expression(kCharacterClass, character_class)); 292 | return exp; 293 | } 294 | 295 | Exp Quantifier(const std::tuple& quantifier) { 296 | Exp exp(new Expression(kQuantifier, quantifier)); 297 | return exp; 298 | } 299 | 300 | Exp AnyCharacter() { 301 | Exp b1 = ByteRange(0x00, 0x7F); // 0xxxxxxx 302 | Exp bx = ByteRange(0x80, 0xBF); // 10xxxxxx 303 | Exp b2 = ByteRange(0xC2, 0xDF); // 110xxxxx 304 | Exp b3 = ByteRange(0xE0, 0xEF); // 1110xxxx 305 | Exp b4 = ByteRange(0xF0, 0xF4); // 11110xxx 306 | return Disjunction(b1, 307 | Concatenation(b2, bx), 308 | Concatenation(b3, bx, bx), 309 | Concatenation(b4, bx, bx, bx)); 310 | } 311 | 312 | Exp Character(Rune character) { 313 | char buf[4]; 314 | int len = runetochar(buf, &character); 315 | switch (len) { 316 | case 1: 317 | return Byte(static_cast(buf[0])); 318 | case 2: 319 | 
return Concatenation(Byte(static_cast(buf[0])), 320 | Byte(static_cast(buf[1]))); 321 | case 3: 322 | return Concatenation(Byte(static_cast(buf[0])), 323 | Byte(static_cast(buf[1])), 324 | Byte(static_cast(buf[2]))); 325 | case 4: 326 | return Concatenation(Byte(static_cast(buf[0])), 327 | Byte(static_cast(buf[1])), 328 | Byte(static_cast(buf[2])), 329 | Byte(static_cast(buf[3]))); 330 | default: 331 | break; 332 | } 333 | abort(); 334 | } 335 | 336 | Exp Normalised(Exp exp) { 337 | if (exp->norm()) { 338 | return exp; 339 | } 340 | switch (exp->kind()) { 341 | case kEmptySet: 342 | case kEmptyString: 343 | return exp; 344 | 345 | case kGroup: { 346 | int num; Exp sub; Mode mode; bool capture; 347 | std::tie(num, sub, mode, capture) = exp->group(); 348 | sub = Normalised(sub); 349 | if (sub->kind() == kEmptySet) { 350 | return EmptySet(); 351 | } 352 | if (sub->kind() == kEmptyString) { 353 | return EmptyString(); 354 | } 355 | return Group(num, sub, mode, capture); 356 | } 357 | 358 | case kAnyByte: 359 | case kByte: 360 | case kByteRange: 361 | return exp; 362 | 363 | case kKleeneClosure: { 364 | Exp sub = Normalised(exp->sub()); 365 | // (r∗)∗ ≈ r∗ 366 | if (sub->kind() == kKleeneClosure) { 367 | return sub; 368 | } 369 | // ∅∗ ≈ ε 370 | if (sub->kind() == kEmptySet) { 371 | return EmptyString(); 372 | } 373 | // ε∗ ≈ ε 374 | if (sub->kind() == kEmptyString) { 375 | return EmptyString(); 376 | } 377 | // \C∗ ≈ ¬∅ 378 | if (sub->kind() == kAnyByte) { 379 | return Complement({EmptySet()}, true); 380 | } 381 | return KleeneClosure({sub}, true); 382 | } 383 | 384 | case kConcatenation: { 385 | Exp head = exp->head(); 386 | Exp tail = exp->tail(); 387 | // (r · s) · t ≈ r · (s · t) 388 | head = Normalised(head); 389 | while (head->kind() == kConcatenation) { 390 | tail = Concatenation(head->tail(), tail); 391 | head = head->head(); 392 | } 393 | tail = Normalised(tail); 394 | // ∅ · r ≈ ∅ 395 | if (head->kind() == kEmptySet) { 396 | return head; 397 | } 398 | // r · 
∅ ≈ ∅ 399 | if (tail->kind() == kEmptySet) { 400 | return tail; 401 | } 402 | // ε · r ≈ r 403 | if (head->kind() == kEmptyString) { 404 | return tail; 405 | } 406 | // r · ε ≈ r 407 | if (tail->kind() == kEmptyString) { 408 | return head; 409 | } 410 | return Concatenation({head, tail}, true); 411 | } 412 | 413 | case kComplement: { 414 | Exp sub = Normalised(exp->sub()); 415 | // ¬(¬r) ≈ r 416 | if (sub->kind() == kComplement) { 417 | return sub->sub(); 418 | } 419 | return Complement({sub}, true); 420 | } 421 | 422 | case kConjunction: { 423 | std::list subs; 424 | for (Exp sub : exp->subexpressions()) { 425 | sub = Normalised(sub); 426 | // ∅ & r ≈ ∅ 427 | // r & ∅ ≈ ∅ 428 | if (sub->kind() == kEmptySet) { 429 | return sub; 430 | } 431 | // (r & s) & t ≈ r & (s & t) 432 | if (sub->kind() == kConjunction) { 433 | std::list copy = sub->subexpressions(); 434 | subs.splice(subs.end(), copy); 435 | } else { 436 | subs.push_back(sub); 437 | } 438 | } 439 | // r & s ≈ s & r 440 | subs.sort(); 441 | // r & r ≈ r 442 | subs.unique(); 443 | // ¬∅ & r ≈ r 444 | // r & ¬∅ ≈ r 445 | subs.remove_if([&subs](Exp sub) -> bool { 446 | return (sub->kind() == kComplement && 447 | sub->sub()->kind() == kEmptySet && 448 | subs.size() > 1); 449 | }); 450 | if (subs.size() == 1) { 451 | return subs.front(); 452 | } 453 | return Conjunction(subs, true); 454 | } 455 | 456 | case kDisjunction: { 457 | std::list subs; 458 | for (Exp sub : exp->subexpressions()) { 459 | sub = Normalised(sub); 460 | // ¬∅ + r ≈ ¬∅ 461 | // r + ¬∅ ≈ ¬∅ 462 | if (sub->kind() == kComplement && 463 | sub->sub()->kind() == kEmptySet) { 464 | return sub; 465 | } 466 | // (r + s) + t ≈ r + (s + t) 467 | if (sub->kind() == kDisjunction) { 468 | std::list copy = sub->subexpressions(); 469 | subs.splice(subs.end(), copy); 470 | } else { 471 | subs.push_back(sub); 472 | } 473 | } 474 | // r + s ≈ s + r 475 | subs.sort(); 476 | // r + r ≈ r 477 | subs.unique(); 478 | // ∅ + r ≈ r 479 | // r + ∅ ≈ r 480 | 
subs.remove_if([&subs](Exp sub) -> bool { 481 | return (sub->kind() == kEmptySet && 482 | subs.size() > 1); 483 | }); 484 | if (subs.size() == 1) { 485 | return subs.front(); 486 | } 487 | return Disjunction(subs, true); 488 | } 489 | 490 | case kCharacterClass: 491 | case kQuantifier: 492 | break; 493 | } 494 | abort(); 495 | } 496 | 497 | bool IsNullable(Exp exp) { 498 | switch (exp->kind()) { 499 | case kEmptySet: 500 | // ν(∅) = ∅ 501 | return false; 502 | 503 | case kEmptyString: 504 | // ν(ε) = ε 505 | return true; 506 | 507 | case kGroup: 508 | return IsNullable(std::get<1>(exp->group())); 509 | 510 | case kAnyByte: 511 | // ν(\C) = ∅ 512 | return false; 513 | 514 | case kByte: 515 | // ν(a) = ∅ 516 | return false; 517 | 518 | case kByteRange: 519 | // ν(S) = ∅ 520 | return false; 521 | 522 | case kKleeneClosure: 523 | // ν(r∗) = ε 524 | return true; 525 | 526 | case kConcatenation: 527 | // ν(r · s) = ν(r) & ν(s) 528 | return IsNullable(exp->head()) && IsNullable(exp->tail()); 529 | 530 | case kComplement: 531 | // ν(¬r) = ∅ if ν(r) = ε 532 | // ε if ν(r) = ∅ 533 | return !IsNullable(exp->sub()); 534 | 535 | case kConjunction: 536 | // ν(r & s) = ν(r) & ν(s) 537 | for (Exp sub : exp->subexpressions()) { 538 | if (!IsNullable(sub)) { 539 | return false; 540 | } 541 | } 542 | return true; 543 | 544 | case kDisjunction: 545 | // ν(r + s) = ν(r) + ν(s) 546 | for (Exp sub : exp->subexpressions()) { 547 | if (IsNullable(sub)) { 548 | return true; 549 | } 550 | } 551 | return false; 552 | 553 | case kCharacterClass: 554 | case kQuantifier: 555 | break; 556 | } 557 | abort(); 558 | } 559 | 560 | Exp Derivative(Exp exp, int byte) { 561 | switch (exp->kind()) { 562 | case kEmptySet: 563 | // ∂a∅ = ∅ 564 | return EmptySet(); 565 | 566 | case kEmptyString: 567 | // ∂aε = ∅ 568 | return EmptySet(); 569 | 570 | case kGroup: 571 | // This should never happen. 
572 | break; 573 | 574 | case kAnyByte: 575 | // ∂a\C = ε 576 | return EmptyString(); 577 | 578 | case kByte: 579 | // ∂aa = ε 580 | // ∂ab = ∅ for b ≠ a 581 | if (exp->byte() == byte) { 582 | return EmptyString(); 583 | } else { 584 | return EmptySet(); 585 | } 586 | 587 | case kByteRange: 588 | // ∂aS = ε if a ∈ S 589 | // ∅ if a ∉ S 590 | if (exp->byte_range().first <= byte && 591 | byte <= exp->byte_range().second) { 592 | return EmptyString(); 593 | } else { 594 | return EmptySet(); 595 | } 596 | 597 | case kKleeneClosure: 598 | // ∂a(r∗) = ∂ar · r∗ 599 | return Concatenation(Derivative(exp->sub(), byte), 600 | exp); 601 | 602 | case kConcatenation: 603 | // ∂a(r · s) = ∂ar · s + ν(r) · ∂as 604 | if (IsNullable(exp->head())) { 605 | return Disjunction(Concatenation(Derivative(exp->head(), byte), 606 | exp->tail()), 607 | Derivative(exp->tail(), byte)); 608 | } else { 609 | return Concatenation(Derivative(exp->head(), byte), 610 | exp->tail()); 611 | } 612 | 613 | case kComplement: 614 | // ∂a(¬r) = ¬(∂ar) 615 | return Complement(Derivative(exp->sub(), byte)); 616 | 617 | case kConjunction: { 618 | // ∂a(r & s) = ∂ar & ∂as 619 | std::list subs; 620 | for (Exp sub : exp->subexpressions()) { 621 | sub = Derivative(sub, byte); 622 | subs.push_back(sub); 623 | } 624 | return Conjunction(subs, false); 625 | } 626 | 627 | case kDisjunction: { 628 | // ∂a(r + s) = ∂ar + ∂as 629 | std::list subs; 630 | for (Exp sub : exp->subexpressions()) { 631 | sub = Derivative(sub, byte); 632 | subs.push_back(sub); 633 | } 634 | return Disjunction(subs, false); 635 | } 636 | 637 | case kCharacterClass: 638 | case kQuantifier: 639 | break; 640 | } 641 | abort(); 642 | } 643 | 644 | Outer Denormalised(Exp exp) { 645 | Outer outer(new OuterSet); 646 | exp = Normalised(exp); 647 | if (exp->kind() != kDisjunction) { 648 | exp = Disjunction({exp}, false); 649 | } 650 | for (Exp sub : exp->subexpressions()) { 651 | if (sub->kind() != kConjunction) { 652 | sub = Conjunction({sub}, false); 
653 | } 654 | outer->push_back(std::make_pair(sub, Bindings({}))); 655 | } 656 | return outer; 657 | } 658 | 659 | Outer PartialConcatenation(Outer x, Exp y, const Bindings& initial) { 660 | // We mutate x as an optimisation. 661 | for (auto& xi : *x) { 662 | std::list subs; 663 | for (Exp sub : xi.first->subexpressions()) { 664 | sub = Concatenation(sub, y); 665 | subs.push_back(sub); 666 | } 667 | xi.first = Conjunction(subs, false); 668 | xi.second.insert(xi.second.begin(), initial.begin(), initial.end()); 669 | } 670 | return x; 671 | } 672 | 673 | Outer PartialComplement(Outer x) { 674 | Outer outer(nullptr); 675 | for (const auto& xi : *x) { 676 | Outer tmp(new OuterSet); 677 | for (Exp sub : xi.first->subexpressions()) { 678 | sub = Complement(sub); 679 | sub = Conjunction({sub}, false); 680 | tmp->push_back(std::make_pair(sub, Bindings({}))); 681 | } 682 | if (outer == nullptr) { 683 | outer = std::move(tmp); 684 | } else { 685 | outer = PartialConjunction(std::move(outer), std::move(tmp)); 686 | } 687 | } 688 | return outer; 689 | } 690 | 691 | Outer PartialConjunction(Outer x, Outer y) { 692 | Outer outer(new OuterSet); 693 | for (const auto& xi : *x) { 694 | for (const auto& yi : *y) { 695 | Exp sub = Conjunction(xi.first, yi.first); 696 | Bindings bindings; 697 | bindings.insert(bindings.end(), xi.second.begin(), xi.second.end()); 698 | bindings.insert(bindings.end(), yi.second.begin(), yi.second.end()); 699 | outer->push_back(std::make_pair(sub, bindings)); 700 | } 701 | } 702 | return outer; 703 | } 704 | 705 | Outer PartialDisjunction(Outer x, Outer y) { 706 | // We mutate x as an optimisation. 707 | x->insert(x->end(), y->begin(), y->end()); 708 | return x; 709 | } 710 | 711 | // Computes the cancel Bindings for exp. 
712 | static void CancelBindings(Exp exp, Bindings* bindings) { 713 | switch (exp->kind()) { 714 | case kEmptySet: 715 | case kEmptyString: 716 | return; 717 | 718 | case kGroup: { 719 | int num; Exp sub; 720 | std::tie(num, sub, std::ignore, std::ignore) = exp->group(); 721 | bindings->push_back(std::make_pair(num, kCancel)); 722 | CancelBindings(sub, bindings); 723 | return; 724 | } 725 | 726 | case kAnyByte: 727 | case kByte: 728 | case kByteRange: 729 | return; 730 | 731 | case kKleeneClosure: 732 | CancelBindings(exp->sub(), bindings); 733 | return; 734 | 735 | case kConcatenation: 736 | CancelBindings(exp->head(), bindings); 737 | CancelBindings(exp->tail(), bindings); 738 | return; 739 | 740 | case kComplement: 741 | return; 742 | 743 | case kConjunction: 744 | case kDisjunction: 745 | for (Exp sub : exp->subexpressions()) { 746 | CancelBindings(sub, bindings); 747 | } 748 | return; 749 | 750 | case kCharacterClass: 751 | case kQuantifier: 752 | break; 753 | } 754 | abort(); 755 | } 756 | 757 | // Computes the epsilon Bindings for exp. 
758 | static void EpsilonBindings(Exp exp, Bindings* bindings) { 759 | switch (exp->kind()) { 760 | case kEmptySet: 761 | case kEmptyString: 762 | return; 763 | 764 | case kGroup: { 765 | int num; Exp sub; 766 | std::tie(num, sub, std::ignore, std::ignore) = exp->group(); 767 | bindings->push_back(std::make_pair(num, kEpsilon)); 768 | EpsilonBindings(sub, bindings); 769 | return; 770 | } 771 | 772 | case kAnyByte: 773 | case kByte: 774 | case kByteRange: 775 | return; 776 | 777 | case kKleeneClosure: 778 | if (IsNullable(exp->sub())) { 779 | EpsilonBindings(exp->sub(), bindings); 780 | } 781 | return; 782 | 783 | case kConcatenation: 784 | EpsilonBindings(exp->head(), bindings); 785 | EpsilonBindings(exp->tail(), bindings); 786 | return; 787 | 788 | case kComplement: 789 | return; 790 | 791 | case kConjunction: 792 | for (Exp sub : exp->subexpressions()) { 793 | EpsilonBindings(sub, bindings); 794 | } 795 | return; 796 | 797 | case kDisjunction: 798 | for (Exp sub : exp->subexpressions()) { 799 | if (IsNullable(sub)) { 800 | EpsilonBindings(sub, bindings); 801 | return; 802 | } 803 | } 804 | return; 805 | 806 | case kCharacterClass: 807 | case kQuantifier: 808 | break; 809 | } 810 | abort(); 811 | } 812 | 813 | Outer Partial(Exp exp, int byte) { 814 | switch (exp->kind()) { 815 | case kEmptySet: 816 | // ∂a∅ = ∅ 817 | return Denormalised(EmptySet()); 818 | 819 | case kEmptyString: 820 | // ∂aε = ∅ 821 | return Denormalised(EmptySet()); 822 | 823 | case kGroup: { 824 | int num; Exp sub; Mode mode; bool capture; 825 | std::tie(num, sub, mode, capture) = exp->group(); 826 | Outer outer = Partial(sub, byte); 827 | for (auto& i : *outer) { 828 | i.first = Group(num, i.first, mode, capture); 829 | i.first = Conjunction({i.first}, false); 830 | i.second.push_back(std::make_pair(num, kAppend)); 831 | } 832 | return outer; 833 | } 834 | 835 | case kAnyByte: 836 | // ∂a\C = ε 837 | return Denormalised(EmptyString()); 838 | 839 | case kByte: 840 | // ∂aa = ε 841 | // ∂ab = ∅ 
for b ≠ a 842 | if (exp->byte() == byte) { 843 | return Denormalised(EmptyString()); 844 | } else { 845 | return Denormalised(EmptySet()); 846 | } 847 | 848 | case kByteRange: 849 | // ∂aS = ε if a ∈ S 850 | // ∅ if a ∉ S 851 | if (exp->byte_range().first <= byte && 852 | byte <= exp->byte_range().second) { 853 | return Denormalised(EmptyString()); 854 | } else { 855 | return Denormalised(EmptySet()); 856 | } 857 | 858 | case kKleeneClosure: { 859 | // ∂a(r∗) = ∂ar · r∗ 860 | Bindings cancel; 861 | CancelBindings(exp->sub(), &cancel); 862 | return PartialConcatenation(Partial(exp->sub(), byte), 863 | exp, 864 | cancel); 865 | } 866 | 867 | case kConcatenation: 868 | // ∂a(r · s) = ∂ar · s + ν(r) · ∂as 869 | if (IsNullable(exp->head())) { 870 | Bindings epsilon; 871 | EpsilonBindings(exp->head(), &epsilon); 872 | return PartialDisjunction( 873 | PartialConcatenation(Partial(exp->head(), byte), 874 | exp->tail(), 875 | Bindings({})), 876 | PartialConcatenation(Partial(exp->tail(), byte), 877 | EmptyString(), 878 | epsilon)); 879 | } else { 880 | return PartialConcatenation(Partial(exp->head(), byte), 881 | exp->tail(), 882 | Bindings({})); 883 | } 884 | 885 | case kComplement: 886 | // ∂a(¬r) = ¬(∂ar) 887 | return PartialComplement(Partial(exp->sub(), byte)); 888 | 889 | case kConjunction: { 890 | // ∂a(r & s) = ∂ar & ∂as 891 | Outer outer(nullptr); 892 | for (Exp sub : exp->subexpressions()) { 893 | Outer tmp = Partial(sub, byte); 894 | if (outer == nullptr) { 895 | outer = std::move(tmp); 896 | } else { 897 | outer = PartialConjunction(std::move(outer), std::move(tmp)); 898 | } 899 | } 900 | return outer; 901 | } 902 | 903 | case kDisjunction: { 904 | // ∂a(r + s) = ∂ar + ∂as 905 | Outer outer(nullptr); 906 | for (Exp sub : exp->subexpressions()) { 907 | Outer tmp = Partial(sub, byte); 908 | if (outer == nullptr) { 909 | outer = std::move(tmp); 910 | } else { 911 | outer = PartialDisjunction(std::move(outer), std::move(tmp)); 912 | } 913 | } 914 | return outer; 915 
| } 916 | 917 | case kCharacterClass: 918 | case kQuantifier: 919 | break; 920 | } 921 | abort(); 922 | } 923 | 924 | // Outputs the partitions obtained by intersecting the partitions in x and y. 925 | // The first partition should be Σ-based. Any others should be ∅-based. 926 | static void Intersection(const std::list>& x, 927 | const std::list>& y, 928 | std::list>* z) { 929 | for (std::list>::const_iterator xi = x.begin(); 930 | xi != x.end(); 931 | ++xi) { 932 | for (std::list>::const_iterator yi = y.begin(); 933 | yi != y.end(); 934 | ++yi) { 935 | std::bitset<256> bs; 936 | if (xi == x.begin()) { 937 | if (yi == y.begin()) { 938 | // Perform set union: *xi is Σ-based, *yi is Σ-based. 939 | bs = *xi | *yi; 940 | // bs is Σ-based, so it can be empty. 941 | z->push_back(bs); 942 | } else { 943 | // Perform set difference: *xi is Σ-based, *yi is ∅-based. 944 | bs = *yi & ~*xi; 945 | if (bs.any()) { 946 | z->push_back(bs); 947 | } 948 | } 949 | } else { 950 | if (yi == y.begin()) { 951 | // Perform set difference: *xi is ∅-based, *yi is Σ-based. 952 | bs = *xi & ~*yi; 953 | if (bs.any()) { 954 | z->push_back(bs); 955 | } 956 | } else { 957 | // Perform set intersection: *xi is ∅-based, *yi is ∅-based. 
958 | bs = *yi & *xi; 959 | if (bs.any()) { 960 | z->push_back(bs); 961 | } 962 | } 963 | } 964 | } 965 | } 966 | } 967 | 968 | void Partitions(Exp exp, std::list>* partitions) { 969 | switch (exp->kind()) { 970 | case kEmptySet: 971 | // C(∅) = {Σ} 972 | partitions->push_back({}); 973 | return; 974 | 975 | case kEmptyString: 976 | // C(ε) = {Σ} 977 | partitions->push_back({}); 978 | return; 979 | 980 | case kGroup: 981 | Partitions(std::get<1>(exp->group()), partitions); 982 | return; 983 | 984 | case kAnyByte: 985 | // C(\C) = {Σ} 986 | partitions->push_back({}); 987 | return; 988 | 989 | case kByte: { 990 | // C(a) = {Σ \ a, a} 991 | std::bitset<256> bs; 992 | bs.set(exp->byte()); 993 | partitions->push_back(bs); 994 | partitions->push_back(bs); 995 | return; 996 | } 997 | 998 | case kByteRange: { 999 | // C(S) = {Σ \ S, S} 1000 | std::bitset<256> bs; 1001 | for (int i = exp->byte_range().first; 1002 | i <= exp->byte_range().second; 1003 | ++i) { 1004 | bs.set(i); 1005 | } 1006 | partitions->push_back(bs); 1007 | partitions->push_back(bs); 1008 | return; 1009 | } 1010 | 1011 | case kKleeneClosure: 1012 | // C(r∗) = C(r) 1013 | Partitions(exp->sub(), partitions); 1014 | return; 1015 | 1016 | case kConcatenation: 1017 | // C(r · s) = C(r) ∧ C(s) if ν(r) = ε 1018 | // C(r) if ν(r) = ∅ 1019 | if (IsNullable(exp->head())) { 1020 | std::list> x, y; 1021 | Partitions(exp->head(), &x); 1022 | Partitions(exp->tail(), &y); 1023 | Intersection(x, y, partitions); 1024 | return; 1025 | } else { 1026 | Partitions(exp->head(), partitions); 1027 | return; 1028 | } 1029 | 1030 | case kComplement: 1031 | // C(¬r) = C(r) 1032 | Partitions(exp->sub(), partitions); 1033 | return; 1034 | 1035 | case kConjunction: 1036 | // C(r & s) = C(r) ∧ C(s) 1037 | for (Exp sub : exp->subexpressions()) { 1038 | if (partitions->empty()) { 1039 | Partitions(sub, partitions); 1040 | } else { 1041 | std::list> x, y; 1042 | partitions->swap(x); 1043 | Partitions(sub, &y); 1044 | Intersection(x, y, 
partitions); 1045 | } 1046 | } 1047 | return; 1048 | 1049 | case kDisjunction: 1050 | // C(r + s) = C(r) ∧ C(s) 1051 | for (Exp sub : exp->subexpressions()) { 1052 | if (partitions->empty()) { 1053 | Partitions(sub, partitions); 1054 | } else { 1055 | std::list> x, y; 1056 | partitions->swap(x); 1057 | Partitions(sub, &y); 1058 | Intersection(x, y, partitions); 1059 | } 1060 | } 1061 | return; 1062 | 1063 | case kCharacterClass: 1064 | case kQuantifier: 1065 | break; 1066 | } 1067 | abort(); 1068 | } 1069 | 1070 | // A simple framework for implementing the post-parse rewrites. 1071 | class Walker { 1072 | public: 1073 | Walker() {} 1074 | virtual ~Walker() {} 1075 | 1076 | virtual Exp WalkGroup(Exp exp) { 1077 | int num; Exp sub; Mode mode; bool capture; 1078 | std::tie(num, sub, mode, capture) = exp->group(); 1079 | sub = Walk(sub); 1080 | return Group(num, sub, mode, capture); 1081 | } 1082 | 1083 | virtual Exp WalkKleeneClosure(Exp exp) { 1084 | Exp sub = Walk(exp->sub()); 1085 | return KleeneClosure(sub); 1086 | } 1087 | 1088 | virtual Exp WalkConcatenation(Exp exp) { 1089 | Exp head = Walk(exp->head()); 1090 | Exp tail = Walk(exp->tail()); 1091 | return Concatenation(head, tail); 1092 | } 1093 | 1094 | virtual Exp WalkComplement(Exp exp) { 1095 | Exp sub = Walk(exp->sub()); 1096 | return Complement(sub); 1097 | } 1098 | 1099 | virtual Exp WalkConjunction(Exp exp) { 1100 | std::list subs; 1101 | for (Exp sub : exp->subexpressions()) { 1102 | sub = Walk(sub); 1103 | subs.push_back(sub); 1104 | } 1105 | return Conjunction(subs, false); 1106 | } 1107 | 1108 | virtual Exp WalkDisjunction(Exp exp) { 1109 | std::list subs; 1110 | for (Exp sub : exp->subexpressions()) { 1111 | sub = Walk(sub); 1112 | subs.push_back(sub); 1113 | } 1114 | return Disjunction(subs, false); 1115 | } 1116 | 1117 | virtual Exp WalkCharacterClass(Exp exp) { 1118 | return exp; 1119 | } 1120 | 1121 | virtual Exp WalkQuantifier(Exp exp) { 1122 | Exp sub; int min; int max; 1123 | std::tie(sub, 
min, max) = exp->quantifier(); 1124 | sub = Walk(sub); 1125 | return Quantifier(sub, min, max); 1126 | } 1127 | 1128 | Exp Walk(Exp exp) { 1129 | switch (exp->kind()) { 1130 | case kEmptySet: 1131 | case kEmptyString: 1132 | return exp; 1133 | 1134 | case kGroup: 1135 | return WalkGroup(exp); 1136 | 1137 | case kAnyByte: 1138 | case kByte: 1139 | case kByteRange: 1140 | return exp; 1141 | 1142 | case kKleeneClosure: 1143 | return WalkKleeneClosure(exp); 1144 | 1145 | case kConcatenation: 1146 | return WalkConcatenation(exp); 1147 | 1148 | case kComplement: 1149 | return WalkComplement(exp); 1150 | 1151 | case kConjunction: 1152 | return WalkConjunction(exp); 1153 | 1154 | case kDisjunction: 1155 | return WalkDisjunction(exp); 1156 | 1157 | case kCharacterClass: 1158 | return WalkCharacterClass(exp); 1159 | 1160 | case kQuantifier: 1161 | return WalkQuantifier(exp); 1162 | } 1163 | abort(); 1164 | } 1165 | 1166 | private: 1167 | Walker(const Walker&) = delete; 1168 | Walker& operator=(const Walker&) = delete; 1169 | }; 1170 | 1171 | class FlattenConjunctionsAndDisjunctions : public Walker { 1172 | public: 1173 | FlattenConjunctionsAndDisjunctions() {} 1174 | ~FlattenConjunctionsAndDisjunctions() override {} 1175 | 1176 | inline void FlattenImpl(Exp exp, std::list* subs) { 1177 | Kind kind = exp->kind(); 1178 | // In most cases, exp is a left-skewed binary tree. 
1179 | while (exp->kind() == kind && 1180 | exp->subexpressions().size() == 2) { 1181 | subs->push_front(exp->tail()); 1182 | exp = exp->head(); 1183 | } 1184 | if (exp->kind() == kind) { 1185 | std::list copy = exp->subexpressions(); 1186 | subs->splice(subs->begin(), copy); 1187 | } else { 1188 | subs->push_front(exp); 1189 | } 1190 | std::list::iterator i = subs->begin(); 1191 | while (i != subs->end()) { 1192 | Exp sub = *i; 1193 | sub = Walk(sub); 1194 | if (sub->kind() == kind) { 1195 | std::list copy = sub->subexpressions(); 1196 | subs->splice(i, copy); 1197 | i = subs->erase(i); 1198 | } else { 1199 | *i = sub; 1200 | ++i; 1201 | } 1202 | } 1203 | } 1204 | 1205 | Exp WalkConjunction(Exp exp) override { 1206 | std::list subs; 1207 | FlattenImpl(exp, &subs); 1208 | return Conjunction(subs, false); 1209 | } 1210 | 1211 | Exp WalkDisjunction(Exp exp) override { 1212 | std::list subs; 1213 | FlattenImpl(exp, &subs); 1214 | return Disjunction(subs, false); 1215 | } 1216 | 1217 | private: 1218 | FlattenConjunctionsAndDisjunctions(const FlattenConjunctionsAndDisjunctions&) = delete; 1219 | FlattenConjunctionsAndDisjunctions& operator=(const FlattenConjunctionsAndDisjunctions&) = delete; 1220 | }; 1221 | 1222 | class StripGroups : public Walker { 1223 | public: 1224 | StripGroups() {} 1225 | ~StripGroups() override {} 1226 | 1227 | Exp WalkGroup(Exp exp) override { 1228 | Exp sub = Walk(std::get<1>(exp->group())); 1229 | return sub; 1230 | } 1231 | 1232 | private: 1233 | StripGroups(const StripGroups&) = delete; 1234 | StripGroups& operator=(const StripGroups&) = delete; 1235 | }; 1236 | 1237 | class ApplyGroups : public Walker { 1238 | public: 1239 | ApplyGroups() {} 1240 | ~ApplyGroups() override {} 1241 | 1242 | Exp WalkComplement(Exp exp) override { 1243 | Exp sub = Walk(exp->sub()); 1244 | sub = Complement(sub); 1245 | return Group(-1, sub, kMaximal, false); 1246 | } 1247 | 1248 | Exp WalkDisjunction(Exp exp) override { 1249 | // Applying Groups to 
AnyCharacter would break the .∗ ≈ ¬∅ rewrite. 1250 | if (exp == AnyCharacter()) { 1251 | return exp; 1252 | } 1253 | // Applying Groups to the subexpressions will identify the leftmost. 1254 | std::list subs; 1255 | for (Exp sub : exp->subexpressions()) { 1256 | sub = Walk(sub); 1257 | sub = Group(-1, sub, kPassive, false); 1258 | subs.push_back(sub); 1259 | } 1260 | return Disjunction(subs, false); 1261 | } 1262 | 1263 | private: 1264 | ApplyGroups(const ApplyGroups&) = delete; 1265 | ApplyGroups& operator=(const ApplyGroups&) = delete; 1266 | }; 1267 | 1268 | class NumberGroups : public Walker { 1269 | public: 1270 | NumberGroups(std::vector* modes, std::vector* captures) 1271 | : num_(0), modes_(modes), captures_(captures) {} 1272 | ~NumberGroups() override {} 1273 | 1274 | Exp WalkGroup(Exp exp) override { 1275 | Exp sub; Mode mode; bool capture; 1276 | std::tie(std::ignore, sub, mode, capture) = exp->group(); 1277 | int num = num_++; 1278 | modes_->push_back(mode); 1279 | if (capture) { 1280 | captures_->push_back(num); 1281 | } 1282 | sub = Walk(sub); 1283 | return Group(num, sub, mode, capture); 1284 | } 1285 | 1286 | private: 1287 | int num_; 1288 | std::vector* modes_; 1289 | std::vector* captures_; 1290 | 1291 | NumberGroups(const NumberGroups&) = delete; 1292 | NumberGroups& operator=(const NumberGroups&) = delete; 1293 | }; 1294 | 1295 | class ExpandCharacterClasses : public Walker { 1296 | public: 1297 | ExpandCharacterClasses() {} 1298 | ~ExpandCharacterClasses() override {} 1299 | 1300 | Exp WalkCharacterClass(Exp exp) override { 1301 | std::list subs; 1302 | for (Rune character : exp->character_class().first) { 1303 | subs.push_back(Character(character)); 1304 | } 1305 | Exp tmp = Disjunction(subs, false); 1306 | if (exp->character_class().second) { 1307 | tmp = Conjunction(Complement(tmp), AnyCharacter()); 1308 | } 1309 | return tmp; 1310 | } 1311 | 1312 | private: 1313 | ExpandCharacterClasses(const ExpandCharacterClasses&) = delete; 1314 | 
ExpandCharacterClasses& operator=(const ExpandCharacterClasses&) = delete; 1315 | }; 1316 | 1317 | class ExpandQuantifiers : public Walker { 1318 | public: 1319 | ExpandQuantifiers(bool* exceeded) 1320 | : exceeded_(exceeded), stack_({1000}) {} 1321 | ~ExpandQuantifiers() override {} 1322 | 1323 | Exp WalkQuantifier(Exp exp) override { 1324 | Exp sub; int min; int max; 1325 | std::tie(sub, min, max) = exp->quantifier(); 1326 | // Validate the repetition. 1327 | int limit = stack_.back(); 1328 | int rep = max; 1329 | if (rep == -1) { 1330 | rep = min; 1331 | } 1332 | if (rep > 0) { 1333 | limit /= rep; 1334 | } 1335 | if (limit == 0) { 1336 | *exceeded_ = true; 1337 | return exp; 1338 | } 1339 | stack_.push_back(limit); 1340 | sub = Walk(sub); 1341 | stack_.pop_back(); 1342 | if (*exceeded_) { 1343 | return exp; 1344 | } 1345 | // Perform the repetition. 1346 | Exp tmp; 1347 | if (max == -1) { 1348 | tmp = KleeneClosure(sub); 1349 | } 1350 | while (max > min) { 1351 | tmp = tmp == nullptr ? sub : Concatenation(sub, tmp); 1352 | tmp = Disjunction(EmptyString(), tmp); 1353 | --max; 1354 | } 1355 | while (min > 0) { 1356 | tmp = tmp == nullptr ? sub : Concatenation(sub, tmp); 1357 | --min; 1358 | } 1359 | tmp = tmp == nullptr ? 
EmptyString() : tmp; 1360 | return tmp; 1361 | } 1362 | 1363 | private: 1364 | bool* exceeded_; 1365 | std::vector stack_; 1366 | 1367 | ExpandQuantifiers(const ExpandQuantifiers&) = delete; 1368 | ExpandQuantifiers& operator=(const ExpandQuantifiers&) = delete; 1369 | }; 1370 | 1371 | bool Parse(llvm::StringRef str, Exp* exp) { 1372 | yy::parser parser(&str, exp); 1373 | if (parser.parse() != 0) { 1374 | return false; 1375 | } 1376 | *exp = FlattenConjunctionsAndDisjunctions().Walk(*exp); 1377 | *exp = StripGroups().Walk(*exp); 1378 | *exp = ExpandCharacterClasses().Walk(*exp); 1379 | bool exceeded = false; 1380 | *exp = ExpandQuantifiers(&exceeded).Walk(*exp); 1381 | return !exceeded; 1382 | } 1383 | 1384 | bool Parse(llvm::StringRef str, Exp* exp, 1385 | std::vector* modes, std::vector* captures) { 1386 | yy::parser parser(&str, exp); 1387 | if (parser.parse() != 0) { 1388 | return false; 1389 | } 1390 | *exp = FlattenConjunctionsAndDisjunctions().Walk(*exp); 1391 | *exp = ApplyGroups().Walk(*exp); 1392 | *exp = NumberGroups(modes, captures).Walk(*exp); 1393 | *exp = ExpandCharacterClasses().Walk(*exp); 1394 | bool exceeded = false; 1395 | *exp = ExpandQuantifiers(&exceeded).Walk(*exp); 1396 | return !exceeded; 1397 | } 1398 | 1399 | bool Match(Exp exp, llvm::StringRef str) { 1400 | while (!str.empty()) { 1401 | int byte = static_cast(str[0]); 1402 | str = str.drop_front(1); 1403 | Exp der = Derivative(exp, byte); 1404 | der = Normalised(der); 1405 | exp = der; 1406 | } 1407 | bool match = IsNullable(exp); 1408 | return match; 1409 | } 1410 | 1411 | // Outputs the FA compiled from exp. 1412 | // If tagged is true, uses Antimirov partial derivatives to construct a TNFA. 1413 | // Otherwise, uses Brzozowski derivatives to construct a DFA. 
1414 | inline size_t CompileImpl(Exp exp, bool tagged, FA* fa) { 1415 | std::map states; 1416 | std::list queue; 1417 | auto LookupOrInsert = [&states, &queue](Exp exp) -> int { 1418 | auto state = states.insert(std::make_pair(exp, states.size())); 1419 | if (state.first->second > 0 && 1420 | state.second) { 1421 | queue.push_back(exp); 1422 | } 1423 | return state.first->second; 1424 | }; 1425 | queue.push_back(exp); 1426 | while (!queue.empty()) { 1427 | exp = queue.front(); 1428 | queue.pop_front(); 1429 | exp = Normalised(exp); 1430 | int curr = LookupOrInsert(exp); 1431 | if (exp->kind() == kEmptySet) { 1432 | fa->error_ = curr; 1433 | } 1434 | if (exp->kind() == kEmptyString) { 1435 | fa->empty_ = curr; 1436 | } 1437 | if (IsNullable(exp)) { 1438 | fa->accepting_[curr] = true; 1439 | if (tagged) { 1440 | TNFA* tnfa = reinterpret_cast(fa); 1441 | EpsilonBindings(exp, &tnfa->final_[curr]); 1442 | } 1443 | } else { 1444 | fa->accepting_[curr] = false; 1445 | } 1446 | std::list>* partitions = &fa->partitions_[curr]; 1447 | Partitions(exp, partitions); 1448 | for (std::list>::const_iterator i = partitions->begin(); 1449 | i != partitions->end(); 1450 | ++i) { 1451 | int byte; 1452 | if (i == partitions->begin()) { 1453 | // *i is Σ-based. Use a byte that it doesn't contain. 1454 | byte = -1; 1455 | } else { 1456 | // *i is ∅-based. Use the first byte that it contains. 1457 | for (byte = 0; !i->test(byte); ++byte) {} 1458 | } 1459 | if (tagged) { 1460 | TNFA* tnfa = reinterpret_cast(fa); 1461 | Outer outer = Partial(exp, byte); 1462 | std::set> seen; 1463 | for (const auto& j : *outer) { 1464 | Exp par = Normalised(j.first); 1465 | int next = LookupOrInsert(par); 1466 | if (seen.count(std::make_pair(next, j.second)) == 0) { 1467 | seen.insert(std::make_pair(next, j.second)); 1468 | if (i == partitions->begin()) { 1469 | // Set the "default" transition. 
1470 | tnfa->transition_.insert(std::make_pair( 1471 | std::make_pair(curr, byte), std::make_pair(next, j.second))); 1472 | } else { 1473 | for (int byte = 0; byte < 256; ++byte) { 1474 | if (i->test(byte)) { 1475 | tnfa->transition_.insert(std::make_pair( 1476 | std::make_pair(curr, byte), std::make_pair(next, j.second))); 1477 | } 1478 | } 1479 | } 1480 | } 1481 | } 1482 | } else { 1483 | DFA* dfa = reinterpret_cast(fa); 1484 | Exp der = Derivative(exp, byte); 1485 | der = Normalised(der); 1486 | int next = LookupOrInsert(der); 1487 | if (i == partitions->begin()) { 1488 | // Set the "default" transition. 1489 | dfa->transition_[std::make_pair(curr, byte)] = next; 1490 | } else { 1491 | for (int byte = 0; byte < 256; ++byte) { 1492 | if (i->test(byte)) { 1493 | dfa->transition_[std::make_pair(curr, byte)] = next; 1494 | } 1495 | } 1496 | } 1497 | } 1498 | } 1499 | } 1500 | return states.size(); 1501 | } 1502 | 1503 | size_t Compile(Exp exp, DFA* dfa) { 1504 | return CompileImpl(exp, false, dfa); 1505 | } 1506 | 1507 | size_t Compile(Exp exp, TNFA* tnfa) { 1508 | return CompileImpl(exp, true, tnfa); 1509 | } 1510 | 1511 | bool Match(const DFA& dfa, llvm::StringRef str) { 1512 | int curr = 0; 1513 | while (!str.empty()) { 1514 | int byte = static_cast(str[0]); 1515 | str = str.drop_front(1); 1516 | auto transition = dfa.transition_.find(std::make_pair(curr, byte)); 1517 | if (transition == dfa.transition_.end()) { 1518 | // Get the "default" transition. 1519 | transition = dfa.transition_.find(std::make_pair(curr, -1)); 1520 | } 1521 | int next = transition->second; 1522 | curr = next; 1523 | } 1524 | return dfa.IsAccepting(curr); 1525 | } 1526 | 1527 | // Applies the Bindings to offsets using pos. 
1528 | static void ApplyBindings(const Bindings& bindings, 1529 | int pos, 1530 | std::vector* offsets) { 1531 | for (const auto& i : bindings) { 1532 | int l = 2 * i.first + 0; 1533 | int r = 2 * i.first + 1; 1534 | switch (i.second) { 1535 | case kCancel: 1536 | if ((*offsets)[l] != -1) { 1537 | (*offsets)[l] = -1; 1538 | (*offsets)[r] = -1; 1539 | } 1540 | continue; 1541 | case kEpsilon: 1542 | case kAppend: 1543 | if ((*offsets)[l] == -1) { 1544 | (*offsets)[l] = pos; 1545 | (*offsets)[r] = pos; 1546 | } 1547 | if (i.second == kAppend) { 1548 | ++(*offsets)[r]; 1549 | } 1550 | continue; 1551 | } 1552 | abort(); 1553 | } 1554 | } 1555 | 1556 | // Returns true iff x precedes y in the total order specified by modes. 1557 | static bool Precedes(const std::vector& x, 1558 | const std::vector& y, 1559 | const std::vector& modes) { 1560 | for (size_t i = 0; i < modes.size(); ++i) { 1561 | int l = 2 * i + 0; 1562 | int r = 2 * i + 1; 1563 | if (x[l] == -1 && y[l] == -1) { 1564 | continue; 1565 | } else if (x[l] == -1) { 1566 | return false; 1567 | } else if (y[l] == -1) { 1568 | return true; 1569 | } else if (modes[i] == kPassive) { 1570 | continue; 1571 | } else if (x[l] < y[l]) { 1572 | return true; 1573 | } else if (x[l] > y[l]) { 1574 | return false; 1575 | } else if (x[r] < y[r]) { 1576 | return modes[i] == kMinimal; 1577 | } else if (x[r] > y[r]) { 1578 | return modes[i] == kMaximal; 1579 | } else { 1580 | continue; 1581 | } 1582 | } 1583 | return false; 1584 | } 1585 | 1586 | bool Match(const TNFA& tnfa, llvm::StringRef str, 1587 | std::vector* offsets) { 1588 | auto CompareOffsets = [&tnfa](const std::pair>& x, 1589 | const std::pair>& y) -> bool { 1590 | return Precedes(x.second, y.second, tnfa.modes_); 1591 | }; 1592 | std::list>> curr_states; 1593 | curr_states.push_back(std::make_pair(0, std::vector(2 * tnfa.modes_.size(), -1))); 1594 | int pos = 0; 1595 | while (!str.empty()) { 1596 | int byte = static_cast(str[0]); 1597 | str = str.drop_front(1); 1598 | 
// For each current state, determine the next states - applying Bindings - 1599 | // and then sort them by comparing offsets. Doing this repeatedly from the 1600 | // initial state and discarding next states that have been seen already in 1601 | // the current round is intended to simulate a VM implementation. 1602 | std::list>> next_states; 1603 | std::set seen; 1604 | for (const auto& i : curr_states) { 1605 | int curr = i.first; 1606 | std::pair key = std::make_pair(curr, byte); 1607 | auto transition = tnfa.transition_.lower_bound(key); 1608 | if (transition == tnfa.transition_.upper_bound(key)) { 1609 | // Get the "default" transition. 1610 | key = std::make_pair(curr, -1); 1611 | transition = tnfa.transition_.lower_bound(key); 1612 | } 1613 | std::list>> tmp; 1614 | while (transition != tnfa.transition_.upper_bound(key)) { 1615 | int next = transition->second.first; 1616 | if (seen.count(next) == 0 && 1617 | !tnfa.IsError(next)) { 1618 | seen.insert(next); 1619 | std::vector copy = i.second; 1620 | ApplyBindings(transition->second.second, pos, ©); 1621 | tmp.push_back(std::make_pair(next, copy)); 1622 | } 1623 | ++transition; 1624 | } 1625 | tmp.sort(CompareOffsets); 1626 | next_states.insert(next_states.end(), tmp.begin(), tmp.end()); 1627 | } 1628 | curr_states.swap(next_states); 1629 | ++pos; 1630 | } 1631 | for (const auto& i : curr_states) { 1632 | int curr = i.first; 1633 | if (tnfa.IsAccepting(curr)) { 1634 | std::vector copy = i.second; 1635 | ApplyBindings(tnfa.final_.find(curr)->second, pos, ©); 1636 | offsets->resize(2 * tnfa.captures_.size()); 1637 | for (size_t j = 0; j < tnfa.captures_.size(); ++j) { 1638 | (*offsets)[2 * j + 0] = copy[2 * tnfa.captures_[j] + 0]; 1639 | (*offsets)[2 * j + 1] = copy[2 * tnfa.captures_[j] + 1]; 1640 | } 1641 | return true; 1642 | } 1643 | } 1644 | return false; 1645 | } 1646 | 1647 | typedef bool NativeMatch(const char*, size_t); 1648 | 1649 | static llvm::FunctionType* getNativeMatchFnTy(llvm::LLVMContext& 
context) { 1650 | return llvm::FunctionType::get(llvm::Type::getInt1Ty(context), 1651 | {llvm::PointerType::getUnqual(context), 1652 | llvm::Type::getScalarTy(context)}, 1653 | false); 1654 | } 1655 | 1656 | Fun::Fun() { 1657 | static std::once_flag once_flag; 1658 | std::call_once(once_flag, []() { 1659 | llvm::InitializeNativeTarget(); 1660 | llvm::InitializeNativeTargetAsmPrinter(); 1661 | llvm::InitializeNativeTargetAsmParser(); 1662 | }); 1663 | context_.reset(new llvm::LLVMContext); 1664 | module_ = new llvm::Module("M", *context_); 1665 | engine_.reset(llvm::EngineBuilder(std::unique_ptr(module_)) 1666 | .setMCPU(llvm::sys::getHostCPUName()) 1667 | .create()); 1668 | function_ = 1669 | llvm::Function::Create(getNativeMatchFnTy(*context_), 1670 | llvm::GlobalValue::ExternalLinkage, "F", module_); 1671 | } 1672 | 1673 | Fun::~Fun() {} 1674 | 1675 | // Generates the function for the DFA. 1676 | static void GenerateFunction(const DFA& dfa, Fun* fun) { 1677 | llvm::LLVMContext& context = *fun->context_; // for convenience 1678 | llvm::IRBuilder<> bb(context); 1679 | 1680 | // Create the entry BasicBlock and two automatic variables, then store the 1681 | // Function Arguments in the automatic variables. 1682 | llvm::BasicBlock* entry = 1683 | llvm::BasicBlock::Create(context, "entry", fun->function_); 1684 | bb.SetInsertPoint(entry); 1685 | llvm::AllocaInst* data = bb.CreateAlloca( 1686 | llvm::PointerType::getUnqual(context), nullptr, "data"); 1687 | llvm::AllocaInst* size = bb.CreateAlloca( 1688 | llvm::Type::getScalarTy(context), nullptr, "size"); 1689 | llvm::Function::arg_iterator arg = fun->function_->arg_begin(); 1690 | bb.CreateStore(&*arg++, data); 1691 | bb.CreateStore(&*arg++, size); 1692 | 1693 | // Create a BasicBlock that returns true. 
1694 | llvm::BasicBlock* return_true = 1695 | llvm::BasicBlock::Create(context, "return_true", fun->function_); 1696 | bb.SetInsertPoint(return_true); 1697 | bb.CreateRet(bb.getTrue()); 1698 | 1699 | // Create a BasicBlock that returns false. 1700 | llvm::BasicBlock* return_false = 1701 | llvm::BasicBlock::Create(context, "return_false", fun->function_); 1702 | bb.SetInsertPoint(return_false); 1703 | bb.CreateRet(bb.getFalse()); 1704 | 1705 | // Create two BasicBlocks per DFA state: the first branches if we have hit 1706 | // the end of the string; the second switches to the next DFA state after 1707 | // updating the automatic variables. 1708 | std::vector> states; 1709 | states.reserve(dfa.accepting_.size()); 1710 | for (const auto& i : dfa.accepting_) { 1711 | llvm::BasicBlock* bb0 = 1712 | llvm::BasicBlock::Create(context, "", fun->function_); 1713 | llvm::BasicBlock* bb1 = 1714 | llvm::BasicBlock::Create(context, "", fun->function_); 1715 | 1716 | auto sizeTy = llvm::Type::getScalarTy(context); 1717 | auto int8PtrTy = llvm::PointerType::getUnqual(context); 1718 | auto int8Ty = llvm::Type::getInt8Ty(context); 1719 | 1720 | bb.SetInsertPoint(bb0); 1721 | bb.CreateCondBr( 1722 | bb.CreateIsNull(bb.CreateLoad(sizeTy, size)), 1723 | i.second ? return_true : return_false, 1724 | bb1); 1725 | 1726 | bb.SetInsertPoint(bb1); 1727 | llvm::LoadInst* bytep = bb.CreateLoad(int8PtrTy, data); 1728 | llvm::LoadInst* byte = bb.CreateLoad(int8Ty, bytep); 1729 | bb.CreateStore( 1730 | bb.CreateGEP(int8Ty, bytep, bb.getInt64(1)), 1731 | data); 1732 | bb.CreateStore( 1733 | bb.CreateSub(bb.CreateLoad(sizeTy, size), bb.getInt64(1)), 1734 | size); 1735 | // Set the "default" transition to ourselves for now. We could look it up, 1736 | // but its BasicBlock might not exist yet, so we will just fix it up later. 1737 | bb.CreateSwitch(byte, bb0); 1738 | 1739 | states.push_back(std::make_pair(bb0, bb1)); 1740 | } 1741 | 1742 | // Wire up the BasicBlocks. 
1743 | for (const auto& i : dfa.transition_) { 1744 | // Get the current DFA state. 1745 | llvm::BasicBlock* bb1 = states[i.first.first].second; 1746 | llvm::SwitchInst* swi = llvm::cast(bb1->getTerminator()); 1747 | // Get the next DFA state. 1748 | llvm::BasicBlock* bb0 = states[i.second].first; 1749 | if (i.first.second == -1) { 1750 | // Set the "default" transition. 1751 | swi->setDefaultDest(bb0); 1752 | } else { 1753 | swi->addCase(llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 1754 | i.first.second), 1755 | bb0); 1756 | } 1757 | } 1758 | 1759 | // Plug in the entry BasicBlock. 1760 | bb.SetInsertPoint(entry); 1761 | bb.CreateBr(states[0].first); 1762 | 1763 | // Do we begin by scanning memory for a byte? If so, we can make memchr(3) do 1764 | // that for us. It will almost certainly be vectorised and thus much faster. 1765 | { 1766 | llvm::BasicBlock* bb0 = states[0].first; 1767 | llvm::BasicBlock* bb1 = states[0].second; 1768 | llvm::BranchInst* bra = llvm::cast(bb0->getTerminator()); 1769 | llvm::SwitchInst* swi = llvm::cast(bb1->getTerminator()); 1770 | if (swi->getDefaultDest() == bb0 && 1771 | swi->getNumCases() == 1) { 1772 | // What is the byte that we are trying to find? 1773 | fun->memchr_byte_ = swi->case_begin()->getCaseValue()->getZExtValue(); 1774 | // What should we return if we fail to find it? 1775 | fun->memchr_fail_ = bra->getSuccessor(0) == return_true; 1776 | } else { 1777 | fun->memchr_byte_ = -1; 1778 | } 1779 | } 1780 | 1781 | // Optimise the module. 1782 | // NOTE(junyer): This was cargo-culted from Clang. Ordering matters! 
1783 | llvm::LoopAnalysisManager lam; 1784 | llvm::FunctionAnalysisManager fam; 1785 | llvm::CGSCCAnalysisManager cam; 1786 | llvm::ModuleAnalysisManager mam; 1787 | 1788 | llvm::PassBuilder pb(fun->engine_->getTargetMachine()); 1789 | pb.registerModuleAnalyses(mam); 1790 | pb.registerCGSCCAnalyses(cam); 1791 | pb.registerFunctionAnalyses(fam); 1792 | pb.registerLoopAnalyses(lam); 1793 | pb.registerModuleAnalyses(mam); 1794 | pb.crossRegisterProxies(lam, fam, cam, mam); 1795 | 1796 | llvm::ModulePassManager mpm = 1797 | pb.buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O2); 1798 | mpm.run(*fun->module_, mam); 1799 | } 1800 | 1801 | // This seems to be the only way to discover the machine code size. 1802 | class DiscoverMachineCodeSize : public llvm::JITEventListener { 1803 | public: 1804 | explicit DiscoverMachineCodeSize(Fun* fun) : fun_(fun) {} 1805 | ~DiscoverMachineCodeSize() override {} 1806 | 1807 | void 1808 | notifyObjectLoaded(ObjectKey, const llvm::object::ObjectFile &object, 1809 | const llvm::RuntimeDyld::LoadedObjectInfo &info) override { 1810 | // We need this in order to obtain the addresses as well as the sizes. 1811 | llvm::object::OwningBinary debug = 1812 | info.getObjectForDebug(object); 1813 | std::vector> symbol_sizes = 1814 | llvm::object::computeSymbolSizes(*debug.getBinary()); 1815 | for (const auto& i : symbol_sizes) { 1816 | llvm::Expected name = i.first.getName(); 1817 | llvm::Expected addr = i.first.getAddress(); 1818 | if (name && addr && *name == "F") { 1819 | fun_->machine_code_addr_ = *addr; 1820 | fun_->machine_code_size_ = i.second; 1821 | return; 1822 | } 1823 | } 1824 | abort(); 1825 | } 1826 | 1827 | private: 1828 | Fun* fun_; 1829 | 1830 | DiscoverMachineCodeSize(const DiscoverMachineCodeSize&) = delete; 1831 | DiscoverMachineCodeSize& operator=(const DiscoverMachineCodeSize&) = delete; 1832 | }; 1833 | 1834 | // Generates the machine code for the function. 
1835 | static void GenerateMachineCode(Fun* fun) { 1836 | DiscoverMachineCodeSize dmcs(fun); 1837 | fun->engine_->RegisterJITEventListener(&dmcs); 1838 | fun->engine_->finalizeObject(); 1839 | fun->engine_->UnregisterJITEventListener(&dmcs); 1840 | } 1841 | 1842 | size_t Compile(const DFA& dfa, Fun* fun) { 1843 | GenerateFunction(dfa, fun); 1844 | GenerateMachineCode(fun); 1845 | return fun->machine_code_size_; 1846 | } 1847 | 1848 | bool Match(const Fun& fun, llvm::StringRef str) { 1849 | if (fun.memchr_byte_ != -1) { 1850 | const void* ptr = memchr(str.data(), fun.memchr_byte_, str.size()); 1851 | if (ptr == nullptr) { 1852 | return fun.memchr_fail_; 1853 | } 1854 | str = str.drop_front(reinterpret_cast(ptr) - str.data()); 1855 | } 1856 | NativeMatch* match = reinterpret_cast(fun.machine_code_addr_); 1857 | return (*match)(str.data(), str.size()); 1858 | } 1859 | 1860 | } // namespace redgrep 1861 | -------------------------------------------------------------------------------- /regexp.h: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #ifndef REDGREP_REGEXP_H_ 16 | #define REDGREP_REGEXP_H_ 17 | 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #include "llvm/ADT/StringRef.h" 32 | #include "utf.h" 33 | 34 | namespace llvm { 35 | class ExecutionEngine; 36 | class Function; 37 | class LLVMContext; 38 | class Module; 39 | } // namespace llvm 40 | 41 | namespace redgrep { 42 | 43 | // Implements regular expressions using Brzozowski derivatives, Antimirov 44 | // partial derivatives, Sulzmann submatches and Laurikari tagged transitions. 45 | // 46 | // References 47 | // ---------- 48 | // 49 | // "Derivatives of Regular Expressions" 50 | // Janusz Brzozowski 51 | // Journal of the ACM, vol. 11 iss. 4, pp. 481-494, October 1964 52 | // http://dl.acm.org/citation.cfm?id=321249 53 | // 54 | // "Regular-expression derivatives re-examined" 55 | // Scott Owens, John Reppy, Aaron Turon 56 | // Journal of Functional Programming, vol. 19 iss. 2, pp. 173-190, March 2009 57 | // http://dl.acm.org/citation.cfm?id=1520288 58 | // 59 | // "Partial Derivatives of Regular Expressions and Finite Automaton Constructions" 60 | // Valentin Antimirov 61 | // Theoretical Computer Science, vol. 155 iss. 2, pp. 291-319, March 1996 62 | // http://dl.acm.org/citation.cfm?id=231848 63 | // 64 | // "Partial Derivatives of an Extended Regular Expression" 65 | // Pascal Caron, Jean-Marc Champarnaud, Ludovic Mignot 66 | // Language and Automata Theory and Applications 2011, pp. 179-191, May 2011 67 | // http://dl.acm.org/citation.cfm?id=2022911 68 | // 69 | // "A Flexible and Efficient ML Lexer Tool Based on Extended Regular Expression Submatching" 70 | // Martin Sulzmann, Pippijn van Steenhoven 71 | // Compiler Construction 2014, pp. 
174-191, April 2014 72 | // http://dx.doi.org/10.1007/978-3-642-54807-9_10 73 | // 74 | // "Efficient submatch addressing for regular expressions" 75 | // Ville Laurikari 76 | // Master's Thesis, November 2001 77 | // http://laurikari.net/ville/regex-submatch.pdf 78 | 79 | enum Kind { 80 | kEmptySet, 81 | kEmptyString, 82 | kGroup, 83 | kAnyByte, 84 | kByte, 85 | kByteRange, 86 | kKleeneClosure, 87 | kConcatenation, 88 | kComplement, 89 | kConjunction, 90 | kDisjunction, 91 | kCharacterClass, // ephemeral 92 | kQuantifier, // ephemeral 93 | }; 94 | 95 | enum Mode { 96 | kMinimal, 97 | kPassive, 98 | kMaximal, 99 | }; 100 | 101 | class Expression; 102 | typedef std::shared_ptr Exp; 103 | 104 | // Represents a regular expression. 105 | // Note that the data members are const in order to guarantee immutability, 106 | // which will matter later when we use expressions as STL container keys. 107 | class Expression { 108 | public: 109 | explicit Expression(Kind kind); 110 | Expression(Kind kind, const std::tuple& group); 111 | Expression(Kind kind, int byte); 112 | Expression(Kind kind, const std::pair& byte_range); 113 | Expression(Kind kind, const std::list& subexpressions, bool norm); 114 | Expression(Kind kind, const std::pair, bool>& character_class); 115 | Expression(Kind kind, const std::tuple& quantifier); 116 | ~Expression(); 117 | 118 | Kind kind() const { return kind_; } 119 | intptr_t data() const { return data_; } 120 | bool norm() const { return norm_; } 121 | 122 | // Accessors for the expression data. Of course, if you call the wrong 123 | // function for the expression kind, you're gonna have a bad time. 124 | const std::tuple& group() const; 125 | int byte() const; 126 | const std::pair& byte_range() const; 127 | const std::list& subexpressions() const; 128 | const std::pair, bool>& character_class() const; 129 | const std::tuple& quantifier() const; 130 | 131 | // A KleeneClosure or Complement expression has one subexpression. 
132 | // Use sub() for convenience. 133 | Exp sub() const { return subexpressions().front(); } 134 | 135 | // A Concatenation expression has two subexpressions, the second typically 136 | // being another Concatenation. Thus, the concept of "head" and "tail". 137 | // Use head() and tail() for convenience. 138 | Exp head() const { return subexpressions().front(); } 139 | Exp tail() const { return subexpressions().back(); } 140 | 141 | friend bool operator<(Exp x, Exp y) { return Compare(x, y) < 0; } 142 | friend bool operator<=(Exp x, Exp y) { return Compare(x, y) <= 0; } 143 | friend bool operator==(Exp x, Exp y) { return Compare(x, y) == 0; } 144 | friend bool operator!=(Exp x, Exp y) { return Compare(x, y) != 0; } 145 | friend bool operator>(Exp x, Exp y) { return Compare(x, y) > 0; } 146 | friend bool operator>=(Exp x, Exp y) { return Compare(x, y) >= 0; } 147 | 148 | private: 149 | // Returns -1, 0 or +1 when x is less than, equal to or greater than y, 150 | // respectively, so that we can define operators above for convenience. 151 | static int Compare(Exp x, Exp y); 152 | 153 | const Kind kind_; 154 | const intptr_t data_; 155 | const bool norm_; 156 | 157 | Expression(const Expression&) = delete; 158 | Expression& operator=(const Expression&) = delete; 159 | }; 160 | 161 | // Builders for the various expression kinds. 162 | // Use the inline functions for convenience when building up expressions in 163 | // parser code, test code et cetera. 
164 | 165 | Exp EmptySet(); 166 | Exp EmptyString(); 167 | Exp Group(const std::tuple& group); 168 | Exp AnyByte(); 169 | Exp Byte(int byte); 170 | Exp ByteRange(const std::pair& byte_range); 171 | Exp KleeneClosure(const std::list& subexpressions, bool norm); 172 | Exp Concatenation(const std::list& subexpressions, bool norm); 173 | Exp Complement(const std::list& subexpressions, bool norm); 174 | Exp Conjunction(const std::list& subexpressions, bool norm); 175 | Exp Disjunction(const std::list& subexpressions, bool norm); 176 | Exp CharacterClass(const std::pair, bool>& character_class); 177 | Exp Quantifier(const std::tuple& quantifier); 178 | 179 | inline Exp Group(int num, Exp sub, Mode mode, bool capture) { 180 | return Group(std::make_tuple(num, sub, mode, capture)); 181 | } 182 | 183 | inline Exp ByteRange(int min, int max) { 184 | return ByteRange(std::make_pair(min, max)); 185 | } 186 | 187 | inline Exp KleeneClosure(Exp x) { 188 | return KleeneClosure({x}, false); 189 | } 190 | 191 | inline Exp Concatenation(Exp x, Exp y) { 192 | return Concatenation({x, y}, false); 193 | } 194 | 195 | template 196 | inline Exp Concatenation(Exp x, Exp y, Variadic... z) { 197 | return Concatenation({x, Concatenation(y, z...)}, false); 198 | } 199 | 200 | inline Exp Complement(Exp x) { 201 | return Complement({x}, false); 202 | } 203 | 204 | template 205 | inline Exp Conjunction(Exp x, Exp y, Variadic... z) { 206 | return Conjunction({x, y, z...}, false); 207 | } 208 | 209 | template 210 | inline Exp Disjunction(Exp x, Exp y, Variadic... 
z) { 211 | return Disjunction({x, y, z...}, false); 212 | } 213 | 214 | inline Exp CharacterClass(const std::set& characters, bool complement) { 215 | return CharacterClass(std::make_pair(characters, complement)); 216 | } 217 | 218 | inline Exp Quantifier(Exp sub, int min, int max) { 219 | return Quantifier(std::make_tuple(sub, min, max)); 220 | } 221 | 222 | Exp AnyCharacter(); 223 | Exp Character(Rune character); 224 | 225 | // Returns the normalised form of exp. 226 | Exp Normalised(Exp exp); 227 | 228 | // Returns the nullability of exp as a bool. 229 | // EmptySet and EmptyString map to false and true, respectively. 230 | bool IsNullable(Exp exp); 231 | 232 | // Returns the derivative of exp with respect to byte. 233 | Exp Derivative(Exp exp, int byte); 234 | 235 | enum BindingType { 236 | kCancel, 237 | kEpsilon, 238 | kAppend, 239 | }; 240 | 241 | typedef std::list> Bindings; 242 | 243 | // Conceptually, an OuterSet is a Disjunction and an InnerSet is a Conjunction. 244 | // For simplicity, we don't introduce a new type for the latter, but the former 245 | // needs to associate each InnerSet with its Bindings. 246 | typedef std::list> OuterSet; 247 | typedef std::unique_ptr Outer; 248 | 249 | // Returns the denormalised form of exp. 250 | Outer Denormalised(Exp exp); 251 | 252 | // Partial() helpers for building OuterSets. Exposed for ease of testing. 253 | Outer PartialConcatenation(Outer x, Exp y, const Bindings& initial); 254 | Outer PartialComplement(Outer x); 255 | Outer PartialConjunction(Outer x, Outer y); 256 | Outer PartialDisjunction(Outer x, Outer y); 257 | 258 | // Returns the partial derivative of exp with respect to byte. 259 | Outer Partial(Exp exp, int byte); 260 | 261 | // Outputs the partitions computed for exp. 262 | // The first partition should be Σ-based. Any others should be ∅-based. 263 | void Partitions(Exp exp, std::list>* partitions); 264 | 265 | // Outputs the expression parsed from str. 
266 | // Returns true on success, false on failure. 267 | bool Parse(llvm::StringRef str, Exp* exp); 268 | 269 | // Outputs the expression parsed from str as well as the mode of each Group and 270 | // which Groups capture. 271 | // Returns true on success, false on failure. 272 | bool Parse(llvm::StringRef str, Exp* exp, 273 | std::vector* modes, std::vector* captures); 274 | 275 | // Returns the result of matching str using exp. 276 | bool Match(Exp exp, llvm::StringRef str); 277 | 278 | // Represents a finite automaton. 279 | class FA { 280 | public: 281 | FA() : error_(-1), empty_(-1) {} 282 | virtual ~FA() {} 283 | 284 | bool IsError(int state) const { 285 | return state == error_; 286 | } 287 | 288 | bool IsEmpty(int state) const { 289 | return state == empty_; 290 | } 291 | 292 | bool IsAccepting(int state) const { 293 | return accepting_.find(state)->second; 294 | } 295 | 296 | int error_; 297 | int empty_; 298 | std::map accepting_; 299 | std::map>> partitions_; 300 | 301 | private: 302 | FA(const FA&) = delete; 303 | FA& operator=(const FA&) = delete; 304 | }; 305 | 306 | // Represents a deterministic finite automaton. 307 | class DFA : public FA { 308 | public: 309 | DFA() {} 310 | ~DFA() override {} 311 | 312 | std::map, int> transition_; 313 | 314 | private: 315 | DFA(const DFA&) = delete; 316 | DFA& operator=(const DFA&) = delete; 317 | }; 318 | 319 | // Represents a tagged nondeterministic finite automaton. 320 | class TNFA : public FA { 321 | public: 322 | TNFA() {} 323 | ~TNFA() override {} 324 | 325 | std::vector modes_; 326 | std::vector captures_; 327 | 328 | std::multimap, std::pair> transition_; 329 | std::map final_; 330 | 331 | private: 332 | TNFA(const TNFA&) = delete; 333 | TNFA& operator=(const TNFA&) = delete; 334 | }; 335 | 336 | // Outputs the DFA compiled from exp. 337 | // Returns the number of DFA states. 338 | size_t Compile(Exp exp, DFA* dfa); 339 | 340 | // Outputs the TNFA compiled from exp. 
341 | // Returns the number of TNFA states. 342 | size_t Compile(Exp exp, TNFA* tnfa); 343 | 344 | // Returns the result of matching str using dfa. 345 | bool Match(const DFA& dfa, llvm::StringRef str); 346 | 347 | // Returns the result of matching str using tnfa. 348 | // Outputs the offsets of the beginning and ending of each Group that captures. 349 | // Thus, the nth Group begins at offsets[2*n+0] and ends at offsets[2*n+1]. 350 | bool Match(const TNFA& tnfa, llvm::StringRef str, 351 | std::vector* offsets); 352 | 353 | // Represents a function and its machine code. 354 | struct Fun { 355 | Fun(); 356 | ~Fun(); 357 | 358 | std::unique_ptr context_; 359 | llvm::Module* module_; // Not owned. 360 | std::unique_ptr engine_; 361 | llvm::Function* function_; // Not owned. 362 | 363 | int memchr_byte_; 364 | bool memchr_fail_; 365 | 366 | uint64_t machine_code_addr_; 367 | uint64_t machine_code_size_; 368 | }; 369 | 370 | // Outputs the function compiled from dfa. 371 | // Returns the number of bytes of machine code. 372 | size_t Compile(const DFA& dfa, Fun* fun); 373 | 374 | // Returns the result of matching str using fun. 375 | bool Match(const Fun& fun, llvm::StringRef str); 376 | 377 | } // namespace redgrep 378 | 379 | #endif // REDGREP_REGEXP_H_ 380 | -------------------------------------------------------------------------------- /regexp_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "gtest/gtest.h" 16 | #include "regexp.h" 17 | 18 | namespace redgrep { 19 | 20 | TEST(Compare, EmptySet) { 21 | EXPECT_EQ( 22 | EmptySet(), 23 | EmptySet()); 24 | } 25 | 26 | TEST(Compare, EmptyString) { 27 | EXPECT_EQ( 28 | EmptyString(), 29 | EmptyString()); 30 | } 31 | 32 | TEST(Compare, Group) { 33 | EXPECT_EQ( 34 | Group(0, Byte('a'), kPassive, true), 35 | Group(0, Byte('a'), kPassive, true)); 36 | EXPECT_LT( 37 | Group(0, Byte('a'), kPassive, true), 38 | Group(1, Byte('a'), kPassive, true)); 39 | } 40 | 41 | TEST(Compare, AnyByte) { 42 | EXPECT_EQ( 43 | AnyByte(), 44 | AnyByte()); 45 | } 46 | 47 | TEST(Compare, Byte) { 48 | EXPECT_EQ( 49 | Byte('a'), 50 | Byte('a')); 51 | EXPECT_LT( 52 | Byte('a'), 53 | Byte('b')); 54 | } 55 | 56 | TEST(Compare, ByteRange) { 57 | EXPECT_EQ( 58 | ByteRange('a', 'c'), 59 | ByteRange('a', 'c')); 60 | EXPECT_LT( 61 | ByteRange('a', 'c'), 62 | ByteRange('b', 'd')); 63 | } 64 | 65 | TEST(Compare, KleeneClosure) { 66 | EXPECT_EQ( 67 | KleeneClosure(Byte('a')), 68 | KleeneClosure(Byte('a'))); 69 | EXPECT_LT( 70 | KleeneClosure(Byte('a')), 71 | KleeneClosure(Byte('b'))); 72 | } 73 | 74 | TEST(Compare, Concatenation) { 75 | EXPECT_EQ( 76 | Concatenation(Byte('a'), Byte('b'), Byte('c')), 77 | Concatenation(Byte('a'), Byte('b'), Byte('c'))); 78 | EXPECT_LT( 79 | Concatenation(Byte('a'), Byte('b'), Byte('c')), 80 | Concatenation(Byte('b'), Byte('c'), Byte('d'))); 81 | } 82 | 83 | TEST(Compare, Complement) { 84 | EXPECT_EQ( 85 | Complement(Byte('a')), 86 | 
Complement(Byte('a'))); 87 | EXPECT_LT( 88 | Complement(Byte('a')), 89 | Complement(Byte('b'))); 90 | } 91 | 92 | TEST(Compare, Conjunction) { 93 | EXPECT_EQ( 94 | Conjunction(Byte('a'), Byte('b'), Byte('c')), 95 | Conjunction(Byte('a'), Byte('b'), Byte('c'))); 96 | EXPECT_LT( 97 | Conjunction(Byte('a'), Byte('b'), Byte('c')), 98 | Conjunction(Byte('b'), Byte('c'), Byte('d'))); 99 | } 100 | 101 | TEST(Compare, Disjunction) { 102 | EXPECT_EQ( 103 | Disjunction(Byte('a'), Byte('b'), Byte('c')), 104 | Disjunction(Byte('a'), Byte('b'), Byte('c'))); 105 | EXPECT_LT( 106 | Disjunction(Byte('a'), Byte('b'), Byte('c')), 107 | Disjunction(Byte('b'), Byte('c'), Byte('d'))); 108 | } 109 | 110 | #define EXPECT_NORMALISED(expected, exp) \ 111 | do { \ 112 | EXPECT_EQ(expected, Normalised(exp)); \ 113 | } while (0) 114 | 115 | TEST(Normalised, EmptySet) { 116 | EXPECT_NORMALISED( 117 | EmptySet(), 118 | EmptySet()); 119 | } 120 | 121 | TEST(Normalised, EmptyString) { 122 | EXPECT_NORMALISED( 123 | EmptyString(), 124 | EmptyString()); 125 | } 126 | 127 | TEST(Normalised, Group) { 128 | EXPECT_NORMALISED( 129 | EmptySet(), 130 | Group(0, EmptySet(), kPassive, true)); 131 | EXPECT_NORMALISED( 132 | EmptyString(), 133 | Group(0, EmptyString(), kPassive, true)); 134 | EXPECT_NORMALISED( 135 | Group(0, Byte('a'), kPassive, true), 136 | Group(0, Byte('a'), kPassive, true)); 137 | } 138 | 139 | TEST(Normalised, AnyByte) { 140 | EXPECT_NORMALISED( 141 | AnyByte(), 142 | AnyByte()); 143 | } 144 | 145 | TEST(Normalised, Byte) { 146 | EXPECT_NORMALISED( 147 | Byte('a'), 148 | Byte('a')); 149 | } 150 | 151 | TEST(Normalised, ByteRange) { 152 | EXPECT_NORMALISED( 153 | ByteRange('a', 'c'), 154 | ByteRange('a', 'c')); 155 | } 156 | 157 | TEST(Normalised, KleeneClosure) { 158 | EXPECT_NORMALISED( 159 | KleeneClosure(Byte('a')), 160 | KleeneClosure(KleeneClosure(Byte('a')))); 161 | EXPECT_NORMALISED( 162 | EmptyString(), 163 | KleeneClosure(EmptySet())); 164 | EXPECT_NORMALISED( 165 | 
EmptyString(), 166 | KleeneClosure(EmptyString())); 167 | EXPECT_NORMALISED( 168 | Complement(EmptySet()), 169 | KleeneClosure(AnyByte())); 170 | } 171 | 172 | TEST(Normalised, Concatenation) { 173 | EXPECT_NORMALISED( 174 | Concatenation( 175 | Byte('a'), 176 | Concatenation( 177 | Byte('b'), 178 | Byte('c'))), 179 | Concatenation( 180 | Concatenation( 181 | Byte('a'), 182 | Byte('b')), 183 | Byte('c'))); 184 | EXPECT_NORMALISED( 185 | EmptySet(), 186 | Concatenation(EmptySet(), Byte('a'))); 187 | EXPECT_NORMALISED( 188 | EmptySet(), 189 | Concatenation(Byte('a'), EmptySet())); 190 | EXPECT_NORMALISED( 191 | Byte('a'), 192 | Concatenation(EmptyString(), Byte('a'))); 193 | EXPECT_NORMALISED( 194 | Byte('a'), 195 | Concatenation(Byte('a'), EmptyString())); 196 | } 197 | 198 | TEST(Normalised, Complement) { 199 | EXPECT_NORMALISED( 200 | Byte('a'), 201 | Complement(Complement(Byte('a')))); 202 | } 203 | 204 | TEST(Normalised, Conjunction) { 205 | EXPECT_NORMALISED( 206 | Conjunction( 207 | Byte('a'), 208 | Byte('b'), 209 | Byte('c')), 210 | Conjunction( 211 | Conjunction( 212 | Byte('a'), 213 | Byte('b')), 214 | Byte('c'))); 215 | EXPECT_NORMALISED( 216 | Conjunction(Byte('a'), Byte('b')), 217 | Conjunction(Byte('b'), Byte('a'))); 218 | EXPECT_NORMALISED( 219 | Byte('a'), 220 | Conjunction(Byte('a'), Byte('a'))); 221 | EXPECT_NORMALISED( 222 | EmptySet(), 223 | Conjunction(Byte('a'), EmptySet())); 224 | EXPECT_NORMALISED( 225 | Byte('a'), 226 | Conjunction(Byte('a'), Complement(EmptySet()))); 227 | } 228 | 229 | TEST(Normalised, Disjunction) { 230 | EXPECT_NORMALISED( 231 | Disjunction( 232 | Byte('a'), 233 | Byte('b'), 234 | Byte('c')), 235 | Disjunction( 236 | Disjunction( 237 | Byte('a'), 238 | Byte('b')), 239 | Byte('c'))); 240 | EXPECT_NORMALISED( 241 | Disjunction(Byte('a'), Byte('b')), 242 | Disjunction(Byte('b'), Byte('a'))); 243 | EXPECT_NORMALISED( 244 | Byte('a'), 245 | Disjunction(Byte('a'), Byte('a'))); 246 | EXPECT_NORMALISED( 247 | Byte('a'), 248 | 
Disjunction(Byte('a'), EmptySet())); 249 | EXPECT_NORMALISED( 250 | Complement(EmptySet()), 251 | Disjunction(Byte('a'), Complement(EmptySet()))); 252 | } 253 | 254 | #define EXPECT_ISNULLABLE(expected, exp) \ 255 | do { \ 256 | if (expected) { \ 257 | EXPECT_TRUE(IsNullable(exp)); \ 258 | } else { \ 259 | EXPECT_FALSE(IsNullable(exp)); \ 260 | } \ 261 | } while (0) 262 | 263 | TEST(IsNullable, EmptySet) { 264 | EXPECT_ISNULLABLE( 265 | false, 266 | EmptySet()); 267 | } 268 | 269 | TEST(IsNullable, EmptyString) { 270 | EXPECT_ISNULLABLE( 271 | true, 272 | EmptyString()); 273 | } 274 | 275 | TEST(IsNullable, Group) { 276 | EXPECT_ISNULLABLE( 277 | false, 278 | Group(0, Byte('a'), kPassive, true)); 279 | } 280 | 281 | TEST(IsNullable, AnyByte) { 282 | EXPECT_ISNULLABLE( 283 | false, 284 | AnyByte()); 285 | } 286 | 287 | TEST(IsNullable, Byte) { 288 | EXPECT_ISNULLABLE( 289 | false, 290 | Byte('a')); 291 | } 292 | 293 | TEST(IsNullable, ByteRange) { 294 | EXPECT_ISNULLABLE( 295 | false, 296 | ByteRange('a', 'c')); 297 | } 298 | 299 | TEST(IsNullable, KleeneClosure) { 300 | EXPECT_ISNULLABLE( 301 | true, 302 | KleeneClosure(Byte('a'))); 303 | } 304 | 305 | TEST(IsNullable, Concatenation) { 306 | EXPECT_ISNULLABLE( 307 | false, 308 | Concatenation(Byte('a'), Byte('b'))); 309 | } 310 | 311 | TEST(IsNullable, Complement) { 312 | EXPECT_ISNULLABLE( 313 | true, 314 | Complement(Byte('a'))); 315 | } 316 | 317 | TEST(IsNullable, Conjunction) { 318 | EXPECT_ISNULLABLE( 319 | false, 320 | Conjunction(Byte('a'), Byte('b'))); 321 | } 322 | 323 | TEST(IsNullable, Disjunction) { 324 | EXPECT_ISNULLABLE( 325 | false, 326 | Disjunction(Byte('a'), Byte('b'))); 327 | } 328 | 329 | #define EXPECT_DERIVATIVE(expected, exp) \ 330 | do { \ 331 | EXPECT_EQ(expected, Normalised(Derivative(exp, 'a'))); \ 332 | } while (0) 333 | 334 | TEST(Derivative, EmptySet) { 335 | EXPECT_DERIVATIVE( 336 | EmptySet(), 337 | EmptySet()); 338 | } 339 | 340 | TEST(Derivative, EmptyString) { 341 | 
EXPECT_DERIVATIVE( 342 | EmptySet(), 343 | EmptyString()); 344 | } 345 | 346 | TEST(Derivative, Group) { 347 | // This should never happen. 348 | } 349 | 350 | TEST(Derivative, AnyByte) { 351 | EXPECT_DERIVATIVE( 352 | EmptyString(), 353 | AnyByte()); 354 | } 355 | 356 | TEST(Derivative, Byte) { 357 | EXPECT_DERIVATIVE( 358 | EmptyString(), 359 | Byte('a')); 360 | EXPECT_DERIVATIVE( 361 | EmptySet(), 362 | Byte('b')); 363 | } 364 | 365 | TEST(Derivative, ByteRange) { 366 | EXPECT_DERIVATIVE( 367 | EmptyString(), 368 | ByteRange('a', 'c')); 369 | EXPECT_DERIVATIVE( 370 | EmptySet(), 371 | ByteRange('b', 'd')); 372 | } 373 | 374 | TEST(Derivative, KleeneClosure) { 375 | EXPECT_DERIVATIVE( 376 | KleeneClosure(Byte('a')), 377 | KleeneClosure(Byte('a'))); 378 | } 379 | 380 | TEST(Derivative, Concatenation) { 381 | EXPECT_DERIVATIVE( 382 | Byte('b'), 383 | Concatenation(Byte('a'), Byte('b'))); 384 | EXPECT_DERIVATIVE( 385 | Concatenation(KleeneClosure(Byte('a')), Byte('b')), 386 | Concatenation(KleeneClosure(Byte('a')), Byte('b'))); 387 | } 388 | 389 | TEST(Derivative, Complement) { 390 | EXPECT_DERIVATIVE( 391 | Complement(EmptyString()), 392 | Complement(Byte('a'))); 393 | } 394 | 395 | TEST(Derivative, Conjunction) { 396 | EXPECT_DERIVATIVE( 397 | EmptySet(), 398 | Conjunction(Byte('a'), Byte('b'))); 399 | } 400 | 401 | TEST(Derivative, Disjunction) { 402 | EXPECT_DERIVATIVE( 403 | EmptyString(), 404 | Disjunction(Byte('a'), Byte('b'))); 405 | } 406 | 407 | #define EXPECT_OUTERSET(expected, outer) \ 408 | do { \ 409 | std::list subs; \ 410 | for (const auto& i : *outer) { \ 411 | subs.push_back(i.first); \ 412 | } \ 413 | Exp exp = Disjunction(subs, false); \ 414 | EXPECT_EQ(expected, Normalised(exp)); \ 415 | } while (0) 416 | 417 | TEST(OuterSet, PartialConcatenation) { 418 | Outer outer = PartialConcatenation( 419 | Denormalised( 420 | Disjunction( 421 | Conjunction(Byte('1'), Byte('2')), 422 | Byte('3'))), 423 | Byte('4'), 424 | Bindings({})); 425 | 
EXPECT_OUTERSET( 426 | Disjunction( 427 | Concatenation(Byte('3'), Byte('4')), 428 | Conjunction( 429 | Concatenation(Byte('1'), Byte('4')), 430 | Concatenation(Byte('2'), Byte('4')))), 431 | outer); 432 | } 433 | 434 | TEST(OuterSet, PartialComplement) { 435 | Outer outer = PartialComplement( 436 | Denormalised( 437 | Disjunction( 438 | Conjunction(Byte('1'), Byte('2')), 439 | Byte('3')))); 440 | EXPECT_OUTERSET( 441 | Disjunction( 442 | Conjunction( 443 | Complement(Byte('1')), 444 | Complement(Byte('3'))), 445 | Conjunction( 446 | Complement(Byte('2')), 447 | Complement(Byte('3')))), 448 | outer); 449 | } 450 | 451 | TEST(OuterSet, PartialConjunction) { 452 | Outer outer = PartialConjunction( 453 | Denormalised( 454 | Disjunction(Byte('1'), Byte('2'))), 455 | Denormalised( 456 | Disjunction(Byte('3'), Byte('4')))); 457 | EXPECT_OUTERSET( 458 | Disjunction( 459 | Conjunction(Byte('1'), Byte('3')), 460 | Conjunction(Byte('1'), Byte('4')), 461 | Conjunction(Byte('2'), Byte('3')), 462 | Conjunction(Byte('2'), Byte('4'))), 463 | outer); 464 | } 465 | 466 | TEST(OuterSet, PartialDisjunction) { 467 | Outer outer = PartialDisjunction( 468 | Denormalised( 469 | Disjunction(Byte('1'), Byte('2'))), 470 | Denormalised( 471 | Disjunction(Byte('3'), Byte('4')))); 472 | EXPECT_OUTERSET( 473 | Disjunction(Byte('1'), Byte('2'), Byte('3'), Byte('4')), 474 | outer); 475 | } 476 | 477 | #define EXPECT_PARTIAL(expected, exp) \ 478 | do { \ 479 | Outer outer = Partial(exp, 'a'); \ 480 | EXPECT_OUTERSET(expected, outer); \ 481 | } while (0) 482 | 483 | TEST(Partial, EmptySet) { 484 | EXPECT_PARTIAL( 485 | EmptySet(), 486 | EmptySet()); 487 | } 488 | 489 | TEST(Partial, EmptyString) { 490 | EXPECT_PARTIAL( 491 | EmptySet(), 492 | EmptyString()); 493 | } 494 | 495 | TEST(Partial, Group) { 496 | EXPECT_PARTIAL( 497 | EmptyString(), 498 | Group(0, Byte('a'), kPassive, true)); 499 | } 500 | 501 | TEST(Partial, AnyByte) { 502 | EXPECT_PARTIAL( 503 | EmptyString(), 504 | AnyByte()); 505 | } 
506 | 507 | TEST(Partial, Byte) { 508 | EXPECT_PARTIAL( 509 | EmptyString(), 510 | Byte('a')); 511 | EXPECT_PARTIAL( 512 | EmptySet(), 513 | Byte('b')); 514 | } 515 | 516 | TEST(Partial, ByteRange) { 517 | EXPECT_PARTIAL( 518 | EmptyString(), 519 | ByteRange('a', 'c')); 520 | EXPECT_PARTIAL( 521 | EmptySet(), 522 | ByteRange('b', 'd')); 523 | } 524 | 525 | TEST(Partial, KleeneClosure) { 526 | EXPECT_PARTIAL( 527 | KleeneClosure(Byte('a')), 528 | KleeneClosure(Byte('a'))); 529 | } 530 | 531 | TEST(Partial, Concatenation) { 532 | EXPECT_PARTIAL( 533 | Byte('b'), 534 | Concatenation(Byte('a'), Byte('b'))); 535 | EXPECT_PARTIAL( 536 | Concatenation(KleeneClosure(Byte('a')), Byte('b')), 537 | Concatenation(KleeneClosure(Byte('a')), Byte('b'))); 538 | } 539 | 540 | TEST(Partial, Complement) { 541 | EXPECT_PARTIAL( 542 | Complement(EmptyString()), 543 | Complement(Byte('a'))); 544 | } 545 | 546 | TEST(Partial, Conjunction) { 547 | EXPECT_PARTIAL( 548 | EmptySet(), 549 | Conjunction(Byte('a'), Byte('b'))); 550 | } 551 | 552 | TEST(Partial, Disjunction) { 553 | EXPECT_PARTIAL( 554 | EmptyString(), 555 | Disjunction(Byte('a'), Byte('b'))); 556 | } 557 | 558 | #define EXPECT_PARTITIONS(expected, exp) \ 559 | do { \ 560 | std::list> partitions; \ 561 | Partitions(exp, &partitions); \ 562 | EXPECT_EQ(expected, partitions); \ 563 | } while (0) 564 | 565 | template 566 | inline std::bitset<256> BitSet(Variadic... 
bits) { 567 | std::set s({bits...}); 568 | std::bitset<256> bs; 569 | for (int bit : s) { 570 | bs.set(bit); 571 | } 572 | return bs; 573 | } 574 | 575 | TEST(Partitions, EmptySet) { 576 | EXPECT_PARTITIONS( 577 | std::list>({BitSet()}), 578 | EmptySet()); 579 | } 580 | 581 | TEST(Partitions, EmptyString) { 582 | EXPECT_PARTITIONS( 583 | std::list>({BitSet()}), 584 | EmptyString()); 585 | } 586 | 587 | TEST(Partitions, Group) { 588 | EXPECT_PARTITIONS( 589 | std::list>({BitSet('a'), 590 | BitSet('a')}), 591 | Group(0, Byte('a'), kPassive, true)); 592 | } 593 | 594 | TEST(Partitions, AnyByte) { 595 | EXPECT_PARTITIONS( 596 | std::list>({BitSet()}), 597 | AnyByte()); 598 | } 599 | 600 | TEST(Partitions, Byte) { 601 | EXPECT_PARTITIONS( 602 | std::list>({BitSet('a'), 603 | BitSet('a')}), 604 | Byte('a')); 605 | } 606 | 607 | TEST(Partitions, ByteRange) { 608 | EXPECT_PARTITIONS( 609 | std::list>({BitSet('a', 'b', 'c'), 610 | BitSet('a', 'b', 'c')}), 611 | ByteRange('a', 'c')); 612 | } 613 | 614 | TEST(Partitions, KleeneClosure) { 615 | EXPECT_PARTITIONS( 616 | std::list>({BitSet('a'), 617 | BitSet('a')}), 618 | KleeneClosure(Byte('a'))); 619 | } 620 | 621 | TEST(Partitions, Concatenation) { 622 | EXPECT_PARTITIONS( 623 | std::list>({BitSet('a'), 624 | BitSet('a')}), 625 | Concatenation(Byte('a'), Byte('b'))); 626 | EXPECT_PARTITIONS( 627 | std::list>({BitSet('a', 'b'), 628 | BitSet('b'), 629 | BitSet('a')}), 630 | Concatenation(KleeneClosure(Byte('a')), Byte('b'))); 631 | } 632 | 633 | TEST(Partitions, Complement) { 634 | EXPECT_PARTITIONS( 635 | std::list>({BitSet('a'), 636 | BitSet('a')}), 637 | Complement(Byte('a'))); 638 | } 639 | 640 | TEST(Partitions, Conjunction) { 641 | EXPECT_PARTITIONS( 642 | std::list>({BitSet('a', 'b'), 643 | BitSet('b'), 644 | BitSet('a')}), 645 | Conjunction(Byte('a'), Byte('b'))); 646 | } 647 | 648 | TEST(Partitions, Disjunction) { 649 | EXPECT_PARTITIONS( 650 | std::list>({BitSet('a', 'b'), 651 | BitSet('b'), 652 | BitSet('a')}), 653 | 
Disjunction(Byte('a'), Byte('b'))); 654 | } 655 | 656 | #define EXPECT_PARSE(expected, str) \ 657 | do { \ 658 | Exp exp; \ 659 | ASSERT_TRUE(Parse(str, &exp)); \ 660 | EXPECT_EQ(expected, exp); \ 661 | } while (0) 662 | 663 | TEST(Parse, EscapeSequences) { 664 | EXPECT_PARSE( 665 | AnyByte(), 666 | "\\C"); 667 | EXPECT_PARSE( 668 | Concatenation( 669 | Byte('\f'), 670 | Byte('\n'), 671 | Byte('\r'), 672 | Byte('\t')), 673 | "\\f\\n\\r\\t"); 674 | } 675 | 676 | TEST(Parse, AnyCharacter) { 677 | EXPECT_PARSE( 678 | Disjunction( 679 | ByteRange(0x00, 0x7F), 680 | Concatenation( 681 | ByteRange(0xC2, 0xDF), 682 | ByteRange(0x80, 0xBF)), 683 | Concatenation( 684 | ByteRange(0xE0, 0xEF), 685 | ByteRange(0x80, 0xBF), 686 | ByteRange(0x80, 0xBF)), 687 | Concatenation( 688 | ByteRange(0xF0, 0xF4), 689 | ByteRange(0x80, 0xBF), 690 | ByteRange(0x80, 0xBF), 691 | ByteRange(0x80, 0xBF))), 692 | "."); 693 | } 694 | 695 | TEST(Parse, Character) { 696 | EXPECT_PARSE( 697 | Byte(0x61), 698 | "a"); 699 | EXPECT_PARSE( 700 | Concatenation( 701 | Byte(0xC2), 702 | Byte(0xAC)), 703 | "¬"); 704 | EXPECT_PARSE( 705 | Concatenation( 706 | Byte(0xE5), 707 | Byte(0x85), 708 | Byte(0x94)), 709 | "兔"); 710 | EXPECT_PARSE( 711 | Concatenation( 712 | Byte(0xF0), 713 | Byte(0x9F), 714 | Byte(0x92), 715 | Byte(0xA9)), 716 | "💩"); 717 | } 718 | 719 | TEST(Parse, CharacterClass) { 720 | EXPECT_PARSE( 721 | Disjunction( 722 | Byte(0x61), 723 | Concatenation( 724 | Byte(0xC2), 725 | Byte(0xAC)), 726 | Concatenation( 727 | Byte(0xE5), 728 | Byte(0x85), 729 | Byte(0x94)), 730 | Concatenation( 731 | Byte(0xF0), 732 | Byte(0x9F), 733 | Byte(0x92), 734 | Byte(0xA9))), 735 | "[a¬兔💩]"); 736 | EXPECT_PARSE( 737 | Conjunction( 738 | Complement( 739 | Disjunction( 740 | Byte(0x61), 741 | Concatenation( 742 | Byte(0xC2), 743 | Byte(0xAC)), 744 | Concatenation( 745 | Byte(0xE5), 746 | Byte(0x85), 747 | Byte(0x94)), 748 | Concatenation( 749 | Byte(0xF0), 750 | Byte(0x9F), 751 | Byte(0x92), 752 | Byte(0xA9)))), 
753 | AnyCharacter()), 754 | "[^a¬兔💩]"); 755 | } 756 | 757 | /* With the two-argument Parse used by EXPECT_PARSE, every "?"-suffixed quantifier below is expected to produce the same Exp as its unsuffixed counterpart. */ TEST(Parse, Quantifiers) { 758 | EXPECT_PARSE( 759 | KleeneClosure( 760 | Byte('a')), 761 | "a*"); 762 | EXPECT_PARSE( 763 | KleeneClosure( 764 | Byte('a')), 765 | "a*?"); 766 | EXPECT_PARSE( 767 | Concatenation( 768 | Byte('a'), 769 | KleeneClosure( 770 | Byte('a'))), 771 | "a+"); 772 | EXPECT_PARSE( 773 | Concatenation( 774 | Byte('a'), 775 | KleeneClosure( 776 | Byte('a'))), 777 | "a+?"); 778 | EXPECT_PARSE( 779 | Disjunction( 780 | EmptyString(), 781 | Byte('a')), 782 | "a?"); 783 | EXPECT_PARSE( 784 | Disjunction( 785 | EmptyString(), 786 | Byte('a')), 787 | "a??"); 788 | EXPECT_PARSE( 789 | Byte('a'), 790 | "a{1}"); 791 | EXPECT_PARSE( 792 | Byte('a'), 793 | "a{1}?"); 794 | EXPECT_PARSE( 795 | Concatenation( 796 | Byte('a'), 797 | KleeneClosure( 798 | Byte('a'))), 799 | "a{1,}"); 800 | EXPECT_PARSE( 801 | Concatenation( 802 | Byte('a'), 803 | KleeneClosure( 804 | Byte('a'))), 805 | "a{1,}?"); 806 | EXPECT_PARSE( 807 | Concatenation( 808 | Byte('a'), 809 | Disjunction( 810 | EmptyString(), 811 | Byte('a'))), 812 | "a{1,2}"); 813 | EXPECT_PARSE( 814 | Concatenation( 815 | Byte('a'), 816 | Disjunction( 817 | EmptyString(), 818 | Byte('a'))), 819 | "a{1,2}?"); 820 | } 821 | 822 | /* "*" is expected to apply only to the atom directly before it ("ab*", "a*b", "a*b*c") unless parentheses group more ("(ab)*"). */ TEST(Parse, KleeneClosure) { 823 | EXPECT_PARSE( 824 | Concatenation( 825 | Byte('a'), 826 | KleeneClosure( 827 | Byte('b'))), 828 | "ab*"); 829 | EXPECT_PARSE( 830 | KleeneClosure( 831 | Concatenation( 832 | Byte('a'), 833 | Byte('b'))), 834 | "(ab)*"); 835 | EXPECT_PARSE( 836 | Concatenation( 837 | KleeneClosure( 838 | Byte('a')), 839 | Byte('b')), 840 | "a*b"); 841 | EXPECT_PARSE( 842 | Concatenation( 843 | KleeneClosure( 844 | Byte('a')), 845 | Concatenation( 846 | KleeneClosure( 847 | Byte('b')), 848 | Byte('c'))), 849 | "a*b*c"); 850 | } 851 | 852 | TEST(Parse, Concatenation) { 853 | EXPECT_PARSE( 854 | Concatenation( 855 | Byte('a'), 856 | Byte('b')), 857 | "ab"); 858 | EXPECT_PARSE( 859 | Concatenation( 860 | Byte('a'), 
861 | Concatenation( 862 | Byte('b'), 863 | Byte('c'))), 864 | "abc"); 865 | } 866 | 867 | /* "!ab" and "!(ab)" are expected to produce the same Exp; in "a!b" only b is complemented, and in "a!b!c" b and c are complemented separately. */ TEST(Parse, Complement) { 868 | EXPECT_PARSE( 869 | Complement( 870 | Byte('a')), 871 | "!a"); 872 | EXPECT_PARSE( 873 | Complement( 874 | Concatenation( 875 | Byte('a'), 876 | Byte('b'))), 877 | "!ab"); 878 | EXPECT_PARSE( 879 | Complement( 880 | Concatenation( 881 | Byte('a'), 882 | Byte('b'))), 883 | "!(ab)"); 884 | EXPECT_PARSE( 885 | Concatenation( 886 | Byte('a'), 887 | Complement( 888 | Byte('b'))), 889 | "a!b"); 890 | EXPECT_PARSE( 891 | Concatenation( 892 | Concatenation( 893 | Byte('a'), 894 | Complement( 895 | Byte('b'))), 896 | Complement( 897 | Byte('c'))), 898 | "a!b!c"); 899 | } 900 | 901 | /* Chained "&" (and, in the next test, chained "|") is expected to parse to a single three-operand node rather than nested two-operand nodes. */ TEST(Parse, Conjunction) { 902 | EXPECT_PARSE( 903 | Conjunction( 904 | Byte('a'), 905 | Byte('b')), 906 | "a&b"); 907 | EXPECT_PARSE( 908 | Conjunction( 909 | Byte('a'), 910 | Byte('b'), 911 | Byte('c')), 912 | "a&b&c"); 913 | } 914 | 915 | TEST(Parse, Disjunction) { 916 | EXPECT_PARSE( 917 | Disjunction( 918 | Byte('a'), 919 | Byte('b')), 920 | "a|b"); 921 | EXPECT_PARSE( 922 | Disjunction( 923 | Byte('a'), 924 | Byte('b'), 925 | Byte('c')), 926 | "a|b|c"); 927 | } 928 | 929 | /* Counted repetition: "a{0}" is expected to parse to EmptyString(); "a{1000}" and the stacked counts 2*2*2*5*5*5 (= 1000) are both expected to parse successfully. */ TEST(Parse, CountedRepetition) { 930 | Exp exp1; 931 | EXPECT_TRUE(Parse("a{0}", &exp1)); 932 | EXPECT_EQ(EmptyString(), exp1); 933 | 934 | Exp exp2; 935 | EXPECT_TRUE(Parse("a{1000}", &exp2)); 936 | Exp exp3; 937 | EXPECT_TRUE(Parse("a{2}{2}{2}{5}{5}{5}", &exp3)); 938 | // They are structured differently, so compare their normalised forms. 
939 | EXPECT_EQ(Normalised(exp2), Normalised(exp3)); 940 | 941 | /* Parse is expected to reject a count of 1001, stacked counts whose product is 1001 (7*11*13), and anything larger. */ Exp exp4; 942 | EXPECT_FALSE(Parse("a{1001}", &exp4)); 943 | EXPECT_FALSE(Parse("a{7}{11}{13}", &exp4)); 944 | 945 | Exp exp5; 946 | EXPECT_FALSE(Parse("a{999999999}", &exp5)); 947 | EXPECT_FALSE(Parse("a{10}{10}{10}{10}{10}{10}{10}{10}{10}{10}", &exp5)); 948 | } 949 | 950 | /* EXPECT_PARSE_M_C(expected, expected_modes, expected_captures, str): calls the four-argument Parse, which also fills the local modes and captures vectors. ASSERT_TRUE ends the enclosing test function if parsing fails; otherwise all three outputs are compared with EXPECT_EQ. */ #define EXPECT_PARSE_M_C(expected, expected_modes, expected_captures, str) \ 951 | do { \ 952 | Exp exp; \ 953 | std::vector modes; \ 954 | std::vector captures; \ 955 | ASSERT_TRUE(Parse(str, &exp, &modes, &captures)); \ 956 | EXPECT_EQ(expected, exp); \ 957 | EXPECT_EQ(expected_modes, modes); \ 958 | EXPECT_EQ(expected_captures, captures); \ 959 | } while (0) 960 | 961 | /* "(?:ab)" is expected to give a Group whose last argument is false and an empty captures vector; "(ab)" gives true and captures {0}. In the nested and adjacent cases the Group numbers follow the order of the opening parentheses, and every Group is kPassive. */ TEST(Parse_M_C, Parentheses) { 962 | EXPECT_PARSE_M_C( 963 | Group(0, 964 | Concatenation( 965 | Byte('a'), 966 | Byte('b')), 967 | kPassive, false), 968 | std::vector({kPassive}), 969 | std::vector({}), 970 | "(?:ab)"); 971 | EXPECT_PARSE_M_C( 972 | Group(0, 973 | Concatenation( 974 | Byte('a'), 975 | Byte('b')), 976 | kPassive, true), 977 | std::vector({kPassive}), 978 | std::vector({0}), 979 | "(ab)"); 980 | EXPECT_PARSE_M_C( 981 | Group(0, 982 | Concatenation( 983 | Group(1, 984 | Byte('a'), 985 | kPassive, true), 986 | Byte('b')), 987 | kPassive, true), 988 | std::vector({kPassive, kPassive}), 989 | std::vector({0, 1}), 990 | "((a)b)"); 991 | EXPECT_PARSE_M_C( 992 | Group(0, 993 | Concatenation( 994 | Byte('a'), 995 | Group(1, 996 | Byte('b'), 997 | kPassive, true)), 998 | kPassive, true), 999 | std::vector({kPassive, kPassive}), 1000 | std::vector({0, 1}), 1001 | "(a(b))"); 1002 | EXPECT_PARSE_M_C( 1003 | Concatenation( 1004 | Group(0, 1005 | Byte('a'), 1006 | kPassive, true), 1007 | Group(1, 1008 | Byte('b'), 1009 | kPassive, true)), 1010 | std::vector({kPassive, kPassive}), 1011 | std::vector({0, 1}), 1012 | "(a)(b)"); 1013 | } 1014 | 1015 | TEST(Parse_M_C, Quantifiers) { 1016 | EXPECT_PARSE_M_C( 1017 | Group(0, 1018 | KleeneClosure(Byte('a')), 1019 | 
kMaximal, false), 1020 | std::vector({kMaximal}), 1021 | std::vector({}), 1022 | "a*"); 1023 | /* In every case of this test the quantified expression is expected to be wrapped in Group 0 with a false last argument and an empty captures vector; the unsuffixed quantifiers carry kMaximal and the "?"-suffixed ones kMinimal. */ EXPECT_PARSE_M_C( 1024 | Group(0, 1025 | KleeneClosure(Byte('a')), 1026 | kMinimal, false), 1027 | std::vector({kMinimal}), 1028 | std::vector({}), 1029 | "a*?"); 1030 | EXPECT_PARSE_M_C( 1031 | Group(0, 1032 | Concatenation( 1033 | Byte('a'), 1034 | KleeneClosure(Byte('a'))), 1035 | kMaximal, false), 1036 | std::vector({kMaximal}), 1037 | std::vector({}), 1038 | "a+"); 1039 | EXPECT_PARSE_M_C( 1040 | Group(0, 1041 | Concatenation( 1042 | Byte('a'), 1043 | KleeneClosure(Byte('a'))), 1044 | kMinimal, false), 1045 | std::vector({kMinimal}), 1046 | std::vector({}), 1047 | "a+?"); 1048 | EXPECT_PARSE_M_C( 1049 | Group(0, 1050 | Disjunction( 1051 | EmptyString(), 1052 | Byte('a')), 1053 | kMaximal, false), 1054 | std::vector({kMaximal}), 1055 | std::vector({}), 1056 | "a?"); 1057 | EXPECT_PARSE_M_C( 1058 | Group(0, 1059 | Disjunction( 1060 | EmptyString(), 1061 | Byte('a')), 1062 | kMinimal, false), 1063 | std::vector({kMinimal}), 1064 | std::vector({}), 1065 | "a??"); 1066 | EXPECT_PARSE_M_C( 1067 | Group(0, 1068 | Byte('a'), 1069 | kMaximal, false), 1070 | std::vector({kMaximal}), 1071 | std::vector({}), 1072 | "a{1}"); 1073 | EXPECT_PARSE_M_C( 1074 | Group(0, 1075 | Byte('a'), 1076 | kMinimal, false), 1077 | std::vector({kMinimal}), 1078 | std::vector({}), 1079 | "a{1}?"); 1080 | EXPECT_PARSE_M_C( 1081 | Group(0, 1082 | Concatenation( 1083 | Byte('a'), 1084 | KleeneClosure(Byte('a'))), 1085 | kMaximal, false), 1086 | std::vector({kMaximal}), 1087 | std::vector({}), 1088 | "a{1,}"); 1089 | EXPECT_PARSE_M_C( 1090 | Group(0, 1091 | Concatenation( 1092 | Byte('a'), 1093 | KleeneClosure(Byte('a'))), 1094 | kMinimal, false), 1095 | std::vector({kMinimal}), 1096 | std::vector({}), 1097 | "a{1,}?"); 1098 | EXPECT_PARSE_M_C( 1099 | Group(0, 1100 | Concatenation( 1101 | Byte('a'), 1102 | Disjunction( 1103 | EmptyString(), 1104 | Byte('a'))), 1105 | kMaximal, false), 1106 | 
std::vector({kMaximal}), 1107 | std::vector({}), 1108 | "a{1,2}"); 1109 | EXPECT_PARSE_M_C( 1110 | Group(0, 1111 | Concatenation( 1112 | Byte('a'), 1113 | Disjunction( 1114 | EmptyString(), 1115 | Byte('a'))), 1116 | kMinimal, false), 1117 | std::vector({kMinimal}), 1118 | std::vector({}), 1119 | "a{1,2}?"); 1120 | } 1121 | 1122 | /* ".", "[abc]" and "[^abc]" are expected to produce no Group and empty modes and captures. Each alternative of "aaa|bbb|ccc" is expected to get its own kPassive Group (0, 1, 2), and "!abc" a single kMaximal Group. */ TEST(Parse_M_C, ApplyGroups) { 1123 | EXPECT_PARSE_M_C( 1124 | AnyCharacter(), 1125 | std::vector({}), 1126 | std::vector({}), 1127 | "."); 1128 | EXPECT_PARSE_M_C( 1129 | Disjunction( 1130 | Byte('a'), 1131 | Byte('b'), 1132 | Byte('c')), 1133 | std::vector({}), 1134 | std::vector({}), 1135 | "[abc]"); 1136 | EXPECT_PARSE_M_C( 1137 | Conjunction( 1138 | Complement( 1139 | Disjunction( 1140 | Byte('a'), 1141 | Byte('b'), 1142 | Byte('c'))), 1143 | AnyCharacter()), 1144 | std::vector({}), 1145 | std::vector({}), 1146 | "[^abc]"); 1147 | EXPECT_PARSE_M_C( 1148 | Disjunction( 1149 | Group(0, 1150 | Concatenation(Byte('a'), Byte('a'), Byte('a')), 1151 | kPassive, false), 1152 | Group(1, 1153 | Concatenation(Byte('b'), Byte('b'), Byte('b')), 1154 | kPassive, false), 1155 | Group(2, 1156 | Concatenation(Byte('c'), Byte('c'), Byte('c')), 1157 | kPassive, false)), 1158 | std::vector({kPassive, kPassive, kPassive}), 1159 | std::vector({}), 1160 | "aaa|bbb|ccc"); 1161 | EXPECT_PARSE_M_C( 1162 | Group(0, 1163 | Complement( 1164 | Concatenation( 1165 | Byte('a'), 1166 | Byte('b'), 1167 | Byte('c'))), 1168 | kMaximal, false), 1169 | std::vector({kMaximal}), 1170 | std::vector({}), 1171 | "!abc"); 1172 | } 1173 | 1174 | /* EXPECT_MATCH(expected, expected_values, str): runs str through all four matchers held by the enclosing fixture (exp1_, dfa_, fun1_, tnfa_) and expects each to agree with expected. Only the tnfa_ call receives the values vector, and expected_values is compared only in the branch where a match is expected. */ #define EXPECT_MATCH(expected, expected_values, str) \ 1175 | do { \ 1176 | std::vector values; \ 1177 | if (expected) { \ 1178 | EXPECT_TRUE(Match(exp1_, str)); \ 1179 | EXPECT_TRUE(Match(dfa_, str)); \ 1180 | EXPECT_TRUE(Match(fun1_, str)); \ 1181 | EXPECT_TRUE(Match(tnfa_, str, &values)); \ 1182 | EXPECT_EQ(expected_values, values); \ 1183 | } else { \ 1184 | EXPECT_FALSE(Match(exp1_, str)); \ 1185 | EXPECT_FALSE(Match(dfa_, str)); \ 
1186 | EXPECT_FALSE(Match(fun1_, str)); \ 1187 | EXPECT_FALSE(Match(tnfa_, str, &values)); \ 1188 | } \ 1189 | } while (0) 1190 | 1191 | /* Fixture for the EXPECT_MATCH tests. ParseAll parses the same pattern twice: into exp1_ with the two-argument Parse, and into exp2_ with the four-argument Parse, which also fills tnfa_.modes_ and tnfa_.captures_. CompileAll then builds dfa_ from exp1_, fun1_ from dfa_, and tnfa_ from exp2_; whatever Compile returns is not checked here. */ class MatchTest : public testing::Test { 1192 | protected: 1193 | void ParseAll(llvm::StringRef str) { 1194 | ASSERT_TRUE(Parse(str, &exp1_)); 1195 | ASSERT_TRUE(Parse(str, &exp2_, &tnfa_.modes_, &tnfa_.captures_)); 1196 | } 1197 | 1198 | void CompileAll() { 1199 | Compile(exp1_, &dfa_); 1200 | Compile(dfa_, &fun1_); 1201 | Compile(exp2_, &tnfa_); 1202 | } 1203 | 1204 | Exp exp1_; 1205 | DFA dfa_; 1206 | Fun fun1_; 1207 | 1208 | Exp exp2_; 1209 | TNFA tnfa_; 1210 | }; 1211 | 1212 | /* EmptySet and EmptyString assign the expressions directly instead of calling ParseAll. */ TEST_F(MatchTest, EmptySet) { 1213 | exp1_ = exp2_ = EmptySet(); 1214 | CompileAll(); 1215 | EXPECT_MATCH(false, std::vector({}), ""); 1216 | EXPECT_MATCH(false, std::vector({}), "a"); 1217 | } 1218 | 1219 | TEST_F(MatchTest, EmptyString) { 1220 | exp1_ = exp2_ = EmptyString(); 1221 | CompileAll(); 1222 | EXPECT_MATCH(true, std::vector({}), ""); 1223 | EXPECT_MATCH(false, std::vector({}), "a"); 1224 | } 1225 | 1226 | TEST_F(MatchTest, EscapeSequences_1) { 1227 | ParseAll("(\\C)"); 1228 | CompileAll(); 1229 | EXPECT_MATCH(false, std::vector({}), ""); 1230 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1231 | } 1232 | 1233 | TEST_F(MatchTest, EscapeSequences_2) { 1234 | ParseAll("(\\f\\n\\r\\t)"); 1235 | CompileAll(); 1236 | EXPECT_MATCH(false, std::vector({}), "fnrt"); 1237 | EXPECT_MATCH(true, std::vector({0, 4}), "\f\n\r\t"); 1238 | EXPECT_MATCH(false, std::vector({}), "\\f\\n\\r\\t"); 1239 | } 1240 | 1241 | /* The expected values {0, n} describe the single capture group. NOTE(review): n is presumably a byte offset, since the two-, three- and four-byte literals below give 2, 3 and 4; confirm against Match. */ TEST_F(MatchTest, AnyCharacter) { 1242 | ParseAll("(.)"); 1243 | CompileAll(); 1244 | EXPECT_MATCH(false, std::vector({}), ""); 1245 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1246 | EXPECT_MATCH(true, std::vector({0, 2}), "¬"); 1247 | EXPECT_MATCH(true, std::vector({0, 3}), "兔"); 1248 | EXPECT_MATCH(true, std::vector({0, 4}), "💩"); 1249 | } 1250 | 1251 | TEST_F(MatchTest, Character_1) { 1252 | ParseAll("(a)"); 1253 | CompileAll(); 1254 | 
EXPECT_MATCH(false, std::vector({}), ""); 1255 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1256 | EXPECT_MATCH(false, std::vector({}), "X"); 1257 | } 1258 | 1259 | TEST_F(MatchTest, Character_2) { 1260 | ParseAll("(¬)"); 1261 | CompileAll(); 1262 | EXPECT_MATCH(false, std::vector({}), ""); 1263 | EXPECT_MATCH(true, std::vector({0, 2}), "¬"); 1264 | EXPECT_MATCH(false, std::vector({}), "X"); 1265 | } 1266 | 1267 | TEST_F(MatchTest, Character_3) { 1268 | ParseAll("(兔)"); 1269 | CompileAll(); 1270 | EXPECT_MATCH(false, std::vector({}), ""); 1271 | EXPECT_MATCH(true, std::vector({0, 3}), "兔"); 1272 | EXPECT_MATCH(false, std::vector({}), "X"); 1273 | } 1274 | 1275 | TEST_F(MatchTest, Character_4) { 1276 | ParseAll("(💩)"); 1277 | CompileAll(); 1278 | EXPECT_MATCH(false, std::vector({}), ""); 1279 | EXPECT_MATCH(true, std::vector({0, 4}), "💩"); 1280 | EXPECT_MATCH(false, std::vector({}), "X"); 1281 | } 1282 | 1283 | TEST_F(MatchTest, CharacterClass_1) { 1284 | ParseAll("([a¬兔💩])"); 1285 | CompileAll(); 1286 | EXPECT_MATCH(false, std::vector({}), ""); 1287 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1288 | EXPECT_MATCH(true, std::vector({0, 2}), "¬"); 1289 | EXPECT_MATCH(true, std::vector({0, 3}), "兔"); 1290 | EXPECT_MATCH(true, std::vector({0, 4}), "💩"); 1291 | EXPECT_MATCH(false, std::vector({}), "X"); 1292 | } 1293 | 1294 | /* Negated class: every listed character is expected not to match, while "X" is expected to match. */ TEST_F(MatchTest, CharacterClass_2) { 1295 | ParseAll("([^a¬兔💩])"); 1296 | CompileAll(); 1297 | EXPECT_MATCH(false, std::vector({}), ""); 1298 | EXPECT_MATCH(false, std::vector({}), "a"); 1299 | EXPECT_MATCH(false, std::vector({}), "¬"); 1300 | EXPECT_MATCH(false, std::vector({}), "兔"); 1301 | EXPECT_MATCH(false, std::vector({}), "💩"); 1302 | EXPECT_MATCH(true, std::vector({0, 1}), "X"); 1303 | } 1304 | 1305 | TEST_F(MatchTest, Quantifiers_1) { 1306 | ParseAll("(a*)"); 1307 | CompileAll(); 1308 | EXPECT_MATCH(true, std::vector({0, 0}), ""); 1309 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1310 | EXPECT_MATCH(true, std::vector({0, 2}), 
"aa"); 1311 | EXPECT_MATCH(true, std::vector({0, 3}), "aaa"); 1312 | } 1313 | 1314 | /* With two unsuffixed groups the first is expected to take all of the input and the second to be empty at the end. */ TEST_F(MatchTest, Quantifiers_2) { 1315 | ParseAll("(a*)(a*)"); 1316 | CompileAll(); 1317 | EXPECT_MATCH(true, std::vector({0, 0, 0, 0}), ""); 1318 | EXPECT_MATCH(true, std::vector({0, 1, 1, 1}), "a"); 1319 | EXPECT_MATCH(true, std::vector({0, 2, 2, 2}), "aa"); 1320 | EXPECT_MATCH(true, std::vector({0, 3, 3, 3}), "aaa"); 1321 | } 1322 | 1323 | /* With a "?"-suffixed first group, that group is expected to stay empty (0, 0) and the second group to take all of the input. */ TEST_F(MatchTest, Quantifiers_3) { 1324 | ParseAll("(a*?)(a*)"); 1325 | CompileAll(); 1326 | EXPECT_MATCH(true, std::vector({0, 0, 0, 0}), ""); 1327 | EXPECT_MATCH(true, std::vector({0, 0, 0, 1}), "a"); 1328 | EXPECT_MATCH(true, std::vector({0, 0, 0, 2}), "aa"); 1329 | EXPECT_MATCH(true, std::vector({0, 0, 0, 3}), "aaa"); 1330 | } 1331 | 1332 | TEST_F(MatchTest, Quantifiers_4) { 1333 | ParseAll("(a+)"); 1334 | CompileAll(); 1335 | EXPECT_MATCH(false, std::vector({}), ""); 1336 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1337 | EXPECT_MATCH(true, std::vector({0, 2}), "aa"); 1338 | EXPECT_MATCH(true, std::vector({0, 3}), "aaa"); 1339 | } 1340 | 1341 | TEST_F(MatchTest, Quantifiers_5) { 1342 | ParseAll("(a+)(a+)"); 1343 | CompileAll(); 1344 | EXPECT_MATCH(false, std::vector({}), ""); 1345 | EXPECT_MATCH(false, std::vector({}), "a"); 1346 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1347 | EXPECT_MATCH(true, std::vector({0, 2, 2, 3}), "aaa"); 1348 | } 1349 | 1350 | TEST_F(MatchTest, Quantifiers_6) { 1351 | ParseAll("(a+?)(a+)"); 1352 | CompileAll(); 1353 | EXPECT_MATCH(false, std::vector({}), ""); 1354 | EXPECT_MATCH(false, std::vector({}), "a"); 1355 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1356 | EXPECT_MATCH(true, std::vector({0, 1, 1, 3}), "aaa"); 1357 | } 1358 | 1359 | TEST_F(MatchTest, Quantifiers_7) { 1360 | ParseAll("(a?)"); 1361 | CompileAll(); 1362 | EXPECT_MATCH(true, std::vector({0, 0}), ""); 1363 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1364 | EXPECT_MATCH(false, std::vector({}), "aa"); 1365 | 
EXPECT_MATCH(false, std::vector({}), "aaa"); 1366 | } 1367 | 1368 | /* Two optional groups: a single 'a' is expected to go to the first group, leaving the second empty. */ TEST_F(MatchTest, Quantifiers_8) { 1369 | ParseAll("(a?)(a?)"); 1370 | CompileAll(); 1371 | EXPECT_MATCH(true, std::vector({0, 0, 0, 0}), ""); 1372 | EXPECT_MATCH(true, std::vector({0, 1, 1, 1}), "a"); 1373 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1374 | EXPECT_MATCH(false, std::vector({}), "aaa"); 1375 | } 1376 | 1377 | /* The pattern "(a??)(a?)" is written as two adjacent string literals so that the source never contains the three-character trigraph sequence formed by two question marks and a closing parenthesis. Here a single 'a' is expected to go to the second group. */ TEST_F(MatchTest, Quantifiers_9) { 1378 | ParseAll("(a?""?)(a?)"); // Avoid trigraph. 1379 | CompileAll(); 1380 | EXPECT_MATCH(true, std::vector({0, 0, 0, 0}), ""); 1381 | EXPECT_MATCH(true, std::vector({0, 0, 0, 1}), "a"); 1382 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1383 | EXPECT_MATCH(false, std::vector({}), "aaa"); 1384 | } 1385 | 1386 | TEST_F(MatchTest, Quantifiers_10) { 1387 | ParseAll("(a{1})"); 1388 | CompileAll(); 1389 | EXPECT_MATCH(false, std::vector({}), ""); 1390 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1391 | EXPECT_MATCH(false, std::vector({}), "aa"); 1392 | EXPECT_MATCH(false, std::vector({}), "aaa"); 1393 | } 1394 | 1395 | TEST_F(MatchTest, Quantifiers_11) { 1396 | ParseAll("(a{1})(a{1})"); 1397 | CompileAll(); 1398 | EXPECT_MATCH(false, std::vector({}), ""); 1399 | EXPECT_MATCH(false, std::vector({}), "a"); 1400 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1401 | EXPECT_MATCH(false, std::vector({}), "aaa"); 1402 | } 1403 | 1404 | TEST_F(MatchTest, Quantifiers_12) { 1405 | ParseAll("(a{1}?)(a{1})"); 1406 | CompileAll(); 1407 | EXPECT_MATCH(false, std::vector({}), ""); 1408 | EXPECT_MATCH(false, std::vector({}), "a"); 1409 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1410 | EXPECT_MATCH(false, std::vector({}), "aaa"); 1411 | } 1412 | 1413 | TEST_F(MatchTest, Quantifiers_13) { 1414 | ParseAll("(a{1,})"); 1415 | CompileAll(); 1416 | EXPECT_MATCH(false, std::vector({}), ""); 1417 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1418 | EXPECT_MATCH(true, std::vector({0, 2}), "aa"); 1419 | EXPECT_MATCH(true, 
std::vector({0, 3}), "aaa"); 1420 | } 1421 | 1422 | TEST_F(MatchTest, Quantifiers_14) { 1423 | ParseAll("(a{1,})(a{1,})"); 1424 | CompileAll(); 1425 | EXPECT_MATCH(false, std::vector({}), ""); 1426 | EXPECT_MATCH(false, std::vector({}), "a"); 1427 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1428 | EXPECT_MATCH(true, std::vector({0, 2, 2, 3}), "aaa"); 1429 | } 1430 | 1431 | /* With a "?"-suffixed first group, that group is expected to take exactly one 'a' and the second group the remainder. */ TEST_F(MatchTest, Quantifiers_15) { 1432 | ParseAll("(a{1,}?)(a{1,})"); 1433 | CompileAll(); 1434 | EXPECT_MATCH(false, std::vector({}), ""); 1435 | EXPECT_MATCH(false, std::vector({}), "a"); 1436 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1437 | EXPECT_MATCH(true, std::vector({0, 1, 1, 3}), "aaa"); 1438 | } 1439 | 1440 | TEST_F(MatchTest, Quantifiers_16) { 1441 | ParseAll("(a{1,2})"); 1442 | CompileAll(); 1443 | EXPECT_MATCH(false, std::vector({}), ""); 1444 | EXPECT_MATCH(true, std::vector({0, 1}), "a"); 1445 | EXPECT_MATCH(true, std::vector({0, 2}), "aa"); 1446 | EXPECT_MATCH(false, std::vector({}), "aaa"); 1447 | } 1448 | 1449 | TEST_F(MatchTest, Quantifiers_17) { 1450 | ParseAll("(a{1,2})(a{1,2})"); 1451 | CompileAll(); 1452 | EXPECT_MATCH(false, std::vector({}), ""); 1453 | EXPECT_MATCH(false, std::vector({}), "a"); 1454 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1455 | EXPECT_MATCH(true, std::vector({0, 2, 2, 3}), "aaa"); 1456 | } 1457 | 1458 | TEST_F(MatchTest, Quantifiers_18) { 1459 | ParseAll("(a{1,2}?)(a{1,2})"); 1460 | CompileAll(); 1461 | EXPECT_MATCH(false, std::vector({}), ""); 1462 | EXPECT_MATCH(false, std::vector({}), "a"); 1463 | EXPECT_MATCH(true, std::vector({0, 1, 1, 2}), "aa"); 1464 | EXPECT_MATCH(true, std::vector({0, 1, 1, 3}), "aaa"); 1465 | } 1466 | 1467 | TEST_F(MatchTest, Concatenation) { 1468 | ParseAll("(aa)"); 1469 | CompileAll(); 1470 | EXPECT_MATCH(false, std::vector({}), ""); 1471 | EXPECT_MATCH(false, std::vector({}), "a"); 1472 | EXPECT_MATCH(true, std::vector({0, 2}), "aa"); 1473 | EXPECT_MATCH(false, std::vector({}), 
"aaa"); 1474 | } 1475 | 1476 | /* The complement of "a" is expected to match every input tried here except "a" itself, including the empty string. */ TEST_F(MatchTest, Complement_1) { 1477 | ParseAll("(!a)"); 1478 | CompileAll(); 1479 | EXPECT_MATCH(true, std::vector({0, 0}), ""); 1480 | EXPECT_MATCH(false, std::vector({}), "a"); 1481 | EXPECT_MATCH(true, std::vector({0, 2}), "aa"); 1482 | EXPECT_MATCH(true, std::vector({0, 3}), "aaa"); 1483 | } 1484 | 1485 | /* Same as Complement_1 but with a capture group inside the complement: the inner group's values are expected to be -1, -1 in every matching case. */ TEST_F(MatchTest, Complement_2) { 1486 | ParseAll("(!(a))"); 1487 | CompileAll(); 1488 | EXPECT_MATCH(true, std::vector({0, 0, -1, -1}), ""); 1489 | EXPECT_MATCH(false, std::vector({}), "a"); 1490 | EXPECT_MATCH(true, std::vector({0, 2, -1, -1}), "aa"); 1491 | EXPECT_MATCH(true, std::vector({0, 3, -1, -1}), "aaa"); 1492 | } 1493 | 1494 | /* For a conjunction both groups are expected to report the same span over the whole input. */ TEST_F(MatchTest, Conjunction_1) { 1495 | ParseAll("(a.)&(.b)"); 1496 | CompileAll(); 1497 | EXPECT_MATCH(false, std::vector({}), "aa"); 1498 | EXPECT_MATCH(true, std::vector({0, 2, 0, 2}), "ab"); 1499 | EXPECT_MATCH(false, std::vector({}), "ba"); 1500 | EXPECT_MATCH(false, std::vector({}), "bb"); 1501 | } 1502 | 1503 | TEST_F(MatchTest, Conjunction_2) { 1504 | ParseAll("(a.*)&(.*b)"); 1505 | CompileAll(); 1506 | EXPECT_MATCH(false, std::vector({}), "aa"); 1507 | EXPECT_MATCH(true, std::vector({0, 2, 0, 2}), "ab"); 1508 | EXPECT_MATCH(false, std::vector({}), "ba"); 1509 | EXPECT_MATCH(false, std::vector({}), "bb"); 1510 | EXPECT_MATCH(false, std::vector({}), "aXa"); 1511 | EXPECT_MATCH(true, std::vector({0, 3, 0, 3}), "aXb"); 1512 | EXPECT_MATCH(false, std::vector({}), "bXa"); 1513 | EXPECT_MATCH(false, std::vector({}), "bXb"); 1514 | } 1515 | 1516 | /* For a disjunction only one group is expected to report a span and the other is -1, -1; when both alternatives could match ("ab"), the first group is the one reported. */ TEST_F(MatchTest, Disjunction_1) { 1517 | ParseAll("(a.)|(.b)"); 1518 | CompileAll(); 1519 | EXPECT_MATCH(true, std::vector({0, 2, -1, -1}), "aa"); 1520 | EXPECT_MATCH(true, std::vector({0, 2, -1, -1}), "ab"); 1521 | EXPECT_MATCH(false, std::vector({}), "ba"); 1522 | EXPECT_MATCH(true, std::vector({-1, -1, 0, 2}), "bb"); 1523 | } 1524 | 1525 | TEST_F(MatchTest, Disjunction_2) { 1526 | ParseAll("(a.*)|(.*b)"); 1527 | CompileAll(); 1528 | 
EXPECT_MATCH(true, std::vector({0, 2, -1, -1}), "aa"); 1529 | EXPECT_MATCH(true, std::vector({0, 2, -1, -1}), "ab"); 1530 | EXPECT_MATCH(false, std::vector({}), "ba"); 1531 | EXPECT_MATCH(true, std::vector({-1, -1, 0, 2}), "bb"); 1532 | EXPECT_MATCH(true, std::vector({0, 3, -1, -1}), "aXa"); 1533 | EXPECT_MATCH(true, std::vector({0, 3, -1, -1}), "aXb"); 1534 | EXPECT_MATCH(false, std::vector({}), "bXa"); 1535 | EXPECT_MATCH(true, std::vector({-1, -1, 0, 3}), "bXb"); 1536 | } 1537 | 1538 | /* PerlSemantics_1 and _2 swap the order of the "?"-suffixed and unsuffixed alternatives inside the non-capturing group. In both, the first alternative is the one expected to report a span (the second capture group stays -1, -1), so the order decides whether the trailing (a*) receives all of the input or none of it. */ TEST_F(MatchTest, PerlSemantics_1) { 1539 | ParseAll("(?:(a*?)|(a*))(a*)"); 1540 | CompileAll(); 1541 | EXPECT_MATCH(true, std::vector({0, 0, -1, -1, 0, 0}), ""); 1542 | EXPECT_MATCH(true, std::vector({0, 0, -1, -1, 0, 1}), "a"); 1543 | EXPECT_MATCH(true, std::vector({0, 0, -1, -1, 0, 2}), "aa"); 1544 | EXPECT_MATCH(true, std::vector({0, 0, -1, -1, 0, 3}), "aaa"); 1545 | } 1546 | 1547 | TEST_F(MatchTest, PerlSemantics_2) { 1548 | ParseAll("(?:(a*)|(a*?))(a*)"); 1549 | CompileAll(); 1550 | EXPECT_MATCH(true, std::vector({0, 0, -1, -1, 0, 0}), ""); 1551 | EXPECT_MATCH(true, std::vector({0, 1, -1, -1, 1, 1}), "a"); 1552 | EXPECT_MATCH(true, std::vector({0, 2, -1, -1, 2, 2}), "aa"); 1553 | EXPECT_MATCH(true, std::vector({0, 3, -1, -1, 3, 3}), "aaa"); 1554 | } 1555 | 1556 | // http://swtch.com/~rsc/regexp/regexp2.html#posix 1557 | TEST_F(MatchTest, PerlSemantics_3) { 1558 | ParseAll("(a|bcdef|g|ab|c|d|e|efg|fg)*"); 1559 | CompileAll(); 1560 | EXPECT_MATCH(true, std::vector({6, 7}), "abcdefg"); 1561 | } 1562 | 1563 | } // namespace redgrep 1564 | --------------------------------------------------------------------------------