├── .circleci └── config.yml ├── .clj-kondo └── config.edn ├── .gitattributes ├── .gitignore ├── CHANGES.md ├── LICENSE ├── README.md ├── docs ├── ABNF.md ├── ExperimentalFeatures.md ├── Performance.md └── Tracing.md ├── images └── vizexample1.png ├── project.clj ├── resources └── clj-kondo.exports │ └── instaparse │ └── config.edn ├── runner └── cljs │ └── runner │ └── runner.cljs ├── src └── instaparse │ ├── abnf.cljc │ ├── auto_flatten_seq.cljc │ ├── cfg.cljc │ ├── combinators.cljc │ ├── combinators_source.cljc │ ├── core.cljc │ ├── failure.cljc │ ├── gll.cljc │ ├── line_col.cljc │ ├── macros.clj │ ├── print.cljc │ ├── reduction.cljc │ ├── repeat.cljc │ ├── transform.cljc │ ├── util.cljc │ ├── viz.clj │ └── viz.cljs └── test ├── data ├── abnf_uri.txt ├── defparser_grammar.txt └── phone_uri.txt └── instaparse ├── abnf_test.cljc ├── auto_flatten_seq_test.cljc ├── core_test.cljc ├── defparser_test.cljc ├── failure_test.cljc ├── grammars.cljc ├── namespaced_nts_test.cljc ├── repeat_test.cljc ├── specs.cljc └── viz_test.clj /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | workflows: 4 | version: 2 5 | build: 6 | jobs: 7 | - test-clj 8 | - test-cljs 9 | 10 | jobs: 11 | test-clj: 12 | working_directory: ~/project 13 | docker: 14 | - image: circleci/clojure:lein-2.8.1 15 | steps: 16 | - checkout 17 | - run: lein check 18 | - run: lein test-all 19 | test-cljs: 20 | working_directory: ~/project 21 | docker: 22 | - image: circleci/clojure:lein-2.8.1-node 23 | steps: 24 | - checkout 25 | - run: lein test-cljs-all -------------------------------------------------------------------------------- /.clj-kondo/config.edn: -------------------------------------------------------------------------------- 1 | {:config-paths ["../resources/clj-kondo.exports/instaparse"]} 2 | -------------------------------------------------------------------------------- /.gitattributes: 
-------------------------------------------------------------------------------- 1 | * text auto 2 | *.clj text 3 | *.md text 4 | *.png binary -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /lib 3 | /classes 4 | /checkouts 5 | /bin 6 | /out 7 | deps.edn 8 | .cpcache 9 | .project 10 | .classpath 11 | pom.xml 12 | deps.edn 13 | *.jar 14 | *.class 15 | .lein-deps-sum 16 | .lein-failures 17 | .lein-plugins 18 | ideas.txt 19 | benchmarks.txt 20 | todo.txt 21 | /.settings 22 | .nrepl-port 23 | .lein-repl-history 24 | *~ 25 | *#*# 26 | .cljs_node_repl/ 27 | .idea/ 28 | *.iml 29 | *.asc 30 | .nrepl-history 31 | /.clj-kondo 32 | !/.clj-kondo/config.edn 33 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # Instaparse Change Log 2 | 3 | ## 1.5.0 4 | 5 | ### Enhancements 6 | 7 | * instaparse.core/parser now accepts an optional keyword argument `:allow-namespaced-nts true` which accepts namespaced non-terminals in the parser's grammar, thus building a parser that will tag the output with the corresponding namespaced keywords. 8 | 9 | ## 1.4.14 10 | 11 | ### Enhancements 12 | 13 | * Now leverages clojurescript's implicit sugar for :require-macros, :include-macros, and :refer-macros in namespace declaration. Thanks to sumbach for the pull request! 14 | 15 | ## 1.4.13 16 | 17 | ### Enhancements 18 | 19 | * Added clj-kondo resource file. Thanks to toniz4 for the pull request! 20 | * Added new arity to add-line-and-column-info-to-metadata that supports starting-line and starting-column. Thanks to mainej for the pull request! 21 | 22 | ## 1.4.12 23 | 24 | ### Bugfixes 25 | 26 | * Instaparse error messages weren't pointing the caret at the right character when the text had tab characters. 
Thanks to ema-fox and seltzer1717 for the pull request. 27 | 28 | ## 1.4.11 29 | 30 | ### Bugfixes 31 | 32 | * Fixed problem where `:start` option wasn't being respected when grammar was provided as a file. 33 | 34 | ## 1.4.10 35 | 36 | ### Enhancements 37 | 38 | * Change to remove warning caused by latest version of Clojurescript, which warned about use of private var from tools.reader. 39 | 40 | * Added type hints to support native compilation under Graal. 41 | 42 | * Removed test case broken by Clojure 1.10. 43 | 44 | ## 1.4.9 45 | 46 | ### Enhancements 47 | 48 | * ABNF parsers' string case-insensitivity can now be disabled by setting `:string-ci false`. 49 | 50 | * `ebnf` and `abnf` combinators now support an optional `:string-ci` argument, which overrides the default case-insensitivity behavior for that input format. 51 | 52 | ### Bugfixes 53 | 54 | * Case-insensitive regexp flag on Clojurescript 55 | 56 | * Better handling for when rhizome is present in compilation environment, but not at runtime. 57 | 58 | ## 1.4.8 59 | 60 | ### Updates 61 | 62 | * Update to support Clojurescript 1.9.854 and above, due to a breaking change in Clojurescript to use tools.reader. 63 | 64 | ## 1.4.7 65 | 66 | ### Enhancements 67 | 68 | * `visualize` now supports `:output-file :buffered-image`, which returns a java.awt.image.BufferedImage object. 69 | 70 | ### Bugfixes 71 | 72 | * Fixed problem where `visualize` with `:output-file` didn't work on rootless trees. 73 | 74 | ## 1.4.6 75 | 76 | ### Performance improvements 77 | 78 | * Better performance for ABNF grammars in Clojurescript. 79 | 80 | ## 1.4.5 81 | 82 | ### Bugfixes 83 | 84 | * Fixed regression in 1.4.4 involving parsers based off of URIs. 85 | 86 | * defparser now supports the full range of relevant parser options. 87 | 88 | ## 1.4.4 89 | 90 | ### Enhancements 91 | 92 | * Instaparse is now cross-platform compatible between Clojure and Clojurescript. 
93 | 94 | ### Features 95 | 96 | * defparser - builds parser at compile time 97 | 98 | ## 1.4.3 99 | 100 | ### Bugfixes 101 | 102 | * Fixed bug with insta/transform on tree with hidden root tag and strings at the top level of the tree. 103 | 104 | ## 1.4.2 105 | 106 | ### Bugfixes 107 | 108 | * Fixed problem with counted repetitions in ABNF. 109 | 110 | ## 1.4.1 111 | 112 | ### Features 113 | 114 | * New function `add-line-and-column-info-to-metadata` in the instaparse.core namespace. 115 | 116 | ### Enhancements 117 | 118 | * Added new combinators for unicode character ranges, for better portability to Clojurescript. 119 | 120 | ### Bugfixes 121 | 122 | * Improved compatibility with boot, which allows having multiple versions of Clojure on the classpath, by making change to string-reader which needs to 123 | be aware of what version of Clojure it is running due to a breaking change in Clojure 1.7. 124 | 125 | * Fixed bug with the way failure messages were printed in certain cases. 126 | 127 | ## 1.4.0 128 | 129 | ### Bugfixes 130 | 131 | * In 1.3.6, parsing of any CharSequence was introduced, however, the error messages 132 | for failed parses weren't printing properly. This has been fixed. 133 | 134 | * 1.4.0 uses a more robust algorithm for handling nested negative lookaheads, in 135 | response to a bug report where the existing mechanism produced incorrect parses 136 | (in addition to the correct parse) for a very unusual case. 137 | 138 | ### Enhancements 139 | 140 | * New support for tracing the steps the parser goes through. Call your parser with 141 | the optional flag `:trace true`. The first time you use this flag, it triggers a 142 | recompilation of the code with additional tracing and profiling steps. 143 | To restore the code to its non-instrumented form, call `(insta/disable-tracing!)`. 
144 | 145 | ## 1.3.6 146 | 147 | ### Enhancements 148 | 149 | * Modified for compatibility with Clojure 1.7.0-alpha6 150 | * Instaparse now can parse anything supporting the CharSequence interface, not just strings. 151 | Specifically, this allows instaparse to operate on StringBuilder objects. 152 | 153 | ## 1.3.5 154 | 155 | ### Bugfixes 156 | 157 | * Fixed bug with `transform` on hiccup data structures with numbers or other atomic data as leaves. 158 | 159 | * Fixed bug with character concatenation support in ABNF grammar 160 | 161 | ### Enhancements 162 | 163 | * Added support for Unicode characters to ABNF. 164 | 165 | ## 1.3.4 166 | 167 | ### Enhancements 168 | 169 | * Modified for compatibility with Clojure 1.7.0-alpha2. 170 | 171 | ## 1.3.3 172 | 173 | ### Enhancements 174 | 175 | Made two changes to make it possible to use instaparse on Google App Engine. 176 | 177 | * Removed dependency on javax.swing.text.Segment class. 178 | * Added `:no-slurp true` keyword option to `insta/parser` to disable URI slurping behavior, since GAE does not support slurp. 179 | 180 | ## 1.3.2 181 | 182 | ### Bugfixes 183 | 184 | * Regular expressions on empty strings weren't properly returning a failure. 185 | 186 | ## 1.3.1 187 | 188 | ### Enhancements 189 | 190 | * Updated tests to use Clojure 1.6.0's final release. 191 | * Added `:ci-string true` flag to `insta/parser`. 192 | 193 | ## 1.3.0 194 | 195 | ### Compatibility with Clojure 1.6 196 | 197 | ## 1.2.16 198 | 199 | ### Bugfixes 200 | 201 | * Calling `empty` on a FlattenOnDemandVector now returns []. 202 | 203 | ## 1.2.15 204 | 205 | ### Enhancements 206 | 207 | * :auto-whitespace can now take the keyword :standard or :comma to access one of the predefined whitespace parsers. 208 | 209 | ### Bugfixes 210 | 211 | * Fixed newline problem visualizing parse trees on Linux. 212 | * Fixed problem with visualizing rootless trees. 
213 | 214 | ## 1.2.11 215 | 216 | ### Minor enhancements 217 | 218 | * Further refinements to the way ordered choice interacts with epsilon parsers. 219 | 220 | ## 1.2.10 221 | 222 | ### Bugfixes 223 | 224 | * Fixed bug introduced by 1.2.9 affecting ordered choice. 225 | 226 | ## 1.2.9 227 | 228 | ### Bugfixes 229 | 230 | * Fixed bug where ordered choice was ignoring epsilon parser. 231 | 232 | ## 1.2.8 233 | 234 | ### Bugfixes 235 | 236 | * Fixed bug introduced by 1.2.7, affecting printing of grammars with regexes. 237 | 238 | ### Enhancements 239 | 240 | * Parser printing format now includes <> hidden information and tags. 241 | 242 | ## 1.2.7 243 | 244 | ### Bugfixes 245 | 246 | * Fixed bug when regular expression contains | character. 247 | 248 | ## 1.2.6 249 | 250 | ### Bugfixes 251 | 252 | * Changed pre-condition assertion for auto-whitespace option which was causing a problem with "lein jar". 253 | 254 | ## 1.2.5 255 | 256 | ### Bugfixes 257 | 258 | * Improved handling of unusual characters in ABNF grammars. 259 | 260 | ## 1.2.4 261 | 262 | ### Bugfixes 263 | 264 | * When parsing in :total mode with :enlive as the output format, changed the content of failure node from vector to list to match the rest of the enlive output. 265 | 266 | ## 1.2.3 267 | 268 | ### Bugfixes 269 | 270 | * Fixed problem when epsilon was the only thing in a nonterminal, e.g., "S = epsilon" 271 | 272 | ### Features 273 | 274 | * Added experimental `:auto-whitespace` feature. See the [Experimental Features Document](docs/ExperimentalFeatures.md) for more details. 275 | 276 | ## 1.2.2 277 | 278 | ### Bugfixes 279 | 280 | * Fixed reflection warning. 281 | 282 | ## 1.2.1 283 | 284 | ### Bugfixes 285 | 286 | * I had accidentally left a dependency on tools.trace in the repeat.clj file, used while I was debugging that namespace. Removed it. 
287 | 288 | ## 1.2.0 289 | 290 | ### New Features 291 | 292 | * `span` function returns substring indexes into the parsed text for a portion of the parse tree. 293 | * `visualize` function draws the parse tree, using rhizome and graphviz if installed. 294 | * `:optimize :memory` flag that, for suitable parsers, will perform the parsing in discrete chunks, using less memory. 295 | * New parsing flag to undo the effect of the <> hide notation. 296 | + `(my-parser text :unhide :tags)` - reveals tags, i.e., `<>` applied on the left-hand sides of rules. 297 | + `(my-parser text :unhide :content)` - reveals content hidden on the right-hand side of rules with `<>` 298 | + `(my-parser text :unhide :all)` - reveals both tags and content. 299 | 300 | ### Notable Performance Improvements 301 | 302 | * Dramatic performance improvement (quadratic time reduced to linear) when repetition parsers (+ or *) operate on text whose parse tree contains a large number of repetitions. 303 | * Performance improvement for regular expressions. 304 | 305 | ### Minor Enhancements 306 | 307 | * Added more support to IncrementalVector for a wider variety of vector operations, including subvec, nth, and vec. 308 | 309 | ## 1.1.0 310 | 311 | ### Breaking Changes 312 | 313 | * When you run a parser in "total" mode, the failure node is no longer tagged with `:failure`, but instead is tagged with `:instaparse/failure`. 314 | 315 | ### New Features 316 | 317 | * Comments now supported in CFGs. Use (* and *) notation. 318 | * Added `ebnf` combinator to the `instaparse/combinators` namespace. This new combinator converts string specifications to the combinator-built equivalent. See combinator section of the updated tutorial for details. 319 | * ABNF: can now create a parser from a specification using `:input-format :abnf` for ABNF parser syntax. 320 | * New combinators related to ABNF: 321 | 1. `abnf` -- converts ABNF string fragments to combinators. 322 | 2. `string-ci` -- case-insensitive strings. 
323 | 3. `rep` -- between m and n repetitions. 324 | * New core function related to ABNF: 325 | `set-default-input-format!` -- initially defaults to :ebnf 326 | 327 | ### Minor Enhancements 328 | 329 | * Added comments to regexes used by the parser that processes the context-free grammar syntax, improving the readability of error messages if you have a faulty grammar specification. 330 | 331 | ### Bug Fixes 332 | 333 | * Backslashes in front of quotation mark were escaping the quotation mark, even if the backslash itself was escaped. 334 | * Unescaped double-quote marks weren't properly handled, e.g., (parser "A = '\"'"). 335 | * Nullable Plus: ((parser "S = ('a'?)+") "") previously returned a failure, now returns [:S] 336 | * Fixed problem with failure reporting that would occur if parse failed on an input that ended with a newline character. 337 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. 
Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 
56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. 
REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. 
While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. 
Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of Washington and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. -------------------------------------------------------------------------------- /docs/ABNF.md: -------------------------------------------------------------------------------- 1 | # ABNF Input Format 2 | 3 | ABNF is an alternative input format for instaparse grammar specifications. ABNF does not provide any additional expressive power over instaparse's default EBNF-based syntax, so if you are new to instaparse and parsing, you do not need to read this document -- stick with the syntax described in [the tutorial](https://github.com/Engelberg/instaparse/blob/master/README.md). 4 | 5 | ABNF's main virtue is that it is precisely specified and commonly used in protocol specifications. 
If you use such protocols, instaparse's ABNF input format is a simple way to turn the ABNF specification into an executable parser. However, unless you are working with such specifications, you do not need the ABNF input format. 6 | 7 | ## EBNF vs ABNF 8 | 9 | ### EBNF 10 | 11 | The most common notation for expressing context-free grammars is [Backus-Naur Form](http://en.wikipedia.org/wiki/Backus%E2%80%93Naur_Form), or BNF for short. BNF, however, is a little too simplistic. People wanted more convenient notation for expressing repetitions, so [EBNF](http://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_Form), or *Extended* Backus-Naur Form was developed. 12 | 13 | There is a hodge-podge of various syntax extensions that all fall under the umbrella of EBNF. For example, one standard specifies that repetitions should be specified with `{}`, but regular expression operators such as `+`, `*`, and `?` are far more popular. 14 | 15 | When creating the primary input format for instaparse, I based the syntax off of EBNF. I consulted various standards I found on the internet, and filtered it through my own experience of what I've seen in various textbooks and specs over the years. I included the official repetition operators as well as the ones derived from regular expressions. I also incorporated PEG-like syntax extensions. 16 | 17 | What I ended up with was a slightly tweaked version of EBNF, making it relatively easy to turn any EBNF-specified grammar into an executable parser. However, with multiple competing standards and actively-used variations, there's no guarantee that an EBNF grammar that you find will perfectly align with instaparse's syntax. You may need to make a few tweaks to get it to work. 
18 | 19 | ### ABNF 20 | 21 | From what I can tell, the purpose of [ABNF](http://en.wikipedia.org/wiki/Augmented_Backus%E2%80%93Naur_Form), or *Augmented* Backus-Naur Form, was to create a grammar syntax that would have a single, well-defined, formal standard, so that all ABNF grammars would look exactly the same. 22 | 23 | For this reason, ABNF seems to be a more popular grammar syntax in the world of specifications and protocols. For example, if you want to know the formal definition of what constitutes a valid URI, there's an ABNF grammar for that. 24 | 25 | After instaparse's initial release, I received a couple requests to support ABNF as an alternative input format. Since ABNF is so precisely defined, in theory, any ABNF grammar should work without modification. In practice, I've found that many ABNF specifications have one or two small typos; nevertheless, applying instaparse to ABNF is mostly a trivial copy-paste exercise. 26 | 27 | I included whatever further extensions and extra instaparse goodies I could safely include, but omitted any extension that would conflict with the ABNF standard and jeopardize the ability to use ABNF grammar specifications without modification. 28 | 29 | Aside from just wanting to adhere to the ABNF specification, I can think of a few niceties that ABNF provides over EBNF: 30 | 31 | 1. ABNF has a convenient syntax for specifying bounded repetitions, for example, something like "between 3 and 5 repetitions of the letter a". 32 | 33 | 2. Convenient syntax for expressing characters and ranges of characters. 34 | 35 | 3. ABNF comes with a "standard library" of a dozen or so common token rules. 36 | 37 | ## Usage 38 | 39 | To get a feeling for what ABNF syntax looks like, first check out this [ABNF specification for phone URIs.](https://raw.githubusercontent.com/Engelberg/instaparse/master/test/data/phone_uri.txt) I copied and pasted it directly from the formal spec -- found one typo which I fixed. 
40 | 41 | (def phone-uri-parser 42 | (insta/parser "https://raw.githubusercontent.com/Engelberg/instaparse/master/test/data/phone_uri.txt" 43 | :input-format :abnf)) 44 | 45 | => (phone-uri-parser "tel:+1-201-555-0123") 46 | [:telephone-uri 47 | "tel:" 48 | [:telephone-subscriber 49 | [:global-number 50 | [:global-number-digits 51 | "+" 52 | [:DIGIT "1"] 53 | [:phonedigit [:visual-separator "-"]] 54 | [:phonedigit [:DIGIT "2"]] 55 | [:phonedigit [:DIGIT "0"]] 56 | [:phonedigit [:DIGIT "1"]] 57 | [:phonedigit [:visual-separator "-"]] 58 | [:phonedigit [:DIGIT "5"]] 59 | [:phonedigit [:DIGIT "5"]] 60 | [:phonedigit [:DIGIT "5"]] 61 | [:phonedigit [:visual-separator "-"]] 62 | [:phonedigit [:DIGIT "0"]] 63 | [:phonedigit [:DIGIT "1"]] 64 | [:phonedigit [:DIGIT "2"]] 65 | [:phonedigit [:DIGIT "3"]]]]]] 66 | 67 | The usage, as you can see, is almost identical to the way you build parsers using the `insta/parser` constructor. The only difference is the additional keyword argument `:input-format :abnf`. 68 | 69 | If you find yourself working with a whole series of ABNF parser specifications, you may find it more convenient to call 70 | 71 | (insta/set-default-input-format! :abnf) 72 | 73 | to alter the default input format. Changing the default makes it unnecessary to specify `:input-format :abnf` with each call to the parser constructor. 74 | 75 | Here is the doc string: 76 | 77 | => (doc insta/set-default-input-format!) 78 | ------------------------- 79 | instaparse.core/set-default-input-format! 80 | ([type]) 81 | Changes the default input format. Input should be :abnf or :ebnf 82 | 83 | ## ABNF Syntax Guide 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 |
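<table>
<tr><th>Category</th><th>Notations</th><th>Example</th><th>Notes</th></tr>
<tr><td>Rule</td><td>= =/</td><td>S = A</td><td>=/ is usually used to extend an already-defined rule</td></tr>
<tr><td>Alternation</td><td>/</td><td>A / B</td><td>Despite the use of /, this is unordered choice</td></tr>
<tr><td>Concatenation</td><td>whitespace</td><td>A B</td><td></td></tr>
<tr><td>Grouping</td><td>()</td><td>(A / B) C</td><td></td></tr>
<tr><td>Bounded Repetition</td><td>*</td><td>3*5 A</td><td>In ABNF, repetition precedes the element</td></tr>
<tr><td>Optional</td><td>*1</td><td>*1 A</td><td></td></tr>
<tr><td>One or more</td><td>1*</td><td>1* A</td><td></td></tr>
<tr><td>Zero or more</td><td>*</td><td>*A</td><td></td></tr>
<tr><td>String terminal</td><td>"" ''</td><td>'a' "a"</td><td>Single-quoted strings are an instaparse extension</td></tr>
<tr><td>Regex terminal</td><td>#"" #''</td><td>#'a' #"a"</td><td>Regexes are an instaparse extension</td></tr>
<tr><td>Character terminal</td><td>%d %b %x</td><td>%x30-37</td><td></td></tr>
<tr><td>Comment</td><td>;</td><td>; comment to the end of the line</td><td></td></tr>
<tr><td>Lookahead</td><td>&amp;</td><td>&amp;A</td><td>Lookahead is an instaparse extension</td></tr>
<tr><td>Negative lookahead</td><td>!</td><td>!A</td><td>Negative lookahead is an instaparse extension</td></tr>
</table>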
102 | 103 | Some important things to be aware of: 104 | 105 | + According to the ABNF standard, all strings are *case-insensitive*. 106 | + ABNF strings do not support any kind of escape characters. Use ABNF's character notation to specify unusual characters. 107 | + In ABNF, there is one repetition operator, `*`, and it *precedes* the thing that it is operating on. So, for example, `3*5` means "between 3 and 5 repetitions". The first number defaults to 0 and the second defaults to infinity, so you can omit one or both numbers to get effects comparable to EBNF's `+`, `*`, and `?`. `4*4` could just be written as `4`. 108 | + Use `;` for comments to the end of the line. The ABNF specification has rigid definitions about where comments can be, but in instaparse the rules for comment placement are a bit more flexible and intuitive. 109 | + ABNF uses `/` for the ordinary alternative operator with no order implied. 110 | + ABNF allows the restatement of a rule name to specify multiple alternatives. The custom is to use `=/` in definitions that are adding alternatives, for example `S = 'a' / 'b'` could be written as: 111 | 112 |
113 | 114 | S = 'a' 115 | S =/ 'b' 116 | 117 | ## Extensions 118 | 119 | Instaparse extends ABNF by allowing single-quoted strings and both double-quoted and single-quoted regular expressions. The PEG extensions of lookahead `&` and negative lookahead `!` are permitted, but the PEG extension of ordered choice could not be included because of the syntactic conflict with ABNF's usage of `/` for unordered alternatives. 120 | 121 | Instaparse is somewhat more flexible with whitespace than the ABNF specification dictates, but somewhat less flexible than you might expect from the EBNF input format. For example, in instaparse's EBNF mode, `(A B)C` would be just fine, but ABNF insists on at least one space to indicate concatenation, so you'd have to write `(A B) C`. I relaxed whitespace restrictions when I could do so without radically deviating from the specification. 122 | 123 | ### Angle brackets 124 | 125 | The ABNF input format supports instaparse's angle bracket notation, where angle brackets can be used to hide certain parts of the grammar from the resulting tree structure. Including instaparse's angle bracket notation was a bit of a tough decision because technically angle brackets are reserved for special use in ABNF grammars. 126 | 127 | However, in ABNF notation, angle brackets are meant to be used for prose descriptions of some concept that can't be mechanically specified in the grammar. For example: 128 | 129 | P = 130 | 131 | I realized that such constructs can't be mechanically handled anyway, so I might as well co-opt the angle bracket notation, as I did with the EBNF syntax, for the very handy purpose of hiding. 132 | 133 | This means that when you paste in an ABNF specification, it is always wise to do a quick scan to make sure that no angle brackets were used. 
They are rarely used, but one [notably strange use of angle brackets](http://w3-org.9356.n7.nabble.com/ipath-empty-ABNF-rule-td192464.html) occurs in the URI specification, which uses `0<pchar>` to designate the empty string. So be aware of these sorts of possibilities, but you're unlikely to run into them. 134 | 135 | ## The standard rules 136 | 137 | The ABNF specification states that the following rules are always available for use in ABNF grammars: 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 |
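<table>
<tr><th>Name</th><th>Explanation</th></tr>
<tr><td>ALPHA</td><td>Alphabetic character</td></tr>
<tr><td>BIT</td><td>0 or 1</td></tr>
<tr><td>CHAR</td><td>ASCII character</td></tr>
<tr><td>CR</td><td>\r</td></tr>
<tr><td>CRLF</td><td>\r\n</td></tr>
<tr><td>CTL</td><td>control character</td></tr>
<tr><td>DIGIT</td><td>0-9</td></tr>
<tr><td>DQUOTE</td><td>"</td></tr>
<tr><td>HEXDIG</td><td>Hexadecimal digit: 0-9 or A-F</td></tr>
<tr><td>HTAB</td><td>\t</td></tr>
<tr><td>LF</td><td>\n</td></tr>
<tr><td>LWSP</td><td>A specific mixture of whitespace and CRLF (see note below)</td></tr>
<tr><td>OCTET</td><td>8-bit character</td></tr>
<tr><td>SP</td><td>the space character</td></tr>
<tr><td>VCHAR</td><td>visible character</td></tr>
<tr><td>WSP</td><td>space or tab</td></tr>
</table>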
158 | 159 | LWSP is particularly quirky, defined to be either a space or tab character, or an alternating sequence of carriage-return-linefeed and a single space or tab character. It's very specific, presumably relevant to some particular protocol, but not generally useful and I don't recommend using it. 160 | 161 | ## Combinators 162 | 163 | The `instaparse.combinators` namespace contains a few combinators that are not documented in the main tutorial, but are listed here because they are only relevant to ABNF grammars. 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 |
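<table>
<tr><th>String syntax</th><th>Combinator</th><th>Functionality</th></tr>
<tr><td>"abc" (as used in ABNF)</td><td>(string-ci "abc")</td><td>string, case-insensitive</td></tr>
<tr><td>3*5 (as used in ABNF)</td><td>(rep 3 5 parser)</td><td>repetition</td></tr>
<tr><td>%d97 (as used in ABNF)</td><td>(unicode-char 97)</td><td>unicode code point</td></tr>
<tr><td>%d97-122 (as used in ABNF)</td><td>(unicode-char 97 122)</td><td>unicode range</td></tr>
</table>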
172 | 173 | Finally, just as there exists an `ebnf` function in the combinators namespace that turns EBNF fragments into combinator-built data structures, there exists an `abnf` function which does the same for ABNF fragments. 174 | 175 | This means it is entirely possible to take fragments of EBNF syntax along with fragments of ABNF syntax, and convert all the pieces, merging them into a grammar map along with other pieces built from combinators. I don't expect that many people will need this ability to mix and match, but it's there if you need it. 176 | 177 | ## Case Sensitivity 178 | 179 | I've already mentioned that in ABNF syntax, strings are *case-insensitive*, meaning that the string terminal "abc" in an ABNF grammar also matches "aBc", "AbC", etc. Many ABNF grammar specifications leverage this case insensitivity, for example, the spec for hexadecimal digits includes the strings "A", "B", "C", "D", "E", and "F", and this is intended to match the lowercase letters as well. 180 | 181 | A lesser-known quirk of ABNF syntax is that, in theory, non-terminal rule names are also case-insensitive. So for example, in the ABNF rule `S = 'a' s`, the lowercase `s` is actually referring back to the uppercase `S`. Although the specification of ABNF syntax allows for this possibility, as best as I can determine, this "feature" simply isn't used. It would be confusing and bad form to refer to a non-terminal in different places of your grammar with a different mixture of cases. 182 | 183 | Therefore, by default in instaparse, ABNF non-terminals are, in fact, case-sensitive. This makes it easier for ABNF grammars to play nicely with EBNF grammars, grammar maps, and instaparse's transform function, all of which are case-sensitive. 184 | 185 | If you find yourself working with an ABNF grammar that uses an inconsistent mix of lowercase and uppercase letters to refer to the same non-terminal rules, you have two options available to you.
The first possibility, of course, is to simply go through and fix the inconsistencies. The second option is to bind the dynamic variable `instaparse.abnf/*case-insensitive*` to true while building the parser from the ABNF grammar. 186 | 187 | Under the hood, this works by *converting all non-terminals to uppercase*. This means that in the resulting parse tree, all the rule names will be uppercase, so plan your tree traversals and transformations accordingly. 188 | 189 | As an example, let's revisit the usage example from above: 190 | 191 | (def phone-uri-parser 192 | (binding [instaparse.abnf/*case-insensitive* true] 193 | (insta/parser "https://raw.githubusercontent.com/Engelberg/instaparse/master/test/data/phone_uri.txt" 194 | :input-format :abnf))) 195 | 196 | => (phone-uri-parser "tel:+1-201-555-0123") 197 | [:TELEPHONE-URI 198 | "tel:" 199 | [:TELEPHONE-SUBSCRIBER 200 | [:GLOBAL-NUMBER 201 | [:GLOBAL-NUMBER-DIGITS 202 | "+" 203 | [:DIGIT "1"] 204 | [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]] 205 | [:PHONEDIGIT [:DIGIT "2"]] 206 | [:PHONEDIGIT [:DIGIT "0"]] 207 | [:PHONEDIGIT [:DIGIT "1"]] 208 | [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]] 209 | [:PHONEDIGIT [:DIGIT "5"]] 210 | [:PHONEDIGIT [:DIGIT "5"]] 211 | [:PHONEDIGIT [:DIGIT "5"]] 212 | [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]] 213 | [:PHONEDIGIT [:DIGIT "0"]] 214 | [:PHONEDIGIT [:DIGIT "1"]] 215 | [:PHONEDIGIT [:DIGIT "2"]] 216 | [:PHONEDIGIT [:DIGIT "3"]]]]]] 217 | 218 | The `*case-insensitive*` dynamic variable is also obeyed by the `abnf` combinator. 219 | -------------------------------------------------------------------------------- /docs/ExperimentalFeatures.md: -------------------------------------------------------------------------------- 1 | # Instaparse Experimental Features 2 | 3 | This document provides an explanation of some of the things I'm experimenting with in instaparse. Please try the new features and let me know what you think.
4 | 5 | ## Optimizing memory 6 | 7 | I've added a new, experimental `:optimize :memory` flag that can conserve memory usage for certain classes of grammars. I discussed the motivation for this in the [Performance document](Performance.md). The idea is to make it more practical to use instaparse in situations where you need to parse files containing a large number of independent chunks. 8 | 9 | Usage looks like this: 10 | 11 | (def my-parser (insta/parser my-grammar)) 12 | (my-parser text :optimize :memory) 13 | 14 | 15 | It works for grammars where the top-level production is of the form 16 | 17 | start = chunk+ 18 | 19 | or 20 | 21 | start = header chunk+ 22 | 23 | I don't mean that it literally needs to use the words `start` or `header` or `chunk`. What I mean is that the optimizer looks for top-level productions that finish off with some sort of repeating structure. To be properly optimized, you want to ensure that the `chunk` rule is written with no ambiguity about where a chunk begins and ends. 24 | 25 | Behind the scenes, here's what the optimization algorithm is doing: After successfully parsing a `chunk`, the parser *forgets* all the backtracking information and continues parsing the remaining text totally fresh looking for the next chunk, with no sense of history about what has come before. As long as it keeps finding one chunk after another, it can get through a very large file with far less memory usage than the standard algorithm. 26 | 27 | The downside of this approach is that if the parser hits a spot that doesn't match the repeating chunk rule, there's no way for it to know for sure that this is a fatal failure. It is entirely possible that there is some other interpretation of an earlier chunk that would make the whole input parseable. The standard instaparse approach is to backtrack and look for alternative interpretations before declaring a failure. However, without that backtracking history, there's no way to do that.
28 | 29 | So when you use the `:optimize :memory` flag and your parser hits an error using the "parse one chunk at a time and forget the past" strategy, it *restarts the entire parse process* with the original strategy. 30 | 31 | I'm not entirely sure this was the right design decision, and would welcome feedback on this point. Here are the tradeoffs: 32 | 33 | Advantage of the current approach: With this *fall back to the original strategy if the optimizer doesn't work* approach, it should be totally safe to try the optimizer, even if you don't know for sure up front whether the optimizer will work. With the `:optimize :memory` flag, the output will always be exactly the same as if you hadn't used the flag. (A metadata annotation, however, will let you know whether the parse was successfully completed entirely with the optimization strategy.) I like the safety of this approach, and how it is amenable to the attitude of "Let's try this optimization flag out and see if it helps." 34 | 35 | Disadvantage of the current approach: If you're operating on a block of input text so large that the memory optimization is a *necessity*, then if you have a flaw in your text, you're in trouble -- the parsing restarts with the original strategy and if the flaw is fairly late in your file, you could exhaust your memory. 36 | 37 | An alternative design would be to say that if you've enabled the `:optimize :memory` flag, and it hits an apparent flaw in the input, then it's immediately reported as a failure, without any attempt to try the more sophisticated strategy and see whether backtracking might help the situation. This would be good for people willing to expend the effort to ensure the grammar conforms to the optimizer's constraints and has no ambiguity in the chunk definition. 
It would then be correct to report a failure right away if encountered by the optimization strategy -- no need to fall back to the original strategy because there's no ambiguity and no alternative interpretation. 38 | 39 | However, if the flag behaved in this way, then it is possible that if the grammar weren't well-suited for the optimizer, the `:optimize :memory` flag might return a failure in some instances where the regular strategy would return success. In some sense, this would give the programmer maximum control: the programmer can *choose* to rerun the input without the `:optimize :memory` flag or can accept the failure at face value if confident in the grammar's suitability for the optimization strategy. 40 | 41 | So I'm torn: right now the optimizer falls back to the regular strategy because I like that it is dead simple to use, it's safe to try without a deep understanding of what is going on, and it will always give correct output. But I recognize that having the optimizer simply report the failure gives the programmer greatest control over whether to restart with the regular strategy or not. 42 | 43 | What do you think is the better design choice? 44 | 45 | ## Auto Whitespace 46 | 47 | I have received several requests for instaparse to support the parsing of streams of tokens, rather than just strings. There appear to be two main motivations for this request: 48 | 49 | 1. For some grammars, explicitly specifying all the places where whitespace can go is a pain. 50 | 2. For parsing indentation-sensitive languages, it is useful to have a pre-processing pass that identifies `indent` and `dedent` tokens. 51 | 52 | I'm still thinking about developing a token-processing version of instaparse. But if I can find a way to address the underlying needs while maintaining the "token-free" simplicity of instaparse, that would be even better.
53 | 54 | This new experimental "auto whitespace" feature addresses the first issue, simplifying the specification of grammars where you pretty much want to allow optional whitespace between all your tokens. Here's how to use the new feature: 55 | 56 | First, you want to develop a parser that consumes whitespace. The simplest, most common way to do this would be: 57 | 58 | (def whitespace 59 | (insta/parser 60 | "whitespace = #'\\s+'")) 61 | 62 | Let's test it out: 63 | 64 | => (whitespace " ") 65 | [:whitespace " "] 66 | => (whitespace " \t \n \t ") 67 | [:whitespace " \t \n \t "] 68 | 69 | Important: Your whitespace parser should *not* accept the empty string. 70 | 71 | => (whitespace "") 72 | Parse error at line 1, column 1: 73 | nil 74 | ^ 75 | Expected: 76 | #"^\s+" (followed by end-of-string) 77 | 78 | Good, this is what we want. Now, we can define a parser similar to the `words-and-numbers` parser from the tutorial, but this time we'll use the auto-whitespace feature. 79 | 80 | (def words-and-numbers-auto-whitespace 81 | (insta/parser 82 | "sentence = token+ 83 | = word | number 84 | word = #'[a-zA-Z]+' 85 | number = #'[0-9]+'" 86 | 87 | :auto-whitespace whitespace)) 88 | 89 | Notice the use of the `:auto-whitespace` keyword, and how we call it with the whitespace parser we developed earlier. 90 | 91 | => (words-and-numbers-auto-whitespace " abc 123 45 de ") 92 | [:sentence [:word "abc"] [:number "123"] [:number "45"] [:word "de"]] 93 | 94 | Behind the scenes, here's what's going on: the whitespace parsing rule(s) are merged into the new parser, and an optional version of the starting production for the whitespace rule is liberally inserted before all tokens and at the end. In this case, that means `` is inserted all over the place. You can see the insertion points by viewing the parser: 95 | 96 | => words-and-numbers-auto-whitespace 97 | 98 | sentence = token+ whitespace? 99 | whitespace = #"\s+" 100 | token = word | number 101 | word = whitespace? 
#"[a-zA-Z]+" 102 | number = whitespace? #"[0-9]+" 103 | 104 | You can also see that the whitespace is in fact getting parsed, and is just being hidden: 105 | 106 | => (words-and-numbers-auto-whitespace " abc 123 45 de " :unhide :content) 107 | [:sentence " " [:word "abc"] " " [:number "123"] " " [:number "45"] " " [:word "de"] " "] 108 | 109 | Because the whitespace parser rules are merged into the new parser, don't create any rules in your parser with the same names as those in the whitespace parser. If you do, one of the rules will get clobbered and you'll run into problems. (TODO: Report an error if a user tries to do this) 110 | 111 | Note that it makes no difference whether the `:output-format` of the whitespace parser is :enlive or :hiccup. The rules and the starting production for the whitespace parser are all that matter. 112 | 113 | Because the :auto-whitespace feature allows you to specify your notion of whitespace, you have the total flexibility to define this however you want. For example, let's say I want to allow not only whitespace, but `(* comments *)` between any tokens. Again, we start by developing a corresponding parser: 114 | 115 | (def whitespace-or-comments-v1 116 | (insta/parser 117 | "ws-or-comment = #'\\s+' | comment 118 | comment = '(*' inside-comment* '*)' 119 | inside-comment = !( '*)' | '(*' ) #'.' | comment")) 120 | 121 | Does it eat whitespace? 122 | 123 | => (whitespace-or-comments-v1 " ") 124 | [:ws-or-comment " "] 125 | 126 | Check. Does it handle a comment? 127 | 128 | => (whitespace-or-comments-v1 "(* comment *)") 129 | 130 | 131 | Check. Can it handle nested comments? 132 | 133 | => (whitespace-or-comments-v1 "(* (* comment *) *)") 134 | 135 | 136 | And we mustn't forget -- make sure it *doesn't* parse the empty string: 137 | 138 | => (whitespace-or-comments-v1 "") 139 | 140 | 141 | However, there's a problem here. 
The auto-whitespace feature inserts optional `?` versions of the whitespace parser everywhere, *not* repeating versions. It's up to us to make sure that the whitespace parser consumes the *full extent* of any whitespace that could appear between tokens. In other words, if we want to allow multiple comments in a row, we need to spell that out: 142 | 143 | (def whitespace-or-comments-v2 144 | (insta/parser 145 | "ws-or-comments = #'\\s+' | comments 146 | comments = comment+ 147 | comment = '(*' inside-comment* '*)' 148 | inside-comment = !( '*)' | '(*' ) #'.' | comment")) 149 | 150 | => (whitespace-or-comments-v2 "(* comment1 *)(* (* nested comment *) *)") 151 | 152 | 153 | There's still one more issue, though. Right now, our parser specifies complete empty whitespace, or a series of comments. But if we want to intermingle whitespace and comments, it won't work: 154 | 155 | => (whitespace-or-comments-v2 " (* comment1 *) (* comment2 *) ") 156 | Parse error at line 1, column 1: 157 | (* comment1 *) (* comment2 *) 158 | ^ 159 | Expected one of: 160 | #"^\s+" (followed by end-of-string) 161 | "(*" 162 | 163 | I could go through and manually insert optional whitespace, but wouldn't it be deliciously meta to use the auto-whitespace feature with our previous, simple whitespace parser to define our whitespace-or-comments parser? 164 | 165 | (def whitespace-or-comments 166 | (insta/parser 167 | "ws-or-comments = #'\\s+' | comments 168 | comments = comment+ 169 | comment = '(*' inside-comment* '*)' 170 | inside-comment = !( '*)' | '(*' ) #'.' | comment" 171 | 172 | :auto-whitespace whitespace)) 173 | 174 | Now it works: 175 | 176 | => (whitespace-or-comments " (* comment1 *) (* comment2 *) ") 177 | 178 | 179 | Just out of curiosity, let's see where the `` got inserted: 180 | 181 | => whitespace-or-comments 182 | ws-or-comments = (whitespace? #"\s+" | comments) whitespace? 183 | whitespace = #"\s+" 184 | comments = comment+ 185 | comment = whitespace? 
"(*" inside-comment* whitespace? "*)" 186 | inside-comment = !(whitespace? "*)" | whitespace? "(*") whitespace? #"." | comment 187 | 188 | Note that the auto-insertion process inserted `whitespace?` right before the `"*)"`, but this isn't particularly useful, because all whitespace before `*)` would already be eaten by the `inside-comment` rule. If you were inserting the optional whitespace by hand, you'd probably realize it was unnecessary there. However, when you let the system automatically insert it everywhere, some of the insertions might be gratuitous. But that's okay, having the extra optional whitespace inserted there doesn't really hurt us either. 189 | 190 | Now that we have thoroughly tested our whitespace-or-comments parser, we can use it to enrich our words-and-numbers parser: 191 | 192 | (def words-and-numbers-auto-whitespace-and-comments 193 | (insta/parser 194 | "sentence = token+ 195 | = word | number 196 | word = #'[a-zA-Z]+' 197 | number = #'[0-9]+'" 198 | 199 | :auto-whitespace whitespace-or-comments)) 200 | 201 | => (words-and-numbers-auto-whitespace-and-comments " abc 123 (* 456 *) (* (* 7*) 89 *) def ") 202 | [:sentence [:word "abc"] [:number "123"] [:word "def"]] 203 | 204 | => words-and-numbers-auto-whitespace-and-comments 205 | 206 | sentence = token+ ws-or-comments? 207 | inside-comment = !(whitespace? "*)" | whitespace? "(*") whitespace? #"." | comment 208 | comment = whitespace? "(*" inside-comment* whitespace? "*)" 209 | comments = comment+ 210 | ws-or-comments = (whitespace? #"\s+" | comments) whitespace? 211 | whitespace = #"\s+" 212 | token = word | number 213 | word = ws-or-comments? #"[a-zA-Z]+" 214 | number = ws-or-comments? #"[0-9]+" 215 | 216 | Note that this feature is only useful in grammars where all the strings and regexes are, conceptually, the "tokens" of your language. 
Occasionally, you'll see situations where grammars specify tokens through rules that build up the tokens character-by-character, for example: 217 | 218 | month = ('M'|'m') 'arch' 219 | 220 | If you try to use the auto-whitespace feature with a grammar like this, it will end up allowing space between the "m" and the "arch", which isn't what you want. The key is to try to express such tokens using a single regular expression: 221 | 222 | month = #'[Mm]arch' 223 | 224 | ### Predefined whitespace parsers 225 | 226 | There's no doubt that the following whitespace rule is by far the most common: 227 | 228 | whitespace = #"\s+" 229 | 230 | So for this common case, there's no need to create a separate whitespace parser. You can access this predefined whitespace parser with the option: 231 | 232 | :auto-whitespace :standard 233 | 234 | At this time, one other predefined whitespace parser is available, for Clojure-like parsing tasks where the comma is also treated as whitespace. The rule that will be added to your grammar is: 235 | 236 | whitespace = #"[,\s]+" 237 | 238 | and you can access it with the option: 239 | 240 | :auto-whitespace :comma 241 | 242 | Let me know what you think of the auto-whitespace feature. Is it sufficiently simple and useful to belong in the instaparse library? -------------------------------------------------------------------------------- /docs/Performance.md: -------------------------------------------------------------------------------- 1 | # Instaparse Performance Notes 2 | 3 | In the instaparse tutorial, I make the claim that instaparse is performant without really defining what I mean. I explained that I've spent a lot of time on optimization, without really specifying what I'm trying to optimize.
In this document, I'd like to [elaborate on these points](https://github.com/Engelberg/instaparse/blob/master/docs/Performance.md#specific-performance-goals), and talk a bit about how I view [instaparse's role](https://github.com/Engelberg/instaparse/blob/master/docs/Performance.md#the-role-of-instaparse) in the parser ecosystem. Finally, I'll provide [specific tips on how to get good performance from instaparse parsers](https://github.com/Engelberg/instaparse/blob/master/docs/Performance.md#performance-tips). 4 | 5 | ## A bit of history 6 | 7 | For decades, parsing has been considered a "solved problem" because there are well-known algorithms that can parse a stream of text blazingly fast, in a single linear pass, using minimal memory. The catch is that these algorithms only apply to certain types of context-free grammars -- these classes of easily-parsed grammars go by names like LL(1) and LALR(1), acronyms describing the parsing technique that applies. The good news is that most context-free grammars can, with some effort, be converted into the kind of format required by parsing algorithms. Furthermore, if you are knowledgable about parsing algorithms and are the one constructing the language / data format to be parsed, you can intentionally constrain the syntax to ensure that it can easily be parsed. 8 | 9 | If you can do that, great! If there's already a parser written for the kind of data you're working with, even better! However, the programming world is awash with ad hoc config files and data files that don't use an existing standard like XML or JSON. Sometimes you find yourself needing to work with something that's a little too complex to tease apart just with regular expressions, yet hard to justify the time and energy it would take to study up on LL, LALR, etc. and learn how to parse the data within the constraints of tools using those parsing algorithms. 10 | 11 | ## The role of instaparse 12 | 13 | That's where instaparse comes in. 
Instaparse can handle arbitrary context-free grammars written using standard notation, so it's easy to apply it, even for a quickie one-time parsing task. 14 | 15 | Shortly after the release of instaparse, there were a couple great testimonial blog posts about instaparse. [This blog post by Brandon Harvey](http://walkwithoutrhythm.net/blog/2013/05/16/instaparse-parsing-with-clojure-is-the-new-black/) especially made my day, because it perfectly captured what I had hoped to achieve with instaparse. 16 | 17 | In his blog post, Brandon describes some cave data that he wanted to parse. Ideally, he wanted to figure out how to get "from a big fat unwieldy string to a nice, regular tree-shaped data structure in 20 minutes or less." The cave data is clearly structured and looks kind of like JSON, but it isn't quite JSON. 18 | 19 | First, he tried using another Clojure parsing library (a rather excellent library provided you're working with a grammar that fits its constraints), but couldn't figure out how to express his grammar in a way that worked. He got bogged down with a bunch of shift/reduce conflicts and other errors that he didn't know how to interpret without understanding the underlying machinery. Using instaparse, he expressed the grammar in the way that seemed most natural, and it worked. 20 | 21 | This brings me to a point I'd like to make before discussing performance: 22 | 23 | *Instaparse aims to be more flexible than traditional parser libraries --- more tolerant of grammars that are ambiguous, require backtracking, or use a mixture of left and right recursion.* 24 | 25 | To accomplish this, instaparse uses a fundamentally different algorithm than those found in traditional parser libraries, which achieve their speeds and performance guarantees by restricting lookahead and limiting backtracking. 
26 | 27 | ## Specific performance goals 28 | 29 | With that disclaimer in mind, here are the specifics of what I strive for: 30 | 31 | + For typical, real-world grammars, I want the running time to be linear with respect to the size of the input. In other words, if you double the size of your text, it should take about twice as long to parse. (Of course, I'm using Clojure data structures, so in practice, the running time is more like O(n * log32 n), but that's pretty close to linear.) 32 | + If your grammar is unambiguous and LL(1), the parser should be competitive with parsers generated by tools that *only* accept unambiguous LL(1) grammars (i.e., within some reasonable constant factor). 33 | + If you have a reasonable grammar, even one that isn't expressed in "just the right way", it should still have solid performance. 34 | + Performance should degrade gracefully as you incorporate more ambiguity and heavy backtracking into the grammar. 35 | 36 | Roughly speaking, the goal is for instaparse to be performant in the same sense that Clojure is performant. Clojure is not quite as fast as languages like Java or C++ and consumes considerably more memory, but we use it because it offers greater expressivity and flexibility with enough speed to be useful for a wide range of tasks. 37 | 38 | ## Specific optimizations 39 | 40 | There were a lot of algorithmic coding decisions that I made by benchmarking multiple alternatives and data structures. I won't go into them all here. My aim in this section is to give you a sense for how I go about optimizing and what sorts of things I focus on. 41 | 42 | Here is the gist of my optimization process: I take a grammar, try it on increasingly large inputs, and track the running-time growth. If the growth is quadratic (or worse), I profile and investigate to try to track down the offending code and rework it into linear behavior. My goal is to ensure that as many grammars as possible have linear growth. 
43 | 44 | As I mentioned in the tutorial, one of the first things I noticed in my profiling was how critical hashing was. This is a great example of how an algorithm that seems like it *should* be linear can go awry without careful attention to implementation details. We all know that inserting something into a hash map is essentially constant time, so we take that for granted in our analysis. As long as the algorithm only performs O(n) insertions/lookups in the hash table, it should have linear performance, right? Well, if the thing you are inserting into the hash table takes O(n) time to compute the hash, you're in big trouble! 45 | 46 | So the first big accomplishment of my optimization efforts was to reduce the hashing time to constant for all the information cached by instaparse. Version 1.2 of instaparse sports two new equally significant performance improvements: 47 | 48 | First, I discovered that on long texts with long repeating sequences, linear-time concatenation of the internal partial tree results was a huge bottleneck, leading to overall quadratic behavior. So in 1.2, I converted over to using a custom data structure with O(1) concatenation. RRB-trees would be another data structure that could potentially solve my concatenation problem, so this is something I intend to look at after the Clojure implementation of RRB-trees matures. 49 | 50 | The other major performance improvement in 1.2 compensated for an unfortunate change that Oracle made in Java 1.7 to the String class, changing Strings so that the substring operation is O(n) rather than O(1), copying the substring into a freshly allocated string. Instaparse handles regular expressions by testing the regular expression against a substring of the input text that skips past the part of the text already parsed. This strategy, which creates rather large substrings frequently, needed to be modified in light of Java 1.7's poor substring behavior.
51 | 52 | With these version 1.2 modifications in place, I'm now getting linear-time behavior for all the parsers in my test suite that aren't explicitly designed to demonstrate huge amounts of ambiguity. This is exactly where I want instaparse to be. 53 | 54 | ## Memory 55 | 56 | When talking about performance, the other big discussion point is, of course, memory consumption. As I mentioned in the tutorial, instaparse does use a lot of memory. There's really no way around this; it all comes back to my earlier point that instaparse aims to gracefully handle arbitrary levels of ambiguity and backtracking, which means that the entire text needs to reside in memory and lots of intermediate results need to be cached. 57 | 58 | Instaparse's own syntax for context-free grammars is parsed by an instaparse parser, and is a great example of the practical value of backtracking. 59 | 60 | Consider the following grammar. The actual semantics of the grammar is not important here, just think about the syntax of the grammar specification and consider how instaparse's `parser` function needs to parse the grammar string as a series of rules: 61 | 62 | (insta/parser 63 | "A = B B 64 | B = 'b'") 65 | 66 | You might expect instaparse to impose a requirement that each line of the grammar be clearly terminated by an end-of-line character, such as `;` or a newline, but in fact, instaparse's CFG parser has no problem if you write out the grammar all mushed together on one line: 67 | 68 | (insta/parser "A = B B B = 'b'") 69 | 70 | Working from left-to-right, when it processes the third `B`, it is entirely possible that what it has seen so far should be interpreted as the rule: 71 | 72 | A = B B B 73 | 74 | But when it encounters the `=`, it realizes that the only sensible interpretation is for the third `B` to be the beginning of a new rule, and instaparse sorts it all out. 
75 | 76 | Taken to an extreme, consider the parser defined by the following grammar: 77 | 78 | S = 'ab'+ | 'a' 'ba'+ 79 | 80 | If you use this parser to parse a long string of "abababab...aba", there's no way to determine when looking at the first 'a' which way to interpret it. The parser can try one path, perhaps assuming that it is part of the `'ab'+` rule, but it won't know until it gets to the very end of the string that it has chosen incorrectly, and has to back up and try another path. Looking at this example, it should be clear that there's no way to parse the input string in a single linear pass with bounded memory. 81 | 82 | For this reason, I haven't put as much effort into optimizing memory usage -- a lot of data needs to be retained throughout the parsing process, and there simply is less scope for improvement, I think. Certainly Java 1.7's substring behavior was causing massive memory churn, so the changes I made in instaparse 1.2 will also benefit the memory side of the performance equation. But other than that, I haven't found any big wins for optimizing memory consumption. 83 | 84 | In theory, I can imagine that there might be a way to intelligently figure out which cached data can be safely discarded, but in the context of left-recursion this is an extremely hard problem to solve. Chalk this up as a future research problem, but one that is not likely to bear fruit in the short-term. I have made one step in this direction which I will detail further in the section below about performance tips. 85 | 86 | ## Performance Tips 87 | 88 | Occasionally, I receive a question about whether there's a *best* way to write instaparse grammars for maximum performance. I've tried very hard to make it so that instaparse's performance isn't ultra-sensitive to the exact way you word the grammar. My hope is that most people will find these performance tips to be completely unnecessary. However, for those that are interested, here are some recommendations: 89 | 90 | 1. 
Instaparse's algorithm is in the family of LL parsing algorithms. So if you know how to easily write your grammar as an LL grammar, that's probably going to yield the best possible performance. If not, don't worry about it. 91 | 92 | 2. If your token is a string, use a string literal, not a regular expression. For example, prefer `'apple'` to `#'apple'`. 93 | 94 | 3. When the greedy behavior of regular expressions is what you want, prefer using `*` and `+` *inside* the regular expression rather than outside. This comes up very commonly in processing whitespace. In most applications, once you hit whitespace, you want to eat up all the whitespace to get to the next token. So you'll get better performance with `#'\\s*'` than with `#'\\s'*`. In my parsers, I routinely have a rule for optional whitespace that looks like `ows = #'\\s*'` and then I sprinkle `<ows>` liberally in my other rules wherever I want to potentially allow whitespace. 95 | 96 | 4. Related to the previous point, prefer using regular expressions to define tokens in their entirety rather than using instaparse to build up the tokens by analyzing the string character by character. For example, if an identifier in your language is a letter followed by a series of letters or digits, you'll be better off with the rule 97 | 98 | Identifier = #'[a-zA-Z][a-zA-Z0-9]*' 99 | 100 | rather than 101 | 102 | Identifier = Letter Digit* 103 | Letter = #'[a-zA-Z]' 104 | Digit = #'[a-zA-Z0-9]' 105 | 106 | 5. Remove as much ambiguity from your grammar as you can. Instaparse works with ambiguous grammars, but dealing with that ambiguity can take a toll on performance. Use the `insta/parses` function on a variety of sample inputs in order to troubleshoot your grammar and discover ways in which your inputs might have multiple interpretations. 107 | 108 | 6.
Even if `insta/parses` returns a single answer, think about whether you've created a lot of *internal ambiguity*, i.e., situations where the parser won't be able to work out the interpretation of the text until it has gotten much further along. One way to analyze this is to test the various rules in your grammar using `insta/parses` with the `:partial true` flag to get a feel for how many scenarios it has to consider before it can be sure it has found the whole chunk of text defined by that rule. 109 | 110 | 7. Watch out for ambiguity in your hidden content. One time I was working with a grammar that I was convinced was unambiguous -- `insta/parses` always returned a single answer. However, it turned out that the definition of whitespace was highly ambiguous. I didn't realize it because the whitespace was hidden. To help diagnose these sorts of problems, try running `insta/parses` with the `:unhide :all` flag. 111 | 112 | 8. Prefer Java 1.7. I've received one report where instaparse, running on Java 1.6, was running out of memory on a large input, whereas the exact same grammar on the same input ran perfectly fine on Java 1.7. 113 | 114 | 9. Prefer using * and + over recursion to describe simple repetition. For example, the rule: 115 | 116 |
<A> = 'a'+ 117 | 118 | can be internally optimized in ways that 119 | 120 | <A> = 'a' A | 'a' 121 | 122 | cannot. 123 | 124 | 10. Feed instaparse smaller chunks of text. The reality is that most large parsing tasks involve a series of individual data records that could potentially be parsed independently of one another. As has been discussed earlier in this document, if you feed instaparse the entire block of text, instaparse has to assume the worst -- that it might encounter some sort of failure that causes it to go back and reinterpret all the text it has processed so far. Consider preprocessing the text, chopping it into strings representing the individual data records, and pass the smaller strings into instaparse in order to limit the scope of what possibilities it needs to consider and how much history it needs to track. 125 | 126 | For example, I saw one grammar where each line of text represented a new record, and the grammar looked like: 127 | 128 | document = line+ 129 | line = ... 130 | 131 | Instead of applying this grammar to the entire document at once, why not build a parser where `line` is the top-level starting rule, and then map this parser over a `line-seq` of the text? 132 | 133 | I've added a new, experimental `:optimize :memory` flag that attempts to automate this kind of preprocessing, chopping the text into smaller independent chunks in order to use less memory. This only works on grammars that describe these sorts of repeated data records (possibly with a header at the beginning of the file). If instaparse can't find the pattern or runs into any sort of failure, it will fall back to its usual parsing strategy in order to make sure it has considered all possibilities. Using this flag will likely slow down your parser, but if your data lends itself to this alternative strategy, you'll use much less memory. 134 | 135 | I consider the `:optimize :memory` flag to be an *alpha* feature, subject to change.
If you try it and find it useful, or try it on something where you'd expect it to help and it doesn't, please send me your feedback. 136 | 137 | If you need to annotate your chunks of text with line and column information, recall that `add-line-and-column-info-to-metadata` can take a starting line and column number for its annotation process: 138 | 139 | (insta/add-line-and-column-info-to-metadata text starting-line starting-column parse-tree) 140 | 141 | 11. As of version 1.2, the enlive output format is slightly faster than hiccup. This may change in the future, so I don't recommend that you base your choice of output format on this slight differential. However, if you're trying to eke out the best possible performance, you might find it useful to experiment with both output formats to see whether one performs better for you than the other. 142 | 143 | 12. As of version 1.4, instaparse has a way to print a trace of the parser's execution process, as well as some profiling information which can be useful to determine whether your parser behaves linearly with respect to the size of the input. [Read about the new tracing feature here.](https://github.com/Engelberg/instaparse/blob/master/docs/Tracing.md) 144 | -------------------------------------------------------------------------------- /docs/Tracing.md: -------------------------------------------------------------------------------- 1 | # Tracing 2 | 3 | Instaparse 1.4.0 and up (in Clojure only) features the ability to look at a trace of what the parser is doing. As an example, let's take a look at the as-and-bs parser from the tutorial. 4 | 5 | ``` 6 | => (as-and-bs "aaabb") 7 | [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]] 8 | ``` 9 | 10 | Now let's look at a trace. We do this by calling the parser with the optional keyword argument `:trace true`. `insta/parse` and `insta/parses` both can take this optional argument.
11 | 12 | ``` 13 | => (as-and-bs "aaabb" :trace true) 14 | ``` 15 | 16 | One of my design goals for the tracing feature was that if you don't use it, you shouldn't pay a performance penalty. So by default, the parsing code is not instrumented for tracing. The very first time you call a parser with `:trace true`, you may notice a slight pause as instaparse recompiles itself to support tracing. The trace then prints to standard out, and looks like this: 17 | 18 | ``` 19 | Initiating full parse: S at index 0 (aaabb) 20 | Initiating full parse: AB* at index 0 (aaabb) 21 | Initiating parse: AB at index 0 (aaabb) 22 | Initiating parse: A B at index 0 (aaabb) 23 | Initiating parse: A at index 0 (aaabb) 24 | Initiating parse: "a"+ at index 0 (aaabb) 25 | Initiating parse: "a" at index 0 (aaabb) 26 | Result for "a" at index 0 (aaabb) => "a" 27 | Result for "a"+ at index 0 (aaabb) => ("a") 28 | Result for A at index 0 (aaabb) => [:A "a"] 29 | Initiating parse: B at index 1 (aabb) 30 | Initiating parse: "b"+ at index 1 (aabb) 31 | Initiating parse: "b" at index 1 (aabb) 32 | No result for "b" at index 1 (aabb) 33 | Initiating parse: "a" at index 1 (aabb) 34 | Result for "a" at index 1 (aabb) => "a" 35 | Result for "a"+ at index 0 (aaabb) => ("a" "a") 36 | Result for A at index 0 (aaabb) => [:A "a" "a"] 37 | Initiating parse: B at index 2 (abb) 38 | Initiating parse: "b"+ at index 2 (abb) 39 | Initiating parse: "b" at index 2 (abb) 40 | No result for "b" at index 2 (abb) 41 | Initiating parse: "a" at index 2 (abb) 42 | Result for "a" at index 2 (abb) => "a" 43 | Result for "a"+ at index 0 (aaabb) => ("a" "a" "a") 44 | Result for A at index 0 (aaabb) => [:A "a" "a" "a"] 45 | Initiating parse: B at index 3 (bb) 46 | Initiating parse: "b"+ at index 3 (bb) 47 | Initiating parse: "b" at index 3 (bb) 48 | Result for "b" at index 3 (bb) => "b" 49 | Result for "b"+ at index 3 (bb) => ("b") 50 | Result for B at index 3 (bb) => [:B "b"] 51 | Result for A B at index 0 (aaabb) => ([:A "a"
"a" "a"] [:B "b"]) 52 | Result for AB at index 0 (aaabb) => [:AB [:A "a" "a" "a"] [:B "b"]] 53 | Initiating parse: AB at index 4 (b) 54 | Initiating parse: A B at index 4 (b) 55 | Initiating parse: A at index 4 (b) 56 | Initiating parse: "a"+ at index 4 (b) 57 | Initiating parse: "a" at index 4 (b) 58 | No result for "a" at index 4 (b) 59 | Initiating parse: "b" at index 4 (b) 60 | Result for "b" at index 4 (b) => "b" 61 | Result for "b"+ at index 3 (bb) => ("b" "b") 62 | Result for B at index 3 (bb) => [:B "b" "b"] 63 | Result for A B at index 0 (aaabb) => ([:A "a" "a" "a"] [:B "b" "b"]) 64 | Result for AB at index 0 (aaabb) => [:AB [:A "a" "a" "a"] [:B "b" "b"]] 65 | Result for AB* at index 0 (aaabb) => ([:AB [:A "a" "a" "a"] [:B "b" "b"]]) 66 | Result for S at index 0 (aaabb) => [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]] 67 | Successful parse. 68 | Profile: {:push-message 21, :push-result 21, :push-listener 24, :push-stack 26, :push-full-listener 2, :create-node 26} 69 | [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]] 70 | ``` 71 | 72 | Let me explain what some of these lines mean. 73 | 74 | ``` 75 | Initiating full parse: S at index 0 (aaabb) 76 | ``` 77 | 78 | A "full parse" means that it only succeeds if it consumes the entire string. Usually, we're looking to completely parse an entire string, and that's what "full parse" reflects. 79 | 80 | It is important to understand that the word "initiating" does not necessarily mean that it is starting to work on that parse sub-problem right away. It just means that we're putting it on a stack of sub-problems to try to solve. 81 | 82 | Notice the `(aaabb)` in parens. This is giving us the next several characters from this point in the string, which makes it a little easier to see at a glance where we are in the string (although, of course the index number can always be used to figure it out precisely). 
83 | 84 | ``` 85 | Initiating full parse: AB* at index 0 (aaabb) 86 | Initiating parse: AB at index 0 (aaabb) 87 | ``` 88 | 89 | Note that AB* needs to be a full parse to be satisfied, but that kicks off another subproblem, which is to look for a parse of AB (not necessarily a full parse) at index 0. 90 | 91 | ``` 92 | Initiating parse: A at index 0 (aaabb) 93 | Initiating parse: "a"+ at index 0 (aaabb) 94 | Initiating parse: "a" at index 0 (aaabb) 95 | Result for "a" at index 0 (aaabb) => "a" 96 | Result for "a"+ at index 0 (aaabb) => ("a") 97 | Result for A at index 0 (aaabb) => [:A "a"] 98 | ``` 99 | 100 | Note that after initiating a bunch of parse subtasks, we start to see some results. Again, the content in the parentheses is a look ahead at the next several characters in the string, just to get our bearings. The information after the `=>` is the parse result that was found. Typically, the parse results are found in reverse order from the order in which the subtasks are initiated, because when initiated, the subtasks are put on a stack. 101 | 102 | ``` 103 | No result for "b" at index 1 (aabb) 104 | ``` 105 | 106 | The tracing mechanism reports when tokens (i.e., strings or regular expressions) are sought but not found. In general, the tracing mechanism does not report when subtasks involving non-terminals fail (because internally, instaparse does not transmit failure messages between subtasks). 107 | 108 | ``` 109 | Result for S at index 0 (aaabb) => [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]] 110 | Successful parse. 111 | ``` 112 | 113 | At the end, we see the final parse, followed by some profiling data: 114 | 115 | ``` 116 | Profile: {:push-message 21, :push-result 21, :push-listener 24, :push-stack 26, :push-full-listener 2, :create-node 26} 117 | ``` 118 | 119 | The details of the profiling data don't matter that much, other than to know that it's a measure of how much work instaparse had to do to come up with the result. 
Repeating the trace with an input of `"aaaaaabbbb"` we get the profiling results: 120 | 121 | ``` 122 | Profile: {:push-message 40, :push-result 40, :push-listener 48, :push-stack 50, :push-full-listener 2, :create-node 50} 123 | ``` 124 | 125 | The key here is that we doubled the length of the input string, and this doubled the amount of work that instaparse needed to do. That's good, it means that this parser behaves linearly with respect to its input size. Even though the code is instrumented with tracing functionality, you still need to explicitly request the trace each time. If you don't request the trace, it won't display: 126 | 127 | ``` 128 | => (as-and-bs "aaabb") 129 | [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]] 130 | ``` 131 | 132 | Now let's look at an example with negative lookahead. Here is the parser: 133 | 134 | ``` 135 | => negative-lookahead-example 136 | S = !"ab" ("a" | "b")+ 137 | => (negative-lookahead-example "aabb") 138 | [:S "a" "a" "b" "b"] 139 | ``` 140 | 141 | Let's run it with the trace: 142 | 143 | ``` 144 | => (negative-lookahead-example "aabb" :trace true) 145 | Initiating full parse: S at index 0 (aabb) 146 | Initiating full parse: !"ab" ("a" | "b")+ at index 0 (aabb) 147 | Initiating parse: !"ab" at index 0 (aabb) 148 | Initiating parse: "ab" at index 0 (aabb) 149 | No result for "ab" at index 0 (aabb) 150 | Exhausted results for "ab" at index 0 (aabb) 151 | Negation satisfied: !"ab" at index 0 (aabb) 152 | Initiating full parse: ("a" | "b")+ at index 0 (aabb) 153 | Initiating parse: "a" | "b" at index 0 (aabb) 154 | Initiating parse: "b" at index 0 (aabb) 155 | No result for "b" at index 0 (aabb) 156 | Initiating parse: "a" at index 0 (aabb) 157 | Result for "a" at index 0 (aabb) => "a" 158 | Result for "a" | "b" at index 0 (aabb) => "a" 159 | Initiating parse: "a" | "b" at index 1 (abb) 160 | Initiating parse: "b" at index 1 (abb) 161 | No result for "b" at index 1 (abb) 162 | Initiating parse: "a" at index 1 (abb) 163 | Result for
"a" at index 1 (abb) => "a" 164 | Result for "a" | "b" at index 1 (abb) => "a" 165 | Initiating parse: "a" | "b" at index 2 (bb) 166 | Initiating parse: "b" at index 2 (bb) 167 | Result for "b" at index 2 (bb) => "b" 168 | Result for "a" | "b" at index 2 (bb) => "b" 169 | Initiating parse: "a" | "b" at index 3 (b) 170 | Initiating parse: "b" at index 3 (b) 171 | Result for "b" at index 3 (b) => "b" 172 | Result for "a" | "b" at index 3 (b) => "b" 173 | Result for ("a" | "b")+ at index 0 (aabb) => ("a" "a" "b" "b") 174 | Result for !"ab" ("a" | "b")+ at index 0 (aabb) => ("a" "a" "b" "b") 175 | Result for S at index 0 (aabb) => [:S "a" "a" "b" "b"] 176 | Successful parse. 177 | Profile: {:push-message 12, :push-result 12, :push-listener 14, :push-stack 17, :push-full-listener 3, :create-node 17} 178 | [:S "a" "a" "b" "b"] 179 | ``` 180 | 181 | The interesting thing with negative lookahead (or ordered choice) is the following lines: 182 | 183 | ``` 184 | Initiating parse: !"ab" at index 0 (aabb) 185 | Initiating parse: "ab" at index 0 (aabb) 186 | No result for "ab" at index 0 (aabb) 187 | Exhausted results for "ab" at index 0 (aabb) 188 | Negation satisfied: !"ab" at index 0 (aabb) 189 | ``` 190 | 191 | To do negative lookahead, the parser sets up a subtask to try to parse the very thing we want to avoid. If the parser runs out of work to do, then the trace tells us that the negation was in fact satisfied. 192 | 193 | When you are done tracing, you probably will want to recompile the code without all the tracing and profiling instrumentation. You can either restart the REPL or just type: 194 | 195 | ``` 196 | => (insta/disable-tracing!) 
197 | nil 198 | ``` 199 | -------------------------------------------------------------------------------- /images/vizexample1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Engelberg/instaparse/da886e71a4afa80f8b83d1d67f058b2f02cdc0e3/images/vizexample1.png -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject instaparse "1.5.0" 2 | :description "Instaparse: No grammar left behind" 3 | :url "https://github.com/Engelberg/instaparse" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[org.clojure/clojure "1.11.1"]] 7 | :resource-paths ["resources"] 8 | :profiles {:dev {:dependencies 9 | [[org.clojure/clojurescript "1.11.132"] 10 | [org.clojure/tools.trace "0.7.11"] 11 | [criterium "0.4.6"] 12 | [rhizome "0.2.9"]]} 13 | :1.5 {:dependencies [[org.clojure/clojure "1.5.1"]]} 14 | :1.6 {:dependencies [[org.clojure/clojure "1.6.0"]]} 15 | :1.7 {:dependencies [[org.clojure/clojure "1.7.0"] 16 | [org.clojure/clojurescript "1.7.28"]]} 17 | :1.8 {:dependencies [[org.clojure/clojure "1.8.0"] 18 | [org.clojure/clojurescript "1.8.34"]]} 19 | :1.9 {:dependencies [[org.clojure/clojure "1.9.0"] 20 | [org.clojure/clojurescript "1.10.238"] 21 | [org.clojure/tools.reader "1.2.1"]]} 22 | :1.10 {:dependencies [[org.clojure/clojure "1.10.0"] 23 | [org.clojure/clojurescript "1.10.439"] 24 | [org.clojure/tools.reader "1.3.2"]]} 25 | :1.11 {:dependencies [[org.clojure/clojure "1.11.1"] 26 | [org.clojure/clojurescript "1.11.132"] 27 | [org.clojure/tools.reader "1.3.6"]]}} 28 | :aliases {"test-all" ["with-profile" "+1.5:+1.6:+1.7:+1.8:+1.9:+1.10:+1.11" "test"] 29 | "test-cljs" ["cljsbuild" "test" "unit-tests"] 30 | "test-cljs-all" ["with-profile" "+1.9:+1.10:+1.11" "do" "clean," "test-cljs"]} 31 | :test-paths ["test/" 
"target/generated/test/clj"] 32 | :source-paths ["src/" "target/generated/src/clj"] 33 | :cljsee {:builds [{:source-paths ["src/"] 34 | :output-path "target/generated/src/clj" 35 | :rules :clj} 36 | {:source-paths ["test/"] 37 | :output-path "target/generated/test/clj" 38 | :rules :clj}]} 39 | :plugins [[lein-cljsbuild "1.1.8"] 40 | [cljsee "0.1.0"]] 41 | ;:hooks [leiningen.cljsbuild] 42 | :target-path "target" 43 | :scm {:name "git" 44 | :url "https://github.com/Engelberg/instaparse"} 45 | :prep-tasks [["cljsee" "once"]] 46 | :cljsbuild {:builds [{:id "none" 47 | :source-paths ["src/"] 48 | :compiler {:output-to "target/js/none.js" 49 | :optimizations :none 50 | :pretty-print true}} 51 | {:id "test" 52 | :source-paths ["src/" 53 | "test/" 54 | "runner/cljs"] 55 | :compiler {:output-to "target/js/advanced-test.js" 56 | :optimizations :advanced 57 | :target :nodejs 58 | :pretty-print false}}] 59 | :test-commands {"unit-tests" ["node" "target/js/advanced-test.js"]}}) 60 | -------------------------------------------------------------------------------- /resources/clj-kondo.exports/instaparse/config.edn: -------------------------------------------------------------------------------- 1 | {:lint-as {instaparse.macros/defclone clojure.core/def 2 | instaparse.macros/set-global-var! clojure.core/set!}} 3 | -------------------------------------------------------------------------------- /runner/cljs/runner/runner.cljs: -------------------------------------------------------------------------------- 1 | (ns instaparse.runner.runner 2 | (:require [cljs.nodejs :as nodejs] 3 | [instaparse.abnf-test] 4 | [instaparse.auto-flatten-seq-test] 5 | [instaparse.core-test] 6 | [instaparse.defparser-test] 7 | [instaparse.failure-test] 8 | [instaparse.grammars] 9 | [instaparse.repeat-test] 10 | [instaparse.specs] 11 | [cljs.test :as test :refer-macros [run-tests]])) 12 | 13 | (nodejs/enable-util-print!) 
14 | 15 | (defmethod cljs.test/report [:cljs.test/default :end-run-tests] [m] 16 | (if (test/successful? m) 17 | (println "Tests succeeded!") 18 | (do 19 | (println "Tests failed.") 20 | ((aget js/process "exit") 1)))) 21 | 22 | (defn -main [] 23 | (run-tests 'instaparse.abnf-test 24 | 'instaparse.auto-flatten-seq-test 25 | 'instaparse.core-test 26 | 'instaparse.defparser-test 27 | 'instaparse.failure-test 28 | 'instaparse.grammars 29 | 'instaparse.repeat-test 30 | 'instaparse.specs)) 31 | 32 | (set! *main-cli-fn* -main) 33 | -------------------------------------------------------------------------------- /src/instaparse/abnf.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.abnf 2 | "This is the context free grammar that recognizes ABNF notation." 3 | (:refer-clojure :exclude [cat]) 4 | (:require [instaparse.transform :as t] 5 | [instaparse.cfg :as cfg] 6 | [instaparse.gll :as gll] 7 | [instaparse.reduction :as red] 8 | [instaparse.util :refer [throw-runtime-exception]] 9 | [instaparse.combinators-source :refer 10 | [Epsilon opt plus star rep alt ord cat string-ci string 11 | string-ci regexp nt look neg hide hide-tag unicode-char]] 12 | #?(:cljs [goog.string.format]) 13 | [clojure.walk :as walk]) 14 | #?(:cljs (:require-macros [instaparse.abnf :refer [precompile-cljs-grammar]]))) 15 | 16 | (def ^:dynamic *case-insensitive* 17 | "This is normally set to false, in which case the non-terminals 18 | are treated as case-sensitive, which is NOT the norm 19 | for ABNF grammars. If you really want case-insensitivity, 20 | bind this to true, in which case all non-terminals 21 | will be converted to upper-case internally (which 22 | you'll have to keep in mind when transforming)." 
23 | false) 24 | 25 | (def abnf-core 26 | {:ALPHA (regexp "[a-zA-Z]") 27 | :BIT (regexp "[01]") 28 | :CHAR (regexp "[\\u0001-\\u007F]") 29 | :CR (string "\u000D") 30 | :CRLF (string "\u000D\u000A") 31 | :CTL (regexp "[\\u0000-\\u001F\\u007F]") ;%x00-1F / %x7F (no '|': it is a literal inside a character class) 32 | :DIGIT (regexp "[0-9]") 33 | :DQUOTE (string "\u0022") 34 | :HEXDIG (regexp "[0-9a-fA-F]") 35 | :HTAB (string "\u0009") 36 | :LF (string "\u000A") 37 | :LWSP (star ;*(WSP / CRLF WSP) per RFC 5234 38 | (alt (alt (string "\u0020") (string "\u0009")) ;WSP 39 | (cat (string "\u000D\u000A") ;CRLF 40 | (alt (string "\u0020") (string "\u0009"))))) ;WSP 41 | :OCTET (regexp "[\\u0000-\\u00FF]") 42 | :SP (string "\u0020") 43 | :VCHAR (regexp "[\\u0021-\\u007E]") 44 | :WSP (alt (string "\u0020") ;SP 45 | (string "\u0009"))}) ;HTAB 46 | 47 | (def abnf-grammar-common 48 | " 49 | = (rule | hide-tag-rule)+; 50 | rule = rulename-left alternation ; 51 | hide-tag-rule = hide-tag alternation ; 52 | rulename-left = rulename; 53 | rulename-right = rulename; 54 | = <'<' opt-whitespace> rulename-left '>; 55 | defined-as = ('=' | '=/') ; 56 | alternation = concatenation ( concatenation)*; 57 | concatenation = repetition ( repetition)*; 58 | repetition = [repeat] element; 59 | repeat = NUM | (NUM? '*' NUM?); 60 | = rulename-right | group | hide | option | char-val | num-val 61 | | look | neg | regexp; 62 | look = <'&' opt-whitespace> element; 63 | neg = <'!'
opt-whitespace> element; 64 | = <'(' opt-whitespace> alternation ; 65 | option = <'[' opt-whitespace> alternation ; 66 | hide = <'<' opt-whitespace> alternation '>; 67 | char-val = <'\\u0022'> #'[\\u0020-\\u0021\\u0023-\\u007E]'* <'\\u0022'> (* double-quoted strings *) 68 | | <'\\u0027'> #'[\\u0020-\\u0026\u0028-\u007E]'* <'\\u0027'>; (* single-quoted strings *) 69 | = <'%'> (bin-val | dec-val | hex-val); 70 | bin-val = <'b'> bin-char 71 | [ (<'.'> bin-char)+ | ('-' bin-char) ]; 72 | bin-char = ('0' | '1')+; 73 | dec-val = <'d'> dec-char 74 | [ (<'.'> dec-char)+ | ('-' dec-char) ]; 75 | dec-char = DIGIT+; 76 | hex-val = <'x'> hex-char 77 | [ (<'.'> hex-char)+ | ('-' hex-char) ]; 78 | hex-char = HEXDIG+; 79 | NUM = DIGIT+; 80 | = #'[0-9]'; 81 | = #'[0-9a-fA-F]'; 82 | 83 | 84 | (* extra entrypoint to be used by the abnf combinator *) 85 | = rulelist | alternation; 86 | ") 87 | 88 | (def abnf-grammar-clj-only 89 | " 90 | = #'[a-zA-Z][-a-zA-Z0-9]*(?x) #identifier'; 91 | opt-whitespace = #'\\s*(?:;.*?(?:\\u000D?\\u000A\\s*|$))*(?x) # optional whitespace or comments'; 92 | whitespace = #'\\s+(?:;.*?\\u000D?\\u000A\\s*)*(?x) # whitespace or comments'; 93 | regexp = #\"#'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'(?x) #Single-quoted regexp\" 94 | | #\"#\\\"[^\\\"\\\\]*(?:\\\\.[^\\\"\\\\]*)*\\\"(?x) #Double-quoted regexp\" 95 | ") 96 | 97 | (def abnf-grammar-cljs-only 98 | " 99 | = #'[a-zA-Z][-a-zA-Z0-9]*'; 100 | opt-whitespace = #'\\s*(?:;.*?(?:\\u000D?\\u000A\\s*|$))*'; 101 | whitespace = #'\\s+(?:;.*?\\u000D?\\u000A\\s*)*'; 102 | regexp = #\"#'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'\" 103 | | #\"#\\\"[^\\\"\\\\]*(?:\\\\.[^\\\"\\\\]*)*\\\"\" 104 | ") 105 | 106 | #?(:clj 107 | (defmacro precompile-cljs-grammar 108 | [] 109 | (let [combinators (red/apply-standard-reductions 110 | :hiccup (cfg/ebnf (str abnf-grammar-common 111 | abnf-grammar-cljs-only)))] 112 | (walk/postwalk 113 | (fn [form] 114 | (cond 115 | ;; Lists cannot be evaluated verbatim 116 | (seq? 
form) 117 | (list* 'list form) 118 | 119 | ;; Regexp terminals are handled differently in cljs 120 | (= :regexp (:tag form)) 121 | `(merge (regexp ~(str (:regexp form))) 122 | ~(dissoc form :tag :regexp)) 123 | 124 | :else form)) 125 | combinators)))) 126 | 127 | #?(:clj 128 | (def abnf-parser (red/apply-standard-reductions 129 | :hiccup (cfg/ebnf (str abnf-grammar-common 130 | abnf-grammar-clj-only)))) 131 | :cljs 132 | (def abnf-parser (precompile-cljs-grammar))) 133 | 134 | (defn get-char-combinator 135 | [& nums] 136 | (cond 137 | (= "-" (second nums)) (let [[lo _ hi] nums] 138 | (unicode-char lo hi)) 139 | :else (apply cat (for [n nums] 140 | (unicode-char n))))) 141 | 142 | (defn project 143 | "Restricts map to certain keys" 144 | [m ks] 145 | (into {} 146 | (for [k ks 147 | :when (contains? m k)] 148 | [k (m k)]))) 149 | 150 | (defn merge-core 151 | "Merges abnf-core map in with parsed grammar map" 152 | [grammar-map] 153 | (merge 154 | (project abnf-core (distinct (mapcat cfg/seq-nt (vals grammar-map)))) 155 | grammar-map)) 156 | 157 | (defn hide-tag? 158 | "Tests whether parser was constructed with hide-tag" 159 | [p] 160 | (= (:red p) red/raw-non-terminal-reduction)) 161 | 162 | (defn alt-preserving-hide-tag [p1 p2] 163 | (let [hide-tag-p1? (hide-tag? p1) 164 | hide-tag-p2? (hide-tag? p2)] 165 | (cond 166 | (and hide-tag-p1? hide-tag-p2?) 167 | (hide-tag (alt (dissoc p1 :red) (dissoc p2 :red))) 168 | hide-tag-p1? 169 | (hide-tag (alt (dissoc p1 :red) p2)) 170 | hide-tag-p2? 
171 | (hide-tag (alt p1 (dissoc p2 :red))) 172 | :else 173 | (alt p1 p2)))) 174 | 175 | #?(:clj 176 | (defn parse-int 177 | ([string] (Integer/parseInt string)) 178 | ([string radix] (Integer/parseInt string radix))) 179 | :cljs 180 | (def parse-int js/parseInt)) 181 | 182 | (def abnf-transformer 183 | { 184 | :rule hash-map 185 | :hide-tag-rule (fn [tag rule] {tag (hide-tag rule)}) 186 | :rulename-left #(if *case-insensitive* 187 | (keyword (clojure.string/upper-case (apply str %&))) 188 | (keyword (apply str %&))) 189 | :rulename-right #(if *case-insensitive* 190 | (nt (keyword (clojure.string/upper-case (apply str %&)))) 191 | (nt (keyword (apply str %&)))) 192 | ; since rulenames are case insensitive, convert it to upper case internally to be consistent 193 | :alternation alt 194 | :concatenation cat 195 | :repeat (fn [& items] 196 | (case (count items) 197 | 1 (cond 198 | (= (first items) "*") {} ; * 199 | :else {:low (first items), :high (first items)}) ; x 200 | 2 (cond 201 | (= (first items) "*") {:high (second items)} ; *x 202 | :else {:low (first items)}) ; x* 203 | 3 {:low (first items), :high (nth items 2)})) ; x*y 204 | 205 | :repetition (fn 206 | ([repeat element] 207 | (cond 208 | (empty? 
repeat) (star element) 209 | (= (count repeat) 2) (rep (:low repeat) (:high repeat) element) 210 | (= (:low repeat) 1) (plus element) 211 | (= (:high repeat) 1) (opt element) 212 | :else (rep (or (:low repeat) 0) 213 | (or (:high repeat) #?(:clj Double/POSITIVE_INFINITY 214 | :cljs js/Infinity)) 215 | element))) 216 | ([element] 217 | element)) 218 | :option opt 219 | :hide hide 220 | :look look 221 | :neg neg 222 | :regexp (comp regexp cfg/process-regexp) 223 | :char-val (fn [& cs] 224 | (cfg/string+ (apply str cs) true)) 225 | :bin-char (fn [& cs] 226 | (parse-int (apply str cs) 2)) 227 | :dec-char (fn [& cs] 228 | (parse-int (apply str cs))) 229 | :hex-char (fn [& cs] 230 | (parse-int (apply str cs) 16)) 231 | :bin-val get-char-combinator 232 | :dec-val get-char-combinator 233 | :hex-val get-char-combinator 234 | :NUM #(parse-int (apply str %&))}) 235 | 236 | (defn rules->grammar-map 237 | [rules] 238 | (merge-core (apply merge-with alt-preserving-hide-tag rules))) 239 | 240 | (defn abnf 241 | "Takes an ABNF grammar specification string and returns the combinator version. 242 | If you give it the right-hand side of a rule, it will return the combinator equivalent. 243 | If you give it a series of rules, it will give you back a grammar map. 244 | Useful for combining with other combinators." 245 | [spec & {:as opts}] 246 | (binding [cfg/*case-insensitive-literals* (:string-ci opts :default)] 247 | (let [tree (gll/parse abnf-parser :rules-or-parser spec false)] 248 | (cond 249 | (instance? instaparse.gll.Failure tree) 250 | (throw-runtime-exception 251 | "Error parsing grammar specification:\n" 252 | (with-out-str (println tree))) 253 | (= :alternation (ffirst tree)) 254 | (t/transform abnf-transformer (first tree)) 255 | 256 | :else (rules->grammar-map (t/transform abnf-transformer tree)))))) 257 | 258 | (defn build-parser [spec output-format] 259 | (let [rule-tree (gll/parse abnf-parser :rulelist spec false)] 260 | (if (instance? 
instaparse.gll.Failure rule-tree) 261 | (throw-runtime-exception 262 | "Error parsing grammar specification:\n" 263 | (with-out-str (println rule-tree))) 264 | (let [rules (t/transform abnf-transformer rule-tree) 265 | grammar-map (rules->grammar-map rules) 266 | start-production (first (first (first rules)))] 267 | {:grammar (cfg/check-grammar (red/apply-standard-reductions output-format grammar-map)) 268 | :start-production start-production 269 | :output-format output-format})))) 270 | 271 | -------------------------------------------------------------------------------- /src/instaparse/auto_flatten_seq.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.auto-flatten-seq 2 | #?(:clj (:import clojure.lang.PersistentVector)) 3 | #?(:clj (:require [clojure.core.protocols :refer [IKVReduce]]))) 4 | 5 | (def ^:const threshold 32) 6 | 7 | (defprotocol ConjFlat 8 | (conj-flat [self obj]) 9 | (cached? [self])) 10 | 11 | ; Need a backwards compatible version of mix-collection-hash 12 | #?(:clj (defmacro compile-if [test then else] 13 | (if (eval test) 14 | then 15 | else))) 16 | 17 | #?(:clj (defmacro mix-collection-hash-bc [x y] 18 | ;; backwards-compatible 19 | `(compile-if (resolve 'clojure.core/mix-collection-hash) 20 | (mix-collection-hash ~x ~y) 21 | ~x))) 22 | 23 | (declare EMPTY hash-cat afs? true-count) 24 | 25 | #?(:clj 26 | (defmacro hash-conj [premix-hash-v item] 27 | `(unchecked-add-int (unchecked-multiply-int 31 ~premix-hash-v) (hash ~item))) 28 | :cljs 29 | (defn ^number hash-conj 30 | "Returns the unmixed ordered-collection hash obtained by appending item: 31 | (+ (imul 31 unmixed-hash) (hash item)). The result is not mixed here. 32 | See http://clojure.org/data_structures#hash for full algorithms." 33 | [unmixed-hash item] 34 | (+ (imul 31 unmixed-hash) (hash item)))) 35 | 36 | #?(:clj 37 | (defn- expt [base pow] 38 | (if (zero? pow) 39 | 1 40 | (loop [n (int pow), y (int 1), z (int base)] 41 | (let [t (even? 
n), n (quot n 2)] 42 | (cond 43 | t (recur n y (unchecked-multiply-int z z)) 44 | (zero? n) (unchecked-multiply-int z y) 45 | :else (recur n (unchecked-multiply-int z y) (unchecked-multiply-int z z))))))) 46 | :cljs 47 | (defn- expt [base pow] 48 | (if (zero? pow) 49 | 1 50 | (loop [n (int pow), y (int 1), z (int base)] 51 | (let [t (even? n), n (quot n 2)] 52 | (cond 53 | t (recur n y (imul z z)) 54 | (zero? n) (imul z y) 55 | :else (recur n (imul z y) (imul z z)))))))) 56 | 57 | (defn delve [v index] 58 | (loop [v (get-in v index) 59 | index index] 60 | (if (afs? v) 61 | (recur (get v 0) (conj index 0)) 62 | index))) 63 | 64 | (defn advance [v index] 65 | (cond 66 | (= (count index) 1) 67 | (when (< (peek index) (dec (true-count v))) 68 | (delve v [(inc (peek index))])) 69 | 70 | (< (peek index) (dec (true-count (get-in v (pop index))))) 71 | (delve v (conj (pop index) (inc (peek index)))) 72 | 73 | :else 74 | (recur v (pop index)))) 75 | 76 | (defn flat-seq 77 | ([v] (if (pos? (count v)) 78 | (flat-seq v (delve v [0])) 79 | nil)) 80 | ([v index] 81 | (lazy-seq 82 | (cons (get-in v index) 83 | (when-let [next-index (advance v index)] 84 | (flat-seq v next-index)))))) 85 | 86 | #?(:clj 87 | (deftype AutoFlattenSeq [^PersistentVector v ^int premix-hashcode ^int hashcode 88 | ^int cnt ^boolean dirty 89 | ^:unsynchronized-mutable ^clojure.lang.ISeq cached-seq] 90 | Object 91 | (toString [self] (.toString (seq self))) 92 | (hashCode [self] hashcode) 93 | (equals [self other] 94 | (and (instance? 
AutoFlattenSeq other) 95 | (== hashcode (.hashcode ^AutoFlattenSeq other)) 96 | (== cnt (.cnt ^AutoFlattenSeq other)) 97 | (= dirty (.dirty ^AutoFlattenSeq other)) 98 | (= v (.v ^AutoFlattenSeq other)))) 99 | clojure.lang.IHashEq 100 | (hasheq [self] hashcode) 101 | java.util.Collection 102 | (iterator [self] 103 | (if-let [^java.util.Collection s (seq self)] 104 | (.iterator s) 105 | (let [^java.util.Collection e ()] 106 | (.iterator e)))) 107 | (size [self] 108 | cnt) 109 | (toArray [self] 110 | (let [^java.util.Collection s (or (seq self) ())] ; (seq self) is nil when empty; fall back to () as iterator does 111 | (.toArray s))) 112 | clojure.lang.Sequential 113 | clojure.lang.ISeq 114 | (equiv [self other] 115 | (and (== hashcode (hash other)) 116 | (== cnt (count other)) 117 | (or (== cnt 0) 118 | (= (seq self) other)))) 119 | (empty [self] (with-meta EMPTY (meta self))) 120 | (first [self] (first (seq self))) 121 | (next [self] (next (seq self))) 122 | (more [self] (rest (seq self))) 123 | (cons [self obj] 124 | (cons obj self)) 125 | ConjFlat 126 | (conj-flat [self obj] 127 | (cond 128 | (nil? obj) self 129 | (afs? obj) 130 | (cond 131 | (zero? cnt) obj 132 | (<= (count obj) threshold) 133 | (let [phc (hash-cat self obj) 134 | new-cnt (+ cnt (count obj))] 135 | (AutoFlattenSeq. (into v obj) phc (mix-collection-hash-bc phc new-cnt) new-cnt 136 | (or dirty (.dirty ^AutoFlattenSeq obj)) nil)) 137 | :else 138 | (let [phc (hash-cat self obj) 139 | new-cnt (+ cnt (count obj))] 140 | (AutoFlattenSeq. (conj v obj) phc (mix-collection-hash-bc phc new-cnt) new-cnt 141 | true nil))) 142 | :else 143 | (let [phc (hash-conj premix-hashcode obj) 144 | new-cnt (inc cnt)] 145 | (AutoFlattenSeq. (conj v obj) phc (mix-collection-hash-bc phc new-cnt) new-cnt dirty nil)))) 146 | (cached? 
[self] cached-seq) 147 | clojure.lang.Counted 148 | (count [self] cnt) 149 | clojure.lang.ILookup 150 | (valAt [self key] 151 | (.valAt v key)) 152 | (valAt [self key not-found] 153 | (.valAt v key not-found)) 154 | clojure.lang.IObj 155 | (withMeta [self metamap] 156 | (AutoFlattenSeq. (with-meta v metamap) premix-hashcode hashcode cnt dirty nil)) 157 | clojure.lang.IMeta 158 | (meta [self] 159 | (meta v)) 160 | clojure.lang.Seqable 161 | (seq [self] 162 | (if cached-seq cached-seq 163 | (do 164 | (set! cached-seq (if dirty (flat-seq v) (seq v))) 165 | cached-seq)))) 166 | :cljs 167 | (deftype AutoFlattenSeq [^PersistentVector v ^number premix-hashcode ^number hashcode ^number cnt ^boolean dirty 168 | ^:unsynchronized-mutable ^ISeq cached-seq] 169 | Object 170 | (toString [self] (pr-str* (seq self))) 171 | IHash 172 | (-hash [self] hashcode) 173 | ISequential 174 | ISeq 175 | (-first [self] (first (seq self))) 176 | (-rest [self] (rest (seq self))) 177 | IEquiv 178 | (-equiv [self other] 179 | (and ;(instance? AutoFlattenSeq other) 180 | (= hashcode (hash other)) 181 | (= cnt (count other)) 182 | (or (= cnt 0) 183 | (= (seq self) other)))) 184 | ICollection 185 | (-conj [self o] (cons o self)) 186 | IEmptyableCollection 187 | (-empty [self] (with-meta EMPTY (meta self))) 188 | INext 189 | (-next [self] (next (seq self))) 190 | ConjFlat 191 | (conj-flat [self obj] 192 | (cond 193 | (nil? obj) self 194 | (afs? obj) 195 | (cond 196 | (zero? cnt) obj 197 | (<= (count obj) threshold) 198 | (let [phc (hash-cat self obj) 199 | new-cnt (+ cnt (count obj))] 200 | (AutoFlattenSeq. (into v obj) phc (mix-collection-hash phc new-cnt) new-cnt 201 | (or dirty (.-dirty ^AutoFlattenSeq obj)) nil)) 202 | :else 203 | (let [phc (hash-cat self obj) 204 | new-cnt (+ cnt (count obj))] 205 | (AutoFlattenSeq. 
(conj v obj) phc (mix-collection-hash phc new-cnt) new-cnt 206 | true nil))) 207 | :else 208 | (let [phc (hash-conj premix-hashcode obj) 209 | new-cnt (inc cnt)] 210 | (AutoFlattenSeq. (conj v obj) phc (mix-collection-hash phc new-cnt) new-cnt dirty nil)))) 211 | (cached? [self] cached-seq) 212 | ICounted 213 | (-count [self] cnt) 214 | ILookup 215 | (-lookup [self key] 216 | (-lookup v key)) 217 | (-lookup [self key not-found] 218 | (-lookup v key not-found)) 219 | IWithMeta 220 | (-with-meta [self metamap] 221 | (AutoFlattenSeq. (with-meta v metamap) premix-hashcode hashcode cnt dirty nil)) 222 | IMeta 223 | (-meta [self] 224 | (meta v)) 225 | ISeqable 226 | (-seq [self] 227 | (if cached-seq cached-seq 228 | (do 229 | (set! cached-seq (if dirty (flat-seq v) (seq v))) 230 | cached-seq))))) 231 | 232 | #?(:clj 233 | (defn- hash-cat ^long [^AutoFlattenSeq v1 ^AutoFlattenSeq v2] 234 | (let [c (count v2) 235 | e (int (expt 31 c))] 236 | (unchecked-add-int 237 | (unchecked-multiply-int e (.premix-hashcode v1)) 238 | (unchecked-subtract-int (.premix-hashcode v2) e)))) 239 | :cljs 240 | (defn- hash-cat ^number [^AutoFlattenSeq v1 ^AutoFlattenSeq v2] 241 | (let [c (count v2) 242 | e (int (expt 31 c))] 243 | (+ (imul e (.-premix-hashcode v1)) 244 | (- (.-premix-hashcode v2) e))))) 245 | 246 | #?(:clj 247 | (defn hash-ordered-coll-without-mix ^long [v] 248 | (compile-if (resolve 'clojure.core/mix-collection-hash) 249 | (let [thirty-one (int 31) 250 | cnt (count v)] 251 | (loop [acc (int 1) i (int 0)] 252 | (if (< i cnt) 253 | (recur (unchecked-add-int 254 | (unchecked-multiply-int thirty-one acc) 255 | (hash (v i))) 256 | (inc i)) 257 | acc))) 258 | (hash v))) 259 | :cljs 260 | (defn ^number hash-ordered-coll-without-mix 261 | "Returns the partially calculated hash code, still requires a call to mix-collection-hash" 262 | ([coll] 263 | (hash-ordered-coll-without-mix 1 coll)) 264 | ([existing-unmixed-hash coll] 265 | (loop [unmixed-hash existing-unmixed-hash 266 | coll (seq 
coll)] 267 | (if-not (nil? coll) 268 | (recur (bit-or (+ (imul 31 unmixed-hash) (hash (first coll))) 0) 269 | (next coll)) 270 | unmixed-hash))))) 271 | 272 | #?(:cljs 273 | (extend-protocol IPrintWithWriter 274 | instaparse.auto-flatten-seq/AutoFlattenSeq 275 | (-pr-writer [afs writer opts] 276 | (-pr-writer (seq afs) writer opts)))) 277 | 278 | (defn auto-flatten-seq [v] 279 | (let [v (vec v)] 280 | (AutoFlattenSeq. v 281 | (hash-ordered-coll-without-mix v) 282 | (hash v) (count v) 283 | false nil))) 284 | 285 | (def EMPTY (auto-flatten-seq [])) 286 | 287 | (defn afs? [s] 288 | (instance? AutoFlattenSeq s)) 289 | 290 | (defn true-count [v] 291 | (if (afs? v) 292 | (count (.-v ^AutoFlattenSeq v)) 293 | (count v))) 294 | 295 | ;; For hiccup format, we need to be able to convert the seq to a vector. 296 | 297 | (defn flat-vec-helper [acc v] 298 | (if-let [s (seq v)] 299 | (let [fst (first v)] 300 | (if (afs? fst) 301 | (recur (flat-vec-helper acc fst) (next v)) 302 | (recur (conj! acc fst) (next v)))) 303 | acc)) 304 | 305 | (defn flat-vec 306 | "Turns deep vector (like the vector inside of FlattenOnDemandVector) into a flat vec" 307 | [v] 308 | (persistent! (flat-vec-helper (transient []) v))) 309 | 310 | (defprotocol GetVec 311 | (^PersistentVector get-vec [self])) 312 | 313 | #?(:clj 314 | (deftype FlattenOnDemandVector [v ; ref containing PersistentVector or nil 315 | ^int hashcode 316 | ^int cnt 317 | flat] ; ref containing PersistentVector or nil 318 | GetVec 319 | (get-vec [self] 320 | (when (not @flat) 321 | (dosync 322 | (when (not @flat) 323 | (ref-set flat (with-meta (flat-vec @v) (meta @v))) 324 | (ref-set v nil)))) ; clear out v so it can be garbage collected 325 | @flat) 326 | 327 | Object 328 | (toString [self] (.toString (get-vec self))) 329 | (hashCode [self] hashcode) 330 | (equals [self other] 331 | (and (instance? 
FlattenOnDemandVector other) 332 | (== hashcode (.hashcode ^FlattenOnDemandVector other)) 333 | (== cnt (.cnt ^FlattenOnDemandVector other)) 334 | (= v (.v ^FlattenOnDemandVector other)) 335 | (= flat (.flat ^FlattenOnDemandVector other)))) 336 | clojure.lang.IHashEq 337 | (hasheq [self] hashcode) 338 | java.util.Collection 339 | (iterator [self] 340 | (.iterator (get-vec self))) 341 | (size [self] 342 | cnt) 343 | (toArray [self] 344 | (.toArray (get-vec self))) 345 | clojure.lang.IPersistentCollection 346 | (equiv [self other] 347 | (or 348 | (and (== hashcode (hash other)) 349 | (== cnt (count other)) 350 | (= (get-vec self) other)))) 351 | (empty [self] (with-meta [] (meta self))) 352 | clojure.lang.Counted 353 | (count [self] cnt) 354 | clojure.lang.IPersistentVector 355 | (assoc [self i val] 356 | (assoc (get-vec self) i val)) 357 | (assocN [self i val] 358 | (.assocN (get-vec self) i val)) 359 | (length [self] 360 | cnt) 361 | (cons [self obj] 362 | (conj (get-vec self) obj)) 363 | clojure.lang.IObj 364 | (withMeta [self metamap] 365 | (if @flat 366 | (FlattenOnDemandVector. (ref @v) hashcode cnt (ref (with-meta @flat metamap))) 367 | (FlattenOnDemandVector. (ref (with-meta @v metamap)) hashcode cnt (ref @flat)))) 368 | clojure.lang.IMeta 369 | (meta [self] 370 | (if @flat (meta @flat) (meta @v))) 371 | clojure.lang.Seqable 372 | (seq [self] 373 | (seq (get-vec self))) 374 | clojure.lang.ILookup 375 | (valAt [self key] 376 | (.valAt (get-vec self) key)) 377 | (valAt [self key not-found] 378 | (.valAt (get-vec self) key not-found)) 379 | clojure.lang.Indexed 380 | (nth [self i] 381 | (.nth (get-vec self) i)) 382 | (nth [self i not-found] 383 | (.nth (get-vec self) i not-found)) 384 | clojure.lang.IFn 385 | (invoke [self arg] 386 | (.invoke (get-vec self) arg)) 387 | (applyTo [self arglist] 388 | (.applyTo (get-vec self) arglist)) 389 | clojure.lang.Reversible 390 | (rseq [self] 391 | (if (pos? 
cnt) 392 | (rseq (get-vec self)) 393 | nil)) 394 | clojure.lang.IPersistentStack 395 | (peek [self] 396 | (peek (get-vec self))) 397 | (pop [self] 398 | (pop (get-vec self))) 399 | clojure.lang.Associative 400 | (containsKey [self k] 401 | (.containsKey (get-vec self) k)) 402 | (entryAt [self k] 403 | (.entryAt (get-vec self) k)) 404 | IKVReduce 405 | (kv-reduce [self f init] 406 | (.kvreduce (get-vec self) f init)) 407 | java.lang.Comparable 408 | (compareTo [self that] 409 | (.compareTo (get-vec self) that)) 410 | java.util.List 411 | (get [self i] (nth (get-vec self) i)) 412 | (indexOf [self o] (.indexOf (get-vec self) o)) 413 | (lastIndexOf [self o] (.lastIndexOf (get-vec self) o)) 414 | (listIterator [self] 415 | (.listIterator (get-vec self) 0)) 416 | (listIterator [self i] 417 | (.listIterator (get-vec self) i)) 418 | (subList [self a z] 419 | (.subList (get-vec self) a z)) 420 | ) 421 | :cljs 422 | (deftype FlattenOnDemandVector [v ; atom containing PersistentVector or nil 423 | ^number hashcode 424 | ^number cnt 425 | flat] ; atom containing PersistentVector or nil 426 | GetVec 427 | (get-vec [self] 428 | (when (not @flat) 429 | (swap! flat (fn [_] (with-meta (flat-vec @v) (meta @v)))) 430 | (swap! v (fn [_] nil))) ; clear out v so it can be garbage collected 431 | @flat) 432 | 433 | Object 434 | (toString [self] 435 | (pr-str* (get-vec self))) 436 | IHash 437 | (-hash [self] hashcode) 438 | IEquiv 439 | (-equiv [self other] 440 | (or 441 | (and (= hashcode (hash other)) 442 | (= cnt (count other)) 443 | (= (get-vec self) other)))) 444 | IEmptyableCollection 445 | (-empty [self] (with-meta [] (meta self))) 446 | ICounted 447 | (-count [self] cnt) 448 | IVector 449 | (-assoc-n [self i val] 450 | (-assoc-n (get-vec self) i val)) 451 | ICollection 452 | (-conj [self obj] 453 | (conj (get-vec self) obj)) 454 | IWithMeta 455 | (-with-meta [self metamap] 456 | (if @flat 457 | (FlattenOnDemandVector. 
(atom @v) hashcode cnt (atom (with-meta @flat metamap))) 458 | (FlattenOnDemandVector. (atom (with-meta @v metamap)) hashcode cnt (atom @flat)))) 459 | IMeta 460 | (-meta [self] 461 | (if @flat (meta @flat) (meta @v))) 462 | ISequential 463 | ISeqable 464 | (-seq [self] 465 | (seq (get-vec self))) 466 | ILookup 467 | (-lookup [self key] 468 | (-lookup (get-vec self) key)) 469 | (-lookup [self key not-found] 470 | (-lookup (get-vec self) key not-found)) 471 | IIndexed 472 | (-nth [self i] 473 | (-nth (get-vec self) i)) 474 | (-nth [self i not-found] 475 | (-nth (get-vec self) i not-found)) 476 | IFn 477 | (-invoke [self arg] 478 | (-invoke (get-vec self) arg)) 479 | (-invoke [self arg not-found] 480 | (-invoke (get-vec self) arg not-found)) 481 | IReversible 482 | (-rseq [self] 483 | (if (pos? cnt) 484 | (rseq (get-vec self)) 485 | nil)) 486 | IStack 487 | (-peek [self] 488 | (-peek (get-vec self))) 489 | (-pop [self] 490 | (-pop (get-vec self))) 491 | IAssociative 492 | (-assoc [self i val] 493 | (assoc (get-vec self) i val)) 494 | (-contains-key? [self k] 495 | (-contains-key? (get-vec self) k)) 496 | IKVReduce 497 | (-kv-reduce [self f init] 498 | (-kv-reduce (get-vec self) f init)) 499 | IComparable 500 | (-compare [self that] 501 | (-compare (get-vec self) that)) 502 | )) 503 | 504 | #?(:cljs 505 | (extend-protocol IPrintWithWriter 506 | instaparse.auto-flatten-seq/FlattenOnDemandVector 507 | (-pr-writer [v writer opts] 508 | (-pr-writer (get-vec v) writer opts)))) 509 | 510 | (defn convert-afs-to-vec [^AutoFlattenSeq afs] 511 | (cond 512 | (.-dirty afs) 513 | (if (cached? afs) 514 | (vec (seq afs)) 515 | #?(:clj 516 | (FlattenOnDemandVector. 517 | (ref (.-v afs)) 518 | (.-hashcode afs) 519 | (.-cnt afs) 520 | (ref nil)) 521 | :cljs 522 | (FlattenOnDemandVector. 
523 | (atom (.-v afs)) 524 | (.-hashcode afs) 525 | (.-cnt afs) 526 | (atom nil)))) 527 | :else 528 | (.-v afs))) 529 | -------------------------------------------------------------------------------- /src/instaparse/cfg.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.cfg 2 | "This is the context free grammar that recognizes context free grammars." 3 | (:refer-clojure :exclude [cat]) 4 | (:require [instaparse.combinators-source :refer 5 | [Epsilon opt plus star rep alt ord cat string-ci string 6 | string-ci regexp nt look neg hide hide-tag]] 7 | [instaparse.reduction :refer [apply-standard-reductions]] 8 | [instaparse.gll :refer [parse]] 9 | [instaparse.util :refer [throw-illegal-argument-exception 10 | throw-runtime-exception]] 11 | [clojure.string :as str] 12 | #?(:cljs [cljs.tools.reader :as reader]) 13 | #?(:cljs [cljs.tools.reader.reader-types :as readers]))) 14 | 15 | (def ^:dynamic *case-insensitive-literals* 16 | "Sets whether all string literal terminals in a built grammar 17 | will be treated as case insensitive. 18 | 19 | `true`: case-insensitive 20 | `false`: case-sensitive 21 | `:default`: case-sensitive for EBNF, case-insensitive for ABNF" 22 | :default) 23 | 24 | (defn string+ 25 | "Returns a string combinator that may be case-insensitive, based 26 | on (in priority order): 27 | 28 | 1) the value of `*case-insensitive-literals*`, if it has been 29 | overridden to a boolean 30 | 2) the supplied `ci-by-default?` parameter" 31 | [s ci-by-default?] 32 | (case *case-insensitive-literals* 33 | true (string-ci s) 34 | false (string s) 35 | :default (if ci-by-default? 
(string-ci s) (string s)))) 36 | 37 | (defn regex-doc 38 | "Adds a comment to a Clojure regex, or no-op in ClojureScript" 39 | [pattern-str comment] 40 | #?(:clj (re-pattern (str pattern-str "(?x) #" comment)) 41 | :cljs (re-pattern pattern-str))) 42 | 43 | (def single-quoted-string (regex-doc #"'[^'\\]*(?:\\.[^'\\]*)*'" "Single-quoted string")) 44 | (def single-quoted-regexp (regex-doc #"#'[^'\\]*(?:\\.[^'\\]*)*'" "Single-quoted regexp")) 45 | (def double-quoted-string (regex-doc #"\"[^\"\\]*(?:\\.[^\"\\]*)*\"" "Double-quoted string")) 46 | (def double-quoted-regexp (regex-doc #"#\"[^\"\\]*(?:\\.[^\"\\]*)*\"" "Double-quoted regexp")) 47 | (def inside-comment #?(:clj #"(?s)(?:(?!(?:\(\*|\*\))).)*(?x) #Comment text" 48 | :cljs #"(?:(?!(?:\(\*|\*\)))[\s\S])*")) 49 | (def ws (regex-doc "[,\\s]*" "optional whitespace")) 50 | 51 | (def opt-whitespace (hide (nt :opt-whitespace))) 52 | 53 | (def non-terminal 54 | (regex-doc "[^, \\r\\t\\n<>(){}\\[\\]+*?:=|'\"#&!;./]+" "Non-terminal")) 55 | 56 | (def non-terminal-namespace-allowed 57 | (let [no-slash "[^, \\r\\t\\n<>(){}\\[\\]+*?:=|'\"#&!;/.]" 58 | with-slash "[^, \\r\\t\\n<>(){}\\[\\]+*?:=|'\"#&!;]"] 59 | (regex-doc (str no-slash with-slash "*") "Non-terminal-namespace-allowed"))) 60 | 61 | (defn make-cfg [allow-namespaced-nts?] 
62 | (apply-standard-reductions 63 | :hiccup ; use the hiccup output format 64 | {:rules (hide-tag (cat opt-whitespace 65 | (plus (nt :rule)))) 66 | :comment (cat (string "(*") (nt :inside-comment) (string "*)")) 67 | :inside-comment (cat (regexp inside-comment) 68 | (star (cat (nt :comment) 69 | (regexp inside-comment)))) 70 | :opt-whitespace (cat (regexp ws) 71 | (star (cat (nt :comment) 72 | (regexp ws)))) 73 | :rule-separator (alt (string ":") 74 | (string ":=") 75 | (string "::=") 76 | (string "=")) 77 | :rule (cat (alt (nt :nt) 78 | (nt :hide-nt)) 79 | opt-whitespace 80 | (hide (nt :rule-separator)) 81 | opt-whitespace 82 | (nt :alt-or-ord) 83 | (hide (alt (nt :opt-whitespace) 84 | (cat (nt :opt-whitespace) (alt (string ";") (string ".")) (nt :opt-whitespace))))) 85 | :nt (cat 86 | (neg (nt :epsilon)) 87 | (regexp 88 | (if allow-namespaced-nts? 89 | non-terminal-namespace-allowed 90 | non-terminal))) 91 | :hide-nt (cat (hide (string "<")) 92 | opt-whitespace 93 | (nt :nt) 94 | opt-whitespace 95 | (hide (string ">"))) 96 | :alt-or-ord (hide-tag (alt (nt :alt) (nt :ord))) 97 | :alt (cat (nt :cat) 98 | (star 99 | (cat 100 | opt-whitespace 101 | (hide (string "|")) 102 | opt-whitespace 103 | (nt :cat)))) 104 | :ord (cat (nt :cat) 105 | (plus 106 | (cat 107 | opt-whitespace 108 | (hide (string "/")) 109 | opt-whitespace 110 | (nt :cat)))) 111 | :paren (cat (hide (string "(")) 112 | opt-whitespace 113 | (nt :alt-or-ord) 114 | opt-whitespace 115 | (hide (string ")"))) 116 | :hide (cat (hide (string "<")) 117 | opt-whitespace 118 | (nt :alt-or-ord) 119 | opt-whitespace 120 | (hide (string ">"))) 121 | :cat (plus (cat 122 | opt-whitespace 123 | (alt (nt :factor) (nt :look) (nt :neg)) 124 | opt-whitespace)) 125 | :string (alt 126 | (regexp single-quoted-string) 127 | (regexp double-quoted-string)) 128 | :regexp (alt 129 | (regexp single-quoted-regexp) 130 | (regexp double-quoted-regexp)) 131 | :opt (alt 132 | (cat (hide (string "[")) 133 | opt-whitespace 134 | (nt 
:alt-or-ord) 135 | opt-whitespace 136 | (hide (string "]"))) 137 | (cat (nt :factor) 138 | opt-whitespace 139 | (hide (string "?")))) 140 | :star (alt 141 | (cat (hide (string "{")) 142 | opt-whitespace 143 | (nt :alt-or-ord) 144 | opt-whitespace 145 | (hide (string "}"))) 146 | (cat (nt :factor) 147 | opt-whitespace 148 | (hide (string "*")))) 149 | :plus (cat (nt :factor) 150 | opt-whitespace 151 | (hide (string "+"))) 152 | :look (cat (hide (string "&")) 153 | opt-whitespace 154 | (nt :factor)) 155 | :neg (cat (hide (string "!")) 156 | opt-whitespace 157 | (nt :factor)) 158 | :epsilon (alt (string "Epsilon") 159 | (string "epsilon") 160 | (string "EPSILON") 161 | (string "eps") 162 | (string "\u03b5")) 163 | :factor (hide-tag (alt (nt :nt) 164 | (nt :string) 165 | (nt :regexp) 166 | (nt :opt) 167 | (nt :star) 168 | (nt :plus) 169 | (nt :paren) 170 | (nt :hide) 171 | (nt :epsilon))) 172 | ;; extra entrypoint to be used by the ebnf combinator 173 | :rules-or-parser (hide-tag (alt (nt :rules) (nt :alt-or-ord)))})) 174 | 175 | (def cfg (make-cfg false)) ;; the original parser for instaparse's ebnf notation flavor of context-free grammars 176 | (def cfg-allow-namespaced-nts (make-cfg true)) ;; new version recognizes namespaced non-terminals 177 | 178 | ;; Internally, we're converting the grammar into a hiccup parse tree 179 | ;; Here's how you extract the relevant information 180 | (def tag first) 181 | (def contents next) 182 | (def content fnext) 183 | 184 | ;;;; Helper functions for reading strings and regexes 185 | 186 | (defn escape 187 | "Converts escaped single-quotes to unescaped, and unescaped double-quotes to escaped" 188 | [s] 189 | (loop [sq (seq s), v []] 190 | (if-let [c (first sq)] 191 | (case c 192 | \\ (if-let [c2 (second sq)] 193 | (if (= c2 \') 194 | (recur (drop 2 sq) (conj v c2)) 195 | (recur (drop 2 sq) (conj v c c2))) 196 | (throw-runtime-exception 197 | "Encountered backslash character at end of string: " s)) 198 | \" (recur (next sq) (conj v 
\\ \")) 199 | (recur (next sq) (conj v c))) 200 | (apply str v)))) 201 | 202 | ;(defn safe-read-string [s] 203 | ; (binding [*read-eval* false] 204 | ; (read-string s))) 205 | 206 | #?(:clj 207 | (defn wrap-reader [reader] 208 | (let [{major :major minor :minor} *clojure-version*] 209 | (if (and (<= major 1) (<= minor 6)) 210 | reader 211 | (fn [r s] (reader r s {} (java.util.LinkedList.))))))) 212 | 213 | #?(:clj 214 | (let [string-reader (wrap-reader 215 | (clojure.lang.LispReader$StringReader.))] 216 | (defn safe-read-string 217 | "Expects a double-quote at the end of the string" 218 | [s] 219 | (with-in-str s (string-reader *in* nil)))) 220 | 221 | :cljs 222 | (let [read-string* @#'reader/read-string*] ;; since read-string* is private 223 | (defn safe-read-string [s] 224 | (read-string* (readers/string-push-back-reader s) nil nil nil)))) 225 | 226 | ; I think re-pattern is sufficient, but here's how to do it without. 227 | ;(let [regexp-reader (clojure.lang.LispReader$RegexReader.)] 228 | ; (defn safe-read-regexp 229 | ; "Expects a double-quote at the end of the string" 230 | ; [s] 231 | ; (with-in-str s (regexp-reader *in* nil)))) 232 | 233 | (defn process-string 234 | "Converts single quoted string to double-quoted" 235 | [s] 236 | (let [stripped 237 | (subs s 1 (dec (count s))) 238 | remove-escaped-single-quotes 239 | (escape stripped) 240 | final-string 241 | (safe-read-string (str remove-escaped-single-quotes \"))] 242 | 243 | final-string)) 244 | 245 | (defn process-regexp 246 | "Converts single quoted regexp to double-quoted" 247 | [s] 248 | ;(println (with-out-str (pr s))) 249 | (let [stripped 250 | (subs s 2 (dec (count s))) 251 | remove-escaped-single-quotes 252 | (escape stripped) 253 | final-string 254 | (re-pattern remove-escaped-single-quotes)] 255 | ; (safe-read-regexp (str remove-escaped-single-quotes \"))] 256 | 257 | final-string)) 258 | 259 | ;;; Now we need to convert the grammar's parse tree into combinators 260 | 261 | (defn build-rule 262 
| "Convert one parsed rule from the grammar into combinators" 263 | [tree] 264 | (case (tag tree) 265 | :rule (let [[nt alt-or-ord] (contents tree)] 266 | (if (= (tag nt) :hide-nt) 267 | [(keyword (content (content nt))) 268 | (hide-tag (build-rule alt-or-ord))] 269 | [(keyword (content nt)) 270 | (build-rule alt-or-ord)])) 271 | :nt (nt (keyword (content tree))) 272 | :alt (apply alt (map build-rule (contents tree))) 273 | :ord (apply ord (map build-rule (contents tree))) 274 | :paren (recur (content tree)) 275 | :hide (hide (build-rule (content tree))) 276 | :cat (apply cat (map build-rule (contents tree))) 277 | :string (string+ (process-string (content tree)) false) 278 | :regexp (regexp (process-regexp (content tree))) 279 | :opt (opt (build-rule (content tree))) 280 | :star (star (build-rule (content tree))) 281 | :plus (plus (build-rule (content tree))) 282 | :look (look (build-rule (content tree))) 283 | :neg (neg (build-rule (content tree))) 284 | :epsilon Epsilon)) 285 | 286 | (defn seq-nt 287 | "Returns a sequence of all non-terminals in a parser built from combinators." 288 | [parser] 289 | (case (:tag parser) 290 | :nt [(:keyword parser)] 291 | (:string :string-ci :char :regexp :epsilon) [] 292 | (:opt :plus :star :look :neg :rep) (recur (:parser parser)) 293 | (:alt :cat) (mapcat seq-nt (:parsers parser)) 294 | :ord (mapcat seq-nt 295 | [(:parser1 parser) (:parser2 parser)]))) 296 | 297 | (defn check-grammar 298 | "Throw error if grammar uses any invalid non-terminals in its productions" 299 | [grammar-map] 300 | (let [valid-nts (set (keys grammar-map))] 301 | (doseq [nt (distinct (mapcat seq-nt (vals grammar-map)))] 302 | (when-not (valid-nts nt) 303 | (throw-runtime-exception 304 | (subs (str nt) 1) 305 | " occurs on the right-hand side of your grammar, but not on the left")))) 306 | grammar-map) 307 | 308 | (defn build-parser 309 | ([spec output-format] (build-parser spec output-format false)) 310 | ([spec output-format allow-namespaced-nts?] 
311 | (let [rules (parse (if allow-namespaced-nts? cfg-allow-namespaced-nts cfg) :rules spec false)] 312 | (if (instance? instaparse.gll.Failure rules) 313 | (throw-runtime-exception 314 | "Error parsing grammar specification:\n" 315 | (with-out-str (println rules))) 316 | (let [productions (map build-rule rules) 317 | start-production (first (first productions))] 318 | {:grammar (check-grammar (apply-standard-reductions output-format (into {} productions))) 319 | :start-production start-production 320 | :output-format output-format}))))) 321 | 322 | (defn build-parser-from-combinators [grammar-map output-format start-production] 323 | (if (nil? start-production) 324 | (throw-illegal-argument-exception 325 | "When you build a parser from a map of parser combinators, you must provide a start production using the :start keyword argument.") 326 | {:grammar (check-grammar (apply-standard-reductions output-format grammar-map)) 327 | :start-production start-production 328 | :output-format output-format})) 329 | 330 | (defn ebnf 331 | "Takes an EBNF grammar specification string and returns the combinator version. 332 | If you give it the right-hand side of a rule, it will return the combinator equivalent. 333 | If you give it a series of rules, it will give you back a grammar map. 334 | Useful for combining with other combinators." 335 | [spec & {:as opts}] 336 | (binding [*case-insensitive-literals* (:string-ci opts :default)] 337 | (let [rules (parse cfg :rules-or-parser spec false)] 338 | (cond 339 | (instance? 
instaparse.gll.Failure rules) 340 | (throw-runtime-exception 341 | "Error parsing grammar specification:\n" 342 | (with-out-str (println rules))) 343 | (= :rule (ffirst rules)) 344 | (into {} (map build-rule rules)) 345 | 346 | :else (build-rule (first rules)))))) 347 | -------------------------------------------------------------------------------- /src/instaparse/combinators.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.combinators 2 | "The combinator public API for instaparse" 3 | (:refer-clojure :exclude [cat]) 4 | #?(:clj (:use instaparse.macros) 5 | :cljs (:require-macros 6 | [instaparse.macros :refer [defclone]])) 7 | (:require [instaparse.combinators-source :as c] 8 | [instaparse.cfg :as cfg] 9 | [instaparse.abnf :as abnf])) 10 | 11 | ;; The actual source is in combinators-source. 12 | ;; This was necessary to avoid a cyclical dependency in the namespaces. 13 | 14 | (defclone Epsilon c/Epsilon) 15 | (defclone opt c/opt) 16 | (defclone plus c/plus) 17 | (defclone star c/star) 18 | (defclone rep c/rep) 19 | (defclone alt c/alt) 20 | (defclone ord c/ord) 21 | (defclone cat c/cat) 22 | (defclone string c/string) 23 | (defclone string-ci c/string-ci) 24 | (defclone unicode-char c/unicode-char) 25 | (defclone regexp c/regexp) 26 | (defclone nt c/nt) 27 | (defclone look c/look) 28 | (defclone neg c/neg) 29 | (defclone hide c/hide) 30 | (defclone hide-tag c/hide-tag) 31 | 32 | (defclone ebnf cfg/ebnf) 33 | (defclone abnf abnf/abnf) 34 | 35 | -------------------------------------------------------------------------------- /src/instaparse/combinators_source.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.combinators-source 2 | "This is the underlying implementation of the various combinators." 3 | (:refer-clojure :exclude [cat]) 4 | (:require [instaparse.reduction :refer [singleton? 
(defn ord
  "Ordered choice, i.e., parser1 / parser2 / ...
  With no arguments this is Epsilon.  When the first alternative is
  Epsilon, any Epsilon among the remaining alternatives is dropped.
  The result is a right-nested chain of binary :ord nodes."
  ([] Epsilon)
  ([parser1 & parsers]
   (let [remaining (if (= parser1 Epsilon)
                     (remove #{Epsilon} parsers)
                     parsers)]
     (if (empty? remaining)
       parser1
       {:tag :ord
        :parser1 parser1
        :parser2 (apply ord remaining)}))))
(defn hide "Hide the result of parser, i.e., <parser>.
  Returns parser with :hide set to true; nothing else about it is changed."
  [parser]
  (assoc parser :hide true))
(defn unhide-content
  "Returns parser with the :hide marker removed from it and, recursively,
  from every parser nested under :parser, :parsers, or (for :ord nodes)
  :parser1 and :parser2."
  [parser]
  (let [visible (if (:hide parser) (dissoc parser :hide) parser)
        {child :parser, children :parsers, tag :tag} visible]
    (cond
      child        (assoc visible :parser (unhide-content child))
      children     (assoc visible :parsers (map unhide-content children))
      (= tag :ord) (-> visible
                       (update-in [:parser1] unhide-content)
                       (update-in [:parser2] unhide-content))
      :else        visible)))
(defn auto-whitespace-parser
  "Rewrites parser so that ws-parser is tried in front of every terminal.

  Non-terminal references and epsilon are returned untouched; wrapping and
  compound combinators are rebuilt with their sub-parsers rewritten
  recursively; each terminal t (string, case-insensitive string, character
  range, or regexp) becomes (cat ws-parser t).

  Character-range terminals (tag :char, as built by unicode-char) are
  treated like the other terminals.  Previously they had no clause here,
  so a grammar containing one made `case` throw."
  [parser ws-parser]
  (case (:tag parser)
    (:nt :epsilon) parser
    (:opt :plus :star :rep :look :neg) (update-in parser [:parser] auto-whitespace-parser ws-parser)
    (:alt :cat) (assoc parser :parsers
                       (map #(auto-whitespace-parser % ws-parser) (:parsers parser)))
    :ord (assoc parser
                :parser1 (auto-whitespace-parser (:parser1 parser) ws-parser)
                :parser2 (auto-whitespace-parser (:parser2 parser) ws-parser))
    (:string :string-ci :char :regexp)
    ; If the terminal has a reduction associated with it,
    ; we need to "lift" that reduction out to the (cat whitespace terminal)
    ; parser that is being created.
    (if (:red parser)
      (assoc (cat ws-parser (dissoc parser :red)) :red (:red parser))
      (cat ws-parser parser))))
(defn parse
  "Use parser to parse the text.  Returns first parse tree found
   that completely parses the text.  If no parse tree is possible, returns
   a Failure object.

   Optional keyword arguments:
   :start :keyword  (where :keyword is name of starting production rule)
   :partial true    (parses that don't consume the whole string are okay)
   :total true      (if parse fails, embed failure node in tree)
   :unhide <:tags or :content or :all> (for this parse, disable hiding)
   :optimize :memory   (when possible, employ strategy to use less memory)

   Clj only:
   :trace true      (print diagnostic trace while parsing)

   Notes on how the options combine:
   - :total takes precedence; when it is set, :optimize is not consulted.
   - :optimize :memory is only attempted when :partial is not set, and if
     that strategy yields a failure the regular parse is run instead.
   - When :start is omitted, the parser's own :start-production is used."
  [parser text &{:as options}]
  ;; Reject unknown values for :unhide and :optimize up front.
  {:pre [(contains? #{:tags :content :all nil} (get options :unhide))
         (contains? #{:memory nil} (get options :optimize))]}
  (let [start-production
        (get options :start (:start-production parser)),

        partial?
        (get options :partial false)

        optimize?
        (get options :optimize false)

        unhide
        (get options :unhide)

        trace?
        (get options :trace false)

        ;; Clj only: the first time tracing is requested while gll/TRACE is
        ;; still off, enable-tracing! is called before parsing starts.
        #?@(:clj [_ (when (and trace? (not gll/TRACE)) (enable-tracing!))])

        ;; Apply the per-call :unhide request to this parser's grammar.
        parser (unhide-parser parser unhide)]
    (->> (cond
           ;; Total parse: failures are embedded as nodes in the tree.
           (:total options)
           (gll/parse-total (:grammar parser) start-production text
                            partial? (red/node-builders (:output-format parser)))

           ;; Memory-optimized strategy, falling back to the regular parse
           ;; whenever the strategy reports a failure.
           (and optimize? (not partial?))
           (let [result (repeat/try-repeating-parse-strategy parser text start-production)]
             (if (failure? result)
               (gll/parse (:grammar parser) start-production text partial?)
               result))

           :else
           (gll/parse (:grammar parser) start-production text partial?))

         ;; Clj only: the parse expression above is threaded into
         ;; gll/bind-trace together with trace?.
         #?(:clj (gll/bind-trace trace?)))))
103 | 104 | Optional keyword arguments: 105 | :start :keyword (where :keyword is name of starting production rule) 106 | :partial true (parses that don't consume the whole string are okay) 107 | :total true (if parse fails, embed failure node in tree) 108 | :unhide <:tags or :content or :all> (for this parse, disable hiding) 109 | 110 | Clj only: 111 | :trace true (print diagnostic trace while parsing)" 112 | [parser text &{:as options}] 113 | {:pre [(contains? #{:tags :content :all nil} (get options :unhide))]} 114 | (let [start-production 115 | (get options :start (:start-production parser)), 116 | 117 | partial? 118 | (get options :partial false) 119 | 120 | unhide 121 | (get options :unhide) 122 | 123 | trace? 124 | (get options :trace false) 125 | 126 | #?@(:clj [_ (when (and trace? (not gll/TRACE)) (enable-tracing!))]) 127 | 128 | parser (unhide-parser parser unhide)] 129 | (->> (cond 130 | (:total options) 131 | (gll/parses-total (:grammar parser) start-production text 132 | partial? 
;; A Parser is a record of the grammar map, the default start production and
;; the output format.  It is also callable as a function: (p text & options)
;; is the same as (parse p text & options).
(defrecord Parser [grammar start-production output-format]
#?@(:clj
    [clojure.lang.IFn
     (invoke [parser text] (parse parser text))
     (invoke [parser text key1 val1] (parse parser text key1 val1))
     (invoke [parser text key1 val1 key2 val2] (parse parser text key1 val1 key2 val2))
     (invoke [parser text key1 val1 key2 val2 key3 val3] (parse parser text key1 val1 key2 val2 key3 val3))
     (applyTo [parser args] (apply parse parser args))]

    :cljs
    [IFn
     (-invoke [parser text] (parse parser text))
     (-invoke [parser text key1 val1] (parse parser text key1 val1))
     (-invoke [parser text key1 val1 key2 val2] (parse parser text key1 val1 key2 val2))
     (-invoke [parser text key1 val1 key2 val2 key3 val3] (parse parser text key1 val1 key2 val2 key3 val3))
     (-invoke [parser text a b c d e f g h] (parse parser text a b c d e f g h))
     (-invoke [parser text a b c d e f g h i j] (parse parser text a b c d e f g h i j))
     (-invoke [parser text a b c d e f g h i j k l] (parse parser text a b c d e f g h i j k l))
     (-invoke [parser text a b c d e f g h i j k l m n] (parse parser text a b c d e f g h i j k l m n))
     (-invoke [parser text a b c d e f g h i j k l m n o p] (parse parser text a b c d e f g h i j k l m n o p))
     ;; Forward every argument: this arity used to drop the final q/r pair,
     ;; silently discarding the ninth keyword option.
     (-invoke [parser text a b c d e f g h i j k l m n o p q r] (parse parser text a b c d e f g h i j k l m n o p q r))
     (-invoke [parser text a b c d e f g h i j k l m n o p q r s more] (apply parse parser text a b c d e f g h i j k l m n o p q r s more))]))
169 | (-pr-writer [parser writer _] 170 | (-write writer (print/Parser->str parser))))) 171 | 172 | (defn parser 173 | "Takes a string specification of a context-free grammar, 174 | or a URI for a text file containing such a specification (Clj only), 175 | or a map of parser combinators and returns a parser for that grammar. 176 | 177 | Optional keyword arguments: 178 | :input-format :ebnf 179 | or 180 | :input-format :abnf 181 | 182 | :output-format :enlive 183 | or 184 | :output-format :hiccup 185 | 186 | :start :keyword (where :keyword is name of starting production rule) 187 | 188 | :string-ci true (treat all string literals as case insensitive) 189 | 190 | :allow-namespaced-nts true (allow namespaced non-terminals in parser specification; 191 | parser's output will use corresponding namespaced keywords) 192 | 193 | :auto-whitespace (:standard or :comma) 194 | or 195 | :auto-whitespace custom-whitespace-parser 196 | 197 | Clj only: 198 | :no-slurp true (disables use of slurp to auto-detect whether 199 | input is a URI. When using this option, input 200 | must be a grammar string or grammar map. Useful 201 | for platforms where slurp is slow or not available.)" 202 | [grammar-specification &{:as options}] 203 | {:pre [(contains? #{:abnf :ebnf nil} (get options :input-format)) 204 | (contains? #{:enlive :hiccup nil} (get options :output-format)) 205 | (let [ws-parser (get options :auto-whitespace)] 206 | (or (nil? ws-parser) 207 | (contains? standard-whitespace-parsers ws-parser) 208 | (and 209 | (map? ws-parser) 210 | (contains? ws-parser :grammar) 211 | (contains? 
ws-parser :start-production))))]} 212 | (let [input-format (get options :input-format *default-input-format*) 213 | build-parser 214 | (fn [spec output-format] 215 | (binding [cfg/*case-insensitive-literals* (:string-ci options :default)] 216 | (case input-format 217 | :abnf (abnf/build-parser spec output-format) 218 | :ebnf (cfg/build-parser spec output-format (:allow-namespaced-nts options false))))) 219 | output-format (get options :output-format *default-output-format*) 220 | start (get options :start nil) 221 | 222 | built-parser 223 | (cond 224 | (string? grammar-specification) 225 | (let [parser 226 | #?(:clj 227 | (if (get options :no-slurp) 228 | ;; if :no-slurp is set to true, string is a grammar spec 229 | (build-parser grammar-specification output-format) 230 | ;; otherwise, grammar-specification might be a URI, 231 | ;; let's slurp to see 232 | (try (let [spec (slurp grammar-specification)] 233 | (build-parser spec output-format)) 234 | (catch java.io.FileNotFoundException e 235 | (build-parser grammar-specification output-format)))) 236 | :cljs 237 | (build-parser grammar-specification output-format))] 238 | (if start (map->Parser (assoc parser :start-production start)) 239 | (map->Parser parser))) 240 | 241 | (map? grammar-specification) 242 | (let [parser 243 | (cfg/build-parser-from-combinators grammar-specification 244 | output-format 245 | start)] 246 | (map->Parser parser)) 247 | 248 | (vector? 
grammar-specification) 249 | (let [start (if start start (grammar-specification 0)) 250 | parser 251 | (cfg/build-parser-from-combinators (apply hash-map grammar-specification) 252 | output-format 253 | start)] 254 | (map->Parser parser)) 255 | 256 | :else 257 | #?(:clj 258 | (let [spec (slurp grammar-specification) 259 | parser (build-parser spec output-format)] 260 | (if start (map->Parser (assoc parser :start-production start)) 261 | (map->Parser parser))) 262 | :cljs 263 | (throw-illegal-argument-exception 264 | "Expected string, map, or vector as grammar specification, got " 265 | (pr-str grammar-specification))))] 266 | 267 | (let [auto-whitespace (get options :auto-whitespace) 268 | ; auto-whitespace is keyword, parser, or nil 269 | whitespace-parser (if (keyword? auto-whitespace) 270 | (get standard-whitespace-parsers auto-whitespace) 271 | auto-whitespace)] 272 | (if-let [{ws-grammar :grammar ws-start :start-production} whitespace-parser] 273 | (assoc built-parser :grammar 274 | (c/auto-whitespace (:grammar built-parser) (:start-production built-parser) 275 | ws-grammar ws-start)) 276 | built-parser)))) 277 | 278 | #?(:clj 279 | (defmacro defparser 280 | "Takes a string specification of a context-free grammar, 281 | or a string URI for a text file containing such a specification, 282 | or a map/vector of parser combinators, and sets a variable to a parser for that grammar. 283 | 284 | String specifications are processed at macro-time, not runtime, so this is an 285 | appealing alternative to (def _ (parser \"...\")) for ClojureScript users. 286 | 287 | Optional keyword arguments unique to `defparser`: 288 | - :instaparse.abnf/case-insensitive true" 289 | [name grammar & {:as opts}] 290 | ;; For each of the macro-time opts, ensure that they are the data 291 | ;; types we expect, not more complex quoted expressions. 292 | {:pre [(or (nil? (:input-format opts)) 293 | (keyword? (:input-format opts))) 294 | (or (nil? (:output-format opts)) 295 | (keyword? 
(defn get-failure
  "Returns the failure object carried by a failed parse result: the result
  itself when it is a failure, otherwise its metadata when that is a
  failure.  Returns nil for anything else."
  [result]
  (if (instance? gll/failure-type result)
    result
    (let [m (meta result)]
      (when (instance? gll/failure-type m)
        m))))
(defn index->line-column
  "Walks text from its start up to index and returns a map
  {:line l :column c} (both 1-based) for that position.  Each newline
  character passed increments the line and resets the column to 1."
  [index text]
  (loop [pos 0, ln 1, column 1]
    (if (= index pos)
      {:line ln :column column}
      (let [at-newline? (= \newline (get text pos))]
        (recur (inc pos)
               (if at-newline? (inc ln) ln)
               (if at-newline? 1 (inc column)))))))
(defn marker
  "Creates string with caret at nth position, 1-based
  and accounts for horizontal tabs which might change
  the alignment of the '^' to the error location.

  Every non-whitespace character of text in front of the caret is replaced
  by a space, while whitespace (such as tabs) is kept as-is.  Returns nil
  when text is nil or n is not an integer.

  If n points further than one past the end of text, the prefix is padded
  with spaces instead of asking `subs` for characters that do not exist
  (which throws on the JVM).  This can happen when the column was counted
  over characters that are not part of the extracted line text, such as
  the carriage return of a CRLF line ending."
  [text n]
  (when (and text (integer? n))
    (let [marker-text (clojure.string/replace text #"[^\s]" " ")]
      (if (<= n 1)
        "^"
        (let [prefix-len (dec n)
              available  (min prefix-len (count marker-text))
              padding    (apply str (repeat (- prefix-len available) \space))]
          (str (subs marker-text 0 available) padding \^))))))
(defn- advance-cursor
  "Returns a Cursor for position new-index of text, computed by scanning
  forward from cursor.  new-index must not be smaller than the cursor's
  index.  The same cursor object is returned when no movement is needed."
  [^Cursor cursor ^String text new-index]
  (let [new-index (int new-index)]
    (assert (<= (.-index cursor) new-index))
    (if-not (= (.-index cursor) new-index)
      (loop [pos (.-index cursor), ln (.-line cursor), col (.-column cursor)]
        (if (= pos new-index)
          (Cursor. pos ln col)
          (let [at-newline? (= (.charAt text pos) \newline)]
            ;; A newline starts the next line at column 1; any other
            ;; character just moves one column to the right.
            (recur (inc pos)
                   (if at-newline? (inc ln) ln)
                   (if at-newline? 1 (inc col))))))
      cursor)))
(defn- hiccup-add-line-col-spans
  "Returns the hiccup node parse-tree with start/end line and column
  entries merged into its metadata, after doing the same recursively for
  its children.  A node whose metadata lacks either
  :instaparse.gll/start-index or :instaparse.gll/end-index is returned
  unchanged.

  line-col-fn maps an index to a cursor with :line and :column; the one
  built by make-line-col-fn requires indices in increasing order, which is
  why the evaluation order below matters."
  [line-col-fn parse-tree]
  (let [m (meta parse-tree),
        start-index (:instaparse.gll/start-index m),
        end-index (:instaparse.gll/end-index m)]
    (if (and start-index end-index)
      (let [start-cursor (line-col-fn start-index),
            ;; doall forces the children to be processed now, before
            ;; line-col-fn is asked about this node's end index.
            children (doall (map (partial hiccup-add-line-col-spans line-col-fn) (next parse-tree))),
            end-cursor (line-col-fn end-index)]
        (with-meta
          ;; Rebuild the vector: the tag first, then the annotated children.
          (into [(first parse-tree)] children)
          (merge (meta parse-tree)
                 {:instaparse.gll/start-line (:line start-cursor)
                  :instaparse.gll/start-column (:column start-cursor)
                  :instaparse.gll/end-line (:line end-cursor)
                  :instaparse.gll/end-column (:column end-cursor)})))
      parse-tree)))
Lines and columns are 1-based." 74 | ([text parse-tree] (add-line-col-spans text 1 1 parse-tree)) 75 | ([text start-line start-column parse-tree] 76 | (let [line-col-fn (make-line-col-fn text start-line start-column)] 77 | (cond 78 | (nil? parse-tree) nil 79 | 80 | (and (map? parse-tree) (:tag parse-tree)) 81 | ; This is an enlive tree-seq 82 | (enlive-add-line-col-spans line-col-fn parse-tree) 83 | 84 | (and (vector? parse-tree) (keyword? (first parse-tree))) 85 | ; This is a hiccup tree-seq 86 | (hiccup-add-line-col-spans line-col-fn parse-tree) 87 | 88 | (and (sequential? parse-tree) (map? (first parse-tree)) (:tag (first parse-tree))) 89 | ; This is an enlive tree with hidden root tag 90 | (instaparse.transform/map-preserving-meta 91 | (partial enlive-add-line-col-spans line-col-fn) parse-tree) 92 | 93 | (and (sequential? parse-tree) (vector? (first parse-tree)) (keyword? (first (first parse-tree)))) 94 | ; This is a hiccup tree with hidden root tag 95 | (instaparse.transform/map-preserving-meta 96 | (partial hiccup-add-line-col-spans line-col-fn) parse-tree) 97 | 98 | (instance? instaparse.gll.Failure parse-tree) 99 | ; pass failures through unchanged 100 | parse-tree 101 | 102 | :else 103 | (throw-illegal-argument-exception 104 | "Invalid parse-tree, not recognized as either enlive or hiccup format."))))) 105 | -------------------------------------------------------------------------------- /src/instaparse/macros.clj: -------------------------------------------------------------------------------- 1 | (ns instaparse.macros) 2 | 3 | (defmacro defclone [here there] 4 | (if (contains? &env :locals) 5 | ;; cljs 6 | `(def ~here ~there) 7 | ;; clj 8 | `(do 9 | (def ~here ~there) 10 | (alter-meta! 
(defn regexp-replace
  "Maps a newline, backspace, form feed, carriage return or tab string to
  its printable escape sequence; any other input is returned unchanged."
  [s]
  (get {"\n" "\\n"
        "\b" "\\b"
        "\f" "\\f"
        "\r" "\\r"
        "\t" "\\t"}
       s
       s))
(defn rule->str
  "Takes a non-terminal symbol and a parser built from combinators,
  and returns a string for the rule.

  A rule whose reduction type is :raw (i.e. its tag is hidden) has its
  left-hand side wrapped in angle brackets.  The left-hand side is always
  rendered with non-terminal->str so that a namespaced non-terminal keeps
  its namespace in both cases; the hidden-tag case used to print only the
  name part, which no longer matched how references to it are printed."
  [non-terminal parser]
  (let [lhs (non-terminal->str non-terminal)]
    (str (if (= (-> parser :red :reduction-type) :raw)
           (str \< lhs \>)
           lhs)
         " = "
         (combinators->str parser))))
102 | [{grammar :grammar start :start-production}] 103 | (str/join \newline 104 | (cons 105 | ; Put starting production first 106 | (rule->str start (grammar start)) 107 | ; Then the others 108 | (for [[non-terminal parser] grammar 109 | :when (not= non-terminal start)] 110 | (rule->str non-terminal parser))))) 111 | -------------------------------------------------------------------------------- /src/instaparse/reduction.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.reduction 2 | (:require [instaparse.auto-flatten-seq :as afs] 3 | [instaparse.util :refer [throw-illegal-argument-exception]])) 4 | 5 | ;; utilities 6 | 7 | (defn singleton? [s] 8 | (and (seq s) (not (next s)))) 9 | 10 | ;; red is a reduction combinator for expert use only 11 | ;; because it is used internally to control the tree tags that 12 | ;; are displayed, so adding a different reduction would change 13 | ;; that behavior. 14 | 15 | (defn red [parser f] (assoc parser :red f)) 16 | 17 | ;; Flattening and reductions 18 | 19 | (def raw-non-terminal-reduction {:reduction-type :raw}) 20 | 21 | (defn HiccupNonTerminalReduction [key] 22 | {:reduction-type :hiccup :key key}) 23 | 24 | (defn EnliveNonTerminalReduction [key] 25 | {:reduction-type :enlive, :key key}) 26 | 27 | (def ^:constant reduction-types 28 | {:hiccup HiccupNonTerminalReduction 29 | :enlive EnliveNonTerminalReduction}) 30 | 31 | (def ^:constant node-builders 32 | ; A map of functions for building a node that only has one item 33 | ; These functions are used in total-parse mode to build failure nodes 34 | {:enlive (fn [tag item] {:tag tag :content (list item)}) 35 | :hiccup (fn [tag item] [tag item])}) 36 | 37 | (def standard-non-terminal-reduction :hiccup) 38 | 39 | (defn apply-reduction [f result] 40 | (case (:reduction-type f) 41 | :raw (afs/conj-flat afs/EMPTY result) 42 | :hiccup (afs/convert-afs-to-vec (afs/conj-flat (afs/auto-flatten-seq [(:key f)]) result)) 43 | :enlive 
44 | (let [content (afs/conj-flat afs/EMPTY result)] 45 | {:tag (:key f), :content (if (zero? (count content)) nil content)}) 46 | (f result))) 47 | 48 | (defn apply-standard-reductions 49 | ([grammar] (apply-standard-reductions standard-non-terminal-reduction grammar)) 50 | ([reduction-type grammar] 51 | (if-let [reduction (reduction-types reduction-type)] 52 | (into {} (for [[k v] grammar] 53 | (if (:red v) [k v] 54 | [k (assoc v :red (reduction k))]))) 55 | (throw-illegal-argument-exception 56 | "Invalid output format " reduction-type ". Use :enlive or :hiccup.")))) 57 | -------------------------------------------------------------------------------- /src/instaparse/repeat.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.repeat 2 | (:require [instaparse.gll :as gll 3 | #?@(:clj [:refer [profile]])] 4 | [instaparse.combinators-source :as c] 5 | [instaparse.auto-flatten-seq :as afs] 6 | [instaparse.viz :as viz] 7 | [instaparse.reduction :as red] 8 | [instaparse.failure :as fail]) 9 | #?(:cljs 10 | (:require-macros [instaparse.gll :refer [profile]]))) 11 | 12 | (defn empty-result? [result] 13 | (or (and (vector? result) (= (count result) 1)) 14 | (and (map? result) (contains? result :tag) (empty? (get result :content))) 15 | (empty? 
result))) 16 | 17 | (def ^:constant failure-signal (gll/->Failure nil nil)) 18 | 19 | (defn get-end 20 | (#?(:clj ^long [parse] 21 | :cljs ^number [parse]) 22 | (let [[start end] (viz/span parse)] 23 | (if end (long end) (count parse)))) 24 | (#?(:clj ^long [parse ^long index] 25 | :cljs ^number [parse ^number index]) 26 | (let [[start end] (viz/span parse)] 27 | (if end (long end) (+ index (count parse)))))) 28 | 29 | (defn parse-from-index [grammar initial-parser text segment index] 30 | (let [tramp (gll/make-tramp grammar text segment)] 31 | (gll/push-listener tramp [index initial-parser] (gll/TopListener tramp)) 32 | (gll/run tramp))) 33 | 34 | (defn select-parse 35 | "Returns either: 36 | [a-parse end-index a-list-of-valid-follow-up-parses] 37 | [a-parse end-index nil] (successfully reached end of text) 38 | nil (hit a dead-end with this strategy)" 39 | [grammar initial-parser text segment index parses] 40 | ;(clojure.pprint/pprint parses) 41 | (let [length (count text)] 42 | (loop [parses (seq parses)] 43 | (when parses 44 | (let [parse (first parses) 45 | [start end] (viz/span parse) 46 | end (if end end (+ index (count parse)))] 47 | (cond 48 | (= end length) [parse end nil] 49 | :else 50 | (if-let [follow-ups (seq (parse-from-index grammar initial-parser text segment end))] 51 | [parse end follow-ups] 52 | (recur (next parses))))))))) 53 | 54 | (defn repeat-parse-hiccup 55 | ([grammar initial-parser root-tag text segment] 56 | (repeat-parse-hiccup grammar initial-parser root-tag text segment 0)) 57 | ([grammar initial-parser root-tag text segment index] 58 | (let [length (count text) 59 | first-result (parse-from-index grammar initial-parser text segment index)] 60 | (loop [index (long index) 61 | parses (afs/auto-flatten-seq [root-tag]) 62 | 63 | [parse end follow-ups :as selection] 64 | (select-parse grammar initial-parser text segment index first-result)] 65 | (cond 66 | (nil? selection) failure-signal 67 | (= index end) failure-signal 68 | (nil? 
follow-ups) (gll/safe-with-meta 69 | (afs/convert-afs-to-vec 70 | (afs/conj-flat parses parse)) 71 | {:optimize :memory 72 | :instaparse.gll/start-index 0 73 | :instaparse.gll/end-index length}) 74 | :else (recur (long end) 75 | (afs/conj-flat parses parse) 76 | (select-parse grammar initial-parser text segment end follow-ups))))))) 77 | 78 | (defn repeat-parse-enlive 79 | ([grammar initial-parser root-tag text segment] 80 | (repeat-parse-enlive grammar initial-parser root-tag text segment 0)) 81 | ([grammar initial-parser root-tag text segment index] 82 | (let [length (count text) 83 | first-result (parse-from-index grammar initial-parser text segment index)] 84 | (loop [index (long index) 85 | parses afs/EMPTY 86 | 87 | [parse end follow-ups :as selection] 88 | (select-parse grammar initial-parser text segment index first-result)] 89 | (cond 90 | (nil? selection) failure-signal 91 | (= index end) failure-signal 92 | (nil? follow-ups) (gll/safe-with-meta 93 | {:tag root-tag 94 | :content (seq (afs/conj-flat parses parse))} 95 | {:optimize :memory 96 | :instaparse.gll/start-index 0 97 | :instaparse.gll/end-index length}) 98 | :else (recur (long end) 99 | (afs/conj-flat parses parse) 100 | (select-parse grammar initial-parser text segment end follow-ups))))))) 101 | 102 | (defn repeat-parse-no-tag 103 | ([grammar initial-parser text segment] 104 | (repeat-parse-no-tag grammar initial-parser text segment 0)) 105 | ([grammar initial-parser text segment index] 106 | (let [length (count text) 107 | first-result (parse-from-index grammar initial-parser text segment index)] 108 | (loop [index (long index) 109 | parses afs/EMPTY 110 | 111 | [parse end follow-ups :as selection] 112 | (select-parse grammar initial-parser text segment index first-result)] 113 | (cond 114 | (nil? selection) failure-signal 115 | (= index end) failure-signal 116 | (nil? 
follow-ups) (gll/safe-with-meta 117 | (afs/conj-flat parses parse) 118 | {:optimize :memory 119 | :instaparse.gll/start-index 0 120 | :instaparse.gll/end-index length}) 121 | :else (recur (long end) 122 | (afs/conj-flat parses parse) 123 | (select-parse grammar initial-parser text segment end follow-ups))))))) 124 | 125 | (defn repeat-parse 126 | ([grammar initial-parser output-format text] (repeat-parse-no-tag grammar initial-parser text (gll/text->segment text))) 127 | ([grammar initial-parser output-format root-tag text] 128 | {:pre [(#{:hiccup :enlive} output-format)]} 129 | (cond 130 | (= output-format :hiccup) 131 | (repeat-parse-hiccup grammar initial-parser root-tag text (gll/text->segment text)) 132 | (= output-format :enlive) 133 | (repeat-parse-enlive grammar initial-parser root-tag text (gll/text->segment text))))) 134 | 135 | (defn repeat-parse-with-header 136 | ([grammar header-parser repeating-parser output-format root-tag text] 137 | (let [segment (gll/text->segment text) 138 | length (count text) 139 | header-results (parse-from-index grammar header-parser text segment 0)] 140 | (if (or (empty? header-results) 141 | (:hide header-parser)) 142 | failure-signal 143 | (let [header-result (apply max-key get-end header-results) 144 | end (get-end header-result) 145 | repeat-result (repeat-parse-no-tag grammar (:parser repeating-parser) text segment end) 146 | span-meta {:optimize :memory 147 | :instaparse.gll/start-index 0 148 | :instaparse.gll/end-index length}] 149 | (if (or (instance? instaparse.gll.Failure repeat-result) 150 | (and (= (:tag repeating-parser) :star) 151 | (empty-result? 
repeat-result))) 152 | failure-signal 153 | (case output-format 154 | :enlive 155 | (gll/safe-with-meta 156 | {:tag root-tag 157 | :content 158 | (afs/conj-flat (afs/conj-flat afs/EMPTY header-result) repeat-result)} 159 | span-meta) 160 | :hiccup 161 | (gll/safe-with-meta 162 | (afs/convert-afs-to-vec 163 | (afs/conj-flat (afs/conj-flat (afs/auto-flatten-seq [root-tag]) 164 | header-result) 165 | repeat-result)) 166 | span-meta) 167 | (gll/safe-with-meta 168 | (afs/conj-flat (afs/conj-flat afs/EMPTY header-result) repeat-result) 169 | span-meta)))))))) 170 | 171 | (defn try-repeating-parse-strategy-with-header 172 | [grammar text start-production start-rule output-format] 173 | (gll/profile (gll/clear!)) 174 | (let [parsers (:parsers start-rule) 175 | repeating-parser (last parsers)] 176 | (if 177 | (not (and (= (:tag start-rule) :cat) 178 | (#{:star :plus} (:tag repeating-parser)) 179 | (not (:hide repeating-parser)) 180 | (not (:hide (:parser repeating-parser))))) 181 | failure-signal 182 | (let [header-parser (apply c/cat (butlast parsers))] 183 | (if (= (:red start-rule) red/raw-non-terminal-reduction) 184 | (repeat-parse-with-header grammar header-parser repeating-parser nil start-production text) 185 | (repeat-parse-with-header grammar header-parser repeating-parser output-format start-production text)))))) 186 | 187 | (defn try-repeating-parse-strategy 188 | [parser text start-production] 189 | (let [grammar (:grammar parser) 190 | output-format (:output-format parser) 191 | start-rule (get grammar start-production)] 192 | (profile (gll/clear!)) 193 | (cond 194 | (= (:hide start-rule) true) failure-signal 195 | (= (:red start-rule) red/raw-non-terminal-reduction) 196 | (cond 197 | (= (:tag start-rule) :star) 198 | (repeat-parse grammar (:parser start-rule) output-format text) 199 | (= (:tag start-rule) :plus) 200 | (let [result (repeat-parse grammar (:parser start-rule) output-format text)] 201 | (if (empty-result? 
result) 202 | failure-signal 203 | result)) 204 | :else (try-repeating-parse-strategy-with-header 205 | grammar text start-production start-rule output-format)) 206 | 207 | (= (:tag start-rule) :star) 208 | (repeat-parse grammar (:parser start-rule) output-format start-production text) 209 | (= (:tag start-rule) :plus) 210 | (let [result (repeat-parse grammar (:parser start-rule) output-format start-production text)] 211 | (if (empty-result? result) 212 | failure-signal 213 | result)) 214 | 215 | :else (try-repeating-parse-strategy-with-header 216 | grammar text start-production start-rule output-format)))) 217 | 218 | (defn used-memory-optimization? [tree] 219 | (= :memory (-> tree meta :optimize))) -------------------------------------------------------------------------------- /src/instaparse/transform.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.transform 2 | "Functions to transform parse trees" 3 | (:require [instaparse.gll] 4 | [instaparse.util :refer [throw-illegal-argument-exception]])) 5 | 6 | (defn map-preserving-meta [f l] 7 | (with-meta (map f l) (meta l))) 8 | 9 | (defn merge-meta 10 | "This variation of the merge-meta in gll does nothing if obj is not 11 | something that can have a metamap attached." 12 | [obj metamap] 13 | (if #?(:clj (instance? clojure.lang.IObj obj) 14 | :cljs (satisfies? IWithMeta obj)) 15 | (instaparse.gll/merge-meta obj metamap) 16 | obj)) 17 | 18 | (defn- enlive-transform 19 | [transform-map parse-tree] 20 | (let [transform (transform-map (:tag parse-tree))] 21 | (cond 22 | transform 23 | (merge-meta 24 | (apply transform (map (partial enlive-transform transform-map) 25 | (:content parse-tree))) 26 | (meta parse-tree)) 27 | (:tag parse-tree) 28 | (assoc parse-tree :content (map (partial enlive-transform transform-map) 29 | (:content parse-tree))) 30 | :else 31 | parse-tree))) 32 | 33 | (defn- hiccup-transform 34 | [transform-map parse-tree] 35 | (if (and (sequential? 
parse-tree) (seq parse-tree)) 36 | (if-let [transform (transform-map (first parse-tree))] 37 | (merge-meta 38 | (apply transform (map (partial hiccup-transform transform-map) 39 | (next parse-tree))) 40 | (meta parse-tree)) 41 | (with-meta 42 | (into [(first parse-tree)] 43 | (map (partial hiccup-transform transform-map) 44 | (next parse-tree))) 45 | (meta parse-tree))) 46 | parse-tree)) 47 | 48 | (defn transform 49 | "Takes a transform map and a parse tree (or seq of parse-trees). 50 | A transform map is a mapping from tags to 51 | functions that take a node's contents and return 52 | a replacement for the node, i.e., 53 | {:node-tag (fn [child1 child2 ...] node-replacement), 54 | :another-node-tag (fn [child1 child2 ...] node-replacement)}" 55 | [transform-map parse-tree] 56 | ; Detect what kind of tree this is 57 | (cond 58 | (string? parse-tree) 59 | ; This is a leaf of the tree that should pass through unchanged 60 | parse-tree 61 | 62 | (and (map? parse-tree) (:tag parse-tree)) 63 | ; This is an enlive tree-seq 64 | (enlive-transform transform-map parse-tree) 65 | 66 | (and (vector? parse-tree) (keyword? (first parse-tree))) 67 | ; This is a hiccup tree-seq 68 | (hiccup-transform transform-map parse-tree) 69 | 70 | (sequential? parse-tree) 71 | ; This is either a sequence of parse results, or a tree 72 | ; with a hidden root tag. 73 | (map-preserving-meta (partial transform transform-map) parse-tree) 74 | 75 | (instance? 
instaparse.gll.Failure parse-tree) 76 | ; pass failures through unchanged 77 | parse-tree 78 | 79 | :else 80 | (throw-illegal-argument-exception 81 | "Invalid parse-tree, not recognized as either enlive or hiccup format."))) 82 | -------------------------------------------------------------------------------- /src/instaparse/util.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.util) 2 | 3 | ;; Both appear to be called with several strings as separate 4 | ;; arguments: 5 | (defn throw-runtime-exception 6 | [& message] 7 | (let [^String text (apply str message)] 8 | (-> text 9 | #?(:clj RuntimeException.) 10 | throw))) 11 | 12 | (defn throw-illegal-argument-exception 13 | [& message] 14 | (let [^String text (apply str message)] 15 | (-> text 16 | #?(:clj IllegalArgumentException.) 17 | throw))) 18 | 19 | #?(:cljs 20 | (defn regexp-flags [re] 21 | (cond-> "" 22 | (.-ignoreCase re) (str "i") 23 | (.-multiline re) (str "m") 24 | (.-unicode re) (str "u")))) 25 | -------------------------------------------------------------------------------- /src/instaparse/viz.clj: -------------------------------------------------------------------------------- 1 | (ns instaparse.viz 2 | (:import java.io.IOException)) 3 | 4 | (defn span 5 | "Takes a subtree of the parse tree and returns a [start-index end-index] pair 6 | indicating the span of text parsed by this subtree. 7 | start-index is inclusive and end-index is exclusive, as is customary 8 | with substrings. 9 | Returns nil if no span metadata is attached." 10 | [tree] 11 | (let [m (meta tree) 12 | s (:instaparse.gll/start-index m) 13 | e (:instaparse.gll/end-index m)] 14 | (when (and s e) 15 | [s e]))) 16 | 17 | (def rhizome-newline 18 | ;; Prior to Rhizome 0.2.5., \ was not an escape character so \n needed extra escaping. 
19 | (when-let [escape-chars (try (ns-resolve (find-ns 'rhizome.dot) 'escapable-characters) 20 | (catch Exception e nil))] 21 | (if (= escape-chars "|{}\"") 22 | "\\n" 23 | "\n"))) 24 | 25 | 26 | (defn- hiccup-tree-viz 27 | "visualize instaparse hiccup output as a rhizome graph. Requires rhizome: https://github.com/ztellman/rhizome" 28 | [mytree options] 29 | (let [tree->image (resolve 'rhizome.viz/tree->image)] 30 | (tree->image sequential? rest mytree 31 | :node->descriptor (fn [n] {:label (if (sequential? n) 32 | (apply str (first n) 33 | (when (span n) 34 | [rhizome-newline (span n)])) 35 | (with-out-str (pr n)))}) 36 | :options options))) 37 | 38 | (defn- enlive-tree-viz 39 | "visualize enlive trees" 40 | [mytree options] 41 | (let [tree->image (resolve 'rhizome.viz/tree->image)] 42 | (tree->image (comp seq :content) :content mytree 43 | :node->descriptor (fn [n] 44 | {:label (if (and (map? n) (:tag n)) 45 | (apply str (:tag n) 46 | (when (span n) 47 | [rhizome-newline (span n)])) 48 | (with-out-str (pr n)))}) 49 | :options options))) 50 | 51 | (defn tree-type 52 | [tree] 53 | (cond 54 | (and (map? tree) (:tag tree)) :enlive 55 | (and (vector? tree) (keyword? (first tree))) :hiccup 56 | (empty? tree) :nil 57 | (seq? tree) :rootless 58 | :else :invalid)) 59 | 60 | (defn fake-root 61 | "Create a root for a rootless tree" 62 | [children] 63 | (case (tree-type (first children)) 64 | :enlive {:tag :hidden-root-tag 65 | :content children} 66 | :hiccup (into [:hidden-root-tag] 67 | children) 68 | :nil nil 69 | :invalid)) 70 | 71 | (defn tree-viz 72 | "Creates a graphviz visualization of the parse tree. 
73 | Optional keyword arguments: 74 | :output-file :buffered-image (return a java.awt.image.BufferedImage object) 75 | or 76 | :output-file output-file (will save the tree image to output-file) 77 | 78 | :options options (options passed along to rhizome) 79 | 80 | Important: This function will only work if you have added rhizome 81 | to your dependencies, and installed graphviz on your system. 82 | See https://github.com/ztellman/rhizome for more information." 83 | [tree & {output-file :output-file options :options}] 84 | {:pre [(not= (tree-type tree) :invalid)]} 85 | (let [ttype (tree-type tree)] 86 | (if (= ttype :rootless) 87 | (tree-viz (fake-root tree) :output-file output-file :options options) 88 | (do 89 | (try 90 | (require 'rhizome.viz) 91 | (catch Exception e 92 | (throw (UnsupportedOperationException. 93 | "\n\nVisualization of parse trees is only supported if you have rhizome among your project dependencies and graphviz installed on your computer.\n 94 | Visit https://github.com/ztellman/rhizome to find out the version info to put in your project.clj file and for links to the graphviz installer.")))) 95 | (let [image 96 | (try 97 | (case (tree-type tree) 98 | :enlive (enlive-tree-viz tree options) 99 | (:hiccup :nil) (hiccup-tree-viz tree options)) 100 | (catch IOException e 101 | (throw (UnsupportedOperationException. 102 | "\n\nYou appear to have rhizome in your dependencies, but have not installed GraphViz on your system. 
103 | \nSee https://github.com/ztellman/rhizome for more information.\n")))) 104 | save-image (resolve 'rhizome.viz/save-image) 105 | view-image (resolve 'rhizome.viz/view-image)] 106 | (cond 107 | (= output-file :buffered-image) image 108 | output-file (save-image image output-file) 109 | :else (view-image image))))))) -------------------------------------------------------------------------------- /src/instaparse/viz.cljs: -------------------------------------------------------------------------------- 1 | (ns instaparse.viz) 2 | 3 | (defn span 4 | "Takes a subtree of the parse tree and returns a [start-index end-index] pair 5 | indicating the span of text parsed by this subtree. 6 | start-index is inclusive and end-index is exclusive, as is customary 7 | with substrings. 8 | Returns nil if no span metadata is attached." 9 | [tree] 10 | (let [m (meta tree) 11 | s (:instaparse.gll/start-index m) 12 | e (:instaparse.gll/end-index m)] 13 | (when (and s e) 14 | [s e]))) 15 | 16 | -------------------------------------------------------------------------------- /test/data/abnf_uri.txt: -------------------------------------------------------------------------------- 1 | URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 2 | 3 | hier-part = "//" authority path-abempty 4 | / path-absolute 5 | / path-rootless 6 | / path-empty 7 | 8 | URI-reference = URI / relative-ref 9 | 10 | absolute-URI = scheme ":" hier-part [ "?" query ] 11 | 12 | relative-ref = relative-part [ "?" query ] [ "#" fragment ] 13 | 14 | relative-part = "//" authority path-abempty 15 | / path-absolute 16 | / path-noscheme 17 | / path-empty 18 | 19 | scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / ".") 20 | 21 | authority = [ userinfo "@" ] host [ ":" port ] 22 | userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 23 | host = IP-literal / IPv4address / reg-name 24 | port = *DIGIT 25 | 26 | IP-literal = "[" ( IPv6address / IPvFuture ) "]" 27 | 28 | IPvFuture = "v" 1*HEXDIG "." 
1*( unreserved / sub-delims / ":" ) 29 | 30 | IPv6address = 6( h16 ":" ) ls32 31 | / "::" 5( h16 ":" ) ls32 32 | / [ h16 ] "::" 4( h16 ":" ) ls32 33 | / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 34 | / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 35 | / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 36 | / [ *4( h16 ":" ) h16 ] "::" ls32 37 | / [ *5( h16 ":" ) h16 ] "::" h16 38 | / [ *6( h16 ":" ) h16 ] "::" 39 | 40 | h16 = 1*4HEXDIG 41 | ls32 = ( h16 ":" h16 ) / IPv4address 42 | IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet 43 | 44 | dec-octet = DIGIT ; 0-9 45 | / %x31-39 DIGIT ; 10-99 46 | / "1" 2DIGIT ; 100-199 47 | / "2" %x30-34 DIGIT ; 200-249 48 | / "25" %x30-35 ; 250-255 49 | 50 | reg-name = *( unreserved / pct-encoded / sub-delims ) 51 | 52 | path = path-abempty ; begins with "/" or is empty 53 | / path-absolute ; begins with "/" but not "//" 54 | / path-noscheme ; begins with a non-colon segment 55 | / path-rootless ; begins with a segment 56 | / path-empty ; zero characters 57 | 58 | path-abempty = *( "/" segment ) 59 | path-absolute = "/" [ segment-nz *( "/" segment ) ] 60 | path-noscheme = segment-nz-nc *( "/" segment ) 61 | path-rootless = segment-nz *( "/" segment ) 62 | path-empty = 0pchar 63 | 64 | segment = *pchar 65 | segment-nz = 1*pchar 66 | segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) 67 | ; non-zero-length segment without any colon ":" 68 | 69 | pchar = unreserved / pct-encoded / sub-delims / ":" / "@" 70 | 71 | query = *( pchar / "/" / "?" ) 72 | 73 | fragment = *( pchar / "/" / "?" ) 74 | 75 | pct-encoded = "%" HEXDIG HEXDIG 76 | 77 | unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 78 | reserved = gen-delims / sub-delims 79 | gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 80 | sub-delims = "!" 
/ "$" / "&" / "'" / "(" / ")" 81 | / "*" / "+" / "," / ";" / "=" ; comment -------------------------------------------------------------------------------- /test/data/defparser_grammar.txt: -------------------------------------------------------------------------------- 1 | S = #'a' | 'b' 2 | -------------------------------------------------------------------------------- /test/data/phone_uri.txt: -------------------------------------------------------------------------------- 1 | telephone-uri = "tel:" telephone-subscriber 2 | telephone-subscriber = global-number / local-number 3 | global-number = global-number-digits *par 4 | local-number = local-number-digits *par context *par 5 | par = parameter / extension / isdn-subaddress 6 | isdn-subaddress = ";isub=" 1*uric 7 | extension = ";ext=" 1*phonedigit 8 | context = ";phone-context=" descriptor 9 | descriptor = domainname / global-number-digits 10 | global-number-digits = "+" *phonedigit DIGIT *phonedigit 11 | local-number-digits = 12 | *phonedigit-hex (HEXDIG / "*" / "#") *phonedigit-hex 13 | domainname = *( domainlabel "." ) toplabel [ "." ] 14 | domainlabel = alphanum 15 | / alphanum *( alphanum / "-" ) alphanum 16 | toplabel = ALPHA / ALPHA *( alphanum / "-" ) alphanum 17 | parameter = ";" pname ["=" pvalue ] 18 | pname = 1*( alphanum / "-" ) 19 | pvalue = 1*paramchar 20 | paramchar = param-unreserved / unreserved / pct-encoded 21 | unreserved = alphanum / mark 22 | mark = "-" / "_" / "." / "!" / "~" / "*" / 23 | "'" / "(" / ")" 24 | pct-encoded = "%" HEXDIG HEXDIG 25 | param-unreserved = "[" / "]" / "/" / ":" / "&" / "+" / "$" 26 | phonedigit = DIGIT / [ visual-separator ] 27 | phonedigit-hex = HEXDIG / "*" / "#" / [ visual-separator ] 28 | visual-separator = "-" / "." / "(" / ")" 29 | alphanum = ALPHA / DIGIT 30 | reserved = ";" / "/" / "?" 
/ ":" / "@" / "&" / 31 | "=" / "+" / "$" / "," 32 | uric = reserved / unreserved / pct-encoded -------------------------------------------------------------------------------- /test/instaparse/abnf_test.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.abnf-test 2 | (:require 3 | #?(:clj [instaparse.core :refer [parser parses defparser]] 4 | :cljs [instaparse.core :refer [parser parses] :refer-macros [defparser]]) 5 | [instaparse.core-test :refer [parsers-similar?]] 6 | [instaparse.combinators :refer [ebnf abnf]] 7 | #?(:clj [clojure.test :refer [deftest are is]] 8 | :cljs [cljs.test]) 9 | #?(:clj [clojure.java.io :as io])) 10 | #?(:cljs (:require-macros 11 | [cljs.test :refer [is are deftest]]))) 12 | 13 | (defparser uri-parser 14 | "test/data/abnf_uri.txt" 15 | :input-format :abnf 16 | :instaparse.abnf/case-insensitive true) 17 | 18 | (defparser phone-uri-parser 19 | "test/data/phone_uri.txt" 20 | :input-format :abnf 21 | :instaparse.abnf/case-insensitive true) 22 | 23 | #?(:clj 24 | (deftest slurping-test 25 | (is (parsers-similar? 
26 | uri-parser 27 | (binding [instaparse.abnf/*case-insensitive* true] 28 | (parser 29 | "test/data/abnf_uri.txt" 30 | :input-format :abnf 31 | :instaparse.abnf/case-insensitive true)) 32 | (binding [instaparse.abnf/*case-insensitive* true] 33 | (parser 34 | (io/resource "data/abnf_uri.txt") 35 | :input-format :abnf 36 | :instaparse.abnf/case-insensitive true)) 37 | (binding [instaparse.abnf/*case-insensitive* true] 38 | (parser 39 | (slurp "test/data/abnf_uri.txt") 40 | :input-format :abnf 41 | :instaparse.abnf/case-insensitive true))) 42 | "Verify that defparser, auto-slurp from string filename, 43 | auto-slurp from resource (URL), and manual slurp all return 44 | equivalent parsers."))) 45 | 46 | (deftest abnf-uri 47 | (are [x y] (= x y) 48 | (uri-parser "http://www.google.com") 49 | [:URI [:SCHEME [:ALPHA "h"] [:ALPHA "t"] [:ALPHA "t"] [:ALPHA "p"]] ":" [:HIER-PART "//" [:AUTHORITY [:HOST [:REG-NAME [:UNRESERVED [:ALPHA "w"]] [:UNRESERVED [:ALPHA "w"]] [:UNRESERVED [:ALPHA "w"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "g"]] [:UNRESERVED [:ALPHA "o"]] [:UNRESERVED [:ALPHA "o"]] [:UNRESERVED [:ALPHA "g"]] [:UNRESERVED [:ALPHA "l"]] [:UNRESERVED [:ALPHA "e"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "c"]] [:UNRESERVED [:ALPHA "o"]] [:UNRESERVED [:ALPHA "m"]]]]] [:PATH-ABEMPTY]]] 50 | 51 | (uri-parser "ftp://ftp.is.co.za/rfc/rfc1808.txt") 52 | [:URI [:SCHEME [:ALPHA "f"] [:ALPHA "t"] [:ALPHA "p"]] ":" [:HIER-PART "//" [:AUTHORITY [:HOST [:REG-NAME [:UNRESERVED [:ALPHA "f"]] [:UNRESERVED [:ALPHA "t"]] [:UNRESERVED [:ALPHA "p"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "i"]] [:UNRESERVED [:ALPHA "s"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "c"]] [:UNRESERVED [:ALPHA "o"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "z"]] [:UNRESERVED [:ALPHA "a"]]]]] [:PATH-ABEMPTY "/" [:SEGMENT [:PCHAR [:UNRESERVED [:ALPHA "r"]]] [:PCHAR [:UNRESERVED [:ALPHA "f"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]]] "/" [:SEGMENT [:PCHAR [:UNRESERVED [:ALPHA "r"]]] [:PCHAR [:UNRESERVED [:ALPHA "f"]]] 
[:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED [:DIGIT "8"]]] [:PCHAR [:UNRESERVED [:DIGIT "0"]]] [:PCHAR [:UNRESERVED [:DIGIT "8"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]] [:PCHAR [:UNRESERVED [:ALPHA "x"]]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]]]]]] 53 | 54 | (uri-parser "mailto:John.Doe@example.com") 55 | [:URI [:SCHEME [:ALPHA "m"] [:ALPHA "a"] [:ALPHA "i"] [:ALPHA "l"] [:ALPHA "t"] [:ALPHA "o"]] ":" [:HIER-PART [:PATH-ROOTLESS [:SEGMENT-NZ [:PCHAR [:UNRESERVED [:ALPHA "J"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "h"]]] [:PCHAR [:UNRESERVED [:ALPHA "n"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:ALPHA "D"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR "@"] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED [:ALPHA "x"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "m"]]] [:PCHAR [:UNRESERVED [:ALPHA "p"]]] [:PCHAR [:UNRESERVED [:ALPHA "l"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "m"]]]]]]] 56 | 57 | (uri-parser "tel:+1-816-555-1212") 58 | [:URI [:SCHEME [:ALPHA "t"] [:ALPHA "e"] [:ALPHA "l"]] ":" [:HIER-PART [:PATH-ROOTLESS [:SEGMENT-NZ [:PCHAR [:SUB-DELIMS "+"]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED "-"]] [:PCHAR [:UNRESERVED [:DIGIT "8"]]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED [:DIGIT "6"]]] [:PCHAR [:UNRESERVED "-"]] [:PCHAR [:UNRESERVED [:DIGIT "5"]]] [:PCHAR [:UNRESERVED [:DIGIT "5"]]] [:PCHAR [:UNRESERVED [:DIGIT "5"]]] [:PCHAR [:UNRESERVED "-"]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED [:DIGIT "2"]]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED [:DIGIT "2"]]]]]]] 59 | 60 | (uri-parser "telnet://192.0.2.16:80/") 61 | [:URI [:SCHEME [:ALPHA "t"] [:ALPHA "e"] [:ALPHA "l"] [:ALPHA "n"] [:ALPHA "e"] [:ALPHA 
"t"]] ":" [:HIER-PART "//" [:AUTHORITY [:HOST [:REG-NAME [:UNRESERVED [:DIGIT "1"]] [:UNRESERVED [:DIGIT "9"]] [:UNRESERVED [:DIGIT "2"]] [:UNRESERVED "."] [:UNRESERVED [:DIGIT "0"]] [:UNRESERVED "."] [:UNRESERVED [:DIGIT "2"]] [:UNRESERVED "."] [:UNRESERVED [:DIGIT "1"]] [:UNRESERVED [:DIGIT "6"]]]] ":" [:PORT [:DIGIT "8"] [:DIGIT "0"]]] [:PATH-ABEMPTY "/" [:SEGMENT]]]] 62 | 63 | (uri-parser "urn:oasis:names:specification:docbook:dtd:xml:4.1.2") 64 | [:URI [:SCHEME [:ALPHA "u"] [:ALPHA "r"] [:ALPHA "n"]] ":" [:HIER-PART [:PATH-ROOTLESS [:SEGMENT-NZ [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR [:UNRESERVED [:ALPHA "i"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "n"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "m"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR [:UNRESERVED [:ALPHA "p"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "i"]]] [:PCHAR [:UNRESERVED [:ALPHA "f"]]] [:PCHAR [:UNRESERVED [:ALPHA "i"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]] [:PCHAR [:UNRESERVED [:ALPHA "i"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "n"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "d"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "b"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "k"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "d"]]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]] [:PCHAR [:UNRESERVED [:ALPHA "d"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "x"]]] [:PCHAR [:UNRESERVED [:ALPHA "m"]]] [:PCHAR [:UNRESERVED [:ALPHA "l"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:DIGIT "4"]]] [:PCHAR [:UNRESERVED "."]] 
[:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:DIGIT "2"]]]]]]] 65 | 66 | (uri-parser "ldap://[2001:db8::7]/c=GB?objectClass?one") 67 | [:URI [:SCHEME [:ALPHA "l"] [:ALPHA "d"] [:ALPHA "a"] [:ALPHA "p"]] ":" [:HIER-PART "//" [:AUTHORITY [:HOST [:IP-LITERAL "[" [:IPV6ADDRESS [:H16 [:HEXDIG "2"] [:HEXDIG "0"] [:HEXDIG "0"] [:HEXDIG "1"]] ":" [:H16 [:HEXDIG "d"] [:HEXDIG "b"] [:HEXDIG "8"]] "::" [:H16 [:HEXDIG "7"]]] "]"]]] [:PATH-ABEMPTY "/" [:SEGMENT [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:SUB-DELIMS "="]] [:PCHAR [:UNRESERVED [:ALPHA "G"]]] [:PCHAR [:UNRESERVED [:ALPHA "B"]]]]]] "?" [:QUERY [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "b"]]] [:PCHAR [:UNRESERVED [:ALPHA "j"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]] [:PCHAR [:UNRESERVED [:ALPHA "C"]]] [:PCHAR [:UNRESERVED [:ALPHA "l"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] "?" 
[:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "n"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]]]])) 68 | 69 | (deftest phone-uri 70 | (are [x y] (= x y) 71 | (phone-uri-parser "tel:+1-201-555-0123") 72 | [:TELEPHONE-URI 73 | "tel:" 74 | [:TELEPHONE-SUBSCRIBER 75 | [:GLOBAL-NUMBER 76 | [:GLOBAL-NUMBER-DIGITS 77 | "+" 78 | [:DIGIT "1"] 79 | [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]] 80 | [:PHONEDIGIT [:DIGIT "2"]] 81 | [:PHONEDIGIT [:DIGIT "0"]] 82 | [:PHONEDIGIT [:DIGIT "1"]] 83 | [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]] 84 | [:PHONEDIGIT [:DIGIT "5"]] 85 | [:PHONEDIGIT [:DIGIT "5"]] 86 | [:PHONEDIGIT [:DIGIT "5"]] 87 | [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]] 88 | [:PHONEDIGIT [:DIGIT "0"]] 89 | [:PHONEDIGIT [:DIGIT "1"]] 90 | [:PHONEDIGIT [:DIGIT "2"]] 91 | [:PHONEDIGIT [:DIGIT "3"]]]]]])) 92 | 93 | (def abnf-german 94 | "Testing the ABNF regular expressions" 95 | (parser 96 | " 97 | ; a parser for the German programming language 98 | ; http://esolangs.org/wiki/German 99 | 100 | S = <*1space> (A / B) *( (A / B)) <*1space> 101 | A = #'BEER' 102 | B = #'SCHNITZEL' 103 | space = #'\\s+' 104 | " :input-format :abnf)) 105 | 106 | (deftest german 107 | (are [x y] (= x y) 108 | (abnf-german " BEER SCHNITZEL BEER BEER SCHNITZEL SCHNITZEL 109 | BEER BEER BEER ") 110 | [:S 111 | [:A "BEER"] 112 | [:B "SCHNITZEL"] 113 | [:A "BEER"] 114 | [:A "BEER"] 115 | [:B "SCHNITZEL"] 116 | [:B "SCHNITZEL"] 117 | [:A "BEER"] 118 | [:A "BEER"] 119 | [:A "BEER"]])) 120 | 121 | (def abnf-abc 122 | "Trying the \"equal amount of A's, B's, and C's\" parser in ABNF, 123 | to test the lookahead" 124 | (parser 125 | "S = &(A 'c') 1*'a' B 126 | A = 'a' [A] 'b' 127 | = 'b' [B] 'c'" 128 | :input-format :abnf)) 129 | 130 | (deftest abc 131 | (are [x y] (= x y) 132 | (abnf-abc "aaaabbbbcccc") 133 | [:S "a" "a" "a" "a" "b" "b" "b" "b" "c" "c" "c" "c"] 134 | (abnf-abc "aaabbbc" :total true) 135 | [:S "a" "a" "a" "b" "b" "b" "c" [:instaparse/failure ""] [:instaparse/failure ""]])) 136 | 137 | (def reps 
138 | "Testing the different kinds of repetitions" 139 | (parser 140 | "S = A B C D E FG 141 | A = *'a' 142 | B = 2*'b' 143 | C = *2'c' 144 | D = 2'd' 145 | E = 2*4'e' 146 | FG = 2('f' 'g')" 147 | :input-format :abnf)) 148 | 149 | (deftest rep-test 150 | (are [x] (not (instance? instaparse.gll.Failure x)) 151 | (reps "aabbccddeefgfg") 152 | (reps "bbbbbbddeeeefgfg") 153 | (reps "bbcddeefgfg"))) 154 | 155 | (deftest rep-test-errors 156 | (are [x] (instance? instaparse.gll.Failure x) 157 | (reps "") 158 | (reps "bccddeefgfg") 159 | (reps "aaaabbbbcccddeefgfg") 160 | (reps "aabbccddeefg") 161 | (reps "aabbccddeeffgg"))) 162 | 163 | (def regex-chars 164 | "Testing %d42-91. The boundary chars are \"*\" and \"[\", which normally aren't allowed in a regex." 165 | (parser 166 | "S = %d42-91" 167 | :input-format :abnf)) 168 | 169 | (deftest regex-char-test 170 | (doseq [i (range 1 (inc 100)) 171 | :let [c (char i)]] 172 | (if (<= 42 i 91) 173 | (is (not (instance? instaparse.gll.Failure (regex-chars (str c))))) 174 | (is (instance? instaparse.gll.Failure (regex-chars (str c))))))) 175 | 176 | (deftest unicode-test 177 | (let [poop "\uD83D\uDCA9"] ; U+1F4A9 PILE OF POO 178 | (let [parser1 (parser "S = %x1F4A9" 179 | :input-format :abnf)] 180 | (are [x y] (= x y) 181 | (parses parser1 poop) [[:S poop]]) 182 | (are [x] (instance? instaparse.gll.Failure x) 183 | (parser1 (str poop poop)) 184 | (parser1 (str (first poop))) 185 | ;; shouldn't work on the surrogate characters individually 186 | (parser1 (str (second poop))))) 187 | (let [parser2 (parser "S = %x1F4A8-1F4A9" 188 | :input-format :abnf)] 189 | (are [x y] (= x y) 190 | (parses parser2 poop) [[:S poop]]) 191 | (are [x] (instance? 
instaparse.gll.Failure x) 192 | (parser2 (str poop poop)) 193 | (parser2 (str (first poop))) 194 | (parser2 (str (second poop))))) 195 | (let [parser3 (parser "S = %x1F4A9.1F4A9.1F4A9" 196 | :input-format :abnf)] 197 | (are [x y] (= x y) 198 | (parses parser3 (str poop poop poop)) [[:S poop poop poop]]) 199 | (are [x] (instance? instaparse.gll.Failure x) 200 | (parser3 (str poop)))) 201 | ;; it would be cool if EBNF supported unicode in a parser spec 202 | ;; (ABNF doesn't allow that though) 203 | (let [parser4 (parser (str "S = '" poop "'*"))] 204 | (are [x y] (= x y) 205 | (parses parser4 (str poop poop poop)) [[:S poop poop poop]]) 206 | (are [x] (instance? instaparse.gll.Failure x) 207 | (parser4 (str (first poop))) 208 | (parser4 (str (second poop))) 209 | (parser4 (str poop poop (first poop))))))) 210 | 211 | (deftest abnf-combinator-test 212 | (let [p (parser (merge 213 | {:S (abnf "A / B")} 214 | (abnf " = 1*'a'") 215 | {:B (abnf "'='")}) 216 | :start :S)] 217 | (are [x y] (= y x) 218 | (p "aAaa") 219 | [:S "a" "a" "a" "a"] 220 | (p "=") 221 | [:S [:B "="]]))) 222 | 223 | (defn output-matches? 224 | [expected actual] 225 | (if (= :fail expected) 226 | (instance? instaparse.gll.Failure actual) 227 | (= expected actual))) 228 | 229 | (deftest string-ci-test 230 | (are [p input expected] (output-matches? 
expected (p input)) 231 | (parser "S = 'Hi'" :input-format :ebnf) "Hi" [:S "Hi"] 232 | (parser "S = 'Hi'" :input-format :ebnf) "hi" :fail 233 | (parser "S = 'Hi'" :input-format :ebnf :string-ci false) "Hi" [:S "Hi"] 234 | (parser "S = 'Hi'" :input-format :ebnf :string-ci false) "hi" :fail 235 | (parser "S = 'Hi'" :input-format :ebnf :string-ci true) "Hi" [:S "Hi"] 236 | (parser "S = 'Hi'" :input-format :ebnf :string-ci true) "hi" [:S "Hi"] 237 | 238 | (parser [:S (ebnf "'Hi'")]) "Hi" [:S "Hi"] 239 | (parser [:S (ebnf "'Hi'")]) "hi" :fail 240 | (parser [:S (ebnf "'Hi'" :string-ci true)]) "Hi" [:S "Hi"] 241 | (parser [:S (ebnf "'Hi'" :string-ci true)]) "hi" [:S "Hi"] 242 | 243 | (parser "S = 'Hi'" :input-format :abnf) "Hi" [:S "Hi"] 244 | (parser "S = 'Hi'" :input-format :abnf) "hi" [:S "Hi"] 245 | (parser "S = 'Hi'" :input-format :abnf :string-ci false) "Hi" [:S "Hi"] 246 | (parser "S = 'Hi'" :input-format :abnf :string-ci false) "hi" :fail 247 | (parser "S = 'Hi'" :input-format :abnf :string-ci true) "Hi" [:S "Hi"] 248 | (parser "S = 'Hi'" :input-format :abnf :string-ci true) "hi" [:S "Hi"] 249 | 250 | (parser [:S (abnf "'Hi'")]) "Hi" [:S "Hi"] 251 | (parser [:S (abnf "'Hi'")]) "hi" [:S "Hi"] 252 | (parser [:S (abnf "'Hi'" :string-ci false)]) "Hi" [:S "Hi"] 253 | (parser [:S (abnf "'Hi'" :string-ci false)]) "hi" :fail)) 254 | -------------------------------------------------------------------------------- /test/instaparse/auto_flatten_seq_test.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.auto-flatten-seq-test 2 | (:require 3 | [instaparse.auto-flatten-seq :refer [auto-flatten-seq conj-flat convert-afs-to-vec]] 4 | #?(:clj [clojure.test :refer [deftest are is]] 5 | :cljs [cljs.test])) 6 | #?(:cljs (:require-macros [cljs.test :refer [deftest are is]]))) 7 | 8 | (defn rand-mutation [v iv] 9 | (let [rnd (int (rand-int 3))] 10 | (case rnd 11 | 0 (let [n (rand-int 50000)] [(conj v n) (conj-flat iv n) rnd]) 12 | 
2 (let [i (rand-int 64), r (auto-flatten-seq (repeat i (rand-int 50000)))] 13 | [(into v r) (conj-flat iv r) rnd]) 14 | 1 (let [i (rand-int 64), r (auto-flatten-seq (repeat i (rand-int 50000)))] 15 | [(into (vec (seq r)) v) (conj-flat r iv) rnd])))) 16 | 17 | (deftest rand-incremental-vector-test 18 | (is (= (conj-flat (auto-flatten-seq [:s]) nil) [:s])) 19 | (loop [v (vec (range 100)) iv (auto-flatten-seq (range 100)) n 50 loops 20] 20 | (let [[v iv rnd] (rand-mutation v iv)] 21 | (cond 22 | (zero? loops) nil 23 | (zero? n) (recur (vec (range 100)) (auto-flatten-seq (range 100)) 50 (dec loops)) 24 | :else 25 | (do 26 | (is (= (count v) (count iv))) 27 | (is (= v iv)) 28 | (is (= iv v)) 29 | (is (= (hash v) (hash iv))) 30 | (is (= (seq v) (seq iv))) 31 | (is (= v (convert-afs-to-vec iv))) 32 | (is (= (convert-afs-to-vec iv) v)) 33 | (is (= (type (empty (convert-afs-to-vec iv))) (type v))) 34 | (is (= (hash v) (hash (convert-afs-to-vec iv)))) 35 | (recur v iv (dec n) loops)))))) 36 | 37 | (defn depth [v] 38 | (cond 39 | (empty? v) 0 40 | (sequential? 
(first v)) (max (inc (depth (first v))) (depth (rest v))) 41 | :else (depth (rest v)))) 42 | -------------------------------------------------------------------------------- /test/instaparse/defparser_test.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.defparser-test 2 | (:require 3 | #?(:clj [clojure.test :as t :refer [deftest are is]] 4 | :cljs [cljs.test :as t :refer-macros [deftest are is]]) 5 | #?(:clj [instaparse.core :as insta :refer [defparser]] 6 | :cljs [instaparse.core :as insta :refer-macros [defparser]]) 7 | [instaparse.combinators :as c] 8 | [instaparse.core-test :refer [parsers-similar?]])) 9 | 10 | (defparser p1 "S = #'a' | 'b'") 11 | 12 | (defparser p2 [:S (c/alt (c/regexp #"a") (c/string "b"))]) 13 | 14 | (defparser p3 {:S (c/alt (c/regexp #"a") (c/string "b"))} 15 | :start :S) 16 | 17 | (defparser p4 "test/data/defparser_grammar.txt") 18 | 19 | (def p5 (insta/parser "S = #'a' | 'b'")) 20 | 21 | (deftest defparser-test-standard 22 | (is (parsers-similar? p1 p2 p3 p4 p5)) 23 | 24 | #?(:clj 25 | (are [x y] (thrown? y (eval (quote x))) 26 | (instaparse.core/defparser p6 "test/data/parser_not_found.txt") 27 | Exception 28 | 29 | (instaparse.core/defparser p7 "test/data/defparser_grammar.txt" :no-slurp true) 30 | Exception))) 31 | 32 | ;; We catch up front when someone tries to do something overly 33 | ;; complicated in the macro-time options 34 | ;; [test removed due to a bug in Clojure 1.10 which prevents capture of 35 | ;; errors triggered during macroexpansion] 36 | ;; (instaparse.core/defparser p8 "S = #'a' | 'b'" :input-format (do :ebnf)) 37 | ;; AssertionError 38 | 39 | 40 | 41 | 42 | (defparser a1 "S = #'a' / 'b'" 43 | :input-format :abnf) 44 | 45 | (def a2 (insta/parser "S = #'a' / 'b'" :input-format :abnf)) 46 | 47 | (defparser a3 "S = #'a' | 'b'" 48 | :input-format :ebnf, :string-ci true) 49 | 50 | (deftest defparser-test-abnf 51 | (is (parsers-similar? 
a1 a2 a3))) 52 | 53 | 54 | 55 | (defparser ws1 "S = ( 'a')+ ; = #'\\s+'") 56 | 57 | (defparser ws2 "S = 'a'+" :auto-whitespace :standard) 58 | 59 | (defparser ws3 "S = 'a'+" :auto-whitespace (insta/parser "whitespace = #'\\s+'")) 60 | 61 | (let [ws (insta/parser "whitespace = #'\\s+'")] 62 | (defparser ws4 "S = 'a'+" :auto-whitespace ws)) 63 | 64 | (def ws5 (insta/parser "S = 'a'+" :auto-whitespace :standard)) 65 | 66 | (defparser ws6 " = #'\\s+'; S = ( 'a')+ " 67 | :start :S) 68 | 69 | (deftest defparser-test-auto-whitespace 70 | (is (parsers-similar? ws1 ws2 ws3 ws4 ws5 ws6))) 71 | 72 | 73 | 74 | (defparser e1 "S = 'a'+" :output-format :enlive) 75 | 76 | (def e2 (insta/parser "S = 'a'+" :output-format :enlive)) 77 | 78 | (deftest defparser-test-enlive 79 | (is (parsers-similar? e1 e2)) 80 | (is (= (e2 "a") (e1 "a")))) 81 | -------------------------------------------------------------------------------- /test/instaparse/failure_test.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.failure-test 2 | (:require 3 | [instaparse.failure :refer [marker pprint-failure]] 4 | #?(:clj [clojure.test :refer [deftest are is]] 5 | :cljs [cljs.test])) 6 | #?(:cljs (:require-macros 7 | [cljs.test :refer [is are deftest]]))) 8 | 9 | ;; Tests new marker function by counting the number of tabs in both text 10 | ;; and marker lines to make sure the count is the same. 11 | (deftest marker-test 12 | (let [text "\t\ti'm a sample error line with tabs." 13 | n 16 14 | marker (marker text n)] 15 | (let [text-tabs (count (filter #{"\t"} text)) 16 | marker-tabs (count (filter #{"\t"} marker))] 17 | (is (= text-tabs marker-tabs))))) 18 | 19 | (deftest pprint-failure-test 20 | (let [request {:line 3 21 | :column 16 22 | :text "\t\ti'm a sample error line with tabs." 
23 | :reason [{:tag :string :expecting "A"}]} 24 | nl (println-str)] 25 | (is (= (with-out-str (pprint-failure request)) 26 | (str "Parse error at line 3, column 16:" nl 27 | (:text request) nl 28 | "\t\t ^" nl 29 | "Expected:" nl 30 | "\"A\"" nl))))) 31 | -------------------------------------------------------------------------------- /test/instaparse/namespaced_nts_test.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.namespaced-nts-test 2 | (:require 3 | #?(:clj [clojure.test :refer [deftest is]] 4 | :cljs [cljs.test :as t]) 5 | #?(:clj [instaparse.core :as insta] 6 | :cljs [instaparse.core :as insta])) 7 | #?(:cljs (:require-macros 8 | [cljs.test :refer [is deftest]]))) 9 | 10 | (def namespaced-nts-parser 11 | (insta/parser 12 | "S = token ( token)* 13 | ws = #'\\s+' 14 | keyword/hello = 'hello' 15 | keyword.namespaced/bye = 'bye' 16 | = keyword/hello | keyword.namespaced/bye 17 | identifier = #'\\S+' 18 | token = keyword / identifier" 19 | :allow-namespaced-nts true)) 20 | 21 | (deftest parser 22 | (is (= (namespaced-nts-parser "bye") [:S [:token [:keyword.namespaced/bye "bye"]]]))) 23 | 24 | (deftest round-trip 25 | (let [grammar (prn-str namespaced-nts-parser)] 26 | (is (= grammar 27 | (prn-str (insta/parser 28 | grammar 29 | :allow-namespaced-nts true)))))) 30 | 31 | -------------------------------------------------------------------------------- /test/instaparse/repeat_test.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.repeat-test 2 | (:require #?(:clj [clojure.test :refer [deftest are]] 3 | :cljs [cljs.test :as t]) 4 | [instaparse.core :as insta] 5 | [instaparse.repeat :as repeat]) 6 | #?(:cljs (:require-macros [cljs.test :refer [are deftest]]))) 7 | 8 | (def user-parser 9 | "content = user-block* 10 | user-block = (user before-section after-section < blank-line* >) 11 | user = prefix separator number separator name newline 12 | 
before-section = < before > lines error-line* 13 | after-section = < after > lines 14 | = < 'BEFORE' newline > 15 | = < 'AFTER' newline > 16 | = < 'User' > 17 | = line* 18 | = <#'\\s+'> subscription newline 19 | = ( '(no dates!)' | 'FIXUP!' ) newline 20 | blank-line = #'\\s*\n' 21 | name = #'.*' 22 | (*WIP why infinite loop?*) 23 | subscription = !prefix #'.*?(?=\\s+-)' < separator > date 24 | date = #'.*' 25 | = <'\n'> 26 | = <#'[ -]+'> 27 | number = #'[0-9]+' 28 | 29 | ") 30 | 31 | (deftest memory-optimize-test 32 | (are [grammar text optimize?] 33 | (let [parser (insta/parser grammar) 34 | parser-enlive (insta/parser grammar :output-format :enlive) 35 | tree1 (parser text) 36 | tree2 (parser text :optimize :memory) 37 | tree3 (parser-enlive text) 38 | tree4 (parser-enlive text :optimize :memory)] 39 | (and (= tree1 tree2) (= tree3 tree4) 40 | (= optimize? (repeat/used-memory-optimization? tree2)) 41 | (= optimize? (repeat/used-memory-optimization? tree4)))) 42 | 43 | ;user-parser text true 44 | "S = 'ab'*" "ababab" true 45 | "S = 'ab'*" "abababd" false 46 | "S = 'ab'*" "" false 47 | " = 'ab'*" "ababab" true 48 | " = 'ab'*" "abababd" false 49 | " = 'ab'*" "" false 50 | "S = <'ab'>*" "ababab" false 51 | "S = <'ab'*>" "ababab" false 52 | 53 | "S = A*; A = 'a'" "aaaa" true 54 | "S = A*; A = 'a'" "aaaad" false 55 | "S = A*; A = 'a'" "" false 56 | " = A*; A = 'a'" "aaaa" true 57 | " = A*; A = 'a'" "aaaad" false 58 | " = A*; A = 'a'" "" false 59 | "S = *; A = 'a'" "aaaa" false 60 | "S = ; A = 'a'" "aaaa" false 61 | 62 | "S = 'ab'+" "ababab" true 63 | "S = 'ab'+" "abababd" false 64 | "S = 'ab'+" "" false 65 | " = 'ab'+" "ababab" true 66 | " = 'ab'+" "abababd" false 67 | " = 'ab'+" "" false 68 | "S = <'ab'>+" "ababab" false 69 | "S = <'ab'+>" "ababab" false 70 | 71 | "S = A+; A = 'a'" "aaaa" true 72 | "S = A+; A = 'a'" "aaaad" false 73 | "S = A+; A = 'a'" "" false 74 | " = A+; A = 'a'" "aaaa" true 75 | " = A+; A = 'a'" "aaaad" false 76 | " = A+; A = 'a'" "" false 77 | "S 
= +; A = 'a'" "aaaa" false 78 | "S = ; A = 'a'" "aaaa" false 79 | 80 | "S = 'c' 'ab'*" "cababab" true 81 | "S = 'c' 'ab'*" "cabababd" false 82 | "S = 'c' 'ab'*" "dababab" false 83 | "S = 'c' 'ab'*" "c" false 84 | "S = 'c' 'ab'*" "" false 85 | " = 'c' 'ab'*" "cababab" true 86 | " = 'c' 'ab'*" "cabababd" false 87 | " = 'c' 'ab'*" "dcababab" false 88 | " = 'c' 'ab'*" "c" false 89 | " = 'c' 'ab'*" "" false 90 | "S = 'c' <'ab'>*" "cababab" false 91 | "S = 'c' <'ab'*>" "cababab" false 92 | "S = <'c'> <'ab'>*" "cababab" false 93 | "S = <'c'> 'ab'*" "cababab" false 94 | 95 | "S = 'c' A*; A = 'a'" "caaaa" true 96 | "S = 'c' A*; A = 'a'" "caaaad" false 97 | "S = 'c' A*; A = 'a'" "dcaaaad" false 98 | "S = 'c' A*; A = 'a'" "c" false 99 | " = 'c' A*; A = 'a'" "caaaa" true 100 | " = 'c' A*; A = 'a'" "caaaad" false 101 | " = 'c' A*; A = 'a'" "daaaad" false 102 | " = 'c' A*; A = 'a'" "c" false 103 | "S = 'c' *; A = 'a'" "caaaa" false 104 | "S = 'c' ; A = 'a'" "caaaa" false 105 | 106 | "S = 'c' 'ab'+" "cababab" true 107 | "S = 'c' 'ab'+" "dababab" false 108 | "S = 'c' 'ab'+" "abababd" false 109 | "S = 'c' 'ab'+" "c" false 110 | "S = 'c' 'ab'+" "" false 111 | " = 'c' 'ab'+" "cababab" true 112 | " = 'c' 'ab'+" "cabababd" false 113 | " = 'c' 'ab'+" "dcababab" false 114 | " = 'c' 'ab'+" "c" false 115 | " = 'c' 'ab'+" "" false 116 | "S = 'c' <'ab'>+" "cababab" false 117 | "S = 'c' <'ab'+>" "cababab" false 118 | "S = <'c'> <'ab'>+" "cababab" false 119 | "S = <'c'> 'ab'+" "cababab" false 120 | 121 | "S = 'c' A+; A = 'a'" "caaaa" true 122 | "S = 'c' A+; A = 'a'" "caaaad" false 123 | "S = 'c' A+; A = 'a'" "dcaaaa" false 124 | "S = 'c' A+; A = 'a'" "c" false 125 | " = 'c' A+; A = 'a'" "caaaa" true 126 | " = 'c' A+; A = 'a'" "caaaad" false 127 | " = 'c' A+; A = 'a'" "dcaaaa" false 128 | " = 'c' A+; A = 'a'" "c" false 129 | "S = 'c' +; A = 'a'" "caaaa" false 130 | "S = 'c' ; A = 'a'" "caaaa" false 131 | 132 | "S = C A+; C = 'c'; A = 'a'" "caaaa" true 133 | "S = C A+; C = 'c'; = 'a'" "caaaa" 
true 134 | "S = C A+; = 'c'; A = 'a'" "caaaa" true 135 | "S = C A+; = 'c'; = 'a'" "caaaa" true 136 | "S = A+; C = 'c'; A = 'a'" "caaaa" false 137 | "S = C A+; C = 'c'; A = 'a'" "caaaad" false 138 | "S = C A+; C = 'c'; A = 'a'" "dcaaaa" false 139 | "S = C A+; C = 'c'; A = 'a'" "c" false 140 | " = C A+; C = 'c'; A = 'a'" "caaaa" true 141 | " = A+; C = 'c'; A = 'a'" "caaaa" false 142 | " = C A+; C = 'c'; A = 'a'" "caaaad" false 143 | " = C A+; C = 'c'; A = 'a'" "dcaaaa" false 144 | " = C A+; C = 'c'; A = 'a'" "c" false 145 | "S = C +; C = 'c'; A = 'a'" "caaaa" false 146 | "S = C ; C = 'c'; A = 'a'" "caaaa" false 147 | )) 148 | -------------------------------------------------------------------------------- /test/instaparse/specs.cljc: -------------------------------------------------------------------------------- 1 | (ns instaparse.specs) 2 | 3 | (def cfg1 "S = 'a'") 4 | (def cfg2 5 | "S = X 6 | X = Y 7 | Y = Z") 8 | (def cfg3 9 | "S = X | Y 10 | Y = A Z 11 | Z = 'a'") 12 | (def cfg4 13 | "S := A B | C 14 | C := (A | B) C") 15 | (def cfg5 16 | "S=A?") 17 | (def cfg6 18 | "S =(A | B)?") 19 | (def cfg7 20 | "S = A, B?, (C C)*, D+, E") 21 | (def cfg8 22 | " = (C | D)") 23 | (def cfg9 24 | "S = A, &B") 25 | (def cfg10 26 | "S = &B A") 27 | (def cfg11 28 | "S = &B+ A") 29 | (def cfg12 30 | "S = !B A") 31 | (def cfg13 32 | "S = !&B A") 33 | (def cfg15 34 | "S = 'a' S | Epsilon; 35 | C = 'b'. 
36 | D = A") 37 | (def cfg16 38 | "S = 'a' / 'b'") 39 | (def cfg17 40 | "S = 'a' / 'b' | 'c'") 41 | (def cfg18 42 | "S = 'a' | 'b' / 'c'") 43 | (def cfg19 44 | "S = A ('a' | 'b')+ 45 | A = !B 46 | B = 'a' !'b'") 47 | (def cfg20 48 | "(* A comment about this grammar 49 | *split* (across) lines *) 50 | (* And some (* nested *) comments *) 51 | S = (A*) 52 | A = 'a'") 53 | -------------------------------------------------------------------------------- /test/instaparse/viz_test.clj: -------------------------------------------------------------------------------- 1 | (ns instaparse.viz-test 2 | (:require instaparse.core) 3 | (:use instaparse.viz)) 4 | 5 | (def make-tree-e 6 | "simple tree parser" 7 | (instaparse.core/parser "tree: node* 8 | node: leaf | <'('> node (<'('> node <')'>)* node* <')'> 9 | leaf: #'a+' 10 | " :output-format :enlive)) 11 | 12 | (def make-tree-h 13 | "simple tree parser" 14 | (instaparse.core/parser "tree: node* 15 | node: leaf | <'('> node (<'('> node <')'>)* node* <')'> 16 | leaf: #'a+' 17 | " :output-format :hiccup)) 18 | 19 | (def make-tree-se 20 | "simple tree parser" 21 | (instaparse.core/parser ": node* 22 | node: leaf | <'('> node (<'('> node <')'>)* node* <')'> 23 | leaf: #'a+' 24 | " :output-format :enlive)) 25 | 26 | (def make-tree-sh 27 | "simple tree parser" 28 | (instaparse.core/parser ": node* 29 | node: leaf | <'('> node (<'('> node <')'>)* node* <')'> 30 | leaf: #'a+' 31 | " :output-format :hiccup)) 32 | 33 | (defn view-test-trees [t] 34 | (tree-viz (make-tree-e "((a)((a)))(a)")) 35 | (Thread/sleep t) 36 | (tree-viz (make-tree-h "((a)((a)))(a)")) 37 | (Thread/sleep t) 38 | (tree-viz (make-tree-sh "((a)((a)))(a)")) 39 | (Thread/sleep t) 40 | (tree-viz (make-tree-se "((a)((a)))(a)")) 41 | (Thread/sleep t) 42 | (tree-viz (make-tree-e "")) 43 | (Thread/sleep t) 44 | (tree-viz (make-tree-se ""))) 45 | --------------------------------------------------------------------------------