├── .circleci
    └── config.yml
├── .clj-kondo
    └── config.edn
├── .gitattributes
├── .gitignore
├── CHANGES.md
├── LICENSE
├── README.md
├── docs
    ├── ABNF.md
    ├── ExperimentalFeatures.md
    ├── Performance.md
    └── Tracing.md
├── images
    └── vizexample1.png
├── project.clj
├── resources
    └── clj-kondo.exports
    │   └── instaparse
    │       └── config.edn
├── runner
    └── cljs
    │   └── runner
    │       └── runner.cljs
├── src
    └── instaparse
    │   ├── abnf.cljc
    │   ├── auto_flatten_seq.cljc
    │   ├── cfg.cljc
    │   ├── combinators.cljc
    │   ├── combinators_source.cljc
    │   ├── core.cljc
    │   ├── failure.cljc
    │   ├── gll.cljc
    │   ├── line_col.cljc
    │   ├── macros.clj
    │   ├── print.cljc
    │   ├── reduction.cljc
    │   ├── repeat.cljc
    │   ├── transform.cljc
    │   ├── util.cljc
    │   ├── viz.clj
    │   └── viz.cljs
└── test
    ├── data
        ├── abnf_uri.txt
        ├── defparser_grammar.txt
        └── phone_uri.txt
    └── instaparse
        ├── abnf_test.cljc
        ├── auto_flatten_seq_test.cljc
        ├── core_test.cljc
        ├── defparser_test.cljc
        ├── failure_test.cljc
        ├── grammars.cljc
        ├── namespaced_nts_test.cljc
        ├── repeat_test.cljc
        ├── specs.cljc
        └── viz_test.clj


/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | workflows:
 4 |   version: 2
 5 |   build:
 6 |     jobs:
 7 |       - test-clj
 8 |       - test-cljs
 9 | 
10 | jobs:
11 |   test-clj:
12 |     working_directory: ~/project
13 |     docker:
14 |       - image: circleci/clojure:lein-2.8.1
15 |     steps:
16 |       - checkout
17 |       - run: lein check
18 |       - run: lein test-all
19 |   test-cljs:
20 |     working_directory: ~/project
21 |     docker:
22 |       - image: circleci/clojure:lein-2.8.1-node
23 |     steps:
24 |       - checkout
25 |       - run: lein test-cljs-all


--------------------------------------------------------------------------------
/.clj-kondo/config.edn:
--------------------------------------------------------------------------------
1 | {:config-paths ["../resources/clj-kondo.exports/instaparse"]}
2 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text auto
2 | *.clj text
3 | *.md text
4 | *.png binary


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /target
 2 | /lib
 3 | /classes
 4 | /checkouts
 5 | /bin
 6 | /out
 7 | deps.edn
 8 | .cpcache
 9 | .project
10 | .classpath
11 | pom.xml
12 | deps.edn
13 | *.jar
14 | *.class
15 | .lein-deps-sum
16 | .lein-failures
17 | .lein-plugins
18 | ideas.txt
19 | benchmarks.txt
20 | todo.txt
21 | /.settings
22 | .nrepl-port
23 | .lein-repl-history
24 | *~
25 | *#*#
26 | .cljs_node_repl/
27 | .idea/
28 | *.iml
29 | *.asc
30 | .nrepl-history
31 | /.clj-kondo
32 | !/.clj-kondo/config.edn
33 | 


--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
  1 | # Instaparse Change Log
  2 | 
  3 | ## 1.5.0
  4 | 
  5 | ### Enhancements
  6 | 
  7 | * instaparse.core/parser now accepts an optional keyword argument `:allow-namespaced-nts true` which allows namespaced non-terminals in the parser's grammar, thus building a parser that will tag the output with the corresponding namespaced keywords.
  8 | 
  9 | ## 1.4.14
 10 | 
 11 | ### Enhancements
 12 | 
 13 | * Now leverages clojurescript's implicit sugar for :require-macros, :include-macros, and :refer-macros in namespace declaration.  Thanks to sumbach for the pull request!
 14 | 
 15 | ## 1.4.13
 16 | 
 17 | ### Enhancements
 18 | 
 19 | * Added clj-kondo resource file. Thanks to toniz4 for the pull request!
 20 | * Added new arity to add-line-and-column-info-to-metadata that supports starting-line and starting-column. Thanks to mainej for the pull request!
 21 | 
 22 | ## 1.4.12
 23 | 
 24 | ### Bugfixes
 25 | 
 26 | * Instaparse error messages weren't pointing the caret at the right character when the text had tab characters. Thanks to ema-fox and seltzer1717 for the pull request.
 27 | 
 28 | ## 1.4.11
 29 | 
 30 | ### Bugfixes
 31 | 
 32 | * Fixed problem where `:start` option wasn't being respected when grammar was provided as a file.
 33 | 
 34 | ## 1.4.10
 35 | 
 36 | ### Enhancements
 37 | 
 38 | * Change to remove warning caused by latest version of Clojurescript, which warned about use of private var from tools.reader.
 39 | 
 40 | * Added type hints to support native compilation under Graal.
 41 | 
 42 | * Removed test case broken by Clojure 1.10.
 43 | 
 44 | ## 1.4.9
 45 | 
 46 | ### Enhancements
 47 | 
 48 | * ABNF parsers' string case-insensitivity can now be disabled by setting `:string-ci false`.
 49 | 
 50 | * `ebnf` and `abnf` combinators now support an optional `:string-ci` argument, which overrides the default case-insensitivity behavior for that input format.
 51 | 
 52 | ### Bugfixes
 53 | 
 54 | * Fixed handling of the case-insensitive regexp flag on Clojurescript.
 55 | 
 56 | * Better handling for when rhizome is present in compilation environment, but not at runtime.
 57 | 
 58 | ## 1.4.8
 59 | 
 60 | ### Updates
 61 | 
 62 | * Update to support Clojurescript 1.9.854 and above, due to a breaking change in Clojurescript to use tools.reader.
 63 | 
 64 | ## 1.4.7
 65 | 
 66 | ### Enhancements
 67 | 
 68 | * `visualize` now supports `:output-file :buffered-image`, which returns a java.awt.image.BufferedImage object.
 69 | 
 70 | ### Bugfixes
 71 | 
 72 | * Fixed problem where `visualize` with `:output-file` didn't work on rootless trees.
 73 | 
 74 | ## 1.4.6
 75 | 
 76 | ### Performance improvements
 77 | 
 78 | * Better performance for ABNF grammars in Clojurescript.
 79 | 
 80 | ## 1.4.5
 81 | 
 82 | ### Bugfixes
 83 | 
 84 | * Fixed regression in 1.4.4 involving parsers based off of URIs.
 85 | 
 86 | * defparser now supports the full range of relevant parser options.
 87 | 
 88 | ## 1.4.4
 89 | 
 90 | ### Enhancements
 91 | 
 92 | * Instaparse is now cross-platform compatible between Clojure and Clojurescript.
 93 | 
 94 | ### Features
 95 | 
 96 | * defparser - builds parser at compile time
 97 | 
 98 | ## 1.4.3
 99 | 
100 | ### Bugfixes
101 | 
102 | * Fixed bug with insta/transform on tree with hidden root tag and strings at the top level of the tree.
103 | 
104 | ## 1.4.2
105 | 
106 | ### Bugfixes
107 | 
108 | * Fixed problem with counted repetitions in ABNF.
109 | 
110 | ## 1.4.1
111 | 
112 | ### Features
113 | 
114 | * New function `add-line-and-column-info-to-metadata` in the instaparse.core namespace.
115 | 
116 | ### Enhancements
117 | 
118 | * Added new combinators for unicode character ranges, for better portability to Clojurescript.
119 | 
120 | ### Bugfixes
121 | 
122 | * Improved compatibility with boot, which allows having multiple versions of Clojure on the classpath, by making change to string-reader which needs to
123 | be aware of what version of Clojure it is running due to a breaking change in Clojure 1.7.
124 | 
125 | * Fixed bug with the way failure messages were printed in certain cases.
126 | 
127 | ## 1.4.0
128 | 
129 | ### Bugfixes
130 | 
131 | * In 1.3.6, parsing of any CharSequence was introduced, however, the error messages
132 |   for failed parses weren't printing properly.  This has been fixed.    
133 | 
134 | * 1.4.0 uses a more robust algorithm for handling nested negative lookaheads, in
135 |   response to a bug report where the existing mechanism produced incorrect parses
136 |   (in addition to the correct parse) for a very unusual case.   
137 | 
138 | ### Enhancements
139 | 
140 | * New support for tracing the steps the parser goes through.  Call your parser with
141 |   the optional flag `:trace true`.  The first time you use this flag, it triggers a
142 |   recompilation of the code with additional tracing and profiling steps.
143 |   To restore the code to its non-instrumented form, call `(insta/disable-tracing!)`.
144 | 
145 | ## 1.3.6
146 | 
147 | ### Enhancements
148 | 
149 | * Modified for compatibility with Clojure 1.7.0-alpha6
150 | * Instaparse now can parse anything supporting the CharSequence interface, not just strings.
151 |   Specifically, this allows instaparse to operate on StringBuilder objects. 
152 | 
153 | ## 1.3.5
154 | 
155 | ### Bugfixes
156 | 
157 | * Fixed bug with `transform` on hiccup data structures with numbers or other atomic data as leaves.
158 | 
159 | * Fixed bug with character concatenation support in ABNF grammar
160 | 
161 | ### Enhancements
162 | 
163 | * Added support for Unicode characters to ABNF.
164 | 
165 | ## 1.3.4
166 | 
167 | ### Enhancements
168 | 
169 | * Modified for compatibility with Clojure 1.7.0-alpha2.
170 | 
171 | ## 1.3.3
172 | 
173 | ### Enhancements
174 | 
175 | Made two changes to make it possible to use instaparse on Google App Engine.
176 | 
177 | * Removed dependency on javax.swing.text.Segment class.
178 | * Added `:no-slurp true` keyword option to `insta/parser` to disable URI slurping behavior, since GAE does not support slurp.
179 | 
180 | ## 1.3.2
181 | 
182 | ### Bugfixes
183 | 
184 | * Regular expressions on empty strings weren't properly returning a failure.
185 | 
186 | ## 1.3.1
187 | 
188 | ### Enhancements
189 | 
190 | * Updated tests to use Clojure 1.6.0's final release.
191 | * Added `:ci-string true` flag to `insta/parser`.
192 | 
193 | ## 1.3.0
194 | 
195 | ### Compatibility with Clojure 1.6
196 | 
197 | ## 1.2.16
198 | 
199 | ### Bugfixes
200 | 
201 | * Calling `empty` on a FlattenOnDemandVector now returns [].
202 | 
203 | ## 1.2.15
204 | 
205 | ### Enhancements
206 | 
207 | * :auto-whitespace can now take the keyword :standard or :comma to access one of the predefined whitespace parsers.
208 | 
209 | ### Bugfixes
210 | 
211 | * Fixed newline problem visualizing parse trees on Linux.
212 | * Fixed problem with visualizing rootless trees.
213 | 
214 | ## 1.2.11
215 | 
216 | ### Minor enhancements
217 | 
218 | * Further refinements to the way ordered choice interacts with epsilon parsers.
219 | 
220 | ## 1.2.10
221 | 
222 | ### Bugfixes
223 | 
224 | * Fixed bug introduced by 1.2.9 affecting ordered choice.
225 | 
226 | ## 1.2.9
227 | 
228 | ### Bugfixes
229 | 
230 | * Fixed bug where ordered choice was ignoring epsilon parser.
231 | 
232 | ## 1.2.8
233 | 
234 | ### Bugfixes
235 | 
236 | * Fixed bug introduced by 1.2.7, affecting printing of grammars with regexes.
237 | 
238 | ### Enhancements
239 | 
240 | * Parser printing format now includes <> hidden information and tags.
241 | 
242 | ## 1.2.7
243 | 
244 | ### Bugfixes
245 | 
246 | * Fixed bug when regular expression contains | character.
247 | 
248 | ## 1.2.6
249 | 
250 | ### Bugfixes
251 | 
252 | * Changed pre-condition assertion for auto-whitespace option which was causing a problem with "lein jar".
253 | 
254 | ## 1.2.5
255 | 
256 | ### Bugfixes
257 | 
258 | * Improved handling of unusual characters in ABNF grammars.
259 | 
260 | ## 1.2.4
261 | 
262 | ### Bugfixes
263 | 
264 | * When parsing in :total mode with :enlive as the output format, changed the content of failure node from vector to list to match the rest of the enlive output.
265 | 
266 | ## 1.2.3
267 | 
268 | ### Bugfixes
269 | 
270 | * Fixed problem when epsilon was the only thing in a nonterminal, e.g., "S = epsilon"
271 | 
272 | ### Features
273 | 
274 | * Added experimental `:auto-whitespace` feature.  See the [Experimental Features Document](docs/ExperimentalFeatures.md) for more details.
275 | 
276 | ## 1.2.2
277 | 
278 | ### Bugfixes
279 | 
280 | * Fixed reflection warning.
281 | 
282 | ## 1.2.1
283 | 
284 | ### Bugfixes
285 | 
286 | * I had accidentally left a dependency on tools.trace in the repeat.clj file, used while I was debugging that namespace.  Removed it.
287 | 
288 | ## 1.2.0
289 | 
290 | ### New Features
291 | 
292 | * `span` function returns substring indexes into the parsed text for a portion of the parse tree.
293 | * `visualize` function draws the parse tree, using rhizome and graphviz if installed.
294 | * `:optimize :memory` flag that, for suitable parsers, will perform the parsing in discrete chunks, using less memory.
295 | * New parsing flag to undo the effect of the <> hide notation.
296 |     + `(my-parser text :unhide :tags)` - reveals tags, i.e., `<>` applied on the left-hand sides of rules. 
297 |     + `(my-parser text :unhide :content)` - reveals content hidden on the right-hand side of rules with `<>`
298 |     + `(my-parser text :unhide :all)` - reveals both tags and content.
299 | 
300 | ### Notable Performance Improvements
301 | 
302 | * Dramatic performance improvement (quadratic time reduced to linear) when repetition parsers (+ or *) operate on text whose parse tree contains a large number of repetitions.
303 | * Performance improvement for regular expressions. 
304 | 
305 | ### Minor Enhancements
306 | 
307 | * Added more support to IncrementalVector for a wider variety of vector operations, including subvec, nth, and vec.
308 | 
309 | ## 1.1.0
310 | 
311 | ### Breaking Changes
312 | 
313 | * When you run a parser in "total" mode, the failure node is no longer tagged with `:failure`, but instead is tagged with `:instaparse/failure`.
314 | 
315 | ### New Features
316 | 
317 | * Comments now supported in CFGs.  Use (* and *) notation.
318 | * Added `ebnf` combinator to the `instaparse/combinators` namespace.  This new combinator converts string specifications to the combinator-built equivalent.  See combinator section of the updated tutorial for details.
319 | * ABNF: can now create a parser from a specification using `:input-format :abnf` for ABNF parser syntax.
320 |     * New combinators related to ABNF:
321 |         1. `abnf` -- converts ABNF string fragments to combinators.
322 |         2. `string-ci` -- case-insensitive strings.
323 |         3. `rep` -- between m and n repetitions.
324 |     * New core function related to ABNF:
325 |         `set-default-input-format!` -- initially defaults to :ebnf
326 | 
327 | ### Minor Enhancements
328 | 
329 | * Added comments to regexes used by the parser that processes the context-free grammar syntax, improving the readability of error messages if you have a faulty grammar specification.
330 | 
331 | ### Bug Fixes
332 | 
333 | * Backslashes in front of quotation mark were escaping the quotation mark, even if the backslash itself was escaped.
334 | * Unescaped double-quote marks weren't properly handled, e.g., (parser "A = '\"'").
335 | * Nullable Plus: ((parser "S = ('a'?)+") "") previously returned a failure, now returns [:S]
336 | * Fixed problem with failure reporting that would occur if parse failed on an input that ended with a newline character.
337 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC
  2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
  3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
  4 | 
  5 | 1. DEFINITIONS
  6 | 
  7 | "Contribution" means:
  8 | 
  9 | a) in the case of the initial Contributor, the initial code and
 10 | documentation distributed under this Agreement, and
 11 | 
 12 | b) in the case of each subsequent Contributor:
 13 | 
 14 | i) changes to the Program, and
 15 | 
 16 | ii) additions to the Program;
 17 | 
 18 | where such changes and/or additions to the Program originate from and are
 19 | distributed by that particular Contributor. A Contribution 'originates' from
 20 | a Contributor if it was added to the Program by such Contributor itself or
 21 | anyone acting on such Contributor's behalf. Contributions do not include
 22 | additions to the Program which: (i) are separate modules of software
 23 | distributed in conjunction with the Program under their own license
 24 | agreement, and (ii) are not derivative works of the Program.
 25 | 
 26 | "Contributor" means any person or entity that distributes the Program.
 27 | 
 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are
 29 | necessarily infringed by the use or sale of its Contribution alone or when
 30 | combined with the Program.
 31 | 
 32 | "Program" means the Contributions distributed in accordance with this
 33 | Agreement.
 34 | 
 35 | "Recipient" means anyone who receives the Program under this Agreement,
 36 | including all Contributors.
 37 | 
 38 | 2. GRANT OF RIGHTS
 39 | 
 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants
 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to
 42 | reproduce, prepare derivative works of, publicly display, publicly perform,
 43 | distribute and sublicense the Contribution of such Contributor, if any, and
 44 | such derivative works, in source code and object code form.
 45 | 
 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants
 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under
 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise
 49 | transfer the Contribution of such Contributor, if any, in source code and
 50 | object code form.  This patent license shall apply to the combination of the
 51 | Contribution and the Program if, at the time the Contribution is added by the
 52 | Contributor, such addition of the Contribution causes such combination to be
 53 | covered by the Licensed Patents. The patent license shall not apply to any
 54 | other combinations which include the Contribution. No hardware per se is
 55 | licensed hereunder.
 56 | 
 57 | c) Recipient understands that although each Contributor grants the licenses
 58 | to its Contributions set forth herein, no assurances are provided by any
 59 | Contributor that the Program does not infringe the patent or other
 60 | intellectual property rights of any other entity. Each Contributor disclaims
 61 | any liability to Recipient for claims brought by any other entity based on
 62 | infringement of intellectual property rights or otherwise. As a condition to
 63 | exercising the rights and licenses granted hereunder, each Recipient hereby
 64 | assumes sole responsibility to secure any other intellectual property rights
 65 | needed, if any. For example, if a third party patent license is required to
 66 | allow Recipient to distribute the Program, it is Recipient's responsibility
 67 | to acquire that license before distributing the Program.
 68 | 
 69 | d) Each Contributor represents that to its knowledge it has sufficient
 70 | copyright rights in its Contribution, if any, to grant the copyright license
 71 | set forth in this Agreement.
 72 | 
 73 | 3. REQUIREMENTS
 74 | 
 75 | A Contributor may choose to distribute the Program in object code form under
 76 | its own license agreement, provided that:
 77 | 
 78 | a) it complies with the terms and conditions of this Agreement; and
 79 | 
 80 | b) its license agreement:
 81 | 
 82 | i) effectively disclaims on behalf of all Contributors all warranties and
 83 | conditions, express and implied, including warranties or conditions of title
 84 | and non-infringement, and implied warranties or conditions of merchantability
 85 | and fitness for a particular purpose;
 86 | 
 87 | ii) effectively excludes on behalf of all Contributors all liability for
 88 | damages, including direct, indirect, special, incidental and consequential
 89 | damages, such as lost profits;
 90 | 
 91 | iii) states that any provisions which differ from this Agreement are offered
 92 | by that Contributor alone and not by any other party; and
 93 | 
 94 | iv) states that source code for the Program is available from such
 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on
 96 | or through a medium customarily used for software exchange.
 97 | 
 98 | When the Program is made available in source code form:
 99 | 
100 | a) it must be made available under this Agreement; and
101 | 
102 | b) a copy of this Agreement must be included with each copy of the Program.
103 | 
104 | Contributors may not remove or alter any copyright notices contained within
105 | the Program.
106 | 
107 | Each Contributor must identify itself as the originator of its Contribution,
108 | if any, in a manner that reasonably allows subsequent Recipients to identify
109 | the originator of the Contribution.
110 | 
111 | 4. COMMERCIAL DISTRIBUTION
112 | 
113 | Commercial distributors of software may accept certain responsibilities with
114 | respect to end users, business partners and the like. While this license is
115 | intended to facilitate the commercial use of the Program, the Contributor who
116 | includes the Program in a commercial product offering should do so in a
117 | manner which does not create potential liability for other Contributors.
118 | Therefore, if a Contributor includes the Program in a commercial product
119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend
120 | and indemnify every other Contributor ("Indemnified Contributor") against any
121 | losses, damages and costs (collectively "Losses") arising from claims,
122 | lawsuits and other legal actions brought by a third party against the
123 | Indemnified Contributor to the extent caused by the acts or omissions of such
124 | Commercial Contributor in connection with its distribution of the Program in
125 | a commercial product offering.  The obligations in this section do not apply
126 | to any claims or Losses relating to any actual or alleged intellectual
127 | property infringement. In order to qualify, an Indemnified Contributor must:
128 | a) promptly notify the Commercial Contributor in writing of such claim, and
129 | b) allow the Commercial Contributor to control, and cooperate with the
130 | Commercial Contributor in, the defense and any related settlement
131 | negotiations. The Indemnified Contributor may participate in any such claim
132 | at its own expense.
133 | 
134 | For example, a Contributor might include the Program in a commercial product
135 | offering, Product X. That Contributor is then a Commercial Contributor. If
136 | that Commercial Contributor then makes performance claims, or offers
137 | warranties related to Product X, those performance claims and warranties are
138 | such Commercial Contributor's responsibility alone. Under this section, the
139 | Commercial Contributor would have to defend claims against the other
140 | Contributors related to those performance claims and warranties, and if a
141 | court requires any other Contributor to pay any damages as a result, the
142 | Commercial Contributor must pay those damages.
143 | 
144 | 5. NO WARRANTY
145 | 
146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON
147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR
149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A
150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the
151 | appropriateness of using and distributing the Program and assumes all risks
152 | associated with its exercise of rights under this Agreement , including but
153 | not limited to the risks and costs of program errors, compliance with
154 | applicable laws, damage to or loss of data, programs or equipment, and
155 | unavailability or interruption of operations.
156 | 
157 | 6. DISCLAIMER OF LIABILITY
158 | 
159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY
160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL,
161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION
162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE
165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY
166 | OF SUCH DAMAGES.
167 | 
168 | 7. GENERAL
169 | 
170 | If any provision of this Agreement is invalid or unenforceable under
171 | applicable law, it shall not affect the validity or enforceability of the
172 | remainder of the terms of this Agreement, and without further action by the
173 | parties hereto, such provision shall be reformed to the minimum extent
174 | necessary to make such provision valid and enforceable.
175 | 
176 | If Recipient institutes patent litigation against any entity (including a
177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself
178 | (excluding combinations of the Program with other software or hardware)
179 | infringes such Recipient's patent(s), then such Recipient's rights granted
180 | under Section 2(b) shall terminate as of the date such litigation is filed.
181 | 
182 | All Recipient's rights under this Agreement shall terminate if it fails to
183 | comply with any of the material terms or conditions of this Agreement and
184 | does not cure such failure in a reasonable period of time after becoming
185 | aware of such noncompliance. If all Recipient's rights under this Agreement
186 | terminate, Recipient agrees to cease use and distribution of the Program as
187 | soon as reasonably practicable. However, Recipient's obligations under this
188 | Agreement and any licenses granted by Recipient relating to the Program shall
189 | continue and survive.
190 | 
191 | Everyone is permitted to copy and distribute copies of this Agreement, but in
192 | order to avoid inconsistency the Agreement is copyrighted and may only be
193 | modified in the following manner. The Agreement Steward reserves the right to
194 | publish new versions (including revisions) of this Agreement from time to
195 | time. No one other than the Agreement Steward has the right to modify this
196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. The
197 | Eclipse Foundation may assign the responsibility to serve as the Agreement
198 | Steward to a suitable separate entity. Each new version of the Agreement will
199 | be given a distinguishing version number. The Program (including
200 | Contributions) may always be distributed subject to the version of the
201 | Agreement under which it was received. In addition, after a new version of
202 | the Agreement is published, Contributor may elect to distribute the Program
203 | (including its Contributions) under the new version. Except as expressly
204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or
205 | licenses to the intellectual property of any Contributor under this
206 | Agreement, whether expressly, by implication, estoppel or otherwise. All
207 | rights in the Program not expressly granted under this Agreement are
208 | reserved.
209 | 
210 | This Agreement is governed by the laws of the State of Washington and the
211 | intellectual property laws of the United States of America. No party to this
212 | Agreement will bring a legal action under this Agreement more than one year
213 | after the cause of action arose. Each party waives its rights to a jury trial
214 | in any resulting litigation.


--------------------------------------------------------------------------------
/docs/ABNF.md:
--------------------------------------------------------------------------------
  1 | # ABNF Input Format
  2 | 
  3 | ABNF is an alternative input format for instaparse grammar specifications.  ABNF does not provide any additional expressive power over instaparse's default EBNF-based syntax, so if you are new to instaparse and parsing, you do not need to read this document -- stick with the syntax described in [the tutorial](https://github.com/Engelberg/instaparse/blob/master/README.md).
  4 | 
  5 | ABNF's main virtue is that it is precisely specified and commonly used in protocol specifications.  If you use such protocols, instaparse's ABNF input format is a simple way to turn the ABNF specification into an executable parser.  However, unless you are working with such specifications, you do not need the ABNF input format.
  6 | 
  7 | ## EBNF vs ABNF
  8 | 
  9 | ### EBNF
 10 | 
 11 | The most common notation for expressing context-free grammars is [Backus-Naur Form](http://en.wikipedia.org/wiki/Backus%E2%80%93Naur_Form), or BNF for short.  BNF, however, is a little too simplistic.  People wanted more convenient notation for expressing repetitions, so [EBNF](http://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_Form), or *Extended* Backus-Naur Form was developed.
 12 | 
 13 | There is a hodge-podge of various syntax extensions that all fall under the umbrella of EBNF.  For example, one standard specifies that repetitions should be specified with `{}`, but regular expression operators such as `+`, `*`, and `?` are far more popular.
 14 | 
 15 | When creating the primary input format for instaparse, I based the syntax off of EBNF.  I consulted various standards I found on the internet, and filtered it through my own experience of what I've seen in various textbooks and specs over the years.  I included the official repetition operators as well as the ones derived from regular expressions.  I also incorporated PEG-like syntax extensions.
 16 | 
 17 | What I ended up with was a slightly tweaked version of EBNF, making it relatively easy to turn any EBNF-specified grammar into an executable parser.  However, with multiple competing standards and actively-used variations, there's no guarantee that an EBNF grammar that you find will perfectly align with instaparse's syntax.  You may need to make a few tweaks to get it to work.
 18 | 
 19 | ### ABNF
 20 | 
 21 | From what I can tell, the purpose of [ABNF](http://en.wikipedia.org/wiki/Augmented_Backus%E2%80%93Naur_Form), or *Augmented* Backus-Naur Form, was to create a grammar syntax that would have a single, well-defined, formal standard, so that all ABNF grammars would look exactly the same.
 22 | 
 23 | For this reason, ABNF seems to be a more popular grammar syntax in the world of specifications and protocols.  For example, if you want to know the formal definition of what constitutes a valid URI, there's an ABNF grammar for that.
 24 | 
 25 | After instaparse's initial release, I received a couple of requests to support ABNF as an alternative input format.  Since ABNF is so precisely defined, in theory, any ABNF grammar should work without modification.  In practice, I've found that many ABNF specifications have one or two small typos; nevertheless, applying instaparse to ABNF is mostly a trivial copy-paste exercise.
 26 | 
 27 | I included whatever further extensions and extra instaparse goodies I could safely include, but omitted any extension that would conflict with the ABNF standard and jeopardize the ability to use ABNF grammar specifications without modification.
 28 | 
 29 | Aside from just wanting to adhere to the ABNF specification, I can think of a few niceties that ABNF provides over EBNF:
 30 | 
 31 | 1. ABNF has a convenient syntax for specifying bounded repetitions, for example, something like "between 3 and 5 repetitions of the letter a".
 32 | 
 33 | 2. Convenient syntax for expressing characters and ranges of characters.
 34 | 
 35 | 3. ABNF comes with a "standard library" of a dozen or so common token rules.
 36 | 
 37 | ## Usage
 38 | 
 39 | To get a feeling for what ABNF syntax looks like, first check out this [ABNF specification for phone URIs.](https://raw.githubusercontent.com/Engelberg/instaparse/master/test/data/phone_uri.txt)  I copied and pasted it directly from the formal spec -- found one typo which I fixed.
 40 | 
 41 | 	(def phone-uri-parser
 42 | 	  (insta/parser "https://raw.githubusercontent.com/Engelberg/instaparse/master/test/data/phone_uri.txt"
 43 | 	          :input-format :abnf))
 44 | 
 45 | 	=> (phone-uri-parser "tel:+1-201-555-0123")
 46 | 	[:telephone-uri
 47 | 	 "tel:"
 48 | 	 [:telephone-subscriber
 49 | 	  [:global-number
 50 | 	   [:global-number-digits
 51 | 	    "+"
 52 | 	    [:DIGIT "1"]
 53 | 	    [:phonedigit [:visual-separator "-"]]
 54 | 	    [:phonedigit [:DIGIT "2"]]
 55 | 	    [:phonedigit [:DIGIT "0"]]
 56 | 	    [:phonedigit [:DIGIT "1"]]
 57 | 	    [:phonedigit [:visual-separator "-"]]
 58 | 	    [:phonedigit [:DIGIT "5"]]
 59 | 	    [:phonedigit [:DIGIT "5"]]
 60 | 	    [:phonedigit [:DIGIT "5"]]
 61 | 	    [:phonedigit [:visual-separator "-"]]
 62 | 	    [:phonedigit [:DIGIT "0"]]
 63 | 	    [:phonedigit [:DIGIT "1"]]
 64 | 	    [:phonedigit [:DIGIT "2"]]
 65 | 	    [:phonedigit [:DIGIT "3"]]]]]]
 66 | 
 67 | The usage, as you can see, is almost identical to the way you build parsers using the `insta/parser` constructor.  The only difference is the additional keyword argument `:input-format :abnf`.
 68 | 
 69 | If you find yourself working with a whole series of ABNF parser specifications, you may find it more convenient to call
 70 | 
 71 | 	(insta/set-default-input-format! :abnf)
 72 | 
 73 | to alter the default input format.  Changing the default makes it unnecessary to specify `:input-format :abnf` with each call to the parser constructor.
 74 | 
 75 | Here is the doc string:
 76 | 
 77 | 	=> (doc insta/set-default-input-format!)
 78 | 	-------------------------
 79 | 	instaparse.core/set-default-input-format!
 80 | 	([type])
 81 | 	  Changes the default input format.  Input should be :abnf or :ebnf
 82 | 
 83 | ## ABNF Syntax Guide
 84 | 
 85 | <table>
 86 | <tr><th>Category</th><th>Notations</th><th>Example</th><th>Notes</th></tr>
 87 | <tr><td>Rule</td><td>= =/</td><td>S = A</td><td>=/ is usually used to extend an already-defined rule</td></tr>
 88 | <tr><td>Alternation</td><td>/</td><td>A / B</td><td>Despite the use of /, this is <i>unordered</i> choice</td></tr>
 89 | <tr><td>Concatenation</td><td>whitespace</td><td>A B</td><td></td></tr>
 90 | <tr><td>Grouping</td><td>()</td><td>(A / B) C</td><td></td></tr>
 91 | <tr><td>Bounded Repetition</td><td>*</td><td>3*5 A</td><td>In ABNF, repetition <i>precedes</i> the element</td></tr>
 92 | <tr><td>Optional</td><td>*1</td><td>*1 A</td><td></td></tr>
 93 | <tr><td>One or more</td><td>1*</td><td>1* A</td><td></td></tr>
 94 | <tr><td>Zero or more</td><td>*</td><td>*A</td><td></td></tr>
 95 | <tr><td>String terminal</td><td>"" ''</td><td>'a' "a"</td><td>Single-quoted strings are an instaparse extension</td></tr>
 96 | <tr><td>Regex terminal</td><td>#"" #''</td><td>#'a' #"a"</td><td>Regexes are an instaparse extension</td></tr>
 97 | <tr><td>Character terminal</td><td>%d %b %x</td><td>%x30-37</td><td></td></tr>
 98 | <tr><td>Comment</td><td>;</td><td>; comment to the end of the line</td><td></td></tr>
 99 | <tr><td>Lookahead</td><td>&</td><td>&A</td><td>Lookahead is an instaparse extension</td></tr>
100 | <tr><td>Negative lookahead</td><td>!</td><td>!A</td><td>Negative lookahead is an instaparse extension</td></tr>
101 | </table>
102 | 
103 | Some important things to be aware of:
104 | 
105 | + According to the ABNF standard, all strings are *case-insensitive*.
106 | + ABNF strings do not support any kind of escape characters.  Use ABNF's character notation to specify unusual characters.
107 | + In ABNF, there is one repetition operator, `*`, and it *precedes* the thing that it is operating on.  So, for example, `3*5` means "between 3 and 5 repetitions".  The first number defaults to 0 and the second defaults to infinity, so you can omit one or both numbers to get effects comparable to EBNF's `+`, `*`, and `?`.  `4*4` could just be written as `4`.
108 | + Use `;` for comments to the end of the line.  The ABNF specification has rigid definitions about where comments can be, but in instaparse the rules for comment placement are a bit more flexible and intuitive.
109 | + ABNF uses `/` for the ordinary alternative operator with no order implied.
110 | + ABNF allows the restatement of a rule name to specify multiple alternatives.  The custom is to use `=/` in definitions that are adding alternatives, for example `S = 'a' / 'b'` could be written as:
111 | 
112 | <br>
113 | 
114 | 	S = 'a'
115 | 	S =/ 'b'
116 | 
117 | ## Extensions
118 | 
119 | Instaparse extends ABNF by allowing single-quoted strings and both double-quoted and single-quoted regular expressions.  The PEG extensions of lookahead `&` and negative lookahead `!` are permitted, but the PEG extension of ordered choice could not be included because of the syntactic conflict with ABNF's usage of `/` for unordered alternatives.
120 | 
121 | Instaparse is somewhat more flexible with whitespace than the ABNF specification dictates, but somewhat less flexible than you might expect from the EBNF input format.  For example, in instaparse's EBNF mode, `(A B)C` would be just fine, but ABNF insists on at least one space to indicate concatenation, so you'd have to write `(A B) C`.  I relaxed whitespace restrictions when I could do so without radically deviating from the specification.
122 | 
123 | ### Angle brackets
124 | 
125 | The ABNF input format supports instaparse's angle bracket notation, where angle brackets can be used to hide certain parts of the grammar from the resulting tree structure.  Including instaparse's angle bracket notation was a bit of a tough decision because technically angle brackets are reserved for special use in ABNF grammars.
126 | 
127 | However, in ABNF notation, angle brackets are meant to be used for prose descriptions of some concept that can't be mechanically specified in the grammar.  For example:
128 | 
129 | 	P = <a prime number>
130 | 
131 | I realized that such constructs can't be mechanically handled anyway, so I might as well co-opt the angle bracket notation, as I did with the EBNF syntax, for the very handy purpose of hiding.
132 | 
133 | This means that when you paste in an ABNF specification, it is always wise to do a quick scan to make sure that no angle brackets were used.  They are rarely used, but one [notably strange use of angle brackets](http://w3-org.9356.n7.nabble.com/ipath-empty-ABNF-rule-td192464.html) occurs in the URI specification, which uses `0<ipchar>` to designate the empty string.  So be aware of these sorts of possibilities, but you're unlikely to run into them.
134 | 
135 | ## The standard rules
136 | 
137 | The ABNF specification states that the following rules are always available for use in ABNF grammars:
138 | 
139 | <table>
140 | <tr><th>Name</th><th>Explanation</th></tr>
141 | <tr><td>ALPHA</td><td>Alphabetic character</td></tr>
142 | <tr><td>BIT</td><td>0 or 1</td></tr>
143 | <tr><td>CHAR</td><td>ASCII character</td></tr>
144 | <tr><td>CR</td><td>\r</td></tr>
145 | <tr><td>CRLF</td><td>\r\n</td></tr>
146 | <tr><td>CTL</td><td>control character</td></tr>
147 | <tr><td>DIGIT</td><td>0-9</td></tr>
148 | <tr><td>DQUOTE</td><td>"</td></tr>
149 | <tr><td>HEXDIG</td><td>Hexadecimal digit: 0-9 or A-F</td></tr>
150 | <tr><td>HTAB</td><td>\t</td></tr>
151 | <tr><td>LF</td><td>\n</td></tr>
152 | <tr><td>LWSP</td><td>A specific mixture of whitespace and CRLF (see note below)</td></tr>
153 | <tr><td>OCTET</td><td>8-bit character</td></tr>
154 | <tr><td>SP</td><td>the space character</td></tr>
155 | <tr><td>VCHAR</td><td>visible character</td></tr>
156 | <tr><td>WSP</td><td>space or tab</td></tr>
157 | </table>
158 | 
159 | LWSP is particularly quirky, defined to be either a space or tab character, or an alternating sequence of carriage-return-linefeed and a single space or tab character.  It's very specific, presumably relevant to some particular protocol, but not generally useful and I don't recommend using it.
160 | 
161 | ## Combinators
162 | 
163 | The `instaparse.combinators` namespace contains a few combinators that are not documented in the main tutorial, but are listed here because they are only relevant to ABNF grammars.
164 | 
165 | <table>
166 | <tr><th>String syntax</th><th>Combinator</th><th>Functionality</th></tr>
167 | <tr><td>"abc" (as used in ABNF)</td><td>(string-ci "abc")</td><td>string, case-insensitive</td></tr>
168 | <tr><td>3*5 (as used in ABNF)</td><td>(rep 3 5 parser)</td><td>repetition</td></tr>
169 | <tr><td>%d97 (as used in ABNF)</td><td>(unicode-char 97)</td><td>unicode code point</td></tr>
170 | <tr><td>%d97-122 (as used in ABNF)</td><td>(unicode-char 97 122)</td><td>unicode range</td></tr>
171 | </table>
172 | 
173 | Finally, just as there exists an `ebnf` function in the combinators namespace that turns EBNF fragments into combinator-built data structures, there exists an `abnf` function which does the same for ABNF fragments.
174 | 
175 | This means it is entirely possible to take fragments of EBNF syntax along with fragments of ABNF syntax, and convert all the pieces, merging them into a grammar map along with other pieces built from combinators.  I don't expect that many people will need this ability to mix and match, but it's there if you need it.
176 | 
177 | ## Case Sensitivity
178 | 
179 | I've already mentioned that in ABNF syntax, strings are *case-insensitive*, meaning that the string terminal "abc" in an ABNF grammar also matches "aBc", "AbC", etc.  Many ABNF grammar specifications leverage this case insensitivity; for example, the spec for hexadecimal digits includes the strings "A", "B", "C", "D", "E", and "F", and this is intended to match the lowercase letters as well.
180 | 
181 | A lesser-known quirk of ABNF syntax is that, in theory, non-terminal rule names are also case-insensitive.  So for example, in the ABNF rule `S = 'a' s`, the lowercase `s` is actually referring back to the uppercase `S`.  Although the specification of ABNF syntax allows for this possibility, as best as I can determine, this "feature" simply isn't used.  It would be confusing and bad form to refer to a non-terminal in different places of your grammar with a different mixture of cases.
182 | 
183 | Therefore, by default in instaparse, ABNF non-terminals are, in fact, case-sensitive.  This makes it easier for ABNF grammars to play nicely with EBNF grammars, grammar maps, and instaparse's transform function, all of which are case-sensitive.
184 | 
185 | If you find yourself working with an ABNF grammar that uses an inconsistent mix of lowercase and uppercase letters to refer to the same non-terminal rules, you have two options available to you.  The first possibility, of course, is to simply go through and fix the inconsistencies.  The second option is to bind the dynamic variable `instaparse.abnf/*case-insensitive*` to true while building the parser from the ABNF grammar.
186 | 
187 | Under the hood, this works by *converting all non-terminals to uppercase*.  This means that in the resulting parse tree, all the rule names will be uppercase, so plan your tree traversals and transformations accordingly.
188 | 
189 | As an example, let's revisit the usage example from above:
190 | 
191 | 	(def phone-uri-parser
192 | 	  (binding [instaparse.abnf/*case-insensitive* true]
193 | 	    (insta/parser "https://raw.githubusercontent.com/Engelberg/instaparse/master/test/data/phone_uri.txt"
194 | 	                  :input-format :abnf)))
195 | 
196 | 	=> (phone-uri-parser "tel:+1-201-555-0123")
197 | 	[:TELEPHONE-URI
198 | 	 "tel:"
199 | 	 [:TELEPHONE-SUBSCRIBER
200 | 	  [:GLOBAL-NUMBER
201 | 	   [:GLOBAL-NUMBER-DIGITS
202 | 	    "+"
203 | 	    [:DIGIT "1"]
204 | 	    [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]]
205 | 	    [:PHONEDIGIT [:DIGIT "2"]]
206 | 	    [:PHONEDIGIT [:DIGIT "0"]]
207 | 	    [:PHONEDIGIT [:DIGIT "1"]]
208 | 	    [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]]
209 | 	    [:PHONEDIGIT [:DIGIT "5"]]
210 | 	    [:PHONEDIGIT [:DIGIT "5"]]
211 | 	    [:PHONEDIGIT [:DIGIT "5"]]
212 | 	    [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]]
213 | 	    [:PHONEDIGIT [:DIGIT "0"]]
214 | 	    [:PHONEDIGIT [:DIGIT "1"]]
215 | 	    [:PHONEDIGIT [:DIGIT "2"]]
216 | 	    [:PHONEDIGIT [:DIGIT "3"]]]]]]
217 | 
218 | The `*case-insensitive*` dynamic variable is also obeyed by the `abnf` combinator.
219 | 


--------------------------------------------------------------------------------
/docs/ExperimentalFeatures.md:
--------------------------------------------------------------------------------
  1 | # Instaparse Experimental Features
  2 | 
  3 | This document provides an explanation of some of the things I'm experimenting with in instaparse.  Please try the new features and let me know what you think.
  4 | 
  5 | ## Optimizing memory
  6 | 
  7 | I've added a new, experimental `:optimize :memory` flag that can conserve memory usage for certain classes of grammars.  I discussed the motivation for this in the [Performance document](Performance.md).  The idea is to make it more practical to use instaparse in situations where you need to parse files containing a large number of independent chunks.
  8 | 
  9 | Usage looks like this:
 10 | 
 11 | 	(def my-parser (insta/parser my-grammar))
 12 | 	(my-parser text :optimize :memory)
 13 | 
 14 | 
 15 | It works for grammars where the top-level production is of the form
 16 | 
 17 | 	start = chunk+
 18 | 
 19 | or
 20 | 
 21 | 	start = header chunk+
 22 | 
 23 | I don't mean that it literally needs to use the words `start` or `header` or `chunk`.  What I mean is that the optimizer looks for top-level productions that finish off with some sort of repeating structure.  To be properly optimized, you want to ensure that the `chunk` rule is written with no ambiguity about where a chunk begins and ends.
 24 | 
 25 | Behind the scenes, here's what the optimization algorithm is doing:  After successfully parsing a `chunk`, the parser *forgets* all the backtracking information and continues parsing the remaining text totally fresh looking for the next chunk, with no sense of history about what has come before.  As long as it keeps finding one chunk after another, it can get through a very large file with far less memory usage than the standard algorithm.
 26 | 
 27 | The downside of this approach is that if the parser hits a spot that doesn't match the repeating chunk rule, there's no way for it to know for sure that this is a fatal failure.  It is entirely possible that there is some other interpretation of an earlier chunk that would make the whole input parseable.  The standard instaparse approach is to backtrack and look for alternative interpretations before declaring a failure.  However, without that backtracking history, there's no way to do that.
 28 | 
 29 | So when you use the `:optimize :memory` flag and your parser hits an error using the "parse one chunk at a time and forget the past" strategy, it *restarts the entire parse process* with the original strategy.
 30 | 
 31 | I'm not entirely sure this was the right design decision, and would welcome feedback on this point.  Here are the tradeoffs:
 32 | 
 33 | Advantage of the current approach:  With this *fall back to the original strategy if the optimizer doesn't work* approach, it should be totally safe to try the optimizer, even if you don't know for sure up front whether the optimizer will work.  With the `:optimize :memory` flag, the output will always be exactly the same as if you hadn't used the flag.  (A metadata annotation, however, will let you know whether the parse was successfully completed entirely with the optimization strategy.)  I like the safety of this approach, and how it is amenable to the attitude of "Let's try this optimization flag out and see if it helps."
 34 | 
 35 | Disadvantage of the current approach: If you're operating on a block of input text so large that the memory optimization is a *necessity*, then if you have a flaw in your text, you're in trouble -- the parsing restarts with the original strategy and if the flaw is fairly late in your file, you could exhaust your memory.
 36 | 
 37 | An alternative design would be to say that if you've enabled the `:optimize :memory` flag, and it hits an apparent flaw in the input, then it's immediately reported as a failure, without any attempt to try the more sophisticated strategy and see whether backtracking might help the situation.  This would be good for people willing to expend the effort to ensure the grammar conforms to the optimizer's constraints and has no ambiguity in the chunk definition.  It would then be correct to report a failure right away if encountered by the optimization strategy -- no need to fall back to the original strategy because there's no ambiguity and no alternative interpretation.
 38 | 
 39 | However, if the flag behaved in this way, then it is possible that if the grammar weren't well-suited for the optimizer, the `:optimize :memory` flag might return a failure in some instances where the regular strategy would return success.  In some sense, this would give the programmer maximum control: the programmer can *choose* to rerun the input without the `:optimize :memory` flag or can accept the failure at face value if confident in the grammar's suitability for the optimization strategy.
 40 | 
 41 | So I'm torn: right now the optimizer falls back to the regular strategy because I like that it is dead simple to use, it's safe to try without a deep understanding of what is going on, and it will always give correct output.  But I recognize that having the optimizer simply report the failure gives the programmer greatest control over whether to restart with the regular strategy or not.
 42 | 
 43 | What do you think is the better design choice?
 44 | 
 45 | ## Auto Whitespace
 46 | 
 47 | I have received several requests for instaparse to support the parsing of streams of tokens, rather than just strings.  There appear to be two main motivations for this request:
 48 | 
 49 | 1. For some grammars, explicitly specifying all the places where whitespace can go is a pain.
 50 | 2. For parsing indentation-sensitive languages, it is useful to have a pre-processing pass that identifies `indent` and `dedent` tokens.
 51 | 
 52 | I'm still thinking about developing a token-processing version of instaparse.  But if I can find a way to address the underlying needs while maintaining the "token-free" simplicity of instaparse, that would be even better.
 53 | 
 54 | This new experimental "auto whitespace" feature addresses the first issue, simplifying the specification of grammars where you pretty much want to allow optional whitespace between all your tokens.  Here's how to use the new feature:
 55 | 
 56 | First, you want to develop a parser that consumes whitespace.  The simplest, most common way to do this would be:
 57 | 
 58 | 	(def whitespace
 59 | 	  (insta/parser
 60 | 	    "whitespace = #'\\s+'"))
 61 | 
 62 | Let's test it out:
 63 | 
 64 | 	=> (whitespace "       ")
 65 | 	[:whitespace "       "]
 66 | 	=> (whitespace " \t \n  \t ")
 67 | 	[:whitespace " \t \n  \t "]
 68 | 	
 69 | Important: Your whitespace parser should *not* accept the empty string.
 70 | 
 71 | 	=> (whitespace "")
 72 | 	Parse error at line 1, column 1:
 73 | 	nil
 74 | 	^
 75 | 	Expected:
 76 | 	#"^\s+" (followed by end-of-string)
 77 | 	
 78 | Good, this is what we want.  Now, we can define a parser similar to the `words-and-numbers` parser from the tutorial, but this time we'll use the auto-whitespace feature.
 79 | 
 80 | 	(def words-and-numbers-auto-whitespace
 81 | 	  (insta/parser
 82 | 	    "sentence = token+
 83 | 	     <token> = word | number
 84 | 	     word = #'[a-zA-Z]+'
 85 | 	     number = #'[0-9]+'"
 86 | 
 87 | 	    :auto-whitespace whitespace))
 88 | 
 89 | Notice the use of the `:auto-whitespace` keyword, and how we call it with the whitespace parser we developed earlier.
 90 | 
 91 | 	=> (words-and-numbers-auto-whitespace " abc 123   45 de ")
 92 | 	[:sentence [:word "abc"] [:number "123"] [:number "45"] [:word "de"]]
 93 | 
 94 | Behind the scenes, here's what's going on: the whitespace parsing rule(s) are merged into the new parser, and an optional version of the starting production for the whitespace rule is liberally inserted before all tokens and at the end.  In this case, that means `<whitespace?>` is inserted all over the place.  You can see the insertion points by viewing the parser:
 95 | 
 96 | 	=> words-and-numbers-auto-whitespace
 97 | 
 98 | 	sentence = token+ whitespace?
 99 | 	whitespace = #"\s+"
100 | 	token = word | number
101 | 	word = whitespace? #"[a-zA-Z]+"
102 | 	number = whitespace? #"[0-9]+"
103 | 
104 | You can also see that the whitespace is in fact getting parsed, and is just being hidden:
105 | 
106 | 	=> (words-and-numbers-auto-whitespace " abc 123   45 de " :unhide :content)
107 | 	[:sentence " " [:word "abc"] " " [:number "123"] "   " [:number "45"] " " [:word "de"] " "]
108 | 
109 | Because the whitespace parser rules are merged into the new parser, don't create any rules in your parser with the same names as those in the whitespace parser.  If you do, one of the rules will get clobbered and you'll run into problems.  (TODO: Report an error if a user tries to do this)
110 | 
111 | Note that it makes no difference whether the `:output-format` of the whitespace parser is :enlive or :hiccup.  The rules and the starting production for the whitespace parser are all that matter.
112 | 
113 | Because the :auto-whitespace feature allows you to specify your notion of whitespace, you have total flexibility to define this however you want.  For example, let's say I want to allow not only whitespace, but `(* comments *)` between any tokens.  Again, we start by developing a corresponding parser:
114 | 
115 | 	(def whitespace-or-comments-v1
116 | 	  (insta/parser
117 | 	    "ws-or-comment = #'\\s+' | comment
118 | 	     comment = '(*' inside-comment* '*)'
119 | 	     inside-comment =  !( '*)' | '(*' ) #'.' | comment"))
120 | 
121 | Does it eat whitespace?
122 | 
123 | 	=> (whitespace-or-comments-v1 "    ")
124 | 	[:ws-or-comment "    "]
125 | 
126 | Check.  Does it handle a comment?
127 | 
128 | 	=> (whitespace-or-comments-v1 "(* comment *)")
129 | 	<successful parse output omitted>
130 | 
131 | Check.  Can it handle nested comments?
132 | 
133 | 	=> (whitespace-or-comments-v1 "(* (* comment *) *)")
134 | 	<successful parse output omitted>
135 | 
136 | And we mustn't forget -- make sure it *doesn't* parse the empty string:
137 | 
138 | 	=> (whitespace-or-comments-v1 "")
139 | 	<failure message omitted>
140 | 
141 | However, there's a problem here.  The auto-whitespace feature inserts optional `?` versions of the whitespace parser everywhere, *not* repeating versions.  It's up to us to make sure that the whitespace parser consumes the *full extent* of any whitespace that could appear between tokens.  In other words, if we want to allow multiple comments in a row, we need to spell that out:
142 | 
143 | 	(def whitespace-or-comments-v2
144 | 	  (insta/parser
145 | 	    "ws-or-comments = #'\\s+' | comments
146 | 	     comments = comment+
147 | 	     comment = '(*' inside-comment* '*)'
148 | 	     inside-comment =  !( '*)' | '(*' ) #'.' | comment"))
149 | 
150 | 	=> (whitespace-or-comments-v2 "(* comment1 *)(* (* nested comment *) *)")
151 | 	<successful parse output omitted>
152 | 
153 | There's still one more issue, though.  Right now, our parser accepts either pure whitespace or a series of comments.  But if we want to intermingle whitespace and comments, it won't work:
154 | 
155 | 	=> (whitespace-or-comments-v2 "  (* comment1 *) (* comment2 *) ")
156 | 	Parse error at line 1, column 1:
157 | 	  (* comment1 *) (* comment2 *)
158 | 	^
159 | 	Expected one of:
160 | 	#"^\s+" (followed by end-of-string)
161 | 	"(*"
162 | 
163 | I could go through and manually insert optional whitespace, but wouldn't it be deliciously meta to use the auto-whitespace feature with our previous, simple whitespace parser to define our whitespace-or-comments parser?
164 | 
165 | 	(def whitespace-or-comments
166 | 	  (insta/parser
167 | 	    "ws-or-comments = #'\\s+' | comments
168 | 	     comments = comment+
169 | 	     comment = '(*' inside-comment* '*)'
170 | 	     inside-comment =  !( '*)' | '(*' ) #'.' | comment"
171 | 
172 |     	:auto-whitespace whitespace))
173 | 
174 | Now it works:
175 | 
176 | 	=> (whitespace-or-comments "  (* comment1 *) (* comment2 *) ")
177 | 	<successful parse output omitted>
178 | 
179 | Just out of curiosity, let's see where the `<whitespace?>` got inserted:
180 | 
181 | 	=> whitespace-or-comments
182 | 	ws-or-comments = (whitespace? #"\s+" | comments) whitespace?
183 | 	whitespace = #"\s+"
184 | 	comments = comment+
185 | 	comment = whitespace? "(*" inside-comment* whitespace? "*)"
186 | 	inside-comment = !(whitespace? "*)" | whitespace? "(*") whitespace? #"." | comment
187 | 
188 | Note that the auto-insertion process inserted `whitespace?` right before the `"*)"`, but this isn't particularly useful, because all whitespace before `*)` would already be eaten by the `inside-comment` rule.  If you were inserting the optional whitespace by hand, you'd probably realize it was unnecessary there.  However, when you let the system automatically insert it everywhere, some of the insertions might be gratuitous.  But that's okay, having the extra optional whitespace inserted there doesn't really hurt us either.
189 | 
190 | Now that we have thoroughly tested our whitespace-or-comments parser, we can use it to enrich our words-and-numbers parser:
191 | 
192 | 	(def words-and-numbers-auto-whitespace-and-comments
193 | 	  (insta/parser
194 | 	    "sentence = token+
195 | 	     <token> = word | number
196 | 	     word = #'[a-zA-Z]+'
197 | 	     number = #'[0-9]+'"
198 | 
199 | 	    :auto-whitespace whitespace-or-comments))
200 | 
201 | 	=> (words-and-numbers-auto-whitespace-and-comments " abc 123 (* 456 *) (* (* 7*) 89 *)  def ")
202 | 	[:sentence [:word "abc"] [:number "123"] [:word "def"]]
203 | 
204 | 	=> words-and-numbers-auto-whitespace-and-comments
205 | 
206 | 	sentence = token+ ws-or-comments?
207 | 	inside-comment = !(whitespace? "*)" | whitespace? "(*") whitespace? #"." | comment
208 | 	comment = whitespace? "(*" inside-comment* whitespace? "*)"
209 | 	comments = comment+
210 | 	ws-or-comments = (whitespace? #"\s+" | comments) whitespace?
211 | 	whitespace = #"\s+"
212 | 	token = word | number
213 | 	word = ws-or-comments? #"[a-zA-Z]+"
214 | 	number = ws-or-comments? #"[0-9]+"
215 | 
216 | Note that this feature is only useful in grammars where all the strings and regexes are, conceptually, the "tokens" of your language.  Occasionally, you'll see situations where grammars specify tokens through rules that build up the tokens character-by-character, for example:
217 | 
218 |     month = ('M'|'m') 'arch'
219 |     
220 | If you try to use the auto-whitespace feature with a grammar like this, it will end up allowing space between the "m" and the "arch", which isn't what you want.  The key is to try to express such tokens using a single regular expression:
221 | 
222 | 	month = #'[Mm]arch'
223 | 	
224 | ### Predefined whitespace parsers
225 | 
226 | There's no doubt that the following whitespace rule is by far the most common:
227 | 
228 | 	whitespace = #"\s+"
229 | 
230 | So for this common case, there's no need to create a separate whitespace parser.  You can access this predefined whitespace parser with the option:
231 | 
232 | 	:auto-whitespace :standard
233 | 
234 | At this time, one other predefined whitespace parser is available, for Clojure-like parsing tasks where the comma is also treated as whitespace.  The rule that will be added to your grammar is:
235 | 
236 | 	whitespace = #"[,\s]+"
237 | 
238 | and you can access it with the option:
239 | 
240 | 	:auto-whitespace :comma
241 | 
242 | Let me know what you think of the auto-whitespace feature.  Is it sufficiently simple and useful to belong in the instaparse library?


--------------------------------------------------------------------------------
/docs/Performance.md:
--------------------------------------------------------------------------------
  1 | # Instaparse Performance Notes
  2 | 
  3 | In the instaparse tutorial, I make the claim that instaparse is performant without really defining what I mean.  I explained that I've spent a lot of time on optimization, without really specifying what I'm trying to optimize.  In this document, I'd like to [elaborate on these points](https://github.com/Engelberg/instaparse/blob/master/docs/Performance.md#specific-performance-goals), and talk a bit about how I view [instaparse's role](https://github.com/Engelberg/instaparse/blob/master/docs/Performance.md#the-role-of-instaparse) in the parser ecosystem.  Finally, I'll provide [specific tips on how to get good performance from instaparse parsers](https://github.com/Engelberg/instaparse/blob/master/docs/Performance.md#performance-tips).
  4 | 
  5 | ## A bit of history
  6 | 
  7 | For decades, parsing has been considered a "solved problem" because there are well-known algorithms that can parse a stream of text blazingly fast, in a single linear pass, using minimal memory.  The catch is that these algorithms only apply to certain types of context-free grammars -- these classes of easily-parsed grammars go by names like LL(1) and LALR(1), acronyms describing the parsing technique that applies.  The good news is that most context-free grammars can, with some effort, be converted into the kind of format required by parsing algorithms.  Furthermore, if you are knowledgeable about parsing algorithms and are the one constructing the language / data format to be parsed, you can intentionally constrain the syntax to ensure that it can easily be parsed.
  8 | 
  9 | If you can do that, great!  If there's already a parser written for the kind of data you're working with, even better!  However, the programming world is awash with ad hoc config files and data files that don't use an existing standard like XML or JSON.  Sometimes you find yourself needing to work with something that's a little too complex to tease apart just with regular expressions, yet hard to justify the time and energy it would take to study up on LL, LALR, etc. and learn how to parse the data within the constraints of tools using those parsing algorithms.
 10 | 
 11 | ## The role of instaparse
 12 | 
 13 | That's where instaparse comes in.  Instaparse can handle arbitrary context-free grammars written using standard notation, so it's easy to apply it, even for a quickie one-time parsing task.
 14 | 
 15 | Shortly after the release of instaparse, there were a couple great testimonial blog posts about instaparse.  [This blog post by Brandon Harvey](http://walkwithoutrhythm.net/blog/2013/05/16/instaparse-parsing-with-clojure-is-the-new-black/) especially made my day, because it perfectly captured what I had hoped to achieve with instaparse.
 16 | 
 17 | In his blog post, Brandon describes some cave data that he wanted to parse.  Ideally, he wanted to figure out how to get "from a big fat unwieldy string to a nice, regular tree-shaped data structure in 20 minutes or less."  The cave data is clearly structured and looks kind of like JSON, but it isn't quite JSON.
 18 | 
 19 | First, he tried using another Clojure parsing library (a rather excellent library provided you're working with a grammar that fits its constraints), but couldn't figure out how to express his grammar in a way that worked.  He got bogged down with a bunch of shift/reduce conflicts and other errors  that he didn't know how to interpret without understanding the underlying machinery.  Using instaparse, he expressed the grammar in the way that seemed most natural, and it worked.
 20 | 
 21 | This brings me to a point I'd like to make before discussing performance:
 22 | 
 23 | *Instaparse aims to be more flexible than traditional parser libraries --- more tolerant of grammars that are ambiguous, require backtracking, or use a mixture of left and right recursion.*
 24 | 
 25 | To accomplish this, instaparse uses a fundamentally different algorithm than those found in traditional parser libraries, which achieve their speeds and performance guarantees by restricting lookahead and limiting backtracking.
 26 | 
 27 | ## Specific performance goals
 28 | 
 29 | With that disclaimer in mind, here are the specifics of what I strive for:
 30 | 
 31 | + For typical, real-world grammars, I want the running time to be linear with respect to the size of the input.  In other words, if you double the size of your text, it should take about twice as long to parse.  (Of course, I'm using Clojure data structures, so in practice, the running time is more like O(n * log32 n), but that's pretty close to linear.)
 32 | + If your grammar is unambiguous and LL(1), the parser should be competitive with parsers generated by tools that *only* accept unambiguous LL(1) grammars (i.e., within some reasonable constant factor).
 33 | + If you have a reasonable grammar, even one that isn't expressed in "just the right way", it should still have solid performance.
 34 | + Performance should degrade gracefully as you incorporate more ambiguity and heavy backtracking into the grammar.
 35 | 
 36 | Roughly speaking, the goal is for instaparse to be performant in the same sense that Clojure is performant.  Clojure is not quite as fast as languages like Java or C++ and consumes considerably more memory, but we use it because it offers greater expressivity and flexibility with enough speed to be useful for a wide range of tasks.
 37 | 
 38 | ## Specific optimizations
 39 | 
 40 | There were a lot of algorithmic coding decisions that I made by benchmarking multiple alternatives and data structures.  I won't go into them all here.  My aim in this section is to give you a sense for how I go about optimizing and what sorts of things I focus on.
 41 | 
 42 | Here is the gist of my optimization process: I take a grammar, try it on increasingly large inputs, and track the running-time growth.  If the growth is quadratic (or worse), I profile and investigate to try to track down the offending code and rework it into linear behavior.  My goal is to ensure that as many grammars as possible have linear growth.
 43 | 
 44 | As I mentioned in the tutorial, one of the first things I noticed in my profiling was how critical hashing was.  This is a great example of how an algorithm that seems like it *should* be linear can go awry without careful attention to implementation details.  We all know that inserting something into a hash map is essentially constant time, so we take that for granted in our analysis.  As long as the algorithm only performs O(n) insertions/lookups in the hash table, it should have linear performance, right?  Well, if the thing you are inserting into the hash table takes O(n) time to compute the hash, you're in big trouble!
 45 | 
 46 | So the first big accomplishment of my optimization efforts was to reduce the hashing time to constant for all the information cached by instaparse.  Version 1.2 of instaparse sports two new equally significant performance improvements:
 47 | 
 48 | First, I discovered that on long texts with long repeating sequences, linear-time concatenation of the internal partial tree results was a huge bottleneck, leading to overall quadratic behavior.  So in 1.2, I converted over to using a custom data structure with O(1) concatenation.  RRB-trees would be another data structure that could potentially solve my concatenation problem, so this is something I intend to look at after the Clojure implementation of RRB-trees matures.
 49 | 
 50 | The other major performance improvement in 1.2 compensated for an unfortunate change that Oracle made in Java 1.7 to the String class, changing Strings so that the substring operation is O(n) rather than O(1), copying the substring into a freshly allocated string.  Instaparse handles regular expressions by testing the regular expression against a substring of the input text that skips past the part of the text already parsed.  This strategy, which creates rather large substrings frequently, needed to be modified in light of Java 1.7's poor substring behavior.
 51 | 
 52 | With these version 1.2 modifications in place, I'm now getting linear-time behavior for all the parsers in my test suite that aren't explicitly designed to demonstrate huge amounts of ambiguity.  This is exactly where I want instaparse to be.
 53 | 
 54 | ## Memory
 55 | 
 56 | When talking about performance, the other big discussion point is, of course, memory consumption.  As I mentioned in the tutorial, instaparse does use a lot of memory.  There's really no way around this; it all comes back to my earlier point that instaparse aims to gracefully handle arbitrary levels of ambiguity and backtracking, which means that the entire text needs to reside in memory and lots of intermediate results need to be cached.
 57 | 
 58 | Instaparse's own syntax for context-free grammars is parsed by an instaparse parser, and is a great example of the practical value of backtracking.
 59 | 
 60 | Consider the following grammar.  The actual semantics of the grammar is not important here, just think about the syntax of the grammar specification and consider how instaparse's `parser` function needs to parse the grammar string as a series of rules:
 61 | 
 62 | 	(insta/parser
 63 | 	   "A = B B
 64 | 	    B = 'b'")
 65 | 
 66 | You might expect instaparse to impose a requirement that each line of the grammar be clearly terminated by an end-of-line character, such as `;` or a newline, but in fact, instaparse's CFG parser has no problem if you write out the grammar all mushed together on one line:
 67 | 
 68 | 	(insta/parser "A = B B B = 'b'")
 69 | 
 70 | Working from left-to-right, when it processes the third `B`, it is entirely possible that what it has seen so far should be interpreted as the rule:
 71 | 
 72 | 	A = B B B
 73 | 
 74 | But when it encounters the `=`, it realizes that the only sensible interpretation is for the third `B` to be the beginning of a new rule, and instaparse sorts it all out.
 75 | 
 76 | Taken to an extreme, consider the parser defined by the following grammar:
 77 | 
 78 | 	S = 'ab'+ | 'a' 'ba'+
 79 | 
 80 | If you use this parser to parse a long string of "abababab...aba", there's no way to determine when looking at the first 'a' which way to interpret it.  The parser can try one path, perhaps assuming that it is part of the `'ab'+` rule, but it won't know until it gets to the very end of the string that it has chosen incorrectly, and has to back up and try another path.  Looking at this example, it should be clear that there's no way to parse the input string in a single linear pass with bounded memory.
 81 | 
 82 | For this reason, I haven't put as much effort into optimizing memory usage -- a lot of data needs to be retained throughout the parsing process, and there simply is less scope for improvement, I think.  Certainly Java 1.7's substring behavior was causing massive memory churn, so the changes I made in instaparse 1.2 will also benefit the memory side of the performance equation.  But other than that, I haven't found any big wins for optimizing memory consumption.
 83 | 
 84 | In theory, I can imagine that there might be a way to intelligently figure out which cached data can be safely discarded, but in the context of left-recursion this is an extremely hard problem to solve.  Chalk this up as a future research problem, but one that is not likely to bear fruit in the short-term.  I have made one step in this direction which I will detail further in the section below about performance tips.
 85 | 
 86 | ## Performance Tips
 87 | 
 88 | Occasionally, I receive a question about whether there's a *best* way to write instaparse grammars for maximum performance.  I've tried very hard to make it so that instaparse's performance isn't ultra-sensitive to the exact way you word the grammar.  My hope is that most people will find these performance tips to be completely unnecessary.  However, for those that are interested, here are some recommendations:
 89 | 
 90 | 1. Instaparse's algorithm is in the family of LL parsing algorithms.  So if you know how to easily write your grammar as an LL grammar, that's probably going to yield the best possible performance.  If not, don't worry about it.
 91 | 
 92 | 2. If your token is a string, use a string literal, not a regular expression.  For example, prefer `'apple'` to `#'apple'`.
 93 | 
 94 | 3. When the greedy behavior of regular expressions is what you want, prefer using `*` and `+` *inside* the regular expression rather than outside.  This comes up very commonly in processing whitespace.  In most applications, once you hit whitespace, you want to eat up all the whitespace to get to the next token.  So you'll get better performance with `#'\\s*'` than with `#'\\s'*`.  In my parsers, I routinely have a rule for optional whitespace that looks like `ows = #'\\s*'` and then I sprinkle `<ows>` liberally in my other rules wherever I want to potentially allow whitespace.
 95 | 
 96 | 4. Related to the previous point, prefer using regular expressions to define tokens in their entirety rather than using instaparse to build up the tokens by analyzing the string character by character.  For example, if an identifier in your language is a letter followed by a series of letters or digits, you'll be better off with the rule
 97 | 
 98 | 		Identifier = #'[a-zA-Z][a-zA-Z0-9]*'
 99 | 
100 | 	rather than
101 | 
102 | 		Identifier = Letter Digit*
103 | 		Letter = #'[a-zA-Z]'
104 | 		Digit = #'[a-zA-Z0-9]'
105 | 
106 | 5. Remove as much ambiguity from your grammar as you can.  Instaparse works with ambiguous grammars, but dealing with that ambiguity can take a toll on performance.  Use the `insta/parses` function on a variety of sample inputs in order to troubleshoot your grammar and discover ways in which your inputs might have multiple interpretations.
107 | 
108 | 6. Even if `insta/parses` returns a single answer, think about whether you've created a lot of *internal ambiguity*, i.e., situations where the parser won't be able to work out the interpretation of the text until it has gotten much further along.  One way to analyze this is to test the various rules in your grammar using `insta/parses` with the `:partial true` flag to get a feel for how many scenarios it has to consider before it can be sure it has found the whole chunk of text defined by that rule.
109 | 
110 | 7. Watch out for ambiguity in your hidden content.  One time I was working with a grammar that I was convinced was unambiguous -- `insta/parses` always returned a single answer.  However, it turned out that the definition of whitespace was highly ambiguous.  I didn't realize it because the whitespace was hidden.  To help diagnose these sorts of problems, try running `insta/parses` with the `:unhide :all` flag.
111 | 
112 | 8. Prefer Java 1.7.  I've received one report where instaparse, running on Java 1.6, was running out of memory on a large input, whereas the exact same grammar on the same input ran perfectly fine on Java 1.7.
113 | 
114 | 9. Prefer using * and + over recursion to describe simple repetition.  For example, the rule:
115 | 
116 | 		<A> = 'a'+
117 | 
118 | 	can be internally optimized in ways that
119 | 
120 | 		<A> = 'a' A | 'a'
121 | 
122 | 	cannot.
123 | 
124 | 10. Feed instaparse smaller chunks of text.  The reality is that most large parsing tasks involve a series of individual data records that could potentially be parsed independently of one another.  As has been discussed earlier in this document, if you feed instaparse the entire block of text, instaparse has to assume the worst -- that it might encounter some sort of failure that causes it to go back and reinterpret all the text it has processed so far.  Consider preprocessing the text, chopping it into strings representing the individual data records, and passing the smaller strings into instaparse in order to limit the scope of what possibilities it needs to consider and how much history it needs to track.
125 | 
126 | 	For example, I saw one grammar where each line of text represented a new record, and the grammar looked like:
127 | 
128 | 		document = line+
129 | 		line = ...
130 | 
131 | 	Instead of applying this grammar to the entire document at once, why not build a parser where `line` is the top-level starting rule, and then map this parser over a `line-seq` of the text?
132 | 
133 | 	I've added a new, experimental `:optimize :memory` flag that attempts to automate this kind of preprocessing, chopping the text into smaller independent chunks in order to use less memory. This only works on grammars that describe these sorts of repeated data records (possibly with a header at the beginning of the file).  If instaparse can't find the pattern or runs into any sort of failure, it will fall back to its usual parsing strategy in order to make sure it has considered all possibilities.  Using this flag will likely slow down your parser, but if your data lends itself to this alternative strategy, you'll use much less memory.
134 | 
135 | 	I consider the `:optimize :memory` flag to be an *alpha* feature, subject to change.  If you try it and find it useful, or try it on something where you'd expect it to help and it doesn't, please send me your feedback.
136 | 
137 | 	If you need to annotate your chunks of text with line and column information, recall that `add-line-and-column-info-to-metadata` can take a starting line and column number for its annotation process:
138 | 
139 | 		(insta/add-line-and-column-info-to-metadata text starting-line starting-column parse-tree)
140 | 	
141 | 11. As of version 1.2, the enlive output format is slightly faster than hiccup.  This may change in the future, so I don't recommend that you base your choice of output format on this slight differential.  However, if you're trying to eke out the best possible performance, you might find it useful to experiment with both output formats to see whether one performs better for you than the other.
142 | 
143 | 12. As of version 1.4, instaparse has a way to print a trace of the parser's execution process, as well as some profiling information which can be useful to determine whether your parser behaves linearly with respect to the size of the input.  [Read about the new tracing feature here.](https://github.com/Engelberg/instaparse/blob/master/docs/Tracing.md)
144 | 


--------------------------------------------------------------------------------
/docs/Tracing.md:
--------------------------------------------------------------------------------
  1 | # Tracing
  2 | 
  3 | Instaparse 1.4.0 and up (in Clojure only) features the ability to look at a trace of what the parser is doing.  As an example, let's take a look at the as-and-bs parser from the tutorial.
  4 | 
  5 | ```
  6 | => (as-and-bs "aaabb")
  7 | [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]]
  8 | ```
  9 | 
 10 | Now let's look at a trace.  We do this by calling the parser with the optional keyword argument `:trace true`.  `insta/parse` and `insta/parses` both can take this optional argument.
 11 | 
 12 | ```
 13 | => (as-and-bs "aaabb" :trace true)
 14 | ```
 15 | 
 16 | One of my design goals for the tracing feature was that if you don't use it, you shouldn't pay a performance penalty.  So by default, the parsing code is not instrumented for tracing.  The very first time you call a parser with `:trace true`, you may notice a slight pause as instaparse recompiles itself to support tracing.  The trace then prints to standard out, and looks like this:
 17 | 
 18 | ```
 19 | Initiating full parse: S at index 0 (aaabb)
 20 | Initiating full parse: AB* at index 0 (aaabb)
 21 | Initiating parse: AB at index 0 (aaabb)
 22 | Initiating parse: A B at index 0 (aaabb)
 23 | Initiating parse: A at index 0 (aaabb)
 24 | Initiating parse: "a"+ at index 0 (aaabb)
 25 | Initiating parse: "a" at index 0 (aaabb)
 26 | Result for "a" at index 0 (aaabb) => "a"
 27 | Result for "a"+ at index 0 (aaabb) => ("a")
 28 | Result for A at index 0 (aaabb) => [:A "a"]
 29 | Initiating parse: B at index 1 (aabb)
 30 | Initiating parse: "b"+ at index 1 (aabb)
 31 | Initiating parse: "b" at index 1 (aabb)
 32 | No result for "b" at index 1 (aabb)
 33 | Initiating parse: "a" at index 1 (aabb)
 34 | Result for "a" at index 1 (aabb) => "a"
 35 | Result for "a"+ at index 0 (aaabb) => ("a" "a")
 36 | Result for A at index 0 (aaabb) => [:A "a" "a"]
 37 | Initiating parse: B at index 2 (abb)
 38 | Initiating parse: "b"+ at index 2 (abb)
 39 | Initiating parse: "b" at index 2 (abb)
 40 | No result for "b" at index 2 (abb)
 41 | Initiating parse: "a" at index 2 (abb)
 42 | Result for "a" at index 2 (abb) => "a"
 43 | Result for "a"+ at index 0 (aaabb) => ("a" "a" "a")
 44 | Result for A at index 0 (aaabb) => [:A "a" "a" "a"]
 45 | Initiating parse: B at index 3 (bb)
 46 | Initiating parse: "b"+ at index 3 (bb)
 47 | Initiating parse: "b" at index 3 (bb)
 48 | Result for "b" at index 3 (bb) => "b"
 49 | Result for "b"+ at index 3 (bb) => ("b")
 50 | Result for B at index 3 (bb) => [:B "b"]
 51 | Result for A B at index 0 (aaabb) => ([:A "a" "a" "a"] [:B "b"])
 52 | Result for AB at index 0 (aaabb) => [:AB [:A "a" "a" "a"] [:B "b"]]
 53 | Initiating parse: AB at index 4 (b)
 54 | Initiating parse: A B at index 4 (b)
 55 | Initiating parse: A at index 4 (b)
 56 | Initiating parse: "a"+ at index 4 (b)
 57 | Initiating parse: "a" at index 4 (b)
 58 | No result for "a" at index 4 (b)
 59 | Initiating parse: "b" at index 4 (b)
 60 | Result for "b" at index 4 (b) => "b"
 61 | Result for "b"+ at index 3 (bb) => ("b" "b")
 62 | Result for B at index 3 (bb) => [:B "b" "b"]
 63 | Result for A B at index 0 (aaabb) => ([:A "a" "a" "a"] [:B "b" "b"])
 64 | Result for AB at index 0 (aaabb) => [:AB [:A "a" "a" "a"] [:B "b" "b"]]
 65 | Result for AB* at index 0 (aaabb) => ([:AB [:A "a" "a" "a"] [:B "b" "b"]])
 66 | Result for S at index 0 (aaabb) => [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]]
 67 | Successful parse.
 68 | Profile:  {:push-message 21, :push-result 21, :push-listener 24, :push-stack 26, :push-full-listener 2, :create-node 26}
 69 | [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]]
 70 | ```
 71 | 
 72 | Let me explain what some of these lines mean.
 73 | 
 74 | ```
 75 | Initiating full parse: S at index 0 (aaabb)
 76 | ```
 77 | 
 78 | A "full parse" means that it only succeeds if it consumes the entire string.  Usually, we're looking to completely parse an entire string, and that's what "full parse" reflects.
 79 | 
 80 | It is important to understand that the word "initiating" does not necessarily mean that it is starting to work on that parse sub-problem right away.  It just means that we're putting it on a stack of sub-problems to try to solve.
 81 | 
 82 | Notice the `(aaabb)` in parens.  This is giving us the next several characters from this point in the string, which makes it a little easier to see at a glance where we are in the string (although, of course the index number can always be used to figure it out precisely).
 83 | 
 84 | ```
 85 | Initiating full parse: AB* at index 0 (aaabb)
 86 | Initiating parse: AB at index 0 (aaabb)
 87 | ```
 88 | 
 89 | Note that AB* needs to be a full parse to be satisfied, but that kicks off another subproblem, which is to look for a parse of AB (not necessarily a full parse) at index 0.
 90 | 
 91 | ```
 92 | Initiating parse: A at index 0 (aaabb)
 93 | Initiating parse: "a"+ at index 0 (aaabb)
 94 | Initiating parse: "a" at index 0 (aaabb)
 95 | Result for "a" at index 0 (aaabb) => "a"
 96 | Result for "a"+ at index 0 (aaabb) => ("a")
 97 | Result for A at index 0 (aaabb) => [:A "a"]
 98 | ```
 99 | 
100 | Note that after initiating a bunch of parse subtasks, we start to see some results.  Again, the content in the parentheses is a look ahead at the next several characters in the string, just to get our bearings.  The information after the `=>` is the parse result that was found.  Typically, the parse results are found in reverse order from the order in which the subtasks are initiated, because when initiated, the subtasks are put on a stack.
101 | 
102 | ```
103 | No result for "b" at index 1 (aabb)
104 | ```
105 | 
106 | The tracing mechanism reports when tokens (i.e., strings or regular expressions) are sought but not found.  In general, the tracing mechanism does not report when subtasks involving non-terminals fail (because internally, instaparse does not transmit failure messages between subtasks).
107 | 
108 | ```
109 | Result for S at index 0 (aaabb) => [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]]
110 | Successful parse.
111 | ```
112 | 
113 | At the end, we see the final parse, followed by some profiling data:
114 | 
115 | ```
116 | Profile:  {:push-message 21, :push-result 21, :push-listener 24, :push-stack 26, :push-full-listener 2, :create-node 26}
117 | ```
118 | 
119 | The details of the profiling data don't matter that much, other than to know that it's a measure of how much work instaparse had to do to come up with the result.  Repeating the trace with an input of `"aaaaaabbbb"` we get the profiling results:
120 | 
121 | ```
122 | Profile:  {:push-message 40, :push-result 40, :push-listener 48, :push-stack 50, :push-full-listener 2, :create-node 50}
123 | ```
124 | 
125 | The key here is that we doubled the length of the input string, and this roughly doubled the amount of work that instaparse needed to do.  That's good; it means that this parser behaves linearly with respect to its input size.  Even though the code is instrumented with tracing functionality, you still need to explicitly request the trace each time.  If you don't request the trace, it won't display:
126 | 
127 | ```
128 | => (as-and-bs "aaabb")
129 | [:S [:AB [:A "a" "a" "a"] [:B "b" "b"]]]
130 | ```
131 | 
132 | Now let's look at an example with negative lookahead.  Here is the parser:
133 | 
134 | ```
135 | => negative-lookahead-example
136 | S = !"ab" ("a" | "b")+
137 | => (negative-lookahead-example "aabb")
138 | [:S "a" "a" "b" "b"]
139 | ```
140 | 
141 | Let's run it with the trace:
142 | 
143 | ```
144 | => (negative-lookahead-example "aabb" :trace true)
145 | Initiating full parse: S at index 0 (aabb)
146 | Initiating full parse: !"ab" ("a" | "b")+ at index 0 (aabb)
147 | Initiating parse: !"ab" at index 0 (aabb)
148 | Initiating parse: "ab" at index 0 (aabb)
149 | No result for "ab" at index 0 (aabb)
150 | Exhausted results for "ab" at index 0 (aabb)
151 | Negation satisfied: !"ab" at index 0 (aabb)
152 | Initiating full parse: ("a" | "b")+ at index 0 (aabb)
153 | Initiating parse: "a" | "b" at index 0 (aabb)
154 | Initiating parse: "b" at index 0 (aabb)
155 | No result for "b" at index 0 (aabb)
156 | Initiating parse: "a" at index 0 (aabb)
157 | Result for "a" at index 0 (aabb) => "a"
158 | Result for "a" | "b" at index 0 (aabb) => "a"
159 | Initiating parse: "a" | "b" at index 1 (abb)
160 | Initiating parse: "b" at index 1 (abb)
161 | No result for "b" at index 1 (abb)
162 | Initiating parse: "a" at index 1 (abb)
163 | Result for "a" at index 1 (abb) => "a"
164 | Result for "a" | "b" at index 1 (abb) => "a"
165 | Initiating parse: "a" | "b" at index 2 (bb)
166 | Initiating parse: "b" at index 2 (bb)
167 | Result for "b" at index 2 (bb) => "b"
168 | Result for "a" | "b" at index 2 (bb) => "b"
169 | Initiating parse: "a" | "b" at index 3 (b)
170 | Initiating parse: "b" at index 3 (b)
171 | Result for "b" at index 3 (b) => "b"
172 | Result for "a" | "b" at index 3 (b) => "b"
173 | Result for ("a" | "b")+ at index 0 (aabb) => ("a" "a" "b" "b")
174 | Result for !"ab" ("a" | "b")+ at index 0 (aabb) => ("a" "a" "b" "b")
175 | Result for S at index 0 (aabb) => [:S "a" "a" "b" "b"]
176 | Successful parse.
177 | Profile:  {:push-message 12, :push-result 12, :push-listener 14, :push-stack 17, :push-full-listener 3, :create-node 17}
178 | [:S "a" "a" "b" "b"]
179 | ```
180 | 
181 | The interesting part of a trace involving negative lookahead (or ordered choice) is the following lines:
182 | 
183 | ```
184 | Initiating parse: !"ab" at index 0 (aabb)
185 | Initiating parse: "ab" at index 0 (aabb)
186 | No result for "ab" at index 0 (aabb)
187 | Exhausted results for "ab" at index 0 (aabb)
188 | Negation satisfied: !"ab" at index 0 (aabb)
189 | ```
190 | 
191 | To do negative lookahead, the parser sets up a subtask to try to parse the very thing we want to avoid.  If the parser runs out of work to do, then the trace tells us that the negation was in fact satisfied.
192 | 
193 | When you are done tracing, you probably will want to recompile the code without all the tracing and profiling instrumentation.  You can either restart the REPL or just type:
194 | 
195 | ```
196 | => (insta/disable-tracing!)
197 | nil
198 | ```
199 | 


--------------------------------------------------------------------------------
/images/vizexample1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Engelberg/instaparse/da886e71a4afa80f8b83d1d67f058b2f02cdc0e3/images/vizexample1.png


--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
 1 | (defproject instaparse "1.5.0" ; Leiningen project definition for instaparse
 2 |   :description "Instaparse: No grammar left behind"
 3 |   :url "https://github.com/Engelberg/instaparse"
 4 |   :license {:name "Eclipse Public License"
 5 |             :url "http://www.eclipse.org/legal/epl-v10.html"}
 6 |   :dependencies [[org.clojure/clojure "1.11.1"]] ; the only dependency declared outside the profiles below
 7 |   :resource-paths ["resources"]
 8 |   :profiles {:dev {:dependencies ; extra dependencies for the :dev profile only
 9 |                    [[org.clojure/clojurescript "1.11.132"]
10 |                     [org.clojure/tools.trace "0.7.11"]
11 |                     [criterium "0.4.6"]
12 |                     [rhizome "0.2.9"]]}
13 |              :1.5 {:dependencies [[org.clojure/clojure "1.5.1"]]} ; one profile per Clojure release; the aliases below combine them
14 |              :1.6 {:dependencies [[org.clojure/clojure "1.6.0"]]}
15 |              :1.7 {:dependencies [[org.clojure/clojure "1.7.0"]
16 |                                   [org.clojure/clojurescript "1.7.28"]]}
17 |              :1.8 {:dependencies [[org.clojure/clojure "1.8.0"]
18 |                                   [org.clojure/clojurescript "1.8.34"]]}
19 |              :1.9 {:dependencies [[org.clojure/clojure "1.9.0"]
20 |                                   [org.clojure/clojurescript "1.10.238"]
21 |                                   [org.clojure/tools.reader "1.2.1"]]}
22 |              :1.10 {:dependencies [[org.clojure/clojure "1.10.0"]
23 |                                    [org.clojure/clojurescript "1.10.439"]
24 |                                    [org.clojure/tools.reader "1.3.2"]]}
25 |              :1.11 {:dependencies [[org.clojure/clojure "1.11.1"]
26 |                                    [org.clojure/clojurescript "1.11.132"]
27 |                                    [org.clojure/tools.reader "1.3.6"]]}}
28 |   :aliases {"test-all" ["with-profile" "+1.5:+1.6:+1.7:+1.8:+1.9:+1.10:+1.11" "test"] ; "test" under each version profile listed
29 |             "test-cljs" ["cljsbuild" "test" "unit-tests"] ; "unit-tests" is the key defined in :test-commands at the bottom
30 |             "test-cljs-all" ["with-profile" "+1.9:+1.10:+1.11" "do" "clean," "test-cljs"]} ; clean, then test-cljs, under the 1.9, 1.10 and 1.11 profiles
31 |   :test-paths ["test/" "target/generated/test/clj"] ; second entry is the :output-path of the cljsee test build below
32 |   :source-paths ["src/" "target/generated/src/clj"] ; second entry is the :output-path of the cljsee src build below
33 |   :cljsee {:builds [{:source-paths ["src/"]
34 |                      :output-path "target/generated/src/clj"
35 |                      :rules :clj}
36 |                     {:source-paths ["test/"]
37 |                      :output-path "target/generated/test/clj"
38 |                      :rules :clj}]}
39 |   :plugins [[lein-cljsbuild "1.1.8"]
40 |             [cljsee "0.1.0"]]
41 |   ;:hooks [leiningen.cljsbuild]
42 |   :target-path "target"
43 |   :scm {:name "git"
44 |         :url "https://github.com/Engelberg/instaparse"}
45 |   :prep-tasks [["cljsee" "once"]] ; presumably populates the target/generated paths before other tasks -- confirm against the cljsee plugin docs
46 |   :cljsbuild {:builds [{:id "none"
47 |                         :source-paths ["src/"]
48 |                         :compiler {:output-to "target/js/none.js"
49 |                                    :optimizations :none
50 |                                    :pretty-print true}}
51 |                        {:id "test" ; build that also compiles test/ and the runner namespace
52 |                         :source-paths ["src/"
53 |                                        "test/"
54 |                                        "runner/cljs"]
55 |                         :compiler {:output-to "target/js/advanced-test.js" ; the same file that "unit-tests" runs with node
56 |                                    :optimizations :advanced
57 |                                    :target :nodejs
58 |                                    :pretty-print false}}]
59 |               :test-commands {"unit-tests" ["node" "target/js/advanced-test.js"]}})
60 | 


--------------------------------------------------------------------------------
/resources/clj-kondo.exports/instaparse/config.edn:
--------------------------------------------------------------------------------
1 | {:lint-as {instaparse.macros/defclone clojure.core/def ; lint defclone forms as if they were clojure.core/def
2 |            instaparse.macros/set-global-var! clojure.core/set!}} ; lint set-global-var! forms as if they were clojure.core/set!
3 | 


--------------------------------------------------------------------------------
/runner/cljs/runner/runner.cljs:
--------------------------------------------------------------------------------
 1 | (ns instaparse.runner.runner
 2 |   (:require [cljs.nodejs :as nodejs]
 3 |             [instaparse.abnf-test]
 4 |             [instaparse.auto-flatten-seq-test]
 5 |             [instaparse.core-test]
 6 |             [instaparse.defparser-test]
 7 |             [instaparse.failure-test]
 8 |             [instaparse.grammars]
 9 |             [instaparse.repeat-test]
10 |             [instaparse.specs]
11 |             [cljs.test :as test :refer-macros [run-tests]]))
12 | 
13 | (nodejs/enable-util-print!) ; presumably routes println output to the Node console -- confirm against the cljs.nodejs docs
14 | 
15 | (defmethod cljs.test/report [:cljs.test/default :end-run-tests] [summary] ; end-of-run hook: report the outcome
16 |   (if-not (test/successful? summary)
17 |     (do
18 |       (println "Tests failed.")
19 |       ((aget js/process "exit") 1)) ; look up process.exit and call it with status 1 on failure
20 |     (println "Tests succeeded!")))
21 | 
22 | (defn -main [] ; entry point: hands every test namespace listed below to run-tests
23 |   (run-tests 'instaparse.abnf-test
24 |              'instaparse.auto-flatten-seq-test
25 |              'instaparse.core-test
26 |              'instaparse.defparser-test
27 |              'instaparse.failure-test
28 |              'instaparse.grammars
29 |              'instaparse.repeat-test
30 |              'instaparse.specs)) ; NOTE(review): test/instaparse/namespaced_nts_test.cljc exists but is neither required nor run here -- confirm this is intentional
31 | 
32 | (set! *main-cli-fn* -main) ; install -main as the value of *main-cli-fn*
33 | 


--------------------------------------------------------------------------------
/src/instaparse/abnf.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.abnf
  2 |   "This is the context free grammar that recognizes ABNF notation."
  3 |   (:refer-clojure :exclude [cat])
  4 |   (:require [instaparse.transform :as t]
  5 |             [instaparse.cfg :as cfg]
  6 |             [instaparse.gll :as gll]
  7 |             [instaparse.reduction :as red]
  8 |             [instaparse.util :refer [throw-runtime-exception]]
  9 |             [instaparse.combinators-source :refer
 10 |              [Epsilon opt plus star rep alt ord cat string-ci string
 11 |               string-ci regexp nt look neg hide hide-tag unicode-char]]
 12 |             #?(:cljs [goog.string.format])
 13 |             [clojure.walk :as walk])
 14 |   #?(:cljs (:require-macros [instaparse.abnf :refer [precompile-cljs-grammar]])))
 15 | 
 16 | (def ^:dynamic *case-insensitive*
 17 |   "This is normally set to false, in which case the non-terminals
 18 | are treated as case-sensitive, which is NOT the norm
 19 | for ABNF grammars. If you really want case-insensitivity,
 20 | bind this to true, in which case all non-terminals
 21 | will be converted to upper-case internally (which
 22 | you'll have to keep in mind when transforming)."
 23 |   false) ; default value: non-terminals are case-sensitive unless this var is rebound to true
 24 | 
 25 | (def abnf-core
 26 |   {:ALPHA (regexp "[a-zA-Z]")
 27 |    :BIT (regexp "[01]")
 28 |    :CHAR (regexp "[\\u0001-\\u007F]")
 29 |    :CR (string "\u000D")
 30 |    :CRLF (string "\u000D\u000A")
 31 |    :CTL (regexp "[\\u0000-\\u001F|\\u007F]")
 32 |    :DIGIT (regexp "[0-9]")
 33 |    :DQUOTE (string "\u0022")
 34 |    :HEXDIG (regexp "[0-9a-fA-F]")
 35 |    :HTAB (string "\u0009")
 36 |    :LF (string "\u000A")
 37 |    :LWSP (alt (alt (string "\u0020") (string "\u0009")) ;WSP
 38 |               (star
 39 |                 (cat (string "\u000D\u000A") ;CRLF
 40 |                      (alt (string "\u0020") (string "\u0009"))))) ;WSP
 41 |    :OCTET (regexp "[\\u0000-\\u00FF]")
 42 |    :SP (string "\u0020")
 43 |    :VCHAR (regexp "[\\u0021-\\u007E]")
 44 |    :WSP (alt (string "\u0020")     ;SP
 45 |              (string "\u0009"))})  ;HTAB
 46 | 
;; Platform-independent part of the grammar (written in instaparse's EBNF
;; notation) that recognizes ABNF specifications. It is concatenated with
;; abnf-grammar-clj-only or abnf-grammar-cljs-only, which supply the
;; `rulename`, `opt-whitespace`, `whitespace` and `regexp` rules.
;; NOTE: on the single-quoted char-val line the second range is written with
;; single backslashes (\u0028-\u007E), so those two escapes are resolved by
;; the Clojure reader when this string literal is read.
(def abnf-grammar-common
  "
<rulelist> = <opt-whitespace> (rule | hide-tag-rule)+;
rule = rulename-left <defined-as> alternation <opt-whitespace>;
hide-tag-rule = hide-tag <defined-as> alternation <opt-whitespace>;
rulename-left = rulename;
rulename-right = rulename;
<hide-tag> = <'<' opt-whitespace> rulename-left <opt-whitespace '>'>;
defined-as = <opt-whitespace> ('=' | '=/') <opt-whitespace>;
alternation = concatenation (<opt-whitespace '/' opt-whitespace> concatenation)*;
concatenation = repetition (<whitespace> repetition)*;
repetition = [repeat] <opt-whitespace> element;
repeat = NUM | (NUM? '*' NUM?);
<element> = rulename-right | group | hide | option | char-val | num-val
          | look | neg | regexp;
look = <'&' opt-whitespace> element;
neg = <'!' opt-whitespace> element;
<group> = <'(' opt-whitespace> alternation <opt-whitespace ')'>;
option = <'[' opt-whitespace> alternation <opt-whitespace ']'>;
hide = <'<' opt-whitespace> alternation <opt-whitespace '>'>;
char-val = <'\\u0022'> #'[\\u0020-\\u0021\\u0023-\\u007E]'* <'\\u0022'> (* double-quoted strings *)
         | <'\\u0027'> #'[\\u0020-\\u0026\u0028-\u007E]'* <'\\u0027'>;  (* single-quoted strings *)
<num-val> = <'%'> (bin-val | dec-val | hex-val);
bin-val = <'b'> bin-char
          [ (<'.'> bin-char)+ | ('-' bin-char) ];
bin-char = ('0' | '1')+;
dec-val = <'d'> dec-char
          [ (<'.'> dec-char)+ | ('-' dec-char) ];
dec-char = DIGIT+;
hex-val = <'x'> hex-char
          [ (<'.'> hex-char)+ | ('-' hex-char) ];
hex-char = HEXDIG+;
NUM = DIGIT+;
<DIGIT> = #'[0-9]';
<HEXDIG> = #'[0-9a-fA-F]';


(* extra entrypoint to be used by the abnf combinator *)
<rules-or-parser> = rulelist | alternation;
  ")
 87 | 
;; Clojure-only rules appended to abnf-grammar-common. Each regex ends with
;; `(?x) #...`, i.e. an embedded comment describing the terminal.
(def abnf-grammar-clj-only
  "
<rulename> = #'[a-zA-Z][-a-zA-Z0-9]*(?x) #identifier';
opt-whitespace = #'\\s*(?:;.*?(?:\\u000D?\\u000A\\s*|$))*(?x) # optional whitespace or comments';
whitespace = #'\\s+(?:;.*?\\u000D?\\u000A\\s*)*(?x) # whitespace or comments';
regexp = #\"#'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'(?x) #Single-quoted regexp\"
       | #\"#\\\"[^\\\"\\\\]*(?:\\\\.[^\\\"\\\\]*)*\\\"(?x) #Double-quoted regexp\"
")

;; ClojureScript counterpart of the rules above: the same patterns without
;; the trailing `(?x) #...` comment suffix.
(def abnf-grammar-cljs-only
  "
<rulename> = #'[a-zA-Z][-a-zA-Z0-9]*';
opt-whitespace = #'\\s*(?:;.*?(?:\\u000D?\\u000A\\s*|$))*';
whitespace = #'\\s+(?:;.*?\\u000D?\\u000A\\s*)*';
regexp = #\"#'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'\"
       | #\"#\\\"[^\\\"\\\\]*(?:\\\\.[^\\\"\\\\]*)*\\\"\"
")
105 | 
;; Macro (defined on the Clojure side only) that builds the ClojureScript
;; flavour of the ABNF grammar at macro-expansion time with cfg/ebnf and
;; returns a form that reconstructs the resulting combinator map.
#?(:clj
   (defmacro precompile-cljs-grammar
     []
     (let [combinators (red/apply-standard-reductions
                         :hiccup (cfg/ebnf (str abnf-grammar-common
                                                abnf-grammar-cljs-only)))]
       ;; Walk the combinator data bottom-up, rewriting the pieces that cannot
       ;; be emitted as literal code.
       (walk/postwalk
         (fn [form]
           (cond
             ;; Lists cannot be evaluated verbatim
             (seq? form)
             (list* 'list form)

             ;; Regexp terminals are handled differently in cljs
             ;; The terminal is rebuilt by calling `regexp` on the pattern's
             ;; string form and merging back every key except :tag / :regexp.
             (= :regexp (:tag form))
             `(merge (regexp ~(str (:regexp form)))
                     ~(dissoc form :tag :regexp))

             :else form))
         combinators))))
126 | 
;; Grammar map (hiccup output format) used to parse ABNF specifications.
;; clj: built when the namespace loads from the common + clj-only grammar text.
;; cljs: produced by the precompile-cljs-grammar macro.
#?(:clj
   (def abnf-parser (red/apply-standard-reductions
                      :hiccup (cfg/ebnf (str abnf-grammar-common
                                             abnf-grammar-clj-only))))
   :cljs
   (def abnf-parser (precompile-cljs-grammar)))
133 | 
(defn get-char-combinator
  "Builds the combinator for a numeric value (bin-val / dec-val / hex-val).
When the second argument is the string \"-\" the arguments are `lo \"-\" hi`
and the result is `(unicode-char lo hi)`; otherwise each number becomes its
own `unicode-char` and they are concatenated with `cat`."
  [& nums]
  (if (= "-" (second nums))
    (let [[lo _ hi] nums]
      (unicode-char lo hi))
    (apply cat (map #(unicode-char %) nums))))
141 | 
(defn project
  "Returns a plain map holding only the entries of `m` whose key appears in
`ks`; keys of `ks` that are absent from `m` are skipped."
  [m ks]
  (reduce (fn [kept k]
            (if (contains? m k)
              (assoc kept k (m k))
              kept))
          {}
          ks))
149 | 
(defn merge-core
  "Adds to `grammar-map` the abnf-core rules referenced by its productions.
Entries already present in `grammar-map` take precedence over core rules."
  [grammar-map]
  (let [referenced-nts (->> (vals grammar-map)
                            (mapcat cfg/seq-nt)
                            distinct)
        needed-core (project abnf-core referenced-nts)]
    (merge needed-core grammar-map)))
156 | 
(defn hide-tag?
  "True when the :red entry of parser `p` equals
red/raw-non-terminal-reduction, i.e. it was built with hide-tag."
  [p]
  (let [reduction (:red p)]
    (= reduction red/raw-non-terminal-reduction)))
161 | 
(defn alt-preserving-hide-tag
  "Combines `p1` and `p2` with `alt`. If either one was built with hide-tag,
its :red entry is removed before combining and the resulting alternation is
wrapped in hide-tag instead."
  [p1 p2]
  (let [p1-hidden? (hide-tag? p1)
        p2-hidden? (hide-tag? p2)
        bare (fn [p hidden?] (if hidden? (dissoc p :red) p))
        combined (alt (bare p1 p1-hidden?) (bare p2 p2-hidden?))]
    (if (or p1-hidden? p2-hidden?)
      (hide-tag combined)
      combined)))
174 | 
;; Parses a string into an integer, optionally in the given radix.
;; clj: delegates to Integer/parseInt (one- and two-argument forms).
;; cljs: an alias for JavaScript's parseInt.
#?(:clj
   (defn parse-int
     ([string] (Integer/parseInt string))
     ([string radix] (Integer/parseInt string radix)))
   :cljs
   (def parse-int js/parseInt))
181 | 
(defn- rulename->keyword
  "Joins the pieces of a parsed rule name into a keyword. Rule names are
upper-cased first when *case-insensitive* is true, so that both sides of a
rule agree on one internal spelling."
  [name-parts]
  (let [rulename (apply str name-parts)]
    (keyword (if *case-insensitive*
               (clojure.string/upper-case rulename)
               rulename))))

(def abnf-transformer
  "Transform map turning the parse tree of an ABNF specification into
instaparse combinators."
  {:rule hash-map
   :hide-tag-rule (fn [tag rule] {tag (hide-tag rule)})
   :rulename-left (fn [& parts] (rulename->keyword parts))
   :rulename-right (fn [& parts] (nt (rulename->keyword parts)))
   :alternation alt
   :concatenation cat

   ;; Turns the pieces of a repeat prefix into a bounds map.
   :repeat (fn [& items]
             (let [[head _ tail] items]
               (case (count items)
                 1 (if (= head "*")
                     {}                                   ; *
                     {:low head, :high head})             ; x
                 2 (if (= head "*")
                     {:high (second items)}               ; *x
                     {:low head})                         ; x*
                 3 {:low head, :high tail})))             ; x*y

   ;; Applies an optional bounds map (see :repeat) to an element.
   :repetition (fn
                 ([element]
                   element)
                 ([bounds element]
                   (let [low (:low bounds)
                         high (:high bounds)]
                     (cond
                       (empty? bounds) (star element)
                       (= (count bounds) 2) (rep low high element)
                       (= low 1) (plus element)
                       (= high 1) (opt element)
                       :else (rep (or low 0)
                                  (or high #?(:clj Double/POSITIVE_INFINITY
                                              :cljs js/Infinity))
                                  element)))))
   :option opt
   :hide hide
   :look look
   :neg neg
   :regexp (comp regexp cfg/process-regexp)
   :char-val (fn [& cs]
               (cfg/string+ (apply str cs) true))
   :bin-char (fn [& cs]
               (parse-int (apply str cs) 2))
   :dec-char (fn [& cs]
               (parse-int (apply str cs)))
   :hex-char (fn [& cs]
               (parse-int (apply str cs) 16))
   :bin-val get-char-combinator
   :dec-val get-char-combinator
   :hex-val get-char-combinator
   :NUM (fn [& digits] (parse-int (apply str digits)))})
235 | 
(defn rules->grammar-map
  "Merges a sequence of single-rule maps into one grammar map. Rules that
share a name are combined with alt-preserving-hide-tag, and the referenced
core rules are then added by merge-core."
  [rules]
  (->> rules
       (apply merge-with alt-preserving-hide-tag)
       merge-core))
239 | 
(defn abnf
  "Converts an ABNF specification string into its combinator form.
Given only the right-hand side of a rule, returns the equivalent combinator;
given a series of rules, returns a grammar map.
Handy for mixing ABNF fragments with hand-built combinators.
The optional :string-ci option is bound to cfg/*case-insensitive-literals*
(:default when omitted) for the duration of the conversion."
  [spec & {:as opts}]
  (binding [cfg/*case-insensitive-literals* (:string-ci opts :default)]
    (let [tree (gll/parse abnf-parser :rules-or-parser spec false)]
      (if (instance? instaparse.gll.Failure tree)
        (throw-runtime-exception
          "Error parsing grammar specification:\n"
          (with-out-str (println tree)))
        ;; A lone right-hand side parses to a single :alternation node;
        ;; anything else is a list of rules.
        (let [rhs-only? (= :alternation (ffirst tree))
              transformed (t/transform abnf-transformer
                                       (if rhs-only? (first tree) tree))]
          (if rhs-only?
            transformed
            (rules->grammar-map transformed)))))))
257 | 
(defn build-parser
  "Parses the ABNF rule list `spec` and returns a map with the checked
:grammar (reductions applied for `output-format`), the :start-production
(the name of the first rule) and the :output-format. A spec that fails to
parse is reported through throw-runtime-exception."
  [spec output-format]
  (let [rule-tree (gll/parse abnf-parser :rulelist spec false)]
    (if-not (instance? instaparse.gll.Failure rule-tree)
      (let [rules (t/transform abnf-transformer rule-tree)
            grammar (->> (rules->grammar-map rules)
                         (red/apply-standard-reductions output-format)
                         cfg/check-grammar)]
        {:grammar grammar
         :start-production (-> rules first first first)
         :output-format output-format})
      (throw-runtime-exception
        "Error parsing grammar specification:\n"
        (with-out-str (println rule-tree))))))
270 | 
271 | 


--------------------------------------------------------------------------------
/src/instaparse/auto_flatten_seq.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.auto-flatten-seq
  2 |   #?(:clj (:import clojure.lang.PersistentVector))
  3 |   #?(:clj (:require [clojure.core.protocols :refer [IKVReduce]])))
  4 | 
;; Size limit used by conj-flat: an AutoFlattenSeq argument with at most this
;; many elements is copied element-wise into the vector; a larger one is
;; stored as a single nested element.
(def ^:const threshold 32)
  6 | 
;; Protocol implemented by AutoFlattenSeq.
;;   conj-flat - returns self extended with obj (see the AutoFlattenSeq
;;               implementation for how nil and AutoFlattenSeq args are treated)
;;   cached?   - returns the cached seq of self, or nil if none was built yet
(defprotocol ConjFlat
  (conj-flat [self obj])
  (cached? [self]))
 10 | 
; Need a backwards compatible version of mix-collection-hash
;; compile-if evaluates `test` at macro-expansion time and expands to `then`
;; when the result is truthy, otherwise to `else`.
#?(:clj (defmacro compile-if [test then else]
          (if (eval test)
            then
            else)))
 16 | 
;; Expands to (mix-collection-hash x y) when clojure.core/mix-collection-hash
;; can be resolved at compile time; otherwise expands to just x.
#?(:clj (defmacro mix-collection-hash-bc [x y]
          ;; backwards-compatible
          `(compile-if (resolve 'clojure.core/mix-collection-hash)
                       (mix-collection-hash ~x ~y)
                       ~x)))
 22 | 
 23 | (declare EMPTY hash-cat afs? true-count)
 24 | 
 25 | #?(:clj
 26 |    (defmacro hash-conj [premix-hash-v item]
 27 |      `(unchecked-add-int (unchecked-multiply-int 31 ~premix-hash-v) (hash ~item)))
 28 |    :cljs
 29 |    (defn ^number hash-conj
 30 |      "Returns the hash code, consistent with =, for an external ordered
 31 |   collection implementing Iterable.
 32 |   See http://clojure.org/data_structures#hash for full algorithms."
 33 |      [unmixed-hash item]
 34 |      (+ (imul 31 unmixed-hash) (hash item))))
 35 | 
;; Integer power `base`^`pow` by repeated squaring. Products use wrapping
;; 32-bit multiplication (unchecked-multiply-int on clj, imul on cljs).
;; Loop state: n = remaining exponent, y = accumulated product, z = current
;; square of the base.
#?(:clj
   (defn- expt [base pow]
     (if (zero? pow)
       1
       (loop [n (int pow), y (int 1), z (int base)]
         (let [t (even? n), n (quot n 2)]
           (cond
             ;; even exponent: only square the base
             t (recur n y (unchecked-multiply-int z z))
             ;; odd exponent and nothing left after halving: final result
             (zero? n) (unchecked-multiply-int z y)
             ;; odd exponent: fold z into the product, then square
             :else (recur n (unchecked-multiply-int z y) (unchecked-multiply-int z z)))))))
   :cljs
   (defn- expt [base pow]
     (if (zero? pow)
       1
       (loop [n (int pow), y (int 1), z (int base)]
         (let [t (even? n), n (quot n 2)]
           (cond
             t (recur n y (imul z z))
             (zero? n) (imul z y)
             :else (recur n (imul z y) (imul z z))))))))
 56 | 
 57 | (defn delve [v index]
 58 |   (loop [v (get-in v index)
 59 |          index index]
 60 |     (if (afs? v)
 61 |       (recur (get v 0) (conj index 0))
 62 |       index)))
 63 | 
 64 | (defn advance [v index]
 65 |   (cond
 66 |     (= (count index) 1)
 67 |     (when (< (peek index) (dec (true-count v)))
 68 |       (delve v [(inc (peek index))]))
 69 |     
 70 |     (< (peek index) (dec (true-count (get-in v (pop index)))))
 71 |     (delve v (conj (pop index) (inc (peek index))))
 72 |     
 73 |     :else
 74 |     (recur v (pop index))))
 75 | 
 76 | (defn flat-seq
 77 |   ([v] (if (pos? (count v)) 
 78 |          (flat-seq v (delve v [0]))
 79 |          nil))
 80 |   ([v index]
 81 |     (lazy-seq
 82 |       (cons (get-in v index) 
 83 |             (when-let [next-index (advance v index)] 
 84 |               (flat-seq v next-index))))))  
 85 | 
 86 | #?(:clj
 87 | (deftype AutoFlattenSeq [^PersistentVector v ^int premix-hashcode ^int hashcode
 88 |                          ^int cnt ^boolean dirty
 89 |                          ^:unsynchronized-mutable ^clojure.lang.ISeq cached-seq]
 90 |   Object
 91 |   (toString [self] (.toString (seq self)))
 92 |   (hashCode [self] hashcode)
 93 |   (equals [self other]
 94 |     (and (instance? AutoFlattenSeq other)
 95 |          (== hashcode (.hashcode ^AutoFlattenSeq other))
 96 |          (== cnt (.cnt ^AutoFlattenSeq other))
 97 |          (= dirty (.dirty ^AutoFlattenSeq other))
 98 |          (= v (.v ^AutoFlattenSeq other))))
 99 |   clojure.lang.IHashEq
100 |   (hasheq [self] hashcode)
101 |   java.util.Collection
102 |   (iterator [self]
103 |     (if-let [^java.util.Collection s (seq self)]
104 |       (.iterator s)
105 |       (let [^java.util.Collection e ()]
106 |         (.iterator e))))
107 |   (size [self]
108 |     cnt)
109 |   (toArray [self]
110 |     (let [^java.util.Collection s (seq self)]
111 |       (.toArray s)))
112 |   clojure.lang.Sequential
113 |   clojure.lang.ISeq
114 |   (equiv [self other]
115 |     (and (== hashcode (hash other))
116 |          (== cnt (count other))
117 |          (or (== cnt 0)
118 |              (= (seq self) other))))
119 |   (empty [self] (with-meta EMPTY (meta self))) 
120 |   (first [self] (first (seq self)))
121 |   (next [self] (next (seq self)))
122 |   (more [self] (rest (seq self)))
123 |   (cons [self obj]
124 |     (cons obj self))
125 |   ConjFlat
126 |   (conj-flat [self obj]
127 |     (cond
128 |       (nil? obj) self
129 |       (afs? obj)
130 |       (cond
131 |         (zero? cnt) obj
132 |         (<= (count obj) threshold)
133 |         (let [phc (hash-cat self obj)
134 |               new-cnt (+ cnt (count obj))]
135 |           (AutoFlattenSeq. (into v obj) phc (mix-collection-hash-bc phc new-cnt) new-cnt
136 |                            (or dirty (.dirty ^AutoFlattenSeq obj)) nil))
137 |         :else
138 |         (let [phc (hash-cat self obj)
139 |               new-cnt (+ cnt (count obj))]
140 |           (AutoFlattenSeq. (conj v obj) phc (mix-collection-hash-bc phc new-cnt) new-cnt
141 |                            true nil)))
142 |       :else 
143 |       (let [phc (hash-conj premix-hashcode obj)
144 |             new-cnt (inc cnt)]
145 |         (AutoFlattenSeq. (conj v obj) phc (mix-collection-hash-bc phc new-cnt) new-cnt dirty nil))))
146 |   (cached? [self] cached-seq)
147 |   clojure.lang.Counted
148 |   (count [self] cnt)
149 |   clojure.lang.ILookup
150 |   (valAt [self key]    
151 |     (.valAt v key))
152 |   (valAt [self key not-found]
153 |     (.valAt v key not-found))
154 |   clojure.lang.IObj
155 |   (withMeta [self metamap]
156 |     (AutoFlattenSeq. (with-meta v metamap) premix-hashcode hashcode cnt dirty nil))
157 |   clojure.lang.IMeta
158 |   (meta [self]
159 |     (meta v))
160 |   clojure.lang.Seqable
161 |   (seq [self]
162 |     (if cached-seq cached-seq
163 |       (do
164 |         (set! cached-seq (if dirty (flat-seq v) (seq v)))
165 |         cached-seq))))
166 | :cljs
167 | (deftype AutoFlattenSeq [^PersistentVector v ^number premix-hashcode ^number hashcode ^number cnt ^boolean dirty
168 |                          ^:unsynchronized-mutable ^ISeq cached-seq]
169 |   Object
170 |   (toString [self] (pr-str* (seq self)))
171 |   IHash
172 |   (-hash [self] hashcode)
173 |   ISequential
174 |   ISeq
175 |   (-first [self] (first (seq self)))
176 |   (-rest [self] (rest (seq self)))
177 |   IEquiv
178 |   (-equiv [self other]
179 |     (and ;(instance? AutoFlattenSeq other)
180 |          (= hashcode (hash other))
181 |          (= cnt (count other))
182 |          (or (= cnt 0)
183 |              (= (seq self) other))))
184 |   ICollection
185 |   (-conj [self o] (cons o self))
186 |   IEmptyableCollection
187 |   (-empty [self] (with-meta EMPTY (meta self))) 
188 |   INext
189 |   (-next [self] (next (seq self)))
190 |   ConjFlat
191 |   (conj-flat [self obj]
192 |     (cond
193 |       (nil? obj) self
194 |       (afs? obj)
195 |       (cond
196 |         (zero? cnt) obj
197 |         (<= (count obj) threshold)
198 |         (let [phc (hash-cat self obj)
199 |               new-cnt (+ cnt (count obj))]
200 |           (AutoFlattenSeq. (into v obj) phc (mix-collection-hash phc new-cnt) new-cnt
201 |                            (or dirty (.-dirty ^AutoFlattenSeq obj)) nil))
202 |         :else
203 |         (let [phc (hash-cat self obj)
204 |               new-cnt (+ cnt (count obj))]
205 |           (AutoFlattenSeq. (conj v obj) phc (mix-collection-hash phc new-cnt) new-cnt
206 |                            true nil)))
207 |       :else
208 |       (let [phc (hash-conj premix-hashcode obj)
209 |             new-cnt (inc cnt)]
210 |         (AutoFlattenSeq. (conj v obj) phc (mix-collection-hash phc new-cnt) new-cnt dirty nil))))
211 |   (cached? [self] cached-seq)
212 |   ICounted
213 |   (-count [self] cnt)
214 |   ILookup
215 |   (-lookup [self key]
216 |     (-lookup v key))
217 |   (-lookup [self key not-found]
218 |     (-lookup v key not-found))
219 |   IWithMeta
220 |   (-with-meta [self metamap]
221 |     (AutoFlattenSeq. (with-meta v metamap) premix-hashcode hashcode cnt dirty nil))
222 |   IMeta
223 |   (-meta [self]
224 |     (meta v))
225 |   ISeqable
226 |   (-seq [self]
227 |     (if cached-seq cached-seq
228 |       (do
229 |         (set! cached-seq (if dirty (flat-seq v) (seq v)))
230 |         cached-seq)))))
231 | 
;; hash-cat returns the unmixed hash of the concatenation of v1 and v2,
;; computed from their premix hashes alone:
;;   31^(count v2) * premix(v1) + premix(v2) - 31^(count v2)
;; in wrapping 32-bit arithmetic.
#?(:clj
   (defn- hash-cat ^long [^AutoFlattenSeq v1 ^AutoFlattenSeq v2]
     (let [c (count v2)
           e (int (expt 31 c))]
       (unchecked-add-int
        (unchecked-multiply-int e (.premix-hashcode v1))
        (unchecked-subtract-int (.premix-hashcode v2) e))))
   :cljs
   (defn- hash-cat ^number [^AutoFlattenSeq v1 ^AutoFlattenSeq v2]
     (let [c (count v2)
           e (int (expt 31 c))]
       ;; (bit-or x 0) wraps the result to a signed 32-bit integer so the
       ;; value stays in the same range as the Clojure branch produces.
       (bit-or (+ (imul e (.-premix-hashcode v1))
                  (- (.-premix-hashcode v2) e))
               0))))
245 | 
;; Unmixed ordered hash of a collection: starting from 1, each element folds
;; in as 31 * acc + (hash element).
;; clj: when clojure.core/mix-collection-hash cannot be resolved at compile
;; time, this expands to plain (hash v) instead. `v` is invoked as a function
;; of the index.
#?(:clj
   (defn hash-ordered-coll-without-mix ^long [v]
     (compile-if (resolve 'clojure.core/mix-collection-hash)
       (let [thirty-one (int 31)
             cnt (count v)]
         (loop [acc (int 1) i (int 0)]
           (if (< i cnt)
             (recur (unchecked-add-int
                     (unchecked-multiply-int thirty-one acc)
                     (hash (v i)))
                    (inc i))
             acc)))
       (hash v)))
   :cljs
   (defn ^number hash-ordered-coll-without-mix
     "Returns the partially calculated hash code, still requires a call to mix-collection-hash"
     ([coll]
      (hash-ordered-coll-without-mix 1 coll))
     ([existing-unmixed-hash coll]
      (loop [unmixed-hash existing-unmixed-hash
             coll (seq coll)]
        (if-not (nil? coll)
          ;; (bit-or x 0) keeps the running value a signed 32-bit integer
          (recur (bit-or (+ (imul 31 unmixed-hash) (hash (first coll))) 0) 
                 (next coll))
          unmixed-hash)))))
271 | 
;; cljs printing support: an AutoFlattenSeq prints as its seq.
#?(:cljs
   (extend-protocol IPrintWithWriter
     instaparse.auto-flatten-seq/AutoFlattenSeq
     (-pr-writer [afs writer opts]
       (-pr-writer (seq afs) writer opts))))
277 | 
(defn auto-flatten-seq
  "Builds a clean (not dirty, nothing cached) AutoFlattenSeq holding the
elements of `v`."
  [v]
  (let [items (vec v)
        premix (hash-ordered-coll-without-mix items)
        mixed (hash items)
        size (count items)]
    (AutoFlattenSeq. items premix mixed size false nil)))
284 | 
;; The empty AutoFlattenSeq; returned (with the receiver's metadata) by the
;; `empty` implementations of AutoFlattenSeq.
(def EMPTY (auto-flatten-seq []))
286 | 
;; True when `s` is an AutoFlattenSeq instance.
(defn afs? [s]
  (instance? AutoFlattenSeq s))
289 | 
(defn true-count
  "Number of top-level slots in `v`: for an AutoFlattenSeq this is the size of
its backing vector (nested sequences count as one); otherwise `(count v)`."
  [v]
  (count (if (afs? v)
           (.-v ^AutoFlattenSeq v)
           v)))
294 | 
295 | ;; For hiccup format, we need to be able to convert the seq to a vector.
296 | 
(defn flat-vec-helper
  "Appends the elements of `v` to the transient vector `acc`, recursively
expanding every element that is an AutoFlattenSeq, and returns the transient."
  [acc v]
  (if-let [remaining (seq v)]
    (let [item (first remaining)
          acc' (if (afs? item)
                 (flat-vec-helper acc item)
                 (conj! acc item))]
      (recur acc' (next remaining)))
    acc))
304 | 
(defn flat-vec
  "Flattens a deep vector (such as the one held by a FlattenOnDemandVector)
into a single persistent vector."
  [v]
  (-> (transient [])
      (flat-vec-helper v)
      persistent!))
309 | 
;; Protocol implemented by FlattenOnDemandVector: get-vec returns the flat
;; vector, computing it on first use.
(defprotocol GetVec
  (^PersistentVector get-vec [self]))
312 | 
;; FlattenOnDemandVector: a vector wrapper that holds a deep (nested) vector
;; and flattens it lazily. `v` and `flat` are mutable cells (refs on clj,
;; atoms on cljs); get-vec fills `flat` on first use and clears `v`. Every
;; other operation delegates to the vector returned by get-vec, except that
;; hash and count are answered from the stored `hashcode` / `cnt` fields.
#?(:clj
(deftype FlattenOnDemandVector [v   ; ref containing PersistentVector or nil 
                                ^int hashcode
                                ^int cnt
                                flat] ; ref containing PersistentVector or nil                                
  GetVec
  (get-vec [self] 
           ;; Check outside the transaction first, then again inside it before
           ;; computing and storing the flat vector.
           (when (not @flat)             
             (dosync
               (when (not @flat)
                 (ref-set flat (with-meta (flat-vec @v) (meta @v))) 
                 (ref-set v nil)))) ; clear out v so it can be garbage collected
           @flat)
                    
  Object
  (toString [self] (.toString (get-vec self)))
  (hashCode [self] hashcode)
  ;; NOTE(review): `v` and `flat` are compared as the ref cells themselves,
  ;; not their contents - confirm this is the intended notion of equals.
  (equals [self other]
    (and (instance? FlattenOnDemandVector other)
         (== hashcode (.hashcode ^FlattenOnDemandVector other))
         (== cnt (.cnt ^FlattenOnDemandVector other))
         (= v (.v ^FlattenOnDemandVector other))
         (= flat (.flat ^FlattenOnDemandVector other))))
  clojure.lang.IHashEq
  (hasheq [self] hashcode)
  java.util.Collection
  (iterator [self]
    (.iterator (get-vec self)))
  (size [self]
    cnt)
  (toArray [self]
    (.toArray (get-vec self)))
  clojure.lang.IPersistentCollection
  (equiv [self other]
    ;; Hash and count are compared before the flat vector is forced.
    (or 
      (and (== hashcode (hash other))
           (== cnt (count other))
           (= (get-vec self) other))))
  (empty [self] (with-meta [] (meta self))) 
  clojure.lang.Counted
  (count [self] cnt)
  clojure.lang.IPersistentVector
  (assoc [self i val]
    (assoc (get-vec self) i val))
  (assocN [self i val]
    (.assocN (get-vec self) i val))
  (length [self]
    cnt)
  (cons [self obj]
    (conj (get-vec self) obj))
  clojure.lang.IObj
  (withMeta [self metamap]    
    ;; Metadata lives on whichever vector is currently populated.
    (if @flat
      (FlattenOnDemandVector. (ref @v) hashcode cnt (ref (with-meta @flat metamap)))
      (FlattenOnDemandVector. (ref (with-meta @v metamap)) hashcode cnt (ref @flat))))
  clojure.lang.IMeta
  (meta [self]
    (if @flat (meta @flat) (meta @v)))
  clojure.lang.Seqable
  (seq [self]
    (seq (get-vec self)))
  clojure.lang.ILookup
  (valAt [self key]
    (.valAt (get-vec self) key))
  (valAt [self key not-found]
    (.valAt (get-vec self) key not-found))
  clojure.lang.Indexed
  (nth [self i]
    (.nth (get-vec self) i))
  (nth [self i not-found]
    (.nth (get-vec self) i not-found))
  clojure.lang.IFn
  (invoke [self arg]
    (.invoke (get-vec self) arg))
  (applyTo [self arglist]
    (.applyTo (get-vec self) arglist))
  clojure.lang.Reversible
  (rseq [self]
    (if (pos? cnt)
      (rseq (get-vec self))
      nil))
  clojure.lang.IPersistentStack
  (peek [self] 
    (peek (get-vec self)))
  (pop [self] 
    (pop (get-vec self)))
  clojure.lang.Associative
  (containsKey [self k]
    (.containsKey (get-vec self) k))
  (entryAt [self k]
    (.entryAt (get-vec self) k))
  IKVReduce
  (kv-reduce [self f init]
    (.kvreduce (get-vec self) f init))
  java.lang.Comparable
  (compareTo [self that]
    (.compareTo (get-vec self) that))
  java.util.List
  (get [self i] (nth (get-vec self) i))
  (indexOf [self o] (.indexOf (get-vec self) o))
  (lastIndexOf [self o] (.lastIndexOf (get-vec self) o))
  (listIterator [self]
    (.listIterator (get-vec self) 0))
  (listIterator [self i]
    (.listIterator (get-vec self) i))
  (subList [self a z]
    (.subList (get-vec self) a z))
  )
:cljs
(deftype FlattenOnDemandVector [v   ; atom containing PersistentVector or nil 
                                ^number hashcode
                                ^number cnt
                                flat] ; atom containing PersistentVector or nil
  GetVec
  (get-vec [self] 
    (when (not @flat)             
      (swap! flat (fn [_] (with-meta (flat-vec @v) (meta @v))))
      (swap! v (fn [_] nil))) ; clear out v so it can be garbage collected 
    @flat)
  
  Object
  (toString [self]
    (pr-str* (get-vec self)))
  IHash
  (-hash [self] hashcode)
  IEquiv
  (-equiv [self other]
    (or 
     (and (= hashcode (hash other))
          (= cnt (count other))
          (= (get-vec self) other))))
  IEmptyableCollection
  (-empty [self] (with-meta [] (meta self))) 
  ICounted
  (-count [self] cnt)
  IVector
  (-assoc-n [self i val]
    (-assoc-n (get-vec self) i val))
  ICollection
  (-conj [self obj]
    (conj (get-vec self) obj))
  IWithMeta
  (-with-meta [self metamap]    
    (if @flat
      (FlattenOnDemandVector. (atom @v) hashcode cnt (atom (with-meta @flat metamap)))
      (FlattenOnDemandVector. (atom (with-meta @v metamap)) hashcode cnt (atom @flat))))
  IMeta
  (-meta [self]
    (if @flat (meta @flat) (meta @v)))
  ISequential
  ISeqable
  (-seq [self]
    (seq (get-vec self)))
  ILookup
  (-lookup [self key]
    (-lookup (get-vec self) key))
  (-lookup [self key not-found]
    (-lookup (get-vec self) key not-found))
  IIndexed
  (-nth [self i]
    (-nth (get-vec self) i))
  (-nth [self i not-found]
    (-nth (get-vec self) i not-found))
  IFn
  (-invoke [self arg]
    (-invoke (get-vec self) arg))
  (-invoke [self arg not-found]
    (-invoke (get-vec self) arg not-found))
  IReversible
  (-rseq [self]
    (if (pos? cnt)
      (rseq (get-vec self))
      nil))
  IStack
  (-peek [self] 
    (-peek (get-vec self)))
  (-pop [self] 
    (-pop (get-vec self)))
  IAssociative
  (-assoc [self i val]
    (assoc (get-vec self) i val))
  (-contains-key? [self k]
    (-contains-key? (get-vec self) k))
  IKVReduce
  (-kv-reduce [self f init]
    (-kv-reduce (get-vec self) f init))
  IComparable
  (-compare [self that]
    (-compare (get-vec self) that))
  ))
503 | 
;; cljs printing support: a FlattenOnDemandVector prints as its flat vector.
#?(:cljs
   (extend-protocol IPrintWithWriter
     instaparse.auto-flatten-seq/FlattenOnDemandVector
     (-pr-writer [v writer opts]
       (-pr-writer (get-vec v) writer opts))))
509 | 
(defn convert-afs-to-vec
  "Returns a vector view of `afs`:
  - not dirty: the backing vector itself;
  - dirty with a cached seq: a vector built from that seq;
  - dirty without a cached seq: a FlattenOnDemandVector wrapping the backing
    vector, so flattening is deferred until needed."
  [^AutoFlattenSeq afs]
  (if-not (.-dirty afs)
    (.-v afs)
    (if (cached? afs)
      (vec (seq afs))
      ;; The mutable cells are refs on Clojure and atoms on ClojureScript.
      (FlattenOnDemandVector.
       (#?(:clj ref :cljs atom) (.-v afs))
       (.-hashcode afs)
       (.-cnt afs)
       (#?(:clj ref :cljs atom) nil)))))
529 | 


--------------------------------------------------------------------------------
/src/instaparse/cfg.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.cfg
  2 |   "This is the context free grammar that recognizes context free grammars."
  3 |   (:refer-clojure :exclude [cat])
  4 |   (:require [instaparse.combinators-source :refer
  5 |              [Epsilon opt plus star rep alt ord cat string-ci string
  6 |               string-ci regexp nt look neg hide hide-tag]]
  7 |             [instaparse.reduction :refer [apply-standard-reductions]]
  8 |             [instaparse.gll :refer [parse]]
  9 |             [instaparse.util :refer [throw-illegal-argument-exception
 10 |                                      throw-runtime-exception]]
 11 |             [clojure.string :as str]
 12 |             #?(:cljs [cljs.tools.reader :as reader])
 13 |             #?(:cljs [cljs.tools.reader.reader-types :as readers])))
 14 | 
;; Consulted by string+ whenever a string literal terminal is built.
(def ^:dynamic *case-insensitive-literals*
  "Sets whether all string literal terminals in a built grammar
  will be treated as case insensitive.

  `true`: case-insensitive
  `false`: case-sensitive
  `:default`: case-sensitive for EBNF, case-insensitive for ABNF"
  :default)
 23 | 
 24 | (defn string+
 25 |   "Returns a string combinator that may be case-insensntive, based
 26 |   on (in priority order):
 27 | 
 28 |   1) the value of `*case-insensitive-literals*`, if it has been
 29 |   overridden to a boolean
 30 |   2) the supplied `ci-by-default?` parameter"
 31 |   [s ci-by-default?]
 32 |   (case *case-insensitive-literals*
 33 |     true (string-ci s)
 34 |     false (string s)
 35 |     :default (if ci-by-default? (string-ci s) (string s))))
 36 | 
 37 | (defn regex-doc
 38 |   "Adds a comment to a Clojure regex, or no-op in ClojureScript"
 39 |   [pattern-str comment]
 40 |   #?(:clj (re-pattern (str pattern-str "(?x) #" comment))
 41 |      :cljs (re-pattern pattern-str)))
 42 | 
;; Lexical patterns used by the grammar-of-grammars below.
;; The four quoted-literal patterns share one shape: an opening
;; delimiter, then runs of characters that are neither the quote nor a
;; backslash, interleaved with backslash-escaped characters, then the
;; closing quote.  The regexp variants additionally require a leading #.
(def single-quoted-string (regex-doc #"'[^'\\]*(?:\\.[^'\\]*)*'" "Single-quoted string"))
(def single-quoted-regexp (regex-doc #"#'[^'\\]*(?:\\.[^'\\]*)*'" "Single-quoted regexp"))
(def double-quoted-string (regex-doc #"\"[^\"\\]*(?:\\.[^\"\\]*)*\"" "Double-quoted string"))
(def double-quoted-regexp (regex-doc #"#\"[^\"\\]*(?:\\.[^\"\\]*)*\"" "Double-quoted regexp"))
;; Text inside a (* ... *) comment: any run of characters, newlines
;; included, that contains neither "(*" nor "*)".  The cljs variant
;; spells "any character" as [\s\S] instead of using the (?s) flag.
(def inside-comment #?(:clj #"(?s)(?:(?!(?:\(\*|\*\))).)*(?x) #Comment text"
                       :cljs #"(?:(?!(?:\(\*|\*\)))[\s\S])*"))
;; Commas count as whitespace in grammar specifications.
(def ws (regex-doc "[,\\s]*" "optional whitespace"))
 50 | 
;; Hidden reference to the :opt-whitespace rule; used throughout make-cfg
;; wherever whitespace/comments may appear without showing up in the tree.
(def opt-whitespace (hide (nt :opt-whitespace)))

;; A non-terminal name: one or more characters, none of which is a comma,
;; whitespace, or one of the punctuation characters listed in the class
;; (this variant excludes both '.' and '/').
(def non-terminal
  (regex-doc "[^, \\r\\t\\n<>(){}\\[\\]+*?:=|'\"#&!;./]+" "Non-terminal"))
 55 | 
 56 | (def non-terminal-namespace-allowed
 57 |   (let [no-slash "[^, \\r\\t\\n<>(){}\\[\\]+*?:=|'\"#&!;/.]"
 58 |         with-slash "[^, \\r\\t\\n<>(){}\\[\\]+*?:=|'\"#&!;]"]
 59 |     (regex-doc (str no-slash with-slash "*") "Non-terminal-namespace-allowed")))
 60 | 
(defn make-cfg
  "Builds the combinator grammar map that describes instaparse's EBNF
  notation itself, with the standard :hiccup reductions applied.

  `allow-namespaced-nts?` selects which regex the :nt rule uses:
  `non-terminal-namespace-allowed` when truthy, `non-terminal` otherwise."
  [allow-namespaced-nts?]
  (apply-standard-reductions
   :hiccup    ; use the hiccup output format
   {:rules (hide-tag (cat opt-whitespace
                          (plus (nt :rule))))
    ;; Comments are delimited by (* and *) and may nest.
    :comment (cat (string "(*") (nt :inside-comment) (string "*)"))
    :inside-comment (cat (regexp inside-comment)
                         (star (cat (nt :comment)
                                    (regexp inside-comment))))
    ;; Whitespace may be interleaved with any number of comments.
    :opt-whitespace (cat (regexp ws)
                         (star (cat (nt :comment)
                                    (regexp ws))))
    ;; Accepted separators between a rule's name and its body.
    :rule-separator (alt (string ":")
                         (string ":=")
                         (string "::=")
                         (string "="))
    ;; name (optionally <hidden>), separator, body, and an optional
    ;; terminating ";" or "."
    :rule (cat (alt (nt :nt)
                    (nt :hide-nt))
               opt-whitespace
               (hide (nt :rule-separator))
               opt-whitespace
               (nt :alt-or-ord)
               (hide (alt (nt :opt-whitespace)
                          (cat (nt :opt-whitespace) (alt (string ";") (string ".")) (nt :opt-whitespace)))))
    ;; A non-terminal name; the negative lookahead rejects the epsilon
    ;; keywords so they are not read as rule names.
    :nt (cat
         (neg (nt :epsilon))
         (regexp
          (if allow-namespaced-nts?
            non-terminal-namespace-allowed
            non-terminal)))
    :hide-nt (cat (hide (string "<"))
                  opt-whitespace
                  (nt :nt)
                  opt-whitespace
                  (hide (string ">")))
    :alt-or-ord (hide-tag (alt (nt :alt) (nt :ord)))
    ;; :alt uses star (a lone :cat is an :alt), :ord uses plus (at least
    ;; one "/" is required).
    :alt (cat (nt :cat)
              (star
               (cat
                opt-whitespace
                (hide (string "|"))
                opt-whitespace
                (nt :cat))))
    :ord (cat (nt :cat)
              (plus
               (cat
                opt-whitespace
                (hide (string "/"))
                opt-whitespace
                (nt :cat))))
    :paren (cat (hide (string "("))
                opt-whitespace
                (nt :alt-or-ord)
                opt-whitespace
                (hide (string ")")))
    :hide (cat (hide (string "<"))
               opt-whitespace
               (nt :alt-or-ord)
               opt-whitespace
               (hide (string ">")))
    :cat (plus (cat
                opt-whitespace
                (alt (nt :factor) (nt :look) (nt :neg))
                opt-whitespace))
    :string (alt
             (regexp single-quoted-string)
             (regexp double-quoted-string))
    :regexp (alt
             (regexp single-quoted-regexp)
             (regexp double-quoted-regexp))
    ;; Optional: either [ ... ] or a postfix ?
    :opt (alt
          (cat (hide (string "["))
               opt-whitespace
               (nt :alt-or-ord)
               opt-whitespace
               (hide (string "]")))
          (cat (nt :factor)
               opt-whitespace
               (hide (string "?"))))
    ;; Zero or more: either { ... } or a postfix *
    :star (alt
           (cat (hide (string "{"))
                opt-whitespace
                (nt :alt-or-ord)
                opt-whitespace
                (hide (string "}")))
           (cat (nt :factor)
                opt-whitespace
                (hide (string "*"))))
    :plus (cat (nt :factor)
               opt-whitespace
               (hide (string "+")))
    :look (cat (hide (string "&"))
               opt-whitespace
               (nt :factor))
    :neg (cat (hide (string "!"))
              opt-whitespace
              (nt :factor))
    ;; Accepted spellings of the empty production (the last is the Greek
    ;; letter epsilon, U+03B5).
    :epsilon (alt (string "Epsilon")
                  (string "epsilon")
                  (string "EPSILON")
                  (string "eps")
                  (string "\u03b5"))
    :factor (hide-tag (alt (nt :nt)
                           (nt :string)
                           (nt :regexp)
                           (nt :opt)
                           (nt :star)
                           (nt :plus)
                           (nt :paren)
                           (nt :hide)
                           (nt :epsilon)))
    ;; extra entrypoint to be used by the ebnf combinator
    :rules-or-parser (hide-tag (alt (nt :rules) (nt :alt-or-ord)))}))
174 | 
;; The two prebuilt grammars-of-grammars; build-parser picks between them
;; based on its allow-namespaced-nts? argument.
(def cfg (make-cfg false)) ;; the original parser for instaparse's ebnf notation flavor of context-free grammars
(def cfg-allow-namespaced-nts (make-cfg true))  ;; new version recognizes namespaced non-terminals
177 | 
;; Internally, the grammar specification is parsed into a hiccup tree,
;; i.e. nodes of the form [tag child1 child2 ...].
;; These accessors extract the relevant pieces of such a node:
(def tag first)       ; the node's tag
(def contents next)   ; all children (nil when there are none)
(def content fnext)   ; the first child only
183 | 
184 | ;;;; Helper functions for reading strings and regexes
185 | 
(defn escape
  "Rewrites quote escaping in `s`: a backslash followed by a single-quote
  becomes a bare single-quote, an unescaped double-quote gains a leading
  backslash, and every other backslash pair is copied through unchanged.
  Throws a runtime exception if `s` ends in a lone backslash."
  [s]
  (loop [remaining (seq s), out []]
    (if (empty? remaining)
      (apply str out)
      (let [ch (first remaining)
            following (second remaining)]
        (cond
          ;; bare double-quote: escape it
          (= ch \") (recur (next remaining) (conj out \\ \"))
          ;; ordinary character: copy
          (not= ch \\) (recur (next remaining) (conj out ch))
          ;; from here on ch is a backslash
          (nil? following) (throw-runtime-exception
                             "Encountered backslash character at end of string: " s)
          ;; \' loses its backslash
          (= following \') (recur (drop 2 remaining) (conj out following))
          ;; any other escape pair is kept verbatim
          :else (recur (drop 2 remaining) (conj out ch following)))))))
201 | 
202 | ;(defn safe-read-string [s]
203 | ;  (binding [*read-eval* false]
204 | ;    (read-string s)))
205 | 
#?(:clj
   (defn wrap-reader
     "Adapts `reader` to a two-argument calling convention.  When
     *clojure-version* has major <= 1 and minor <= 6 the reader is
     returned untouched; otherwise it is wrapped in a function that
     supplies an empty map and a fresh java.util.LinkedList as the third
     and fourth arguments."
     [reader]
     (let [{:keys [major minor]} *clojure-version*
           legacy? (and (<= major 1) (<= minor 6))]
       (if legacy?
         reader
         (fn [r s] (reader r s {} (java.util.LinkedList.)))))))
212 | 
;; safe-read-string: reads the body of a string literal using the host
;; platform's string reader, rather than a general-purpose read.
#?(:clj
   ;; Clojure: one LispReader string-reader instance is created up front,
   ;; adapted through wrap-reader, and shared by every call.
   (let [string-reader (wrap-reader
                        (clojure.lang.LispReader$StringReader.))]
     (defn safe-read-string
       "Expects a double-quote at the end of the string"
       [s]
       ;; *in* is rebound to read from s for the duration of the call.
       ;; NOTE(review): presumably s carries no opening quote because the
       ;; string reader is normally invoked after it is consumed -- confirm.
       (with-in-str s (string-reader *in* nil))))

   :cljs
   ;; ClojureScript: uses the private read-string* of cljs.tools.reader.
   (let [read-string* @#'reader/read-string*] ;; since read-string* is private
     (defn safe-read-string [s]
       (read-string* (readers/string-push-back-reader s) nil nil nil))))
225 | 
226 | ; I think re-pattern is sufficient, but here's how to do it without.
227 | ;(let [regexp-reader (clojure.lang.LispReader$RegexReader.)]
228 | ;  (defn safe-read-regexp
229 | ;    "Expects a double-quote at the end of the string"
230 | ;    [s]
231 | ;    (with-in-str s (regexp-reader *in* nil))))
232 | 
(defn process-string
  "Turns the source text of a quoted string literal into its string
  value: drops the first and last character (the delimiters),
  normalizes quote escaping with `escape`, appends a closing
  double-quote and hands the result to `safe-read-string`."
  [s]
  (-> s
      (subs 1 (dec (count s)))
      escape
      (str \")
      safe-read-string))
244 | 
(defn process-regexp
  "Turns the source text of a quoted regexp literal into a compiled
  regex: drops the first two characters and the last one (the
  delimiters), normalizes quote escaping with `escape`, and compiles
  the result with `re-pattern`."
  [s]
  (-> s
      (subs 2 (dec (count s)))
      escape
      re-pattern))
258 | 
259 | ;;; Now we need to convert the grammar's parse tree into combinators
260 | 
(defn build-rule
  "Converts one node of the parsed grammar specification into the
  equivalent combinator.  A :rule node yields a [keyword combinator]
  pair (wrapped with hide-tag when the rule name was written as <name>);
  every other node yields a combinator."
  [tree]
  (let [child    #(build-rule (content tree))          ; convert the only child
        children #(map build-rule (contents tree))]    ; convert every child
    (case (tag tree)
      :rule (let [[lhs rhs] (contents tree)
                  body (build-rule rhs)]
              (if (= :hide-nt (tag lhs))
                ;; lhs is [:hide-nt [:nt name]]
                [(keyword (content (content lhs))) (hide-tag body)]
                [(keyword (content lhs)) body]))
      :nt (nt (keyword (content tree)))
      :alt (apply alt (children))
      :ord (apply ord (children))
      :paren (recur (content tree))
      :hide (hide (child))
      :cat (apply cat (children))
      :string (string+ (process-string (content tree)) false)
      :regexp (regexp (process-regexp (content tree)))
      :opt (opt (child))
      :star (star (child))
      :plus (plus (child))
      :look (look (child))
      :neg (neg (child))
      :epsilon Epsilon)))
285 | 
(defn seq-nt
  "Walks a combinator and returns, in order, the keyword of every
  non-terminal reference found inside it (duplicates included)."
  [parser]
  (let [{:keys [tag parsers]} parser]
    (case tag
      ;; a reference contributes its own keyword
      :nt [(:keyword parser)]
      ;; terminals contribute nothing
      (:string :string-ci :char :regexp :epsilon) []
      ;; single-child wrappers
      (:opt :plus :star :look :neg :rep) (seq-nt (:parser parser))
      ;; n-ary combinators
      (:alt :cat) (mapcat seq-nt parsers)
      ;; ordered choice keeps its two branches under separate keys
      :ord (mapcat seq-nt ((juxt :parser1 :parser2) parser)))))
296 | 
(defn check-grammar
  "Verifies that every non-terminal referenced by some production of
  `grammar-map` is also a key of `grammar-map`; throws a runtime
  exception naming the first offender.  Returns `grammar-map`."
  [grammar-map]
  (let [defined? (set (keys grammar-map))
        referenced (distinct (mapcat seq-nt (vals grammar-map)))]
    (doseq [used-nt referenced
            :when (not (defined? used-nt))]
      (throw-runtime-exception
        ;; drop the leading colon of the keyword
        (subs (str used-nt) 1)
        " occurs on the right-hand side of your grammar, but not on the left")))
  grammar-map)
307 | 
(defn build-parser
  "Parses the grammar specification string `spec` and returns a map with
  :grammar (checked, with standard reductions for `output-format`),
  :start-production (the name of the first rule) and :output-format.
  Throws a runtime exception if `spec` cannot be parsed.
  `allow-namespaced-nts?` (default false) selects the grammar-of-grammars
  that accepts namespaced non-terminals."
  ([spec output-format] (build-parser spec output-format false))
  ([spec output-format allow-namespaced-nts?]
   (let [meta-grammar (if allow-namespaced-nts? cfg-allow-namespaced-nts cfg)
         rules (parse meta-grammar :rules spec false)]
     (if-not (instance? instaparse.gll.Failure rules)
       (let [productions (map build-rule rules)
             start-production (ffirst productions)
             grammar (->> productions
                          (into {})
                          (apply-standard-reductions output-format)
                          check-grammar)]
         {:grammar grammar
          :start-production start-production
          :output-format output-format})
       (throw-runtime-exception
        "Error parsing grammar specification:\n"
        (with-out-str (println rules)))))))
321 | 
(defn build-parser-from-combinators
  "Builds the parser map (:grammar, :start-production, :output-format)
  from a map of combinators.  `start-production` is mandatory; passing
  nil throws an illegal-argument exception."
  [grammar-map output-format start-production]
  (if (some? start-production)
    (let [grammar (->> grammar-map
                       (apply-standard-reductions output-format)
                       check-grammar)]
      {:grammar grammar
       :start-production start-production
       :output-format output-format})
    (throw-illegal-argument-exception
      "When you build a parser from a map of parser combinators, you must provide a start production using the :start keyword argument.")))
329 | 
(defn ebnf
  "Takes an EBNF grammar specification string and returns the combinator version.
If you give it the right-hand side of a rule, it will return the combinator equivalent.
If you give it a series of rules, it will give you back a grammar map.
Useful for combining with other combinators.
The optional :string-ci keyword argument is bound to
*case-insensitive-literals* while the combinators are built."
  [spec & {:as opts}]
  (binding [*case-insensitive-literals* (:string-ci opts :default)]
    (let [rules (parse cfg :rules-or-parser spec false)]
      (if (instance? instaparse.gll.Failure rules)
        (throw-runtime-exception
          "Error parsing grammar specification:\n"
          (with-out-str (println rules)))
        (if (= :rule (ffirst rules))
          ;; a series of rules -> grammar map
          (into {} (map build-rule rules))
          ;; a bare right-hand side -> single combinator
          (build-rule (first rules)))))))
347 | 


--------------------------------------------------------------------------------
/src/instaparse/combinators.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.combinators
 2 |   "The combinator public API for instaparse"
 3 |   (:refer-clojure :exclude [cat])
 4 |   #?(:clj (:use instaparse.macros)
 5 |      :cljs (:require-macros
 6 |             [instaparse.macros :refer [defclone]]))
 7 |   (:require [instaparse.combinators-source :as c]
 8 |             [instaparse.cfg :as cfg]
 9 |             [instaparse.abnf :as abnf]))
10 | 
11 | ;; The actual source is in combinators-source.
12 | ;; This was necessary to avoid a cyclical dependency in the namespaces.
13 | 
;; Public combinator API: each name below is a clone of the var on the
;; right.  NOTE(review): defclone is defined in instaparse.macros, which
;; is not visible here; presumably it re-defines the var in this
;; namespace with the source var's value and metadata -- confirm.

;; Core combinators, implemented in instaparse.combinators-source.
(defclone Epsilon c/Epsilon)
(defclone opt c/opt)
(defclone plus c/plus)
(defclone star c/star)
(defclone rep c/rep)
(defclone alt c/alt) 
(defclone ord c/ord)
(defclone cat c/cat)
(defclone string c/string)
(defclone string-ci c/string-ci)
(defclone unicode-char c/unicode-char)
(defclone regexp c/regexp)
(defclone nt c/nt)
(defclone look c/look)
(defclone neg c/neg)
(defclone hide c/hide)
(defclone hide-tag c/hide-tag)

;; Grammar-string front ends, implemented in instaparse.cfg and
;; instaparse.abnf respectively.
(defclone ebnf cfg/ebnf)
(defclone abnf abnf/abnf)
34 |        
35 | 


--------------------------------------------------------------------------------
/src/instaparse/combinators_source.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.combinators-source
  2 |   "This is the underlying implementation of the various combinators."
  3 |   (:refer-clojure :exclude [cat])
  4 |   (:require [instaparse.reduction :refer [singleton? red
  5 |                                           raw-non-terminal-reduction
  6 |                                           reduction-types]]
  7 |             [instaparse.util :refer [throw-illegal-argument-exception #?(:cljs regexp-flags)]]))
  8 | 
;; Ways to build parsers

;; The empty parser.  Most combinators below compare their arguments
;; against this value and collapse to it where possible.
(def Epsilon {:tag :epsilon})
 12 | 
 13 | (defn opt "Optional, i.e., parser?"
 14 |   [parser] 
 15 |   (if (= parser Epsilon) Epsilon
 16 |     {:tag :opt :parser parser}))
 17 | 
 18 | (defn plus "One or more, i.e., parser+"
 19 |   [parser]
 20 |   (if (= parser Epsilon) Epsilon
 21 |     {:tag :plus :parser parser}))
 22 | 
 23 | (defn star "Zero or more, i.e., parser*"
 24 |   [parser] 
 25 |   (if (= parser Epsilon) Epsilon
 26 |     {:tag :star :parser parser}))
 27 | 
 28 | (defn rep "Between m and n repetitions"
 29 |   [m n parser]
 30 |   {:pre [(<= m n)]}
 31 |   (if (= parser Epsilon) Epsilon
 32 |     {:tag :rep :parser parser :min m :max n}))
 33 | 
 34 | (defn alt "Alternation, i.e., parser1 | parser2 | parser3 | ..."
 35 |   [& parsers] 
 36 |   (cond
 37 |     (every? (partial = Epsilon) parsers) Epsilon
 38 |     (singleton? parsers) (first parsers)
 39 |     :else {:tag :alt :parsers parsers}))
 40 | 
 41 | (defn- ord2 [parser1 parser2]
 42 |   {:tag :ord :parser1 parser1 :parser2 parser2})
 43 | 
 44 | (defn ord "Ordered choice, i.e., parser1 / parser2"
 45 |   ([] Epsilon)
 46 |   ([parser1 & parsers]
 47 |     (let [parsers (if (= parser1 Epsilon)
 48 |                     (remove #{Epsilon} parsers)
 49 |                     parsers)]
 50 |       (if (seq parsers)
 51 |         (ord2 parser1 (apply ord parsers))
 52 |         parser1))))
 53 | 
 54 | (defn cat "Concatenation, i.e., parser1 parser2 ..."
 55 |   [& parsers]
 56 |   (if (every? (partial = Epsilon) parsers) Epsilon
 57 |     (let [parsers (remove #{Epsilon} parsers)]
 58 |       (if (singleton? parsers) (first parsers) ; apply vector reduction
 59 |         {:tag :cat :parsers parsers}))))
 60 | 
 61 | (defn string "Create a string terminal out of s" 
 62 |   [s] 
 63 |   (if (= s "") Epsilon
 64 |     {:tag :string :string s}))
 65 | 
 66 | (defn string-ci "Create a case-insensitive string terminal out of s" 
 67 |   [s] 
 68 |   (if (= s "") Epsilon
 69 |       {:tag :string-ci :string s}))
 70 | 
 71 | (defn unicode-char
 72 |   "Matches a Unicode code point or a range of code points"
 73 |   ([code-point]
 74 |    (unicode-char code-point code-point))
 75 |   ([lo hi]
 76 |    (assert (<= lo hi) "Character range minimum must be less than or equal the maximum")
 77 |    {:tag :char :lo lo :hi hi}))
 78 | 
 79 | #?(:cljs
 80 |    (defn- add-beginning-constraint
 81 |      "JavaScript regexes have no .lookingAt method, so in cljs we just
 82 |   add a '^' character to the front of the regex."
 83 |      [r]
 84 |      (if (regexp? r)
 85 |        (js/RegExp. (str "^" (.-source r)) (regexp-flags r))
 86 |        r)))
 87 | 
 88 | (defn regexp "Create a regexp terminal out of regular expression r"
 89 |   [r]
 90 |   (if (= r "") Epsilon
 91 |       {:tag :regexp
 92 |        :regexp (-> (re-pattern r)
 93 |                    #?(:cljs add-beginning-constraint))}))
 94 | 
 95 | (defn nt "Refers to a non-terminal defined by the grammar map"
 96 |   [s] 
 97 |   {:tag :nt :keyword s})
 98 | 
 99 | (defn look "Lookahead, i.e., &parser" 
100 |   [parser] 
101 |   {:tag :look :parser parser}) 
102 | 
(defn neg
  "Negative lookahead, i.e., !parser"
  [parser]
  {:tag :neg, :parser parser})
106 | 
(defn hide
  "Hide the result of parser, i.e., <parser>
  Implemented by setting :hide to true on the combinator map."
  [parser]
  (assoc parser :hide true))
110 | 
(defn hide-tag
  "Hide the tag associated with this rule.
  Wrap this combinator around the entire right-hand side.
  Attaches `raw-non-terminal-reduction` to the parser via `red`."
  [parser]
  (red parser raw-non-terminal-reduction))
115 | 
116 | ; Ways to alter a parser with hidden information, unhiding that information
117 | 
(defn hidden-tag?
  "True when the parser's :red entry is `raw-non-terminal-reduction`,
  i.e. the parser was created with the hide-tag combinator."
  [parser]
  (= raw-non-terminal-reduction (:red parser)))
122 | 
(defn unhide-content
  "Recursively undoes the effect of hide on one parser: removes a truthy
  :hide entry and descends into :parser, :parsers, or the two branches
  of an :ord combinator."
  [parser]
  (let [visible (cond-> parser (:hide parser) (dissoc :hide))]
    (if-let [inner (:parser visible)]
      (assoc visible :parser (unhide-content inner))
      (if-let [inners (:parsers visible)]
        (assoc visible :parsers (map unhide-content inners))
        (if (= :ord (:tag visible))
          (-> visible
              (update-in [:parser1] unhide-content)
              (update-in [:parser2] unhide-content))
          visible)))))
134 | 
(defn unhide-all-content
  "Recursively undoes the effect of hide on all parsers in the grammar.
  Returns a new map with the same keys."
  [grammar]
  (into {}
        (map (fn [[rule-name rule]] [rule-name (unhide-content rule)]))
        grammar))
140 | 
(defn unhide-tags
  "Undoes the effect of hide-tag: every rule of `grammar` gets its :red
  entry replaced by the reduction that `reduction-type` prescribes for
  the rule's name.  Throws an illegal-argument exception for an unknown
  `reduction-type`."
  [reduction-type grammar]
  (if-let [reduction (reduction-types reduction-type)]
    (into {}
          (map (fn [[rule-name rule]]
                 [rule-name (assoc rule :red (reduction rule-name))]))
          grammar)
    (throw-illegal-argument-exception
      "Invalid output format " reduction-type ". Use :enlive or :hiccup.")))
149 | 
(defn unhide-all
  "Undoes the effect of both hide and hide-tag: every rule of `grammar`
  has its content unhidden and its :red entry replaced by the reduction
  that `reduction-type` prescribes for the rule's name.  Throws an
  illegal-argument exception for an unknown `reduction-type`."
  [reduction-type grammar]
  (if-let [reduction (reduction-types reduction-type)]
    (into {}
          (map (fn [[rule-name rule]]
                 [rule-name (assoc (unhide-content rule)
                                   :red (reduction rule-name))]))
          grammar)
    (throw-illegal-argument-exception
      "Invalid output format " reduction-type ". Use :enlive or :hiccup.")))
158 | 
159 | 
160 | ;; New beta feature: automatically add whitespace
161 | 
(defn auto-whitespace-parser
  "Rewrites `parser` so that every terminal inside it is preceded by
  `ws-parser`.

  Non-terminal references and Epsilon are returned unchanged; wrapper
  and n-ary combinators are rewritten recursively; terminals (:string,
  :string-ci, :char and :regexp) become (cat ws-parser terminal).

  :char terminals, as produced by `unicode-char`, are handled like the
  other terminals; without a clause for them the `case` would throw
  \"No matching clause: :char\"."
  [parser ws-parser]
  (case (:tag parser)
    (:nt :epsilon) parser
    (:opt :plus :star :rep :look :neg) (update-in parser [:parser] auto-whitespace-parser ws-parser)
    (:alt :cat) (assoc parser :parsers
                       (map #(auto-whitespace-parser % ws-parser) (:parsers parser)))
    :ord (assoc parser
                :parser1 (auto-whitespace-parser (:parser1 parser) ws-parser)
                :parser2 (auto-whitespace-parser (:parser2 parser) ws-parser))
    (:string :string-ci :char :regexp)
    ; If the terminal has a reduction associated with it,
    ; we need to "lift" that reduction out to the (cat whitespace terminal)
    ; parser that is being created.
    (if (:red parser)
      (assoc (cat ws-parser (dissoc parser :red)) :red (:red parser))
      (cat ws-parser parser))))
178 | 
(defn auto-whitespace
  "Combines `grammar` with the whitespace grammar `grammar-ws`.
  Every terminal of `grammar` is prefixed with a hidden, optional
  reference to `start-ws`; the `start` rule additionally gets that
  reference appended (its :red entry is preserved on the new
  concatenation); the `start-ws` rule is wrapped with hide-tag; finally
  both grammars are merged, with `grammar-ws` entries taking precedence."
  [grammar start grammar-ws start-ws]
  (let [skip-ws (hide (opt (nt start-ws)))
        ws-rules (assoc grammar-ws start-ws (hide-tag (grammar-ws start-ws)))
        padded (into {}
                     (map (fn [[rule-name rule]]
                            [rule-name (auto-whitespace-parser rule skip-ws)]))
                     grammar)
        start-rule (padded start)
        ;; allow trailing whitespace after the start production
        start-with-trailing-ws (-> (cat (dissoc start-rule :red) skip-ws)
                                   (assoc :red (:red start-rule)))]
    (merge (assoc padded start start-with-trailing-ws)
           ws-rules)))
190 | 


--------------------------------------------------------------------------------
/src/instaparse/core.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.core
  2 |   #?(:cljs
  3 |      (:require-macros [instaparse.core]
  4 |                       [instaparse.macros :refer [defclone set-global-var!]]))
  5 |   (:require [clojure.walk :as walk]
  6 |             [instaparse.gll :as gll]
  7 |             [instaparse.cfg :as cfg]
  8 |             [instaparse.failure :as fail]
  9 |             [instaparse.print :as print]
 10 |             [instaparse.reduction :as red]
 11 |             [instaparse.transform :as t]
 12 |             [instaparse.abnf :as abnf]
 13 |             [instaparse.repeat :as repeat]
 14 |             [instaparse.combinators-source :as c]
 15 |             [instaparse.line-col :as lc]
 16 |             [instaparse.viz :as viz]
 17 |             [instaparse.util :refer [throw-illegal-argument-exception]]
 18 |             #?(:clj [instaparse.macros :refer [defclone set-global-var!]])))
 19 | 
;; Output format used by `parser` when no :output-format option is given.
(def ^:dynamic *default-output-format* :hiccup)
(defn set-default-output-format!
  "Changes the default output format.  Input should be :hiccup or :enlive
  (enforced by a precondition)."
  [type]
  {:pre [(#{:hiccup :enlive} type)]}
  (set-global-var! *default-output-format* type))
 26 | 
;; Input format used by `parser` when no :input-format option is given.
(def ^:dynamic *default-input-format* :ebnf)
(defn set-default-input-format!
  "Changes the default input format.  Input should be :abnf or :ebnf
  (enforced by a precondition)."
  [type]
  {:pre [(#{:abnf :ebnf} type)]}
  (set-global-var! *default-input-format* type))
 33 | 
;; Forward declarations: these vars are referenced by `parse`, `parses`
;; and `parser` before their definitions appear in this namespace.
(declare failure? standard-whitespace-parsers enable-tracing!)
 35 | 
 36 | (defn- unhide-parser [parser unhide]
 37 |   (case unhide
 38 |     nil parser
 39 |     :content 
 40 |     (assoc parser :grammar (c/unhide-all-content (:grammar parser)))
 41 |     :tags 
 42 |     (assoc parser :grammar (c/unhide-tags (:output-format parser) 
 43 |                                           (:grammar parser)))
 44 |     :all
 45 |     (assoc parser :grammar (c/unhide-all (:output-format parser)
 46 |                                          (:grammar parser)))))
 47 |   
(defn parse 
  "Use parser to parse the text.  Returns first parse tree found
   that completely parses the text.  If no parse tree is possible, returns
   a Failure object.
   
   Optional keyword arguments:
   :start :keyword  (where :keyword is name of starting production rule)
   :partial true    (parses that don't consume the whole string are okay)
   :total true      (if parse fails, embed failure node in tree)
   :unhide <:tags or :content or :all> (for this parse, disable hiding)
   :optimize :memory   (when possible, employ strategy to use less memory)

   Clj only:
   :trace true      (print diagnostic trace while parsing)

   When :total is set it takes precedence over :optimize; :optimize is
   also ignored for partial parses."
  [parser text &{:as options}]
  {:pre [(contains? #{:tags :content :all nil} (get options :unhide))
         (contains? #{:memory nil} (get options :optimize))]}
  (let [start-production 
        ;; an explicit :start overrides the parser's own start production
        (get options :start (:start-production parser)),
        
        partial?
        (get options :partial false)
        
        optimize?
        (get options :optimize false)
        
        unhide
        (get options :unhide)
        
        trace?
        (get options :trace false)
        
        ;; Clj only: switch tracing on before parsing when it was requested
        ;; and gll/TRACE is not already set.
        #?@(:clj [_ (when (and trace? (not gll/TRACE)) (enable-tracing!))])
        
        parser (unhide-parser parser unhide)]
    (->> (cond
           (:total options)
           (gll/parse-total (:grammar parser) start-production text 
                            partial? (red/node-builders (:output-format parser)))

           ;; try the repeating-parse strategy first and fall back to the
           ;; regular parse if it fails
           (and optimize? (not partial?))
           (let [result (repeat/try-repeating-parse-strategy parser text start-production)]
             (if (failure? result)
               (gll/parse (:grammar parser) start-production text partial?)
               result))

           :else
           (gll/parse (:grammar parser) start-production text partial?))

         ;; Clj only: the whole cond is threaded into gll/bind-trace as
         ;; its last argument; in cljs the cond value is returned as is.
         #?(:clj (gll/bind-trace trace?)))))
 98 |   
(defn parses 
  "Use parser to parse the text.  Returns lazy seq of all parse trees
   that completely parse the text.  If no parse tree is possible, returns
   () with a Failure object attached as metadata.
   
   Optional keyword arguments:
   :start :keyword  (where :keyword is name of starting production rule)
   :partial true    (parses that don't consume the whole string are okay)
   :total true      (if parse fails, embed failure node in tree)
   :unhide <:tags or :content or :all> (for this parse, disable hiding)

   Clj only:
   :trace true      (print diagnostic trace while parsing)"
  [parser text &{:as options}]
  {:pre [(contains? #{:tags :content :all nil} (get options :unhide))]}
  (let [start-production 
        ;; an explicit :start overrides the parser's own start production
        (get options :start (:start-production parser)),
        
        partial?
        (get options :partial false)
        
        unhide
        (get options :unhide)
        
        trace?
        (get options :trace false)
        
        ;; Clj only: switch tracing on before parsing when it was requested
        ;; and gll/TRACE is not already set.
        #?@(:clj [_ (when (and trace? (not gll/TRACE)) (enable-tracing!))])
        
        parser (unhide-parser parser unhide)]
    (->> (cond
           (:total options)
           (gll/parses-total (:grammar parser) start-production text 
                             partial? (red/node-builders (:output-format parser)))
        
           :else
           (gll/parses (:grammar parser) start-production text partial?))

         ;; Clj only: the whole cond is threaded into gll/bind-trace as
         ;; its last argument; in cljs the cond value is returned as is.
         #?(:clj (gll/bind-trace trace?)))))
138 |   
;; A built parser.  Besides being a record of its grammar, start
;; production and output format, a Parser is callable: invoking it with
;; a text and optional key/value options delegates to `parse`.
(defrecord Parser [grammar start-production output-format]
#?@(:clj
    [clojure.lang.IFn
     ;; explicit arities for zero to three key/value options;
     ;; applyTo covers calls made through apply
     (invoke [parser text] (parse parser text))
     (invoke [parser text key1 val1] (parse parser text key1 val1))
     (invoke [parser text key1 val1 key2 val2] (parse parser text key1 val1 key2 val2))
     (invoke [parser text key1 val1 key2 val2 key3 val3] (parse parser text key1 val1 key2 val2 key3 val3))
     (applyTo [parser args] (apply parse parser args))]

    :cljs
    [IFn
     ;; one arity per even number of option arguments; each arity must
     ;; forward every argument it receives to parse
     (-invoke [parser text] (parse parser text))
     (-invoke [parser text key1 val1] (parse parser text key1 val1))
     (-invoke [parser text key1 val1 key2 val2] (parse parser text key1 val1 key2 val2))
     (-invoke [parser text key1 val1 key2 val2 key3 val3] (parse parser text key1 val1 key2 val2 key3 val3))
     (-invoke [parser text a b c d e f g h] (parse parser text a b c d e f g h))
     (-invoke [parser text a b c d e f g h i j] (parse parser text a b c d e f g h i j))
     (-invoke [parser text a b c d e f g h i j k l] (parse parser text a b c d e f g h i j k l))
     (-invoke [parser text a b c d e f g h i j k l m n] (parse parser text a b c d e f g h i j k l m n))
     (-invoke [parser text a b c d e f g h i j k l m n o p] (parse parser text a b c d e f g h i j k l m n o p))
     ;; q and r (the ninth key/value pair) are forwarded as well
     (-invoke [parser text a b c d e f g h i j k l m n o p q r] (parse parser text a b c d e f g h i j k l m n o p q r))
     ;; largest arity: `more` holds the remaining arguments as a sequence
     (-invoke [parser text a b c d e f g h i j k l m n o p q r s more] (apply parse parser text a b c d e f g h i j k l m n o p q r s more))]))
161 | 
;; Printing support: a Parser prints as the text produced by
;; print/Parser->str.
#?(:clj
   ;; Clojure: print through *out* rebound to the target writer;
   ;; println appends a newline.
   (defmethod clojure.core/print-method Parser [parser out]
     (binding [*out* out]
       (println (print/Parser->str parser))))
   :cljs
   ;; ClojureScript: write the text straight to the writer.
   (extend-protocol IPrintWithWriter
     instaparse.core/Parser
     (-pr-writer [this out _opts]
       (-write out (print/Parser->str this)))))
171 | 
(defn parser
  "Builds a parser for a grammar.  `grammar-specification` may be a string
  containing a context-free grammar, a URI for a text file containing such
  a grammar (Clj only), or a map or vector of parser combinators.  When a
  vector is given and no :start option is supplied, its first element is
  used as the start production.

  Optional keyword arguments:
  :input-format :ebnf
  or
  :input-format :abnf

  :output-format :enlive
  or
  :output-format :hiccup

  :start :keyword (where :keyword is name of starting production rule)

  :string-ci true (treat all string literals as case insensitive)

  :allow-namespaced-nts true (allow namespaced non-terminals in parser specification;
                              parser's output will use corresponding namespaced keywords)

  :auto-whitespace (:standard or :comma)
  or
  :auto-whitespace custom-whitespace-parser

  Clj only:
  :no-slurp true (disables use of slurp to auto-detect whether
                  input is a URI.  When using this option, input
                  must be a grammar string or grammar map.  Useful
                  for platforms where slurp is slow or not available.)"
  [grammar-specification &{:as options}]
  {:pre [(contains? #{:abnf :ebnf nil} (get options :input-format))
         (contains? #{:enlive :hiccup nil} (get options :output-format))
         (let [ws-parser (get options :auto-whitespace)]
           (or (nil? ws-parser)
               (contains? standard-whitespace-parsers ws-parser)
               (and
                (map? ws-parser)
                (contains? ws-parser :grammar)
                (contains? ws-parser :start-production))))]}
  (let [input-format (get options :input-format *default-input-format*)
        output-format (get options :output-format *default-output-format*)
        start (get options :start nil)

        ;; Compiles grammar text using the reader chosen by :input-format.
        build-parser
        (fn [spec]
          (binding [cfg/*case-insensitive-literals* (:string-ci options :default)]
            (case input-format
              :abnf (abnf/build-parser spec output-format)
              :ebnf (cfg/build-parser spec output-format (:allow-namespaced-nts options false)))))

        ;; Wraps a parser built from text, applying an explicit :start if given.
        text-grammar->parser
        (fn [built]
          (map->Parser (if start
                         (assoc built :start-production start)
                         built)))

        built-parser
        (cond
          (string? grammar-specification)
          (text-grammar->parser
           #?(:clj
              (if (get options :no-slurp)
                ;; with :no-slurp the string is always the grammar itself
                (build-parser grammar-specification)
                ;; otherwise try it as a URI first and fall back to
                ;; treating it as grammar text when nothing is found there
                (try (let [spec (slurp grammar-specification)]
                       (build-parser spec))
                     (catch java.io.FileNotFoundException _
                       (build-parser grammar-specification))))
              :cljs
              (build-parser grammar-specification)))

          (map? grammar-specification)
          (map->Parser
           (cfg/build-parser-from-combinators grammar-specification
                                              output-format
                                              start))

          (vector? grammar-specification)
          (let [start-rule (or start (grammar-specification 0))
                combinator-map (apply hash-map grammar-specification)]
            (map->Parser
             (cfg/build-parser-from-combinators combinator-map
                                                output-format
                                                start-rule)))

          :else
          #?(:clj
             (let [spec (slurp grammar-specification)]
               (text-grammar->parser (build-parser spec)))
             :cljs
             (throw-illegal-argument-exception
              "Expected string, map, or vector as grammar specification, got "
              (pr-str grammar-specification))))

        ;; :auto-whitespace is a keyword naming a standard parser, a parser, or nil
        auto-whitespace (get options :auto-whitespace)
        whitespace-parser (if (keyword? auto-whitespace)
                            (get standard-whitespace-parsers auto-whitespace)
                            auto-whitespace)]
    (if-not whitespace-parser
      built-parser
      (let [{ws-grammar :grammar ws-start :start-production} whitespace-parser]
        (assoc built-parser :grammar
               (c/auto-whitespace (:grammar built-parser)
                                  (:start-production built-parser)
                                  ws-grammar
                                  ws-start))))))
277 | 
#?(:clj
   (defmacro defparser
     "Takes a string specification of a context-free grammar,
  or a string URI for a text file containing such a specification,
  or a map/vector of parser combinators, and sets a variable to a parser for that grammar.

  String specifications are processed at macro-time, not runtime, so this is an
  appealing alternative to (def _ (parser \"...\")) for ClojureScript users.

  Optional keyword arguments unique to `defparser`:
  - :instaparse.abnf/case-insensitive true"
     [name grammar & {:as opts}]
     ;; For each of the macro-time opts, ensure that they are the data
     ;; types we expect, not more complex quoted expressions.
     {:pre [(or (nil? (:input-format opts))
                (keyword? (:input-format opts)))
            (or (nil? (:output-format opts))
                (keyword? (:output-format opts)))
            (contains? #{true false nil} (:string-ci opts))
            (contains? #{true false nil} (:no-slurp opts))]}
     ;; A literal string grammar is parsed right now, during macro expansion;
     ;; any other grammar form is passed to `parser` untouched at runtime.
     (if (string? grammar)
       `(def ~name
          (map->Parser
           ~(binding [abnf/*case-insensitive* (:instaparse.abnf/case-insensitive opts false)]
              ;; Only these four options influence the macro-time parse.
              ;; NOTE(review): :allow-namespaced-nts is not among them, so it
              ;; appears not to affect the macro-time parse -- confirm.
              (let [macro-time-opts (select-keys opts [:input-format
                                                       :output-format
                                                       :string-ci
                                                       :no-slurp])
                    ;; :start is emitted explicitly below, so drop it here
                    ;; to avoid passing it twice.
                    runtime-opts (dissoc opts :start)
                    macro-time-parser (apply parser grammar (apply concat macro-time-opts))
                    pre-processed-grammar (:grammar macro-time-parser)

                    ;; Turn the grammar data structure into code that
                    ;; rebuilds an equivalent structure when evaluated.
                    grammar-producing-code
                    (->> pre-processed-grammar
                         (walk/postwalk
                           (fn [form]
                             (cond
                               ;; Lists cannot be evaluated verbatim
                               (seq? form)
                               (list* 'list form)

                               ;; Regexp terminals are handled differently in cljs
                               (= :regexp (:tag form))
                               `(merge (c/regexp ~(str (:regexp form)))
                                       ~(dissoc form :tag :regexp))

                               :else form))))

                    ;; An explicit :start wins over the start production
                    ;; of the macro-time parser.
                    start-production
                    (or (:start opts) (:start-production macro-time-parser))]
                ;; The emitted runtime call builds the parser from the
                ;; pre-processed combinator grammar rather than the string.
                `(parser ~grammar-producing-code
                         :start ~start-production
                         ~@(apply concat runtime-opts))))))
       `(def ~name (parser ~grammar ~@(apply concat opts))))))
332 |         
(defn failure?
  "Returns true when `result` is a failure object, or when the metadata
  attached to `result` is one; false otherwise."
  [result]
  (let [failure-object? (fn [x] (instance? gll/failure-type x))]
    (or (failure-object? result)
        (failure-object? (meta result)))))
339 | 
(defn get-failure
  "Returns the failure object for a failed parse result: `result` itself
  when it is a failure, otherwise its metadata when that is a failure,
  otherwise nil."
  [result]
  (if (instance? gll/failure-type result)
    result
    (let [m (meta result)]
      (when (instance? gll/failure-type m)
        m))))
350 | 
;; Ready-made whitespace parsers, keyed by the keyword accepted for the
;; :auto-whitespace option of `parser`.
(def ^:private standard-whitespace-parsers
  (zipmap [:standard :comma]
          (map parser ["whitespace = #'\\s+'"
                       "whitespace = #'[,\\s]+'"])))
354 | 
#?(:clj
   (defn enable-tracing!
     "Turns tracing on by setting the gll TRACE and PROFILE flags to true
  and reloading instaparse.gll so it is recompiled with them.
  This is called implicitly the first time you invoke a parser with
  `:trace true` so usually you will not need to call this directly."
     []
     (doseq [flag [#'gll/TRACE #'gll/PROFILE]]
       (alter-var-root flag (constantly true)))
     (require 'instaparse.gll :reload)))
364 | 
#?(:clj
   (defn disable-tracing!
     "Turns tracing off by setting the gll TRACE and PROFILE flags to false
  and reloading instaparse.gll so it is recompiled without them.
  Call this to restore regular performance characteristics, eliminating
  the small performance hit imposed by tracing."
     []
     (doseq [flag [#'gll/TRACE #'gll/PROFILE]]
       (alter-var-root flag (constantly false)))
     (require 'instaparse.gll :reload)))
374 |    
;; Public aliases: each defclone defines a var in this namespace bound to
;; the value of a var that lives in another instaparse namespace.
(defclone transform t/transform)

(defclone add-line-and-column-info-to-metadata lc/add-line-col-spans)

(defclone span viz/span)

;; The tree visualizer alias is only defined on the JVM.
#?(:clj (defclone visualize viz/tree-viz))
382 | 


--------------------------------------------------------------------------------
/src/instaparse/failure.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.failure
 2 |   "Facilities for printing and manipulating error messages."
 3 |   #?(:clj (:import java.io.BufferedReader java.io.StringReader))
 4 |   (:require [instaparse.print :as print]))
 5 | 
 6 | (defn index->line-column
 7 |   "Takes an index into text, and determines the line and column info"
 8 |   [index text]
 9 |   (loop [line 1, col 1, counter 0]
10 |     (cond
11 |       (= index counter) {:line line :column col}
12 |       (= \newline (get text counter)) (recur (inc line) 1 (inc counter))
13 |       :else (recur line (inc col) (inc counter)))))
14 | 
#?(:clj
   (defn get-line
     "Returns the nth line of text (1-based), or the empty string when
  that line cannot be fetched."
     [n text]
     (try
       (let [reader (BufferedReader. (StringReader. (str text)))
             lines (line-seq reader)]
         (nth lines (dec n)))
       ;; any failure (e.g. a line number out of range) yields ""
       (catch Exception _ "")))
   :cljs
   (defn get-line
     "Returns the nth line of text (1-based), or the empty string when
  there is no such line.  \\r\\n pairs are treated as a single newline."
     [n text]
     (loop [remaining (seq (clojure.string/replace text "\r\n" "\n"))
            lines-left n]
       (cond
         (empty? remaining) ""
         (= lines-left 1) (apply str (take-while #(not= \newline %) remaining))
         :else (recur (next remaining)
                      (if (= \newline (first remaining))
                        (dec lines-left)
                        lines-left))))))
31 | 
(defn marker
  "Creates string with caret at nth position, 1-based
   and accounts for horizontal tabs which might change
   the alignment of the '^' to the error location.

   Every non-whitespace character of `text` before the caret becomes a
   space; whitespace (such as tabs) is copied as-is.  Returns nil when
   `text` is nil or `n` is not an integer.

   When `n` points more than one position past the end of `text`, the
   missing positions are filled with spaces.  Previously this case threw
   an index-out-of-bounds exception from `subs` on the JVM."
  [text n]
  (when (and text (integer? n))
    (let [marker-text (clojure.string/replace text #"[^\s]" " ")]
      (if (<= n 1)
        "^"
        (let [lead (dec n)
              ;; never ask subs for more characters than the line has
              copied (min lead (count marker-text))
              padding (apply str (repeat (- lead copied) \space))]
          (str (subs marker-text 0 copied) padding \^))))))
42 |       
(defn augment-failure
  "Returns `failure` extended with :line and :column (computed from its
  :index within `text`) and :text (the content of that line)."
  [failure text]
  (let [{:keys [line] :as position} (index->line-column (:index failure) text)
        line-text (get-line line text)]
    (merge failure position {:text line-text})))
50 | 
(defn print-reason
  "Prints a single failure reason `r`.  Negative lookahead reasons
  (those with a :NOT entry) are printed as NOT followed by the entry,
  character ranges and regular expressions use their grammar notation,
  and anything else is printed with pr."
  [r]
  (if-let [negated (:NOT r)]
    (do (print "NOT ")
        (print negated))
    (cond
      (:char-range r)
      (print (print/char-range->str r))

      (instance? #?(:clj java.util.regex.Pattern
                    :cljs js/RegExp)
                 r)
      (print (print/regexp->str r))

      :else
      (pr r))))
66 | 
(defn pprint-failure
  "Prints the error message for an augmented failure object: the position,
  the offending line with a caret under the error column, and the distinct
  things that were expected there.  Reasons flagged :full are listed first
  and annotated as having to be followed by end-of-string."
  [{:keys [line column text reason]}]
  (println (str "Parse error at line " line ", column " column ":"))
  (println text)
  (println (marker text column))
  (let [expectations (fn [entries] (distinct (map :expecting entries)))
        full-reasons (expectations (filter :full reason))
        partial-reasons (expectations (remove :full reason))
        total (+ (count full-reasons) (count partial-reasons))]
    ;; no header at all when nothing was expected
    (when (pos? total)
      (println (if (= 1 total) "Expected:" "Expected one of:")))
    (doseq [r full-reasons]
      (print-reason r)
      (println " (followed by end-of-string)"))
    (doseq [r partial-reasons]
      (print-reason r)
      (println))))
87 | 


--------------------------------------------------------------------------------
/src/instaparse/line_col.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.line-col
  2 |   (:require [instaparse.transform]
  3 |             [instaparse.util :refer [throw-illegal-argument-exception]]))
  4 | 
  5 | ; Function to annotate parse-tree with line and column metadata.
  6 | 
; A position in the text: a character index together with the line and
; column assigned to that index.
(defrecord Cursor [^int index ^long line ^long column])
  8 | 
  9 | (defn- advance-cursor [^Cursor cursor ^String text new-index]
 10 |   (let [new-index (int new-index)]
 11 |     (assert (<= (.-index cursor) new-index))
 12 |     (if (= (.-index cursor) new-index) cursor
 13 |       (loop [index (.-index cursor), line (.-line cursor), column (.-column cursor)]
 14 |         (cond 
 15 |           (= index new-index) (Cursor. index line column)
 16 |           (= (.charAt text index) \newline) (recur (inc index) (inc line) 1)
 17 |           :else (recur (inc index) line (inc column)))))))
 18 |          
 19 | (defn- make-line-col-fn
 20 |   "Given a string `text`, returns a function that takes an index into the string,
 21 | and returns a cursor, including line and column information.  For efficiency,
 22 | inputs must be fed into the function in increasing order."
 23 |   [^String text start-line start-column]
 24 |   (let [cursor-state (atom (Cursor. 0 start-line start-column))]
 25 |     (fn line-col [i]
 26 |       (swap! cursor-state advance-cursor text i)
 27 |       @cursor-state)))                        
 28 | 
 29 | (defn- hiccup-add-line-col-spans
 30 |   [line-col-fn parse-tree]
 31 |   (let [m (meta parse-tree), 
 32 |         start-index (:instaparse.gll/start-index m), 
 33 |         end-index (:instaparse.gll/end-index m)]
 34 |     (if (and start-index end-index)
 35 |       (let [start-cursor (line-col-fn start-index),
 36 |             children (doall (map (partial hiccup-add-line-col-spans line-col-fn) (next parse-tree))),
 37 |             end-cursor (line-col-fn end-index)]
 38 |         (with-meta
 39 |           (into [(first parse-tree)] children)
 40 |           (merge (meta parse-tree) 
 41 |                  {:instaparse.gll/start-line (:line start-cursor)
 42 |                   :instaparse.gll/start-column (:column start-cursor)
 43 |                   :instaparse.gll/end-line (:line end-cursor)
 44 |                   :instaparse.gll/end-column (:column end-cursor)})))
 45 |       parse-tree)))
 46 | 
 47 | (defn- enlive-add-line-col-spans
 48 |   [line-col-fn parse-tree]
 49 |   (let [m (meta parse-tree), 
 50 |         start-index (:instaparse.gll/start-index m), 
 51 |         end-index (:instaparse.gll/end-index m)]
 52 |     (if (and start-index end-index)
 53 |       (let [start-cursor (line-col-fn start-index),
 54 |             children (doall (map (partial enlive-add-line-col-spans line-col-fn) (:content parse-tree))),
 55 |             end-cursor (line-col-fn end-index)]
 56 |         (with-meta
 57 |           (assoc parse-tree :content children)
 58 |           (merge (meta parse-tree) 
 59 |                  {:instaparse.gll/start-line (:line start-cursor)
 60 |                   :instaparse.gll/start-column (:column start-cursor)
 61 |                   :instaparse.gll/end-line (:line end-cursor)
 62 |                   :instaparse.gll/end-column (:column end-cursor)})))
 63 |       parse-tree)))
 64 |   
 65 | (defn add-line-col-spans
 66 |   "Given a string `text` and a `parse-tree` for text, return parse tree
 67 | with its metadata annotated with line and column info. The info can
 68 | then be found in the metadata map under the keywords:
 69 |  
 70 | :instaparse.gll/start-line, :instaparse.gll/start-column,
 71 | :instaparse.gll/end-line, :instaparse.gll/end-column
 72 | 
 73 | The start is inclusive, the end is exclusive. Lines and columns are 1-based."
 74 |   ([text parse-tree] (add-line-col-spans text 1 1 parse-tree))
 75 |   ([text start-line start-column parse-tree]
 76 |    (let [line-col-fn (make-line-col-fn text start-line start-column)]
 77 |      (cond
 78 |        (nil? parse-tree) nil
 79 | 
 80 |        (and (map? parse-tree) (:tag parse-tree))
 81 |        ; This is an enlive tree-seq
 82 |        (enlive-add-line-col-spans line-col-fn parse-tree)
 83 | 
 84 |        (and (vector? parse-tree) (keyword? (first parse-tree)))
 85 |        ; This is a hiccup tree-seq
 86 |        (hiccup-add-line-col-spans line-col-fn parse-tree)
 87 | 
 88 |        (and (sequential? parse-tree) (map? (first parse-tree)) (:tag (first parse-tree)))
 89 |        ; This is an enlive tree with hidden root tag
 90 |        (instaparse.transform/map-preserving-meta
 91 |          (partial enlive-add-line-col-spans line-col-fn) parse-tree)
 92 | 
 93 |        (and (sequential? parse-tree) (vector? (first parse-tree)) (keyword? (first (first parse-tree))))
 94 |        ; This is a hiccup tree with hidden root tag
 95 |        (instaparse.transform/map-preserving-meta
 96 |          (partial hiccup-add-line-col-spans line-col-fn) parse-tree)
 97 | 
 98 |        (instance? instaparse.gll.Failure parse-tree)
 99 |        ; pass failures through unchanged
100 |        parse-tree
101 | 
102 |        :else
103 |        (throw-illegal-argument-exception
104 |          "Invalid parse-tree, not recognized as either enlive or hiccup format.")))))
105 | 


--------------------------------------------------------------------------------
/src/instaparse/macros.clj:
--------------------------------------------------------------------------------
 1 | (ns instaparse.macros)
 2 | 
 3 | (defmacro defclone [here there]
 4 |   (if (contains? &env :locals)
 5 |     ;; cljs
 6 |     `(def ~here ~there)
 7 |     ;; clj
 8 |     `(do 
 9 |        (def ~here ~there)
10 |        (alter-meta! (var ~here) assoc
11 |                     :doc (:doc (meta (var ~there)))
12 |                     :arglists (:arglists (meta (var ~there)))
13 |                     :file (:file (meta (var ~there)))
14 |                     :line (:line (meta (var ~there)))
15 |                     :column (:column (meta (var ~there))))
16 |        (var ~here))))
17 | 
(defmacro set-global-var!
  "Sets the root value of the global var `v` to `value`.  Expands to set!
  for ClojureScript (detected by a :locals key in &env) and to
  alter-var-root for Clojure."
  [v value]
  (let [cljs? (contains? &env :locals)]
    (if-not cljs?
      `(alter-var-root (var ~v) (constantly ~value))
      `(set! ~v ~value))))
25 | 


--------------------------------------------------------------------------------
/src/instaparse/print.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.print
  2 |   "Facilities for taking parsers and grammars, and converting them to strings.
  3 |    Used for pretty-printing."
  4 |   (:require [clojure.string :as str]))
  5 | 
  6 | (declare combinators->str) ; mutual recursion
  7 | 
  8 | (defn paren-for-tags [tag-set hidden? parser]
  9 |   (if (and (not hidden?) (tag-set (parser :tag)))
 10 |     (str "(" (combinators->str parser false) ")")
 11 |     (combinators->str parser false)))
 12 | 
; paren-for-tags specialised to the compound combinators: alternation,
; ordered choice and concatenation.
(def paren-for-compound 
  (partial paren-for-tags #{:alt :ord :cat}))
 15 | 
 16 | (defn regexp-replace
 17 |   "Replaces whitespace characters with escape sequences for better printing" 
 18 |   [s]
 19 |   (case s
 20 |     "\n" "\\n"
 21 |     "\b" "\\b"
 22 |     "\f" "\\f"
 23 |     "\r" "\\r"
 24 |     "\t" "\\t"
 25 |     s)) 
 26 | 
 27 | (defn regexp->str [r]
 28 |   (str/replace 
 29 |     (str "#\""
 30 |          #?(:clj (str r)
 31 |             :cljs (subs (.-source r) 1))
 32 |          "\"")
 33 |     #"[\s]" regexp-replace))
 34 | 
 35 | #?(:clj
 36 |    (defn char-range->str [{:keys [lo hi]}]
 37 |      (if (= lo hi)
 38 |        (format "%%x%04x" lo)
 39 |        (format "%%x%04x-%04x" lo hi)))
 40 | 
 41 |    :cljs
 42 |    (do
 43 |      (defn number->hex-padded [n]
 44 |        (if (<= n 0xFFF)
 45 |          (.substr (str "0000" (.toString n 16)) -4)
 46 |          (.toString n 16)))
 47 | 
 48 |      (defn char-range->str [{:keys [lo hi]}]
 49 |        (if (= lo hi)
 50 |          (str "%x" (number->hex-padded lo))
 51 |          (str "%x" (number->hex-padded lo) "-" (number->hex-padded hi))))))
 52 | 
 53 | (defn combinators->str
 54 |   "Stringifies a parser built from combinators"
 55 |   ([p] (combinators->str p false))
 56 |   ([{:keys [parser parser1 parser2 parsers tag] :as p} hidden?]
 57 |    (if (and (not hidden?) (:hide p))
 58 |      (str \< (combinators->str p true) \>)
 59 |      (case tag
 60 |        :epsilon "\u03b5"
 61 |        :opt (str (paren-for-compound hidden? parser) "?")
 62 |        :plus (str (paren-for-compound hidden? parser) "+")
 63 |        :star (str (paren-for-compound hidden? parser) "*")
 64 |        :rep (if (not= (:min p) (:max p))
 65 |               (str (paren-for-compound hidden? parser) \{ 
 66 |                    (:min p) \, (:max p) \})
 67 |               (str (paren-for-compound hidden? parser) \{ 
 68 |                    (:min p)\}))
 69 |        :alt (str/join " | " (map (partial paren-for-tags #{:ord} hidden?) parsers))
 70 |        :ord (str (paren-for-tags #{:alt} hidden? parser1)
 71 |                  " / "
 72 |                  (paren-for-tags #{:alt} hidden? parser2))
 73 |        :cat (str/join " " (map (partial paren-for-tags #{:alt :ord} hidden?) parsers))
 74 |        :string (with-out-str (pr (:string p)))
 75 |        :string-ci (with-out-str (pr (:string p)))
 76 |        :char (char-range->str p)
 77 |        :regexp (regexp->str (:regexp p))
 78 |        :nt (subs (str (:keyword p)) 1)
 79 |        :look (str "&" (paren-for-compound hidden? parser))
 80 |        :neg (str "!" (paren-for-compound hidden? parser))))))
 81 | 
 82 | (defn non-terminal->str [non-terminal]
 83 |   (if-let  [ns (namespace non-terminal)]
 84 |     (str ns "/" (name non-terminal))
 85 |     (name non-terminal)))
 86 | 
 87 | (defn rule->str
 88 |   "Takes a non-terminal symbol and a parser built from combinators,
 89 |    and returns a string for the rule."
 90 |   [non-terminal parser]
 91 |   (if (= (-> parser :red :reduction-type) :raw)
 92 |     (str \< (name non-terminal) \> 
 93 |          " = " 
 94 |          (combinators->str parser))
 95 |     (str (non-terminal->str non-terminal)
 96 |          " = " 
 97 |          (combinators->str parser))))
 98 | 
 99 | (defn Parser->str
100 |   "Takes a Parser object, i.e., something with a grammar map and a start 
101 |    production keyword, and stringifies it." 
102 |   [{grammar :grammar start :start-production}]
103 |   (str/join \newline
104 |             (cons
105 |               ; Put starting production first
106 |               (rule->str start (grammar start))
107 |               ; Then the others
108 |               (for [[non-terminal parser] grammar
109 |                     :when (not= non-terminal start)]
110 |                 (rule->str non-terminal parser)))))
111 | 


--------------------------------------------------------------------------------
/src/instaparse/reduction.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.reduction
 2 |   (:require [instaparse.auto-flatten-seq :as afs]
 3 |             [instaparse.util :refer [throw-illegal-argument-exception]]))
 4 | 
 5 | ;; utilities
 6 | 
 7 | (defn singleton? [s]
 8 |   (and (seq s) (not (next s))))
 9 | 
(defn red
  "Reduction combinator: returns `parser` with `f` stored under :red.

  For expert use only.  The :red entry is used internally to control the
  tree tags that are displayed, so adding a different reduction changes
  that behavior."
  [parser f]
  (assoc parser :red f))
16 | 
17 | ;; Flattening and reductions
18 | 
;; Reduction descriptor of type :raw; unlike the hiccup and enlive
;; descriptors it carries no :key.
(def raw-non-terminal-reduction {:reduction-type :raw})
20 | 
(defn HiccupNonTerminalReduction
  "Returns the reduction descriptor for a hiccup node tagged with `key`."
  [key]
  (assoc {:reduction-type :hiccup} :key key))
23 | 
(defn EnliveNonTerminalReduction
  "Returns the reduction descriptor for an enlive node tagged with `key`."
  [key]
  (assoc {:reduction-type :enlive} :key key))
26 | 
;; Maps each supported output format to the function that builds the
;; reduction descriptor for a given non-terminal key.
(def ^:constant reduction-types 
  {:hiccup HiccupNonTerminalReduction
   :enlive EnliveNonTerminalReduction})
                    
(def ^:constant node-builders
  ; A map of functions for building a node that only has one item
  ; These functions are used in total-parse mode to build failure nodes
  {:enlive (fn [tag item] {:tag tag :content (list item)})
   :hiccup (fn [tag item] [tag item])})

;; Output format used by the one-argument arity of apply-standard-reductions.
(def standard-non-terminal-reduction :hiccup)
38 | 
(defn apply-reduction
  "Applies reduction `f` to `result`.  A :raw descriptor yields the
  flattened result, a :hiccup descriptor a vector headed by the
  descriptor's key, and an :enlive descriptor a {:tag :content} map whose
  :content is nil when empty.  Any other `f` is called as a function on
  `result`."
  [f result]
  (let [kind (:reduction-type f)]
    (cond
      (= kind :raw)
      (afs/conj-flat afs/EMPTY result)

      (= kind :hiccup)
      (let [tagged (afs/auto-flatten-seq [(:key f)])]
        (afs/convert-afs-to-vec (afs/conj-flat tagged result)))

      (= kind :enlive)
      (let [content (afs/conj-flat afs/EMPTY result)]
        {:tag (:key f), :content (when (pos? (count content)) content)})

      :else
      (f result))))
47 |     
(defn apply-standard-reductions
  "Returns `grammar` with a :red entry added to every rule that does not
  already have one, built for the rule's key by the reduction of the
  given `reduction-type` (default: standard-non-terminal-reduction).
  An unknown reduction type is reported via
  throw-illegal-argument-exception."
  ([grammar] (apply-standard-reductions standard-non-terminal-reduction grammar))
  ([reduction-type grammar]
    (let [reduction (reduction-types reduction-type)]
      (if-not reduction
        (throw-illegal-argument-exception
          "Invalid output format " reduction-type ". Use :enlive or :hiccup.")
        (into {}
              (map (fn [[k v]]
                     [k (if (:red v)
                          v
                          (assoc v :red (reduction k)))])
                   grammar))))))
57 | 


--------------------------------------------------------------------------------
/src/instaparse/repeat.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.repeat
  2 |   (:require [instaparse.gll :as gll
  3 |              #?@(:clj [:refer [profile]])]
  4 |             [instaparse.combinators-source :as c]
  5 |             [instaparse.auto-flatten-seq :as afs]
  6 |             [instaparse.viz :as viz]
  7 |             [instaparse.reduction :as red]
  8 |             [instaparse.failure :as fail])
  9 |   #?(:cljs
 10 |      (:require-macros [instaparse.gll :refer [profile]])))
 11 | 
 12 | (defn empty-result? [result]
 13 |   (or (and (vector? result) (= (count result) 1))
 14 |       (and (map? result) (contains? result :tag) (empty? (get result :content)))
 15 |       (empty? result)))       
 16 | 
;; Shared Failure value (both constructor arguments nil) returned by the
;; repeat-parse functions in this namespace when they cannot produce a parse.
(def ^:constant failure-signal (gll/->Failure nil nil))
 18 | 
 19 | (defn get-end 
 20 |   (#?(:clj ^long [parse]
 21 |       :cljs ^number [parse])
 22 |     (let [[start end] (viz/span parse)]
 23 |       (if end (long end) (count parse))))
 24 |   (#?(:clj ^long [parse ^long index]
 25 |       :cljs ^number [parse ^number index])
 26 |     (let [[start end] (viz/span parse)]
 27 |       (if end (long end) (+ index (count parse))))))
 28 | 
 29 | (defn parse-from-index [grammar initial-parser text segment index]
 30 |   (let [tramp (gll/make-tramp grammar text segment)]
 31 |     (gll/push-listener tramp [index initial-parser] (gll/TopListener tramp))
 32 |     (gll/run tramp)))
 33 | 
 34 | (defn select-parse
 35 |   "Returns either:
 36 |    [a-parse end-index a-list-of-valid-follow-up-parses]
 37 |    [a-parse end-index nil] (successfully reached end of text)
 38 |    nil (hit a dead-end with this strategy)"
 39 |   [grammar initial-parser text segment index parses]
 40 |   ;(clojure.pprint/pprint parses)
 41 |   (let [length (count text)]
 42 |     (loop [parses (seq parses)]
 43 |       (when parses
 44 |         (let [parse (first parses)
 45 |               [start end] (viz/span parse)
 46 |               end (if end end (+ index (count parse)))]
 47 |           (cond
 48 |             (= end length) [parse end nil]
 49 |             :else 
 50 |             (if-let [follow-ups (seq (parse-from-index grammar initial-parser text segment end))]
 51 |               [parse end follow-ups]
 52 |               (recur (next parses)))))))))
 53 |                                  
 54 | (defn repeat-parse-hiccup 
 55 |   ([grammar initial-parser root-tag text segment]
 56 |     (repeat-parse-hiccup grammar initial-parser root-tag text segment 0))
 57 |   ([grammar initial-parser root-tag text segment index]
 58 |     (let [length (count text)
 59 |           first-result (parse-from-index grammar initial-parser text segment index)]
 60 |       (loop [index (long index)
 61 |              parses (afs/auto-flatten-seq [root-tag])
 62 |              
 63 |              [parse end follow-ups :as selection]
 64 |              (select-parse grammar initial-parser text segment index first-result)]
 65 |         (cond
 66 |           (nil? selection) failure-signal
 67 |           (= index end) failure-signal
 68 |           (nil? follow-ups) (gll/safe-with-meta
 69 |                               (afs/convert-afs-to-vec 
 70 |                                 (afs/conj-flat parses parse))
 71 |                               {:optimize :memory
 72 |                                :instaparse.gll/start-index 0
 73 |                                :instaparse.gll/end-index length})
 74 |           :else (recur (long end)
 75 |                        (afs/conj-flat parses parse)
 76 |                        (select-parse grammar initial-parser text segment end follow-ups)))))))
 77 | 
 78 | (defn repeat-parse-enlive
 79 |   ([grammar initial-parser root-tag text segment]
 80 |     (repeat-parse-enlive grammar initial-parser root-tag text segment 0))
 81 |   ([grammar initial-parser root-tag text segment index]
 82 |     (let [length (count text)
 83 |           first-result (parse-from-index grammar initial-parser text segment index)]
 84 |       (loop [index (long index)
 85 |              parses afs/EMPTY
 86 |              
 87 |              [parse end follow-ups :as selection]
 88 |              (select-parse grammar initial-parser text segment index first-result)]
 89 |         (cond
 90 |           (nil? selection) failure-signal
 91 |           (= index end) failure-signal          
 92 |           (nil? follow-ups) (gll/safe-with-meta
 93 |                               {:tag root-tag 
 94 |                                :content (seq (afs/conj-flat parses parse))}
 95 |                               {:optimize :memory
 96 |                                :instaparse.gll/start-index 0
 97 |                                :instaparse.gll/end-index length})
 98 |           :else (recur (long end)
 99 |                        (afs/conj-flat parses parse)
100 |                        (select-parse grammar initial-parser text segment end follow-ups)))))))
101 | 
(defn repeat-parse-no-tag
  "Repeatedly applies `initial-parser` from `index` (default 0) until the
  end of `text`, collecting the parses into one flattened sequence with no
  root tag.  Returns failure-signal when no parse can be selected or a
  selected parse does not advance past the current index."
  ([grammar initial-parser text segment]
    (repeat-parse-no-tag grammar initial-parser text segment 0))
  ([grammar initial-parser text segment index]
    (let [text-length (count text)
          first-result (parse-from-index grammar initial-parser text segment index)]
      (loop [position (long index)
             collected afs/EMPTY
             selection (select-parse grammar initial-parser text segment position first-result)]
        (let [[parse end follow-ups] selection]
          (cond
            ;; dead end, or a parse that consumed nothing
            (or (nil? selection) (= position end))
            failure-signal

            ;; reached the end of the text
            (nil? follow-ups)
            (gll/safe-with-meta
              (afs/conj-flat collected parse)
              {:optimize :memory
               :instaparse.gll/start-index 0
               :instaparse.gll/end-index text-length})

            :else
            (recur (long end)
                   (afs/conj-flat collected parse)
                   (select-parse grammar initial-parser text segment end follow-ups))))))))
124 | 
(defn repeat-parse
  "Entry point for repeated parsing of `text` with `initial-parser`.
  Without a `root-tag` the untagged variant is used and `output-format`
  is ignored; with a `root-tag`, `output-format` must be :hiccup or
  :enlive and selects the matching variant."
  ([grammar initial-parser output-format text]
    (repeat-parse-no-tag grammar initial-parser text (gll/text->segment text)))
  ([grammar initial-parser output-format root-tag text]
    {:pre [(#{:hiccup :enlive} output-format)]}
    (case output-format
      :hiccup
      (repeat-parse-hiccup grammar initial-parser root-tag text (gll/text->segment text))
      :enlive
      (repeat-parse-enlive grammar initial-parser root-tag text (gll/text->segment text))
      ;; only reachable when the precondition is disabled
      nil)))
134 | 
135 | (defn repeat-parse-with-header
136 |   "Parses header-parser once from position 0, then feeds the remainder of text
137 |    to repeat-parse-no-tag using the parser wrapped by repeating-parser. Among
138 |    the header results, the one with the greatest get-end is kept.
139 |    output-format shapes the result: :enlive gives a {:tag :content} map,
140 |    :hiccup a vector led by root-tag, anything else an untagged flat sequence.
141 |    Returns failure-signal when the header yields nothing, the header parser
142 |    is hidden, or the repeated part is rejected."
143 |   ([grammar header-parser repeating-parser output-format root-tag text]
144 |     (let [segment (gll/text->segment text)
145 |           text-length (count text)
146 |           headers (parse-from-index grammar header-parser text segment 0)]
147 |       (if (and (seq headers) (not (:hide header-parser)))
148 |         (let [best-header (apply max-key get-end headers)
149 |               header-end (get-end best-header)
150 |               repeated (repeat-parse-no-tag grammar (:parser repeating-parser)
151 |                                             text segment header-end)
152 |               ;; a Failure, or an empty result under a :star parser, is rejected
153 |               rejected? (or (instance? instaparse.gll.Failure repeated)
154 |                             (and (= (:tag repeating-parser) :star)
155 |                                  (empty-result? repeated)))]
156 |           (if rejected?
157 |             failure-signal
158 |             (let [with-span (fn [tree]
159 |                               (gll/safe-with-meta
160 |                                 tree
161 |                                 {:optimize :memory
162 |                                  :instaparse.gll/start-index 0
163 |                                  :instaparse.gll/end-index text-length}))
164 |                   join (fn [seed] (afs/conj-flat (afs/conj-flat seed best-header) repeated))]
165 |               (case output-format
166 |                 :enlive (with-span {:tag root-tag :content (join afs/EMPTY)})
167 |                 :hiccup (with-span (afs/convert-afs-to-vec (join (afs/auto-flatten-seq [root-tag]))))
168 |                 (with-span (join afs/EMPTY))))))
169 |         failure-signal))))
170 |     
171 | (defn try-repeating-parse-strategy-with-header
172 |   "Handles a :cat start rule whose last parser is an unhidden :star/:plus over
173 |    an unhidden parser: the preceding parsers are joined into a header and the
174 |    work goes to repeat-parse-with-header. Other rules give failure-signal."
175 |   [grammar text start-production start-rule output-format]
176 |   (gll/profile (gll/clear!))
177 |   (let [pieces (:parsers start-rule)
178 |         tail-parser (last pieces)]
179 |     (if (and (= (:tag start-rule) :cat)
180 |              (#{:star :plus} (:tag tail-parser))
181 |              (not (or (:hide tail-parser) (:hide (:parser tail-parser)))))
182 |       (let [header-parser (apply c/cat (butlast pieces))
183 |             raw? (= (:red start-rule) red/raw-non-terminal-reduction)] ; raw reduction: pass nil as the output format
184 |         (repeat-parse-with-header grammar header-parser tail-parser (when-not raw? output-format) start-production text))
185 |       failure-signal)))
186 |   
187 | (defn try-repeating-parse-strategy
188 |   "Attempts the repetition-based strategy for the rule start-production names
189 |    in parser's grammar.
190 |    - A hidden start rule gives failure-signal.
191 |    - A :star rule goes to repeat-parse; a :plus rule does too, except that
192 |      an empty result becomes failure-signal.
193 |    - Every other rule goes to try-repeating-parse-strategy-with-header.
194 |    When the rule's reduction is the raw non-terminal reduction, repeat-parse
195 |    is called without a root tag; otherwise start-production is the root tag."
196 |   [parser text start-production]
197 |   (let [grammar (:grammar parser)
198 |         output-format (:output-format parser)
199 |         start-rule (get grammar start-production)
200 |         rule-tag (:tag start-rule)]
201 |     (profile (gll/clear!))
202 |     (if (= (:hide start-rule) true)
203 |       failure-signal
204 |       (let [raw? (= (:red start-rule) red/raw-non-terminal-reduction)
205 |             ;; thunk so the parse only runs for :star / :plus rules
206 |             run-repeat (fn []
207 |                          (if raw?
208 |                            (repeat-parse grammar (:parser start-rule) output-format text)
209 |                            (repeat-parse grammar (:parser start-rule) output-format
210 |                                          start-production text)))]
211 |         (cond
212 |           (= rule-tag :star) (run-repeat)
213 |           (= rule-tag :plus) (let [result (run-repeat)]
214 |                                (if (empty-result? result) failure-signal result))
215 |           :else (try-repeating-parse-strategy-with-header
216 |                   grammar text start-production start-rule output-format))))))
217 | 
218 | (defn used-memory-optimization? [tree] ; true when tree's metadata has :optimize set to :memory
219 |   (= :memory (:optimize (meta tree))))


--------------------------------------------------------------------------------
/src/instaparse/transform.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.transform
 2 |   "Functions to transform parse trees"
 3 |   (:require [instaparse.gll]
 4 |             [instaparse.util :refer [throw-illegal-argument-exception]]))
 5 | 
 6 | (defn map-preserving-meta [f l] ; lazily maps f over l and carries l's metadata onto the result
 7 |   (-> (map f l) (with-meta (meta l))))
 8 | 
 9 | (defn merge-meta
10 |   "Variant of the gll merge-meta that hands obj back untouched when obj is
11 | not something a metamap can be attached to."
12 |   [obj metamap]
13 |   (let [meta-capable? #?(:clj (instance? clojure.lang.IObj obj)
14 |                          :cljs (satisfies? IWithMeta obj))]
15 |     (cond-> obj
16 |       meta-capable? (instaparse.gll/merge-meta metamap))))
17 | 
18 | (defn- enlive-transform
19 |   "Walks an enlive-style node. A node whose tag has an entry in transform-map
20 |    is replaced by that function applied to its transformed children, with the
21 |    node's metadata passed to merge-meta; other tagged nodes keep their tag and
22 |    get transformed children; anything untagged is returned as is."
23 |   [transform-map parse-tree]
24 |   (let [tag (:tag parse-tree)
25 |         kids (fn []
26 |                (map #(enlive-transform transform-map %) (:content parse-tree)))]
27 |     (if-let [handler (transform-map tag)]
28 |       (merge-meta (apply handler (kids)) (meta parse-tree))
29 |       (if tag
30 |         (assoc parse-tree :content (kids))
31 |         parse-tree))))
32 | 
33 | (defn- hiccup-transform
34 |   "Walks a hiccup-style node (a non-empty sequential whose first item is the
35 |    tag). With an entry for the tag in transform-map, the node becomes that
36 |    function applied to its transformed children, metadata passed to
37 |    merge-meta. Without one, a vector of tag plus transformed children is
38 |    rebuilt with the original metadata. Anything else is returned untouched."
39 |   [transform-map parse-tree]
40 |   (if-not (and (sequential? parse-tree) (seq parse-tree))
41 |     parse-tree
42 |     (let [[tag & children] parse-tree
43 |           kids (map #(hiccup-transform transform-map %) children)]
44 |       (if-let [handler (transform-map tag)]
45 |         (merge-meta (apply handler kids) (meta parse-tree))
46 |         (with-meta (into [tag] kids) (meta parse-tree))))))
47 | 
48 | (defn transform
49 |   "Rewrites a parse tree (or a seq of parse trees) according to transform-map.
50 |    transform-map associates node tags with functions. Each function receives
51 |    the contents of a node carrying that tag and returns whatever should
52 |    stand in for the node, i.e.,
53 |    {:node-tag (fn [child1 child2 ...] node-replacement),
54 |     :another-node-tag (fn [child1 child2 ...] node-replacement)}
55 |    Strings and Failure values are returned unchanged; a tree of any other
56 |    unrecognized shape is reported via throw-illegal-argument-exception."
57 |   [transform-map parse-tree]
58 |   (letfn [(enlive? [t] (and (map? t) (:tag t)))
59 |           (hiccup? [t] (and (vector? t) (keyword? (first t))))
60 |           (failure? [t] (instance? instaparse.gll.Failure t))]
61 |     (cond
62 |       ;; a leaf of the tree: nothing to rewrite
63 |       (string? parse-tree)
64 |       parse-tree
65 |       ;; enlive tree-seq
66 |       (enlive? parse-tree)
67 |       (enlive-transform transform-map parse-tree)
68 |       ;; hiccup tree-seq
69 |       (hiccup? parse-tree)
70 |       (hiccup-transform transform-map parse-tree)
71 |       ;; either several parse results or a tree whose root tag is hidden:
72 |       ;; transform each element, keeping the sequence's metadata
73 |       (sequential? parse-tree)
74 |       (map-preserving-meta #(transform transform-map %) parse-tree)
75 |       ;; failures are handed back untouched
76 |       (failure? parse-tree)
77 |       parse-tree
78 |       ;; nothing matched
79 |       :else
80 |       (throw-illegal-argument-exception
81 |         "Invalid parse-tree, not recognized as either enlive or hiccup format."))))
82 | 


--------------------------------------------------------------------------------
/src/instaparse/util.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.util)
 2 | 
 3 | ;; Both functions below are variadic: callers pass the message as
 4 | ;; several separate strings, which are joined with `str`.
 5 | (defn throw-runtime-exception
 6 |   "Joins the message pieces with str and throws the result: wrapped in a
 7 |    RuntimeException on :clj, as the bare string on other platforms."
 8 |   [& message]
 9 |   (let [^String text (apply str message)]
10 |     (throw #?(:clj (RuntimeException. text) :default text))))
11 | 
12 | (defn throw-illegal-argument-exception
13 |   "Joins the message pieces with str and throws the result: wrapped in an
14 |    IllegalArgumentException on :clj, as the bare string on other platforms."
15 |   [& message]
16 |   (let [^String text (apply str message)]
17 |     (throw #?(:clj (IllegalArgumentException. text) :default text))))
18 | 
19 | #?(:cljs
20 |     (defn regexp-flags
21 |       "Builds a flag string for re: \"i\" if ignoreCase is set, then \"m\" for
22 |        multiline, then \"u\" for unicode; \"\" when none is set."
23 |       [re]
24 |       (str (when (.-ignoreCase re) "i") (when (.-multiline re) "m") (when (.-unicode re) "u"))))
25 | 


--------------------------------------------------------------------------------
/src/instaparse/viz.clj:
--------------------------------------------------------------------------------
  1 | (ns instaparse.viz
  2 |   (:import java.io.IOException))
  3 | 
  4 | (defn span
  5 |   "Given a subtree of a parse tree, returns the pair [start-index end-index]
  6 |    describing which stretch of the text that subtree covers. As with
  7 |    substrings, start-index is inclusive and end-index is exclusive.
  8 |    Gives nil when the subtree carries no span metadata."
  9 |   [tree]
 10 |   (let [{start :instaparse.gll/start-index
 11 |          end :instaparse.gll/end-index} (meta tree)]
 12 |     ;; both indices must be present (and truthy) for a span to be reported
 13 |     (if (and start end)
 14 |       [start end]
 15 |       nil)))
 16 | 
 17 | (def rhizome-newline
 18 |   ;; Prior to Rhizome 0.2.5, \ was not an escape character so \n needed extra escaping.
 19 |   ;; ns-resolve yields a Var, so its value is read with var-get before comparing with the
 20 |   ;; string (comparing the Var itself is never equal). nil when rhizome.dot is not loaded.
 21 |   (when-let [escape-chars (try (ns-resolve (find-ns 'rhizome.dot) 'escapable-characters)
 22 |                                (catch Exception e nil))]
 23 |     (if (= (var-get escape-chars) "|{}\"") "\\n" "\n")))
 24 | 
 25 | 
 26 | (defn- hiccup-tree-viz
 27 |   "Draws a hiccup parse tree with rhizome's tree->image, resolved at call time
 28 |    (https://github.com/ztellman/rhizome). Sequential nodes are labelled with their
 29 |    first element plus any attached span; other nodes with their printed form."
 30 |   [mytree options]
 31 |   (let [render (resolve 'rhizome.viz/tree->image)
 32 |         label (fn [node]
 33 |                 (if-not (sequential? node)
 34 |                   (pr-str node)
 35 |                   (if-let [sp (span node)] (str (first node) rhizome-newline sp) (str (first node)))))]
 36 |     (render sequential? rest mytree :node->descriptor (fn [node] {:label (label node)}) :options options)))
 37 | 
 38 | (defn- enlive-tree-viz
 39 |   "Draws an enlive parse tree with rhizome's tree->image, resolved at call
 40 |    time. A node is a branch when its :content is non-empty. Maps with a :tag
 41 |    are labelled with that tag plus any attached span; every other node is
 42 |    labelled with its printed form."
 43 |   [mytree options]
 44 |   (let [render (resolve 'rhizome.viz/tree->image)
 45 |         label (fn [node]
 46 |                 (if-not (and (map? node) (:tag node))
 47 |                   (pr-str node)
 48 |                   (if-let [sp (span node)] (str (:tag node) rhizome-newline sp) (str (:tag node)))))]
 49 |     (render (comp seq :content) :content mytree :node->descriptor (fn [node] {:label (label node)}) :options options)))
 50 | 
 51 | (defn tree-type
 52 |   ":enlive (map with :tag), :hiccup (vector led by a keyword), :nil (empty/nil),
 53 |    :rootless (non-empty seq), else :invalid -- including values empty? cannot seq."
 54 |   [tree]
 55 |   (cond (and (map? tree) (:tag tree)) :enlive
 56 |         (and (vector? tree) (keyword? (first tree))) :hiccup
 57 |         (try (empty? tree) (catch IllegalArgumentException _ false)) :nil
 58 |         :else (if (seq? tree) :rootless :invalid)))
 59 | 
 60 | (defn fake-root
 61 |   "Wraps the children of a rootless tree under a :hidden-root-tag node whose
 62 |    shape follows the first child: enlive map or hiccup vector. An empty/nil
 63 |    first child gives nil; any other kind of first child gives :invalid."
 64 |   [children]
 65 |   (let [kind (tree-type (first children))]
 66 |     (cond
 67 |       (= kind :enlive) {:tag :hidden-root-tag, :content children}
 68 |       (= kind :hiccup) (into [:hidden-root-tag] children)
 69 |       :else (if (= kind :nil) nil :invalid))))
 70 |     
 71 | (defn tree-viz
 72 |   "Creates a graphviz visualization of the parse tree.
 73 |    Optional keyword arguments:
 74 |    :output-file :buffered-image (return a java.awt.image.BufferedImage object)
 75 |    or
 76 |    :output-file output-file (will save the tree image to output-file);
 77 |    when :output-file is omitted the image is shown with rhizome's view-image.
 78 |    :options options (options passed along to rhizome)
 79 |    A rootless tree (a seq of nodes) is first wrapped with fake-root.
 80 |    Important: This function will only work if you have added rhizome
 81 |    to your dependencies, and installed graphviz on your system.
 82 |    See https://github.com/ztellman/rhizome for more information."
 83 |   [tree & {output-file :output-file options :options}]
 84 |   {:pre [(not= (tree-type tree) :invalid)]}
 85 |   (let [ttype (tree-type tree)]
 86 |     (if (= ttype :rootless)
 87 |       (tree-viz (fake-root tree) :output-file output-file :options options) ; retry with a synthetic root node
 88 |       (do
 89 |         (try
 90 |           (require 'rhizome.viz) ; loaded on demand; a load failure is reported as unsupported
 91 |           (catch Exception e
 92 |             (throw (UnsupportedOperationException.
 93 |                      "\n\nVisualization of parse trees is only supported if you have rhizome among your project dependencies and graphviz installed on your computer.\n
 94 |           Visit https://github.com/ztellman/rhizome to find out the version info to put in your project.clj file and for links to the graphviz installer."))))
 95 |         (let [image
 96 |               (try
 97 |                 (case (tree-type tree)
 98 |                   :enlive (enlive-tree-viz tree options)
 99 |                   (:hiccup :nil) (hiccup-tree-viz tree options))
100 |                 (catch IOException e ; an IOException while rendering is reported as a missing GraphViz install
101 |                   (throw (UnsupportedOperationException.
102 |                            "\n\nYou appear to have rhizome in your dependencies, but have not installed GraphViz on your system.
103 |   \nSee https://github.com/ztellman/rhizome for more information.\n"))))
104 |               save-image (resolve 'rhizome.viz/save-image)
105 |               view-image (resolve 'rhizome.viz/view-image)]
106 |           (cond
107 |             (= output-file :buffered-image) image
108 |             output-file (save-image image output-file)
109 |             :else (view-image image)))))))


--------------------------------------------------------------------------------
/src/instaparse/viz.cljs:
--------------------------------------------------------------------------------
 1 | (ns instaparse.viz)   
 2 | 
 3 | (defn span
 4 |   "Reports which part of the text a subtree of the parse tree covers, as a
 5 |    [start-index end-index] pair read from the subtree's metadata.
 6 |    Following the substring convention, start-index is inclusive and
 7 |    end-index is exclusive.
 8 |    Gives nil when span metadata is missing."
 9 |   [tree]
10 |   (let [info (meta tree)
11 |         bounds [(:instaparse.gll/start-index info) (:instaparse.gll/end-index info)]]
12 |     ;; report the pair only when both ends are truthy
13 |     (when (every? identity bounds)
14 |       bounds)))
15 | 
16 | 


--------------------------------------------------------------------------------
/test/data/abnf_uri.txt:
--------------------------------------------------------------------------------
 1 | URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 2 | 
 3 | hier-part     = "//" authority path-abempty
 4 |                  / path-absolute
 5 |                  / path-rootless
 6 |                  / path-empty
 7 | 
 8 | URI-reference = URI / relative-ref
 9 | 
10 | absolute-URI  = scheme ":" hier-part [ "?" query ]
11 | 
12 | relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
13 | 
14 | relative-part = "//" authority path-abempty
15 |                  / path-absolute
16 |                  / path-noscheme
17 |                  / path-empty
18 | 
19 | scheme        = ALPHA *( ALPHA / DIGIT / "+" / "-" / ".")
20 | 
21 | authority     = [ userinfo "@" ] host [ ":" port ]
22 | userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
23 | host          = IP-literal / IPv4address / reg-name
24 | port          = *DIGIT
25 | 
26 | IP-literal    = "[" ( IPv6address / IPvFuture  ) "]"
27 | 
28 | IPvFuture     = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
29 | 
30 | IPv6address   =                            6( h16 ":" ) ls32
31 |                  /                       "::" 5( h16 ":" ) ls32
32 |                  / [               h16 ] "::" 4( h16 ":" ) ls32
33 |                  / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
34 |                  / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
35 |                  / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
36 |                  / [ *4( h16 ":" ) h16 ] "::"              ls32
37 |                  / [ *5( h16 ":" ) h16 ] "::"              h16
38 |                  / [ *6( h16 ":" ) h16 ] "::"
39 | 
40 | h16           = 1*4HEXDIG
41 | ls32          = ( h16 ":" h16 ) / IPv4address
42 | IPv4address   = dec-octet "." dec-octet "." dec-octet "." dec-octet
43 | 
44 | dec-octet     = DIGIT                 ; 0-9
45 |               / %x31-39 DIGIT         ; 10-99
46 |               / "1" 2DIGIT            ; 100-199
47 |               / "2" %x30-34 DIGIT     ; 200-249
48 |               / "25" %x30-35          ; 250-255
49 | 
50 | reg-name      = *( unreserved / pct-encoded / sub-delims )
51 | 
52 | path          = path-abempty    ; begins with "/" or is empty
53 |                  / path-absolute   ; begins with "/" but not "//"
54 |                  / path-noscheme   ; begins with a non-colon segment
55 |                  / path-rootless   ; begins with a segment
56 |                  / path-empty      ; zero characters
57 | 
58 | path-abempty  = *( "/" segment )
59 | path-absolute = "/" [ segment-nz *( "/" segment ) ]
60 | path-noscheme = segment-nz-nc *( "/" segment )
61 | path-rootless = segment-nz *( "/" segment )
62 | path-empty    = 0pchar
63 | 
64 | segment       = *pchar
65 | segment-nz    = 1*pchar
66 | segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
67 |               ; non-zero-length segment without any colon ":"
68 | 
69 | pchar         = unreserved / pct-encoded / sub-delims / ":" / "@"
70 | 
71 | query         = *( pchar / "/" / "?" )
72 | 
73 | fragment      = *( pchar / "/" / "?" )
74 | 
75 | pct-encoded   = "%" HEXDIG HEXDIG
76 | 
77 | unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
78 | reserved      = gen-delims / sub-delims
79 | gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
80 | sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
81 |                  / "*" / "+" / "," / ";" / "="              ; comment


--------------------------------------------------------------------------------
/test/data/defparser_grammar.txt:
--------------------------------------------------------------------------------
1 | S = #'a' | 'b'
2 | 


--------------------------------------------------------------------------------
/test/data/phone_uri.txt:
--------------------------------------------------------------------------------
 1 |    telephone-uri        = "tel:" telephone-subscriber
 2 |    telephone-subscriber = global-number / local-number
 3 |    global-number        = global-number-digits *par
 4 |    local-number         = local-number-digits *par context *par
 5 |    par                  = parameter / extension / isdn-subaddress
 6 |    isdn-subaddress      = ";isub=" 1*uric
 7 |    extension            = ";ext=" 1*phonedigit
 8 |    context              = ";phone-context=" descriptor
 9 |    descriptor           = domainname / global-number-digits
10 |    global-number-digits = "+" *phonedigit DIGIT *phonedigit
11 |    local-number-digits  =
12 |       *phonedigit-hex (HEXDIG / "*" / "#") *phonedigit-hex
13 |    domainname           = *( domainlabel "." ) toplabel [ "." ]
14 |    domainlabel          = alphanum
15 |                           / alphanum *( alphanum / "-" ) alphanum
16 |    toplabel             = ALPHA / ALPHA *( alphanum / "-" ) alphanum
17 |    parameter            = ";" pname ["=" pvalue ]
18 |    pname                = 1*( alphanum / "-" )
19 |    pvalue               = 1*paramchar
20 |    paramchar            = param-unreserved / unreserved / pct-encoded
21 |    unreserved           = alphanum / mark
22 |    mark                 = "-" / "_" / "." / "!" / "~" / "*" /
23 |                           "'" / "(" / ")"
24 |    pct-encoded          = "%" HEXDIG HEXDIG
25 |    param-unreserved     = "[" / "]" / "/" / ":" / "&" / "+" / "$"
26 |    phonedigit           = DIGIT / [ visual-separator ]
27 |    phonedigit-hex       = HEXDIG / "*" / "#" / [ visual-separator ]
28 |    visual-separator     = "-" / "." / "(" / ")"
29 |    alphanum             = ALPHA / DIGIT
30 |    reserved             = ";" / "/" / "?" / ":" / "@" / "&" /
31 |                           "=" / "+" / "$" / ","
32 |    uric                 = reserved / unreserved / pct-encoded


--------------------------------------------------------------------------------
/test/instaparse/abnf_test.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.abnf-test
  2 |   (:require
  3 |     #?(:clj  [instaparse.core :refer [parser parses defparser]]
  4 |        :cljs [instaparse.core :refer [parser parses] :refer-macros [defparser]])
  5 |     [instaparse.core-test :refer [parsers-similar?]]
  6 |     [instaparse.combinators :refer [ebnf abnf]]
  7 |     #?(:clj [clojure.test :refer [deftest are is]]
  8 |        :cljs [cljs.test])
  9 |     #?(:clj  [clojure.java.io :as io]))
 10 |   #?(:cljs (:require-macros
 11 |              [cljs.test :refer [is are deftest]])))
 12 | 
 13 | (defparser uri-parser ; parser for the URI grammar kept in the ABNF file named below
 14 |   "test/data/abnf_uri.txt"
 15 |   :input-format :abnf
 16 |   :instaparse.abnf/case-insensitive true) ; NOTE(review): presumably lets ABNF string literals match in any case -- confirm
 17 | 
 18 | (defparser phone-uri-parser ; parser for the tel: URI grammar kept in the ABNF file named below
 19 |   "test/data/phone_uri.txt"
 20 |   :input-format :abnf
 21 |   :instaparse.abnf/case-insensitive true) ; NOTE(review): presumably lets ABNF string literals match in any case -- confirm
 22 | 
 23 | #?(:clj ; Clojure only: uses slurp and clojure.java.io, which the ns requires under :clj
 24 |    (deftest slurping-test
 25 |      (is (parsers-similar?
 26 |            uri-parser
 27 |            (binding [instaparse.abnf/*case-insensitive* true]
 28 |              (parser ; 1. grammar named by a file path string
 29 |               "test/data/abnf_uri.txt"
 30 |               :input-format :abnf
 31 |               :instaparse.abnf/case-insensitive true))
 32 |            (binding [instaparse.abnf/*case-insensitive* true]
 33 |              (parser ; 2. grammar given as a resource looked up with io/resource
 34 |               (io/resource "data/abnf_uri.txt")
 35 |               :input-format :abnf
 36 |               :instaparse.abnf/case-insensitive true))
 37 |            (binding [instaparse.abnf/*case-insensitive* true]
 38 |              (parser ; 3. grammar text read up front with slurp
 39 |               (slurp "test/data/abnf_uri.txt")
 40 |               :input-format :abnf
 41 |               :instaparse.abnf/case-insensitive true)))
 42 |          "Verify that defparser, auto-slurp from string filename,
 43 |          auto-slurp from resource (URL), and manual slurp all return
 44 |          equivalent parsers.")))
 45 | 
 46 | (deftest abnf-uri ; expected hiccup trees for several URI shapes: http, ftp, mailto, tel, telnet, urn, ldap
 47 |   (are [x y] (= x y)
 48 |     (uri-parser "http://www.google.com")
 49 |     [:URI [:SCHEME [:ALPHA "h"] [:ALPHA "t"] [:ALPHA "t"] [:ALPHA "p"]] ":" [:HIER-PART "//" [:AUTHORITY [:HOST [:REG-NAME [:UNRESERVED [:ALPHA "w"]] [:UNRESERVED [:ALPHA "w"]] [:UNRESERVED [:ALPHA "w"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "g"]] [:UNRESERVED [:ALPHA "o"]] [:UNRESERVED [:ALPHA "o"]] [:UNRESERVED [:ALPHA "g"]] [:UNRESERVED [:ALPHA "l"]] [:UNRESERVED [:ALPHA "e"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "c"]] [:UNRESERVED [:ALPHA "o"]] [:UNRESERVED [:ALPHA "m"]]]]] [:PATH-ABEMPTY]]]
 50 |     ;; ftp: host followed by a two-segment path
 51 |     (uri-parser "ftp://ftp.is.co.za/rfc/rfc1808.txt")
 52 |     [:URI [:SCHEME [:ALPHA "f"] [:ALPHA "t"] [:ALPHA "p"]] ":" [:HIER-PART "//" [:AUTHORITY [:HOST [:REG-NAME [:UNRESERVED [:ALPHA "f"]] [:UNRESERVED [:ALPHA "t"]] [:UNRESERVED [:ALPHA "p"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "i"]] [:UNRESERVED [:ALPHA "s"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "c"]] [:UNRESERVED [:ALPHA "o"]] [:UNRESERVED "."] [:UNRESERVED [:ALPHA "z"]] [:UNRESERVED [:ALPHA "a"]]]]] [:PATH-ABEMPTY "/" [:SEGMENT [:PCHAR [:UNRESERVED [:ALPHA "r"]]] [:PCHAR [:UNRESERVED [:ALPHA "f"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]]] "/" [:SEGMENT [:PCHAR [:UNRESERVED [:ALPHA "r"]]] [:PCHAR [:UNRESERVED [:ALPHA "f"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED [:DIGIT "8"]]] [:PCHAR [:UNRESERVED [:DIGIT "0"]]] [:PCHAR [:UNRESERVED [:DIGIT "8"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]] [:PCHAR [:UNRESERVED [:ALPHA "x"]]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]]]]]]
 53 |     ;; mailto: no authority; rootless path containing "@"
 54 |     (uri-parser "mailto:John.Doe@example.com")
 55 |     [:URI [:SCHEME [:ALPHA "m"] [:ALPHA "a"] [:ALPHA "i"] [:ALPHA "l"] [:ALPHA "t"] [:ALPHA "o"]] ":" [:HIER-PART [:PATH-ROOTLESS [:SEGMENT-NZ [:PCHAR [:UNRESERVED [:ALPHA "J"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "h"]]] [:PCHAR [:UNRESERVED [:ALPHA "n"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:ALPHA "D"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR "@"] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED [:ALPHA "x"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "m"]]] [:PCHAR [:UNRESERVED [:ALPHA "p"]]] [:PCHAR [:UNRESERVED [:ALPHA "l"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "m"]]]]]]]
 56 |     ;; tel: rootless path that starts with a sub-delim
 57 |     (uri-parser "tel:+1-816-555-1212")
 58 |     [:URI [:SCHEME [:ALPHA "t"] [:ALPHA "e"] [:ALPHA "l"]] ":" [:HIER-PART [:PATH-ROOTLESS [:SEGMENT-NZ [:PCHAR [:SUB-DELIMS "+"]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED "-"]] [:PCHAR [:UNRESERVED [:DIGIT "8"]]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED [:DIGIT "6"]]] [:PCHAR [:UNRESERVED "-"]] [:PCHAR [:UNRESERVED [:DIGIT "5"]]] [:PCHAR [:UNRESERVED [:DIGIT "5"]]] [:PCHAR [:UNRESERVED [:DIGIT "5"]]] [:PCHAR [:UNRESERVED "-"]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED [:DIGIT "2"]]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED [:DIGIT "2"]]]]]]]
 59 |     ;; telnet: dotted host parsed as a reg-name, a port, and a trailing empty segment
 60 |     (uri-parser "telnet://192.0.2.16:80/")
 61 |     [:URI [:SCHEME [:ALPHA "t"] [:ALPHA "e"] [:ALPHA "l"] [:ALPHA "n"] [:ALPHA "e"] [:ALPHA "t"]] ":" [:HIER-PART "//" [:AUTHORITY [:HOST [:REG-NAME [:UNRESERVED [:DIGIT "1"]] [:UNRESERVED [:DIGIT "9"]] [:UNRESERVED [:DIGIT "2"]] [:UNRESERVED "."] [:UNRESERVED [:DIGIT "0"]] [:UNRESERVED "."] [:UNRESERVED [:DIGIT "2"]] [:UNRESERVED "."] [:UNRESERVED [:DIGIT "1"]] [:UNRESERVED [:DIGIT "6"]]]] ":" [:PORT [:DIGIT "8"] [:DIGIT "0"]]] [:PATH-ABEMPTY "/" [:SEGMENT]]]]
 62 |     ;; urn: colon-separated rootless path
 63 |     (uri-parser "urn:oasis:names:specification:docbook:dtd:xml:4.1.2")
 64 |     [:URI [:SCHEME [:ALPHA "u"] [:ALPHA "r"] [:ALPHA "n"]] ":" [:HIER-PART [:PATH-ROOTLESS [:SEGMENT-NZ [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR [:UNRESERVED [:ALPHA "i"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "n"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "m"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR [:UNRESERVED [:ALPHA "p"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "i"]]] [:PCHAR [:UNRESERVED [:ALPHA "f"]]] [:PCHAR [:UNRESERVED [:ALPHA "i"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]] [:PCHAR [:UNRESERVED [:ALPHA "i"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "n"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "d"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "b"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "k"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "d"]]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]] [:PCHAR [:UNRESERVED [:ALPHA "d"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:ALPHA "x"]]] [:PCHAR [:UNRESERVED [:ALPHA "m"]]] [:PCHAR [:UNRESERVED [:ALPHA "l"]]] [:PCHAR ":"] [:PCHAR [:UNRESERVED [:DIGIT "4"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:DIGIT "1"]]] [:PCHAR [:UNRESERVED "."]] [:PCHAR [:UNRESERVED [:DIGIT "2"]]]]]]]
 65 |     ;; ldap: bracketed IPv6 host, a path, and a query
 66 |     (uri-parser "ldap://[2001:db8::7]/c=GB?objectClass?one")
 67 |     [:URI [:SCHEME [:ALPHA "l"] [:ALPHA "d"] [:ALPHA "a"] [:ALPHA "p"]] ":" [:HIER-PART "//" [:AUTHORITY [:HOST [:IP-LITERAL "[" [:IPV6ADDRESS [:H16 [:HEXDIG "2"] [:HEXDIG "0"] [:HEXDIG "0"] [:HEXDIG "1"]] ":" [:H16 [:HEXDIG "d"] [:HEXDIG "b"] [:HEXDIG "8"]] "::" [:H16 [:HEXDIG "7"]]] "]"]]] [:PATH-ABEMPTY "/" [:SEGMENT [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:SUB-DELIMS "="]] [:PCHAR [:UNRESERVED [:ALPHA "G"]]] [:PCHAR [:UNRESERVED [:ALPHA "B"]]]]]] "?" [:QUERY [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "b"]]] [:PCHAR [:UNRESERVED [:ALPHA "j"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]] [:PCHAR [:UNRESERVED [:ALPHA "c"]]] [:PCHAR [:UNRESERVED [:ALPHA "t"]]] [:PCHAR [:UNRESERVED [:ALPHA "C"]]] [:PCHAR [:UNRESERVED [:ALPHA "l"]]] [:PCHAR [:UNRESERVED [:ALPHA "a"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] [:PCHAR [:UNRESERVED [:ALPHA "s"]]] "?" [:PCHAR [:UNRESERVED [:ALPHA "o"]]] [:PCHAR [:UNRESERVED [:ALPHA "n"]]] [:PCHAR [:UNRESERVED [:ALPHA "e"]]]]]))
 68 | 
 69 | (deftest phone-uri
 70 |   ;; a global number: digits and visual separators after the leading "+"
 71 |   (is (= (phone-uri-parser "tel:+1-201-555-0123")
 72 |          [:TELEPHONE-URI
 73 |           "tel:"
 74 |           [:TELEPHONE-SUBSCRIBER
 75 |            [:GLOBAL-NUMBER
 76 |             [:GLOBAL-NUMBER-DIGITS
 77 |              "+"
 78 |              [:DIGIT "1"]
 79 |              [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]]
 80 |              [:PHONEDIGIT [:DIGIT "2"]]
 81 |              [:PHONEDIGIT [:DIGIT "0"]]
 82 |              [:PHONEDIGIT [:DIGIT "1"]]
 83 |              [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]]
 84 |              [:PHONEDIGIT [:DIGIT "5"]]
 85 |              [:PHONEDIGIT [:DIGIT "5"]]
 86 |              [:PHONEDIGIT [:DIGIT "5"]]
 87 |              [:PHONEDIGIT [:VISUAL-SEPARATOR "-"]]
 88 |              [:PHONEDIGIT [:DIGIT "0"]]
 89 |              [:PHONEDIGIT [:DIGIT "1"]]
 90 |              [:PHONEDIGIT [:DIGIT "2"]]
 91 |              [:PHONEDIGIT [:DIGIT "3"]]]]]])))
 92 | 
 93 | (def abnf-german
 94 |   "ABNF grammar mixing regular-expression terminals with hidden, optional whitespace"
 95 |   (parser
 96 |     "
 97 | ; a parser for the German programming language
 98 | ; http://esolangs.org/wiki/German
 99 | 
100 | S = <*1space> (A / B) *(<space> (A / B)) <*1space>
101 | A = #'BEER'
102 | B = #'SCHNITZEL'
103 | space = #'\\s+'
104 | " :input-format :abnf))
105 | 
106 | (deftest german
107 |   ;; whitespace around and between the words (a line break included) is absent from the tree
108 |   (is (= (abnf-german " BEER SCHNITZEL BEER BEER SCHNITZEL SCHNITZEL
109 |                      BEER BEER BEER ")
110 |          [:S
111 |           [:A "BEER"]
112 |           [:B "SCHNITZEL"]
113 |           [:A "BEER"]
114 |           [:A "BEER"]
115 |           [:B "SCHNITZEL"]
116 |           [:B "SCHNITZEL"]
117 |           [:A "BEER"]
118 |           [:A "BEER"]
119 |           [:A "BEER"]])))
120 | 
121 | (def abnf-abc
122 |   "The \"equal amount of A's, B's, and C's\" grammar written in ABNF;
123 | it exercises the lookahead operator (&)."
124 |   (parser
125 |     "S = &(A 'c') 1*'a' B
126 |      A = 'a' [A] 'b'
127 |      <B> = 'b' [B] 'c'"
128 |     :input-format :abnf))
129 | 
130 | (deftest abc
131 |   ;; first a balanced input; then an unbalanced one parsed with :total true, which embeds failure nodes
132 |   (is (= (abnf-abc "aaaabbbbcccc")
133 |          [:S "a" "a" "a" "a" "b" "b" "b" "b" "c" "c" "c" "c"]))
134 |   (is (= (abnf-abc "aaabbbc" :total true)
135 |          [:S "a" "a" "a" "b" "b" "b" "c" [:instaparse/failure ""] [:instaparse/failure ""]])))
136 | 
137 | (def reps
138 |   "Grammar covering the ABNF repetition forms: *x, n*x, *n x, n x, n*m x, and a repeated group"
139 |   (parser
140 |     "S = A B C D E FG
141 |      A = *'a'
142 |      B = 2*'b'
143 |      C = *2'c'
144 |      D = 2'd'
145 |      E = 2*4'e'
146 |      FG = 2('f' 'g')"
147 |     :input-format :abnf))
148 | 
149 | (deftest rep-test
150 |   ;; Every input here respects all repetition bounds of `reps`, so none may yield a Failure.
151 |   (doseq [input ["aabbccddeefgfg"
152 |                  "bbbbbbddeeeefgfg" "bbcddeefgfg"]]
153 |     (is (not (instance? instaparse.gll.Failure (reps input))))))
154 | 
155 | (deftest rep-test-errors
156 |   ;; Every input here is empty or breaks one repetition bound of `reps`, so each must yield a Failure.
157 |   (doseq [input ["" "bccddeefgfg"
158 |                  "aaaabbbbcccddeefgfg"
159 |                  "aabbccddeefg"
160 |                  "aabbccddeeffgg"]]
161 |     (is (instance? instaparse.gll.Failure (reps input)))))
162 | 
163 | (def regex-chars
164 |   "Parser for the ABNF decimal range %d42-91. Its boundary chars are \"*\" (42) and \"[\" (91), which normally aren't allowed unescaped in a regex."
165 |   (parser
166 |     "S = %d42-91"
167 |     :input-format :abnf))
168 | 
169 | (deftest regex-char-test
170 |   ;; For code points 1..100: exactly those within [42, 91] must parse; all others must fail.
171 |   (doseq [code (range 1 101)
172 |           :let [input (str (char code))]]
173 |     (is (= (<= 42 code 91)
174 |            (not (instance? instaparse.gll.Failure (regex-chars input)))))))
175 | 
176 | (deftest unicode-test  ; parsing a code point above U+FFFF, spelled below as a surrogate pair
177 |   (let [poop "\uD83D\uDCA9"]  ; U+1F4A9 PILE OF POO
178 |     (let [parser1 (parser "S = %x1F4A9"  ; a single hex code point
179 |                           :input-format :abnf)]
180 |       (are [x y] (= x y)
181 |            (parses parser1 poop) [[:S poop]])
182 |       (are [x] (instance? instaparse.gll.Failure x)
183 |            (parser1 (str poop poop))
184 |            (parser1 (str (first poop)))
185 |            ;; shouldn't work on the surrogate characters individually
186 |            (parser1 (str (second poop)))))
187 |     (let [parser2 (parser "S = %x1F4A8-1F4A9"  ; a hex range whose upper bound is the test character
188 |                           :input-format :abnf)]
189 |       (are [x y] (= x y)
190 |            (parses parser2 poop) [[:S poop]])
191 |       (are [x] (instance? instaparse.gll.Failure x)
192 |            (parser2 (str poop poop))
193 |            (parser2 (str (first poop)))
194 |            (parser2 (str (second poop)))))
195 |     (let [parser3 (parser "S = %x1F4A9.1F4A9.1F4A9"  ; three code points joined with dots; all three must be present
196 |                           :input-format :abnf)]
197 |       (are [x y] (= x y)
198 |            (parses parser3 (str poop poop poop)) [[:S poop poop poop]])
199 |       (are [x] (instance? instaparse.gll.Failure x)
200 |            (parser3 (str poop))))
201 |     ;; parser4 is built without :input-format and embeds the character directly in a
202 |     ;; string literal of the grammar (ABNF doesn't allow that though)
203 |     (let [parser4 (parser (str "S = '" poop "'*"))]
204 |       (are [x y] (= x y)
205 |            (parses parser4 (str poop poop poop)) [[:S poop poop poop]])
206 |       (are [x] (instance? instaparse.gll.Failure x)
207 |            (parser4 (str (first poop)))
208 |            (parser4 (str (second poop)))
209 |            (parser4 (str poop poop (first poop)))))))
210 | 
211 | (deftest abnf-combinator-test
212 |   ;; Assembles a grammar map from `abnf` fragments (an expression, a full rule, an
213 |   ;; expression) and checks both alternatives; "aAaa" shows the 'a' match ignores case.
214 |   (let [grammar (merge {:S (abnf "A / B")}
215 |                        (abnf "<A> = 1*'a'")
216 |                        {:B (abnf "'='")})
217 |         p       (parser grammar :start :S)]
218 |     (is (= [:S "a" "a" "a" "a"]
219 |            (p "aAaa")))
220 |     (is (= [:S [:B "="]]
221 |            (p "=")))))
222 | 
223 | (defn output-matches?
224 |   "True when `expected` is :fail and `actual` is a parse Failure, or when `actual` equals `expected`."
225 |   [expected actual]
226 |   (cond (= :fail expected) (instance? instaparse.gll.Failure actual)
227 |         :else              (= expected actual)))
228 | 
229 | (deftest string-ci-test  ; rows: parser, input, expected tree (or :fail when the parse must fail)
230 |   (are [p input expected] (output-matches? expected (p input))  ; first group: EBNF grammar strings, case-sensitive unless :string-ci true
231 |     (parser "S = 'Hi'" :input-format :ebnf) "Hi" [:S "Hi"]
232 |     (parser "S = 'Hi'" :input-format :ebnf) "hi" :fail
233 |     (parser "S = 'Hi'" :input-format :ebnf :string-ci false) "Hi" [:S "Hi"]
234 |     (parser "S = 'Hi'" :input-format :ebnf :string-ci false) "hi" :fail
235 |     (parser "S = 'Hi'" :input-format :ebnf :string-ci true) "Hi" [:S "Hi"]
236 |     (parser "S = 'Hi'" :input-format :ebnf :string-ci true) "hi" [:S "Hi"]
237 |     ;; `ebnf` combinator fragments: case-sensitive unless :string-ci true
238 |     (parser [:S (ebnf "'Hi'")]) "Hi" [:S "Hi"]
239 |     (parser [:S (ebnf "'Hi'")]) "hi" :fail
240 |     (parser [:S (ebnf "'Hi'" :string-ci true)]) "Hi" [:S "Hi"]
241 |     (parser [:S (ebnf "'Hi'" :string-ci true)]) "hi" [:S "Hi"]
242 |     ;; ABNF grammar strings: case-insensitive unless :string-ci false
243 |     (parser "S = 'Hi'" :input-format :abnf) "Hi" [:S "Hi"]
244 |     (parser "S = 'Hi'" :input-format :abnf) "hi" [:S "Hi"]
245 |     (parser "S = 'Hi'" :input-format :abnf :string-ci false) "Hi" [:S "Hi"]
246 |     (parser "S = 'Hi'" :input-format :abnf :string-ci false) "hi" :fail
247 |     (parser "S = 'Hi'" :input-format :abnf :string-ci true) "Hi" [:S "Hi"]
248 |     (parser "S = 'Hi'" :input-format :abnf :string-ci true) "hi" [:S "Hi"]
249 |     ;; `abnf` combinator fragments: case-insensitive unless :string-ci false
250 |     (parser [:S (abnf "'Hi'")]) "Hi" [:S "Hi"]
251 |     (parser [:S (abnf "'Hi'")]) "hi" [:S "Hi"]
252 |     (parser [:S (abnf "'Hi'" :string-ci false)]) "Hi" [:S "Hi"]
253 |     (parser [:S (abnf "'Hi'" :string-ci false)]) "hi" :fail))
254 | 


--------------------------------------------------------------------------------
/test/instaparse/auto_flatten_seq_test.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.auto-flatten-seq-test
 2 |   (:require
 3 |     [instaparse.auto-flatten-seq :refer [auto-flatten-seq conj-flat convert-afs-to-vec]]
 4 |     #?(:clj [clojure.test :refer [deftest are is]]
 5 |        :cljs [cljs.test]))
 6 |   #?(:cljs (:require-macros [cljs.test :refer [deftest are is]])))
 7 | 
 8 | (defn rand-mutation [v iv]
 9 |   ;; Applies one random edit to both vector `v` and flat seq `iv`; returns [v' iv' op-code].
10 |   (let [op    (int (rand-int 3))
11 |         chunk (fn [] (auto-flatten-seq (repeat (rand-int 64) (rand-int 50000))))]
12 |     (case op
13 |       0 (let [n (rand-int 50000)] [(conj v n) (conj-flat iv n) op])
14 |       1 (let [r (chunk)] [(into (vec (seq r)) v) (conj-flat r iv) op])
15 |       2 (let [r (chunk)] [(into v r) (conj-flat iv r) op]))))
16 |         
17 | (deftest rand-incremental-vector-test  ; randomized check that an auto-flatten-seq stays equivalent to a plain vector
18 |   (is (= (conj-flat (auto-flatten-seq [:s]) nil) [:s]))  ; conj-flat of nil must leave the contents unchanged
19 |   (loop [v (vec (range 100)) iv (auto-flatten-seq (range 100)) n 50 loops 20]  ; n: mutations left in this round; loops: rounds left
20 |     (let [[v iv rnd] (rand-mutation v iv)]  ; same mutation applied to both; runs before the termination checks below
21 |       (cond
22 |         (zero? loops) nil
23 |         (zero? n) (recur (vec (range 100)) (auto-flatten-seq (range 100)) 50 (dec loops))  ; round over: restart from fresh 0..99 data
24 |         :else
25 |         (do
26 |           (is (= (count v) (count iv)))
27 |           (is (= v iv))
28 |           (is (= iv v))  ; equality is checked in both argument orders
29 |           (is (= (hash v) (hash iv)))
30 |           (is (= (seq v) (seq iv)))
31 |           (is (= v (convert-afs-to-vec iv)))
32 |           (is (= (convert-afs-to-vec iv) v))
33 |           (is (= (type (empty (convert-afs-to-vec iv))) (type v)))  ; the converted value must have the same type as a vector
34 |           (is (= (hash v) (hash (convert-afs-to-vec iv))))
35 |           (recur v iv (dec n) loops))))))
36 | 
37 | (defn depth [v]
38 |   ;; Maximum nesting level of sequential elements inside `v` (0 for a flat or empty
39 |   ;; collection). Folding over the elements recurses only into nested children, so a
40 |   ;; long flat sequence cannot overflow the stack the way recursion on `rest` could.
41 |   (reduce max 0 (map #(inc (depth %)) (filter sequential? v))))
42 | 


--------------------------------------------------------------------------------
/test/instaparse/defparser_test.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.defparser-test
 2 |   (:require
 3 |     #?(:clj  [clojure.test :as t :refer [deftest are is]]
 4 |        :cljs [cljs.test :as t :refer-macros [deftest are is]])
 5 |     #?(:clj  [instaparse.core :as insta :refer [defparser]]
 6 |        :cljs [instaparse.core :as insta :refer-macros [defparser]])
 7 |     [instaparse.combinators :as c]
 8 |     [instaparse.core-test :refer [parsers-similar?]]))
 9 | 
10 | (defparser p1 "S = #'a' | 'b'")  ; grammar given as an inline string
11 | ;; p2: the same grammar as a vector of rule keyword and combinator tree
12 | (defparser p2 [:S (c/alt (c/regexp #"a") (c/string "b"))])
13 | ;; p3: the same grammar as a map, with an explicit :start rule
14 | (defparser p3 {:S (c/alt (c/regexp #"a") (c/string "b"))}
15 |   :start :S)
16 | ;; p4: a file path; NOTE(review): presumably defparser reads the grammar from it - confirm
17 | (defparser p4 "test/data/defparser_grammar.txt")
18 | ;; p5: ordinary runtime parser that the defparser variants are compared against
19 | (def p5 (insta/parser "S = #'a' | 'b'"))
20 | 
21 | (deftest defparser-test-standard  ; all five definitions of the grammar must be "similar" parsers
22 |   (is (parsers-similar? p1 p2 p3 p4 p5))
23 |   ;; Clojure only: each quoted defparser form is eval'd inside thrown?; the expected class follows it.
24 |   #?(:clj
25 |      (are [x y] (thrown? y (eval (quote x)))
26 |        (instaparse.core/defparser p6 "test/data/parser_not_found.txt")
27 |        Exception
28 |        ;; NOTE(review): with :no-slurp true the path is presumably parsed as grammar text - confirm
29 |        (instaparse.core/defparser p7 "test/data/defparser_grammar.txt" :no-slurp true)
30 |        Exception)))
31 | 
32 |        ;; Disabled case for the `are` table above: defparser rejects, up front, any
33 |        ;; option that is too complicated to evaluate at macro time.
34 |        ;; [The case was removed because a bug in Clojure 1.10 prevents capture of
35 |        ;; errors triggered during macroexpansion.] It read:
36 |        ;; (instaparse.core/defparser p8 "S = #'a' | 'b'" :input-format (do :ebnf))
37 |        ;; AssertionError
38 | 
39 | 
40 | 
41 | 
42 | (defparser a1 "S = #'a' / 'b'"  ; ABNF grammar compiled through defparser
43 |   :input-format :abnf)
44 | ;; a2: the same ABNF grammar built at runtime with insta/parser
45 | (def a2 (insta/parser "S = #'a' / 'b'" :input-format :abnf))
46 | ;; a3: EBNF spelling of the grammar, with :string-ci true
47 | (defparser a3 "S = #'a' | 'b'"
48 |   :input-format :ebnf, :string-ci true)
49 | ;; The three parsers above are expected to be "similar" per parsers-similar?.
50 | (deftest defparser-test-abnf
51 |   (is (parsers-similar? a1 a2 a3)))
52 | 
53 | 
54 | 
55 | (defparser ws1 "S = (<whitespace?> 'a')+ <whitespace?>; <whitespace> = #'\\s+'")  ; whitespace written out by hand in the grammar
56 | ;; ws2: plain grammar plus the :standard auto-whitespace option
57 | (defparser ws2 "S = 'a'+" :auto-whitespace :standard)
58 | ;; ws3: auto-whitespace given as an inline whitespace parser
59 | (defparser ws3 "S = 'a'+" :auto-whitespace (insta/parser "whitespace = #'\\s+'"))
60 | ;; ws4: auto-whitespace given through a local binding that wraps the defparser form
61 | (let [ws (insta/parser "whitespace = #'\\s+'")]
62 |   (defparser ws4 "S = 'a'+" :auto-whitespace ws))
63 | ;; ws5: runtime insta/parser counterpart of ws2
64 | (def ws5 (insta/parser "S = 'a'+" :auto-whitespace :standard))
65 | ;; ws6: the rules of ws1 in the opposite order, with :start :S passed explicitly
66 | (defparser ws6 "<whitespace> = #'\\s+'; S = (<whitespace?> 'a')+ <whitespace?>"
67 |   :start :S)
68 | ;; All six parsers above are expected to be "similar" per parsers-similar?.
69 | (deftest defparser-test-auto-whitespace
70 |   (is (parsers-similar? ws1 ws2 ws3 ws4 ws5 ws6)))
71 | 
72 | 
73 | 
74 | (defparser e1 "S = 'a'+" :output-format :enlive)  ; :enlive output requested through defparser
75 | ;; e2: the same parser built at runtime with insta/parser
76 | (def e2 (insta/parser "S = 'a'+" :output-format :enlive))
77 | ;; Both must be "similar" parsers and must produce equal trees for the input "a".
78 | (deftest defparser-test-enlive
79 |   (is (parsers-similar? e1 e2))
80 |   (is (= (e2 "a") (e1 "a"))))
81 | 


--------------------------------------------------------------------------------
/test/instaparse/failure_test.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.failure-test
 2 |   (:require
 3 |     [instaparse.failure :refer [marker pprint-failure]]
 4 |     #?(:clj [clojure.test :refer [deftest are is]]
 5 |        :cljs [cljs.test]))
 6 |   #?(:cljs (:require-macros
 7 |              [cljs.test :refer [is are deftest]])))
 8 | 
 9 | ;; Tests the marker function by counting the tab characters in both the text
10 | ;; and marker lines to make sure the count is the same (and is the expected 2).
11 | (deftest marker-test
12 |   (let [text        "\t\ti'm a sample error line with tabs."
13 |         marker-line (marker text 16)
14 |         ;; Filter with the char \tab: on the JVM a string's elements are chars, which
15 |         ;; never equal the string "\t", so a #{"\t"} predicate counts 0 on both sides.
16 |         tab-count   #(count (filter #{\tab} %))]
17 |     (is (= 2 (tab-count text) (tab-count marker-line)))))
18 | 
19 | (deftest pprint-failure-test
20 |   ;; The printed report must contain the location header, the offending text, a caret
21 |   ;; line that starts with the same two tabs as the text, and the expected token.
22 |   (let [text    "\t\ti'm a sample error line with tabs."
23 |         failure {:line 3, :column 16, :text text
24 |                  :reason [{:tag :string :expecting "A"}]}
25 |         nl      (println-str)
26 |         report  (str "Parse error at line 3, column 16:" nl
27 |                      text nl
28 |                      "\t\t             ^" nl
29 |                      "Expected:" nl "\"A\"" nl)]
30 |     (is (= (with-out-str (pprint-failure failure)) report))))
31 | 


--------------------------------------------------------------------------------
/test/instaparse/namespaced_nts_test.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.namespaced-nts-test
 2 |   (:require
 3 |    #?(:clj [clojure.test :refer [deftest is]]
 4 |       :cljs [cljs.test :as t])
 5 |    #?(:clj [instaparse.core :as insta]
 6 |       :cljs [instaparse.core :as insta]))
 7 |   #?(:cljs (:require-macros
 8 |             [cljs.test :refer [is deftest]])))
 9 | 
10 | (def namespaced-nts-parser  ; grammar with namespaced rule names (keyword/hello, keyword.namespaced/bye)
11 |   (insta/parser
12 |    "S = token (<ws> token)*
13 |     ws = #'\\s+'
14 |     keyword/hello = 'hello'
15 |     keyword.namespaced/bye = 'bye'
16 |     <keyword> = keyword/hello | keyword.namespaced/bye
17 |     identifier = #'\\S+'
18 |     token = keyword / identifier"
19 |    :allow-namespaced-nts true))  ; NOTE(review): presumably needed for the '/'-qualified names to be read as rule names - confirm
20 | 
21 | (deftest parser  ; "bye" must come back tagged with its namespaced keyword
22 |   (is (= [:S [:token [:keyword.namespaced/bye "bye"]]] (namespaced-nts-parser "bye"))))
23 | 
24 | (deftest round-trip
25 |   ;; The parser's printed form is fed back in as a grammar; the result must print identically.
26 |   (let [grammar (prn-str namespaced-nts-parser)
27 |         reprint #(prn-str (insta/parser % :allow-namespaced-nts true))]
28 |     (is (= grammar
29 |            (reprint grammar)))))
30 | 
31 | 


--------------------------------------------------------------------------------
/test/instaparse/repeat_test.cljc:
--------------------------------------------------------------------------------
  1 | (ns instaparse.repeat-test
  2 |   (:require #?(:clj  [clojure.test :refer [deftest are]]
  3 |                :cljs [cljs.test :as t])
  4 |             [instaparse.core :as insta]
  5 |             [instaparse.repeat :as repeat])
  6 |   #?(:cljs (:require-macros [cljs.test :refer [are deftest]])))
  7 | 
  8 | (def user-parser  ; grammar text (a string, not a parser); only mentioned in a commented-out row of memory-optimize-test
  9 | "content = user-block*
 10 | user-block = (user before-section after-section < blank-line* >)
 11 | user = prefix separator number separator name newline
 12 | before-section = < before > lines error-line*
 13 | after-section = < after > lines
 14 | <before> = < 'BEFORE' newline >
 15 | <after> = < 'AFTER' newline >
 16 | <prefix> = < 'User' >
 17 | <lines> = line*
 18 | <line> = <#'\\s+'> subscription newline
 19 | <error-line> = ( '(no dates!)' | 'FIXUP!' ) newline
 20 | blank-line = #'\\s*\n'
 21 | name = #'.*'
 22 | (*WIP why infinite loop?*)
 23 | subscription = !prefix #'.*?(?=\\s+-)' < separator > date
 24 | date = #'.*'
 25 | <newline> = <'\n'>
 26 | <separator> = <#'[ -]+'>
 27 | number = #'[0-9]+'
 28 |  
 29 | ")
 30 | 
 31 | (deftest memory-optimize-test  ; table-driven check of when the :optimize :memory strategy is actually used
 32 |   (are [grammar text optimize?]
 33 |        (let [parser (insta/parser grammar)  ; built without an :output-format option
 34 |              parser-enlive (insta/parser grammar :output-format :enlive)
 35 |              tree1 (parser text)
 36 |              tree2 (parser text :optimize :memory)  ; same parse, requesting the memory optimization
 37 |              tree3 (parser-enlive text)
 38 |              tree4 (parser-enlive text :optimize :memory)]
 39 |          (and (= tree1 tree2) (= tree3 tree4)  ; results with and without the option must agree
 40 |             (= optimize? (repeat/used-memory-optimization? tree2))  ; and the optimization must be reported exactly when expected
 41 |             (= optimize? (repeat/used-memory-optimization? tree4))))
 42 |        ;; Columns: grammar, input text, whether the memory optimization is expected to be used.
 43 |        ;user-parser text true
 44 |        "S = 'ab'*" "ababab" true  ; group: star over a string literal
 45 |        "S = 'ab'*" "abababd" false
 46 |        "S = 'ab'*" "" false
 47 |        "<S> = 'ab'*" "ababab" true
 48 |        "<S> = 'ab'*" "abababd" false
 49 |        "<S> = 'ab'*" "" false
 50 |        "S = <'ab'>*" "ababab" false
 51 |        "S = <'ab'*>" "ababab" false
 52 |        ;; group: star over a non-terminal
 53 |        "S = A*; A = 'a'" "aaaa" true
 54 |        "S = A*; A = 'a'" "aaaad" false
 55 |        "S = A*; A = 'a'" "" false
 56 |        "<S> = A*; A = 'a'" "aaaa" true
 57 |        "<S> = A*; A = 'a'" "aaaad" false
 58 |        "<S> = A*; A = 'a'" "" false
 59 |        "S = <A>*; A = 'a'" "aaaa" false
 60 |        "S = <A*>; A = 'a'" "aaaa" false
 61 |        ;; group: plus over a string literal
 62 |        "S = 'ab'+" "ababab" true
 63 |        "S = 'ab'+" "abababd" false
 64 |        "S = 'ab'+" "" false
 65 |        "<S> = 'ab'+" "ababab" true
 66 |        "<S> = 'ab'+" "abababd" false
 67 |        "<S> = 'ab'+" "" false
 68 |        "S = <'ab'>+" "ababab" false
 69 |        "S = <'ab'+>" "ababab" false
 70 |        ;; group: plus over a non-terminal
 71 |        "S = A+; A = 'a'" "aaaa" true
 72 |        "S = A+; A = 'a'" "aaaad" false
 73 |        "S = A+; A = 'a'" "" false
 74 |        "<S> = A+; A = 'a'" "aaaa" true
 75 |        "<S> = A+; A = 'a'" "aaaad" false
 76 |        "<S> = A+; A = 'a'" "" false
 77 |        "S = <A>+; A = 'a'" "aaaa" false
 78 |        "S = <A+>; A = 'a'" "aaaa" false
 79 |        ;; group: literal 'c' prefix, then star over a string literal
 80 |        "S = 'c' 'ab'*" "cababab" true
 81 |        "S = 'c' 'ab'*" "cabababd" false
 82 |        "S = 'c' 'ab'*" "dababab" false
 83 |        "S = 'c' 'ab'*" "c" false
 84 |        "S = 'c' 'ab'*" "" false
 85 |        "<S> = 'c' 'ab'*" "cababab" true
 86 |        "<S> = 'c' 'ab'*" "cabababd" false
 87 |        "<S> = 'c' 'ab'*" "dcababab" false
 88 |        "<S> = 'c' 'ab'*" "c" false
 89 |        "<S> = 'c' 'ab'*" "" false
 90 |        "S = 'c' <'ab'>*" "cababab" false
 91 |        "S = 'c' <'ab'*>" "cababab" false
 92 |        "S = <'c'> <'ab'>*" "cababab" false
 93 |        "S = <'c'> 'ab'*" "cababab" false
 94 |        ;; group: literal 'c' prefix, then star over a non-terminal
 95 |        "S = 'c' A*; A = 'a'" "caaaa" true
 96 |        "S = 'c' A*; A = 'a'" "caaaad" false
 97 |        "S = 'c' A*; A = 'a'" "dcaaaad" false
 98 |        "S = 'c' A*; A = 'a'" "c" false
 99 |        "<S> = 'c' A*; A = 'a'" "caaaa" true
100 |        "<S> = 'c' A*; A = 'a'" "caaaad" false
101 |        "<S> = 'c' A*; A = 'a'" "daaaad" false
102 |        "<S> = 'c' A*; A = 'a'" "c" false
103 |        "S = 'c' <A>*; A = 'a'" "caaaa" false
104 |        "S = 'c' <A*>; A = 'a'" "caaaa" false
105 |        ;; group: literal 'c' prefix, then plus over a string literal
106 |        "S = 'c' 'ab'+" "cababab" true
107 |        "S = 'c' 'ab'+" "dababab" false
108 |        "S = 'c' 'ab'+" "abababd" false
109 |        "S = 'c' 'ab'+" "c" false
110 |        "S = 'c' 'ab'+" "" false
111 |        "<S> = 'c' 'ab'+" "cababab" true
112 |        "<S> = 'c' 'ab'+" "cabababd" false
113 |        "<S> = 'c' 'ab'+" "dcababab" false
114 |        "<S> = 'c' 'ab'+" "c" false
115 |        "<S> = 'c' 'ab'+" "" false
116 |        "S = 'c' <'ab'>+" "cababab" false
117 |        "S = 'c' <'ab'+>" "cababab" false
118 |        "S = <'c'> <'ab'>+" "cababab" false
119 |        "S = <'c'> 'ab'+" "cababab" false
120 |        ;; group: literal 'c' prefix, then plus over a non-terminal
121 |        "S = 'c' A+; A = 'a'" "caaaa" true
122 |        "S = 'c' A+; A = 'a'" "caaaad" false
123 |        "S = 'c' A+; A = 'a'" "dcaaaa" false
124 |        "S = 'c' A+; A = 'a'" "c" false
125 |        "<S> = 'c' A+; A = 'a'" "caaaa" true
126 |        "<S> = 'c' A+; A = 'a'" "caaaad" false
127 |        "<S> = 'c' A+; A = 'a'" "dcaaaa" false
128 |        "<S> = 'c' A+; A = 'a'" "c" false
129 |        "S = 'c' <A>+; A = 'a'" "caaaa" false
130 |        "S = 'c' <A+>; A = 'a'" "caaaa" false
131 |        ;; group: non-terminal C prefix, then plus over a non-terminal
132 |        "S = C A+; C = 'c'; A = 'a'" "caaaa" true
133 |        "S = C A+; C = 'c'; <A> = 'a'" "caaaa" true
134 |        "S = C A+; <C> = 'c'; A = 'a'" "caaaa" true
135 |        "S = C A+; <C> = 'c'; <A> = 'a'" "caaaa" true
136 |        "S = <C> A+; C = 'c'; A = 'a'" "caaaa" false
137 |        "S = C A+; C = 'c'; A = 'a'" "caaaad" false
138 |        "S = C A+; C = 'c'; A = 'a'" "dcaaaa" false
139 |        "S = C A+; C = 'c'; A = 'a'" "c" false
140 |        "<S> = C A+; C = 'c'; A = 'a'" "caaaa" true
141 |        "<S> = <C> A+; C = 'c'; A = 'a'" "caaaa" false
142 |        "<S> = C A+; C = 'c'; A = 'a'" "caaaad" false
143 |        "<S> = C A+; C = 'c'; A = 'a'" "dcaaaa" false
144 |        "<S> = C A+; C = 'c'; A = 'a'" "c" false
145 |        "S = C <A>+; C = 'c'; A = 'a'" "caaaa" false
146 |        "S = C <A+>; C = 'c'; A = 'a'" "caaaa" false
147 |        ))
148 | 


--------------------------------------------------------------------------------
/test/instaparse/specs.cljc:
--------------------------------------------------------------------------------
 1 | (ns instaparse.specs)
 2 | 
 3 | (def cfg1 "S = 'a'")  ; a single rule matching one string literal
 4 | (def cfg2  ; chain of rules that each refer to another non-terminal
 5 |   "S = X
 6 |    X = Y
 7 |    Y = Z")
 8 | (def cfg3  ; alternation with | plus a two-element sequence
 9 |   "S = X | Y
10 |    Y = A Z
11 |    Z = 'a'")
12 | (def cfg4  ; rules written with := instead of =
13 |   "S := A B | C
14 |    C := (A | B) C")
15 | (def cfg5  ; ? suffix, with no whitespace around =
16 |   "S=A?")
17 | (def cfg6  ; ? suffix applied to a parenthesized alternation
18 |   "S =(A | B)?")
19 | (def cfg7  ; comma-separated sequence using ?, *, and +
20 |   "S = A, B?, (C C)*, D+, E")
21 | (def cfg8  ; angle brackets around the rule name and around part of the body
22 |   "<S> = <A B?> (C | D)")
23 | (def cfg9  ; & prefix after a comma
24 |   "S = A, &B")
25 | (def cfg10  ; & prefix at the start of the rule body
26 |   "S = &B A")
27 | (def cfg11  ; & prefix combined with a + suffix
28 |   "S = &B+ A")
29 | (def cfg12  ; ! prefix
30 |   "S = !B A")
31 | (def cfg13  ; ! prefix applied to an & prefix
32 |   "S = !&B A")
33 | (def cfg15  ; rules ended by ";", ".", and a newline; uses Epsilon (there is no cfg14 in this file)
34 |   "S = 'a' S | Epsilon;
35 |    C = 'b'.
36 |    D = A")
37 | (def cfg16  ; choice written with /
38 |   "S = 'a' / 'b'")
39 | (def cfg17  ; / and | mixed in one rule
40 |   "S = 'a' / 'b' | 'c'")
41 | (def cfg18  ; | and / mixed in the other order
42 |   "S = 'a' | 'b' / 'c'")
43 | (def cfg19  ; ! prefix on a non-terminal and on a string literal
44 |   "S = A ('a' | 'b')+
45 |    A = !B
46 |    B = 'a' !'b'")
47 | (def cfg20  ; (* ... *) comments, including a multi-line one and a nested one
48 |   "(* A comment about this grammar 
49 |    *split* (across) lines *)
50 |    (* And some (* nested *) comments *)
51 |    S = (A*)
52 |    A = 'a'")
53 | 


--------------------------------------------------------------------------------
/test/instaparse/viz_test.clj:
--------------------------------------------------------------------------------
 1 | (ns instaparse.viz-test
 2 |   (:require instaparse.core)
 3 |   (:use instaparse.viz))
 4 | 
 5 | (def make-tree-e
 6 |      "Simple tree parser: visible `tree` root rule, :enlive output format."
 7 |      (instaparse.core/parser "tree: node* 
 8 |               node: leaf | <'('> node (<'('> node <')'>)* node* <')'> 
 9 |               leaf: #'a+'
10 |               " :output-format :enlive))
11 | 
12 | (def make-tree-h
13 |      "Simple tree parser: visible `tree` root rule, :hiccup output format."
14 |      (instaparse.core/parser "tree: node* 
15 |               node: leaf | <'('> node (<'('> node <')'>)* node* <')'> 
16 |               leaf: #'a+'
17 |               " :output-format :hiccup))
18 | 
19 | (def make-tree-se
20 |      "Simple tree parser: root rule written as <tree> (angle brackets), :enlive output format."
21 |      (instaparse.core/parser "<tree>: node* 
22 |               node: leaf | <'('> node (<'('> node <')'>)* node* <')'> 
23 |               leaf: #'a+'
24 |               " :output-format :enlive))
25 | 
26 | (def make-tree-sh
27 |      "Simple tree parser: root rule written as <tree> (angle brackets), :hiccup output format."
28 |      (instaparse.core/parser "<tree>: node* 
29 |               node: leaf | <'('> node (<'('> node <')'>)* node* <')'> 
30 |               leaf: #'a+'
31 |               " :output-format :hiccup))
32 | 
33 | (defn view-test-trees [t]
34 |   ;; Shows six sample trees with tree-viz, calling (Thread/sleep t) after each of the
35 |   ;; first five: the nested sample through all four parsers, then the empty input
36 |   ;; through two of them. The last tree-viz call's value is the return value.
37 |   (let [sample "((a)((a)))(a)"]
38 |     (doseq [[make-tree input] [[make-tree-e sample]
39 |                                [make-tree-h sample]
40 |                                [make-tree-sh sample]
41 |                                [make-tree-se sample]
42 |                                [make-tree-e ""]]]
43 |       (tree-viz (make-tree input)) (Thread/sleep t))
44 |     (tree-viz (make-tree-se ""))))
45 | 


--------------------------------------------------------------------------------