Parse-EZ: Clojure Parser Library

API Documentation

Parse-EZ is a parser library for Clojure programmers. It allows easy mixing of
declarative and imperative styles and does not require any special constructs,
macros, monads, etc. to write custom parsers. All the parsing is implemented
using regular Clojure functions.

The library provides a number of parse functions and combinators and comes with
a built-in customizable infix expression parser and evaluator. It lets the
programmer concisely specify the structure of input text using Clojure functions
and easily build parse trees without having to step out of Clojure. Whether you
are writing a parser for well-structured data, scraping data, or prototyping a
new language, you can use this library to quickly create a parser.

Features

- Parse functions and combinators
- Automatic handling of whitespace and comments
- Marking positions and backtracking
- Seek, read, and skip string/regex patterns
- Built-in customizable expression parser and evaluator
- Exception-based error handling
- Custom error messages


Usage

Installation

Just add Parse-EZ as a dependency to your lein project:
[protoflex/parse-ez "0.4.2"]

and run lein deps.

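For reference, a minimal project.clj using this dependency might look like the
following (the project name, version, and Clojure version are placeholders, not
prescribed by Parse-EZ):

(defproject my-app "0.1.0"                         ; hypothetical project name/version
  :description "Example project using Parse-EZ"
  :dependencies [[org.clojure/clojure "1.3.0"]     ; any compatible Clojure version
                 [protoflex/parse-ez "0.4.2"]])
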
A Taste of Parse-EZ

Here are a couple of sample parsers to give you a taste of the library.

CSV Parser

A CSV file contains multiple records, one record per line, with field values
separated by a delimiter such as a comma or a tab. The field values may
optionally be quoted using either single or double quotes. When field values are
quoted, they may contain the field-delimiter characters, and in such cases those
characters are not treated as field separators.

First, let us define a parse function for parsing one line of a CSV file:
(defn csv-1 [sep]
  (sep-by #(any-string sep) #(chr sep)))

In the above function definition, we make use of the parse combinator sep-by,
which takes two arguments: the first one reads a field value and the second one
reads the separator. Here, we have used Clojure's anonymous-function shortcuts to
specify the desired behavior succinctly. The any-string function matches a
single-quoted string, a double-quoted string, or a plain string followed by the
specified separator sep. This is exactly the function we need to read a field
value. The second argument provided to sep-by above uses the primitive parse
function chr, which succeeds only when the next character in the input matches
its argument (the sep parameter in this case). The csv-1 function returns the
field values as a vector.

The sep-by function actually takes an optional third argument, a record-separator
function, which defaults to a function that matches a newline. We didn't pass the
third argument above because the default behavior suits our purpose. Had the
default behavior of sep-by been different, we would have written the above
function as:
(defn csv-1 [sep]
  (sep-by #(any-string sep) #(chr sep) #(regex #"\r?\n")))

Now that we have created a parse function to parse a single line of a CSV file,
let us write another parse function that parses the entire CSV file content and
returns the result as a vector of vectors of field values (one vector per
record/line). All we need to do is repeatedly apply the csv-1 function defined
above, and the multi* parse combinator does just that.

Just one small but important detail: by default, Parse-EZ automatically trims
whitespace after successfully applying a parse function. This means that the
newline at the end of a line would be consumed after reading the last field
value, and sep-by would be unable to match the end-of-line, which is the record
separator in this case. So we disable the newline-trimming behavior using the
no-trim combinator.
(defn csv [sep]
  (multi* (fn [] (no-trim #(csv-1 sep)))))

Alternatively, you can express the above function a bit more easily using the
macro versions of the combinators introduced in version 0.3.0:
(defn csv [sep]
  (multi* (no-trim_ (csv-1 sep))))

Now, let us try out our CSV parser. First, let us define two test strings, each
containing two records (lines). Note that the second string contains a comma
inside the first cell (a quoted string).
user> (def s1 "1abc,def,ghi\n2jkl,mno,pqr\n")
#'user/s1
user> (def s2 "'1a,bc',def,ghi\n2jkl,mno,pqr\n")
#'user/s2
user> (parse #(csv \,) s1)
[["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
user> (parse #(csv \,) s2)
[["1a,bc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
user>

Well, all we had to do was write two lines of Clojure code to implement the CSV
parser. Let's add a bit more functionality: CSV files may use either a comma or
a tab character to separate the field values. Let's say we don't know ahead of
time which character a file uses as a separator and we want to detect the
separator automatically. Note that both characters may occur in a data file, but
only one acts as a field separator, and only when it's not inside a quoted
string.

Here is our strategy to detect the separator:

- if the first field value is quoted (single or double), read the quoted string
- else, read until a comma or tab occurs
- the next character is our delimiter

Here is the code:
(defn detect-sep []
  (let [m (mark-pos)
        s (attempt #(any dq-str sq-str))
        s (if s s (no-trim #(read-to-re #",|\t")))
        sep (read-ch)]
    (back-to-mark m)
    sep))

Note how we used the mark-pos and back-to-mark Parse-EZ functions to 'unconsume'
the consumed input.

The complete code for the sample CSV parser with the separator-detection
functionality is listed below (you can find it in the csv_parse.clj file under
the examples directory):
(ns protoflex.examples.csv_parse
  (:use [protoflex.parse]))

(declare detect-sep csv-1)

(defn csv
  "Reads and returns one or more records as a vector of vector of field-values"
  ([] (csv (no-trim #(detect-sep))))
  ([sep] (multi* (fn [] (no-trim-nl #(csv-1 sep))))))

(defn csv-1
  "Reads and returns the fields of one record (line)"
  [sep] (sep-by #(any-string sep) #(chr sep)))

(defn detect-sep
  "Detects the separator used in a csv file (a comma or a tab)"
  [] (let [m (mark-pos)
           s (attempt #(any dq-str sq-str))
           s (if s s (no-trim #(read-to-re #",|\t")))
           sep (read-ch)]
       (back-to-mark m)
       sep))

Let's try out the new auto-detect functionality. We define two new test strings,
s3 and s4, that use the tab character as the field separator.
user> (use 'protoflex.examples.csv_parse)
nil
user> (def s3 "1abc\tdef\tghi\n2jkl\tmno\tpqr\n")
#'user/s3
user> (def s4 "'1a\tbc'\tdef\tghi\n2jkl\tmno\tpqr\n")
#'user/s4
user> (parse csv s3)
[["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
user> (parse csv s4)
[["1a\tbc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
user> (parse csv s1)
[["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
user>

As you can see, this time we didn't specify which field separator to use: the
parser itself detected the field-separator character and used it, returning the
desired results.

XML Parser

Here is the listing of a sample XML parser implemented using Parse-EZ. You can
find the source file in the examples directory. The parser returns a map
containing the keys :tag, :attributes, and :children for the root element. The
value for the :attributes key is itself a map of attribute names to their
values. The value for the :children key is a (possibly empty) vector containing
string content and/or maps for child elements.
(ns protoflex.examples.xml_parse
  (:use [protoflex.parse]))

(declare pi prolog element attributes children-and-close cdata elem-or-text close-tag)

(defn parse-xml [xml-str]
  (parse #(between prolog element pi) xml-str :blk-cmt-delim ["<!--" "-->"] :line-cmt-start nil))

(defn- pi [] (while (starts-with? "<?") (skip-over "?>")))

(defn- prolog [] (pi) (attempt #(regex #"(?s)<!DOCTYPE([^<]+?>)|(.*?\]\s*>)")) (pi))

The function parse-xml is the entry point that kicks off parsing of the input
XML string xml-str. It passes the between combinator to Parse-EZ's parse
function. Here, the call to between returns the value returned by the element
parse function, ignoring the content surrounding it (matched by the prolog and
pi functions). The block-comment delimiters are set to match XML comments, and
the line-comment delimiter is cleared (by default these match Java comments).

The parse function pi skips consecutive processing instructions by using the
delimiters <? and ?>.

The parse function prolog skips the DTD declaration (if any) and also any
surrounding processing instructions. Note that the regex used to match the DTD
declaration is only meant for illustration purposes; it isn't complete, but it
will work in most cases.
(def name-start ":A-Z_a-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD")

(def name-char (str name-start "\\-.0-9\\xB7\\u0300-\\u036F\\u203F-\\u2040"))

(def name-re (-> (format "[%s][%s]*" name-start name-char) re-pattern))

name-re is a regular expression that matches XML element and attribute names.
(defn element []
  (let [tag (do (chr \<) (regex name-re))
        attrs (attributes)
        children (look-ahead* [
                   ">" #(children-and-close tag)
                   "/>" (fn [] [])])]
    {:tag tag, :attributes attrs, :children children}))

The element parse function matches an XML element and returns the tag, attribute
map, and children in a hash map. Note the use of the look-ahead* combinator to
handle both cases -- with children and without children. If it sees a ">" after
reading the attributes, the look-ahead* function calls the children-and-close
parse function to read the children and the element close tag. On the other
hand, if it sees "/>" after the attributes, it calls the (almost) empty parse
function that simply returns an empty vector.
(defn attr []
  (let [n (regex name-re) _ (chr \=)
        v (any sq-str dq-str)]
    [n v]))

(defn attributes [] (apply hash-map (flatten (multi* attr))))

The attr parse function matches a single attribute. The attribute value may be a
single-quoted or double-quoted string; note the use of the any parse combinator
for this purpose.

The attributes parse function matches multiple attribute specifications by
passing the attr parse function to the multi* parse combinator.
(defn- children-and-close [tag]
  (let [children (multi* #(between pi elem-or-text pi))]
    (close-tag tag)
    children))

Each child item is read using the elem-or-text parse function while ignoring any
surrounding processing instructions using the between combinator; the combinator
multi* is used to read all the child items.
(defn- elem-or-text []
  (look-ahead [
    "<![CDATA[" cdata
    "</" (fn [] nil)
    "<" element
    "" #(read-to "<")]))

The look-ahead parse combinator is used to call different parse functions based
on different lookahead strings. Note that, unlike the look-ahead* function used
earlier (in the definition of the element parse function), look-ahead does not
consume the lookahead string.
(defn- cdata []
  (string "<![CDATA[")
  (let [txt (read-to "]]>")] (string "]]>") txt))

(defn- close-tag [tag]
  (string (str "</" tag))
  (chr \>))

By now, it should be obvious what the above two functions do.

Well, an XML parser in under 50 lines. Let's try it with a few sample inputs:
user> (use 'protoflex.examples.xml_parse)
nil
user> (parse-xml "<abc>text</abc>")
{:tag "abc", :attributes {}, :children ["text"]}
user> (parse-xml "<abc a1=\"1\" a2=\"attr 2\">sample text</abc>")
{:tag "abc", :attributes {"a1" "1", "a2" "attr 2"}, :children ["sample text"]}
user> (parse-xml "<abc a1=\"1\" a2=\"attr 2\"><def d1=\"99\">xxx</def></abc>")
{:tag "abc", :attributes {"a1" "1", "a2" "attr 2"}, :children [{:tag "def", :attributes {"d1" "99"}, :children ["xxx"]}]}
user>

Comments and Whitespace

By default, Parse-EZ automatically handles comments and whitespace. This
behavior can be turned on or off temporarily using the macros with-trim-on and
with-trim-off respectively. The parser option :auto-trim can be used to enable
or disable the automatic handling of whitespace and comments. Use the parser
option :blk-cmt-delim to specify the begin and end delimiters for block
comments. The parser option :line-cmt-start can be used to specify the
line-comment marker. By default, these options are set to the Java/C++ block and
line comment markers respectively. You can alter the whitespace recognizer by
setting the :ws-regex parser option; by default it is set to #"\s+".
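
These options are passed as keyword arguments to parse, the same way the XML
example above sets :blk-cmt-delim and :line-cmt-start. A minimal sketch
(my-parser and the delimiter values below are placeholders, not part of the
library):

(parse my-parser input
       :auto-trim true                 ; keep automatic trimming on
       :blk-cmt-delim ["(*" "*)"]      ; hypothetical Pascal-style block comments
       :line-cmt-start "--"            ; hypothetical line-comment marker
       :ws-regex #"[ \t\r\n]+")        ; custom whitespace recognizer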

Alternatively, you can turn off auto-handling of whitespace and comments and use
the lexeme function, which trims whitespace/comments after applying the parse
function passed as its argument.

Also see the no-trim and no-trim-nl functions.

Primitive Parse Functions

Parse-EZ provides a number of primitive parse functions such as: chr, chr-in,
string, string-in, word, word-in, sq-str, dq-str, any-string, regex, read-to,
skip-over, read-re, read-to-re, skip-over-re, read-n, read-ch, read-ch-in-set,
etc. See API Documentation.

Let us try some of the built-in primitive parse functions:
user> (use 'protoflex.parse)
nil
user> (parse integer "12")
12
user> (parse decimal "12.5")
12.5
user> (parse #(chr \a) "a")
\a
user> (parse #(chr-in "abc") "b")
\b
user> (parse #(string-in ["abc" "def"]) "abc")
"abc"
user> (parse #(string-in ["abc" "def"]) "abcx")
Parse Error: Extraneous text at line 1, col 4
[Thrown class java.lang.Exception]

Note the parse error for the last parse call. By default, the parse function
parses to the end of the input text. Even though the first three characters of
the input text are recognized as valid input, a parse error is generated because
the input cursor would not be at the end of the input text after recognizing
"abc".

The parser option :eof can be set to false to allow recognition of partial input:
user> (parse #(string-in ["abc" "def"]) "abcx" :eof false)
"abc"
user>

You can start parsing by looking for some marker patterns using the read-to,
read-to-re, skip-over, and skip-over-re functions.
user> (parse #(do (skip-over ">>") (number)) "ignore upto this>> 456.7")
456.7

Parse Combinators

Parse combinators in Parse-EZ are higher-order functions that take other parse
functions as input arguments and combine/apply them in different ways to
implement new parse functionality. Parse-EZ provides parse combinators such as:
opt, attempt, any, series, multi*, multi+, between, look-ahead, lexeme, expect,
etc. See API Documentation.

Let us try some of the built-in parse combinators:
user> (parse #(opt integer) "abc" :eof false)
nil
user> (parse #(opt integer) "12")
12
user> (parse #(any integer decimal) "12")
12
user> (parse #(any integer decimal) "12.3")
12.3
user> (parse #(series integer decimal integer) "3 4.2 6")
[3 4.2 6]
user> (parse #(multi* integer) "1 2 3 4")
[1 2 3 4]
user> (parse #(multi* (fn [] (string-in ["abc" "def"]))) "abcabcdefabc abcdef")
["abc" "abc" "def" "abc" "abc" "def"]
user>

You can create your own parse functions on top of the primitive parse functions
and/or parse combinators provided by Parse-EZ.
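
For example, here is a small hypothetical parse function (not part of the
library or its examples) that combines the primitives and combinators shown
above to read a name=value setting, where the value may be a quoted string or
an integer:

(defn key-val
  "Reads a setting of the form name=value, where value is a
   single-quoted string, a double-quoted string, or an integer."
  []
  (let [k (regex #"[A-Za-z_][A-Za-z0-9_]*")   ; the setting name
        _ (chr \=)                            ; the '=' separator
        v (any sq-str dq-str integer)]        ; quoted string or integer value
    [k v]))

;; (parse key-val "width = 80") should yield ["width" 80], since the default
;; auto-trimming takes care of the whitespace around '='.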

Committing to a Particular Parse Branch

Version 0.4.0 added support for committing to a particular parse branch via the
new parse combinators commit and commit-on. These functions make the parser
commit to the current parse branch, so that subsequent parse failures in that
branch are reported as parse errors rather than causing the parser to try other
alternatives at higher levels.

Nesting Parse Combinators Using Macros

Version 0.3.0 of Parse-EZ adds macro versions of the parse combinator functions
to make it easy to nest calls to parse combinators without having to write
nested anonymous functions using the "(fn [] ...)" syntax (note that Clojure
does not allow nesting of anonymous functions of the "#(...)" form). Whereas the
existing parse combinators take parse functions as arguments, actually perform
the parsing, and return the parse results, the newly added macros take parse
expressions as arguments and return parse functions (to be passed to other parse
combinators). These macros are named the same as the corresponding parse
combinators but with an underscore ("_") suffix; for example, the macro version
of "any" is named "any_".
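
As a sketch of the difference (based on the naming convention above and the csv
example earlier; the exact combinator mix is illustrative):

;; With the function combinators, nesting requires explicit (fn [] ...) wrappers:
(multi* (fn [] (any integer sq-str)))

;; With the macro versions, the inner combinator call itself yields a parse
;; function, so the wrapper disappears:
(multi* (any_ (integer) (sq-str)))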

Error Handling

Parse errors are handled in Parse-EZ using exceptions. The default error
messages generated by Parse-EZ include line and column number information and,
in some cases, what is expected at that location. However, you can provide your
own custom error messages by using the expect parse combinator.
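
Since parse errors are thrown as exceptions (java.lang.Exception, as seen in the
string-in example above), a caller can handle them with an ordinary try/catch; a
minimal sketch:

(try
  (parse #(string-in ["abc" "def"]) "abcx")
  (catch Exception e
    ;; e.g. "Parse Error: Extraneous text at line 1, col 4"
    (println (.getMessage e))))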

Expressions

Parse-EZ includes a customizable expression parser, expr, for parsing
expressions in infix notation, and an expression evaluator function, eval-expr,
to evaluate infix expressions. You can customize the operators, their
precedences, and their associativity using the :operators option to the parse
function. For evaluating expressions, you can optionally specify the functions
to invoke for each operator using the :op-fn-map option.
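
A minimal sketch, assuming expr and eval-expr are used like the other parse
functions in this document (i.e., passed to parse); check the API documentation
for the exact return shape of expr:

(parse expr "1 + 2 * 3")        ; parses the infix expression into a parse tree
(parse eval-expr "1 + 2 * 3")   ; evaluates it; with the usual precedence this is 7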

Parser State

The parser state consists of the input cursor and various parser options
(specified or derived), such as those affecting whitespace and comment parsing,
word recognition, expression parsing, etc. The parser options can be changed at
any time in your own parse functions using set-opt.
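
For instance, a hypothetical parse function might adjust the whitespace
recognizer mid-parse (this sketch assumes set-opt takes an option keyword and a
value; see the API documentation for the exact signature):

(defn comma-separated-ints
  "Hypothetical: treat commas as whitespace so that auto-trimming
   skips them, then read all the integers."
  []
  (set-opt :ws-regex #"[\s,]+")
  (multi* integer))

;; (parse comma-separated-ints "1, 2, 3") would then yield [1 2 3].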

Note that most of the parse functions affect the parser state (e.g., the input
cursor) and hence are not pure functions. The side effects could be avoided by
making the parser state an explicit parameter to all the parse functions and
returning the changed parser state along with the parse value from each of them.
However, the result would be a significantly less programmer-friendly API. We
made a design decision to keep the parse functions simple and easy to use rather
than to fanatically keep them "pure".

Relation to Parsec

Parsec is a popular parser combinator library written in Haskell. While Parse-EZ
makes use of some of its ideas, it is not a port of Parsec to Clojure.

License

Copyright (C) 2012 Protoflex Software

Distributed under the Eclipse Public License, the same as Clojure.
