├── api-doc.zip ├── target └── stale │ └── extract-native.dependencies ├── .gitignore ├── bin └── protoflex │ ├── util.clj │ ├── examples │ ├── csv_parse.clj │ └── xml_parse.clj │ ├── test_parse.clj │ └── parse.clj ├── src └── protoflex │ ├── util.clj │ ├── examples │ ├── csv_parse.clj │ └── xml_parse.clj │ └── parse.clj ├── pom.xml.asc ├── project.clj ├── .project ├── .classpath ├── ReleaseNotes.md ├── .settings └── ccw.repl.cmdhistory.prefs ├── ReleaseNotes.html ├── test └── protoflex │ └── test_parse.clj ├── README.md └── README.html /api-doc.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/protoflex/parse-ez/HEAD/api-doc.zip -------------------------------------------------------------------------------- /target/stale/extract-native.dependencies: -------------------------------------------------------------------------------- 1 | ([:dependencies ([org.clojure/clojure "1.4.0"])]) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /pom.xml 2 | *jar 3 | /lib 4 | /classes 5 | /native 6 | /.lein-failures 7 | /checkouts 8 | /.lein-deps-sum 9 | /api-doc 10 | /README.md.html 11 | /ReleaseNotes.md.html 12 | /doc.sh 13 | -------------------------------------------------------------------------------- /bin/protoflex/util.clj: -------------------------------------------------------------------------------- 1 | (ns ^{ :doc "Misc Utilities."} protoflex.util) 2 | 3 | (defmacro apply-macro [m args] `(eval (list* '~m ~args))) 4 | 5 | (defmacro macro->fn [m] `(fn [& args#] (apply-macro ~m args#))) -------------------------------------------------------------------------------- /src/protoflex/util.clj: -------------------------------------------------------------------------------- 1 | (ns ^{ :doc "Misc Utilities."} protoflex.util) 2 | 3 | (defmacro apply-macro [m args] `(eval (list* '~m ~args))) 4 | 5 | (defmacro macro->fn [m] `(fn [& args#] (apply-macro ~m args#))) -------------------------------------------------------------------------------- /pom.xml.asc: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP SIGNATURE----- 2 | Version: GnuPG v1.4.11 (GNU/Linux) 3 | 4 | iQEcBAABAgAGBQJQtt0sAAoJEOxwYAq7Gjj9KMQH/351BGiPev2fKIspjSmBluLk 5 | tW5IrO3FpNFg+aPQdVvXS655ldeVlMXT5+qvRzK9OEorHhfQTDmBbP2FBRDGtq/D 6 | mlQAZoI+6Q2LuqmY5wWSUTQCyondfSyrV28UJlpr11f2srfmnMuHnhY/odj5hbEz 7 | ajtPAtyFjDVCcdG/NYjs0XOgLfvgHT6gdA8+deYib+PjzI2K0Ff/ksmcUVmcE+3u 8 | UZrrdmI+bc5oFYdug7NeYHpVafGYeAMdFY1fl7VLqwOADPryZlh0ToXu437m5Zcm 9 | e6EYP2RoSf1TgalTnS1u7uYgsQgTPYa6mlI9o77JjK/3NkAAY3DkqER1VXHi2Xc= 10 | =x14b 11 | -----END PGP SIGNATURE----- 12 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject protoflex/parse-ez "0.4.2" 2 | :description "Clojure Parser Library" 3 | :url "https://github.com/protoflex/parse-ez" 4 | :license {:name "Eclipse Public License"} 5 | :dependencies [[org.clojure/clojure "1.4.0"]] 6 | :dev-dependencies [[lein-autodoc "0.9.0"] 7 | [lein-clojars "0.6.0"]] 8 | :jvm-opts ["-Xmx512m"] 9 | :warn-on-reflection true 10 | :autodoc { :name "Parse Library", :page-title "Parse API Documentation" 11 | :copyright "Protoflex Software"} 12 | ) 13 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | parse-ez 4 | 5 | 6 | 7 | 8 | 9 | ccw.builder 10 | 11 | 12 | 13 | 14 | ccw.leiningen.builder 15 | 16 | 17 | 18 | 19 | org.eclipse.jdt.core.javabuilder 20 | 21 | 22 | 23 | 24 | 25 | org.eclipse.jdt.core.javanature 26 | ccw.leiningen.nature 27 | ccw.nature 28 | 29 | 30 | -------------------------------------------------------------------------------- /bin/protoflex/examples/csv_parse.clj: -------------------------------------------------------------------------------- 1 | (ns protoflex.examples.csv_parse 2 | (:use [protoflex.parse])) 3 | 4 | (declare detect-sep csv-1) 5 | 6 | (defn csv 7 | "Reads and returns one or more records as a vector of vector of field-values" 8 | ([] (csv (no-trim #(detect-sep)))) 9 | ([sep] (multi* (fn [] (no-trim #(csv-1 sep)))))) 10 | 11 | (defn csv-1 12 | "Reads and returns the fields of one record (line)" 13 | [sep] (sep-by #(any-string sep) #(chr sep))) 14 | 15 | (defn detect-sep 16 | "Detects the separator used in a csv file (a comma or a tab)" 17 | [] (let [m (mark-pos) 18 | s (attempt #(any dq-str sq-str)) 19 | s (if s s (no-trim #(read-to-re #",|\t"))) 20 | sep (read-ch)] 21 | (back-to-mark m) 22 | sep)) 23 | -------------------------------------------------------------------------------- /src/protoflex/examples/csv_parse.clj: -------------------------------------------------------------------------------- 1 | (ns protoflex.examples.csv_parse 2 | (:use [protoflex.parse])) 3 | 4 | (declare detect-sep csv-1) 5 | 6 | (defn csv 7 | "Reads and returns one or more records as a vector of vector of field-values" 8 | ([] (csv (no-trim #(detect-sep)))) 9 | ([sep] (multi* (fn [] (no-trim #(csv-1 sep)))))) 10 | 11 | (defn csv-1 12 | "Reads and returns the fields of one record (line)" 13 | [sep] (sep-by #(any-string sep) #(chr sep))) 14 | 15 | (defn detect-sep 16 | "Detects the separator used in a csv file (a comma or a tab)" 17 | [] (let [m (mark-pos) 18 | s (attempt #(any dq-str sq-str)) 19 | s (if s s (no-trim #(read-to-re #",|\t"))) 20 | sep (read-ch)] 21 | (back-to-mark m) 22 | sep)) 23 | -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /bin/protoflex/examples/xml_parse.clj: -------------------------------------------------------------------------------- 1 | (ns protoflex.examples.xml_parse 2 | (:use [protoflex.parse])) 3 | 4 | (declare pi prolog element attributes children-and-close cdata elem-or-text close-tag) 5 | 6 | (defn parse-xml [xml-str] 7 | (parse #(between prolog element pi) xml-str :blk-cmt-delim [""] :line-cmt-start nil)) 8 | 9 | (defn- pi [] (while (starts-with? ""))) 10 | 11 | (defn- prolog [] (pi) (attempt #(regex #"(?s))|(.*?\]\s*>)")) (pi)) 12 | 13 | (def name-start ":A-Z_a-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD") 14 | 15 | (def name-char (str name-start "\\-.0-9\\xB7\\u0300-\\u036F\\u203F-\\u2040")) 16 | 17 | (def name-re (-> (format "[%s][%s]*" name-start name-char) re-pattern)) 18 | 19 | (defn element [] 20 | (let [tag (do (chr \<) (regex name-re)) 21 | attrs (attributes) 22 | children (look-ahead* [ 23 | ">" #(children-and-close tag) 24 | "/>" (fn [] [])])] 25 | {:tag tag, :attributes attrs, :children children})) 26 | 27 | (defn attr [] 28 | (let [n (regex name-re) _ (chr \=) 29 | v (any sq-str dq-str)] 30 | [n v])) 31 | 32 | (defn attributes [] (apply hash-map (flatten (multi* attr)))) 33 | 34 | (defn- children-and-close [tag] 35 | (let [children (multi* #(between pi elem-or-text pi))] 36 | (close-tag tag) 37 | children)) 38 | 39 | (defn- elem-or-text [] 40 | (look-ahead [ 41 | "")] (string "]]>") txt)) 49 | 50 | (defn- close-tag [tag] 51 | (string (str ")) 53 | 54 | -------------------------------------------------------------------------------- /src/protoflex/examples/xml_parse.clj: -------------------------------------------------------------------------------- 1 | (ns protoflex.examples.xml_parse 2 | (:use [protoflex.parse])) 3 | 4 | (declare pi prolog element attributes children-and-close cdata elem-or-text close-tag) 5 | 6 | (defn parse-xml [xml-str] 7 | (parse #(between prolog element pi) xml-str :blk-cmt-delim [""] :line-cmt-start nil)) 8 | 9 | (defn- pi [] (while (starts-with? ""))) 10 | 11 | (defn- prolog [] (pi) (attempt #(regex #"(?s))|(.*?\]\s*>)")) (pi)) 12 | 13 | (def name-start ":A-Z_a-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD") 14 | 15 | (def name-char (str name-start "\\-.0-9\\xB7\\u0300-\\u036F\\u203F-\\u2040")) 16 | 17 | (def name-re (-> (format "[%s][%s]*" name-start name-char) re-pattern)) 18 | 19 | (defn element [] 20 | (let [tag (do (chr \<) (regex name-re)) 21 | attrs (attributes) 22 | children (look-ahead* [ 23 | ">" #(children-and-close tag) 24 | "/>" (fn [] [])])] 25 | {:tag tag, :attributes attrs, :children children})) 26 | 27 | (defn attr [] 28 | (let [n (regex name-re) _ (chr \=) 29 | v (any sq-str dq-str)] 30 | [n v])) 31 | 32 | (defn attributes [] (apply hash-map (flatten (multi* attr)))) 33 | 34 | (defn- children-and-close [tag] 35 | (let [children (multi* #(between pi elem-or-text pi))] 36 | (close-tag tag) 37 | children)) 38 | 39 | (defn- elem-or-text [] 40 | (look-ahead [ 41 | "")] (string "]]>") txt)) 49 | 50 | (defn- close-tag [tag] 51 | (string (str ")) 53 | 54 | -------------------------------------------------------------------------------- /ReleaseNotes.md: -------------------------------------------------------------------------------- 1 | # Parse-EZ Release Notes 2 | 3 | ## Version 0.3.0 4 | 5 | ### Main Additions 6 | Version 0.3.0 of Parse-EZ adds macro versions of parse combinator functions 7 | to make it easy to nest calls to parse combinators without having to write 8 | nested anonymous functions using the "(fn [] ...)" syntax. Note that Clojure 9 | does not allow nesting of anonymous functions of "#(...)" forms. Whereas 10 | the existing parse combinators take parse functions as arguments and actually 11 | perform parsing and return the parse results, the newly added macros take 12 | parse expressions as arguments and return parse functions (to be passed 13 | to other parse combinators). These macros are named the same as the 14 | corresponding parse combinators but with an underscore ("\_") suffix. For example 15 | the macro version of `any` is named `any_`. 16 | 17 | ### Miscellaneous Changes 18 | 19 | The following minor changes/additions are made in the current release: 20 | 21 | - regex function now also accepts string regular expressions (in addition 22 | to patterns typically passed using #"..." forms). However, if you use 23 | strings, you would have to escape backslashes as you would do in Java. 24 | 25 | ## Version 0.3.5 26 | 27 | - Added `sep-by*` function with a slightly different behavior from `sep-by` 28 | - Bug fixes for `with-trim-on` and `with-trim-off`; they were not properly 29 | resetting the auto-trim option to original value. 30 | 31 | ## Version 0.4.1 32 | 33 | - Added functionality to commit to a particular parse branch and prevent 34 | the parser from trying alternatives at higher levels on parse-failure. 35 | See the new `commit` and `commit-on` functions. 36 | 37 | - Other new functions: `with-follow`, `with-follow*`, `with-no-follow`, 38 | `ident`, `key-word`, `semi` `comma`, `dot`, `colon`, `popen`, `pclose`, 39 | `bopen`, `bclose`, `sqopen`, `sqclose`, `aopen`, `aclose`, and `equal`. 40 | 41 | - Bug fixes for `sep-by`, `any-string` and `line-cmt`. 42 | 43 | - Leiningen coordinates: group-id 'protoflex' should be included now. 44 | Eg: [protoflex/parse-ez 0.4.2], instead of [parse-ez 0.3.6]. 45 | -------------------------------------------------------------------------------- /.settings/ccw.repl.cmdhistory.prefs: -------------------------------------------------------------------------------- 1 | cmdhistory=["(run-tests)" "(+ 2 2)" "(ns-publics)" "(ns-publics *ns*)" "*ns*" "(in-ns 'protoflex.test-parse)" "(ns-publics *ns*)" "*ns*" "(load-file \\"test/protoflex/test_parse.clj\\")" "(run-tests)" "*ns*" "(in-ns 'protoflex.test-parse)" "(run-tests)" "*ns*" "(in-ns 'user)" "*ns*" "(run-tests)" "*ns*" "(+ 2 2)" ";; Switching to protoflex.parse namespace" ";; Switching to protoflex.test_parse namespace" "(run-tests)" ";; Switching to protoflex.parse namespace" ";; Switching to protoflex.test_parse namespace" "(run-tests)" "(def fns (->fns (string \\"a\\") (string \\"b\\")))" "(parse (first fns) \\"a\\")" "(first fns)" "(def fns (->fns (string \\"a\\") (string \\"b\\")))" "(first fns)" "(def fns (->fns (string \\"a\\") (string \\"b\\")))" "(->fns '(string \\"a\\") '(string \\"b\\"))" "(parse (first *1) \\"a\\")" "(->fns (string \\"a\\") (string \\"b\\"))" ";; Switching to protoflex.test_parse namespace" "(run-tests)" ";; Switching to protoflex.examples.csv_parse namespace" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" "(defn any-string* [sep] (any \#(any-string sep) \#(multi* (read-n 1))))" "(defn csv-1 [sep] (sep-by \#(any-string* sep) \#(chr sep)))" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" "(.printStackTrace *e)" "(defn any-string* [sep] (any \#(any-string sep) (fn [] (multi* \#(read-n 1))))\\n )" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" "(defn any-string* [sep] (any \#(any-string sep) \#(regex \#\\".*\\")))" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" ";; Switching to protoflex.examples.csv_parse namespace" "(defn any-string* [sep] (any \#(regex \#\\".*\\")))" "(parse \#(any-string* \\",\\") \\"abc\\" )" "(parse \#(any-string* \\",\\") \\"\\" )" ";; Switching to protoflex.parse namespace" ";; Switching to protoflex.examples.csv_parse namespace" "(parse csv \\"foo,bar\\\\nbaz,boo\\\\n\\")" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" ";; Switching to protoflex.parse namespace" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" ";; Switching to protoflex.examples.csv_parse namespace" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" "(parse csv \\"foo,bar\\\\nbaz,boo\\\\n\\")" ";; Switching to protoflex.examples.csv_parse namespace" "(parse csv \\"foo,bar\\\\nbaz,boo\\\\n\\")" ";; Switching to protoflex.parse namespace" ";; Switching to protoflex.examples.csv_parse namespace" "(parse csv \\"foo,bar\\\\nbaz,boo\\\\n\\")" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" "(parse csv \#(read-to-re-or-eof \\"XXX\\") \\"foo,bar\\\\nbaz,boo\\")" "(parse \#(read-to-re-or-eof \\"XXX\\") \\"foo,bar\\\\nbaz,boo\\")" "(parse \#(read-to-re-or-eof \\"XXX\\") \\"foo,bar\\\\nbaz,XXXboo\\")" "(parse \#(read-to-re-or-eof \\"XXX\\") \\"foo,bar\\\\nbaz,XXXboo\\" \:eof false)" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" "(parse csv \\"foo,bar\\\\nbaz,boo\\\\n\\")" ";; Switching to protoflex.parse namespace" ";; Switching to protoflex.examples.csv_parse namespace" "(parse csv \\"foo,bar\\\\nbaz,boo\\\\n\\")" "(parse csv \\"foo,bar\\\\nbaz,boo\\")" "(parse csv \\"foo,bar\\\\nbaz,boo\\\\n\\")"] 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /ReleaseNotes.html: -------------------------------------------------------------------------------- 1 |
251 |

Parse-EZ Release Notes

252 | 253 |

Version 0.3.0

254 | 255 |

Main Additions

256 | 257 |

Version 0.3.0 of Parse-EZ adds macro versions of parse combinator functions 258 | to make it easy to nest calls to parse combinators without having to write 259 | nested anonymous functions using the "(fn [] ...)" syntax. Note that Clojure 260 | does not allow nesting of anonymous functions of "#(...)" forms. Whereas 261 | the existing parse combinators take parse functions as arguments and actually 262 | perform parsing and return the parse results, the newly added macros take 263 | parse expressions as arguments and return parse functions (to be passed 264 | to other parse combinators). These macros are named the same as the 265 | corresponding parse combinators but with an underscore ("_") suffix. For example 266 | the macro version of any is named any_.

267 | 268 |

Miscellaneous Changes

269 | 270 |

The following minor changes/additions are made in the current release:

271 | 272 | 277 | 278 |

Version 0.3.5

279 | 280 | 285 | 286 |

Version 0.4.0

287 | 288 | 299 |
-------------------------------------------------------------------------------- /bin/protoflex/test_parse.clj: -------------------------------------------------------------------------------- 1 | ;Copyright (c) Protoflex Software. All rights reserved. 2 | ;The use and distribution terms for this software are covered by the 3 | ;Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) 4 | ;which can be found in the file epl-v10.html at the root of this distribution. 5 | ;By using this software in any fashion, you are agreeing to be bound by 6 | ;the terms of this license. 7 | ;You must not remove this notice, or any other, from this software. 8 | 9 | (ns ^{:doc "Test Parse Library functions" :author "Panduranga Adusumilli"} 10 | protoflex.test_parse 11 | (:use [clojure.test] 12 | [protoflex.parse] 13 | [protoflex.examples.csv_parse] 14 | [protoflex.examples.xml_parse])) 15 | 16 | (defmacro ptest [s expected & body] 17 | (let [res# `(parse #(do ~@body) ~s :eof false)] 18 | `(is (= ~res# ~expected)))) 19 | 20 | (defmacro ptest_ [s expected pf] 21 | (let [res# `(parse ~pf ~s :eof false)] 22 | `(is (= ~res# ~expected)))) 23 | 24 | (defmacro ptest-fail [s & body] 25 | `(is (~'thrown? Exception (parse #(do ~@body) ~s)))) 26 | 27 | (defmacro ptest-fail_ [s & pf] 28 | `(is (~'thrown? Exception (parse ~pf ~s)))) 29 | 30 | (deftest test-chr 31 | (ptest "abc" \a (chr \a)) 32 | (ptest "abc" [\a \b] [(chr \a) (chr \b)]) 33 | (ptest "a b c" [\a \b \c] [(chr \a) (chr \b) (chr \c)]) 34 | (ptest-fail "abc" (chr \b)) 35 | (ptest-fail "" (chr \a)) 36 | ) 37 | 38 | (deftest test-chr-in 39 | (ptest "axy" [\a] [(chr-in "abc")]) 40 | (ptest "axb" [\a] [(chr-in [\b \a \c])]) 41 | (ptest "axc" [\a] [(chr-in #{\c \b \a})]) 42 | (ptest "acdef" [\a \c] [(chr-in "abc") (chr-in "abc")]) 43 | (ptest-fail "xyza" (chr-in "abc")) 44 | (ptest-fail "" (chr-in "abc")) 45 | ) 46 | 47 | (deftest test-string 48 | (ptest "abc" "abc" (string "abc")) 49 | (ptest "abcdef" "abc" (string "abc")) 50 | (ptest "abc def" "abc def" (string "abc def")) 51 | (ptest "abcdef" ["abc" "de"] [(string "abc") (string "de")]) 52 | (ptest "abc def" ["abc" "de"] [(string "abc") (string "de")]) 53 | (ptest "abc/*comment*/def" ["abc" "def"] 54 | [(string "abc") (string "def")]) 55 | (ptest "abc defxyz" ["abc" "def" "x"] 56 | [(string "abc") (string "def") (string "x")]) 57 | (ptest-fail "" (string "abc")) 58 | (ptest-fail "ab c" (string "abc")) 59 | ) 60 | 61 | (deftest test-string-in 62 | (ptest "abcxyzdef" "abc" (string-in ["abc" "def"])) 63 | (ptest "abcxyzdef" "abc" (string-in ["def" "abc"])) 64 | (ptest ">>=" ">>=" (string-in [">>" ">>="])) 65 | (ptest ">>=" ">>" (string-in-ord [">>" ">>="])) 66 | (ptest "abcdefxyz" ["abc" "def"] 67 | [(string-in ["abc" "def"]) (string-in ["abc" "def"])]) 68 | (ptest-fail "xyzabc" (string-in ["abc" "def"])) 69 | ) 70 | 71 | ;; TODO: test cases for custom word-reader 72 | (deftest test-word 73 | (ptest "abc def" ["abc" "def"] [(word "abc") (word "def")]) 74 | (ptest "abc /*comment*/ def" ["abc" "def"] [(word "abc") (word "def")]) 75 | (ptest-fail "abcdef" (word "abc")) 76 | (ptest-fail "abc d" (word "abc d")) 77 | ) 78 | 79 | (deftest test-word-in 80 | (ptest "abc xyz" "abc" (word-in ["abc" "def" "ghi"])) 81 | (ptest "ghi xyz" "ghi" (word-in ["abc" "def" "ghi"])) 82 | (ptest "abc def ghi" ["abc" "def"] 83 | (let [words ["abc" "def" "ghi"]] [(word-in words) (word-in words)])) 84 | (ptest-fail "abcdef" (word-in ["abc" "def"])) 85 | (ptest-fail "xyzabcdef" (word-in ["abc" "def"])) 86 | ) 87 | 88 | (deftest test-between 89 | (ptest "(1234)" 1234 (parens number)) 90 | (ptest "{1234}" 1234 (braces number)) 91 | (ptest "[1234]" 1234 (sq-brackets number)) 92 | (ptest "<1234>" 1234 (ang-brackets number)) 93 | ) 94 | 95 | (deftest test-regex 96 | (ptest "abc def123" "abc def" (regex #"\w+\s+([a-z]*)")) 97 | (ptest "abc def123" "abc def" (regex "\\w+\\s+([a-z]*)")) 98 | (ptest "abc def123 456" ["abc" "def123" "456"] 99 | [(regex #"\w+") (regex #"\w+") (regex #"\d+")]) 100 | (ptest "abc def123 456" ["abc" "def123" "456"] 101 | [(regex "\\w+") (regex "\\w+") (regex "\\d+")]) 102 | (ptest "abc def123" "123" (regex #"(a\w+)\s+[a-z]*(\d+)" 2)) 103 | (ptest "abc def123" "123" (regex "(a\\w+)\\s+[a-z]*(\\d+)" 2)) 104 | (ptest-fail "abcdef123" (regex #"\w+\s+([a-z]*)")) 105 | (ptest-fail "abcdef123" (regex "\\w+\\s+([a-z]*)")) 106 | ) 107 | 108 | 109 | (deftest test-integer 110 | (ptest "123 abc" 123 (integer)) 111 | (ptest "-123 abc" -123 (integer)) 112 | (ptest-fail "a123" (integer)) 113 | (ptest-fail "123.45" (integer)) 114 | (ptest-fail "123abc" (integer)) 115 | ) 116 | 117 | (deftest test-decimal 118 | (ptest "123 abc" 123.0 (decimal)) 119 | (ptest "123.45 abc" 123.45 (decimal)) 120 | (ptest "-123.45 abc" -123.45 (decimal)) 121 | (ptest-fail " 123abc" (decimal)) 122 | (ptest-fail " 123.45abc" (decimal)) 123 | ) 124 | 125 | (deftest test-number 126 | (ptest "123" 123 (number)) 127 | (ptest "123.5" 123.5 (number)) 128 | ) 129 | 130 | (deftest test-dq-str 131 | (ptest "\"abc\"def" "abc" (dq-str)) 132 | (ptest (str \" "ab" \\ \" \c \") (str "ab" \" \c) (dq-str)) 133 | (ptest-fail "abcdef" (dq-str)) 134 | ) 135 | 136 | (deftest test-sq-str 137 | (ptest "'abc'def" "abc" (sq-str)) 138 | (ptest (str "'abc" \\ "'def'ghi") "abc'def" (sq-str)) 139 | (ptest-fail "abcdef" (sq-str)) 140 | ) 141 | 142 | (deftest test-read-to 143 | (ptest "abcdef12345" ["abcdef" 12345] [(read-to "123") (integer)]) 144 | (ptest-fail "abcdef12345" (read-to "999")) 145 | ) 146 | 147 | (deftest test-skip-over 148 | (ptest "abcdef" ["abc" "def"] [(skip-over "abc") (string "def")]) 149 | (ptest "abc def" ["abc" "def"] [(skip-over "abc") (string "def")]) 150 | (ptest-fail "abc def" (with-trim-off (skip-over "abc") (string "def"))) 151 | ) 152 | 153 | (deftest test-read-re 154 | (ptest "abc123" "abc" (read-re #"(\D+)")) 155 | (ptest "abc123" "abc" (read-re "(\\D+)")) 156 | ) 157 | 158 | (deftest test-read-to-re 159 | (ptest "123abc" "123" (read-to-re #"(\D+)")) 160 | (ptest "123abc" "123" (read-to-re "(\\D+)")) 161 | ) 162 | 163 | (deftest test-skip-over-re 164 | (ptest "123abc456" "123abc" (skip-over-re #"(\D+)")) 165 | (ptest "123abc456" "123abc" (skip-over-re "(\\D+)")) 166 | (ptest "123abc" "123" (skip-over-re #"(\d+)")) 167 | (ptest "123abc" "123" (skip-over-re "(\\d+)")) 168 | (ptest-fail "123abc" (skip-over-re #"([x-z]+)")) 169 | (ptest-fail "123abc" (skip-over-re "([x-z]+)")) 170 | ) 171 | 172 | (deftest test-starts-with? 173 | (ptest "abcdef" true (starts-with? "abc")) 174 | (ptest "abcdef" false (starts-with? "def")) 175 | (ptest "abcdef" true (starts-with? "")) 176 | (ptest "" false (starts-with? "def")) 177 | ) 178 | 179 | (deftest test-starts-with-re? 180 | (ptest "123abc" true (starts-with-re? #"\d+")) 181 | (ptest "123abc" true (starts-with-re? "\\d+")) 182 | (ptest "abc123" false (starts-with-re? #"\\d+")) 183 | (ptest "abc123" false (starts-with-re? "\\d+")) 184 | ) 185 | 186 | (deftest test-read-n 187 | (ptest "abcdef" "abc" (read-n 3)) 188 | (ptest "abcdef" "a" (read-n 1)) 189 | (ptest "abcdef" nil (read-n 0)) 190 | (ptest-fail "abcdef" (read-n 20)) 191 | ) 192 | 193 | (deftest test-read-ch 194 | (ptest "abcdef" [\a \b \c] [(read-ch) (read-ch) (read-ch)]) 195 | (ptest "a b c" [\a \b \c] [(read-ch) (read-ch) (read-ch)]) 196 | (ptest "a b c" [\a \space \b] [(read-ch true) (read-ch true) (read-ch true)]) 197 | (ptest-fail "" (read-ch)) 198 | ) 199 | 200 | (deftest test-read-ch-in-set 201 | (ptest "abcdef" [\a \b] [(read-ch-in-set #{\a \b \c}) 202 | (read-ch-in-set #{\a \b \c})]) 203 | (ptest-fail "axyz" (read-ch-in-set #{\x \y \z})) 204 | ) 205 | 206 | (deftest test-blk-cmt 207 | (ptest "abc/*comment*/def" ["abc" "/*comment*/"] 208 | (with-trim-off [(string "abc") (blk-cmt "/*" "*/")])) 209 | 210 | (ptest "abcdef" ["abc" ""] 211 | (with-trim-off [(string "abc") (blk-cmt "")])) 212 | 213 | (ptest "abc/*comment*/def" ["abc" "def"] [(string "abc") (string "def")]) 214 | (ptest-fail "abc/*comment*/def" [(string "abc") (blk-cmt "/*" "*/")]) 215 | ) 216 | 217 | (deftest test-line-cmt 218 | (ptest "abc//line comment" ["abc" "//line comment"] 219 | (with-trim-off [(string "abc") (line-cmt "//")])) 220 | (ptest "abc//line comment\ndef" ["abc" "//line comment\n"] 221 | (with-trim-off [(string "abc") (line-cmt "//")])) 222 | ) 223 | 224 | (deftest test-ws 225 | (ptest "abc /*xyz*/ ghi //kkk" ["abc" " /*xyz*/ " "ghi" " //kkk"] 226 | (with-trim-off [(string "abc") (ws) (string "ghi") (ws)])) 227 | ) 228 | 229 | 230 | (deftest test-trim-on 231 | (ptest " abc def" ["abc" "def"] 232 | (with-trim-on [(string "abc") (string "def")])) 233 | 234 | (ptest " abc def " ["abc" "def"] 235 | (with-trim-on [(string "abc") (string "def")])) 236 | ) 237 | 238 | (deftest test-trim-off 239 | ; note: the leading whitespaces are trimmed before with-trim-off is called 240 | (ptest " abc def" ["abc" " def"] 241 | (with-trim-off [(string "abc") (string " def")])) 242 | (is (thrown? Exception (parse #(string "abc") " abc" :auto-trim false))) 243 | (is (thrown? Exception (parse #(with-trim-off (string "abc")) " abc" :auto-trim false))) 244 | ) 245 | 246 | (deftest test-no-trim 247 | (ptest_ "123 456" [123 " " 456] (no-trim_ (series number #(string " ") number))) 248 | (ptest-fail "123 456" (series number #(string " ") number)) 249 | (ptest_ "123 \n 456" [123 "\n" 456] (no-trim-nl_ (series number #(string "\n") number))) 250 | ) 251 | 252 | (deftest test-lexeme 253 | (let [wr #(regex #"\w+")] 254 | (ptest "abc/*comment*/def" ["abc" "def"] 255 | (with-trim-off [(lexeme wr) (wr)])) 256 | (ptest "abc def" ["abc" "def"] 257 | (with-trim-off [(lexeme wr) (wr)])) 258 | ) 259 | ) 260 | 261 | (deftest test-attempt 262 | (ptest "abc" "abc" (attempt #(string "abc"))) 263 | (ptest_ "abc" "abc" (attempt_ (string "abc"))) 264 | (ptest "abc" nil (attempt #(string "def"))) 265 | (ptest_ "abc" nil (attempt_ (string "def"))) 266 | ) 267 | 268 | (deftest test-opt 269 | (ptest "abc" "abc" (opt #(string "abc"))) 270 | (ptest_ "abc" "abc" (opt_ (string "abc"))) 271 | (ptest "abc" 123 (opt #(string "def") 123)) 272 | (ptest_ "abc" 123 (opt_ (string "def") 123)) 273 | ) 274 | 275 | (deftest test-any 276 | (ptest "abcdef" "abc" (any #(regex #"\d+") #(string "abc"))) 277 | (ptest_ "abcdef" "abc" (any_ (regex #"\d+") (string "abc"))) 278 | (ptest "123abcdef" "123" (any #(regex #"\d+") #(string "abc"))) 279 | (ptest_ "123abcdef" "123" (any_ (regex #"\d+") (string "abc"))) 280 | (ptest-fail "abcdef" (any #(regex #"\d+") #(string "xyz"))) 281 | (ptest-fail_ "abcdef" (any_ (regex #"\d+") (string "xyz"))) 282 | ) 283 | 284 | (deftest test-look-ahead 285 | (let [la-spec ["a" #(string "abc") "x" #(string "xyz")]] 286 | (ptest "abc def" "abc" (look-ahead la-spec)) 287 | (ptest "xyz def" "xyz" (look-ahead la-spec)) 288 | (ptest-fail "pqr def" (look-ahead la-spec)) 289 | )) 290 | 291 | (deftest test-look-ahead* 292 | (let [la-spec [ 293 | "0x" #(-> (regex #"[0-9a-fA-F]+") (Integer/parseInt 16)) 294 | "0" #(-> (regex #"[0-7]+") (Integer/parseInt 8)) 295 | "" #(number)]] 296 | (ptest "0x123 0123" 16r123 (look-ahead* la-spec)) 297 | (ptest "123 0x123 0123" [123 16r123 8r123] (multi+ #(look-ahead* la-spec))) 298 | )) 299 | 300 | (deftest test-series 301 | (ptest "abc123xxx" ["abc" "123" "xxx"] 302 | (series #(string "abc") #(regex #"\d+") #(word "xxx"))) 303 | (ptest_ "abc123xxx" ["abc" "123" "xxx"] 304 | (series_ (string "abc") (regex #"\d+") (word "xxx"))) 305 | 306 | (ptest-fail "abc123xxx" 307 | (series #(string "abc") #(regex #"[a-w]+") #(word "xxx"))) 308 | (ptest-fail_ "abc123xxx" 309 | (series_ (string "abc") (regex #"[a-w]+") (word "xxx"))) 310 | ) 311 | 312 | (deftest test-multi* 313 | (ptest "abcabcabcdef" ["abc" "abc" "abc"] (multi* #(string "abc"))) 314 | (ptest_ "abcabcabcdef" ["abc" "abc" "abc"] (multi*_ (string "abc"))) 315 | 316 | (ptest "xyzabcabcabc" nil (multi* #(string "abc"))) 317 | (ptest_ "xyzabcabcabc" nil (multi*_ (string "abc"))) 318 | 319 | (ptest "" nil (multi* #(string "abc"))) 320 | (ptest_ "" nil (multi*_ (string "abc"))) 321 | 322 | (ptest "xyzabcabc" ["xyz" ["abc" "abc"]] 323 | [(string "xyz") (multi* #(string "abc"))]) 324 | (ptest "xyzabcabc" ["xyz" ["abc" "abc"]] 325 | (series #(string "xyz") (multi*_ (string "abc")))) 326 | (ptest_ "xyzabcabc" ["xyz" ["abc" "abc"]] 327 | (fn [] (series #(string "xyz") (multi*_ (string "abc"))))) 328 | ) 329 | 330 | (deftest test-multi+ 331 | (ptest "abcabcabcdef" ["abc" "abc" "abc"] (multi+ #(string "abc"))) 332 | (ptest_ "abcabcabcdef" ["abc" "abc" "abc"] (multi+_ (string "abc"))) 333 | 334 | (ptest-fail "xyzabcabcabc" (multi+ #(string "abc"))) 335 | (ptest-fail_ "xyzabcabcabc" (multi+_ (string "abc"))) 336 | 337 | (ptest-fail "" (multi+ #(string "abc"))) 338 | (ptest-fail_ "" (multi+_ (string "abc"))) 339 | 340 | (ptest "xyzabcabc" ["xyz" ["abc" "abc"]] 341 | [(string "xyz") (multi+ #(string "abc"))]) 342 | (ptest_ "xyzabcabc" ["xyz" ["abc" "abc"]] 343 | (fn [] (series #(string "xyz") (multi+_ (string "abc"))))) 344 | ) 345 | 346 | (deftest test-times 347 | (ptest "aaaa" [\a \a \a \a] (times 4 #(chr \a))) 348 | (ptest_ "aaaa" [\a \a \a \a] (times_ 4 (chr \a))) 349 | 350 | (ptest "aaaa" ["aa" "aa"] (times 2 #(string "aa"))) 351 | (ptest_ "aaaa" ["aa" "aa"] (times_ 2 (string "aa"))) 352 | 353 | (ptest "aa aa" ["aa" "aa"] (times 2 #(string "aa"))) 354 | (ptest_ "aa aa" ["aa" "aa"] (times_ 2 (string "aa"))) 355 | ) 356 | 357 | (deftest test-expect 358 | (ptest "230" "230" (expect "integer" #(regex #"\d+"))) 359 | (ptest_ "230" "230" (expect_ "integer" (regex #"\d+"))) 360 | 361 | (try (parse (fn [] (expect "integer" #(regex #"\d+"))) "x230") 362 | (catch Exception ex (is (.contains (.getMessage ex) "Expecting \"integer\"")))) 363 | (try (parse (expect_ "integer" (regex #"\d+")) "x230") 364 | (catch Exception ex (is (.contains (.getMessage ex) "Expecting \"integer\"")))) 365 | ) 366 | 367 | (deftest test-parse_ 368 | (is (= (parse_ [(string "abc"), (string "def")] "abc def") ["abc" "def"])) 369 | (is (= (parse_ [(regex #"\d+"), (regex "\\S+")] "123 def") ["123" "def"])) 370 | (is (= (parse_ (multi+ (any_ (regex #"\d+") (regex #"\D+"))) "123 def456") ["123" "def" "456"])) 371 | ) 372 | 373 | (deftest test-sep-by 374 | (ptest "123.45" [123.45] (sep-by decimal #(chr \,))) 375 | (ptest "123.45, 234.56,345" [123.45 234.56 345.0] (sep-by decimal #(chr \,))) 376 | (ptest-fail "123.45, 234, 345abc" (sep-by decimal #(chr \,))) 377 | ) 378 | 379 | (deftest test-detect-sep 380 | (ptest "abc,ghi" \, (detect-sep)) 381 | (ptest "abc\tghi" \tab, (detect-sep)) 382 | (ptest-fail "abcghi" (detect-sep)) 383 | (ptest-fail "\"abc,ghi\"" (detect-sep)) 384 | ) 385 | 386 | (deftest test-csv 387 | (let [s1 "'1abc','def',ghi\n2abc,def,ghi\n" 388 | s2 "\"1a,bc\",def,ghi\n2abc,def,ghi\n" 389 | s3 "1abc\tdef\tghi\n2abc\tdef\tghi\n" 390 | s4 "\"1a\tbc\"\tdef\tghi\n2abc\tdef\tghi\n"] 391 | 392 | (ptest s1 [["1abc" "def" "ghi"]["2abc" "def" "ghi"]] (csv)) 393 | (ptest s2 [["1a,bc" "def" "ghi"]["2abc" "def" "ghi"]] (csv)) 394 | 395 | (ptest s3 [["1abc" "def" "ghi"]["2abc" "def" "ghi"]] (csv)) 396 | (ptest s4 [["1a\tbc" "def" "ghi"]["2abc" "def" "ghi"]] (csv)) 397 | )) 398 | 399 | (deftest test-expr 400 | (ptest "3+2-5" ["-" ["+" 3 2] 5] (expr)) 401 | (ptest "3+2*5" ["+" 3 ["*" 2 5]] (expr)) 402 | (ptest "-3+2*5" ["+" -3 ["*" 2 5]] (expr)) 403 | ) 404 | 405 | (deftest test-eval-expr 406 | (is (= (eval-expr "3") 3)) 407 | (is (= (eval-expr "-3") -3)) 408 | (is (= (eval-expr "3+2") 5)) 409 | (is (= (eval-expr "3+ -2") 1)) 410 | (is (= (eval-expr "3+-2") 1)) 411 | (is (= (eval-expr "3+2*3") 9)) 412 | (is (= (eval-expr "3+2*3+4") 13)) 413 | (is (= (eval-expr "(3+2)*3+4") 19)) 414 | (is (= (eval-expr "(3+2)*(3+4)") 35)) 415 | (is (= (eval-expr "(3*6/2)") 9)) 416 | (is (= (eval-expr "(36/2*3)") 54)) 417 | ) 418 | 419 | (use '[clojure.java.shell :only [sh]]) 420 | (defn test-xml [] 421 | (let [ dir (str (System/getProperty "user.home") "/test/data/xmltest/valid") 422 | files (-> (sh "find" dir ) ^String (:out) (.split "\n")) 423 | files (filter #(.endsWith ^String %1 ".xml") files) 424 | ignore #{ 425 | "/sa/049.xml" 426 | "/sa/050.xml" 427 | "/sa/051.xml" 428 | "/sa/114.xml" 429 | }] 430 | (doseq [f files] (if (not (get ignore (subs f (count dir)))) 431 | (do (println f) (parse-xml (slurp f))))))) 432 | 433 | -------------------------------------------------------------------------------- /test/protoflex/test_parse.clj: -------------------------------------------------------------------------------- 1 | ;Copyright (c) Protoflex Software. All rights reserved. 2 | ;The use and distribution terms for this software are covered by the 3 | ;Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) 4 | ;which can be found in the file epl-v10.html at the root of this distribution. 5 | ;By using this software in any fashion, you are agreeing to be bound by 6 | ;the terms of this license. 7 | ;You must not remove this notice, or any other, from this software. 8 | 9 | (ns ^{:doc "Test Parse Library functions" :author "Panduranga Adusumilli"} 10 | protoflex.test_parse 11 | (:use [clojure.test] 12 | [protoflex.parse] 13 | [protoflex.examples.csv_parse] 14 | [protoflex.examples.xml_parse])) 15 | 16 | (defmacro ptest [s expected & body] 17 | (let [res# `(parse #(do ~@body) ~s :eof false)] 18 | `(is (= ~res# ~expected)))) 19 | 20 | (defmacro ptest_ [s expected pf] 21 | (let [res# `(parse ~pf ~s :eof false)] 22 | `(is (= ~res# ~expected)))) 23 | 24 | (defmacro ptest-fail [s & body] 25 | `(is (~'thrown? Exception (parse #(do ~@body) ~s)))) 26 | 27 | (defmacro ptest-fail_ [s & pf] 28 | `(is (~'thrown? Exception (parse ~pf ~s)))) 29 | 30 | (deftest test-chr 31 | (ptest "abc" \a (chr \a)) 32 | (ptest "abc" [\a \b] [(chr \a) (chr \b)]) 33 | (ptest "a b c" [\a \b \c] [(chr \a) (chr \b) (chr \c)]) 34 | (ptest-fail "abc" (chr \b)) 35 | (ptest-fail "" (chr \a)) 36 | ) 37 | 38 | (deftest test-chr-in 39 | (ptest "axy" [\a] [(chr-in "abc")]) 40 | (ptest "axb" [\a] [(chr-in [\b \a \c])]) 41 | (ptest "axc" [\a] [(chr-in #{\c \b \a})]) 42 | (ptest "acdef" [\a \c] [(chr-in "abc") (chr-in "abc")]) 43 | (ptest-fail "xyza" (chr-in "abc")) 44 | (ptest-fail "" (chr-in "abc")) 45 | ) 46 | 47 | (deftest test-string 48 | (ptest "abc" "abc" (string "abc")) 49 | (ptest "abcdef" "abc" (string "abc")) 50 | (ptest "abc def" "abc def" (string "abc def")) 51 | (ptest "abcdef" ["abc" "de"] [(string "abc") (string "de")]) 52 | (ptest "abc def" ["abc" "de"] [(string "abc") (string "de")]) 53 | (ptest "abc/*comment*/def" ["abc" "def"] 54 | [(string "abc") (string "def")]) 55 | (ptest "abc defxyz" ["abc" "def" "x"] 56 | [(string "abc") (string "def") (string "x")]) 57 | (ptest-fail "" (string "abc")) 58 | (ptest-fail "ab c" (string "abc")) 59 | ) 60 | 61 | (deftest test-string-in 62 | (ptest "abcxyzdef" "abc" (string-in ["abc" "def"])) 63 | (ptest "abcxyzdef" "abc" (string-in ["def" "abc"])) 64 | (ptest ">>=" ">>=" (string-in [">>" ">>="])) 65 | (ptest ">>=" ">>" (string-in-ord [">>" ">>="])) 66 | (ptest "abcdefxyz" ["abc" "def"] 67 | [(string-in ["abc" "def"]) (string-in ["abc" "def"])]) 68 | (ptest-fail "xyzabc" (string-in ["abc" "def"])) 69 | ) 70 | 71 | ;; TODO: test cases for custom word-reader 72 | (deftest test-word 73 | (ptest "abc def" ["abc" "def"] [(word "abc") (word "def")]) 74 | (ptest "abc /*comment*/ def" ["abc" "def"] [(word "abc") (word "def")]) 75 | (ptest-fail "abcdef" (word "abc")) 76 | (ptest-fail "abc d" (word "abc d")) 77 | ) 78 | 79 | (deftest test-word-in 80 | (ptest "abc xyz" "abc" (word-in ["abc" "def" "ghi"])) 81 | (ptest "ghi xyz" "ghi" (word-in ["abc" "def" "ghi"])) 82 | (ptest "abc def ghi" ["abc" "def"] 83 | (let [words ["abc" "def" "ghi"]] [(word-in words) (word-in words)])) 84 | (ptest-fail "abcdef" (word-in ["abc" "def"])) 85 | (ptest-fail "xyzabcdef" (word-in ["abc" "def"])) 86 | ) 87 | 88 | (deftest test-between 89 | (ptest "(1234)" 1234 (parens number)) 90 | (ptest "{1234}" 1234 (braces number)) 91 | (ptest "[1234]" 1234 (sq-brackets number)) 92 | (ptest "<1234>" 1234 (ang-brackets number)) 93 | ) 94 | 95 | (deftest test-regex 96 | (ptest "abc def123" "abc def" (regex #"\w+\s+([a-z]*)")) 97 | (ptest "abc def123" "abc def" (regex "\\w+\\s+([a-z]*)")) 98 | (ptest "abc def123 456" ["abc" "def123" "456"] 99 | [(regex #"\w+") (regex #"\w+") (regex #"\d+")]) 100 | (ptest "abc def123 456" ["abc" "def123" "456"] 101 | [(regex "\\w+") (regex "\\w+") (regex "\\d+")]) 102 | (ptest "abc def123" "123" (regex #"(a\w+)\s+[a-z]*(\d+)" 2)) 103 | (ptest "abc def123" "123" (regex "(a\\w+)\\s+[a-z]*(\\d+)" 2)) 104 | (ptest-fail "abcdef123" (regex #"\w+\s+([a-z]*)")) 105 | (ptest-fail "abcdef123" (regex "\\w+\\s+([a-z]*)")) 106 | ) 107 | 108 | 109 | (deftest test-integer 110 | (ptest "123 abc" 123 (integer)) 111 | (ptest "-123 abc" -123 (integer)) 112 | (ptest-fail "a123" (integer)) 113 | (ptest-fail "123.45" (integer)) 114 | (ptest-fail "123abc" (integer)) 115 | ) 116 | 117 | (deftest test-decimal 118 | (ptest "123 abc" 123.0 (decimal)) 119 | (ptest "123.45 abc" 123.45 (decimal)) 120 | (ptest "-123.45 abc" -123.45 (decimal)) 121 | (ptest-fail " 123abc" (decimal)) 122 | (ptest-fail " 123.45abc" (decimal)) 123 | ) 124 | 125 | (deftest test-number 126 | (ptest "123" 123 (number)) 127 | (ptest "123.5" 123.5 (number)) 128 | ) 129 | 130 | (deftest test-dq-str 131 | (ptest "\"abc\"def" "abc" (dq-str)) 132 | (ptest (str \" "ab" \\ \" \c \") (str "ab" \" \c) (dq-str)) 133 | (ptest-fail "abcdef" (dq-str)) 134 | ) 135 | 136 | (deftest test-sq-str 137 | (ptest "'abc'def" "abc" (sq-str)) 138 | (ptest (str "'abc" \\ "'def'ghi") "abc'def" (sq-str)) 139 | (ptest-fail "abcdef" (sq-str)) 140 | ) 141 | 142 | (deftest test-read-to 143 | (ptest "abcdef12345" ["abcdef" 12345] [(read-to "123") (integer)]) 144 | (ptest-fail "abcdef12345" (read-to "999")) 145 | ) 146 | 147 | (deftest test-skip-over 148 | (ptest "abcdef" ["abc" "def"] [(skip-over "abc") (string "def")]) 149 | (ptest "abc def" ["abc" "def"] [(skip-over "abc") (string "def")]) 150 | (ptest-fail "abc def" (with-trim-off (skip-over "abc") (string "def"))) 151 | ) 152 | 153 | (deftest test-read-re 154 | (ptest "abc123" "abc" (read-re #"(\D+)")) 155 | (ptest "abc123" "abc" (read-re "(\\D+)")) 156 | ) 157 | 158 | (deftest test-read-to-re 159 | (ptest "123abc" "123" (read-to-re #"(\D+)")) 160 | (ptest "123abc" "123" (read-to-re "(\\D+)")) 161 | ) 162 | 163 | (deftest test-skip-over-re 164 | (ptest "123abc456" "123abc" (skip-over-re #"(\D+)")) 165 | (ptest "123abc456" "123abc" (skip-over-re "(\\D+)")) 166 | (ptest "123abc" "123" (skip-over-re #"(\d+)")) 167 | (ptest "123abc" "123" (skip-over-re "(\\d+)")) 168 | (ptest-fail "123abc" (skip-over-re #"([x-z]+)")) 169 | (ptest-fail "123abc" (skip-over-re "([x-z]+)")) 170 | ) 171 | 172 | (deftest test-starts-with? 173 | (ptest "abcdef" true (starts-with? "abc")) 174 | (ptest "abcdef" false (starts-with? "def")) 175 | (ptest "abcdef" true (starts-with? "")) 176 | (ptest "" false (starts-with? "def")) 177 | ) 178 | 179 | (deftest test-starts-with-re? 180 | (ptest "123abc" true (starts-with-re? #"\d+")) 181 | (ptest "123abc" true (starts-with-re? "\\d+")) 182 | (ptest "abc123" false (starts-with-re? #"\\d+")) 183 | (ptest "abc123" false (starts-with-re? "\\d+")) 184 | ) 185 | 186 | (deftest test-read-n 187 | (ptest "abcdef" "abc" (read-n 3)) 188 | (ptest "abcdef" "a" (read-n 1)) 189 | (ptest "abcdef" nil (read-n 0)) 190 | (ptest-fail "abcdef" (read-n 20)) 191 | ) 192 | 193 | (deftest test-read-ch 194 | (ptest "abcdef" [\a \b \c] [(read-ch) (read-ch) (read-ch)]) 195 | (ptest "a b c" [\a \b \c] [(read-ch) (read-ch) (read-ch)]) 196 | (ptest "a b c" [\a \space \b] [(read-ch true) (read-ch true) (read-ch true)]) 197 | (ptest-fail "" (read-ch)) 198 | ) 199 | 200 | (deftest test-read-ch-in-set 201 | (ptest "abcdef" [\a \b] [(read-ch-in-set #{\a \b \c}) 202 | (read-ch-in-set #{\a \b \c})]) 203 | (ptest-fail "axyz" (read-ch-in-set #{\x \y \z})) 204 | ) 205 | 206 | (deftest test-blk-cmt 207 | (ptest "abc/*comment*/def" ["abc" "/*comment*/"] 208 | (with-trim-off [(string "abc") (blk-cmt "/*" "*/")])) 209 | 210 | (ptest "abcdef" ["abc" ""] 211 | (with-trim-off [(string "abc") (blk-cmt "")])) 212 | 213 | (ptest "abc/*comment*/def" ["abc" "def"] [(string "abc") (string "def")]) 214 | (ptest-fail "abc/*comment*/def" [(string "abc") (blk-cmt "/*" "*/")]) 215 | ) 216 | 217 | (deftest test-line-cmt 218 | (ptest "abc//line comment" ["abc" "//line comment"] 219 | (with-trim-off [(string "abc") (line-cmt "//")])) 220 | (ptest "abc//line comment\ndef" ["abc" "//line comment\n"] 221 | (with-trim-off [(string "abc") (line-cmt "//")])) 222 | ) 223 | 224 | (deftest test-ws 225 | (ptest "abc /*xyz*/ ghi //kkk" ["abc" " /*xyz*/ " "ghi" " //kkk"] 226 | (with-trim-off [(string "abc") (ws) (string "ghi") (ws)])) 227 | ) 228 | 229 | 230 | (deftest test-trim-on 231 | (ptest " abc def" ["abc" "def"] 232 | (with-trim-on [(string "abc") (string "def")])) 233 | 234 | (ptest " abc def " ["abc" "def"] 235 | (with-trim-on [(string "abc") (string "def")])) 236 | ) 237 | 238 | (deftest test-trim-off 239 | ; note: the leading whitespaces are trimmed before with-trim-off is called 240 | (ptest " abc def" ["abc" " def"] 241 | (with-trim-off [(string "abc") (string " def")])) 242 | (is (thrown? Exception (parse #(string "abc") " abc" :auto-trim false))) 243 | (is (thrown? Exception (parse #(with-trim-off (string "abc")) " abc" :auto-trim false))) 244 | ) 245 | 246 | (deftest test-no-trim 247 | (ptest_ "123 456" [123 " " 456] (no-trim_ (series number #(string " ") number))) 248 | (ptest-fail "123 456" (series number #(string " ") number)) 249 | (ptest_ "123 \n 456" [123 "\n" 456] (no-trim-nl_ (series number #(string "\n") number))) 250 | ) 251 | 252 | (deftest test-lexeme 253 | (let [wr #(regex #"\w+")] 254 | (ptest "abc/*comment*/def" ["abc" "def"] 255 | (with-trim-off [(lexeme wr) (wr)])) 256 | (ptest "abc def" ["abc" "def"] 257 | (with-trim-off [(lexeme wr) (wr)])) 258 | ) 259 | ) 260 | 261 | (deftest test-attempt 262 | (ptest "abc" "abc" (attempt #(string "abc"))) 263 | (ptest_ "abc" "abc" (attempt_ (string "abc"))) 264 | (ptest "abc" nil (attempt #(string "def"))) 265 | (ptest_ "abc" nil (attempt_ (string "def"))) 266 | ) 267 | 268 | (deftest test-opt 269 | (ptest "abc" "abc" (opt #(string "abc"))) 270 | (ptest_ "abc" "abc" (opt_ (string "abc"))) 271 | (ptest "abc" 123 (opt #(string "def") 123)) 272 | (ptest_ "abc" 123 (opt_ (string "def") 123)) 273 | ) 274 | 275 | (deftest test-any 276 | (ptest "abcdef" "abc" (any #(regex #"\d+") #(string "abc"))) 277 | (ptest_ "abcdef" "abc" (any_ (regex #"\d+") (string "abc"))) 278 | (ptest "123abcdef" "123" (any #(regex #"\d+") #(string "abc"))) 279 | (ptest_ "123abcdef" "123" (any_ (regex #"\d+") (string "abc"))) 280 | (ptest-fail "abcdef" (any #(regex #"\d+") #(string "xyz"))) 281 | (ptest-fail_ "abcdef" (any_ (regex #"\d+") (string "xyz"))) 282 | ) 283 | 284 | (deftest test-look-ahead 285 | (let [la-spec ["a" #(string "abc") "x" #(string "xyz")]] 286 | (ptest "abc def" "abc" (look-ahead la-spec)) 287 | (ptest "xyz def" "xyz" (look-ahead la-spec)) 288 | (ptest-fail "pqr def" (look-ahead la-spec)) 289 | )) 290 | 291 | (deftest test-look-ahead* 292 | (let [la-spec [ 293 | "0x" #(-> (regex #"[0-9a-fA-F]+") (Integer/parseInt 16)) 294 | "0" #(-> (regex #"[0-7]+") (Integer/parseInt 8)) 295 | "" #(number)]] 296 | (ptest "0x123 0123" 16r123 (look-ahead* la-spec)) 297 | (ptest "123 0x123 0123" [123 16r123 8r123] (multi+ #(look-ahead* la-spec))) 298 | )) 299 | 300 | (deftest test-series 301 | (ptest "abc123xxx" ["abc" "123" "xxx"] 302 | (series #(string "abc") #(regex #"\d+") #(word "xxx"))) 303 | (ptest_ "abc123xxx" ["abc" "123" "xxx"] 304 | (series_ (string "abc") (regex #"\d+") (word "xxx"))) 305 | 306 | (ptest-fail "abc123xxx" 307 | (series #(string "abc") #(regex #"[a-w]+") #(word "xxx"))) 308 | (ptest-fail_ "abc123xxx" 309 | (series_ (string "abc") (regex #"[a-w]+") (word "xxx"))) 310 | ) 311 | 312 | (deftest test-multi* 313 | (ptest "abcabcabcdef" ["abc" "abc" "abc"] (multi* #(string "abc"))) 314 | (ptest_ "abcabcabcdef" ["abc" "abc" "abc"] (multi*_ (string "abc"))) 315 | 316 | (ptest "xyzabcabcabc" nil (multi* #(string "abc"))) 317 | (ptest_ "xyzabcabcabc" nil (multi*_ (string "abc"))) 318 | 319 | (ptest "" nil (multi* #(string "abc"))) 320 | (ptest_ "" nil (multi*_ (string "abc"))) 321 | 322 | (ptest "xyzabcabc" ["xyz" ["abc" "abc"]] 323 | [(string "xyz") (multi* #(string "abc"))]) 324 | (ptest "xyzabcabc" ["xyz" ["abc" "abc"]] 325 | (series #(string "xyz") (multi*_ (string "abc")))) 326 | (ptest_ "xyzabcabc" ["xyz" ["abc" "abc"]] 327 | (fn [] (series #(string "xyz") (multi*_ (string "abc"))))) 328 | ) 329 | 330 | (deftest test-multi+ 331 | (ptest "abcabcabcdef" ["abc" "abc" "abc"] (multi+ #(string "abc"))) 332 | (ptest_ "abcabcabcdef" ["abc" "abc" "abc"] (multi+_ (string "abc"))) 333 | 334 | (ptest-fail "xyzabcabcabc" (multi+ #(string "abc"))) 335 | (ptest-fail_ "xyzabcabcabc" (multi+_ (string "abc"))) 336 | 337 | (ptest-fail "" (multi+ #(string "abc"))) 338 | (ptest-fail_ "" (multi+_ (string "abc"))) 339 | 340 | (ptest "xyzabcabc" ["xyz" ["abc" "abc"]] 341 | [(string "xyz") (multi+ #(string "abc"))]) 342 | (ptest_ "xyzabcabc" ["xyz" ["abc" "abc"]] 343 | (fn [] (series #(string "xyz") (multi+_ (string "abc"))))) 344 | ) 345 | 346 | (deftest test-times 347 | (ptest "aaaa" [\a \a \a \a] (times 4 #(chr \a))) 348 | (ptest_ "aaaa" [\a \a \a \a] (times_ 4 (chr \a))) 349 | 350 | (ptest "aaaa" ["aa" "aa"] (times 2 #(string "aa"))) 351 | (ptest_ "aaaa" ["aa" "aa"] (times_ 2 (string "aa"))) 352 | 353 | (ptest "aa aa" ["aa" "aa"] (times 2 #(string "aa"))) 354 | (ptest_ "aa aa" ["aa" "aa"] (times_ 2 (string "aa"))) 355 | ) 356 | 357 | (deftest test-expect 358 | (ptest "230" "230" (expect "integer" #(regex #"\d+"))) 359 | (ptest_ "230" "230" (expect_ "integer" (regex #"\d+"))) 360 | 361 | (try (parse (fn [] (expect "integer" #(regex #"\d+"))) "x230") 362 | (catch Exception ex (is (.contains (.getMessage ex) "Expecting \"integer\"")))) 363 | (try (parse (expect_ "integer" (regex #"\d+")) "x230") 364 | (catch Exception ex (is (.contains (.getMessage ex) "Expecting \"integer\"")))) 365 | ) 366 | 367 | (deftest test-parse_ 368 | (is (= (parse_ [(string "abc"), (string "def")] "abc def") ["abc" "def"])) 369 | (is (= (parse_ [(regex #"\d+"), (regex "\\S+")] "123 def") ["123" "def"])) 370 | (is (= (parse_ (multi+ (any_ (regex #"\d+") (regex #"\D+"))) "123 def456") ["123" "def" "456"])) 371 | ) 372 | 373 | (deftest test-sep-by 374 | (ptest "123.45" [123.45] (sep-by decimal #(chr \,))) 375 | (ptest "123.45, 234.56,345" [123.45 234.56 345.0] (sep-by decimal #(chr \,))) 376 | (ptest-fail "123.45, 234, 345abc" (sep-by decimal #(chr \,))) 377 | ) 378 | 379 | (deftest test-detect-sep 380 | (ptest "abc,ghi" \, (detect-sep)) 381 | (ptest "abc\tghi" \tab, (detect-sep)) 382 | (ptest-fail "abcghi" (detect-sep)) 383 | (ptest-fail "\"abc,ghi\"" (detect-sep)) 384 | ) 385 | 386 | (deftest test-csv 387 | (let [s1 "'1abc','def',ghi\n2abc,def,ghi\n" 388 | s2 "\"1a,bc\",def,ghi\n2abc,def,ghi\n" 389 | s3 "1abc\tdef\tghi\n2abc\tdef\tghi\n" 390 | s4 "\"1a\tbc\"\tdef\tghi\n2abc\tdef\tghi\n"] 391 | 392 | (ptest s1 [["1abc" "def" "ghi"]["2abc" "def" "ghi"]] (csv)) 393 | (ptest s2 [["1a,bc" "def" "ghi"]["2abc" "def" "ghi"]] (csv)) 394 | 395 | (ptest s3 [["1abc" "def" "ghi"]["2abc" "def" "ghi"]] (csv)) 396 | (ptest s4 [["1a\tbc" "def" "ghi"]["2abc" "def" "ghi"]] (csv)) 397 | )) 398 | 399 | (deftest test-expr 400 | (ptest "3+2-5" ["-" ["+" 3 2] 5] (expr)) 401 | (ptest "3+2*5" ["+" 3 ["*" 2 5]] (expr)) 402 | (ptest "-3+2*5" ["+" -3 ["*" 2 5]] (expr)) 403 | ) 404 | 405 | (deftest test-eval-expr 406 | (is (= (eval-expr "3") 3)) 407 | (is (= (eval-expr "-3") -3)) 408 | (is (= (eval-expr "3+2") 5)) 409 | (is (= (eval-expr "3+ -2") 1)) 410 | (is (= (eval-expr "3+-2") 1)) 411 | (is (= (eval-expr "3+2*3") 9)) 412 | (is (= (eval-expr "3+2*3+4") 13)) 413 | (is (= (eval-expr "(3+2)*3+4") 19)) 414 | (is (= (eval-expr "(3+2)*(3+4)") 35)) 415 | (is (= (eval-expr "(3*6/2)") 9)) 416 | (is (= (eval-expr "(36/2*3)") 54)) 417 | ) 418 | 419 | (use '[clojure.java.shell :only [sh]]) 420 | (defn test-xml [] 421 | (let [ dir (str (System/getProperty "user.home") "/test/data/xmltest/valid") 422 | files (-> (sh "find" dir ) ^String (:out) (.split "\n")) 423 | files (filter #(.endsWith ^String %1 ".xml") files) 424 | ignore #{ 425 | "/sa/049.xml" 426 | "/sa/050.xml" 427 | "/sa/051.xml" 428 | "/sa/114.xml" 429 | }] 430 | (doseq [f files] (if (not (get ignore (subs f (count dir)))) 431 | (do (println f) (parse-xml (slurp f))))))) 432 | 433 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Parse-EZ : Clojure Parser Library 2 | 3 | [api]: http://www.protoflex.com/parse-ez/api-doc/protoflex.parse-api.html "Parse-EZ API" 4 | [API Documentation][api] 5 | 6 | Parse-EZ is a parser library for Clojure programmers. It allows easy 7 | mixing of declarative and imperative styles and does not 8 | require any special constructs, macros, monads, etc. to write custom parsers. 9 | All the parsing is implemented using regular Clojure functions. 10 | 11 | The library provides a number of 12 | parse functions and combinators and comes with a built-in customizable infix 13 | expression parser and evaluator. It allows the programmer to concisely specify 14 | the structure of input text using Clojure functions and easily build parse trees 15 | without having to step out of Clojure. Whether you are writing a parser 16 | for some well structured data or for data scraping or prototyping a new language, 17 | you can make use of this library to quickly create a parser. 18 | 19 | ## Features 20 | 21 | - Parse functions and Combinators 22 | - Automatic handling of whitespaces, comments 23 | - Marking positions and backtracking 24 | - Seek, read, skip string/regex patterns 25 | - Builtin customizable expression parser and evaluator 26 | - Exceptions based error handling 27 | - Custom error messages 28 | 29 | ## Usage 30 | 31 | ### Installation 32 | Just add Parse-EZ as a dependency to your lein project 33 | 34 | ```clojure 35 | [protoflex/parse-ez "0.4.2"] 36 | ``` 37 | and run 38 | 39 | ```clojure 40 | lein deps 41 | ``` 42 | 43 | ## A Taste of Parse-EZ 44 | 45 | Here are a couple of sample parsers to give you a taste of the parser library. 46 | 47 | ### CSV Parser 48 | 49 | A CSV file contains multiple records, one-record per line, with field-values separated by a delimiter 50 | such as a comma or a tab. The field values may optionally be quoted either using a single or double 51 | quotes. When field-values are quoted, they may contain the field-delimiter characters, and in such 52 | cases they will not be treated as field separators. 53 | 54 | 55 | First, let us define a parse function for parsing one-line of csv file: 56 | 57 | ```clojure 58 | (defn csv-1 [sep] 59 | (sep-by #(any-string sep) #(chr sep))) 60 | ``` 61 | In the above function definition, we make use of the parse combinator `sep-by` 62 | which takes two arguments: the first one to read a field-value and the second 63 | one to read the separator. Here, we have used Clojure's anonymous function shortcuts to 64 | specify the desired behavior succinctly. The `any-string` function matches a single-quoted 65 | string or a double-quoted string or a plain-string that is followed by the specified separator 66 | `sep`. This is exactly the function that we need to read the field-value. The second argument 67 | provided to `sep-by` above uses the primitive parse function `chr` which succeeds only when 68 | the next character in the input matches its argument (`sep` parameter in this case). The _csv-1_ function returns the field values as a vector. 69 | 70 | The `sep-by` function actually takes a third, optional argument as record-separator 71 | function with the default value of a function that matches a newline. We didn't 72 | pass the third argument above because the default behavior suits our purpose. 73 | Had the default behavior of `sep-by` been different, we would have written the 74 | above function as: 75 | 76 | ```clojure 77 | (defn csv-1 [sep] 78 | (sep-by #(any-string sep) #(chr sep) #(regex #"\r?\n"))) 79 | ``` 80 | 81 | Now that we have created a parse function to parse a single line of CSV 82 | file, let us write another parse function that parses the entire CSV file 83 | content and returns the result as a vector of vector of field values 84 | (one-vector per record/line). All we need to do is to repeatedly apply the 85 | above defined `csv-1` function and the `multi*` parse combinator does 86 | just that. 87 | 88 | Just one small but important detail: by default, Parse-EZ 89 | automatically trims whitespace after successfully applying a parse function. 90 | This means that the newline at the end of line would be consumed after reading 91 | the last field value and the `sep-by` would be unable to match the end-of-line 92 | which is the record-separator in this case. So, we will disable the newline 93 | trimming functionality using the `no-trim` combinator. 94 | 95 | ```clojure 96 | (defn csv [sep] 97 | (multi* (fn [] (no-trim #(csv-1 sep))))) 98 | ``` 99 | 100 | Alternatively, you can express the above function a bit more easily using the macro versions of combinators introduced in Version 0.3.0 as follows: 101 | 102 | ```clojure 103 | (defn csv [sep] 104 | (multi* (no-trim_ (csv-1 sep)))) 105 | ``` 106 | 107 | Now, let us try out our csv parser. First let us define a couple of test 108 | strings containing a couple of records (lines) each. Note that the second 109 | string contains a comma inside the first cell (a quoted string). 110 | 111 | ```clojure 112 | user> (def s1 "1abc,def,ghi\n2jkl,mno,pqr\n") 113 | #'user/s1 114 | user> (def s2 "'1a,bc',def,ghi\n2jkl,mno,pqr\n") 115 | #'user/s2 116 | user> (parse #(csv \,) s1) 117 | [["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]] 118 | user> (parse #(csv \,) s2) 119 | [["1a,bc" "def" "ghi"] ["2jkl" "mno" "pqr"]] 120 | user> 121 | ``` 122 | 123 | Well, all we had to do was to write two lines of Clojure code to implement the CSV parser. 124 | Let's add a bit more functionality: the CSV files may use a comma or a tab character to 125 | separate the field values. Let's say we don't know ahead of time which character 126 | a file uses as a separator and we want to detect the separator automatically. Note 127 | that both characters may occur in a data file, but only one acts as a field-separator -- that too 128 | only when it's not inside a quoted string. 129 | 130 | Here is our strategy to detect the separator: 131 | 132 | - if the first field value is quoted (single or double), read the quoted string 133 | - else, read until one of comma or tab occurs 134 | - the next char is our delimiter 135 | 136 | Here is the code: 137 | 138 | ```clojure 139 | (defn detect-sep [] 140 | (let [m (mark-pos) 141 | s (attempt #(any dq-str sq-str)) 142 | s (if s s (no-trim #(read-to-re #",|\t"))) 143 | sep (read-ch)] 144 | (back-to-mark m) 145 | sep)) 146 | ``` 147 | 148 | Note how we used the `mark-pos` and `back-to-mark` Parse-EZ functions to 'unconsume' 149 | the consumed input. 150 | 151 | The complete code for the sample CSV parser with the separator-detection functionality is 152 | listed below (you can find this in `csv_parse.clj` file under the `examples` directory. 153 | 154 | ```clojure 155 | (ns protoflex.examples.csv_parse 156 | (:use [protoflex.parse])) 157 | 158 | (declare detect-sep csv-1) 159 | 160 | (defn csv 161 | "Reads and returns one or more records as a vector of vector of field-values" 162 | ([] (csv (no-trim #(detect-sep)))) 163 | ([sep] (multi* (fn [] (no-trim-nl #(csv-1 sep)))))) 164 | 165 | (defn csv-1 166 | "Reads and returns the fields of one record (line)" 167 | [sep] (sep-by #(any-string sep) #(chr sep))) 168 | 169 | (defn detect-sep 170 | "Detects the separator used in a csv file (a comma or a tab)" 171 | [] (let [m (mark-pos) 172 | s (attempt #(any dq-str sq-str)) 173 | s (if s s (no-trim #(read-to-re #",|\t"))) 174 | sep (read-ch)] 175 | (back-to-mark m) 176 | sep)) 177 | ``` 178 | 179 | Let's try out the new auto-detect functionality. Let us define two new test 180 | strings `s3` and `s4` that use `tab` character as field-separator. 181 | 182 | ```clojure 183 | user> (use 'protoflex.examples.csv_parse) 184 | nil 185 | user> (def s3 "1abc\tdef\tghi\n2jkl\tmno\tpqr\n") 186 | #'user/s3 187 | user> (def s4 "'1a\tbc'\tdef\tghi\n2jkl\tmno\tpqr\n") 188 | #'user/s4 189 | user> (parse csv s3) 190 | [["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]] 191 | user> (parse csv s4) 192 | [["1a\tbc" "def" "ghi"] ["2jkl" "mno" "pqr"]] 193 | user> (parse csv s1) 194 | [["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]] 195 | user> 196 | ``` 197 | 198 | As you can see, this time we didn't specify what field-separator to use: the parser 199 | itself detected the field-separator character and used it, returning us the desired 200 | results. 201 | 202 | ### XML Parser 203 | 204 | Here is the listing of a sample XML parser implemented using Parse-EZ. You can find the 205 | source file in the examples directory. The parser returns a map containing keys and values 206 | for `:tag`, `:attributes` and `:children` for the root element. The value for `:attributes` key 207 | is itself another map containing attribute names and their values. The value for `:children` 208 | key is a vector (potentially empty) containing string content and/or maps for child elements. 209 | 210 | ```clojure 211 | (ns protoflex.examples.xml_parse 212 | (:use [protoflex.parse])) 213 | 214 | (declare pi prolog element attributes children-and-close cdata elem-or-text close-tag) 215 | 216 | (defn parse-xml [xml-str] 217 | (parse #(between prolog element pi) xml-str :blk-cmt-delim [""] :line-cmt-start nil)) 218 | 219 | (defn- pi [] (while (starts-with? ""))) 220 | 221 | (defn- prolog [] (pi) (attempt #(regex #"(?s))|(.*?\]\s*>)")) (pi)) 222 | ``` 223 | The function _parse-xml_ is the entry point that kicks off parsing of input xml string _xml-str_. It passes the _between_ combinator to __Parse-EZ__'s _parse_ function. Here, the call to _between_ returns the value returned by the _element_ parse function, ignoring the content surrounding it (matched by _prolog_ and _pi_ functions). The block-comment delimiters are set to match XML's and the line-comment delimiter is cleared (by default these match Java comments). 224 | 225 | The parse function _pi_ is used to skip consecutive processing instructions by using the delimiters ____. 226 | 227 | The parse function _prolog_ is used to skip DTD declaration (if any) and also any surrounding processing instructions. Note that the regex used to match DTD declaration is only meant for illustration purposes. It isn't complete but will work in most cases. 228 | 229 | ```clojure 230 | (def name-start ":A-Z_a-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD") 231 | 232 | (def name-char (str name-start "\\-.0-9\\xB7\\u0300-\\u036F\\u203F-\\u2040")) 233 | 234 | (def name-re (-> (format "[%s][%s]*" name-start name-char) re-pattern)) 235 | ``` 236 | _name-re_ is a regular expression that matches xml element and attribute names. 237 | 238 | ```clojure 239 | (defn element [] 240 | (let [tag (do (chr \<) (regex name-re)) 241 | attrs (attributes) 242 | children (look-ahead* [ 243 | ">" #(children-and-close tag) 244 | "/>" (fn [] [])])] 245 | {:tag tag, :attributes attrs, :children children})) 246 | ``` 247 | The _element_ parse function matches an xml element and returns the tag, attribute list and children in a hash map. Note the usage of the _look_ahead*_ combinator to handle both the cases -- with children and without children. If it sees a ">" after reading the attributes, the _look-ahead*_ function calls the _children-and-close_ parse function to read children and the element close tag. On the other hand, if it sees "/>" after the attributes, it calls the (almost) empty parse function that simply returns an empty list. 248 | 249 | ```clojure 250 | (defn attr [] 251 | (let [n (regex name-re) _ (chr \=) 252 | v (any sq-str dq-str)] 253 | [n v])) 254 | 255 | (defn attributes [] (apply hash-map (flatten (multi* attr)))) 256 | ``` 257 | The _attr_ parse function matches a single attribute. The attribute value may be 258 | a single-quoted or double-quoted string. Note the usage of _any_ parse combinator for this purpose. 259 | 260 | The _attributes_ parse function matches multiple attribute specifications by passing the _attr_ parse function to _multi*_ parse combinator. 261 | 262 | ```clojure 263 | (defn- children-and-close [tag] 264 | (let [children (multi* #(between pi elem-or-text pi))] 265 | (close-tag tag) 266 | children)) 267 | ``` 268 | Each child item is read using the _elem-or-text_ parse function while ignoring any surrounding processing instructions using the _between_ combinator; the combinator _multi*_ is used to read all the child items. 269 | 270 | ```clojure 271 | (defn- elem-or-text [] 272 | (look-ahead [ 273 | "")] (string "]]>") txt)) 287 | 288 | (defn- close-tag [tag] 289 | (string (str ")) 291 | ``` 292 | By now, it should be obvious what the above two functions do. 293 | 294 | Well, an XML parser in under 50 lines. Let's try it with a few sample inputs: 295 | 296 | ```clojure 297 | user> (use 'protoflex.examples.xml_parse) 298 | nil 299 | user> (parse-xml "text") 300 | {:tag "abc", :attributes {}, :children ["text"]} 301 | user> (parse-xml "sample text") 302 | {:tag "abc", :attributes {"a1" "1", "a2" "attr2"}, :children ["sample text"]} 303 | user> (parse-xml "xxx") 304 | {:tag "abc", :attributes {"a1" "1", "a2" "attr2"}, :children [{:tag "def", :attributes {"d1" "99"}, :children ["xxx"]}]} 305 | user> 306 | ``` 307 | 308 | ## Comments and Whitespaces 309 | 310 | By default, Parse-EZ automatically handles comments and whitespaces. This 311 | behavior can be turned on or off temporarily using the macros `with-trim-on` 312 | and `with-trim-off` respectively. The parser option `:auto-trim` can be used to 313 | enable or disable the auto handling of whitespace and comments. Use the parser 314 | option `:blk-cmt-delim` to specify the begin and end delimiters for block 315 | comments. The parser option `:line-cmt-start` can be used to specify the line 316 | comment marker. By default, these options are set to java/C++ block and line 317 | comment markers respectively. You can alter the whitespace recognizer by setting 318 | the `:ws-regex` parser option. By default it is set to `#"\s+"`. 319 | 320 | Alternatively, you can turn off auto-handling of whitespace and comments and use 321 | the `lexeme` function which trims the whitespace/comments after application of the 322 | parse-function passed as its argument. 323 | 324 | Also see the `no-trim` and `no-trim-nl` functions. 325 | 326 | ## Primitive Parse Functions 327 | 328 | Parse-EZ provides a number of primitive parse functions such as: `chr`, 329 | `chr-in`, `string`, `string-in`, `word`, `word-in`, `sq-str`, `dq-str`, 330 | `any-string`, `regex`, `read-to`, `skip-over`, `read-re`, `read-to-re`, 331 | `skip-over-re`, `read-n`, `read-ch`, `read-ch-in-set`, etc. 332 | [See API Documentation][api] 333 | 334 | Let us try some of the builtin primitive parse functions: 335 | 336 | ```clojure 337 | user> (use 'protoflex.parse) 338 | nil 339 | user> (parse integer "12") 340 | 12 341 | user> (parse decimal "12.5") 342 | 12.5 343 | user> (parse #(chr \a) "a") 344 | \a 345 | user> (parse #(chr-in "abc") "b") 346 | \b 347 | user> (parse #(string-in ["abc" "def"]) "abc") 348 | "abc" 349 | user> (parse #(string-in ["abc" "def"]) "abcx") 350 | Parse Error: Extraneous text at line 1, col 4 351 | [Thrown class java.lang.Exception] 352 | ``` 353 | 354 | Note the parse error for the last parse call. By default, the `parse` function parses to the 355 | end of the input text. Even though the first 3 characters of the input text is recognized 356 | as valid input, a parse error is generated because the input cursor would not be at the 357 | end of input-text after recognizing "abc". 358 | 359 | The parser option `:eof` can be set to false to allow recognition of partial input: 360 | 361 | ```clojure 362 | user> (parse #(string-in ["abc" "def"]) "abcx" :eof false) 363 | "abc" 364 | user> 365 | ``` 366 | 367 | You can start parsing by looking for some marker patterns using the `read-to`, 368 | `read-to-re`, `skip-over`, `skip-over-re` functions. 369 | 370 | ```clojure 371 | user> (parse #(do (skip-over ">>") (number)) "ignore upto this>> 456.7") 372 | 456.7 373 | ``` 374 | 375 | ## Parse Combinators 376 | 377 | Parse Combinators in Parse-EZ are higher-order functions that take other parse 378 | functions as input arguments and combine/apply them in different ways to 379 | implement new parse functionality. Parse-EZ provides parse combinators such as: 380 | `opt`, `attempt`, `any`, `series`, `multi\*`, `multi+`, `between`, `look-ahead`, `lexeme`, 381 | `expect`, etc. 382 | [See API Documentation][api] 383 | 384 | Let us try some of the builtin parse combinators: 385 | 386 | ```clojure 387 | user> (parse #(opt integer) "abc" :eof false) 388 | nil 389 | user> (parse #(opt integer) "12") 390 | 12 391 | user> (parse #(any integer decimal) "12") 392 | 12 393 | user> (parse #(any integer decimal) "12.3") 394 | 12.3 395 | user> (parse #(series integer decimal integer) "3 4.2 6") 396 | [3 4.2 6] 397 | user> (parse #(multi* integer) "1 2 3 4") 398 | [1 2 3 4] 399 | user> (parse #(multi* (fn [] (string-in ["abc" "def"]))) "abcabcdefabc abcdef") 400 | ["abc" "abc" "def" "abc" "abc" "def"] 401 | user> 402 | ``` 403 | 404 | You can create your own parse functions on top of primitive parse-functions and/or 405 | parse combinators provided by Parse-EZ. 406 | 407 | ## Committing to a particular parse branch 408 | 409 | Version 0.4.0 added support for committing to a particular parse branch via 410 | the new parse combinators `commit` and `commit-on`. These functions make the 411 | parser commit to the current parse branch, making the parser report subsequent 412 | parse-failures in the current branch as parse-errors and preventing it 413 | from trying other alternatives at higher levels. 414 | 415 | ## Nesting Parse Combinators Using Macros 416 | 417 | Version 0.3.0 of Parse-EZ adds macro versions of parse combinator functions 418 | to make it easy to nest calls to parse combinators without having to write 419 | nested anonymous functions using the "(fn [] ...)" syntax. Note that Clojure 420 | does not allow nesting of anonymous functions of "#(...)" forms. Whereas 421 | the existing parse combinators take parse functions as arguments and actually 422 | perform parsing and return the parse results, the newly added macros take 423 | parse expressions as arguments and return parse functions (to be passed 424 | to other parse combinators). These macros are named the same as the 425 | corresponding parse combinators but with an underscore ("\_") suffix. For example 426 | the macro version of "any" is named "any_". 427 | 428 | ## Error Handling 429 | 430 | Parse Errors are handled in Parse-EZ using Exceptions. The default error messages generated 431 | by Parse-EZ include line and column number information and in some cases what is expected 432 | at that location. However, you can provide your own custom error messages by using the 433 | `expect` parse combinator. 434 | 435 | ## Expressions 436 | 437 | Parse-EZ includes a customizable expression parser `expr` for parsing expressions in infix 438 | notation and an expression evaluator function `eval-expr` to evaluate infix expressions. 439 | You can customize the operators, their precedences and associative properties using 440 | `:operators` option to the `parse` function. For evaluating expressions, you can optionally 441 | specify the functions to invoke for each operator using the `:op-fn-map` option. 442 | 443 | ## Parser State 444 | 445 | The parser state consists of the input cursor and various parser options (specified or derived) 446 | such as those affecting whitespace and comment parsing, word recognizers, expression parsing, 447 | etc. The parser options can be changed any time in your own parse functions using `set-opt`. 448 | 449 | Note that most of the parse functions affect Parser state (e.g: input cursor) and hence they are 450 | not pure functions. The side-effects could be avoided by making the Parser State an explicit 451 | parameter to all the parse functions and returning the changed Parser State along with the parse 452 | value from each of the parse functions. However, the result would be a significantly programmer 453 | unfriendly API. We made a design decision to keep the parse fuctions simple and easy to use 454 | than to fanatically keep the functions "pure". 455 | 456 | ## Relation to Parsec 457 | 458 | Parsec is a popular parser combinator library written in Haskell. While Parse-EZ 459 | makes use of some of the ideas in there, it is *not* a port of Parsec to Clojure. 460 | 461 | ## License 462 | 463 | Copyright (C) 2012 Protoflex Software 464 | 465 | Distributed under the Eclipse Public License, the same as Clojure. 466 | -------------------------------------------------------------------------------- /src/protoflex/parse.clj: -------------------------------------------------------------------------------- 1 | ;Copyright (c) Protoflex Software. All rights reserved. 2 | ;The use and distribution terms for this software are covered by the 3 | ;Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) 4 | ;which can be found in the file epl-v10.html at the root of this distribution. 5 | ;By using this software in any fashion, you are agreeing to be bound by 6 | ;the terms of this license. 7 | ;You must not remove this notice, or any other, from this software. 8 | 9 | (ns ^{ :doc "Clojure Parser Library." :author "Panduranga Adusumilli"} 10 | protoflex.parse 11 | (:refer-clojure :exclude [replace reverse]) 12 | (:import [java.util.regex Matcher]) 13 | (:import [clojure.lang ExceptionInfo]) 14 | (:use [protoflex.util]) 15 | (:require [clojure.string])) 16 | 17 | (declare parser-init set-pos throw-ex starts-with? move read-ch read-ch-in-set 18 | qstring unexpected next-n regex to-set read-re next-text 19 | init-default-ws-reader match-text find-first auto-trim-if get-opt 20 | mark-pos back-to-mark any-string dq-str sq-str read-to-re at-end? 21 | no-trim no-trim-nl get-opts set-opt get-default-ops 22 | get-default-op-fn-map init-operators read-ws string expect 23 | line-pos-str cursor-pos with-opts read-to-re-or-eof to-eof with-trim-off) 24 | 25 | (def ^:dynamic *parser-state* (atom [])) 26 | (defn state [] (deref *parser-state*)) 27 | 28 | (def default-options { 29 | :blk-cmt-delim ["/*" "*/"] 30 | :line-cmt-start "//" 31 | :ws-regex #"\s+" 32 | :auto-trim true 33 | :word-regex #"\S+" 34 | :ident-regex #"[a-zA-Z_][a-zA-Z0-9_]*" 35 | :eof true }) 36 | 37 | (defn parse 38 | "This function triggers off the parsing of the provided input string using 39 | the specified parse function. The following parser options may be provided 40 | to alter the default behavior of the parser: 41 | :blk-cmt-delim - vector specifying start and end of block-comment markers 42 | :line-cmt-start - string specifying the begin marker of a line comment 43 | :ws-regex - regular expression for matching (non-comment) white space 44 | :auto-trim - whether to automatically remove the leading whitespace/comments 45 | at the current position in the input text or immediately after a parse action. 46 | :word-regex - regular expression for matching words 47 | :operators - a vector of vector of operators in the decreasing order of 48 | precedence; see get-default-ops function for an example. 49 | :op-fn-map - a map of operator and the function to call for that operator when 50 | evaluating expressions 51 | :eof - if true, the parse function must consume the entire input text 52 | 53 | Args: 54 | parse-fn - parse function to apply 55 | input-str - input text to be parsed 56 | opts - key value options (listed above)" 57 | 58 | [parse-fn input-str & opts] 59 | (let [options (merge default-options (apply hash-map opts))] 60 | (binding [*parser-state* (parser-init input-str options) 61 | *ns* (find-ns 'protoflex.parse)] 62 | (init-operators (get options :operators (get-default-ops))) 63 | (set-opt :op-fn-map (get options :op-fn-map (get-default-op-fn-map))) 64 | (auto-trim-if) 65 | (let [result (parse-fn)] 66 | (if (and (:eof options) (not (at-end?))) (throw-ex "Extraneous text") result))))) 67 | 68 | (defmacro parse_ 69 | "Similar to the `parse` function, but takes a parse expression instead of a 70 | parse function as its first argument. The parse expression is any clojure 71 | expression that performs parsing by calling built-in or custom parse 72 | functions. See the documentation for `parse`" 73 | [parse-expr input-str & opts] 74 | `(parse (fn [] (eval ~parse-expr)) ~input-str ~@opts) 75 | ) 76 | 77 | (defn attempt 78 | "Tries to match the input at the current position with the provided 79 | parse function. If the parse function matches successfully, the matched 80 | text is returned and the input cursor advances by the length of the 81 | matched text. Otherwise a nil is returned and the current position 82 | in the input remains unchanged." 83 | [parse-fn] 84 | (let [m (mark-pos)] 85 | (try (let [r (parse-fn)] r) 86 | (catch ExceptionInfo ex 87 | (when (:committed (ex-data ex)) (throw ex)) 88 | (back-to-mark m) nil)))) 89 | 90 | (defmacro attempt_ 91 | "Creates and returns a parse function that calls `attempt` when it is 92 | invoked. The argument `parse-expr` is converted to a parse function 93 | and passed to `attempt` in the returned function's body. See `attempt`." 94 | [parse-expr] 95 | `#(attempt (fn [] (eval ~parse-expr)))) 96 | 97 | (defn opt 98 | "Same as attempt, but accepts a default value argument to return in case the 99 | specified parse function fails. Useful for matching optional text." 100 | ([parse-fn] (attempt parse-fn)) 101 | ([parse-fn default-val] (let [v (attempt parse-fn)] (if v v default-val)))) 102 | 103 | (defmacro opt_ 104 | "Creates and returns a parse function that calls `opt` when it is 105 | invoked. The argument `parse-expr` is converted to a parse function 106 | and passed to `opt` in the returned function's body. See `opt`." 107 | ([parse-expr] `#(opt (fn [] (eval ~parse-expr)))) 108 | ([parse-expr default-val] `#(opt (fn [] (eval ~parse-expr)) ~default-val))) 109 | 110 | (defn any 111 | "Returns the result of the first successfully matching parse-function. 112 | If none of the parse functions match, an exception is thrown." 113 | [& parse-fns] 114 | (if-let [r (find-first :result #(attempt %) parse-fns)] 115 | r (throw-ex))) 116 | 117 | (defmacro ->fns 118 | [& exprs] 119 | `(map #(fn [] (eval %)) '~exprs)) 120 | 121 | (defmacro any_ 122 | "Creates and returns a parse function that calls `any` when it is 123 | invoked. The arguments `parse-exprs` are converted to parse functions 124 | and passed to `any` in the returned function's body. See `any`." 125 | [& parse-exprs] 126 | `#(apply any (->fns ~@parse-exprs))) 127 | 128 | (defn la-strs [la-pf-vec] 129 | (for [x (range (count la-pf-vec)) :when (even? x)] (get la-pf-vec x))) 130 | 131 | (defn- look-ahead-aux 132 | [[la pf & rest]] 133 | (if (starts-with? la) 134 | (pf) 135 | (if (> (count rest) 1) (recur rest) (throw-ex))) 136 | ) 137 | 138 | (defn look-ahead 139 | "Takes a collection of look-ahead-string and parse-function pairs and applies 140 | the first parse function that follows the matching look-ahead-string and 141 | returns the result, or throws a parse exception if the parse function fails. 142 | 143 | If none of the look-ahead strings match the current text, an exception is thrown. 144 | 145 | To specify a default parse function, provide an empty string as look-ahead and 146 | the default parse function at the end of the argument list. 147 | 148 | Args: [la-str-1 parse-fn-1 la-str-2 parse-fn-2 ...]" 149 | [la-pf-vec] 150 | (expect (la-strs la-pf-vec) #(look-ahead-aux la-pf-vec))) 151 | 152 | (defn- look-ahead*-aux 153 | [[la pf & rest]] 154 | (if (starts-with? la) 155 | (do (string la) (pf)) 156 | (if (> (count rest) 1) (recur rest) (throw-ex)))) 157 | 158 | (defn look-ahead* 159 | "Same as look-ahead, but consumes the matching look-ahead string before 160 | applying the corresponding parse function. " 161 | [la-pf-vec] 162 | (expect (la-strs la-pf-vec) #(look-ahead*-aux la-pf-vec))) 163 | 164 | (defn series 165 | "Applies a sequence of parse functions and returns their results in 166 | a vector. Each successfull match by the parse function advances the cursor. 167 | If any of the parse functions fails, an exception is thrown." 168 | [& parse-fns] 169 | ;(map #(%) parse-fns) 170 | ; just applying map doesn't get the correct dynamically rebound var values. 171 | ; doall & bound-fn don't seem to work; clojure bug? 172 | (letfn [(apply-fseq [fns] 173 | (loop [fst (first fns) 174 | rst (rest fns) 175 | result []] 176 | (if fst (recur (first rst) (rest rst) (conj result (fst))) 177 | result)))] 178 | (apply-fseq parse-fns))) 179 | 180 | (defmacro series_ 181 | "Creates and returns a parse function that calls `series` when it is 182 | invoked. The arguments `parse-exprs` are converted to parse functions 183 | and passed to `series` in the returned function's body. See `series`." 184 | [& parse-exprs] 185 | `#(apply series (->fns ~@parse-exprs))) 186 | 187 | (defn multi* 188 | "Matches zero or more occurrences of text accepted by the provided parse 189 | function and returns the results in a vector." 190 | [parse-fn] 191 | (loop [rv []] 192 | (if-let [r (attempt parse-fn)] 193 | (recur (conj rv r)) 194 | (if (pos? (count rv)) rv)))) 195 | 196 | 197 | (defmacro multi*_ 198 | "Creates and returns a parse function that calls `multi*` when it is 199 | invoked. The argument `parse-expr` is converted to a parse function 200 | and passed to `multi*` in the returned function's body. See `multi*`." 201 | [parse-expr] 202 | `#(multi* (fn [] (eval ~parse-expr)))) 203 | 204 | (defn multi+ 205 | "Matches one or more occurrences of text accepted by the provided parse 206 | function and returns the results in a vector. If the parse function doesn't 207 | match even once, an exception is thrown." 208 | [parse-fn] 209 | (if-let [rv (multi* parse-fn)] rv (throw-ex))) 210 | 211 | (defmacro multi+_ 212 | "Creates and returns a parse function that calls `multi+` when it is 213 | invoked. The argument `parse-expr` is converted to a parse function 214 | and passed to `multi+` in the returned function's body. See `multi+`." 215 | [parse-expr] 216 | `#(multi+ (fn [] (eval ~parse-expr)))) 217 | 218 | (defn times 219 | "Applies the provided parse function exactly n times and returns the 220 | results of applications of the function in a vector." 221 | [n parse-fn] 222 | (vec (for [i (range n)] (parse-fn)))) 223 | 224 | (defmacro times_ 225 | "Creates and returns a parse function that calls `times` when it is 226 | invoked. The argument `parse-expr` is converted to a parse function 227 | and passed to `times` in the returned function's body. See `times`." 228 | [n parse-expr] 229 | `#(times ~n (fn [] (eval ~parse-expr)))) 230 | 231 | (defn lexeme 232 | "Applies the specified parse function for current input text, consumes any 233 | following whitespace, comments and returns the result of the parse function 234 | application." 235 | [parse-fn] 236 | (let [r (parse-fn)] (read-ws) r)) 237 | 238 | (defmacro lexeme_ 239 | "Creates and returns a parse function that calls `lexeme` when it is 240 | invoked. The argument `parse-expr` is converted to a parse function 241 | and passed to `lexeme` in the returned function's body. See `lexeme`." 242 | [parse-expr] 243 | `#(lexeme (fn [] (eval ~parse-expr)))) 244 | 245 | (defn- dq [x] (str \" x \")) 246 | 247 | (defn- one-of-msg [x] (str "one of [" 248 | (->> (map dq x) (clojure.string/join ", ")) "]")) 249 | 250 | (defn- exp-msg [x] (if (coll? x) (one-of-msg x) (dq x))) 251 | 252 | (defn expect 253 | "Customize error message; if the specified parse function doesn't match 254 | the current input text, the error message of the parse exception will include 255 | the specified custom expected-message." 256 | [expected-msg parse-fn] 257 | (try (parse-fn) 258 | (catch ExceptionInfo ex 259 | (if (:unexpected (ex-data ex)) (throw ex) 260 | (throw-ex (unexpected expected-msg) {:unexpected true}))))) 261 | 262 | (defmacro expect_ 263 | "Creates and returns a parse function that calls `expect` when it is 264 | invoked. The argument `parse-expr` is converted to a parse function 265 | and passed to `expect` in the returned function's body. See `expect`." 266 | [expected-msg parse-expr] 267 | `#(expect ~expected-msg (fn [] (eval ~parse-expr)))) 268 | 269 | (defn- has-parse-error-msg? [^Exception ex] 270 | (let [m (.getMessage ex)] 271 | (and m (>= (.indexOf m "Parse Error") 0)))) 272 | 273 | (defn throw-ex 274 | "Throws an exception of ExceptionInfo class; this is usually called to 275 | indicate a match failure in a parse function." 276 | ([] (throw-ex "" {} nil)) 277 | ([msg] (throw-ex msg {} nil)) 278 | ([msg map] (throw-ex msg map nil)) 279 | ([msg map cause] 280 | (let [pos (line-pos-str (or (:pos map) (cursor-pos))) 281 | msg (if (:msg-final map) msg (str "Parse Error: " msg " at " pos)) 282 | map (assoc map :msg-final true)] 283 | (throw (ex-info msg map cause))))) 284 | 285 | (defn unexpected 286 | "Creates a message string for unexpected input exception." 287 | ([expected] (unexpected (next-n 10) expected)) 288 | ([actual expected] 289 | (str "Unexpected input: \"" actual "\"; Expecting " 290 | (exp-msg expected)))) 291 | 292 | (defn- mchar [ch is-no-auto-trim] 293 | (let [c (read-ch is-no-auto-trim)] 294 | (if (= ch c) c 295 | (do (move -1) (throw-ex (unexpected c ch)))))) 296 | 297 | (defn chr 298 | "If the next character in the input matches the specified character ch, 299 | returns it; otherwise throws an exception." 300 | [ch] (mchar ch false)) 301 | 302 | (defn chr- 303 | "Same as chr but with auto-trimming turned off for the following input" 304 | [ch] (mchar ch true)) 305 | 306 | (defn chr-in 307 | "If the next character in the input matches any character in the specified 308 | string or character collection, the matching character is returned. 309 | Otherwise throws an exception." 310 | [chars] (read-ch-in-set (to-set chars) false)) 311 | 312 | (defn chr-in- 313 | "Same as chr-in but with auto-trimming turned off for the following input" 314 | [chars] (read-ch-in-set (to-set chars) true)) 315 | 316 | (defn string 317 | "If the input matches the specified string, the string is 318 | returned. Otherwise, a parse exception is thrown." 319 | [^String s] 320 | (if (starts-with? s) (move (.length s)) 321 | (throw-ex (unexpected (next-n (.length s)) s)))) 322 | 323 | (defn string-in-ord 324 | "Returns the first string from the provided strings that matches text 325 | at the current position. Throws an exception if none of the strings match." 326 | [strings] 327 | (if-let [^String s (find-first :item #(starts-with? %) strings)] 328 | (do (move (.length s)) s) 329 | (throw-ex (unexpected strings)))) 330 | 331 | (defn string-in 332 | "Returns the longest string from the provided strings that matches text 333 | at the current position. Throws an exception if none of the strings match." 334 | [strings] 335 | (string-in-ord (sort-by count #(compare %2 %1) strings))) 336 | 337 | (defn word-in 338 | "Returns the first word from the provided words that matches text 339 | at the current position. Throws an exception if none of the words match. 340 | An optional word-reader parse-function may be provided to read words." 341 | ([str-coll] 342 | (word-in str-coll #(regex (get-opt :word-regex)))) 343 | ([str-coll word-reader] 344 | (let [wset (to-set str-coll) 345 | w (word-reader)] 346 | (if (wset w) w (throw-ex (unexpected w wset)))))) 347 | 348 | (defn word 349 | "Returns the specified word if the word occurs at the current position in 350 | the input text; an exception is thrown otherwise." 351 | [w] (word-in #{w})) 352 | 353 | (defn ^String ident 354 | "Reads an identifier at current input position using the ident-regex 355 | parser option. If id is specified, the read identifier must match the 356 | specified value; otherwise an exception is thrown." 357 | [] (regex (get-opt :ident-regex))) 358 | 359 | (defn ^String key-word 360 | "Reads an identifier at the current input position. If the read identifier 361 | matches the specified keyword kw, the same is returned; otherwise, an 362 | exception is thrown." 363 | [kw] (expect kw #(if (= kw (ident)) kw (throw-ex)))) 364 | 365 | (defn commit 366 | "Applies supplied parse-fn and if it fails, the failure gets reported as a 367 | 'committed' exception, which prevents the parser from trying alternatives at 368 | higher levels. On success, returns the result of parse-fn." 369 | [parse-fn] 370 | (try (parse-fn) 371 | (catch ExceptionInfo ex 372 | (throw-ex (.getMessage ex) (assoc (ex-data ex) :committed true))))) 373 | 374 | (defn commit-on 375 | "If the keyword kw occurs at the current position in the input text, parse-fn 376 | will be applied next. If parse-fn fails, it will get reported as a 'committed' 377 | exception, which prevents the parser from trying alternatives at higher levels. 378 | On sucess, returns the result of parse-fn." 379 | [kw parse-fn] 380 | (if (attempt #(key-word kw)) (commit parse-fn))) 381 | 382 | (defn with-follow 383 | "Applies parse-fn and follow-fn in sequence; Ignores the result of follow-fn and 384 | returns the result of parse-fn." 385 | [parse-fn follow-fn] (let [r (parse-fn), _ (follow-fn)] r)) 386 | 387 | (defn with-follow* 388 | "Similar to with-follow, but commits to follow-fn parse if parse-fn succeeds. 389 | Returns the result of parse-fn." 390 | [parse-fn follow-fn] (with-follow parse-fn #(commit follow-fn))) 391 | 392 | (defn with-no-follow 393 | "Applies parse-fn and follow-fn in sequence; This method succeeds only if the 394 | follow-fn fails to match. Returns the result of parse-fn." 395 | [parse-fn follow-fn] 396 | (let [r (parse-fn), f (attempt follow-fn)] 397 | (when f (throw-ex "Unexpected follow")) 398 | r)) 399 | 400 | (defn sep-by 401 | "Reads a record using the specified field, field-separator and 402 | record-separator parse functions. If no record-separator is specified, 403 | a newline character is used as record separator. Returns the fields of the 404 | record in a vector." 405 | ([fld-fn fld-sep-fn] 406 | (sep-by fld-fn fld-sep-fn #(regex #"\r?\n"))) 407 | 408 | ([fld-fn fld-sep-fn rec-sep-fn] 409 | (if-not (at-end?) 410 | (let [fst (fld-fn) 411 | rst (multi* #(series fld-sep-fn fld-fn)) 412 | _ (any rec-sep-fn at-end?) 413 | rst (if rst (map second rst)) 414 | result (if rst (vec(conj rst fst)) [fst])] 415 | result)))) 416 | 417 | 418 | (defn sep-rest [parse-fn sep-fn] 419 | (let [rst (multi* #(series sep-fn parse-fn))] 420 | (reduce (fn [a e] (conj a (e 1))) [] rst))) 421 | 422 | (defn sep-by* 423 | "Differs from sep-by in that it allows zero matches of parse-fn before stop-fn; 424 | Unlike sep-by, stop-fn must match -- not optional." 425 | [parse-fn sep-fn stop-fn] 426 | (if-let [fst (attempt parse-fn)] 427 | (let [rst (sep-rest parse-fn sep-fn), _ (stop-fn)] 428 | (into [fst] rst)) ; then 429 | (do (stop-fn) nil))) ; else 430 | 431 | (defn any-string 432 | "Reads a single-quoted or double-quoted or a plain-string that is followed 433 | by the specified separator sep or EOF; the separator is not part of the returned 434 | string." 435 | [sep] (cond 436 | (starts-with? "\"") (dq-str) 437 | (starts-with? "'") (sq-str) 438 | :else (read-to-re-or-eof (re-pattern (str sep "|\r?\n"))))) 439 | 440 | (defn between 441 | "Applies the supplied start-fn, parse-fn and end-fn functions and returns 442 | the result of parse-fn. This is typically used to parse content enclosed by 443 | some delimiters on either side." 444 | [start-fn parse-fn end-fn] 445 | (let [res (series start-fn parse-fn end-fn)] (res 1))) 446 | 447 | (defn parens 448 | "Returns the result of applying specifed parse function to text that is 449 | in between the opening and closing parentheses '(' and ')'" 450 | [parse-fn] (between #(chr \() parse-fn #(chr \)))) 451 | 452 | (defn braces 453 | "Returns the result of applying specifed parse function to text that is 454 | in between the opening and closing braces '{' and '}'" 455 | [parse-fn] (between #(chr \{) parse-fn #(chr \}))) 456 | 457 | (defn sq-brackets 458 | "Returns the result of applying specifed parse function to text that is 459 | in between the opening and closing square brackets '[' and ']'" 460 | [parse-fn] (between #(chr \[) parse-fn #(chr \]))) 461 | 462 | (defn ang-brackets 463 | "Returns the result of applying specifed parse function to text that is 464 | in between the opening and closing angular brackets '<' and '>'" 465 | [parse-fn] (between #(chr \<) parse-fn #(chr \>))) 466 | 467 | (defn regex 468 | "Returns the text matched by the specified regex; If a group is specified, 469 | the returned text is for that group only. In either case, the cursor is 470 | advanced by the length of the entire matched text (group 0)" 471 | ([re] (regex re 0)) 472 | ([re grp] (read-re re grp))) 473 | 474 | (defn semi 475 | "Matches and returns a semi-colon character" 476 | [] (chr \;)) 477 | 478 | (defn comma 479 | "Matches and returns a comma character" 480 | [] (chr \,)) 481 | 482 | (defn dot 483 | "Matches and retuns a dot character" 484 | [] (chr \.)) 485 | 486 | (defn colon 487 | "Matches and returns a colon character" 488 | [] (chr \:)) 489 | 490 | (defn popen 491 | "Matches and returns an opening paranthesis character" 492 | [] (chr \()) 493 | 494 | (defn pclose 495 | "Matches and returns a closing paranthesis character" 496 | [] (chr \))) 497 | 498 | (defn bopen 499 | "Matches and returns an opening curly brace character" 500 | [] (chr \{)) 501 | 502 | (defn bclose 503 | "Matches and returns a closing curly brace character" 504 | [] (chr \})) 505 | 506 | (defn sqopen 507 | "Matches and returns an opening curly brace character" 508 | [] (chr \[)) 509 | 510 | (defn sqclose 511 | "Matches and returns a closing curly brace character" 512 | [] (chr \])) 513 | 514 | (defn aopen 515 | "Matches and returns an opening angular bracket character" 516 | [] (chr \<)) 517 | 518 | (defn aclose 519 | "Matches and returns a closing angular bracket character" 520 | [] (chr \>)) 521 | 522 | (defn equal 523 | "Matches and returns an equal character" 524 | [] (chr \=)) 525 | 526 | (defn integer 527 | "Parses a long integer value and returns a Long." 528 | [] (Long/parseLong (regex #"-?\d+(?!\w|\.)"))) 529 | 530 | (defn decimal 531 | "Parses a decimal value and returns a Double." 532 | [] (Double/parseDouble (regex #"-?\d+(\.\d+)?(?!\w|\.)"))) 533 | 534 | (defn number 535 | "Matches an integral or non-integral numeric value. While the function 536 | decimal also matches both integer and non-integer values, it always 537 | returns a Double; where as number returns Long for integers and Double 538 | for non-integers." 539 | [] (any integer decimal)) 540 | 541 | (defn sq-str 542 | "Parses a single-quoted string and returns the matched string (minus the quotes)" 543 | [] (qstring \' \\)) 544 | 545 | (defn dq-str 546 | "Parses a double-quoted string and returns the matched string (minus the quotes)" 547 | [] (qstring \" \\)) 548 | 549 | (defn- qstring [qchar esc] 550 | (let [sb (StringBuilder.)] 551 | (no-trim #(chr qchar)) ; read and ignore 552 | (loop [] 553 | (let [ch (read-ch true) 554 | is-esc (= ch esc) 555 | ch (if is-esc (read-ch) ch)] 556 | (if (and (= ch qchar) (not is-esc)) 557 | nil 558 | (do (.append sb ch) (recur))))) 559 | (auto-trim-if) 560 | (.toString sb))) 561 | 562 | (defn next-text ^String [] 563 | (let [[s c _ _] (state)] (subs s c))) 564 | 565 | (defn read-to 566 | "The parser skips to the position where the text contains the string 567 | specified by s. The string itself is not consumed, that is the cursor is 568 | positioned at the beginning of the match. If the specified string is not 569 | found, cursor position does not change and a parse exception is thrown." 570 | [^String s] 571 | (let [t (next-text) 572 | n (.indexOf t s)] 573 | (if (>= n 0) (move n) (throw-ex)))) 574 | 575 | (defn skip-over 576 | "Finds the specified string s in the input and skips over it. If the string 577 | is not found, a parse exception is thrown." 578 | [^String s] 579 | (let [s1 (read-to s) 580 | s2 (string s)] 581 | (str s1 s2))) 582 | 583 | (defn- ->re [re] (if (string? re) (re-pattern re) re)) 584 | 585 | (defn read-re 586 | "Reads the string matching the specified regular expression. If a match-group 587 | is specified, the corresponding text is returned; otherwise the entire 588 | matched text is returned." 589 | ([re] (read-re re 0)) 590 | ([re ^Integer grp] 591 | (let [t (next-text) 592 | ^Matcher rm (re-matcher (->re re) t)] 593 | (if (.lookingAt rm) 594 | (do (move (.end rm)) 595 | (.group rm grp)) 596 | (throw-ex))))) 597 | 598 | (defn read-to-re 599 | "Reads and returns text upto but not including the text matched by the 600 | specified regular expression. If the specified regular expression doesn't 601 | occur in the remaining input text, an exception is thrown." 602 | [re] 603 | (let [t (next-text) 604 | m (re-find (->re re) t)] 605 | (if (nil? m) 606 | (throw-ex) 607 | (let [^String ms (if (string? m) m (m 0)) 608 | i (.indexOf t ms)] 609 | (move i))))) 610 | 611 | (defn read-to-re-or-eof 612 | "If the specified regex matches in the remaining text, returns text upto the match; 613 | Otherwise returns all the remaining text and the input cursor is positioned at EOF." 614 | [re] 615 | (if-let [t (attempt #(read-to-re re))] t (to-eof))) 616 | 617 | (defn skip-over-re 618 | "Reads and returns text upto and including the text matched by the 619 | specified regular expression. If the specified regular expression doesn't 620 | occur in the remaining input text, an exception is thrown." 621 | [re] (let [s1 (read-to-re re) 622 | s2 (read-re re)] 623 | (str s1 s2))) 624 | 625 | (defn starts-with? 626 | "Returns a boolean value indicating whether the current input text matches 627 | the specified string." 628 | [^String s] (.startsWith (next-text) s)) 629 | 630 | (defn starts-with-re? 631 | "Returns a boolean value indicating whether the specified regular expression 632 | matches the input at the current position." 633 | [re] (->> (next-text) (re-matcher (->re re)) .lookingAt)) 634 | 635 | (defn read-n 636 | "Reads and returns an n-character string at the current position." 637 | ^String [n] 638 | (let [t (next-text)] 639 | (if (<= n (.length t)) 640 | (move n) 641 | (throw-ex "EOF")))) 642 | 643 | (defn read-ch 644 | "Reads and return the next input character. Throws an exception if the 645 | current position is at the end of the input." 646 | ([] (read-ch false)) 647 | ([is-no-auto-trim] 648 | (let [[^String s c _ _] (state)] 649 | (if (= c (.length s)) (throw-ex "EOF")) 650 | (if is-no-auto-trim (set-pos (inc c)) (move 1)) 651 | (.charAt s c)))) 652 | 653 | (defn read-ch-in-set 654 | "Reads and returns the next character if it matches any of the characters 655 | specified in the provided set. An exception is thrown otherwise. The 656 | optional is-no-auto-trim argument may be used to specify whether or not 657 | to apply auto-trim after reading the next character." 658 | ([char-set] (read-ch-in-set char-set false)) 659 | ([char-set is-no-auto-trim] 660 | (let [ch (read-ch is-no-auto-trim)] 661 | (if (char-set ch) ch 662 | (throw-ex (unexpected ch char-set)))))) 663 | 664 | (defn blk-cmt 665 | "Reads and returns a block comment as specified by the begin and end 666 | markers. Throws an exception if the specified block-comment doesn't 667 | occur at the current position." 668 | [beg end] 669 | (if beg 670 | (let [s1 (string beg) s2 (skip-over end)] (str s1 s2)))) 671 | 672 | (defn blk-cmt? 673 | "Similar to blk-cmt but returns a nil instead of throwing an exception 674 | in case of a match failure." 675 | [beg end] (attempt #(blk-cmt beg end))) 676 | 677 | (defn- to-eof 678 | "Reads and returns text from current position to the end of the input text." 679 | [] (let [t (next-text)] (move (.length t)))) 680 | 681 | (defn line-cmt 682 | "Reads and returns a line comment as specified by the begin marker. 683 | Throws an exception if the specified block-comment doesn't occur at the 684 | current position." 685 | [beg] 686 | (if beg 687 | (->> (no-trim (fn [] (series #(string beg) #(regex #"[^\n]*\n?")))) 688 | (apply str)))) 689 | 690 | (defn line-cmt? 691 | "Similar to line-cmt but returns a nil instead of throwing an exception 692 | in case of a match failure." 693 | [beg] (attempt #(line-cmt beg))) 694 | 695 | (defn ws 696 | "Matches white space (including comments) at the current position. The 697 | optional parameters bcb, bce, lcb and wsre specify block-comment-begin, 698 | block-comment-end, line-comment-begin and white-space-regex respectively. 699 | If they are not specified here, the options set for the parser are used. 700 | Throws an exception if white space doesn't occur at the current position." 701 | ([] 702 | (let [opts (get-opts) 703 | blk (opts :blk-cmt-delim) 704 | lc (opts :line-cmt-start) 705 | wsre (opts :ws-regex)] 706 | (ws (blk 0) (blk 1) lc wsre))) 707 | 708 | ([bcb bce lcb wsre] 709 | (let [w (multi+ (fn [](any #(blk-cmt bcb bce) 710 | #(line-cmt lcb) 711 | #(regex wsre))))] 712 | (apply str w)))) 713 | 714 | (defn ws? 715 | "Similar to ws except that a nil value is returned instead of throwing 716 | an exception in case of a match failure." 717 | [& args] (attempt #(apply ws args))) 718 | 719 | (defn at-end? 720 | "Returns true if no more input is left to be read; false otherwise." 721 | [] (let [[^String s c _ _] (state)] (= (.length s) c))) 722 | 723 | (defn cursor-pos 724 | "Returns the current cursor position as a scalar" 725 | [] (let [[_ c _ _] (state)] c)) 726 | 727 | (defn line-column 728 | "Returns the line and column vector corresponding to the cursor in string s" 729 | [^String s cursor] 730 | (loop [i 0, nl-cnt 0, col 0] 731 | (if (= i cursor) [(inc nl-cnt) (inc col)] 732 | (if (== (int(.charAt s i)) (int \newline)) 733 | (recur (inc i) (inc nl-cnt) 0) 734 | (recur (inc i) nl-cnt (inc col)))))) 735 | 736 | (defn- line-pos* 737 | "Returns the line and column vector for the specified cursor position" 738 | [cursor] 739 | (let [[^String s _ _ _] (state)] 740 | (loop [i 0, nl-cnt 0, col 0] 741 | (if (= i cursor) [(inc nl-cnt) (inc col)] 742 | (if (== (int(.charAt s i)) (int \newline)) 743 | (recur (inc i) (inc nl-cnt) 0) 744 | (recur (inc i) nl-cnt (inc col))))))) 745 | 746 | (defn line-pos 747 | "Returns [line column] vector corresponding to the specified cursor position (or 748 | the current position if cursor is not specified) of the parser" 749 | ([] (line-pos* (cursor-pos))) 750 | ([cursor] (line-pos* cursor))) 751 | 752 | (defn line-pos-str 753 | "Returns line position in a descriptive string. If the cursor position is specified, 754 | the returned value corresponds to that position. Otherwise, the returned value 755 | corresponds to the current position." 756 | ([] (line-pos-str (cursor-pos))) 757 | ([cursor] (let [lp (line-pos cursor)] (str "line " (lp 0) ", col " (lp 1))))) 758 | 759 | ; ------------------ create & configure the parser ------------ 760 | 761 | (defn parser-init 762 | "Initializes the parser state with the specified input string and options." 763 | ([^String input-str] 764 | (parser-init input-str default-options)) 765 | 766 | ([^String input-str opts] 767 | (let [new-opts (init-default-ws-reader opts) 768 | st (atom [input-str 0 0 new-opts])] 769 | st))) ; [str pos prev-pos opts] 770 | 771 | (defn- init-default-ws-reader [opts] 772 | (let [bco (get opts :blk-cmt-delim) 773 | lcb (get opts :line-cmt-start) 774 | ws-re (get opts :ws-regex) 775 | wsr #(ws? (bco 0) (bco 1) lcb ws-re)] 776 | (assoc opts :default-ws-reader wsr))) 777 | 778 | (defn set-opts 779 | "Sets specified parser options" 780 | [opts] 781 | (swap! *parser-state* 782 | #(let [[s c p o] % 783 | o (merge o opts) 784 | o (merge o (init-default-ws-reader o))] 785 | [s c p o]))) 786 | 787 | (defn- reset-opts [opts] 788 | (swap! *parser-state* 789 | #(let [[s c p o] %] [s c p opts]))) 790 | 791 | (defn set-opt 792 | "Sets parser option k to value v" 793 | [k v] #_(swap! *parser-state* 794 | #(let [[s c p o] % 795 | o (-> o (assoc k v) (init-default-ws-reader))] 796 | [s c p (assoc o k v)])) 797 | (set-opts {k v})) 798 | 799 | (defn get-opts 800 | "Returns all parser options" 801 | [] (let [[_ _ _ o] (state)] o)) 802 | 803 | (defn get-opt 804 | "Returns the value for parser option k; if the optional default value 805 | parameter d is specified, its value is returned if the option k is not set 806 | in parser options." 807 | ([k] (get-opt k nil)) 808 | ([k d] (get (get-opts) k d))) 809 | 810 | (defn with-opts [opts parse-fn] 811 | "Sets the parser options specified in the opts map and applies the parse-fn 812 | function. Returns the result of applying parse-fn. 813 | 814 | Ensures that the original parse options are restored before the function exits." 815 | (let [orig-opts (get-opts)] 816 | (set-opts opts) 817 | (try (parse-fn) (finally (reset-opts orig-opts))))) 818 | 819 | (defn auto-trim-on 820 | "Turns on auto-trim feature that cleans trailing white-space, comments 821 | or whatever the custom ws-reader if any is spe" 822 | [] 823 | (set-opt :auto-trim true)) 824 | 825 | (defn auto-trim-off 826 | "Turns off the auto-trim option." 827 | [] (set-opt :auto-trim false)) 828 | 829 | (defn set-blk-cmt-opts 830 | "Sets block comment begin and end markers." 831 | [beg end] 832 | (set-opt :blk-cmt-delim [beg end])) 833 | 834 | (defn set-line-cmt-opts 835 | "Sets line comment begin marker." 836 | [beg] 837 | (set-opt :line-cmt-start beg)) 838 | 839 | (defn set-ws-regex 840 | "Sets the regular expression to be used for matching non-comment white-space." 841 | [ws-re] 842 | (set-opt :ws-regex ws-re)) 843 | 844 | (defn set-ws-reader 845 | "This sets the white-space parser to be used when auto-trim is set. 846 | If this is specified, it overrides the options set by set-blk-cmt-opts, 847 | set-line-cmt-opts and set-ws-regex options." 848 | [ws-reader] 849 | (set-opt :ws-reader ws-reader)) 850 | 851 | ;----------------------- misc utilities for parse functions ------------ 852 | (defn- get-input [] ((state) 0)) 853 | 854 | (defn- get-pos [] ((state) 1)) 855 | 856 | (defn- get-prev [] ((state) 2)) 857 | 858 | (defn get-opts [] ((state) 3)) 859 | 860 | (defn- set-pos 861 | ([pos] (set-pos pos (get-pos))) 862 | ([pos prev] 863 | (swap! *parser-state* #(let [[s _ _ o] %] [s pos (min prev pos) o])) 864 | nil)) 865 | ; prev-pos must never be greater than current-pos 866 | 867 | (defn mark-pos 868 | "Returns the current positional parameters of the parser." 869 | [] (let [[_ c p _] (state)] [c p])) 870 | 871 | (defn back-to-mark 872 | "Resets the positional parameters to a previously set mark." 873 | [mark] (set-pos (mark 0) (mark 1))) 874 | 875 | (defn- move [delta] 876 | (set-pos (+ (get-pos) delta)) 877 | (let [t (match-text)] 878 | (auto-trim-if) 879 | t)) 880 | 881 | ;------------------------------------------------------------------------ 882 | 883 | (defmacro with-trim-on 884 | "Executes the provided body with auto-trim option set to true. The earlier 885 | value of the auto-trim option is restored after executing the body." 886 | [& body] 887 | `(let [at# (get-opt :auto-trim)] 888 | (set-opt :auto-trim true) 889 | (let [ret# (try (do ~@body) (finally (set-opt :auto-trim at#)))] 890 | ret# ))) 891 | 892 | (defmacro with-trim-off 893 | "Executes the provided body with auto-trim option set to false. The earlier 894 | value of the auto-trim option is restored after executing the body." 895 | [& body] 896 | `(no-trim (fn [] ~@body))) 897 | 898 | (defn no-trim 899 | "Similar to with-trim-off, but takes a function as a parameter instead of 900 | the body" 901 | [parse-fn] 902 | (let [at (get-opt :auto-trim)] 903 | (set-opt :auto-trim false) 904 | (try (parse-fn) (finally (set-opt :auto-trim at))))) 905 | 906 | (defmacro no-trim_ 907 | "Creates and returns a parse function that calls `no-trim` when it is 908 | invoked. The argument `parse-expr` is converted to a parse function 909 | and passed to `no-trim` in the returned function's body. See `no-trim`." 910 | [parse-expr] 911 | `#(no-trim (fn [] (eval ~parse-expr))) 912 | ) 913 | 914 | (defn no-trim-nl 915 | "Turns off automatic trimming of newline characters (as part of white-space) 916 | and executes the specified function. The earlier auto-trim options are restored 917 | at the end of execution of the specified function." 918 | [parse-fn] (let [wsre (get-opt :ws-regex)] 919 | (set-opt :ws-regex #"[ \t]+") 920 | (let [result (try (parse-fn) (finally (set-opt :ws-regex wsre)))] 921 | result))) 922 | 923 | (defmacro no-trim-nl_ 924 | "Creates and returns a parse function that calls `no-trim-nl` when it is 925 | invoked. The argument `parse-expr` is converted to a parse function 926 | and passed to `no-trim-nl` in the returned function's body. See `no-trim-nl`." 927 | [parse-expr] 928 | `#(no-trim-nl (fn [] (eval ~parse-expr))) 929 | ) 930 | 931 | (defn read-ws 932 | "Reads whitespace (including comments) using a whitespace reader based 933 | on parser options. If the :ws-reader option is not set, a default whitespace 934 | reader based on other parser options such as :ws-regex, :blk-cmt-delim and 935 | :line-cmt-start will be used. Returns the whitespace read." 936 | [] (let [wr (get-opt :ws-reader) 937 | wr (if (nil? wr) (get-opt :default-ws-reader) wr)] 938 | (wr))) 939 | 940 | (defn auto-trim-if 941 | "Automatically trim the leading input text if :auto-trim option is set to true." 942 | [] (if (get-opt :auto-trim) (read-ws)) 943 | nil) 944 | 945 | 946 | (defn- match-text 947 | "Returns the text matched by the previous parser operation." 948 | [] 949 | (let [[s c p _] (state)] (if (> c p) (subs s p c)))) 950 | 951 | (defn- remaining [] (- (count (get-input)) (get-pos))) 952 | 953 | (defn- next-n [n] 954 | (let [t (next-text)] 955 | (subs t 0 (min n (.length t))))) 956 | 957 | (defn- to-set [coll] (if (set? coll) coll (apply hash-set coll))) 958 | 959 | ; stops at first match instead of taking first of filter 960 | (defn- find-first 961 | [item-or-result pred coll] 962 | (loop [fst (first coll) 963 | rst (rest coll)] 964 | (if fst (if-let [result (pred fst)] 965 | (if (= item-or-result :item) fst result) 966 | (recur (first rst) (rest rst)))))) 967 | 968 | ; -------------------------------------------------------------- 969 | ; Sample Infix Expression Parser 970 | 971 | (defn get-default-ops [] 972 | [[:unary ["!" "~"]] 973 | [:left ["*" "/" "%"]] 974 | [:left ["+" "-"]] 975 | [:left ["<<" ">>" ">>>"]] 976 | [:left ["<" "<=" ">" ">="]] 977 | [:left ["==" "!="]] 978 | [:left ["&"]] 979 | [:left ["^"]] 980 | [:left ["|"]] 981 | [:left ["&&"]] 982 | [:left ["||"]] 983 | [:right [ "=" "+=" "-=" "*=" "/=" "%=" "&=" 984 | "^=" "|=" "<<=" ">>=" "<<<="]]]) 985 | 986 | (defn get-default-op-fn-map [] 987 | {"!" not, "~" bit-not, "*" *, "/" /, "+" +, "-" -, "<<" bit-shift-left, 988 | ">>" bit-shift-right "<" <, "<=" <=, ">" >, ">=" >=, "==" ==, 989 | "!=" #(not (== %1 %2)),"&" bit-and, "^" bit-xor, "|" bit-or, 990 | "&&" (macro->fn and), "||" (macro->fn or)}) 991 | 992 | (defn- ops->map [operators] 993 | (reduce #(merge %1 %2) {} 994 | (for [i (range (count operators))] 995 | (loop [m {} j 0] 996 | (let [l-ops (operators i) desc (l-ops 0) op-names (l-ops 1)] 997 | (if (< j (count op-names)) 998 | (recur (assoc m (op-names j) [desc i]) (inc j)) 999 | m)))))) 1000 | 1001 | (defn- get-all-ops [operators] 1002 | (reduce #(concat %1 (get %2 1)) [] operators)) 1003 | 1004 | (defn op-map [] (get-opt :op-map)) 1005 | 1006 | (defn all-ops [] (get-opt :all-ops)) 1007 | 1008 | (defn init-operators [operators] 1009 | (set-opt :op-map (ops->map operators)) 1010 | (set-opt :all-ops (get-all-ops operators))) 1011 | 1012 | (defn- op? [op] (get (op-map) op)) 1013 | 1014 | (defn- unary? [op] (= :unary ((get (op-map) op [""]) 0))) 1015 | 1016 | (defn- prec [op] ((get (op-map) op) 1)) 1017 | 1018 | (defn- left-assoc? [op] (= :left ((get (op-map) op) 0))) 1019 | 1020 | (defn has-priority? [op1 op2] 1021 | (if (< (prec op1) (prec op2)) ;op1 has higher precedence (smaller #) 1022 | true 1023 | (if (> (prec op1) (prec op2)) 1024 | false 1025 | (left-assoc? op1)))) 1026 | 1027 | (defn operator [] (string-in (all-ops))) 1028 | 1029 | (defn- unary-operator [] 1030 | (let [op (operator)] 1031 | (if (unary? op) op (throw-ex (unexpected op "unary operator or expression"))))) 1032 | 1033 | (defn- binary-operator [] 1034 | (let [op (operator)] 1035 | (if-not (unary? op) op (throw-ex (unexpected op "binary operator"))))) 1036 | 1037 | (declare paren-expr) 1038 | 1039 | (defn- factor [] 1040 | (let [uop (attempt unary-operator) 1041 | fac (if (starts-with? "(") (paren-expr) (number))] 1042 | (if uop [uop fac] fac))) 1043 | 1044 | (declare expr) 1045 | 1046 | (defn paren-expr [] 1047 | (chr \() 1048 | (let [e (expr)] (chr \)) e)) 1049 | 1050 | (declare build-tree) 1051 | 1052 | (defn expr 1053 | "Parses expressions and returns the parse tree as nested vectors." 1054 | [] (if-let [fac (attempt factor)] 1055 | (if-let [rhs (multi* #(series binary-operator factor))] 1056 | (-> (into [fac] (apply concat rhs)) build-tree ) 1057 | fac))) 1058 | 1059 | (defn build-tree [nodes] 1060 | (loop [i 0 tree []] 1061 | (if (< i (count nodes)) 1062 | (let [node (nodes i) 1063 | nc (count tree) 1064 | n0 (get tree 0) n1 (get tree 1) n2 (get tree 2)] 1065 | (case nc 1066 | 0 (recur (inc i) [node]) 1067 | 1 (recur (inc i) [node n0]) 1068 | 2 (recur (inc i) [n0 n1 node]) 1069 | 3 (recur (inc i) (if (odd? i) ; operator 1070 | (if (has-priority? n0 node) 1071 | [node tree] 1072 | [n0 n1 [node n2]]) 1073 | [n0 n1 [(n2 0) (n2 1) node]])))) 1074 | tree))) 1075 | 1076 | (defn- get-op-fn [op] 1077 | (let [of-map (get-opt :op-fn-map)] 1078 | (if-let [fn (get of-map op)] fn (throw-ex (str op " not implemented"))))) 1079 | 1080 | (defn eval-expr-tree 1081 | "Evaluates the parse tree returned by expr parse function." 1082 | [ptree] 1083 | (letfn [(val [i] (-> (ptree i) eval-expr-tree))] 1084 | (if (coll? ptree) 1085 | (case (count ptree) 1086 | 2 ((get-op-fn (ptree 0)) (val 1)) 1087 | 3 ((get-op-fn (ptree 0)) (val 1) (val 2))) 1088 | ptree))) 1089 | 1090 | (defn eval-expr 1091 | "Parses and evaluates an expression in infix notation. 1092 | Args: expression-string followed by parser options. See parse function for details." 1093 | [& args] 1094 | (apply parse (list* #(-> (expr) eval-expr-tree) args))) 1095 | 1096 | -------------------------------------------------------------------------------- /bin/protoflex/parse.clj: -------------------------------------------------------------------------------- 1 | ;Copyright (c) Protoflex Software. All rights reserved. 2 | ;The use and distribution terms for this software are covered by the 3 | ;Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) 4 | ;which can be found in the file epl-v10.html at the root of this distribution. 5 | ;By using this software in any fashion, you are agreeing to be bound by 6 | ;the terms of this license. 7 | ;You must not remove this notice, or any other, from this software. 8 | 9 | (ns ^{ :doc "Clojure Parser Library." :author "Panduranga Adusumilli"} 10 | protoflex.parse 11 | (:refer-clojure :exclude [replace reverse]) 12 | (:import [java.util.regex Matcher]) 13 | (:import [clojure.lang ExceptionInfo]) 14 | (:use [protoflex.util]) 15 | (:require [clojure.string])) 16 | 17 | (declare parser-init set-pos throw-ex starts-with? move read-ch read-ch-in-set 18 | qstring unexpected next-n regex to-set read-re next-text 19 | init-default-ws-reader match-text find-first auto-trim-if get-opt 20 | mark-pos back-to-mark any-string dq-str sq-str read-to-re at-end? 21 | no-trim no-trim-nl get-opts set-opt get-default-ops 22 | get-default-op-fn-map init-operators read-ws string expect 23 | line-pos-str cursor-pos with-opts read-to-re-or-eof to-eof) 24 | 25 | (def ^:dynamic *parser-state* (atom [])) 26 | (defn state [] (deref *parser-state*)) 27 | 28 | (def default-options { 29 | :blk-cmt-delim ["/*" "*/"] 30 | :line-cmt-start "//" 31 | :ws-regex #"\s+" 32 | :auto-trim true 33 | :word-regex #"\S+" 34 | :ident-regex #"[a-zA-Z_][a-zA-Z0-9_]*" 35 | :eof true }) 36 | 37 | (defn parse 38 | "This function triggers off the parsing of the provided input string using 39 | the specified parse function. The following parser options may be provided 40 | to alter the default behavior of the parser: 41 | :blk-cmt-delim - vector specifying start and end of block-comment markers 42 | :line-cmt-start - string specifying the begin marker of a line comment 43 | :ws-regex - regular expression for matching (non-comment) white space 44 | :auto-trim - whether to automatically remove the leading whitespace/comments 45 | at the current position in the input text or immediately after a parse action. 46 | :word-regex - regular expression for matching words 47 | :operators - a vector of vector of operators in the decreasing order of 48 | precedence; see get-default-ops function for an example. 49 | :op-fn-map - a map of operator and the function to call for that operator when 50 | evaluating expressions 51 | :eof - if true, the parse function must consume the entire input text 52 | 53 | Args: 54 | parse-fn - parse function to apply 55 | input-str - input text to be parsed 56 | opts - key value options (listed above)" 57 | 58 | [parse-fn input-str & opts] 59 | (let [options (merge default-options (apply hash-map opts))] 60 | (binding [*parser-state* (parser-init input-str options) 61 | *ns* (find-ns 'protoflex.parse)] 62 | (init-operators (get options :operators (get-default-ops))) 63 | (set-opt :op-fn-map (get options :op-fn-map (get-default-op-fn-map))) 64 | (auto-trim-if) 65 | (let [result (parse-fn)] 66 | (if (and (:eof options) (not (at-end?))) (throw-ex "Extraneous text") result))))) 67 | 68 | (defmacro parse_ 69 | "Similar to the `parse` function, but takes a parse expression instead of a 70 | parse function as its first argument. The parse expression is any clojure 71 | expression that performs parsing by calling built-in or custom parse 72 | functions. See the documentation for `parse`" 73 | [parse-expr input-str & opts] 74 | `(parse (fn [] (eval ~parse-expr)) ~input-str ~@opts) 75 | ) 76 | 77 | (defn attempt 78 | "Tries to match the input at the current position with the provided 79 | parse function. If the parse function matches successfully, the matched 80 | text is returned and the input cursor advances by the length of the 81 | matched text. Otherwise a nil is returned and the current position 82 | in the input remains unchanged." 83 | [parse-fn] 84 | (let [m (mark-pos)] 85 | (try (let [r (parse-fn)] r) 86 | (catch ExceptionInfo ex 87 | (when (:committed (ex-data ex)) (throw ex)) 88 | (back-to-mark m) nil)))) 89 | 90 | (defmacro attempt_ 91 | "Creates and returns a parse function that calls `attempt` when it is 92 | invoked. The argument `parse-expr` is converted to a parse function 93 | and passed to `attempt` in the returned function's body. See `attempt`." 94 | [parse-expr] 95 | `#(attempt (fn [] (eval ~parse-expr)))) 96 | 97 | (defn opt 98 | "Same as attempt, but accepts a default value argument to return in case the 99 | specified parse function fails. Useful for matching optional text." 100 | ([parse-fn] (attempt parse-fn)) 101 | ([parse-fn default-val] (let [v (attempt parse-fn)] (if v v default-val)))) 102 | 103 | (defmacro opt_ 104 | "Creates and returns a parse function that calls `opt` when it is 105 | invoked. The argument `parse-expr` is converted to a parse function 106 | and passed to `opt` in the returned function's body. See `opt`." 107 | ([parse-expr] `#(opt (fn [] (eval ~parse-expr)))) 108 | ([parse-expr default-val] `#(opt (fn [] (eval ~parse-expr)) ~default-val))) 109 | 110 | (defn any 111 | "Returns the result of the first successfully matching parse-function. 112 | If none of the parse functions match, an exception is thrown." 113 | [& parse-fns] 114 | (if-let [r (find-first :result #(attempt %) parse-fns)] 115 | r (throw-ex))) 116 | 117 | (defmacro ->fns 118 | [& exprs] 119 | `(map #(fn [] (eval %)) '~exprs)) 120 | 121 | (defmacro any_ 122 | "Creates and returns a parse function that calls `any` when it is 123 | invoked. The arguments `parse-exprs` are converted to parse functions 124 | and passed to `any` in the returned function's body. See `any`." 125 | [& parse-exprs] 126 | `#(apply any (->fns ~@parse-exprs))) 127 | 128 | (defn la-strs [la-pf-vec] 129 | (for [x (range (count la-pf-vec)) :when (even? x)] (get la-pf-vec x))) 130 | 131 | (defn- look-ahead-aux 132 | [[la pf & rest]] 133 | (if (starts-with? la) 134 | (pf) 135 | (if (> (count rest) 1) (recur rest) (throw-ex))) 136 | ) 137 | 138 | (defn look-ahead 139 | "Takes a collection of look-ahead-string and parse-function pairs and applies 140 | the first parse function that follows the matching look-ahead-string and 141 | returns the result, or throws a parse exception if the parse function fails. 142 | 143 | If none of the look-ahead strings match the current text, an exception is thrown. 144 | 145 | To specify a default parse function, provide an empty string as look-ahead and 146 | the default parse function at the end of the argument list. 147 | 148 | Args: [la-str-1 parse-fn-1 la-str-2 parse-fn-2 ...]" 149 | [la-pf-vec] 150 | (expect (la-strs la-pf-vec) #(look-ahead-aux la-pf-vec))) 151 | 152 | (defn- look-ahead*-aux 153 | [[la pf & rest]] 154 | (if (starts-with? la) 155 | (do (string la) (pf)) 156 | (if (> (count rest) 1) (recur rest) (throw-ex)))) 157 | 158 | (defn look-ahead* 159 | "Same as look-ahead, but consumes the matching look-ahead string before 160 | applying the corresponding parse function. " 161 | [la-pf-vec] 162 | (expect (la-strs la-pf-vec) #(look-ahead*-aux la-pf-vec))) 163 | 164 | (defn series 165 | "Applies a sequence of parse functions and returns their results in 166 | a vector. Each successfull match by the parse function advances the cursor. 167 | If any of the parse functions fails, an exception is thrown." 168 | [& parse-fns] 169 | ;(map #(%) parse-fns) 170 | ; just applying map doesn't get the correct dynamically rebound var values. 171 | ; doall & bound-fn don't seem to work; clojure bug? 172 | (letfn [(apply-fseq [fns] 173 | (loop [fst (first fns) 174 | rst (rest fns) 175 | result []] 176 | (if fst (recur (first rst) (rest rst) (conj result (fst))) 177 | result)))] 178 | (apply-fseq parse-fns))) 179 | 180 | (defmacro series_ 181 | "Creates and returns a parse function that calls `series` when it is 182 | invoked. The arguments `parse-exprs` are converted to parse functions 183 | and passed to `series` in the returned function's body. See `series`." 184 | [& parse-exprs] 185 | `#(apply series (->fns ~@parse-exprs))) 186 | 187 | (defn multi* 188 | "Matches zero or more occurrences of text accepted by the provided parse 189 | function and returns the results in a vector." 190 | [parse-fn] 191 | (loop [rv []] 192 | (if-let [r (attempt parse-fn)] 193 | (recur (conj rv r)) 194 | (if (pos? (count rv)) rv)))) 195 | 196 | 197 | (defmacro multi*_ 198 | "Creates and returns a parse function that calls `multi*` when it is 199 | invoked. The argument `parse-expr` is converted to a parse function 200 | and passed to `multi*` in the returned function's body. See `multi*`." 201 | [parse-expr] 202 | `#(multi* (fn [] (eval ~parse-expr)))) 203 | 204 | (defn multi+ 205 | "Matches one or more occurrences of text accepted by the provided parse 206 | function and returns the results in a vector. If the parse function doesn't 207 | match even once, an exception is thrown." 208 | [parse-fn] 209 | (if-let [rv (multi* parse-fn)] rv (throw-ex))) 210 | 211 | (defmacro multi+_ 212 | "Creates and returns a parse function that calls `multi+` when it is 213 | invoked. The argument `parse-expr` is converted to a parse function 214 | and passed to `multi+` in the returned function's body. See `multi+`." 215 | [parse-expr] 216 | `#(multi+ (fn [] (eval ~parse-expr)))) 217 | 218 | (defn times 219 | "Applies the provided parse function exactly n times and returns the 220 | results of applications of the function in a vector." 221 | [n parse-fn] 222 | (vec (for [i (range n)] (parse-fn)))) 223 | 224 | (defmacro times_ 225 | "Creates and returns a parse function that calls `times` when it is 226 | invoked. The argument `parse-expr` is converted to a parse function 227 | and passed to `times` in the returned function's body. See `times`." 228 | [n parse-expr] 229 | `#(times ~n (fn [] (eval ~parse-expr)))) 230 | 231 | (defn lexeme 232 | "Applies the specified parse function for current input text, consumes any 233 | following whitespace, comments and returns the result of the parse function 234 | application." 235 | [parse-fn] 236 | (let [r (parse-fn)] (read-ws) r)) 237 | 238 | (defmacro lexeme_ 239 | "Creates and returns a parse function that calls `lexeme` when it is 240 | invoked. The argument `parse-expr` is converted to a parse function 241 | and passed to `lexeme` in the returned function's body. See `lexeme`." 242 | [parse-expr] 243 | `#(lexeme (fn [] (eval ~parse-expr)))) 244 | 245 | (defn- dq [x] (str \" x \")) 246 | 247 | (defn- one-of-msg [x] (str "one of [" 248 | (->> (map dq x) (clojure.string/join ", ")) "]")) 249 | 250 | (defn- exp-msg [x] (if (coll? x) (one-of-msg x) (dq x))) 251 | 252 | (defn expect 253 | "Customize error message; if the specified parse function doesn't match 254 | the current input text, the error message of the parse exception will include 255 | the specified custom expected-message." 256 | [expected-msg parse-fn] 257 | (try (parse-fn) 258 | (catch ExceptionInfo ex 259 | (if (:unexpected (ex-data ex)) (throw ex) 260 | (throw-ex (unexpected expected-msg) {:unexpected true}))))) 261 | 262 | (defmacro expect_ 263 | "Creates and returns a parse function that calls `expect` when it is 264 | invoked. The argument `parse-expr` is converted to a parse function 265 | and passed to `expect` in the returned function's body. See `expect`." 266 | [expected-msg parse-expr] 267 | `#(expect ~expected-msg (fn [] (eval ~parse-expr)))) 268 | 269 | (defn- has-parse-error-msg? [^Exception ex] 270 | (let [m (.getMessage ex)] 271 | (and m (>= (.indexOf m "Parse Error") 0)))) 272 | 273 | (defn throw-ex 274 | "Throws an exception of ExceptionInfo class; this is usually called to 275 | indicate a match failure in a parse function." 276 | ([] (throw-ex "" {} nil)) 277 | ([msg] (throw-ex msg {} nil)) 278 | ([msg map] (throw-ex msg map nil)) 279 | ([msg map cause] 280 | (let [pos (line-pos-str (or (:pos map) (cursor-pos))) 281 | msg (if (:msg-final map) msg (str "Parse Error: " msg " at " pos)) 282 | map (assoc map :msg-final true)] 283 | (throw (ex-info msg map cause))))) 284 | 285 | (defn unexpected 286 | "Creates a message string for unexpected input exception." 287 | ([expected] (unexpected (next-n 10) expected)) 288 | ([actual expected] 289 | (str "Unexpected input: \"" actual "\"; Expecting " 290 | (exp-msg expected)))) 291 | 292 | (defn- mchar [ch is-no-auto-trim] 293 | (let [c (read-ch is-no-auto-trim)] 294 | (if (= ch c) c 295 | (do (move -1) (throw-ex (unexpected c ch)))))) 296 | 297 | (defn chr 298 | "If the next character in the input matches the specified character ch, 299 | returns it; otherwise throws an exception." 300 | [ch] (mchar ch false)) 301 | 302 | (defn chr- 303 | "Same as chr but with auto-trimming turned off for the following input" 304 | [ch] (mchar ch true)) 305 | 306 | (defn chr-in 307 | "If the next character in the input matches any character in the specified 308 | string or character collection, the matching character is returned. 309 | Otherwise throws an exception." 310 | [chars] (read-ch-in-set (to-set chars) false)) 311 | 312 | (defn chr-in- 313 | "Same as chr-in but with auto-trimming turned off for the following input" 314 | [chars] (read-ch-in-set (to-set chars) true)) 315 | 316 | (defn string 317 | "If the input matches the specified string, the string is 318 | returned. Otherwise, a parse exception is thrown." 319 | [^String s] 320 | (if (starts-with? s) (move (.length s)) 321 | (throw-ex (unexpected (next-n (.length s)) s)))) 322 | 323 | (defn string-in-ord 324 | "Returns the first string from the provided strings that matches text 325 | at the current position. Throws an exception if none of the strings match." 326 | [strings] 327 | (if-let [^String s (find-first :item #(starts-with? %) strings)] 328 | (do (move (.length s)) s) 329 | (throw-ex (unexpected strings)))) 330 | 331 | (defn string-in 332 | "Returns the longest string from the provided strings that matches text 333 | at the current position. Throws an exception if none of the strings match." 334 | [strings] 335 | (string-in-ord (sort-by count #(compare %2 %1) strings))) 336 | 337 | (defn word-in 338 | "Returns the first word from the provided words that matches text 339 | at the current position. Throws an exception if none of the words match. 340 | An optional word-reader parse-function may be provided to read words." 341 | ([str-coll] 342 | (word-in str-coll #(regex (get-opt :word-regex)))) 343 | ([str-coll word-reader] 344 | (let [wset (to-set str-coll) 345 | w (word-reader)] 346 | (if (wset w) w (throw-ex (unexpected w wset)))))) 347 | 348 | (defn word 349 | "Returns the specified word if the word occurs at the current position in 350 | the input text; an exception is thrown otherwise." 351 | [w] (word-in #{w})) 352 | 353 | (defn ^String ident 354 | "Reads an identifier at current input position using the ident-regex 355 | parser option. If id is specified, the read identifier must match the 356 | specified value; otherwise an exception is thrown." 357 | [] (regex (get-opt :ident-regex))) 358 | 359 | (defn ^String key-word 360 | "Reads an identifier at the current input position. If the read identifier 361 | matches the specified keyword kw, the same is returned; otherwise, an 362 | exception is thrown." 363 | [kw] (expect kw #(if (= kw (ident)) kw (throw-ex)))) 364 | 365 | (defn commit 366 | "Applies supplied parse-fn and if it fails, the failure gets reported as a 367 | 'committed' exception, which prevents the parser from trying alternatives at 368 | higher levels. On success, returns the result of parse-fn." 369 | [parse-fn] 370 | (try (parse-fn) 371 | (catch ExceptionInfo ex 372 | (throw-ex (.getMessage ex) (assoc (ex-data ex) :committed true))))) 373 | 374 | (defn commit-on 375 | "If the keyword kw occurs at the current position in the input text, parse-fn 376 | will be applied next. If parse-fn fails, it will get reported as a 'committed' 377 | exception, which prevents the parser from trying alternatives at higher levels. 378 | On sucess, returns the result of parse-fn." 379 | [kw parse-fn] 380 | (if (attempt #(key-word kw)) (commit parse-fn))) 381 | 382 | (defn with-follow 383 | "Applies parse-fn and follow-fn in sequence; Ignores the result of follow-fn and 384 | returns the result of parse-fn." 385 | [parse-fn follow-fn] (let [r (parse-fn), _ (follow-fn)] r)) 386 | 387 | (defn with-follow* 388 | "Similar to with-follow, but commits to follow-fn parse if parse-fn succeeds. 389 | Returns the result of parse-fn." 390 | [parse-fn follow-fn] (with-follow parse-fn #(commit follow-fn))) 391 | 392 | (defn with-no-follow 393 | "Applies parse-fn and follow-fn in sequence; This method succeeds only if the 394 | follow-fn fails to match. Returns the result of parse-fn." 395 | [parse-fn follow-fn] 396 | (let [r (parse-fn), f (attempt follow-fn)] 397 | (when f (throw-ex "Unexpected follow")) 398 | r)) 399 | 400 | (defn sep-by 401 | "Reads a record using the specified field, field-separator and 402 | record-separator parse functions. If no record-separator is specified, 403 | a newline character is used as record separator. Returns the fields of the 404 | record in a vector." 405 | ([fld-fn fld-sep-fn] 406 | (sep-by fld-fn fld-sep-fn #(regex #"\r?\n"))) 407 | 408 | ([fld-fn fld-sep-fn rec-sep-fn] 409 | (if-not (at-end?) 410 | (let [fst (fld-fn) 411 | rst (multi* #(series fld-sep-fn fld-fn)) 412 | _ (any rec-sep-fn at-end?) 413 | rst (if rst (map second rst)) 414 | result (if rst (vec(conj rst fst)) [fst])] 415 | result)))) 416 | 417 | 418 | (defn sep-rest [parse-fn sep-fn] 419 | (let [rst (multi* #(series sep-fn parse-fn))] 420 | (reduce (fn [a e] (conj a (e 1))) [] rst))) 421 | 422 | (defn sep-by* 423 | "Differs from sep-by in that it allows zero matches of parse-fn before stop-fn; 424 | Unlike sep-by, stop-fn must match -- not optional." 425 | [parse-fn sep-fn stop-fn] 426 | (if-let [fst (attempt parse-fn)] 427 | (let [rst (sep-rest parse-fn sep-fn), _ (stop-fn)] 428 | (into [fst] rst)) ; then 429 | (do (stop-fn) nil))) ; else 430 | 431 | (defn any-string 432 | "Reads a single-quoted or double-quoted or a plain-string that is followed 433 | by the specified separator sep or EOF; the separator is not part of the returned 434 | string." 435 | [sep] (cond 436 | (starts-with? "\"") (dq-str) 437 | (starts-with? "'") (sq-str) 438 | :else (read-to-re-or-eof (re-pattern (str sep "|\r?\n"))))) 439 | 440 | (defn between 441 | "Applies the supplied start-fn, parse-fn and end-fn functions and returns 442 | the result of parse-fn. This is typically used to parse content enclosed by 443 | some delimiters on either side." 444 | [start-fn parse-fn end-fn] 445 | (let [res (series start-fn parse-fn end-fn)] (res 1))) 446 | 447 | (defn parens 448 | "Returns the result of applying specifed parse function to text that is 449 | in between the opening and closing parentheses '(' and ')'" 450 | [parse-fn] (between #(chr \() parse-fn #(chr \)))) 451 | 452 | (defn braces 453 | "Returns the result of applying specifed parse function to text that is 454 | in between the opening and closing braces '{' and '}'" 455 | [parse-fn] (between #(chr \{) parse-fn #(chr \}))) 456 | 457 | (defn sq-brackets 458 | "Returns the result of applying specifed parse function to text that is 459 | in between the opening and closing square brackets '[' and ']'" 460 | [parse-fn] (between #(chr \[) parse-fn #(chr \]))) 461 | 462 | (defn ang-brackets 463 | "Returns the result of applying specifed parse function to text that is 464 | in between the opening and closing angular brackets '<' and '>'" 465 | [parse-fn] (between #(chr \<) parse-fn #(chr \>))) 466 | 467 | (defn regex 468 | "Returns the text matched by the specified regex; If a group is specified, 469 | the returned text is for that group only. In either case, the cursor is 470 | advanced by the length of the entire matched text (group 0)" 471 | ([re] (regex re 0)) 472 | ([re grp] (read-re re grp))) 473 | 474 | (defn semi 475 | "Matches and returns a semi-colon character" 476 | [] (chr \;)) 477 | 478 | (defn comma 479 | "Matches and returns a comma character" 480 | [] (chr \,)) 481 | 482 | (defn dot 483 | "Matches and retuns a dot character" 484 | [] (chr \.)) 485 | 486 | (defn colon 487 | "Matches and returns a colon character" 488 | [] (chr \:)) 489 | 490 | (defn popen 491 | "Matches and returns an opening paranthesis character" 492 | [] (chr \()) 493 | 494 | (defn pclose 495 | "Matches and returns a closing paranthesis character" 496 | [] (chr \))) 497 | 498 | (defn bopen 499 | "Matches and returns an opening curly brace character" 500 | [] (chr \{)) 501 | 502 | (defn bclose 503 | "Matches and returns a closing curly brace character" 504 | [] (chr \})) 505 | 506 | (defn sqopen 507 | "Matches and returns an opening curly brace character" 508 | [] (chr \[)) 509 | 510 | (defn sqclose 511 | "Matches and returns a closing curly brace character" 512 | [] (chr \])) 513 | 514 | (defn aopen 515 | "Matches and returns an opening angular bracket character" 516 | [] (chr \<)) 517 | 518 | (defn aclose 519 | "Matches and returns a closing angular bracket character" 520 | [] (chr \>)) 521 | 522 | (defn equal 523 | "Matches and returns an equal character" 524 | [] (chr \=)) 525 | 526 | (defn integer 527 | "Parses a long integer value and returns a Long." 528 | [] (Long/parseLong (regex #"-?\d+(?!\w|\.)"))) 529 | 530 | (defn decimal 531 | "Parses a decimal value and returns a Double." 532 | [] (Double/parseDouble (regex #"-?\d+(\.\d+)?(?!\w|\.)"))) 533 | 534 | (defn number 535 | "Matches an integral or non-integral numeric value. While the function 536 | decimal also matches both integer and non-integer values, it always 537 | returns a Double; where as number returns Long for integers and Double 538 | for non-integers." 539 | [] (any integer decimal)) 540 | 541 | (defn sq-str 542 | "Parses a single-quoted string and returns the matched string (minus the quotes)" 543 | [] (qstring \' \\)) 544 | 545 | (defn dq-str 546 | "Parses a double-quoted string and returns the matched string (minus the quotes)" 547 | [] (qstring \" \\)) 548 | 549 | (defn- qstring [qchar esc] 550 | (let [sb (StringBuilder.)] 551 | (no-trim #(chr qchar)) ; read and ignore 552 | (loop [] 553 | (let [ch (read-ch true) 554 | is-esc (= ch esc) 555 | ch (if is-esc (read-ch) ch)] 556 | (if (and (= ch qchar) (not is-esc)) 557 | nil 558 | (do (.append sb ch) (recur))))) 559 | (auto-trim-if) 560 | (.toString sb))) 561 | 562 | (defn next-text ^String [] 563 | (let [[s c _ _] (state)] (subs s c))) 564 | 565 | (defn read-to 566 | "The parser skips to the position where the text contains the string 567 | specified by s. The string itself is not consumed, that is the cursor is 568 | positioned at the beginning of the match. If the specified string is not 569 | found, cursor position does not change and a parse exception is thrown." 570 | [^String s] 571 | (let [t (next-text) 572 | n (.indexOf t s)] 573 | (if (>= n 0) (move n) (throw-ex)))) 574 | 575 | (defn skip-over 576 | "Finds the specified string s in the input and skips over it. If the string 577 | is not found, a parse exception is thrown." 578 | [^String s] 579 | (let [s1 (read-to s) 580 | s2 (string s)] 581 | (str s1 s2))) 582 | 583 | (defn- ->re [re] (if (string? re) (re-pattern re) re)) 584 | 585 | (defn read-re 586 | "Reads the string matching the specified regular expression. If a match-group 587 | is specified, the corresponding text is returned; otherwise the entire 588 | matched text is returned." 589 | ([re] (read-re re 0)) 590 | ([re ^Integer grp] 591 | (let [t (next-text) 592 | ^Matcher rm (re-matcher (->re re) t)] 593 | (if (.lookingAt rm) 594 | (do (move (.end rm)) 595 | (.group rm grp)) 596 | (throw-ex))))) 597 | 598 | (defn read-to-re 599 | "Reads and returns text upto but not including the text matched by the 600 | specified regular expression. If the specified regular expression doesn't 601 | occur in the remaining input text, an exception is thrown." 602 | [re] 603 | (let [t (next-text) 604 | m (re-find (->re re) t)] 605 | (if (nil? m) 606 | (throw-ex) 607 | (let [^String ms (if (string? m) m (m 0)) 608 | i (.indexOf t ms)] 609 | (move i))))) 610 | 611 | (defn read-to-re-or-eof 612 | "If the specified regex matches in the remaining text, returns text upto the match; 613 | Otherwise returns all the remaining text and the input cursor is positioned at EOF." 614 | [re] 615 | (if-let [t (attempt #(read-to-re re))] t (to-eof))) 616 | 617 | (defn skip-over-re 618 | "Reads and returns text upto and including the text matched by the 619 | specified regular expression. If the specified regular expression doesn't 620 | occur in the remaining input text, an exception is thrown." 621 | [re] (let [s1 (read-to-re re) 622 | s2 (read-re re)] 623 | (str s1 s2))) 624 | 625 | (defn starts-with? 626 | "Returns a boolean value indicating whether the current input text matches 627 | the specified string." 628 | [^String s] (.startsWith (next-text) s)) 629 | 630 | (defn starts-with-re? 631 | "Returns a boolean value indicating whether the specified regular expression 632 | matches the input at the current position." 633 | [re] (->> (next-text) (re-matcher (->re re)) .lookingAt)) 634 | 635 | (defn read-n 636 | "Reads and returns an n-character string at the current position." 637 | ^String [n] 638 | (let [t (next-text)] 639 | (if (<= n (.length t)) 640 | (move n) 641 | (throw-ex "EOF")))) 642 | 643 | (defn read-ch 644 | "Reads and return the next input character. Throws an exception if the 645 | current position is at the end of the input." 646 | ([] (read-ch false)) 647 | ([is-no-auto-trim] 648 | (let [[^String s c _ _] (state)] 649 | (if (= c (.length s)) (throw-ex "EOF")) 650 | (if is-no-auto-trim (set-pos (inc c)) (move 1)) 651 | (.charAt s c)))) 652 | 653 | (defn read-ch-in-set 654 | "Reads and returns the next character if it matches any of the characters 655 | specified in the provided set. An exception is thrown otherwise. The 656 | optional is-no-auto-trim argument may be used to specify whether or not 657 | to apply auto-trim after reading the next character." 658 | ([char-set] (read-ch-in-set char-set false)) 659 | ([char-set is-no-auto-trim] 660 | (let [ch (read-ch is-no-auto-trim)] 661 | (if (char-set ch) ch 662 | (throw-ex (unexpected ch char-set)))))) 663 | 664 | (defn blk-cmt 665 | "Reads and returns a block comment as specified by the begin and end 666 | markers. Throws an exception if the specified block-comment doesn't 667 | occur at the current position." 668 | [beg end] 669 | (if beg 670 | (let [s1 (string beg) s2 (skip-over end)] (str s1 s2)))) 671 | 672 | (defn blk-cmt? 673 | "Similar to blk-cmt but returns a nil instead of throwing an exception 674 | in case of a match failure." 675 | [beg end] (attempt #(blk-cmt beg end))) 676 | 677 | (defn- to-eof 678 | "Reads and returns text from current position to the end of the input text." 679 | [] (let [t (next-text)] (move (.length t)))) 680 | 681 | (defn line-cmt 682 | "Reads and returns a line comment as specified by the begin marker. 683 | Throws an exception if the specified block-comment doesn't occur at the 684 | current position." 685 | [beg] 686 | (if beg 687 | (->> (with-trim-off (series #(string beg) #(regex #"[^\n]*\n?"))) 688 | (apply str)))) 689 | 690 | (defn line-cmt? 691 | "Similar to line-cmt but returns a nil instead of throwing an exception 692 | in case of a match failure." 693 | [beg] (attempt #(line-cmt beg))) 694 | 695 | (defn ws 696 | "Matches white space (including comments) at the current position. The 697 | optional parameters bcb, bce, lcb and wsre specify block-comment-begin, 698 | block-comment-end, line-comment-begin and white-space-regex respectively. 699 | If they are not specified here, the options set for the parser are used. 700 | Throws an exception if white space doesn't occur at the current position." 701 | ([] 702 | (let [opts (get-opts) 703 | blk (opts :blk-cmt-delim) 704 | lc (opts :line-cmt-start) 705 | wsre (opts :ws-regex)] 706 | (ws (blk 0) (blk 1) lc wsre))) 707 | 708 | ([bcb bce lcb wsre] 709 | (let [w (multi+ (fn [](any #(blk-cmt bcb bce) 710 | #(line-cmt lcb) 711 | #(regex wsre))))] 712 | (apply str w)))) 713 | 714 | (defn ws? 715 | "Similar to ws except that a nil value is returned instead of throwing 716 | an exception in case of a match failure." 717 | [& args] (attempt #(apply ws args))) 718 | 719 | (defn at-end? 720 | "Returns true if no more input is left to be read; false otherwise." 721 | [] (let [[^String s c _ _] (state)] (= (.length s) c))) 722 | 723 | (defn cursor-pos 724 | "Returns the current cursor position as a scalar" 725 | [] (let [[_ c _ _] (state)] c)) 726 | 727 | (defn line-column 728 | "Returns the line and column vector corresponding to the cursor in string s" 729 | [^String s cursor] 730 | (loop [i 0, nl-cnt 0, col 0] 731 | (if (= i cursor) [(inc nl-cnt) (inc col)] 732 | (if (== (int(.charAt s i)) (int \newline)) 733 | (recur (inc i) (inc nl-cnt) 0) 734 | (recur (inc i) nl-cnt (inc col)))))) 735 | 736 | (defn- line-pos* 737 | "Returns the line and column vector for the specified cursor position" 738 | [cursor] 739 | (let [[^String s _ _ _] (state)] 740 | (loop [i 0, nl-cnt 0, col 0] 741 | (if (= i cursor) [(inc nl-cnt) (inc col)] 742 | (if (== (int(.charAt s i)) (int \newline)) 743 | (recur (inc i) (inc nl-cnt) 0) 744 | (recur (inc i) nl-cnt (inc col))))))) 745 | 746 | (defn line-pos 747 | "Returns [line column] vector corresponding to the specified cursor position (or 748 | the current position if cursor is not specified) of the parser" 749 | ([] (line-pos* (cursor-pos))) 750 | ([cursor] (line-pos* cursor))) 751 | 752 | (defn line-pos-str 753 | "Returns line position in a descriptive string. If the cursor position is specified, 754 | the returned value corresponds to that position. Otherwise, the returned value 755 | corresponds to the current position." 756 | ([] (line-pos-str (cursor-pos))) 757 | ([cursor] (let [lp (line-pos cursor)] (str "line " (lp 0) ", col " (lp 1))))) 758 | 759 | ; ------------------ create & configure the parser ------------ 760 | 761 | (defn parser-init 762 | "Initializes the parser state with the specified input string and options." 763 | ([^String input-str] 764 | (parser-init input-str default-options)) 765 | 766 | ([^String input-str opts] 767 | (let [new-opts (init-default-ws-reader opts) 768 | st (atom [input-str 0 0 new-opts])] 769 | st))) ; [str pos prev-pos opts] 770 | 771 | (defn- init-default-ws-reader [opts] 772 | (let [bco (get opts :blk-cmt-delim) 773 | lcb (get opts :line-cmt-start) 774 | ws-re (get opts :ws-regex) 775 | wsr #(ws? (bco 0) (bco 1) lcb ws-re)] 776 | (assoc opts :default-ws-reader wsr))) 777 | 778 | (defn set-opts 779 | "Sets specified parser options" 780 | [opts] 781 | (swap! *parser-state* 782 | #(let [[s c p o] % 783 | o (merge o opts) 784 | o (merge o (init-default-ws-reader o))] 785 | [s c p o]))) 786 | 787 | (defn- reset-opts [opts] 788 | (swap! *parser-state* 789 | #(let [[s c p o] %] [s c p opts]))) 790 | 791 | (defn set-opt 792 | "Sets parser option k to value v" 793 | [k v] #_(swap! *parser-state* 794 | #(let [[s c p o] % 795 | o (-> o (assoc k v) (init-default-ws-reader))] 796 | [s c p (assoc o k v)])) 797 | (set-opts {k v})) 798 | 799 | (defn get-opts 800 | "Returns all parser options" 801 | [] (let [[_ _ _ o] (state)] o)) 802 | 803 | (defn get-opt 804 | "Returns the value for parser option k; if the optional default value 805 | parameter d is specified, its value is returned if the option k is not set 806 | in parser options." 807 | ([k] (get-opt k nil)) 808 | ([k d] (get (get-opts) k d))) 809 | 810 | (defn with-opts [opts parse-fn] 811 | "Sets the parser options specified in the opts map and applies the parse-fn 812 | function. Returns the result of applying parse-fn. 813 | 814 | Ensures that the original parse options are restored before the function exits." 815 | (let [orig-opts (get-opts)] 816 | (set-opts opts) 817 | (try (parse-fn) (finally (reset-opts orig-opts))))) 818 | 819 | (defn auto-trim-on 820 | "Turns on auto-trim feature that cleans trailing white-space, comments 821 | or whatever the custom ws-reader if any is spe" 822 | [] 823 | (set-opt :auto-trim true)) 824 | 825 | (defn auto-trim-off 826 | "Turns off the auto-trim option." 827 | [] (set-opt :auto-trim false)) 828 | 829 | (defn set-blk-cmt-opts 830 | "Sets block comment begin and end markers." 831 | [beg end] 832 | (set-opt :blk-cmt-delim [beg end])) 833 | 834 | (defn set-line-cmt-opts 835 | "Sets line comment begin marker." 836 | [beg] 837 | (set-opt :line-cmt-start beg)) 838 | 839 | (defn set-ws-regex 840 | "Sets the regular expression to be used for matching non-comment white-space." 841 | [ws-re] 842 | (set-opt :ws-regex ws-re)) 843 | 844 | (defn set-ws-reader 845 | "This sets the white-space parser to be used when auto-trim is set. 846 | If this is specified, it overrides the options set by set-blk-cmt-opts, 847 | set-line-cmt-opts and set-ws-regex options." 848 | [ws-reader] 849 | (set-opt :ws-reader ws-reader)) 850 | 851 | ;----------------------- misc utilities for parse functions ------------ 852 | (defn- get-input [] ((state) 0)) 853 | 854 | (defn- get-pos [] ((state) 1)) 855 | 856 | (defn- get-prev [] ((state) 2)) 857 | 858 | (defn get-opts [] ((state) 3)) 859 | 860 | (defn- set-pos 861 | ([pos] (set-pos pos (get-pos))) 862 | ([pos prev] 863 | (swap! *parser-state* #(let [[s _ _ o] %] [s pos (min prev pos) o])) 864 | nil)) 865 | ; prev-pos must never be greater than current-pos 866 | 867 | (defn mark-pos 868 | "Returns the current positional parameters of the parser." 869 | [] (let [[_ c p _] (state)] [c p])) 870 | 871 | (defn back-to-mark 872 | "Resets the positional parameters to a previously set mark." 873 | [mark] (set-pos (mark 0) (mark 1))) 874 | 875 | (defn- move [delta] 876 | (set-pos (+ (get-pos) delta)) 877 | (let [t (match-text)] 878 | (auto-trim-if) 879 | t)) 880 | 881 | ;------------------------------------------------------------------------ 882 | 883 | (defmacro with-trim-on 884 | "Executes the provided body with auto-trim option set to true. The earlier 885 | value of the auto-trim option is restored after executing the body." 886 | [& body] 887 | `(let [at# (get-opt :auto-trim)] 888 | (set-opt :auto-trim true) 889 | (let [ret# (try (do ~@body) (finally (set-opt :auto-trim at#)))] 890 | ret# ))) 891 | 892 | (defmacro with-trim-off 893 | "Executes the provided body with auto-trim option set to false. The earlier 894 | value of the auto-trim option is restored after executing the body." 895 | [& body] 896 | `(let [at# (get-opt :auto-trim)] 897 | (set-opt :auto-trim false) 898 | (let [ret# (try (do ~@body) (finally (set-opt :auto-trim at#)))] 899 | ret# ))) 900 | 901 | (defn no-trim 902 | "Similar to with-trim-off, but takes a function as a parameter instead of 903 | the body" 904 | [parse-fn] (with-trim-off (parse-fn))) 905 | 906 | (defmacro no-trim_ 907 | "Creates and returns a parse function that calls `no-trim` when it is 908 | invoked. The argument `parse-expr` is converted to a parse function 909 | and passed to `no-trim` in the returned function's body. See `no-trim`." 910 | [parse-expr] 911 | `#(no-trim (fn [] (eval ~parse-expr))) 912 | ) 913 | 914 | (defn no-trim-nl 915 | "Turns off automatic trimming of newline characters (as part of white-space) 916 | and executes the specified function. The earlier auto-trim options are restored 917 | at the end of execution of the specified function." 918 | [parse-fn] (let [wsre (get-opt :ws-regex)] 919 | (set-opt :ws-regex #"[ \t]+") 920 | (let [result (try (parse-fn) (finally (set-opt :ws-regex wsre)))] 921 | result))) 922 | 923 | (defmacro no-trim-nl_ 924 | "Creates and returns a parse function that calls `no-trim-nl` when it is 925 | invoked. The argument `parse-expr` is converted to a parse function 926 | and passed to `no-trim-nl` in the returned function's body. See `no-trim-nl`." 927 | [parse-expr] 928 | `#(no-trim-nl (fn [] (eval ~parse-expr))) 929 | ) 930 | 931 | (defn read-ws 932 | "Reads whitespace (including comments) using a whitespace reader based 933 | on parser options. If the :ws-reader option is not set, a default whitespace 934 | reader based on other parser options such as :ws-regex, :blk-cmt-delim and 935 | :line-cmt-start will be used. Returns the whitespace read." 936 | [] (let [wr (get-opt :ws-reader) 937 | wr (if (nil? wr) (get-opt :default-ws-reader) wr)] 938 | (wr))) 939 | 940 | (defn auto-trim-if 941 | "Automatically trim the leading input text if :auto-trim option is set to true." 942 | [] (if (get-opt :auto-trim) (read-ws)) 943 | nil) 944 | 945 | 946 | (defn- match-text 947 | "Returns the text matched by the previous parser operation." 948 | [] 949 | (let [[s c p _] (state)] (if (> c p) (subs s p c)))) 950 | 951 | (defn- remaining [] (- (count (get-input)) (get-pos))) 952 | 953 | (defn- next-n [n] 954 | (let [t (next-text)] 955 | (subs t 0 (min n (.length t))))) 956 | 957 | (defn- to-set [coll] (if (set? coll) coll (apply hash-set coll))) 958 | 959 | ; stops at first match instead of taking first of filter 960 | (defn- find-first 961 | [item-or-result pred coll] 962 | (loop [fst (first coll) 963 | rst (rest coll)] 964 | (if fst (if-let [result (pred fst)] 965 | (if (= item-or-result :item) fst result) 966 | (recur (first rst) (rest rst)))))) 967 | 968 | ; -------------------------------------------------------------- 969 | ; Sample Infix Expression Parser 970 | 971 | (defn get-default-ops [] 972 | [[:unary ["!" "~"]] 973 | [:left ["*" "/" "%"]] 974 | [:left ["+" "-"]] 975 | [:left ["<<" ">>" ">>>"]] 976 | [:left ["<" "<=" ">" ">="]] 977 | [:left ["==" "!="]] 978 | [:left ["&"]] 979 | [:left ["^"]] 980 | [:left ["|"]] 981 | [:left ["&&"]] 982 | [:left ["||"]] 983 | [:right [ "=" "+=" "-=" "*=" "/=" "%=" "&=" 984 | "^=" "|=" "<<=" ">>=" "<<<="]]]) 985 | 986 | (defn get-default-op-fn-map [] 987 | {"!" not, "~" bit-not, "*" *, "/" /, "+" +, "-" -, "<<" bit-shift-left, 988 | ">>" bit-shift-right "<" <, "<=" <=, ">" >, ">=" >=, "==" ==, 989 | "!=" #(not (== %1 %2)),"&" bit-and, "^" bit-xor, "|" bit-or, 990 | "&&" (macro->fn and), "||" (macro->fn or)}) 991 | 992 | (defn- ops->map [operators] 993 | (reduce #(merge %1 %2) {} 994 | (for [i (range (count operators))] 995 | (loop [m {} j 0] 996 | (let [l-ops (operators i) desc (l-ops 0) op-names (l-ops 1)] 997 | (if (< j (count op-names)) 998 | (recur (assoc m (op-names j) [desc i]) (inc j)) 999 | m)))))) 1000 | 1001 | (defn- get-all-ops [operators] 1002 | (reduce #(concat %1 (get %2 1)) [] operators)) 1003 | 1004 | (defn op-map [] (get-opt :op-map)) 1005 | 1006 | (defn all-ops [] (get-opt :all-ops)) 1007 | 1008 | (defn init-operators [operators] 1009 | (set-opt :op-map (ops->map operators)) 1010 | (set-opt :all-ops (get-all-ops operators))) 1011 | 1012 | (defn- op? [op] (get (op-map) op)) 1013 | 1014 | (defn- unary? [op] (= :unary ((get (op-map) op [""]) 0))) 1015 | 1016 | (defn- prec [op] ((get (op-map) op) 1)) 1017 | 1018 | (defn- left-assoc? [op] (= :left ((get (op-map) op) 0))) 1019 | 1020 | (defn has-priority? [op1 op2] 1021 | (if (< (prec op1) (prec op2)) ;op1 has higher precedence (smaller #) 1022 | true 1023 | (if (> (prec op1) (prec op2)) 1024 | false 1025 | (left-assoc? op1)))) 1026 | 1027 | (defn operator [] (string-in (all-ops))) 1028 | 1029 | (defn- unary-operator [] 1030 | (let [op (operator)] 1031 | (if (unary? op) op (throw-ex (unexpected op "unary operator or expression"))))) 1032 | 1033 | (defn- binary-operator [] 1034 | (let [op (operator)] 1035 | (if-not (unary? op) op (throw-ex (unexpected op "binary operator"))))) 1036 | 1037 | (declare paren-expr) 1038 | 1039 | (defn- factor [] 1040 | (let [uop (attempt unary-operator) 1041 | fac (if (starts-with? "(") (paren-expr) (number))] 1042 | (if uop [uop fac] fac))) 1043 | 1044 | (declare expr) 1045 | 1046 | (defn paren-expr [] 1047 | (chr \() 1048 | (let [e (expr)] (chr \)) e)) 1049 | 1050 | (declare build-tree) 1051 | 1052 | (defn expr 1053 | "Parses expressions and returns the parse tree as nested vectors." 1054 | [] (if-let [fac (attempt factor)] 1055 | (if-let [rhs (multi* #(series binary-operator factor))] 1056 | (-> (into [fac] (apply concat rhs)) build-tree ) 1057 | fac))) 1058 | 1059 | (defn build-tree [nodes] 1060 | (loop [i 0 tree []] 1061 | (if (< i (count nodes)) 1062 | (let [node (nodes i) 1063 | nc (count tree) 1064 | n0 (get tree 0) n1 (get tree 1) n2 (get tree 2)] 1065 | (case nc 1066 | 0 (recur (inc i) [node]) 1067 | 1 (recur (inc i) [node n0]) 1068 | 2 (recur (inc i) [n0 n1 node]) 1069 | 3 (recur (inc i) (if (odd? i) ; operator 1070 | (if (has-priority? n0 node) 1071 | [node tree] 1072 | [n0 n1 [node n2]]) 1073 | [n0 n1 [(n2 0) (n2 1) node]])))) 1074 | tree))) 1075 | 1076 | (defn- get-op-fn [op] 1077 | (let [of-map (get-opt :op-fn-map)] 1078 | (if-let [fn (get of-map op)] fn (throw-ex (str op " not implemented"))))) 1079 | 1080 | (defn eval-expr-tree 1081 | "Evaluates the parse tree returned by expr parse function." 1082 | [ptree] 1083 | (letfn [(val [i] (-> (ptree i) eval-expr-tree))] 1084 | (if (coll? ptree) 1085 | (case (count ptree) 1086 | 2 ((get-op-fn (ptree 0)) (val 1)) 1087 | 3 ((get-op-fn (ptree 0)) (val 1) (val 2))) 1088 | ptree))) 1089 | 1090 | (defn eval-expr 1091 | "Parses and evaluates an expression in infix notation. 1092 | Args: expression-string followed by parser options. See parse function for details." 1093 | [& args] 1094 | (apply parse (list* #(-> (expr) eval-expr-tree) args))) 1095 | 1096 | -------------------------------------------------------------------------------- /README.html: -------------------------------------------------------------------------------- 1 |
251 |

Parse-EZ : Clojure Parser Library

252 | 253 |

API Documentation

254 | 255 |

Parse-EZ is a parser library for Clojure programmers. It allows easy 256 | mixing of declarative and imperative styles and does not 257 | require any special constructs, macros, monads, etc. to write custom parsers. 258 | All the parsing is implemented using regular Clojure functions.

259 | 260 |

The library provides a number of 261 | parse functions and combinators and comes with a built-in customizable infix 262 | expression parser and evaluator. It allows the programmer to concisely specify 263 | the structure of input text using Clojure functions and easily build parse trees 264 | without having to step out of Clojure. Whether you are writing a parser 265 | for some well structured data or for data scraping or prototyping a new language, 266 | you can make use of this library to quickly create a parser.

267 | 268 |

Features

269 | 270 | 279 | 280 |

Usage

281 | 282 |

Installation

283 | 284 |

Just add Parse-EZ as a dependency to your lein project

285 |
[protoflex/parse-ez "0.4.2"]
286 | 
287 |

and run

288 |
lein deps
289 | 
290 |

A Taste of Parse-EZ

291 | 292 |

Here are a couple of sample parsers to give you a taste of the parser library.

293 | 294 |

CSV Parser

295 | 296 |

A CSV file contains multiple records, one-record per line, with field-values separated by a delimiter 297 | such as a comma or a tab. The field values may optionally be quoted either using a single or double 298 | quotes. When field-values are quoted, they may contain the field-delimiter characters, and in such 299 | cases they will not be treated as field separators.

300 | 301 |

First, let us define a parse function for parsing one-line of csv file:

302 |
(defn csv-1 [sep] 
303 |     (sep-by #(any-string sep) #(chr sep)))
304 | 
305 |

In the above function definition, we make use of the parse combinator sep-by 306 | which takes two arguments: the first one to read a field-value and the second 307 | one to read the separator. Here, we have used Clojure's anonymous function shortcuts to 308 | specify the desired behavior succinctly. The any-string function matches a single-quoted 309 | string or a double-quoted string or a plain-string that is followed by the specified separator 310 | sep. This is exactly the function that we need to read the field-value. The second argument 311 | provided to sep-by above uses the primitive parse function chr which succeeds only when 312 | the next character in the input matches its argument (sep parameter in this case). The csv-1 function returns the field values as a vector.

313 | 314 |

The sep-by function actually takes a third, optional argument as record-separator 315 | function with the default value of a function that matches a newline. We didn't 316 | pass the third argument above because the default behavior suits our purpose. 317 | Had the default behavior of sep-by been different, we would have written the 318 | above function as:

319 |
(defn csv-1 [sep] 
320 |     (sep-by #(any-string sep) #(chr sep) #(regex #"\r?\n")))
321 | 
322 |

Now that we have created a parse function to parse a single line of CSV 323 | file, let us write another parse function that parses the entire CSV file 324 | content and returns the result as a vector of vector of field values 325 | (one-vector per record/line). All we need to do is to repeatedly apply the 326 | above defined csv-1 function and the multi* parse combinator does 327 | just that.

328 | 329 |

Just one small but important detail: by default, Parse-EZ 330 | automatically trims whitespace after successfully applying a parse function. 331 | This means that the newline at the end of line would be consumed after reading 332 | the last field value and the sep-by would be unable to match the end-of-line 333 | which is the record-separator in this case. So, we will disable the newline 334 | trimming functionality using the no-trim combinator.

335 |
(defn csv [sep] 
336 |     (multi* (fn [] (no-trim #(csv-1 sep)))))
337 | 
338 |

Alternatively, you can express the above function a bit more easily using the macro versions of combinators introduced in Version 0.3.0 as follows:

339 |
(defn csv [sep] 
340 |     (multi* (no-trim_ (csv-1 sep))))
341 | 
342 |

Now, let us try out our csv parser. First let us define a couple of test 343 | strings containing a couple of records (lines) each. Note that the second 344 | string contains a comma inside the first cell (a quoted string).

345 |
user> (def s1 "1abc,def,ghi\n2jkl,mno,pqr\n")
346 | #'user/s1
347 | user> (def s2 "'1a,bc',def,ghi\n2jkl,mno,pqr\n")
348 | #'user/s2
349 | user> (parse #(csv \,) s1)
350 | [["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
351 | user> (parse #(csv \,) s2)
352 | [["1a,bc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
353 | user> 
354 | 
355 |

Well, all we had to do was to write two lines of Clojure code to implement the CSV parser. 356 | Let's add a bit more functionality: the CSV files may use a comma or a tab character to 357 | separate the field values. Let's say we don't know ahead of time which character 358 | a file uses as a separator and we want to detect the separator automatically. Note 359 | that both characters may occur in a data file, but only one acts as a field-separator -- that too 360 | only when it's not inside a quoted string.

361 | 362 |

Here is our strategy to detect the separator:

363 | 364 | 369 | 370 |

Here is the code:

371 |
(defn detect-sep []
372 |     (let [m (mark-pos)
373 |            s (attempt #(any dq-str sq-str))
374 |            s (if s s (no-trim #(read-to-re #",|\t")))
375 |            sep (read-ch)]
376 |        (back-to-mark m)
377 |        sep))
378 | 
379 |

Note how we used the mark-pos and back-to-mark Parse-EZ functions to 'unconsume' 380 | the consumed input.

381 | 382 |

The complete code for the sample CSV parser with the separator-detection functionality is 383 | listed below (you can find this in csv_parse.clj file under the examples directory.

384 |
(ns protoflex.examples.csv_parse
385 |   (:use [protoflex.parse]))
386 | 
387 | (declare detect-sep csv-1)
388 | 
389 | (defn csv
390 |   "Reads and returns one or more records as a vector of vector of field-values"
391 |   ([] (csv (no-trim #(detect-sep))))
392 |   ([sep] (multi* (fn [] (no-trim-nl #(csv-1 sep))))))
393 | 
394 | (defn csv-1
395 |   "Reads and returns the fields of one record (line)"
396 |   [sep] (sep-by #(any-string sep) #(chr sep)))
397 | 
398 | (defn detect-sep
399 |   "Detects the separator used in a csv file (a comma or a tab)"
400 |   [] (let [m (mark-pos)
401 |            s (attempt #(any dq-str sq-str))
402 |            s (if s s (no-trim #(read-to-re #",|\t")))
403 |            sep (read-ch)]
404 |        (back-to-mark m)
405 |        sep))
406 | 
407 |

Let's try out the new auto-detect functionality. Let us define two new test 408 | strings s3 and s4 that use tab character as field-separator.

409 |
user> (use 'protoflex.examples.csv_parse)
410 | nil
411 | user> (def s3 "1abc\tdef\tghi\n2jkl\tmno\tpqr\n")
412 | #'user/s3
413 | user> (def s4 "'1a\tbc'\tdef\tghi\n2jkl\tmno\tpqr\n")
414 | #'user/s4
415 | user> (parse csv s3)
416 | [["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
417 | user> (parse csv s4)
418 | [["1a\tbc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
419 | user> (parse csv s1)
420 | [["1abc" "def" "ghi"] ["2jkl" "mno" "pqr"]]
421 | user> 
422 | 
423 |

As you can see, this time we didn't specify what field-separator to use: the parser 424 | itself detected the field-separator character and used it, returning us the desired 425 | results.

426 | 427 |

XML Parser

428 | 429 |

Here is the listing of a sample XML parser implemented using Parse-EZ. You can find the 430 | source file in the examples directory. The parser returns a map containing keys and values 431 | for :tag, :attributes and :children for the root element. The value for :attributes key 432 | is itself another map containing attribute names and their values. The value for :children 433 | key is a vector (potentially empty) containing string content and/or maps for child elements.

434 |
(ns protoflex.examples.xml_parse
435 |   (:use [protoflex.parse]))
436 | 
437 | (declare pi prolog element attributes children-and-close cdata elem-or-text close-tag)
438 | 
439 | (defn parse-xml [xml-str]
440 |   (parse #(between prolog element pi) xml-str :blk-cmt-delim ["<!--" "-->"] :line-cmt-start nil))
441 | 
442 | (defn- pi [] (while (starts-with? "<?") (skip-over "?>")))
443 | 
444 | (defn- prolog [] (pi) (attempt  #(regex #"(?s)<!DOCTYPE([^<]+?>)|(.*?\]\s*>)")) (pi))
445 | 
446 |

The function parse-xml is the entry point that kicks off parsing of input xml string xml-str. It passes the between combinator to Parse-EZ's parse function. Here, the call to between returns the value returned by the element parse function, ignoring the content surrounding it (matched by prolog and pi functions). The block-comment delimiters are set to match XML's and the line-comment delimiter is cleared (by default these match Java comments).

447 | 448 |

The parse function pi is used to skip consecutive processing instructions by using the delimiters <? and ?>.

449 | 450 |

The parse function prolog is used to skip DTD declaration (if any) and also any surrounding processing instructions. Note that the regex used to match DTD declaration is only meant for illustration purposes. It isn't complete but will work in most cases.

451 |
(def name-start ":A-Z_a-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD")
452 | 
453 | (def name-char (str name-start "\\-.0-9\\xB7\\u0300-\\u036F\\u203F-\\u2040"))
454 | 
455 | (def name-re (-> (format "[%s][%s]*" name-start name-char) re-pattern))
456 | 
457 |

name-re is a regular expression that matches xml element and attribute names.

458 |
(defn element []
459 |   (let [tag (do (chr \<) (regex name-re))
460 |         attrs (attributes)
461 |         children (look-ahead* [
462 |                                ">" #(children-and-close tag)
463 |                                "/>" (fn [] [])])]
464 |     {:tag tag, :attributes attrs, :children children}))
465 | 
466 |

The element parse function matches an xml element and returns the tag, attribute list and children in a hash map. Note the usage of the look_ahead* combinator to handle both the cases -- with children and without children. If it sees a ">" after reading the attributes, the look-ahead* function calls the children-and-close parse function to read children and the element close tag. On the other hand, if it sees "/>" after the attributes, it calls the (almost) empty parse function that simply returns an empty list.

467 |
(defn attr []
468 |   (let [n (regex name-re) _ (chr \=)
469 |         v (any sq-str dq-str)]
470 |     [n v]))
471 | 
472 | (defn attributes [] (apply hash-map (flatten  (multi* attr))))
473 | 
474 |

The attr parse function matches a single attribute. The attribute value may be 475 | a single-quoted or double-quoted string. Note the usage of any parse combinator for this purpose.

476 | 477 |

The attributes parse function matches multiple attribute specifications by passing the attr parse function to multi* parse combinator.

478 |
(defn- children-and-close [tag]
479 |   (let [children (multi* #(between pi elem-or-text pi))]
480 |     (close-tag tag)
481 |     children))
482 | 
483 |

Each child item is read using the elem-or-text parse function while ignoring any surrounding processing instructions using the between combinator; the combinator multi* is used to read all the child items.

484 |
(defn- elem-or-text []
485 |   (look-ahead [
486 |                "<![CDATA[" cdata
487 |                "</" (fn [] nil)
488 |                "<" element
489 |                "" #(read-to "<")]))
490 | 
491 |

The look-ahead parse combinator is used to call different parse functions 492 | based on different lookahead strings. Note that the look-ahead function 493 | doesn't consume the lookahead string unlike the look-ahead* function used 494 | earlier (in the definition of element parse function).

495 |
(defn- cdata []
496 |   (string "<![CDATA[")
497 |   (let [txt (read-to "]]>")] (string "]]>") txt))
498 | 
499 | (defn- close-tag [tag]
500 |     (string (str "</" tag))
501 |     (chr \>))
502 | 
503 |

By now, it should be obvious what the above two functions do.

504 | 505 |

Well, an XML parser in under 50 lines. Let's try it with a few sample inputs:

506 |
user> (use 'protoflex.examples.xml_parse)
507 | nil
508 | user> (parse-xml "<abc>text</abc>")
509 | {:tag "abc", :attributes {}, :children ["text"]}
510 | user> (parse-xml "<abc a1=\"1\" a2=\"attr 2\">sample text</abc>")
511 | {:tag "abc", :attributes {"a1" "1", "a2" "attr2"}, :children ["sample text"]}
512 | user> (parse-xml "<abc a1=\"1\" a2=\"attr 2\"><def d1=\"99\">xxx</def></abc>")
513 | {:tag "abc", :attributes {"a1" "1", "a2" "attr2"}, :children [{:tag "def", :attributes {"d1" "99"}, :children ["xxx"]}]}
514 | user> 
515 | 
516 |

Comments and Whitespaces

517 | 518 |

By default, Parse-EZ automatically handles comments and whitespaces. This 519 | behavior can be turned on or off temporarily using the macros with-trim-on 520 | and with-trim-off respectively. The parser option :auto-trim can be used to 521 | enable or disable the auto handling of whitespace and comments. Use the parser 522 | option :blk-cmt-delim to specify the begin and end delimiters for block 523 | comments. The parser option :line-cmt-start can be used to specify the line 524 | comment marker. By default, these options are set to java/C++ block and line 525 | comment markers respectively. You can alter the whitespace recognizer by setting 526 | the :ws-regex parser option. By default it is set to #"\s+".

527 | 528 |

Alternatively, you can turn off auto-handling of whitespace and comments and use 529 | the lexeme function which trims the whitespace/comments after application of the 530 | parse-function passed as its argument.

531 | 532 |

Also see the no-trim and no-trim-nl functions.

533 | 534 |

Primitive Parse Functions

535 | 536 |

Parse-EZ provides a number of primitive parse functions such as: chr, 537 | chr-in, string, string-in, word, word-in, sq-str, dq-str, 538 | any-string, regex, read-to, skip-over, read-re, read-to-re, 539 | skip-over-re, read-n, read-ch, read-ch-in-set, etc. 540 | See API Documentation

541 | 542 |

Let us try some of the builtin primitive parse functions:

543 |
user> (use 'protoflex.parse)
544 | nil
545 | user> (parse integer "12")
546 | 12
547 | user> (parse decimal "12.5")
548 | 12.5
549 | user> (parse #(chr \a) "a")
550 | \a
551 | user> (parse #(chr-in "abc") "b")
552 | \b
553 | user> (parse #(string-in ["abc" "def"]) "abc")
554 | "abc"
555 | user> (parse #(string-in ["abc" "def"]) "abcx")
556 | Parse Error: Extraneous text at line 1, col 4
557 |   [Thrown class java.lang.Exception]
558 | 
559 |

Note the parse error for the last parse call. By default, the parse function parses to the 560 | end of the input text. Even though the first 3 characters of the input text is recognized 561 | as valid input, a parse error is generated because the input cursor would not be at the 562 | end of input-text after recognizing "abc".

563 | 564 |

The parser option :eof can be set to false to allow recognition of partial input:

565 |
user> (parse #(string-in ["abc" "def"]) "abcx" :eof false)
566 | "abc"
567 | user> 
568 | 
569 |

You can start parsing by looking for some marker patterns using the read-to, 570 | read-to-re, skip-over, skip-over-re functions.

571 |
user> (parse #(do (skip-over ">>") (number)) "ignore upto this>> 456.7")
572 | 456.7
573 | 
574 |

Parse Combinators

575 | 576 |

Parse Combinators in Parse-EZ are higher-order functions that take other parse 577 | functions as input arguments and combine/apply them in different ways to 578 | implement new parse functionality. Parse-EZ provides parse combinators such as: 579 | opt, attempt, any, series, multi\*, multi+, between, look-ahead, lexeme, 580 | expect, etc. 581 | See API Documentation

582 | 583 |

Let us try some of the builtin parse combinators:

584 |
user> (parse #(opt integer) "abc" :eof false)
585 | nil
586 | user> (parse #(opt integer) "12")
587 | 12
588 | user> (parse #(any integer decimal) "12")
589 | 12
590 | user> (parse #(any integer decimal) "12.3")
591 | 12.3
592 | user> (parse #(series integer decimal integer) "3 4.2 6")
593 | [3 4.2 6]
594 | user> (parse #(multi* integer) "1 2 3 4")
595 | [1 2 3 4]
596 | user> (parse #(multi* (fn [] (string-in ["abc" "def"]))) "abcabcdefabc abcdef")
597 | ["abc" "abc" "def" "abc" "abc" "def"]
598 | user> 
599 | 
600 |

You can create your own parse functions on top of primitive parse-functions and/or 601 | parse combinators provided by Parse-EZ.

602 | 603 |

Committing to a particular parse branch

604 | 605 |

Version 0.4.0 added support for committing to a particular parse branch via 606 | the new parse combinators commit and commit-on. These functions make the 607 | parser commit to the current parse branch, making the parser report subsequent 608 | parse-failures in the current branch as parse-errors and preventing it 609 | from trying other alternatives at higher levels.

610 | 611 |

Nesting Parse Combinators Using Macros

612 | 613 |

Version 0.3.0 of Parse-EZ adds macro versions of parse combinator functions 614 | to make it easy to nest calls to parse combinators without having to write 615 | nested anonymous functions using the "(fn [] ...)" syntax. Note that Clojure 616 | does not allow nesting of anonymous functions of "#(...)" forms. Whereas 617 | the existing parse combinators take parse functions as arguments and actually 618 | perform parsing and return the parse results, the newly added macros take 619 | parse expressions as arguments and return parse functions (to be passed 620 | to other parse combinators). These macros are named the same as the 621 | corresponding parse combinators but with an underscore ("_") suffix. For example 622 | the macro version of "any" is named "any_".

623 | 624 |

Error Handling

625 | 626 |

Parse Errors are handled in Parse-EZ using Exceptions. The default error messages generated 627 | by Parse-EZ include line and column number information and in some cases what is expected 628 | at that location. However, you can provide your own custom error messages by using the 629 | expect parse combinator.

630 | 631 |

Expressions

632 | 633 |

Parse-EZ includes a customizable expression parser expr for parsing expressions in infix 634 | notation and an expression evaluator function eval-expr to evaluate infix expressions. 635 | You can customize the operators, their precedences and associative properties using 636 | :operators option to the parse function. For evaluating expressions, you can optionally 637 | specify the functions to invoke for each operator using the :op-fn-map option.

638 | 639 |

Parser State

640 | 641 |

The parser state consists of the input cursor and various parser options (specified or derived) 642 | such as those affecting whitespace and comment parsing, word recognizers, expression parsing, 643 | etc. The parser options can be changed any time in your own parse functions using set-opt.

644 | 645 |

Note that most of the parse functions affect Parser state (e.g: input cursor) and hence they are 646 | not pure functions. The side-effects could be avoided by making the Parser State an explicit 647 | parameter to all the parse functions and returning the changed Parser State along with the parse 648 | value from each of the parse functions. However, the result would be a significantly programmer 649 | unfriendly API. We made a design decision to keep the parse fuctions simple and easy to use 650 | than to fanatically keep the functions "pure".

651 | 652 |

Relation to Parsec

653 | 654 |

Parsec is a popular parser combinator library written in Haskell. While Parse-EZ 655 | makes use of some of the ideas in there, it is not a port of Parsec to Clojure.

656 | 657 |

License

658 | 659 |

Copyright (C) 2012 Protoflex Software

660 | 661 |

Distributed under the Eclipse Public License, the same as Clojure.

662 |
--------------------------------------------------------------------------------