├── .gitignore
├── rexxparse-test.asd
├── rexxparse.asd
├── LICENSE
├── package.lisp
├── test.lisp
├── README.md
└── rexxparse.lisp


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.FASL
 2 | *.fasl
 3 | *.lisp-temp
 4 | *.dfsl
 5 | *.pfsl
 6 | *.d64fsl
 7 | *.p64fsl
 8 | *.lx64fsl
 9 | *.lx32fsl
10 | *.dx64fsl
11 | *.dx32fsl
12 | *.fx64fsl
13 | *.fx32fsl
14 | *.sx64fsl
15 | *.sx32fsl
16 | *.wx64fsl
17 | *.wx32fsl
18 | 


--------------------------------------------------------------------------------
/rexxparse-test.asd:
--------------------------------------------------------------------------------
 1 | (in-package :cl-user)
 2 | 
 3 | (defpackage :rexxparse-test-asd
 4 |   (:use :cl :asdf))
 5 | 
 6 | (in-package :rexxparse-test-asd)
 7 | 
 8 | (defsystem :rexxparse-test
 9 |   :version "0.1.1"
10 |   :license "MIT"
11 |   :author "Dave Tenny"
12 |   :description "Tests for the :rexxparse package."
13 |   :depends-on (:rexxparse :fiveam)
14 |   :components ((:file "test"))
15 |   ;; Lets (asdf:test-system :rexxparse-test) run the suite.  SYMBOL-CALL looks
16 |   ;; the function up at run time, as its package doesn't exist when this is read.
17 |   :perform (test-op (o c) (uiop:symbol-call :rexxparse-test '#:run-tests)))
18 | 


--------------------------------------------------------------------------------
/rexxparse.asd:
--------------------------------------------------------------------------------
 1 | (in-package :cl-user)
 2 | 
 3 | (defpackage :rexxparse-asd
 4 |   (:use :cl :asdf))
 5 | 
 6 | (in-package :rexxparse-asd)
 7 | 
 8 | (defsystem :rexxparse
 9 |   :version "0.1.0"
10 |   :license "MIT"
11 |   :author "Dave Tenny"
12 |   :description "A trivial parsing tool inspired by the REXX PARSE construct."
13 |   ;;:bug-tracker "https://github.com/dtenny/rexxparse/issues"
14 |   ;;:source-control (:git "https://github.com/dtenny/rexxparse")
15 |   :depends-on (:alexandria :parse-float)
16 |   :serial t
17 |   :components ((:file "package")
18 |                (:file "rexxparse"))
19 |   ;; Route (asdf:test-system :rexxparse) to the separate test system.
20 |   :in-order-to ((test-op (test-op :rexxparse-test))))
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Jeffrey D. Tenny
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/package.lisp:
--------------------------------------------------------------------------------
 1 | (in-package :cl-user)
 2 | 
 3 | (defpackage :rexxparse
 4 |   (:use :cl)
 5 | 
 6 |   (:export 
 7 | 
 8 |    ;; The only thing you need most of the time.
 9 |    #:parse
10 |    ;; To change the default values of unmatched PARSE variables.
11 |    ;; Defaults to the empty string.
12 |    #:*unmatched-binding-value*
13 |    ;; To specify what kind of scanner you'd like to use for pattern literals
14 |    ;; if REXXPARSE doesn't give you the behavior you want.
15 |    #:*pattern->scanner*
16 |    ;; So that a custom scanner can call this for any pattern literals
17 |    ;; it does not support itself.  Note that defining
18 |    ;; new pattern->scanner methods is not supported.
19 |    #:pattern->scanner
20 |    ;; If you want to specify an alternative extraction behavior for matched
21 |    ;; vars.
22 |    #:*extractor*
23 |    ;; To change the default "space only" behavior of LTRIM, RTRIM, and TRIM transforms.
24 |    #:*trim-character-bag*
25 |    ;; If you want to specify alternative transforms for matched vars.
26 |    #:*options*)
27 | 
28 |   (:documentation "Provides a PARSE macro emulating the REXX programming language
29 | namesake with lexical bindings and extensible 'template' (REXX nomenclature)
30 | capabilities."))
31 | 
32 | 


--------------------------------------------------------------------------------
/test.lisp:
--------------------------------------------------------------------------------
  1 | (in-package :cl-user)
  2 | 
  3 | (defpackage :rexxparse-test
  4 |   (:use :cl :rexxparse :fiveam)
  5 |   (:export #:run-tests)
  6 |   (:documentation "Tests for the :rexxparse package."))
  7 | 
  8 | (in-package :rexxparse-test)
  9 | 
 10 | (def-suite test-suite :description ":rexxparse tests")
 11 | (in-suite test-suite)
 12 | 
 13 | ;; Spaces in these text values are significant: the tests check exact whitespace.
 14 | (defvar *text* 
 15 |   "2024/02/23 17:35:42.022 -  unable to locate '/usr/local/examples/' directory")
 16 | (defvar *text2* "This is  the text which, I think,  is scanned.")
 17 | 
 18 | (test string-patterns-and-basics
 19 |   "Test PARSE with string patterns and basic REXX PARSE edge cases."
 20 | 
 21 |   ;; An empty string pattern is never found; it always matches the end of the source string.
 22 |   (is (equalp '(" abc ") (parse " abc " (a ""))))
 23 |   (is (equalp '(" abc " "") (parse " abc " (a "" b))))
 24 | 
 25 |   (is (null (parse "abc" ())))
 26 |   (is (equalp '("abc" "") (parse "abc" (a b))))
 27 |   (let ((rexxparse:*UNMATCHED-BINDING-VALUE* nil))
 28 |     (is (equalp '("abc" "" nil) (parse "abc" (a b c)))))
 29 |   (is (equalp '("a" "b c") (parse "a b c" (a b))))
 30 |   (is (equalp '("a" "b") (parse "a b c" (a b " "))))
 31 |   (is (equalp '(" a b c ") (parse " a b c " (a))))
 32 |   (is (equalp '("a" "b c") (parse " a b c" (a b))))
 33 |   (is (equalp '("a" "b c") (parse " a b c" (" " a b))))
 34 |   (is (equalp '() (parse "a b c" (_))))
 35 |   (is (equalp '("a" "c") (parse "a b c" (a _ c))))
 36 |   (is (equalp '("a" "b c " " g") (parse "a b c x g" (a b "x" g))))
 37 | 
 38 |   ;; With one var, the whole string always matches, blanks included.
 39 |   (is (equalp '("   ") (parse "   " (stuff))))
 40 |   ;; with two vars, both bound to empty strings due to word splitting?
 41 |   (is (equalp '("" "") (parse "   " (a b))))
 42 |   (is (equalp '("" "" "") (parse "   " (a " " b c))))
 43 |   ;; This is pretty central to understanding splits, because the 
 44 |   ;; un-trimmed last binding principle applies _to each split_
 45 |   ;; Thus there's a split at ".", and "Q" is not trimmed.  However there's 
 46 |   ;; still the word-splitting blank elimination after "John".
 47 |   (is (equalp '("John" "     Q" "   Public")
 48 |               (parse "    John      Q.   Public" (fn init "." ln))))
 49 |   ;; I believe that for the next example, the blank is omitted from the 'is'
 50 |   ;; variable because of the leading space elimination because one of the
 51 |   ;; pattern goalposts is the implicit word-splitting pattern.
 52 |   (is (equalp '("Now" "is" "the time")
 53 |               (parse "Now  is the time" (now " " is the-time))))
 54 | 
 55 |   ;; OOREXX manual 42.2.2 Parsing strings into words
 56 |   ;; The word splitting is not the same as searching for " " in the pattern
 57 |   ;; which is why this test has this expected value
 58 |   ;; In this case the leading space for ' I think' on w2, is preserved
 59 |   ;; because the implicit non-word-splitting binding context applies.
 60 |   ;; Still guessing though.
 61 |   (is (equalp '("This is  the text which" " I think" "  is scanned.")
 62 |               (parse *text2* (w1 "," w2 "," w3))))
 63 |   (is (equalp '("This is  the text which" " I think" "  is scanned." "")
 64 |               (parse *text2* (w1 "," w2 "," w3 "," w4))))
 65 |   (is (equalp '("This" "is" "the" "text which") (parse *text2* (w1 w2 w3 w4 ","))))
 66 |   (is (equalp '("This" "is" "" "the text which")
 67 |               (parse *text2* (w1 " " w2 " " w3 " " w4 ","))))
 68 | 
 69 |   ;; Unmatched pattern: the template asks for a colon before the millis, but
 70 |   ;; the input has a period there.
 71 |   ;; When a match for a pattern cannot be found, it matches the end of the string.
 72 |   (is (equalp '("2024" "02" "23" "17" "35" 
 73 |                 "42.022 -  unable to locate '/usr/local/examples/' directory"
 74 |                 "" "")
 75 |               (parse *text* (year "/" month "/" day hours ":" minutes ":" seconds
 76 |                                   ":" millis "-" rest))))
 77 |   ;; Note the blank following 022.
 78 |   (is (equalp '("2024" "02" "23" "17" "35" "42" "022 "
 79 |                 "  unable to locate '/usr/local/examples/' directory")
 80 |               (parse *text* (year "/" month "/" day hours ":" minutes ":" seconds
 81 |                                   "." millis "-" rest))))
 82 | 
 83 |   ;; Source is an expression that evaluates to a string, not a string literal.
 84 |   (flet ((foo () "bar"))
 85 |     (is (equalp '("b" "r") (parse (foo) (b "a" r)))))
 86 | 
 87 |   ;; Source that is not a string signals a TYPE-ERROR.
 88 |   (signals type-error (parse #\c (a)))
 89 | 
 90 |   ;; Inappropriate use of keywords as variables, which are symbolp.
 91 |   ;; Trying to test this condition may be more trouble than it's worth...
 92 |   (signals error (macroexpand '(parse "a b" (:a))))
 93 | 
 94 |   ;; Explicit body: PARSE returns the body's value instead of the list of bindings.
 95 |   (is (eq 'xyz
 96 |           (let (a b c)
 97 |             (parse "a b c" (x y z)
 98 |               ;; just a couple of statements to mess around
 99 |               (setq a x b y c z)
100 |               (is (equalp '("a" "b" "c") (list a b c)))
101 |               'xyz))))
102 |   )
103 | 
104 | (test standard-options
105 |   ;; EQUAL compares strings case-sensitively, while EQUALP ignores case.
106 |   ;; These tests must respect case, so they use EQUAL; sanity-check that first.
107 |   (is (not (equal "a" "A")))
108 |   (is (not (equal '("a") '("A"))))
109 | 
110 |   ;; Test conditions raised during macro expansion
111 |   (signals error (macroexpand '(parse :upper :lower "A" (w))))
112 | 
113 |   (is (equal '("a b " " d")
114 |              (parse :lower "A b C d" (w "c" r))))
115 |   (is (equal '("a b c d" "")
116 |              (parse :lower "A b C d" (w "C" r))))
117 |   (is (equal '("A B " " D")
118 |              (parse :upper "A b C d" (w "C" r))))
119 |   (is (equal '("A B C D" "")
120 |              (parse :upper "A b C d" (w "c" r))))
121 |   (is (equal '("A b " " d")
122 |              (parse :caseless "A b C d" (w "c" r))))
123 |   )
124 | 
125 | (test positions
126 |   ;; Absolute positions
127 | 
128 |   ;;               1         2         3
129 |   ;;      1234567890123456789012345678901234
130 |   (is (equalp '("Brimfield    " "Massachusetts   " "10101")
131 |               (parse "Brimfield    Massachusetts   10101" (city 14 state 30 zip))))
132 | 
133 |   (is (equalp '(" ab" " a") (parse " abc " (ab "c" 1 c 3))))
134 | 
135 |   ;; The position 1 is similar to an empty string, it is never "found"
136 |   ;; (in the sense of a pattern being found).
137 |   (is (equalp '("abc" "abc") (parse "abc" (a 1 bc))))
138 |   (is (equalp '(" abc " " abc ") (parse " abc " (a 1 bc))))
139 |   (is (equalp '("a" "bc") (parse "abc" (a 2 bc))))
140 |   (let ((x 1))
141 |     (is (equalp '("abc" "abc") (parse "abc" (a (= x) bc))))
142 |     (is (equalp '(" abc " " abc ") (parse " abc " (a (= x) bc))))
143 |     (is (equalp '("a" "bc") (parse "abc" (a (= (+ x 1)) bc)))))
144 | 
145 |   (is (equalp '("st" "a" "r" "s")
146 |               (parse "astronomers" (2 st 4 1 a 2 4 r 5 11 s))))
147 | 
148 |   ;; Invalid/edge-case absolute positions
149 |   ;; A position too far to the left is treated like 1.
150 |   ;; A position too far to the right matches the whole (or remaining) string.
151 |   (is (equalp '(" abc ") (parse " abc " (-0 x -0))))
152 |   (is (equalp '(" abc ") (parse " abc " (-1 x -1))))
153 |   (is (equalp '(" abc ") (parse " abc " (0 x 0))))
154 |   (is (equalp '(" abc ") (parse " abc " (1 x 1))))
155 |   (is (equalp '(" abc") (parse " abc " (x 5))))
156 |   (is (equalp '(" abc" " ") (parse " abc " (x 5 x2))))
157 |   (is (equalp '(" abc ") (parse " abc " (x 6))))
158 |   (is (equalp '(" abc ") (parse " abc " (x 7))))
159 |   (is (equalp '(" abc " "") (parse " abc " (x 7 x2))))
160 | 
161 |   ;; Relative positions
162 |   (is (equalp '(" a" "abc ") (parse " abc " (x "b" (- 1) y))))
163 |   (is (equalp '(" a" " abc ") (parse " abc " (w1 "b" (- 5) w2)))); no such "match" for offset -5
164 | 
165 |   ;; template with "string" var <relative-positional> is a special case,
166 |   ;; var _includes_ string pattern which would normally be skipped
167 |   ;; (after some very similar non-relative positional matches for context/validation)
168 |   (is (equalp '(" c ") (parse " a b c " ("b" b))))     ; "b" not included, no relative positional
169 |   ;; '5' is effectively equal to the source scanning start position when its pseudo-pattern
170 |   ;; is matched, which means an empty string match, which is a break/tail-position behavior.
171 |   (is (equalp '(" c ") (parse " a b c " ("b" b 5))))   ; from end of "b" to 5 is empty, full break
172 |   (is (equalp '(" c")  (parse " a b c " ("b" b 7))))   ; space past "b" to 7, 2 chars
173 |   (is (equalp '("b") (parse " a b c " ("b" b (+ 1))))) ; from position of b to position+1
174 |   (is (equalp '("b c ") (parse " a b c " ("b" b (- 1))))) ; from position of b to end of string
175 |   (is (equalp '(" a" "bc " "abc ") (parse " abc " (x "b" y (- 1) z))))
176 | 
177 |   ;; This is the relative version of the absolute 'stars' position match above
178 |   (is (equalp '("st" "a" "r" "s")
179 |               (parse "astronomers" (2 st (+ 2) (- 3) a (+ 1) (+ 2) r (+ 1) (+ 6) s))))
180 |   
181 |   (is (equalp '("RE" "X" "X") (parse "REstructured eXtended eXecutor"
182 |                                      (v1 3 _ "X" v2 (+ 1) _ "X" v3 (+ 1) _))))
183 | 
184 |   ;; Various bounds cases: positions before the start or past the end of the source.
185 |   (is (equalp '("a" "b" "c") (parse "abc" (0 v1 2 v2 3 v3 4))))
186 |   (is (equalp '("a" "b" "c") (parse "abc" ((- 2) v1 (+ 1) v2 (+ 1) v3 (+ 3)))))
187 |   (is (equalp '("abc") (parse "abc" ((- 2) v1))))
188 |   (is (equalp '("") (parse "abc" ((+ 12) v1))))
189 |   (is (equalp '("") (parse "abc" (12 v1))))
190 |   (is (equalp '("abc") (parse "abc" (v1 (- 2)))))
191 |   (is (equalp '("abc") (parse "abc" (v1 (+ 12)))))
192 |   (is (equalp '("abc") (parse "abc" (v1 12))))
193 |   (is (equalp '("ab") (parse "abc" ("a" (+ 0) b 3))))
194 |   (is (equalp '("ab") (parse "abc" ("a" (- 0) b 3))))
195 |   )
196 | 
197 | (test length-positions
198 |   ;; Testing several things here.
199 |   ;; 1. Length positions vs relative positions, which are mostly the same but not always, 
200 |   ;; particularly for zero.
201 |   ;; 2. References to previous bindings later in the template
202 |   ;; 3. Implicit conversion of strings to integers for positional patterns.
203 | 
204 |   ;; Parsing with relative patterns only.  'middle' being the tricky bit.
205 |   (is (equalp '("Mark" "05Twain" "Twain" "05")
206 |               (parse "04Mark0005Twain" 
207 |                      (len (+ 2) first (+ len) len (+ 2) middle (+ len) len (+ 2) last (+ len))
208 |                      (list first middle last len))))
209 | 
210 |   ;; Parsing with positional length patterns.
211 |   (is (equalp '("Mark" "" "Twain" "05")
212 |               (parse "04Mark0005Twain" 
213 |                      (len (+ 2) first (> len) len (+ 2) middle (> len) len (+ 2) last (> len))
214 |                      (list first middle last len))))
215 | 
216 |   ;; Parsing with length patterns
217 |   (is (equalp '("5" "5.6789") 
218 |               (parse "12345.6789" ("." digit (< 1) rest))))
219 |   ;; Parsing with relative patterns
220 |   (is (equalp '("5" ".6789")
221 |               (parse "12345.6789" ("." (- 1) digit (+ 1) rest))))
222 | 
223 |   ;; For no particular reason other than it messed with my head at one point
224 |   (is (equalp '("a b" "a b") (parse "a b" (1 w1 (+ 0) w2))))
225 |   (is (equalp '("" "a b") (parse "a b" (1 w1 (> 0) w2))))
226 | 
227 |   ;; </> under/over flows
228 |   (is (equalp '("" "12345.6789") 
229 |               (parse "12345.6789" (1 digit (< 1) rest))))
230 |   (is (equalp '("12345" "12345.6789") 
231 |               (parse "12345.6789" ("." digit (< 5) rest))))
232 |   (is (equalp '("12345" "12345.6789") 
233 |               (parse "12345.6789" ("." digit (< 6) rest))))
234 |   (is (equalp '(".67" "89") 
235 |               (parse "12345.6789" ("." digit (> 3) rest))))
236 |   (is (equalp '(".678" "9") 
237 |               (parse "12345.6789" ("." digit (> 4) rest))))
238 |   (is (equalp '(".6789" "") 
239 |               (parse "12345.6789" ("." digit (> 5) rest))))
240 |   (is (equalp '(".6789" "") 
241 |               (parse "12345.6789" ("." digit (> 6) rest))))
242 |   )
243 | 
244 | (test var-reuse                         ; a repeated variable is returned once, with its final binding
245 |   (is (equalp '("b") (parse "abc" (w 2 w 3))))
246 |   (is (equalp '("") (parse "abc" (w w))))
247 |   (is (equalp '("") (parse "abc def" (w w w))))
248 |   )
249 | 
250 | (test variable-string-patterns          ; ($ x) uses the run-time value of X as the string pattern
251 |   (is (equalp '("the quick " " fox")
252 |               (let ((x "brown"))
253 |                 (parse "the quick brown fox" (start ($ x) end)))))
254 |   )
255 | 
256 | ;; As I intend to use this elsewhere (in some flavor for a clojure-like
257 | ;; `with-redefs`), I've made a few notes for future documentation.
258 | 
259 | (defmacro with-redef ((f-sym function) &body body)
260 |   "Temporarily install FUNCTION as the global function definition of F-SYM
261 | while BODY executes, then put things back the way they were.
262 | 
263 | F-SYM is a function name and is not evaluated; it may be a symbol or a
264 | (SETF name) list, and need not be FBOUNDP to start with.  FUNCTION is
265 | evaluated once, before anything is changed, and must yield an object of type
266 | FUNCTION, not some other function designator, so that it is compatible with
267 | (setf (fdefinition f-sym) <function>).
268 | 
269 | On exit from BODY, normal or non-local, the original definition is restored,
270 | or F-SYM is made FMAKUNBOUND if it had no definition on entry.
271 | 
272 | Note that this macro may have unsafe effects in a multi-threaded use of the
273 | symbol unless the caller arranges additional critical-section logic.
274 | Also note compiler transformations or inlining may result in surprises when
275 | it comes to redefining a function, as may compiled references that have
276 | previously nabbed the function object and which won't see changes made after
277 | the fact.
278 | 
279 | Warning: If the new function attempts to call the old function, make sure it
280 | isn't via the name being redefined.  E.g.
281 | 
282 |   ;; This will be an infinite loop or stack overflow
283 |   (with-redef (my-fun (lambda () ... stuff ... (funcall 'my-fun))))
284 | 
285 |   ;; This will work
286 |   (let ((old-fun #'my-fun))
287 |     (with-redef (my-fun (lambda () ... stuff ... (funcall old-fun)))))
288 | 
289 | Returns the value(s) returned by BODY."
290 |   ;; BODY is spliced into the expansion exactly once, in its own PROGN so an
291 |   ;; empty BODY still yields NIL.  The install happens inside the UNWIND-PROTECT
292 |   ;; so an unwind at any later point restores the original state.
293 |   ;; FDEFINITION rather than SYMBOL-FUNCTION so SETF function names work too.
294 |   ;; Note effect on generic functions/methods?
295 |   (let ((old (gensym "OLD"))
296 |         (fun (gensym "FUN")))
297 |     `(let ((,fun ,function))
298 |        (assert (typep ,fun 'cl:function))
299 |        ;; An FBOUNDP name always has a non-NIL definition, so NIL can stand
300 |        ;; for there being nothing to restore.
301 |        (let ((,old (and (fboundp ',f-sym) (fdefinition ',f-sym))))
302 |          (unwind-protect
303 |               (progn (setf (fdefinition ',f-sym) ,fun)
304 |                      (progn ,@body))
305 |            (if ,old
306 |                (setf (fdefinition ',f-sym) ,old)
307 |                (fmakunbound ',f-sym)))))))
308 | 
309 | ;; The redef logic used by this test doesn't work with ECL.  It works with SBCL, ACL,
310 | ;; ABCL, CCL, and Lispworks.  Basically the flet 'extractor' function used in the redef
311 | ;; is never called.  I haven't been able to make a simpler reproducible case yet.
312 | #-ECL
313 | (test no-useless-extractions
314 |   (let ((counter 0)
315 |         (old-extractor #'rexxparse::extract))
316 |     (flet ((extractor (&rest args) 
317 |              (incf counter)             ; count every call, then defer to the real EXTRACT
318 |              (apply old-extractor args)))
319 |       (with-redef (rexxparse::extract #'extractor)
320 |         (is (null (parse "abc" (_))))
321 |         (is (zerop counter))            ; the `_` placeholder triggers no extraction
322 |         (is (equalp '("abc") (parse "abc" (a))))
323 |         (is (= counter 1))
324 |         (is (equalp '("b") (parse "abc" (2 3 (- 1) c (+ 1)))))
325 |         (is (= counter 2))              ; one more call, for the single variable C
326 |         ))))
327 | 
328 | (test transforms
329 |   ;; Upper lower, first & last positions, tail positions
330 |   (is (equalp '("A" "b" "C") (parse "a b c" ((upper a) b (upper c)))))
331 |   (is (equalp '("a" "B" "c") (parse "a b c" (a (upper b) c))))
332 |   (is (equalp '("C D") (parse "a b c d" (_ _ (upper x)))))
333 |   (is (equalp '("a" "B" "c") (parse "A B C" ((lower a) b (lower c)))))
334 |   (is (equalp '("A" "b" "C") (parse "A B C" (a (lower b) c))))
335 |   (is (equalp '("c d") (parse "A B C D" (_ _ (lower x)))))
336 | 
337 |   ;; SNAKE, KEBAB, LTRIM, RTRIM, TRIM, *TRIM-CHARACTER-BAG*
338 |   (is (equalp '("kebab_case" "to_snake_case")
339 |               (parse "kebab-case to-snake-case" ((snake a) (snake b)))))
340 |   (is (equalp '("snake-case" "to-kebab-case")
341 |               (parse "snake_case to_kebab_case" ((kebab a) (kebab b))))) 
342 |   (is (equalp '("  ab" "  def  " "hi  ") (parse "  abc  def  ghi  " (a "c" d "g" g))))
343 |   (is (equalp '("ab" "def" "hi") (parse "  abc  def  ghi  " ((ltrim a) "c" (trim d) "g" (rtrim g)))))
344 | 
345 |   (let ((s (coerce (list #\space #\newline #\a #\b #\c #\space #\newline) 'string))
346 |         (s-with-newline (coerce (list #\newline #\a #\b #\c #\space #\newline) 'string))
347 |         (s-without-newline "abc"))
348 |     (is (equalp (list s-with-newline) (parse s ((trim x))))) ; default bag: only the leading space goes
349 |     (let ((*trim-character-bag* (list #\space #\newline)))
350 |       (is (equalp (list s-without-newline) (parse s ((trim x)))))))
351 | 
352 |   ;; INTEGER, FLOAT, DOUBLE, KEYWORD
353 |   (parse "1" ((integer one))
354 |          (is (integerp one))
355 |          (is (= 1 one)))
356 |   (parse "1.0 2.0" ((float one) (double two))
357 |          (is (typep one 'single-float))
358 |          (is (typep two 'double-float))
359 |          (is (= 1 (round one)))
360 |          (is (= 2 (round two))))
361 |   (signals error (parse "fred" ((integer x))))
362 |   (signals error (parse "fred" ((float x))))
363 |   (signals error (parse "fred" ((double x))))
364 | 
365 |   ;; Assumes the Lisp reader upcases symbol names, so :abc reads as the keyword "ABC".
366 |   (parse "abc" ((keyword x))
367 |          (is (keywordp x))
368 |          (is (not (eq :abc x)))
369 |          (is (eq :|abc| x)))
370 |   (parse :upper "abc" ((keyword x))
371 |          (is (keywordp x))
372 |          (is (eq :abc x)))
373 | 
374 |   ;; The TRANSFORM transform and various function designators
375 |   (let (y)
376 |     (flet ((saver (val) (setq y val)))
377 |       (is (equalp '("a b c") (parse "a b c" ((transform a #'saver)))))
378 |       (is (equalp "a b c" y))))
379 | 
380 |   ;; #'(lambda ...)
381 |   (let (y)
382 |     (is (equalp '("A B C") 
383 |                 (parse "a b c" ((transform a #'(lambda (x) (setq y (string-upcase x))))))))
384 |     (is (equalp "A B C" y)))
385 | 
386 |   ;; (lambda ...)
387 |   (let (y)
388 |     (is (equalp '("A B C") 
389 |                 (parse "a b c" ((transform a (lambda (x) (setq y (string-upcase x))))))))
390 |     (is (equalp "A B C" y)))
391 | 
392 |   (is (equalp '("A" "b") (parse "a b" ((transform a 'string-upcase) b))))
393 |   (is (equalp '("A" "b") (parse "a b" ((transform a #'string-upcase) b))))
394 | 
395 | 
396 |   ;; Bogus transforms signal an error at macroexpansion time.
397 |   (signals error (macroexpand '(parse "a b c" ((foo a)))))
398 |   (signals error (macroexpand '(parse "a b c" ((foo)))))
399 |   )
400 | 
401 | (test declarations                      ; a DECLARE heading the PARSE body may name the template variables
402 |   (is (equalp '("a" 2)
403 |               (parse "a 1" (a (integer one))
404 |                      (declare (string a) (integer one))
405 |                      (list a (1+ one))))))
406 | 
407 | (test using                             ; :USING assigns to existing variables, :USING-VECTOR pushes onto vectors
408 |   (let ((a 0))
409 |     (is (equalp '("a" "b") (parse :using (a) "a b" (a b))))
410 |     (is (equalp "a" a))
411 |     (is (equalp '("a" "b") (parse (:using a) "a b" (b a)
412 |                                   (list b a))))
413 |     (is (equalp "b" a)))
414 | 
415 |   (let ((v (make-array 4 :fill-pointer 0 :adjustable nil)))
416 |     (is (equalp '("a" "b") (parse :using-vector (v) "a b" (v b))))
417 |     (is (equalp #("a") v))
418 |     ;; Explicit body reference to V is going to return the vector, not what was matched
419 |     ;; and stuffed into the vector.  This is by design. User supplied body has full control
420 |     ;; (and responsibility for any mess that arises).
421 |     (is (equalp (list "a" v) (parse (:using-vector v) "a b" (a v) 
422 |                                     (list a v))))
423 |     (is (equalp #("a" "b") v))          ;accumulated across two parse invocations
424 |     (is (null (parse (:using-vector v) "c d" (v v) nil)))
425 |     (is (equalp #("a" "b" "c" "d") v))
426 |     ;; VECTOR-PUSH silently does nothing if the vector is full, so V is unchanged.
427 |     (is (equalp '("e") (parse (:using-vector v) "e" (v))))
428 |     (is (equalp #("a" "b" "c" "d") v)))
429 | 
430 |   ;; Vector and positional patterns
431 |   (let ((v (make-array 4 :fill-pointer 0)))
432 |     (is (equalp #("04" "Mark")
433 |                 (parse (:using-vector v) "04Mark" (1 v (+ 2) v (> (aref v 0))) v))))
434 | 
435 |   ;; Mix it up!
436 |   (let (a b (v1 (make-array 4 :fill-pointer 0)) (v2 (make-array 4 :fill-pointer 0)))
437 |     (is (equalp '("a" "b" "c" "d" "e")
438 |                 (parse (:using a b) (:using-vector v1 v2) "a b c d e" (a b v1 v2 e))))
439 |     (is (equalp "a" a))
440 |     (is (equalp "b" b))
441 |     (is (equalp #("c") v1))
442 |     (is (equalp #("d") v2))
443 |     (is (not (boundp 'e))))             ; E has no global (special) value after the parse
444 | 
445 |   ;; Negative tests.
446 | 
447 |   ;; Symbols may only be in :USING or :USING-VECTOR, not both
448 |   (signals error (macroexpand '(parse :using (a b e) :using-vector (b c d a) "abc" (a b c d e))))
449 | 
450 |   ;; _ may not be in :USING or :USING-VECTOR
451 |   (signals error (macroexpand '(parse :using (a _ c) "abc" (a c))))
452 |   (signals error (macroexpand '(parse :using-vector (a _ c) "abc" (a c))))
453 | 
454 |   ;; :USING-[VECTOR] - lists may be empty, but must contain (non-keyword) symbols if non-empty
455 |   (is (equalp '("a" "b") (parse :using () "a b" (a b))))
456 |   (is (equalp '("a" "b") (parse :using-vector () "a b" (a b))))
457 |   (signals error (macroexpand '(parse :using (:fred) "a b" (a b))))
458 |   (signals error (macroexpand '(parse :using-vector (1) (a b) "a b" (a b))))
459 |   )
460 | 
461 | (defun run-tests ()
462 |   "Run all :rexxparse tests and print FiveAM's report of the results.  Returns NIL."
463 |   (run! 'test-suite)                    ; RUN alone collects results but never explains them
464 |   nil)
465 | 
466 | ;;(trace rexxparse::match-and-extract rexxparse::scan-string rexxparse::scan-word-split rexxparse::extract rexxparse::extract-after-left-trim rexxparse::scan-absolute-position rexxparse::pattern->scanner rexxparse::scan-leftward-relative-position rexxparse::scan-rightward-relative-position REXXPARSE::SCAN-LEFTWARD-LENGTH-POSITION REXXPARSE::SCAN-RIGHTWARD-LENGTH-POSITION)
467 | 
468 | ;; Just a tool so I don't have to edit a 'foo.rexx' file and run rexx on it to compare
469 | ;; to what rexx does. Don't use it in tests, it would add a very undesirable dependency (OORexx)
470 | (defun trexx (source template &key debug)
471 |   "Try parsing SOURCE with TEMPLATE, both strings, in REXX, and return a list of
472 | bound variables the way PARSE would.  Period placeholders will not be in the results.
473 | TEMPLATE should be a string containing rexx-syntax template data.
474 | 
475 | E.g. (trexx \"abc\" \"v1 2 v2 3 . 4\") => (\"a\" \"b\")
476 | 
477 | SOURCE is written to the REXX script as a single-quoted literal; any single
478 | quotes it contains are doubled so that they reach REXX intact.  TEMPLATE is
479 | written verbatim.  If DEBUG is true, each line of REXX output is printed as
480 | it is read.
481 | 
482 | Assumes REXXPARSE:PARSE is working enough to parse the REXX output  :-)
483 | Assumes newlines for line separators.
484 | 
485 | Only works if 'rexx' is in your PATH.  Author tests with Open Object Rexx."
486 |   (let* ((quoted-source
487 |            ;; REXX spells a quote inside a quoted literal by doubling it.
488 |            ;; PRINC-TO-STRING gives the same rendering ~a did for non-strings.
489 |            (with-output-to-string (out)
490 |              (loop for ch across (princ-to-string source)
491 |                    do (when (char= ch #\')
492 |                         (write-char ch out))
493 |                       (write-char ch out))))
494 |          (rexx-output
495 |            (uiop/stream:call-with-temporary-file
496 |             (lambda (pathname)
497 |               (with-open-file (s pathname :direction :output :if-exists :supersede)
498 |                 (format s "trace i~%parse value '~a' with ~a~%" quoted-source template))
499 |               (with-output-to-string (stream)
500 |                 #+NIL(uiop:run-program (list "cat" (namestring pathname)) :output stream)
501 |                 ;; Both stdout and stderr are captured; the trace lines are picked out below.
502 |                 (uiop:run-program (list "rexx" (namestring pathname))
503 |                                   :error-output stream
504 |                                   :output stream)))
505 |             :want-pathname-p t :want-stream-p nil)))
506 |     ;; For (trexx "abc" "v1 2 v2")
507 |     ;; Looking for output lines like        >=>   V2 <= \"bc\"
508 |     ;; Report last binding if var bound multiple times.
509 |     (let ((rexxparse:*unmatched-binding-value* nil))
510 |       (with-input-from-string (stream rexx-output)
511 |         (loop with result = nil
512 |               for line = (if debug (print (read-line stream nil nil)) (read-line stream nil nil))
513 |               while line
514 |               do (parse line (">=>   " name " <= " val)
515 |                    (when val
516 |                      (parse val ("\"" v "\"")
517 |                         (setf result (acons name v result)))))
518 |               ;; RESULT is newest-first, so :FROM-END T keeps each name's latest binding.
519 |               finally (return (nreverse
520 |                                (mapcar #'cdr
521 |                                        (delete-duplicates result :key #'car :test #'equalp
522 |                                                                  :from-end t)))))))))
523 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
   1 | # TL;DR Purpose
   2 | 
   3 | A DSL to concisely scan/tokenize, extract, and transform semi-structured string data, and
   4 | bind the results to variables. Inspired by the REXX PARSE command.
   5 | 
   6 | Some simple if not particularly inspired examples:
   7 | 
   8 |     (parse "The quick brown fox" (_ _ color animal)
   9 |       (format t "The color of the ~a is ~a~%" animal color))
  10 | 
  11 |     => The color of the fox is brown
  12 | 
  13 |     (defvar *log-line* "2024-Aug-12: [ERROR] Some stupid log error")
  14 |     (parse *log-line* (year "-" "[" severity "] " rest)
  15 |       (when (string= severity "ERROR")
  16 |         (format t "WARNING WILL ROBINSON! ~s happened in ~a~%" rest year)))
  17 | 
  18 |     => WARNING WILL ROBINSON! "Some stupid log error" happened in 2024
  19 | 
  20 |     (parse "Meal total: $23.12" ("$" (float dollars))
  21 |         (format t "Amount with 15 & 20 percent tips: $~,2f, $~,2f~%"
  22 |             (* dollars 1.15) (* dollars 1.20)))
  23 | 
  24 |     => Amount with 15 & 20 percent tips: $26.59, $27.74
  25 | 
  26 | # About
  27 | 
  28 | A long time ago there was a novel scripting language that ran on the IBM
  29 | VM/CMS operating system known as [REXX](https://www.rexxla.org/).
  30 | 
  31 | One of the things I always liked about REXX in its era of pre-regexp
  32 | scripting languages was the `PARSE` statement. In its simplest form PARSE
  33 | is a very nice way to parse strings with delimited or positional data and
  34 | then bind the matching substrings to variables.
  35 | 
  36 | This package attempts to reproduce REXX's `PARSE` statement as a Common Lisp DSL.
  37 | 
  38 | There's a bit of a zen thing to `PARSE`. Pattern matching is almost
  39 | the opposite of a regexp. Instead of specifying patterns for what you want
  40 | to match, you specify patterns for the bits that are not of interest, and
  41 | what gets bound as a match is the stuff in-between those uninteresting bits.
  42 | 
  43 | For example `(parse "12:00:00" (hh ":" mm ":" ss)) => ("12" "00" "00")`
  44 | is matching for the colons, the other tokens are just variable names to
  45 | receive the text matched _around_ the tokens (though this example doesn't
  46 | show the variables in use, `PARSE` defaults to returning a list of the matched
  47 | variables if there is no body).
  48 | 
  49 | `PARSE` is about scanning strings and binding desired subsequences
  50 | to variables in the style of REXX.  It is not intended for lisp syntax
  51 | parsing, nor will it replace regexps where you need to express complex
  52 | patterns. It is better suited for word-splitting and fixed-format data.
  53 | 
  54 | That said `PARSE` can sometimes express some parsing tasks in a clearer and
  55 | shorter way, and has other useful capabilities such as its ability to
  56 | act like a programmable tape reader (e.g. reading a length descriptor
  57 | from the input and then extracting a substring for the specified length).
  58 | 
  59 | Realistically this package probably adds little to what you can piece
  60 | together with other lisp parsing and/or pattern-matching packages, but if
  61 | you liked REXX, then perhaps you'll like this macro with its style of
  62 | binding and pattern specifications. This package is also regexp-free by
  63 | design. Some overlapping regexp capabilities are mentioned below.
  64 | 
  65 | # Tested platforms
  66 | 
  67 | Tests ran on the following without issues except as otherwise noted.
  68 | 
  69 | * SBCL
  70 | * ECL
  71 | * CCL
  72 | * ACL
  73 | * LISPWORKS
  74 | * ABCL - with the following mild warning about the `parse-float` packages
  75 |   when loading, but otherwise okay:
  76 | 
  77 |     ; Loading "rexxparse-test"
  78 |     ; Caught BAD-SYSTEM-NAME:
  79 |     ;   System definition file #P"/home/dave/quicklisp/dists/quicklisp/software/parse-float-20200218-git/parse-float.asd" contains definition for system "parse-float-tests". Please only define "parse-float" and secondary systems with a name starting with "parse-float/" (e.g. "parse-float/test") in that file.
  80 |     ; Compilation unit finished
  81 |     ;   Caught 1 WARNING condition
  82 | 
  83 | 
  84 | # REXX Compatibility
  85 | 
  86 | I have endeavored to make the basic string and position parsing compatible
  87 | with REXX semantics. So on the very slim chance you're a former REXX
  88 | programmer using Lisp, hopefully you will feel at home.
  89 | 
  90 | This was also the hardest part of the project because the REXX semantics
  91 | are sometimes subtle. I was never particularly knowledgeable of REXX to
  92 | begin with, and the REXX documentation is a bit hit-or-miss on some
  93 | details.  At times I was guessing at black box behavior. I've tried to boil
  94 | down the main rules in a section labeled "Parse Rules 101" below.
  95 | 
  96 | Among the REXX compatibility features of REXXPARSE is tolerance of edge
  97 | cases like rebinding the same variable multiple times, position patterns
  98 | which are out of bounds of the string, and so on.  About the only
  99 | restriction is that relative position fixnums must not be negative, which
 100 | is in keeping with REXX semantics. Otherwise it tries not to complain about
 101 | mundane things in your templates.
 102 | 
 103 | If you think you've found a bug, try your PARSE with Open Object REXX and
 104 | see what it does. My goal is to match its semantics for any functionality
 105 | shared between the two, however note that the Lisp version has additional
 106 | capabilities which can't be compared.
 107 | 
 108 | # Alternative text parsing packages
 109 | 
 110 | Lisp has plenty of great tools that already do parsing, here's a couple for
 111 | consideration.
 112 | 
 113 | ## cl-ppcre
 114 | 
 115 | If regexps are your thing you could also use the 
 116 | [cl-ppcre](https://edicl.github.io/cl-ppcre/#register-groups-bind)
 117 | `register-groups-bind` construct.  It probably performs just
 118 | as well (or better with its years of fine tuning, I have no idea). It even has its own
 119 | flavor of transforms that can be applied to the match before binding.
 120 | 
 121 | ## scanfcl
 122 | 
 123 | There's also the Common Lisp [scanf](https://github.com/splittist/scanfcl)
 124 | tool, which provides a lisp equivalent to the C `scanf` family of functions
 125 | and has the ability to parse numbers for you, but does not provide bindings
 126 | and suffers from broader limitations of `scanf`'s parsing capabilities.
 127 | 
 128 | # Example comparison of regexp/scanf/PARSE
 129 | 
 130 | Here is an example of parsing a simple text string with regexps and/or
 131 | `scanf`, followed by the way parsing is done with `PARSE`.
 132 | 
 133 | Let's use this text string that we want to parse, where we want to 
 134 | tease out the year/month/day and error message components:
 135 | 
 136 |     (defvar *text* 
 137 |       "2024/02/23 17:35:42.022 -  unable to locate '/usr/local/examples/' directory")
 138 | 
 139 | Note the additional blank space after the hyphen as well.
 140 | 
 141 | ## Using cl-ppcre `register-groups-bind`
 142 | 
 143 |     (cl-ppcre:register-groups-bind (year month day error-msg)
 144 |         ("(\\d+)/(\\d+)/(\\d+).* -  (.*)" *text*)
 145 |       (list year month day error-msg))
 146 | 
 147 |     => ("2024" "02" "23" "unable to locate '/usr/local/examples/' directory")
 148 | 
 149 | Nice enough, with the usual cross-eyed issues of writing regexps.
 150 | 
 151 | ## Using `scanf`
 152 | 
 153 |     (scanfcl:sscanf *text* "%d/%d/%d %*s -  %s")
 154 | 
 155 |     => (2024 2 23 "unable") 
 156 | 
 157 | Scanf is nice because it will convert matched text to numeric types,
 158 | `REXXPARSE:PARSE` can do that as well via transforms, a REXXPARSE extension
 159 | to basic REXX capabilities.
 160 | 
 161 | Note that the scanf example is able to suppress scanning of some text with
 162 | the '*' modifier, but fails to parse the message that was desired with
 163 | whitespace content. You could use fixed width %s or %c if you can make
 164 | assumptions about the width, but that is not generally compatible with most
 165 | service log content.  If your scanf supports character sets, you could use that
 166 | too. Still, it isn't super friendly for reading delimited substrings the
 167 | way we do with PARSE.
 168 | 
 169 | ## Using `PARSE`
 170 | 
 171 | ### Pure REXX PARSE
 172 | 
 173 | The original (NOT LISP!) REXX syntax would be:
 174 | 
 175 |     PARSE *text* year "/" month "/" day . "-" error
 176 | 
 177 | In the above statement, `*text*` is known as the source (to be matched),
 178 | and the remainder of the statement is known as the "template".  In REXX,
 179 | the period was a placeholder, in lisp we use '_' (underscore) because periods
 180 | have different behavior with the Lisp reader.  In the above example, the
 181 | period would match the timestamp text.
 182 | 
 183 | The template contains symbols naming variables to be bound, and strings to
 184 | be matched in the source text such that they delimit the text of interest
 185 | to be bound.
 186 | 
 187 | ### Lisp-styled REXX PARSE
 188 | 
 189 | The general syntax of PARSE is
 190 | 
 191 |     (parse <source-string> (<template-elements>) <optional-body>)
 192 | 
 193 | The body allows for optional declarations of template variable symbols via
 194 | an implicit enclosing `locally`. Normally they will be strings unless you
 195 | are using transforms, but no such implicit declarations are made.
 196 | 
 197 | Here is a simple text parse without a body. The underscore is as mentioned above:
 198 | 
 199 |     (parse *text* (year "/" month "/" day _ "-" error))
 200 | 
 201 | If all you want to do is return a list of values bound, you can omit all forms 
 202 | after the template and a list of bound values will be returned, so the above
 203 | would return
 204 | 
 205 |     => ("2024" "02" "23" "unable to locate '/usr/local/examples/' directory")
 206 | 
 207 | Values are returned in order of the variables specified in the template.
 208 | Text conceptually (but not physically) bound to the placeholder `_` 
 209 | is not included in the result.
 210 | 
 211 | One of the main points of PARSE is to lexically bind variables for you
 212 | so you don't have to go and fetch them from a list with `destructuring-bind`
 213 | or other tools. For example:
 214 | 
 215 |     ;; mock snippet dealing with some error noted in *text*
 216 |     (parse *text* ("unable to locate '" path "' directory")
 217 |       (cerror "Create the directory ~s and continue"
 218 |               "The directory ~s did not exist" 
 219 |               path))
 220 | 
 221 |     =>
 222 | 
 223 |     The directory "/usr/local/examples/" did not exist
 224 |        [Condition of type SIMPLE-ERROR]
 225 | 
 226 |     Restarts:
 227 |      0: [CONTINUE] Create the directory "/usr/local/examples/" and continue
 228 |      1: [RETRY] Retry SLIME REPL evaluation request.
 229 |      2: [*ABORT] Return to SLIME's top level.
 230 |      3: [ABORT] abort thread (#<THREAD tid=88446 "repl-thread" RUNNING {10084300A3}>)
 231 | 
 232 | #### Template variables, bindings vs. assignment
 233 | 
 234 | Symbols acting as variables in the template, except for '_', are _bindings_
 235 | introduced by `LET` and initialized with
 236 | `REXXPARSE:*UNMATCHED-BINDING-VALUE*`.
 237 | 
 238 | However depending on the use of the symbols in the template, they may
 239 | undergo multiple assignments, either to text matched by the parse, or to
 240 | the result of transformations on the parsed text.
 241 | 
 242 | The '_' does not result in a binding: no `_` symbol is bound on the stack, and
 243 | any template matches for this symbol will not be extracted or saved to any variable.
 244 | 
 245 | #### REXX variables vs. Lisp s-expressions
 246 | 
 247 | If you're reading REXX documentation (or otherwise familiar with it), such
 248 | as [Open Object REXX Reference](https://rexxinfo.org/reference/articles/oorexxref.pdf),
 249 | note that the use of parenthesized forms is different between REXX and
 250 | REXXPARSE:PARSE.  Where REXX would use a parenthesized expression to do a
 251 | language variable reference, REXXPARSE uses parenthesized forms in templates for their
 252 | syntactic value beyond that, e.g. `(+ x)` is a positional pattern to
 253 | move rightward `x` columns.  I imagine the confusion will only occur to
 254 | people who have been writing a lot of REXX recently.
 255 | 
 256 | ### Word-oriented tokenization
 257 | 
 258 | The basic behavior of PARSE favors matching tokens delimited by
 259 | spaces. Absent specific patterns from you, the spaces around tokens bound
 260 | to variables are discarded.  Thus
 261 | 
 262 |     (parse "Now  is the time" (now is the-time))
 263 |     => ("Now" "is" "the time")
 264 | 
 265 | Note the multiple spaces between "Now" and "is", all used to divide tokens
 266 | matched and discarded. This is different from a pattern indicating a space,
 267 | e.g.
 268 | 
 269 |     (parse "Now  is the time" (now " " is " " the-time))
 270 |     => ("Now" "" "is the time")
 271 | 
 272 | Here 'is' is matched to the text between the point matched by the pattern
 273 | on the left and the point matched by the pattern on the right. The two
 274 | patterns match consecutive spaces and produce a zero-length binding.
 275 | Don't let it mess with your head too much, this is a fairly contrived example.
 276 | 
 277 | ### More text than bindings
 278 | 
 279 | The last binding variable will be assigned any unmatched tail of the source
 280 | string.  E.g.
 281 | 
 282 |     (parse "a b c" (a b))
 283 |     => ("a" "b c")
 284 | 
 285 | In this situation, the text bound to the tail variable will not have spaces trimmed.
 286 | 
 287 | ### More bindings than text
 288 | 
 289 | If there are unused variables because there are fewer words in the
 290 | source than there are variables in the template, unused variables will
 291 | be bound to `REXXPARSE:*UNMATCHED-BINDING-VALUE*`, which defaults to an
 292 | empty string (in keeping with REXX semantics).  You can change this
 293 | behavior by rebinding the variable.
 294 | 
 295 |     (parse "a b" (a b c))
 296 |     => ("a" "b" "")
 297 | 
 298 | ### Consecutive bindings and/or patterns
 299 | 
 300 | Your template may have binding sequences without interleaved patterns, in which case
 301 | the implicit word splitting pattern applies.  It may also have pattern
 302 | sequences without interleaved binding variables, which may be useful if,
 303 | for example, you're looking to advance across like tokens, e.g.
 304 | 
 305 |     (parse "I want the text following the second occurrence of 'text', this text."
 306 |             ("text" "text" the-rest))
 307 |     => ("', this text.")
 308 | 
 309 | ### Parse Rules 101
 310 | 
 311 | The simplest form of parsing template consists of a list of variable names.
 312 | The string being parsed is split up into words (characters delimited by
 313 | blanks), and each word from the string is assigned to a variable in
 314 | sequence from left to right. Leading blanks are removed from each word in
 315 | the string before it is assigned to a variable, as is the blank that
 316 | delimits the end of the word.
 317 | 
 318 | Beyond the simple case there are some rules to remember for the myriad
 319 | edge cases and features related to PARSE:
 320 | 
 321 | 1. If there is one variable and no pattern, the variable matches the whole
 322 |    source string (no whitespace characters are removed).
 323 | 
 324 | 2. If there are more variables than words, excess variables are bound to
 325 |    `*UNMATCHED-BINDING-VALUE*`. 
 326 | 
 327 | 3. If there is more text than variables would match, the last variable is
 328 |    bound to all remaining text.  Sometimes called the "tail match" rule.
 329 |    Tail matches never eat spaces, they preserve the remainder of the source
 330 |    string to be matched.
 331 | 
 332 | 4. [SUBTLE, CRUCIAL] Any explicit pattern (with a match in the source
 333 |    string) creates a logical break in the
 334 |    source string such that the var to the left of the pattern is treated as a
 335 |    "tail match" situation on the substring terminated by the pattern.
 336 | 
 337 |    Moreover, variables to the left apply to the substring to the left
 338 |    of the pattern.  I.e.
 339 | 
 340 |    `(parse "a b c x g" (a b "x" g)) => ("a" " b c" " g")`
 341 | 
 342 | 5. Where no pattern is given between two variables or between a variable
 343 |    and the beginning or end of the source string, an implicit "word
 344 |    splitting" takes place.  Word splitting eats spaces before a token to be
 345 |    matched, and one space after the token.
 346 | 
 347 | 6. An empty string is never found, it always matches the end of the source
 348 |    string.  Specifying an absolute position of 1 as the pattern following a 
 349 |    variable has a similar effect as an empty string pattern, it leaves 
 350 |    the cursor positioned such that you can match the source string again.
 351 | 
 352 |    `(parse " a b c " (a "" b)) => (" a b c " "")`
 353 |    `(parse " a b c " (a 1 b)) => (" a b c " " a b c ")`
 354 | 
 355 | 7. Absolute positions less than one are treated as one.
 356 |    Absolute positions greater than the source string length are treated
 357 |    as being the string length.
 358 | 
 359 | 8. Relative position expressions, e.g. `(- <exp>)` require the `<exp>`
 360 |    to be a non-negative fixnum or a string that can be converted to a
 361 |    non-negative fixnum.  Absolute positions expressed with `(= <exp>)` have
 362 |    the same rules.
 363 | 
 364 | 9. Relative and absolute positional patterns are interchangeable _with one exception_.
 365 | 
 366 |    Normally template parsing of string matches skips the text matched by
 367 |    the string pattern.  However a template sequence of the form
 368 |    `"string" variable <relative-positional>` does NOT skip the string
 369 |    pattern data when assigning to the variable, and so the pattern text
 370 |    will appear in the variable.  For example (with non-relative examples too):
 371 |    
 372 |    `(parse " a b c " ("b" b)) => (" c ")`         ; "b" not included, no relative positional
 373 | 
 374 |     ;; '5' is effectively equal to the source scanning start position when its pseudo-pattern
 375 |     ;; is matched, which means an empty string match, which is a break/tail-position behavior.
 376 |    `(parse " a b c " ("b" b 5)) => (" c ")`       ; from end of "b" to 5 is empty, full break
 377 |    `(parse " a b c " ("b" b 7)) => (" c")`        ; space past "b" to 7, 2 chars
 378 |    `(parse " a b c " ("b" b (+ 1))) => ("b")`     ; from position of b to position+1
 379 |    `(parse " a b c " ("b" b (- 1))) => ("b c ")`  ; from position of b to end of string
 380 | 
 381 | 10. Template expressions may reference variables bound by preceding
 382 |     template matches. See the section on `Length Positional Patterns` for an example.
 383 | 
 384 | ### Positional template directives
 385 | 
 386 | Patterns may also be positional directives, where integers specify absolute
 387 | or relative positions in the source string, relative positions being
 388 | relative to the start of the last pattern matched. Positions are generally
 389 | used for fixed length subfields in strings, but can also be used to re-scan
 390 | the source.
 391 | 
 392 | Like string patterns, positions identify points at which the source string
 393 | is split, only the length of the match is zero.  Also like string patterns,
 394 | variables bracketed by patterns will not be string trimmed.
 395 | 
 396 |     ;                1         2         3
 397 |     ;       1234567890123456789012345678901234
 398 |     (parse "Brimfield    Massachusetts   10101"
 399 |       (city 14 state 30 zip))
 400 |     => ("Brimfield    " "Massachusetts   " "10101")
 401 | 
 402 | _Absolute_ positions may be specified as positive integer literals.
 403 | The above example specifies position matches for columns 14 and 30.
 404 | Absolute positions are all 1-based integer values, i.e. a column ordinal. Subtract
 405 | one mentally for Lisp array indices. (This choice is for REXX compatibility).
 406 | 
 407 | For positions involving the integer-valued variables
 408 | instead of integer literals, you must supply an s-expression whose car is
 409 | one of `+`, `-`, `=`, followed by a s-expression that is evaluated at
 410 | runtime (not macroexpansion time) to produce an integer to be interpreted
 411 | as the relative or absolute position. The `+` and `-` expressions indicate
 412 | _relative_ positions, while `=` indicates an absolute position.
 413 | 
 414 | Examples:
 415 | 
 416 |     ;; City occupies columns 1-13 inclusive.
 417 |     ;; State occupies columns 14-29 inclusive
 418 |     ;; '+' indicates position relative to the prior pattern match position.
 419 |     (parse "Brimfield    Massachusetts   10101"
 420 |       (city (+ 13) state (+ 16) zip))
 421 |     => ("Brimfield    " "Massachusetts   " "10101")
 422 | 
 423 |     ;; Mixing absolute positions 30 and 31 with relative offsets.
 424 |     ;; reparsing the first '1' twice
 425 |     (parse "Brimfield    Massachusetts   10101"
 426 |       (30 one-a 31 (- 1) one-b (+ 1)))
 427 |     => ("1" "1")
 428 | 
 429 |     ;; use of variables must be through the parenthesized expression
 430 |     ;; otherwise they would be indistinguishable from variables to be bound.
 431 |     ;; '=' indicates absolute positions
 432 |     (defvar *state-column* 14)
 433 |     (defvar *zip-column* 30)
 434 |     (parse "Brimfield    Massachusetts   10101"
 435 |       (city (= *state-column*) state (= *zip-column*) zip))
 436 |     => ("Brimfield    " "Massachusetts   " "10101")
 437 | 
 438 | 
 439 | Any positional directive that would precede the first source column
 440 | (i.e. is < 1) is treated as 1.
 441 | 
 442 | Any positional directive that would exceed the length of the
 443 | source string is treated as the string length, matching the
 444 | remainder of the string.
 445 | 
 446 | It is an error for any net position value to exceed the range of a fixnum.
 447 | 
 448 | ### Positional pattern data types 
 449 | 
 450 | All positional expressions must be integers in the range of non-negative
 451 | fixnums, or s-expressions that resolve to those values. This constraint is
 452 | relaxed for `+`, `-`, and `=` patterns, as well as `>` and `<` (described
 453 | below) so that strings matched while parsing may be used later in the
 454 | template as numeric positional directives. Note that such uses 
 455 | of the value bound at one step of the parse act as input controlling later
 456 | steps of the parse.
 457 | 
 458 | Allowing strings as positional values is a shortcut to avoid the need
 459 | for littering your template with `parse-integer` calls on previously
 460 | matched text.  String to fixnum conversions in positional templates that do
 461 | not resolve to non-negative fixnums will result in a continuable error
 462 | being signalled. Conversions are performed with `cl:parse-integer` and may
 463 | generate a `cl:parse-error` condition if the text is not parseable as
 464 | an integer, and `parse-error` is not a continuable condition.
 465 | 
 466 | See the next section with examples matching integer data in the source
 467 | string and using those integers for subsequent match activity.
 468 | 
 469 | ### Length Positional Patterns
 470 | 
 471 | A `length positional pattern` is a number in a `<` or `>` pattern sexp
 472 | similar to the `+`, `-`, and `=` pattern forms. I'm not sure why REXX
 473 | distinguishes this from `-` and `+` positional patterns, they are identical
 474 | in behavior except for one situation noted below.
 475 | 
 476 | As with `-` and `+` the number specifies the length at which the source
 477 | string is to be split relative to the current position. `>` and `<`
 478 | indicate movement right or left, respectively, from the start of the string
 479 | or from the position of the last match.
 480 | 
 481 | The `>` length pattern and the `+` relative positional pattern are
 482 | interchangeable except in the special case of a zero value. A `(> 0)` pattern
 483 | will split the string into a null (empty) string and leave the match position
 484 | unchanged, whereas a `(+ 0)` pattern also leaves the match position
 485 | unchanged, but doesn't split the string.  In essence `(> 0)` says "match
 486 | empty string" whereas `(+ 0)` says "advance the scan zero characters, matching
 487 | whatever follows".
 488 | 
 489 | This string splitting behavior is useful for parsing string subfields
 490 | whose lengths are also encoded in the string.
 491 | 
 492 | The following example shows the difference between `(> 0)` and `(+ 0)`,
 493 | note the different matches for `middle`:
 494 | 
 495 |      ;; Parsing with length patterns
 496 |      (parse "04Mark0005Twain" 
 497 |             (len (+ 2) first (> len) len (+ 2) middle (> len) len (+ 2) last (> len))
 498 |         (list first middle last len))
 499 |      => ("Mark" "" "Twain" "05")
 500 | 
 501 |      ;; Parsing with relative patterns only
 502 |      (parse "04Mark0005Twain" 
 503 |             (len (+ 2) first (+ len) len (+ 2) middle (+ len) len (+ 2) last (+ len))
 504 |         (list first middle last len))
 505 |     => ("Mark" "05Twain" "Twain" "05")
 506 | 
 507 | While `<` is similar to `-`, application of the match/extract process
 508 | differs.  To achieve the effect of `<` on a region of text with `-` you
 509 | must use a `-`/`+` pair, and the position in source differs as in the
 510 | following example:
 511 | 
 512 |     ;; Parsing with length patterns
 513 |     (parse "12345.6789" ("." digit (< 1) rest)) => ("5" "5.6789")
 514 |     ;; Parsing with relative patterns
 515 |     (parse "12345.6789" ("." (- 1) digit (+ 1) rest)) => ("5" ".6789")
 516 | 
 517 | `<` is similar to matching a string literal, _without_ advancing the next
 518 | position to be scanned after binding.
 519 | 
 520 | ### Transformations (REXXPARSE extension)
 521 | 
 522 | `REXXPARSE::PARSE` supports transformations on matched strings before they
 523 | are assigned to variables.  Transforms are a REXXPARSE lisp extension and
 524 | not part of the basic REXX PARSE capability.
 525 | 
 526 | The syntax for an assignment based on a predefined transformation is:
 527 | 
 528 |     (<transform> variable)
 529 | 
 530 | where you would otherwise just have a variable to be bound that wasn't in a
 531 | list.  Note that the above uses `<transform>` as a non-terminal BNF token
 532 | representing many possible pre-defined transformations. There is also a `TRANSFORM`
 533 | terminal symbol with specific user-defined transformation semantics.
 534 | 
 535 | Transforms have the same syntax as list-form patterns but are in
 536 | fact binding forms. PARSE distinguishes patterns from transform-augmented
 537 | bindings by the symbol name of the CAR of the list being known as a
 538 | transform symbol.
 539 | 
 540 | Transforms are a convenience for common parse situations, you
 541 | could always do the transformations in the `&BODY` of the parse if you need
 542 | different transformation semantics than those pre-defined by REXXPARSE or
 543 | simply don't like the confusing transform syntax that resembles pattern syntax.
 544 | 
 545 |     (parse "some text with numbers: 1.0 2" (_ ": " (float x) (integer n))
 546 |       (format t "~f is a ~s, ~d is a ~s~%" x (type-of x) n (type-of n)))
 547 | 
 548 |     =>
 549 |     1.0 is a SINGLE-FLOAT, 2 is a (INTEGER 0 4611686018427387903)
 550 |     NIL
 551 | 
 552 | The `(float x)` and `(integer n)` expressions are DSL syntax to invoke transformations
 553 | on the text corresponding to variables `x` and `n`, and assigning the
 554 | transformation result to those variables.  The set of supported
 555 | transformations is described below.
 556 | 
 557 | Transformations do not currently nest, i.e. you _cannot_ do `(LOWER (KEBAB x))`.
 558 | If you need to apply more complicated transformations, see 'User defined transforms' below.
 559 | 
 560 | Transform expressions using the `_` symbol are effectively NO-OPs.  No
 561 | text is extracted, and no transform function is run.
 562 | 
 563 | #### Pre-defined transforms
 564 | 
 565 | * UPPER       - uppercases the extracted text.
 566 | * LOWER       - lowercases the extracted text.
 567 | * SNAKE       - convert hyphens to underscores.
 568 | * KEBAB       - convert underscores to hyphens.
 569 | * LTRIM       - remove leading spaces.
 570 | * RTRIM       - remove trailing spaces.
 571 | * TRIM        - remove leading and trailing spaces.
 572 | * INTEGER     - convert extracted text to an integer.
 573 | * FLOAT       - convert extracted text to a single-float.
 574 | * DOUBLE      - convert extracted text to a double-float.
 575 | * KEYWORD     - convert extracted text to a keyword.
 576 | 
 577 | The floating point conversions are done using the `:parse-float` package,
 578 | they do _not_ perform unsafe `READ`s. INTEGER conversion is done using `PARSE-INTEGER`.
 579 | 
 580 | `SHORT-FLOAT` and `LONG-FLOAT` conversions are not supported, the
 581 | `:parse-float` package doesn't seem to support them on SBCL at least. If
 582 | you need these representations you'll probably want to use
 583 | `*READ-DEFAULT-FLOAT-FORMAT*` bindings with a user-defined transform that
 584 | observes it and manages the conversion.
 585 | 
 586 | Of course you could also just supply a BODY to the `PARSE` form and do the
 587 | conversions in the body, there's no need whatsoever to use user-defined
 588 | transforms except perhaps to abbreviate code if the transformation is used a lot.
 589 | 
 590 | The `LTRIM`, `RTRIM`, and `TRIM` transforms use the Common Lisp `STRING-LEFT-TRIM`,
 591 | `STRING-RIGHT-TRIM`, and `STRING-TRIM` functions respectively, supplying
 592 | `REXXPARSE:*TRIM-CHARACTER-BAG*` as the character bag argument. This is
 593 | exported so that you may bind it to other characters (outside of the
 594 | `PARSE` form), but note that it will
 595 | affect all trim transforms in the scope of the binding.
 596 | 
 597 | The KEYWORD transform does no conversions to case, so it's easy to make
 598 | symbols in unexpected cases if you aren't careful. If you want to
 599 | upper/lower case the text before the transform makes a keyword of it, you
 600 | could use `:UPPER` or `:LOWER` options to `PARSE` (though that will change
 601 | the case of the whole source string).  Or you can just do what you want in
 602 | the `PARSE` body.
 603 | 
 604 | #### User defined transforms
 605 | 
 606 | There is a special transform operator, `TRANSFORM`, which exists to invoke
 607 | user-supplied transformation functions.
 608 | 
 609 |     (transform <symbol> <function>)
 610 | 
 611 | Will invoke `function` on the text extracted by the parse, and assign the 
 612 | result of the transformation function to `symbol`.  `function` must be a
 613 | [function designator](https://www.lispworks.com/documentation/HyperSpec/Body/26_glo_f.htm#function_designator)
 614 | for a function of one argument which will always be a string. 
 615 | 
 616 |     ;; User defined transform example
 617 |     (defun stupid (str) "Stupid!")
 618 |     (parse "Don't call me dull." (_ _ _ (transform s 'stupid) "."))
 619 |     => ("Stupid!")
 620 | 
 621 | ## Full PARSE syntax
 622 | 
 623 | The general form of a `PARSE` invocation (pardon the weak BNF) is below.
 624 | Only the `<source>` expression is required, and it must yield a string.
 625 | 
 626 |     PARSE <options> <source> <template>
 627 | 
 628 |     <options> ::=
 629 |     <options> ::= :UPPER
 630 |     <options> ::= :LOWER
 631 |     <options> ::= :CASELESS
 632 |     <options> ::= :USING (<var> ...)
 633 |     <options> ::= (:USING <var> ...)
 634 |     <options> ::= :USING-VECTOR (<var> ...)
 635 |     <options> ::= (:USING-VECTOR <var> ...)
 636 | 
 637 |     <source> ::= string-literal
 638 |     <source> ::= s-exp
 639 | 
 640 |     <template> ::= 
 641 |     <template> ::= <template-expression>
 642 | 
 643 |     <template-token> ::= <binding>
 644 |     <template-token> ::= <pattern>
 645 | 
 646 |     <template-expression> ::= <template-token>
 647 |     <template-expression> ::= <template-expression> <template-token>
 648 | 
 649 |     <pattern> ::= string-literal
 650 |     <pattern> ::= <position>
 651 |     <pattern> ::= ( $ <s-exp> )
 652 | 
 653 |     <position> ::= position-integer-literal
 654 |     <position> ::= ( + <position-integer> )
 655 |     <position> ::= ( - <position-integer> )
 656 |     <position> ::= ( = <position-integer> )
 657 |     
 658 |     <position-integer> ::= position-integer-literal
 659 |     <position-integer> ::= <sexp>
 660 | 
 661 |     <binding> ::= symbol
 662 |     <binding> ::= <transformation>
 663 | 
 664 |     <transformation> ::= ( <built-in-transformation> symbol )
 665 |     <transformation> ::= ( TRANSFORM symbol function )
 666 |     
 667 |     ;; String producing case(like) transformations
 668 |     <built-in-transformation> ::= UPPER
 669 |     <built-in-transformation> ::= LOWER
 670 |     <built-in-transformation> ::= SNAKE
 671 |     <built-in-transformation> ::= KEBAB
 672 | 
 673 |     ;; Non-string producing transformations
 674 |     <built-in-transformation> ::= INTEGER
 675 |     <built-in-transformation> ::= DOUBLE
 676 |     <built-in-transformation> ::= FLOAT
 677 |     <built-in-transformation> ::= KEYWORD
 678 |     
 679 | Symbols are compared by name (so package doesn't matter), but upper case symbol names are expected.
 680 | 
 681 | 1. `<source>` is evaluated to produce a string, unless it is already a string.
 682 | 2. `<pattern>` is used to find the text region in `<source>` to be bound to
 683 |    the symbol in `<binding>`.
 684 | 3. Once a region of text is matched by a pattern, it is assigned to the
 685 |    `<binding>` symbol. If there was no match or the source text is
 686 |    exhausted, the symbol is bound to `REXXPARSE:*UNMATCHED-BINDING-VALUE*`. 
 687 |    Do not mutate returned value. 
 688 | 4. `<binding>` can also be an sexp of the form `(TRANSFORM symbol function)`,
 689 |    in which case `function` is run to transform the matched text before assigning it to `symbol`.
 690 | 
 691 | ## `($ <s-exp>)`: variables (or other s-exps) as patterns
 692 | 
 693 | Patterns of the form `($ <s-exp>)` are used to indicate that the expression
 694 | `<s-exp>` is to be evaluated to produce a string to be used as a
 695 | pattern. It is needed because naked symbols in the template are interpreted
 696 | as binding names, and so are not normally evaluated.  `$` causes them to be
 697 | evaluated and treated as string-literal patterns, similar to a variable
 698 | evaluation directive in various other languages.
 699 | 
 700 | Example:
 701 | 
 702 |     (let ((x "brown"))
 703 |       (parse "the quick brown fox" (start ($ x) end)))
 704 |     => ("the quick " " fox")
 705 | 
 706 | The `$` form is only needed to evaluate symbols in patterns that don't
 707 | otherwise evaluate them. Positional pattern directives such as `(+ x)`
 708 | already evaluate their arguments. `(+ ($ x))` is not only unnecessary, it
 709 | also wouldn't work (as `$` is a pattern directive, not a function for arbitrary
 710 | s-exp evaluation).
 711 | 
 712 | # Differences from REXX' PARSE 
 713 | 
 714 | ## The Lisp bits
 715 | 
 716 | First of all, you're in lisp.  So PARSE is a DSL of a style similar to
 717 | Common Lisp's advanced LOOP macro.  If you don't like LOOP, you may not
 718 | like PARSE.
 719 | 
 720 | Second, there is an extensible mechanism you can use both to specify the
 721 | patterns matched in the template, and _transformations_ on the bound
 722 | values.
 723 | 
 724 | ## PARSE UPPER|LOWER|CASELESS => PARSE :UPPER|:LOWER|:CASELESS
 725 | 
 726 | Parse options like UPPER are specified as keywords and not plain symbols.
 727 | 
 728 | ## The source string can be any s-expression that yields a string.
 729 | 
 730 | REXX required a lot of additional syntax to declare how the string
 731 | expression was interpreted, e.g. 'PARSE VAR'.  REXXPARSE does not suffer
 732 | from this, one source s-expression fits all so long as it yields a string.
 733 | 
 734 | NIL is not permitted for the source string.
 735 | 
 736 | ## A missing template renders PARSE a NO-OP
 737 | 
 738 | As there is no `PARSE LINEIN` or `PARSE PULL` in this implementation,
 739 | if there is no template the `PARSE` invocation is a NO-OP, or as close to
 740 | it as we can make it.
 741 | 
 742 | If the template is missing or does not specify any variables other than
 743 | `_`, the `<source>` expression is not evaluated.
 744 | 
 745 | ## There is no multi-string comma operator
 746 | 
 747 | The original REXX PARSE would bind multiple strings with a comma separating 
 748 | multiple templates. This is not supported.
 749 | 
 750 | ## Positional Position Syntax
 751 | 
 752 | In REXX, a plus `(+)` or minus `(-)` before an integer indicated a
 753 | _relative_ position to be matched in the template. The presence of a `+`
 754 | was not the same as a positive value. A simple "10" indicates an absolute
 755 | position, whereas a "+10" indicates a relative position.
 756 | 
 757 | To do this in lisp requires that `+` be a separate token to survive the
 758 | reader. So we could represent a relative plus position as `(+ 10)`, `+ 10`
 759 | and so on.
 760 | 
 761 | Then there's the evaluation aspect if you're using an expression
 762 | value. E.g., you want to say `+myval` for some variable `myval`.  REXX 
 763 | would render that as +(myval) according to its syntactic semantics.
 764 | 
 765 | REXXPARSE:PARSE allows for the following for positional positions.
 766 | 
 767 | For shorthand positional patterns of constant values we allow integers and
 768 | keywords like these: `:+10`, `:-10`, `:=10`, `:10`. You can also use
 769 | integers, possibly negative, e.g. `10` and `-10`, but we cannot infer
 770 | relative positions on positive integers.
 771 | 
 772 | The long form syntax for positional patterns which might indicate
 773 | relativity as well as the value of arbitrary expressions is as follows
 774 | for some expression `x`, which may also be any number for which integer
 775 | coercion semantics are defined.
 776 | 
 777 |     (+ x)
 778 |     (- x)
 779 |     (= x)
 780 | 
 781 | ## Additional PARSE options
 782 | 
 783 | REXXPARSE:PARSE allows for a number of options that change various
 784 | behaviors of the parse. Options are specified as (optional) keywords that
 785 | precede the source string expression to be parsed, i.e.
 786 | 
 787 |     (PARSE [:option1 ... :optionN] source-sexp (template) ...)
 788 | 
 789 | Some options have accompanying values, some do not, and some may be expressed
 790 | as lists.  All options, and only options, are triggered by keywords in the
 791 | PARSE arguments.
 792 | 
 793 | The following sections describe the options.  I have attempted to attribute
 794 | options to the REXX language versions that introduced them, please
 795 | provide corrections if the attributions are incorrect.
 796 | 
 797 | All options specified are bound to `REXXPARSE:*OPTIONS*` for the scope of
 798 | the PARSE expression, so that user-extensible patterns or other operators
 799 | can check for options that might require consideration, such as :CASELESS
 800 | comparisons. 
 801 | 
 802 | The traditional REXX options (:UPPER, :LOWER, :CASELESS) are plain
 803 | keywords that are _not accompanied by values_, their presence triggers the intended
 804 | behavior. Other options may accept values, refer to the documentation on
 805 | individual options for details.
 806 | 
 807 | ### PARSE :UPPER
 808 | 
 809 | `PARSE :UPPER` converts lowercase a-z to uppercase before parsing. Note that
 810 | this represents a transformation (by copying) of the source string before parsing, 
 811 | and matched content will by definition be upper case as the source string
 812 | will no longer have any lowercase text.
 813 | 
 814 |     (parse :upper "A b C d" (w "C" r)) => ("A B " " D")
 815 | 
 816 | Note that specifying lower case string patterns will foil matching, 
 817 | :UPPER has no effect on the pattern text or the comparisons used.
 818 | 
 819 |     (parse :upper "A b C d" (w "c" r)) => ("A B C D" "")
 820 | 
 821 | `UPPER` was the only option in the original REXX PARSE construct.
 822 | 
 823 | The `:UPPER` option is mutually exclusive with the `:LOWER` option.
 824 | 
 825 | ### PARSE :LOWER
 826 | 
 827 | `PARSE :LOWER` converts uppercase a-z to lowercase before parsing. Note that
 828 | this represents a transformation (by copying) of the source string before parsing, 
 829 | and matched content will by definition be lower case as the source string
 830 | will no longer have any uppercase text.
 831 | 
 832 |     (parse :lower "A b C d" (w "c" r)) => ("a b " " d")
 833 | 
 834 | Note that specifying upper case string patterns will foil matching, 
 835 | :LOWER has no effect on the pattern text or the comparisons used.
 836 | 
 837 |     (parse :lower "A b C d" (w "C" r)) => ("a b c d" "")
 838 | 
 839 | The `LOWER` option was added by the NetREXX language specification.
 840 | 
 841 | The `:LOWER` option is mutually exclusive with the `:UPPER` option.
 842 | 
 843 | ### PARSE :CASELESS
 844 | 
 845 | `PARSE :CASELESS` ignores case on the comparisons. Unlike :UPPER and other
 846 | options it does not transform the source string, but instead changes the
 847 | character equality predicates used for comparison.
 848 | 
 849 |     (parse :caseless "A b C d" (w "c" r)) => ("A b " " d")
 850 | 
 851 | The `CASELESS` option was added by the Open Object REXX language specification.
 852 | 
 853 | Note that this option may not be supported by pattern processors used as
 854 | extensions to the REXXPARSE behavior.  Extensions may implement the desired
 855 | behavior by examining the value of `REXXPARSE:*OPTIONS*` which is bound to
 856 | options specified in the PARSE form.
 857 | 
 858 | ### PARSE NUMERIC (unsupported)
 859 | 
 860 | The IBM z/OS version of REXX supports a `parse numeric digits form fuzz`
 861 | packaging of the `numeric` operator.  This is not supported by REXXPARSE.
 862 | 
 863 | ### PARSE :USING (<var> ...) or (:USING <var> ...)
 864 | 
 865 | The `:USING` option indicates that for all symbols in the var list,
 866 | `PARSE` should _not_ allocate bindings in its macroexpansion `LET` block,
 867 | and should instead use vars which already exist in the environment.
 868 | This may be useful for iterative performance or other application logic reasons.
 869 | 
 870 | When you request this behavior the vars do not undergo any initialization
 871 | step by `PARSE`.  If they are not assigned values by the
 872 | match/extract/assign steps (because there are more variables than matches)
 873 | then whatever value they had going into `PARSE` is the
 874 | value they will have in the body of `PARSE`.
 875 | 
 876 | Example:
 877 | 
 878 |     (let ((x nil)
 879 |           (y t))
 880 |       (parse :using (x y) "abc" (x)))
 881 |     => ("abc")
 882 | 
 883 | With the side effect that X is now "abc", and Y, which was not matched
 884 | or assigned, is still T.
 885 | 
 886 | ### PARSE :USING-VECTOR (<var> ...) or (:USING-VECTOR <var> ...)
 887 | 
 888 | The `:USING-VECTOR` option is similar to the `:USING` option. Symbols in
 889 | the list will not be allocated or initialized in the macro-expansion.
 890 | 
 891 | However in this case the symbols are expected to refer to fill-pointered
 892 | arrays when the PARSE macroexpansion is executed. Where an ordinary
 893 | `:USING` symbol would be assigned with `SETQ` or `SETF`, assignments to
 894 | symbols named by `:USING-VECTOR` will be executed by `(VECTOR-PUSH <value>
 895 | <var>)`. The variable is typically reused for multiple template bindings.
 896 | 
 897 | The array must exist and be large enough to accept the new value(s), and you
 898 | should ensure the fill pointer is where you want it on entry to `PARSE`.
 899 | It is an error if symbols named in `:USING-VECTOR` are also specified in
 900 | `:USING`.  Note that `VECTOR-PUSH` will not modify the array or signal a
 901 | condition if the fill-pointer indicates the array is full.
 902 | 
 903 | Aside from other possible performance or logic utility, the use of vectors
 904 | enables you to ask how many template assignments were matched and executed
 905 | by querying the fill-pointered vector length.  While you can query ordinary
 906 | `PARSE` bound vars to see if they were not matched, use of `:USING-VECTOR`
 907 | may be a more efficient way to get a count if the input is unlikely to
 908 | match bindings in a predetermined fashion.
 909 | 
 910 | Example:
 911 | 
 912 |     (let ((v (make-array 5 :fill-pointer 0)))
 913 |       (parse :using-vector (v) "abc def" (v v v)) ;=> ("abc" "def" "")
 914 |       v)
 915 |     => #("abc" "def")
 916 | 
 917 | Vector symbols in the template may not be used for input in positional
 918 | directives in the way that ordinary symbols are used. If you want to use a
 919 | value matched and stored in a vector in a previous template binding, in a
 920 | positional pattern you'll need to AREF your previously matched slot in the
 921 | positional pattern s-exp.  I.e.
 922 | 
 923 |     (let ((v (make-array 4 :fill-pointer 0)))
 924 |       (parse (:using-vector v) "04Mark" (1 v (+ 2) v (> (aref v 0)))
 925 |         v))
 926 | 
 927 |     => #("04" "Mark")
 928 | 
 929 | Some notes on return values when using vectors
 930 | _when there is no `&BODY`_ provided to the parse.
 931 | 
 932 | 1. `PARSE` normally returns a list of the values of all symbols
 933 |    with binding specifications in the template, so it would normally return
 934 |    `("abc" "def" "")`.  However while there are three binding expressions
 935 |    in the example template above there are only two _assignments_ (only two
 936 |    matches) so the vector only receives two values.
 937 | 
 938 | 2. `PARSE` endeavors to return values (again, when there isn't a BODY),
 939 |    such that the bound/assigned values are the same whether vector or
 940 |    non-vector variables are used in the template.
 941 | 
 942 |    This would be difficult with vectors since we're neither initializing the
 943 |    vector, nor certain that `(aref v <n>)` has any meaningful value, or is
 944 |    even accessible if it wasn't assigned by the parse.  E.g.
 945 | 
 946 |     `(parse :using-vector (v) "a b" (v v v)) => ???`
 947 | 
 948 |    If you'd done (parse "a b" (a a a)) it would return `("")`, the last
 949 |    value matched for the only template binding symbol.
 950 |    
 951 |    For this reason, without a BODY specification we do extra setup for
 952 |    vectors and create shadow symbols for the bindings in the template that
 953 |    specify vectors. The shadow symbols are initialized and assigned like regular
 954 |    variables, solely so that we have something to make the PARSE default
 955 |    return value compatible with similar uses of non-vector template binding
 956 |    variables.  Thus the above example would return ("a" "b" ""), just as if
 957 |    you'd said (parse "a b" (a a a))
 958 | 
 959 |    Similarly, (parse :using-vector (v) "a b" (v b)) => '("a" "b")
 960 | 
 961 | 3. You may wish to return NIL (or some other non-default value) to avoid
 962 |    `PARSE` consing a list as the default result (when there is no BODY)
 963 |    when you already have the result in a vector or other previously
 964 |    allocated bindings, presumably because you may be trying to avoid
 965 |    consing when you specify :USING or :USING-VECTOR.
 966 | 
 967 |    Of course this (NIL or other BODY return) also suppresses maintenance of
 968 |    the vector shadow symbol discussed in item 2 as well.
 969 | 
 970 | # User Extensible Behaviors
 971 | 
 972 | This section describes ways to extend REXXPARSE behavior by adding new
 973 | scanners (pattern matchers), transformers, and options (*TBD* - maybe not options).
 974 | 
 975 | # Future work
 976 | 
 977 | ## Cleanup and code improvements
 978 | 
 979 | It took me longer than expected to grok how REXX' PARSE command works, and
 980 | my implementation took many twists and turns as I went down that road of
 981 | discovery.  The result is some code I don't like that could undoubtedly be
 982 | streamlined to express the rexx semantics better and work a bit faster.
 983 | In particular the scanners and the calling logic that decides what to do
 984 | with the scanner data, e.g. figuring out what is supposed to be extracted
 985 | with the semantics of '>' vs '+'. 
 986 | 
 987 | ## Regexp patterns
 988 | 
 989 | The base `REXXPARSE` capabilities emulate REXX, and by design do not
 990 | incorporate regular expressions into the functionality.  However I was
 991 | thinking it would be nice to also allow that if people want it, in a
 992 | separate ASDF lisp system that combines CL-PPCRE and REXXPARSE into a
 993 | `REXXPARSE-RE` system, so that regexp string scanning is done via a
 994 | user-extensible interface to REXXPARSE. I.e. in addition to plain string
 995 | literal patterns, you'd have regexp patterns as well. Note that this
 996 | wouldn't change the inverted matching style of `PARSE`, it would just
 997 | augment what could be matched.
 998 | 
 999 | It isn't clear that anybody will ever use REXXPARSE much less a
1000 | hypothetical REXXPARSE-RE, so this is unlikely to appear without an
1001 | indicator of interest.
1002 | 
1003 | ## Some real use of the extension mechanisms
1004 | 
1005 | The main tentative extension mechanisms now are `*OPTIONS*` and
1006 | `*PATTERN->SCANNER*`.  I've chosen to use special variables and NOT generic
1007 | functions so that in the unlikely event you had two users of REXXPARSE in
1008 | the same lisp system, they could both extend the behavior without
1009 | clobbering each other, and generic functions would not support that.
1010 | 
1011 | However I haven't actually tested this, so consider the extension
1012 | mechanisms a work in progress until someone builds something like the
1013 | `REXXPARSE-RE` system described above, or other things, to battle test the extension logic.
1014 | I.e. there may be breaking changes (only on the extension mechanisms) until
1015 | I know it's usable.
1016 | 


--------------------------------------------------------------------------------
/rexxparse.lisp:
--------------------------------------------------------------------------------
  1 | (in-package :rexxparse)
  2 | 
  3 | ;;;; Types of functions in this module.
  4 | ;;;;
  5 | ;;;; 1. Scanning functions. These scan text for some pattern known to the function
  6 | ;;;;    beginning at some starting position.
  7 | ;;;;
  8 | ;;;;    Input arguments: 
  9 | ;;;;    1. PARSE "source" text to be scanned, always a string.
 10 | ;;;;    2. Fixnum indicating the first position in source at which to start scanning.
 11 | ;;;;    3. Fixnum indicating the position of the last pattern match. This is zero
 12 | ;;;;       for the first scan.  This data is mostly needed for relative positional scans
 13 | ;;;;       which are interpreted relative to start of the last pattern match.
 14 | ;;;;    4. The "pattern" which is to be used for the scanning, e.g. the string
 15 | ;;;;       to be sought in the source text, a POSITION directive to the scanner,
 16 | ;;;;       or a regexp or other custom representation used by a custom scanner
 17 | ;;;;       of your deriving based on a scanner lookup function you provide via
 18 | ;;;;       *PATTERN->SCANNER*.
 19 | ;;;;
 20 | ;;;;    Not all scanners need all arguments, the arguments are supplied for extensible
 21 | ;;;;    scanner API purposes.
 22 | ;;;;
 23 | ;;;;    Scanners return NIL if the pattern is not found, otherwise 
 24 | ;;;;    they return two values:
 25 | ;;;;    - the position within the input text of the first character matched
 26 | ;;;;    - the position following the last character of the matched pattern.
 27 | ;;;;      Note that for positional patterns the two values will generally be EQL.
 28 | ;;;;  
 29 | ;;;; 2. Extract functions. Extract functions are called when the scanner
 30 | ;;;;    identified a (possibly zero length) region of text to extract from 
 31 | ;;;;    the "source".
 32 | ;;;;
 33 | ;;;;    Input arguments:
 34 | ;;;;    1. The PARSE "source" text of which some subsequence is to be extracted.
 35 | ;;;;    2. The inclusive starting position of the region to extract.
 36 | ;;;;    3. The exclusive end position of the region to extract.
 37 | ;;;;
 38 | ;;;;    Extractors always return strings. The strings returned by the extractor
 39 | ;;;;    should be considered immutable.
 40 | ;;;;
 41 | ;;;; 3. Transform functions. These take a string as input and produce any type of
 42 | ;;;;    value as output.
 43 | ;;;;
 44 | ;;;; WARNING:
 45 | ;;;; All strings bound or returned by PARSE should be considered immutable.
 46 | ;;;; We prefer to avoid consing unnecessary string copies in this implementation.
 47 | 
 48 | ;;; Reminder to self: `string=` works on _string designators_, including symbols,
 49 | ;;; and looks at the name of symbols, so no need to deref with `symbol-name` first.
 50 | 
 51 | ;;; Original REXX PARSE algorithm description which was surprisingly unreadable to me, but FWIW:
 52 | ;;; https://www.ibm.com/docs/en/zos/2.1.0?topic=parsing-details-steps-in
 53 | 
;; Canonical empty string.  STRING= is passed as the :TEST so the constant is
;; compared by content (not identity) should this form be evaluated again.
(alexandria:define-constant +empty-string+ "" :test #'string=)

;; Special variable whose initial value is the shared +EMPTY-STRING+ object,
;; not a fresh string.
(defvar *UNMATCHED-BINDING-VALUE* +empty-string+
  "PARSE initializes bindings to this value. Unmatched bindings
will return this value. Rebind (outside the call to PARSE)
if you want a different outcome.")
 60 | 
 61 | (defun extract (term start end)
 62 |   "Extraction only. No transformation, not even blank spaces.
 63 | Extract text in string TERM from START inclusive to END exclusive after
 64 | Return a new string with the extracted content."
 65 |   (declare (string term) (fixnum start end))
 66 |   (subseq term start end))
 67 | 
 68 | (defun extract-after-left-trim (term start end)
 69 |   "Transformer & Extraction.
 70 | Extract text in string TERM from START inclusive to END exclusive after
 71 | trimming leading space characters (and ONLY leading space characters).
 72 | Return a string with the trimmed content."
 73 |   ;; Note that REXX guide says:
 74 |   ;; "Leading blanks are removed from each word in the string before it is assigned
 75 |   ;; to a variable, as is the blank that delimits the end of the word."
 76 |   (declare (string term) (fixnum start end))
 77 |   (loop while (and (< start end)
 78 |                    (char= #\space (schar term start)))
 79 |         do (incf start))
 80 |   (subseq term start end))
 81 | 
 82 | (defun scan-absolute-position (source source-start last-match-start position)
 83 |   "Pseudo-scanner for an absolute one-based position. 
 84 | 
 85 | Valid (one-based) positions/columns are from one to length of the source.
 86 | Values less than 1 are treated like 1. Values > length are treated as length.
 87 | 
 88 | If the position is not a valid column in the source string, return NIL
 89 | for the match start pos, and (1- POSITION) for the match-follow-pos.
 90 | This is similar to searches for empty string patterns, except that
 91 | we are going to set the start position.
 92 | 
 93 | If the position denotes a valid column in the source string,
 94 | return two values, (1- POSITION) for the match start position
 95 | and (1- POSITION) (as if we scanned a zero length string
 96 | pattern without the 'no empty patterns' rule).
 97 | 
 98 | Note: position 1 is like an empty string pattern, it is never found.
 99 | However unlike an empty string it may modify the source start position."
100 |   (declare (string source) (ignore last-match-start) (fixnum source-start position))
101 |   (let ((position (1- position)))       ;to zero-based index
102 |     (when (< position 0) (setq position 0))
103 |     (if (= position source-start)       ;special "not found" semantics
104 |         (values nil position)
105 |         (let ((len (length source)))
106 |           (when (> position len) (setq position len))
107 |           (if (< position len)
108 |               (values position position)
109 |               nil)))))
110 | 
(defun scan-leftward-relative-position (source source-start last-match-start offset)
  "Pseudo-scanner for a leftward relative offset from the last match.

OFFSET is expected to be a non-negative fixnum (the caller enforces this).
It is subtracted from LAST-MATCH-START to obtain a new index; a negative
result is treated as zero.

Results:
- new index is zero: two values, NIL and 0 (the 'not found' case, like an
  empty string pattern, which still reports a position to the caller).
- new index is positive: two values, both the new index, as if a
  zero-length pattern had been matched there.

SOURCE and SOURCE-START are ignored, so the position returned may precede
SOURCE-START; the caller has to deal with that."
  (declare (ignore source source-start) (fixnum last-match-start offset))
  (let ((target (max 0 (- last-match-start offset))))
    (if (plusp target)
        (values target target)
        (values nil target))))          ;special "not found" semantics
139 | 
(defun scan-leftward-length-position (source source-start last-match-start offset)
  "Pseudo-scanner for a leftward length offset from the last match (`<` semantics).

Always returns two values: LAST-MATCH-START minus OFFSET (never less than
zero), and LAST-MATCH-START itself.  SOURCE and SOURCE-START are ignored.
OFFSET is expected to be a non-negative fixnum (the caller enforces this)."
  (declare (ignore source source-start) (fixnum last-match-start offset))
  (values (max 0 (- last-match-start offset))
          last-match-start))
148 | 
(defun scan-rightward-relative-position (source source-start last-match-start offset)
  "Pseudo-scanner for a rightward relative offset from the last match.

OFFSET is expected to be a non-negative fixnum (the caller enforces this).
It is added to LAST-MATCH-START, which is already zero-based, to obtain a
new index into SOURCE.

Results:
- new index is at or beyond the length of SOURCE: two values, NIL and the
  length of SOURCE (similar to a search for an empty string pattern).
- otherwise: two values, both the new index, as if a zero-length pattern
  had been matched there.

SOURCE-START is ignored."
  (declare (ignore source-start) (string source) (fixnum last-match-start offset))
  (let ((limit (length source))
        (target (+ last-match-start offset)))
    (if (< target limit)
        (values target target)
        (values nil limit))))
171 | 
(defun scan-rightward-length-position (source source-start last-match-start offset)
  "Pseudo-scanner for a rightward length (>) position, similar to, but different from, '+'.

The new index is LAST-MATCH-START plus OFFSET, capped at the length of
SOURCE.  Both return values are that index: where '+' reports no match on
overflow, '>' still matches at the end of SOURCE.  SOURCE-START is ignored."
  (declare (ignore source-start) (string source) (fixnum last-match-start offset))
  (let ((target (min (+ last-match-start offset) ;last-match-start already zero-based
                     (length source))))
    (values target target)))
182 | 
;; SCAN-STRING passes this value as the :TEST argument to SEARCH, so it is
;; applied to pairs of sequence elements (characters) rather than to whole
;; strings.  The default STRING= accepts characters because they are string
;; designators.
(defvar *string-equality-predicate* #'string=
  "Predicate used to compare strings in `scan-string`.")
185 | 
(defun scan-string (source source-start last-match-start string)
  "Scanner. Look for STRING in SOURCE, starting the search at SOURCE-START.

Returns NIL when STRING is empty (an empty string is never found) or when
it does not occur.  Otherwise returns two values: the position in SOURCE
where the occurrence of STRING begins, and the position of the character
following that occurrence.

Elements are compared with *STRING-EQUALITY-PREDICATE*.  LAST-MATCH-START
is ignored."
  (declare (string source string) (fixnum source-start) (ignore last-match-start))
  (unless (zerop (length string))
    (let ((hit (search string source
                       :start2 source-start
                       :test *string-equality-predicate*)))
      (when hit
        (values hit (+ hit (length string)))))))
202 | 
(defun scan-word-split (source start last-match-start unused) ; unused == :word-split-pattern
  "Scanner. Search for the word-splitting space (or end of string) that
follows the next word/token at or after START, skipping any spaces that
precede the token.

If there is no word token (only spaces remain, or START is at or beyond the
end of SOURCE), return NIL.
Otherwise return two values:
1. the position in SOURCE of the first space following the word token,
   or the length of SOURCE if no space follows it.
2. one more than value 1 if a space was found, or the length of SOURCE
   (same as value 1) if the token runs to the end of the string.

SOURCE may be any string, including non-simple ones (fill-pointered or
adjustable).  LAST-MATCH-START and UNUSED are ignored."
  (declare (string source) (fixnum start) (ignore last-match-start unused))
  (let* ((end (length source))
         ;; First non-space character at or after START, if any.  POSITION
         ;; works on any string, whereas SCHAR is only defined for simple
         ;; strings.  The (< start end) guard keeps the 'no token' answer
         ;; for a START beyond the end instead of a bounding-index error.
         (word-start (and (< start end)
                          (position #\space source :start start :test #'char/=))))
    (declare (fixnum end))
    (when word-start
      ;; The token ends at the next space, or at the end of SOURCE.
      (let ((word-end (or (position #\space source :start word-start) end)))
        (values word-end (min (1+ word-end) end))))))
228 | 
;; Built with LIST/CONS rather than a quoted literal, so a fresh alist is
;; created each time this form is evaluated.
(defparameter *builtin-list-pattern-scanners*
  (list (cons := 'scan-absolute-position)
        (cons :- 'scan-leftward-relative-position)
        (cons :+ 'scan-rightward-relative-position)
        (cons :< 'scan-leftward-length-position)
        (cons :> 'scan-rightward-length-position)
        (cons :$ 'scan-string))
  "An association list keyed by pattern operator keyword (the canonicalized form
of user symbols specified for list patterns like `(+ <s-exp>)` which were originally 
symbols from potentially any package).

Values are function symbols which should perform the scan when invoked as
`(fn source-string start-position last-match-start <s-exp>)` (Similar to `scan-string`).")
242 | 
(defun aget (key alist &key (test #'eql) not-found)
  "Like ASSOC, but return the value (the CDR) of the first entry in ALIST
whose CAR matches KEY under TEST, rather than the entry itself.
Return NOT-FOUND if ALIST has no entry for KEY."
  (let ((entry (assoc key alist :test test)))
    ;; Test the entry, not its value, so a stored NIL is still returned
    ;; as a hit instead of falling through to NOT-FOUND.
    (if entry
        (cdr entry)
        not-found)))
249 | 
250 | ;;; Pattern representation after parsing template
251 | ;;; As these are created at macro processing time, and referenced in the expansion
252 | ;;; we need to arrange for the pattern struct to be written/read in fasls with MAKE-LOAD-FORM
253 | 
;; Parsed representation of a single template pattern.
;; The (:constructor %make-pattern) option names the generated constructor
;; %MAKE-PATTERN instead of the default MAKE-PATTERN.  Slots are read through
;; the usual PATTERN-<slot> accessors, e.g. PATTERN-SEXP and PATTERN-OPERATOR.
(defstruct (pattern (:constructor %make-pattern))
  sexp        ;original macro argument
  string      ;if sexp is a string. Presently nil for `($ x)` sexp though
  ;; Position isn't necessarily known until pattern is used while matching
  ;; it may rely on variables previously bound in the PARSE matching.
  ;; It is known if a number was specified as an absolute position pattern.
  position    ;integer value of pattern if fixnum sexp or positional-p
  operator    ;iff pattern is list or keyword, operator as a keyword
  relpos-p    ;true if operator is + or - relative position directive
  positional-p;true if operator is +/-/=/</>
  parameter   ;unevaluated argument symbol iff pattern is a list referencing a symbol
  )
266 | 
(defun pattern->scanner (pattern)
  "Default scanner selector: given a PATTERN structure, return a function
designator for the scanner that handles it.  Note that some patterns are
directives more than things sought, for example so-called positional patterns.

- List patterns are dispatched on their operator keyword through
  *BUILTIN-LIST-PATTERN-SCANNERS*; an unknown operator signals an error
  listing the valid operators.
- Fixnum literals select SCAN-ABSOLUTE-POSITION.
- String literals select SCAN-STRING.
- Anything else signals an error.

See package README.md file for more information on valid pattern literals.

If you want to specify a different scanner for a pattern in some extension,
bind the *PATTERN->SCANNER* special variable to your own function, which should
call this function for any pattern it doesn't wish to handle itself."
  (let ((literal (pattern-sexp pattern)))
    (cond ((listp literal)
           (let ((op (pattern-operator pattern)))
             (or (aget op *builtin-list-pattern-scanners*)
                 (error "Unknown pattern action ~s in ~s.~%Valid actions are ~{~a~^, ~}"
                        op literal
                        (mapcar #'car *builtin-list-pattern-scanners*)))))
          ((typep literal 'fixnum) 'scan-absolute-position)
          ((stringp literal) 'scan-string)
          (t (error "Unable to derive a scanner for pattern literal ~s" literal)))))
286 | 
(defvar *pattern->scanner* 'pattern->scanner
  "A function designator which is called with a PATTERN structure (as built by
MAKE-PATTERN from a PARSE pattern literal such as a string, position designator,
or list pattern) and should return a function designator for a scanner that
looks for the pattern in a PARSE source string.

This variable defaults to PATTERN->SCANNER. It is expressed as a dynamic variable
so that you can rebind it if you have special pattern needs that aren't supported
by the default REXXPARSE behavior. A replacement may call PATTERN->SCANNER for
any pattern it does not wish to handle itself.

The resulting scanner function is invoked by MATCH-AND-EXTRACT with four arguments:
- source string to be scanned
- start position within the source string for scanning
- start position within the source string of the previous match
- the pattern argument: the literal string or integer of the pattern, or the
  evaluated parameter of a list pattern.

The scanner function should return two values:
- the match position of the start of the pattern in the input source string,
  or NIL if there was no match.
- the position of the first character in the source string after the matched
  text.")
305 | 
(defmethod make-load-form ((p pattern) &optional env)
  "Return a form that reconstructs the pattern P when a compiled file is loaded.

PATTERN structures are embedded as literal objects in PARSE expansions, so the
file compiler needs this method to dump them into fasls.

The returned form is EVALUATED at load time, so every slot value is quoted.
Without the quotes a list sexp such as `(+ 3)` or a symbol parameter such as `x`
would be evaluated as code (yielding 3, or an unbound-variable error) instead of
being reproduced as data."
  (declare (ignore env))
  `(%make-pattern :sexp ',(pattern-sexp p)
                  :string ',(pattern-string p)
                  :position ',(pattern-position p)
                  :operator ',(pattern-operator p)
                  :relpos-p ',(pattern-relpos-p p)
                  :positional-p ',(pattern-positional-p p)
                  :parameter ',(pattern-parameter p)))
315 | 
(defun make-pattern (s-exp)
  "Allocate and initialize a pattern struct given some macro s-expression representing a pattern.

S-EXP is one of:
- a string literal (stored in the STRING slot),
- an integer literal (stored in the POSITION slot),
- a list whose car is an operator symbol from any package, e.g. `(+ x)`; the
  operator is canonicalized to a keyword and the second element is stored,
  unevaluated, in the PARAMETER slot,
- one of a set of keywords denoting special pattern activity, such as
  :word-split-pattern or :end-of-text-pattern, which becomes the operator.

For string and integer literals the OPERATOR slot is NIL."
  (let* ((operator (if (listp s-exp) (car s-exp) nil))
         (operator-keyword (cond ((keywordp s-exp) s-exp)
                                 ((keywordp operator) operator)
                                 ;; OPERATOR is NIL for non-list s-exps.  NIL is a symbol,
                                 ;; so it must be excluded here or it would be interned
                                 ;; as the bogus operator keyword :NIL.
                                 ((and operator (symbolp operator))
                                  (intern (symbol-name operator) :keyword)))))
    (%make-pattern :sexp s-exp
                   :string (if (stringp s-exp) s-exp nil)
                   :position (if (integerp s-exp) s-exp nil)
                   :operator operator-keyword
                   :relpos-p (or (eq :- operator-keyword) (eq :+ operator-keyword))
                   :positional-p (and (member operator-keyword '(:+ :- := :> :<) :test #'eq) t)
                   :parameter (if (listp s-exp) (second s-exp) nil))))
331 |                  
(defun pattern-argument (pattern)
  "Return the argument of PATTERN: its literal string if it has one, else its
literal integer position if it has one, else its (unevaluated) list parameter.
Pseudo-patterns such as :end-of-text-pattern have none of these, so NIL is
returned for them."
  (cond ((pattern-string pattern))
        ((pattern-position pattern))
        (t (pattern-parameter pattern))))
338 | 
(defun pattern? (s-exp)
  "Return true if s-exp indicates a pattern in a template (vs a binding symbol)
Valid patterns are:

    <pattern> ::= string-literal
    <pattern> ::= <position>
    <pattern> ::= ( $ <sexp> )

    <position> ::= position-integer-literal
    <position> ::= ( + <position-integer> )
    <position> ::= ( - <position-integer> )
    <position> ::= ( = <position-integer> )
    <position> ::= ( < <position-integer> )
    <position> ::= ( > <position-integer> )

    <position-integer> ::= position-integer-literal
    <position-integer> ::= <sexp>

The operator symbol of a list pattern is recognized by name, so it may come
from any package.  Only the head of a list is inspected; entities for which we
return true have not been fully validated."
  (typecase s-exp
    ((or string integer) t)
    (list (let ((head (car s-exp)))
            (and (symbolp head)
                 (member head '("+" "-" "=" "<" ">" "$") :test #'string=))))
    (t nil)))
361 | 
(defun implicit-positional-pattern-fixnum (pattern pattern-arg)
  "Given a pattern structure and evaluated pattern-arg as per MATCH-AND-EXTRACT,
allow implicit conversion of a string valued pattern-arg to a non-negative fixnum
for positional s-expression patterns.  This is so we can use previously matched
numbers from the source string as numeric positions for +/-/=/</> positional patterns.

Return pattern-arg as-is if pattern is not a positional s-exp pattern,
otherwise attempt to convert it to a non-negative fixnum if it is not already such a thing.

Signal a condition if pattern is a positional pattern and pattern-arg isn't, or isn't convertable
to, a non-negative fixnum.

The converted position is deliberately NOT stored back into PATTERN.  The
pattern struct is a literal object shared by every execution of the PARSE
expansion, so caching the first value seen would make a pattern such as `(+ n)`
keep using a stale position when N has a different value on a later execution."
  (if (pattern-positional-p pattern)
      (let ((position (etypecase pattern-arg
                        (string (parse-integer pattern-arg))
                        (integer pattern-arg))))
        ;; Restartable with a new position, which is assigned to POSITION if it
        ;; passes the test.  It doesn't actually have to be a fixnum, just in
        ;; the 0+ range of a fixnum.
        (check-type position (integer 0 #.most-positive-fixnum))
        position)
      pattern-arg))
385 | 
(defun match-and-extract (source scan-start last-match-start pattern pattern-arg extract-p
                          &aux (operator (pattern-operator pattern)))
  "Scan SOURCE for PATTERN and, if requested, extract the text to be bound.

SOURCE is the string to be parsed.

SCAN-START is the zero-based offset in SOURCE where we begin scanning.
For all scans but the first, it is the position following a prior pattern match or 
positional pseudo-match. For the first scan it will be zero.

LAST-MATCH-START is the zero-based offset in SOURCE where the last pattern match _began_.
This is important for relative positional offsets which are relative to the start of
the last match.  For the first scan it will be zero.

PATTERN is a pattern structure with all the original pattern data derived from the template.

PATTERN-ARG is the self-evaluating or evaluated argument derived from a symbolic parameter in the
pattern. If the pattern had a variable reference, this is the value of the variable from the 
macro-expanded code. It should be a string or fixnum, unless we've added support for extensions
which would supply regular expressions or other data types.  For positional patterns a string
value is converted to an integer by IMPLICIT-POSITIONAL-PATTERN-FIXNUM before scanning.

EXTRACT-P is true if a value should be extracted for binding, nil if there is no
binding to be performed.

Given an input string SOURCE to be scanned, a starting position SCAN-START,
and PATTERN being scanned for, do the following:

1. If PATTERN is the :end-of-text-pattern pseudo-pattern, no scanner is used:
   return the text from SCAN-START to the end of SOURCE (or NIL if EXTRACT-P is
   false) and NIL.  Only two values are returned in this case.

2. Otherwise select a scanner: SCAN-WORD-SPLIT for the :word-split-pattern
   pseudo-pattern, else the scanner designated by (funcall *PATTERN->SCANNER* pattern).
   Invoke it as (scanner SOURCE SCAN-START LAST-MATCH-START PATTERN-ARG).

3. If the scanner indicated a match, invoke the extractor on the region
   selected by the pattern kind (see the COND in the code), and return the
   extracted text, the match start, and the position to resume scanning.

   The extractor is EXTRACT-AFTER-LEFT-TRIM when the pattern is the implicit
   :word-split-pattern, and EXTRACT for every other pattern.

4. If the scanner does not indicate a match, return the tail match text,
   0, and the scanner's second value (expected to be NIL, i.e. no position to
   resume scanning).

Binding transforms are NOT applied here; the PARSE expansion applies them to the
first value returned by this function.

Restating return values:
If the scanner found a match return three values:
- The value returned by the extraction.
- The starting position of the scanner match. For a text pattern this would be the position
  of the first character of the text within the source string.  For a positional pattern
  this would be the position indicated by the positional directive.
- The position of the next place in source to resume scanning. For a text pattern
  this is the position of the first character in source following the last character matched
  in the text pattern.  For a positional pattern this will be identical to the positional
  directive.  For the `<` operator the match start position is returned here.

If the scanner did not find a match, return three values:
- All unprocessed text in the source as-is, i.e. text from scan-start to end of source string.
- 0
- The scanner's second value (expected to be NIL).

Note that the first value is NIL whenever EXTRACT-P is false, so it is the
third value that is the determination of whether additional scans might be
performed, with the caveat that positional patterns remaining to be processed
could back up the scan position!
"
  (declare (string source) (fixnum scan-start last-match-start) (pattern pattern))
  (if (eq operator :end-of-text-pattern)
      ;; Special tail binding semantics, *TBD*: allow extract/transform semantics?
      ;; Note: only two values are returned here; receivers see NIL for the third.
      (values (and extract-p (extract source scan-start (length source))) nil)
      ;; Select a scanner based on the full pattern expression, e.g. `(+ x)`
      ;; Invoke a scanner on the pattern expression value, which is either the 
      ;; pattern itself if it's a literal (string, number, etc), or the second
      ;; value in the pattern if it's a list, e.g. `x` in `(+ x)`.
      (let* ((scanner (if (eq operator :word-split-pattern)
                          'scan-word-split
                          (funcall *pattern->scanner* pattern)))
             (extractor (if (eq operator :word-split-pattern)
                            'extract-after-left-trim
                            'extract))
             ;; Strings to integers for positional patterns, if necessary
             (pattern-arg (implicit-positional-pattern-fixnum pattern pattern-arg)))
        (declare (type (or function symbol) extractor scanner))
        (multiple-value-bind (match-start-pos match-follow-pos)
            (funcall scanner source scan-start last-match-start pattern-arg)
          (if match-start-pos
              (let ((value 
                      (and extract-p
                           (cond 
                             ;; `<` : extract the region the scanner itself delimited
                             ((eq :< operator)
                              (funcall extractor source match-start-pos match-follow-pos))

                             ;; `>` : extract from the start of the last match
                             ((eq :> operator)
                              (funcall extractor source last-match-start match-follow-pos))

                             ;; +/- relative positions based on last match _start_
                             ((pattern-relpos-p pattern)
                              (if (>= last-match-start match-start-pos)
                                  ;; relpos backward string break or zero offset '>'
                                  (funcall extractor source last-match-start (length source))
                                  ;; relpos forward
                                  (funcall extractor source last-match-start match-start-pos)))
                             
                             ;; Absolute positions, if in effect, might precede scan-start
                             ;; Could be a normal string or word-split match situation as well.
                             ((> match-start-pos scan-start)
                              (funcall extractor source scan-start match-start-pos))
                         
                             ;; Presumed absolute position preceding start of last pattern match.
                             (t "")))))
                (values value match-start-pos 
                        ;; `<` resumes scanning at the match start rather than after it
                        (if (eq :< operator)
                            match-start-pos
                            match-follow-pos)))
              ;; Special tail binding semantics again
              (values (and extract-p (funcall extractor source scan-start (length source))) 
                      0 match-follow-pos))))))
503 | 
(defparameter *transform-names*
  '("UPPER" "LOWER" "SNAKE" "KEBAB" "LTRIM" "RTRIM" "TRIM" "INTEGER" "DOUBLE" "FLOAT" "KEYWORD")  
  "Names of the 'predefined' binding transformations, each of which expects one
argument (the symbol to bind), e.g. `(UPPER x)`.  BINDING? and MAKE-BINDING
compare the head symbol of a binding list against these names with STRING-EQUAL,
so the symbol may come from any package.
The TRANSFORM operator expects two arguments and is not in this list.")
508 | 
509 | ;; *TBD* whether _all_ extracted strings passed to transforms are safe to use with destructive transforms.
510 | ;; For now transforms make copies.  Besides, we tell users not to modify returned strings.
511 | 
(defun to-snake (str)
  "Transform returning STR with every hyphen replaced by an underscore."
  (declare (type string str))
  (substitute-if #\_ (lambda (ch) (char= ch #\-)) str))
516 | 
(defun to-kebab (str)
  "Transform returning STR with every underscore replaced by a hyphen."
  (declare (type string str))
  (substitute-if #\- (lambda (ch) (char= ch #\_)) str))
521 | 
(defvar *trim-character-bag* (list #\space)
  "Sequence of characters as per CL:STRING-TRIM that should be considered whitespace
for removal by the LTRIM, RTRIM, and TRIM predefined transforms (implemented by
LEFT-TRIM, RIGHT-TRIM and TRIM). You can bind this if you like, but it will
affect all trimming transforms in the scope of the binding.

The default value is just the space character.")
528 | 
(defun left-trim (str)
  "Transform that strips leading characters found in *TRIM-CHARACTER-BAG*
(by default just the space character) from STR."
  (declare (type string str))
  (let ((bag *trim-character-bag*))
    (string-left-trim bag str)))
534 | 
(defun right-trim (str)
  "Transform that strips trailing characters found in *TRIM-CHARACTER-BAG*
(by default just the space character) from STR."
  (declare (type string str))
  (let ((bag *trim-character-bag*))
    (string-right-trim bag str)))
540 | 
(defun trim (str)
  "Transform that strips both leading and trailing characters found in
*TRIM-CHARACTER-BAG* (by default just the space character) from STR."
  (declare (type string str))
  (let ((bag *trim-character-bag*))
    (string-trim bag str)))
546 | 
(defun to-float (str)
  "Transform that converts a string to single-float representation.
Delegates to PARSE-FLOAT:PARSE-FLOAT with :TYPE 'SINGLE-FLOAT."
  (declare (string str))
  (parse-float:parse-float str :type 'single-float))
551 | 
(defun to-double (str)
  "Transform that converts a string to double-float representation.
Delegates to PARSE-FLOAT:PARSE-FLOAT with :TYPE 'DOUBLE-FLOAT."
  (declare (string str))
  (parse-float:parse-float str :type 'double-float))
556 | 
(defun to-keyword (str)
  "Transform that interns STR in the KEYWORD package and returns the keyword.
The case of STR is used exactly as given.  If you want to coerce the case
before the transform sees it (transforms don't nest), you could use the
:UPPER or :LOWER options to PARSE, e.g.

    (parse :upper \"abc\" ((keyword x))) => :ABC"
  (declare (type string str))
  (intern str "KEYWORD"))
566 | 
(defparameter *builtin-transform-functions*
  (list (cons :UPPER 'string-upcase)
        (cons :LOWER 'string-downcase)
        (cons :SNAKE 'to-snake)
        (cons :KEBAB 'to-kebab)
        (cons :LTRIM 'left-trim)
        (cons :RTRIM 'right-trim)
        (cons :TRIM 'trim)
        (cons :INTEGER 'parse-integer)
        (cons :FLOAT 'to-float)
        (cons :DOUBLE 'to-double)
        (cons :KEYWORD 'to-keyword))
  "Association list keyed by transform operator keywords (e.g. :UPPER)
and valued by function designators for the functions to perform the associated transform.

MAKE-BINDING looks predefined transforms up here and signals an internal error
when a name accepted via *TRANSFORM-NAMES* has no entry, so the two must be
kept in step.")
581 |         
(defun options-declare-symbol-pre-bound (options symbol)
  "Search the PARSE options list OPTIONS for the first list-valued option of the
form (:USING ...) or (:USING-VECTOR ...) that mentions SYMBOL.
Return that option's name (:USING or :USING-VECTOR), or NIL if SYMBOL is not
declared by any such option."
  (dolist (option options nil)
    (when (consp option)
      (let ((head (first option)))
        (when (and (member head '(:using :using-vector))
                   (member symbol (rest option)))
          (return head))))))
592 | 
(defun plain-symbol-p (s-exp)
  "Return T if S-EXP is a symbol that is not a keyword, NIL otherwise.
Note that NIL is itself such a symbol."
  (not (or (not (symbolp s-exp))
           (keywordp s-exp))))
596 | 
(defun binding? (s-exp)
  "Return true if s-exp represents a binding, NIL if it does not.
This applies to '_' symbols too.

For an expression to be a binding it must be either a non-keyword symbol, or a
list whose head is a symbol named TRANSFORM or one of *TRANSFORM-NAMES* (compared
by name, case-insensitively, so the symbol may come from any package).

Lists whose head is not a symbol (e.g. `(42 x)`, `((a) b)` or `(\"UPPER\" x)`)
are not bindings: NIL is returned for them rather than signalling a type error,
so that the caller can report an invalid template expression."
  (or (plain-symbol-p s-exp)
      (and (consp s-exp)
           ;; STRING-EQUAL requires string designators; only symbols are
           ;; meaningful transform names, so test for that first.
           (symbolp (car s-exp))
           (or (string-equal (car s-exp) "TRANSFORM")
               (member (car s-exp) *transform-names* :test #'string-equal)))))
606 | 
;; A parsed template binding: a symbol to bind, optionally wrapped in a
;; transform expression.  Instances are built by MAKE-BINDING.
(defstruct (binding (:constructor %make-binding))
  sexp                   ;original macro argument, symbol or list
  null-p                 ;if symbol name is '_'
  previously-bound-p     ;:USING or :USING-VECTOR if such an option is in effect for symbol, else nil
  vector-p               ;true if (:USING-VECTOR x) is in effect for symbol
  shadow-symbol          ;gensym if no parse body & vector-p is true (set by SYMBOLS-TO-ALLOCATE)
  user-symbol            ;sexp if symbol, else second element of transform list
  transform-operator     ;NIL or keyword describing a transformation, e.g. :UPPER, :TRANSFORM
  transform-function)    ;built-in function symbol, or the unevaluated user-supplied designator form
616 | 
(defun make-binding (options s-exp)
  "Given a list of parse options and an s-expression representing a symbol to be
bound, or list with a transformation expression having a symbol to be bound, return a
BINDING structure representing the binding.

OPTIONS is consulted (via OPTIONS-DECLARE-SYMBOL-PRE-BOUND) to learn whether the
symbol was declared with :USING or :USING-VECTOR.

S-EXP is one of:
- a symbol,
- (<transform> <symbol>) for a predefined transform named in *TRANSFORM-NAMES*,
- (TRANSFORM <symbol> <function-designator>) for a user-defined transform, where
  the designator is the UNEVALUATED form from the template.

Signals a condition if the binding expression has improper syntax,
assumes transform names have previously been validated by the call to `binding?`."
  (let* ((listp (listp s-exp))
         (tokens (if listp (length s-exp) 1))
         (user-symbol (if listp (second s-exp) s-exp))
         (transform-symbol (alexandria:if-let 
                               ((s (and listp (first s-exp))))
                             (and (or (string-equal s "TRANSFORM")
                                      (member s *transform-names* :test #'string-equal))
                                  s)))
         (transform-operator (and transform-symbol
                                  (intern (symbol-name transform-symbol) :keyword)))
         (user-defined-transform-p (eq transform-operator :transform))
         (transform-function-designator (and user-defined-transform-p (third s-exp)))
         ;; USER-SYMBOL has not been validated yet and may be a number or a
         ;; list, e.g. in `(upper 42)`.  STRING= would signal a type error on
         ;; those before the syntax checks below can report the real problem.
         (null-p (and (symbolp user-symbol) (string= user-symbol "_")))
         (previously-bound-p (options-declare-symbol-pre-bound options user-symbol))
         (vector-p (eq previously-bound-p :using-vector)))
    (when transform-symbol
      (if user-defined-transform-p
          ;; user-defined transform via TRANSFORM
          (unless (and (= 3 tokens)
                       (symbolp user-symbol)
                       ;; The function designator forms are UNEVALUATED MACROEXPANSION ARGS
                       ;; so #'x and 'x are (function x) and (quote x)
                       (or (stringp transform-function-designator)
                           (symbolp transform-function-designator)
                           (and (consp transform-function-designator)
                                (member (car transform-function-designator)
                                        '(quote function lambda)))))
            (error "Expected (TRANSFORM <symbol> <function-designator>), received ~s" s-exp))
          ;; predefined transform
          (unless (and (= 2 tokens)
                       (symbolp user-symbol))
            (error "Expected (<transform> <symbol>), received ~s" s-exp))))
    (%make-binding :sexp s-exp
                   :null-p null-p
                   :previously-bound-p previously-bound-p
                   :vector-p vector-p
                   :user-symbol user-symbol
                   :transform-operator transform-operator
                   :transform-function
                   (and transform-symbol
                        (if user-defined-transform-p
                            transform-function-designator
                            (or (aget transform-operator *builtin-transform-functions*)
                                (error "Internal error looking up built-in transform function for ~s" 
                                       transform-operator)))))))
669 | 
(defun transform-function-sexp (binding)
  "Return a form, suitable for splicing into the PARSE macroexpansion, that
evaluates to the transform function of BINDING, or NIL if BINDING has no
transform.

- A symbol designator is returned quoted.
- A function object is returned as-is.
- A list designator (e.g. (function x), (quote x) or a lambda expression) is
  returned as the third element of the original binding s-expression."
  (let ((designator (binding-transform-function binding)))
    (when designator
      (etypecase designator
        (symbol (list 'quote designator))
        (function designator)
        (list (third (binding-sexp binding)))))))
678 | 
(defparameter *null-binding*
  (make-binding NIL '_)
  "Binding for the symbol `_`, indicating no variable should be bound.
TEMPLATE-BINDINGS-AND-PATTERNS pairs it with any pattern that has no preceding
binding in the template.")
682 | 
(defun template-bindings-and-patterns (options template)
  "Given PARSE options (already parsed list of keywords or lists starting with
keywords) and the TEMPLATE, an unevaluated and potentially empty list passed to
PARSE, return an association list, in template order, whose keys are BINDING
structures and whose values are PATTERN structures.

- A pattern with no binding before it is paired with *NULL-BINDING* (the
  binding for the symbol '_).
- Adjacent bindings get a :WORD-SPLIT-PATTERN pseudo-pattern between them.
- If the template ends with a binding, it is paired with the
  :END-OF-TEXT-PATTERN pseudo-pattern.

An error is signalled for a template element that is neither a pattern (as per
`pattern?`) nor a binding (as per `binding?`).  Beyond that, nothing about the
binding or pattern expressions is particularly validated yet.  We let `LET` in
the PARSE expansion validate the symbol names to be bound (it will hopefully
complain about attempts to bind NIL).

Note that valid bindings may be lists whose second element is the symbol to be bound,
(the first element is the name of a TRANSFORM), e.g. (UPPER foo).
Note that valid patterns-as-s-exps may be lists whose car is `$`."
  (let ((pairs '())                     ;reversed list of (binding . pattern)
        (pending nil))                  ;binding still waiting for its pattern
    (dolist (item template)
      (cond ((pattern? item)            ;emit pending binding (or '_) with this pattern
             (push (cons (or pending *null-binding*) (make-pattern item)) pairs)
             (setf pending nil))
            ((binding? item)
             (when pending              ;consecutive bindings, word-split semantics
               (push (cons pending (make-pattern :word-split-pattern)) pairs))
             (setf pending (make-binding options item)))
            (t (error "Invalid pattern or binding expression: ~s" item))))
    ;; A trailing binding matches through the end of the source text.
    (when pending
      (push (cons pending (make-pattern :end-of-text-pattern)) pairs))
    (nreverse pairs)))
720 | 
(defvar *options* nil
  "This special variable is bound, by the PARSE expansion, to the list of options
in effect for a PARSE invocation.
It is intended as a way for user extensions to access options they need to act upon
(e.g. CASELESS comparisons).")
725 |   
(defun symbols-to-return (bindings)
  "Given a list of bindings, return the symbols whose values PARSE returns when
its BODY is NIL, in order of first appearance and without duplicate names.

Null ('_') bindings are skipped.  Regular template symbols and :USING symbols
are included as-is; for :USING-VECTOR bindings the binding's shadow symbol is
used in place of the user's vector symbol."
  (let ((collected '()))
    (dolist (binding bindings (nreverse collected))
      (unless (binding-null-p binding)
        (let ((sym (if (binding-vector-p binding)
                       (binding-shadow-symbol binding)
                       (binding-user-symbol binding))))
          ;; Duplicates are detected by symbol name.
          (setf collected (adjoin sym collected :test #'string=)))))))
740 | 
(defun symbols-to-allocate (bindings body)
  "Given a list of binding structs, return the symbols for which the PARSE
expansion must create LET bindings: those of bindings that are not null ('_')
and were not declared pre-bound by the user, without duplicate names.

When BODY is NIL, every vector (:USING-VECTOR) binding additionally gets a
fresh shadow symbol, which is stored in the binding (a side effect) and added
to the result, so that the customary PARSE return value can be generated.
See the README.md on :USING-VECTOR for details."
  (let ((symbols '()))
    (dolist (binding bindings)
      (unless (or (binding-null-p binding)
                  (binding-previously-bound-p binding))
        (setf symbols (adjoin (binding-user-symbol binding) symbols :test #'string=))))
    (when (null body)
      (dolist (binding bindings)
        (when (binding-vector-p binding)
          (let ((shadow (gensym "PARSE-VECTOR-SHADOW-")))
            (setf (binding-shadow-symbol binding) shadow)
            (push shadow symbols)))))
    (nreverse symbols)))
760 | 
(defun parse-aux (options source template body)
  "Generate the macroexpansion for PARSE.

OPTIONS is the list of parse options, SOURCE the form yielding the string to
parse, TEMPLATE the unevaluated template list, and BODY the (possibly empty)
list of body forms.

The expansion is a LET that:
- binds the scan bookkeeping variables, the source string, *OPTIONS*,
  *STRING-EQUALITY-PREDICATE* (STRING-EQUAL if :CASELESS is among the options,
  else STRING=), and every symbol to allocate (initialized to
  *UNMATCHED-BINDING-VALUE*);
- checks that the source is a string;
- applies the :UPPER / :LOWER source transforms if requested;
- runs one MATCH-AND-EXTRACT form per binding/pattern pair inside (BLOCK NIL ...),
  leaving the block with (RETURN) as soon as a pair reports no position at
  which to resume scanning;
- finally evaluates BODY, or, if BODY is empty, returns a list of the values
  of the symbols computed by SYMBOLS-TO-RETURN."
  (let* ((binding-pattern-alist (template-bindings-and-patterns options template))
         (bindings (mapcar #'car binding-pattern-alist))
         (patterns (mapcar #'cdr binding-pattern-alist))
         (source-idx-sym (gensym "PARSE-SOURCE-IDX-"))
         (last-match-start-sym (gensym "PARSE-LAST-MATCH-START-IDX-"))
         (match-start-sym (gensym "PARSE-MATCH-START-IDX-"))
         (next-idx-sym (gensym "PARSE-NEXT-IDX-")) ;is also matched region end idx
         (result-sym   (gensym "PARSE-RESULT-"))
         (source-sym   (gensym "PARSE-SOURCE-"))
         ;; symbols-to-allocate are those symbols for which we will create LET bindings
         ;; Allocates shadow symbols for vector-p bindings if no body
         ;; Note: side effect allocates shadow symbols for vector  bindings if necessary
         ;; Should probably have been done at make-binding time, but wasn't.
         (symbols-to-allocate (symbols-to-allocate bindings body))
         ;; All bindings except null ('_') bindings. Vector bindings are replaced by shadow symbols
         ;; Relies on shadow-symbols previously established for bindings
         (symbols-to-return (symbols-to-return bindings))
         match-forms source-transforms var-initforms)
    ;; One form per (binding . pattern) pair: call MATCH-AND-EXTRACT, then either
    ;; record the result and advance the scan state, or record and leave the block.
    (setq match-forms 
          (mapcar (lambda (binding pattern)
                    (let ((symbol (binding-user-symbol binding))
                          decl match-sexp unmatch-sexp extract-p)
                      (if (binding-null-p binding)
                          ;; "_" case, explicitly or implicitly, no bindings to update
                          (setq decl `((declare (ignore ,result-sym)))
                                match-sexp `(setq ,source-idx-sym ,next-idx-sym
                                                  ,last-match-start-sym ,match-start-sym)
                                unmatch-sexp `(return))
                          ;; bindings are in effect
                          (let* ((xform (if (binding-transform-function binding)
                                            `(funcall ,(transform-function-sexp binding)
                                                      ,result-sym)
                                            `,result-sym))
                                 ;; Vector bindings push onto the user's vector; when a
                                 ;; shadow symbol exists it also receives the value.
                                 (assign-form (if (binding-vector-p binding)
                                                  (if (binding-shadow-symbol binding)
                                                      `(vector-push 
                                                        (setq ,(binding-shadow-symbol binding) ,xform)
                                                        ,symbol)
                                                      `(vector-push ,xform ,symbol))
                                                  `(setq ,symbol ,xform))))
                            (setq match-sexp `(progn
                                                ,assign-form
                                                (setq ,source-idx-sym ,next-idx-sym
                                                      ,last-match-start-sym ,match-start-sym))
                                  unmatch-sexp `(progn ,assign-form
                                                       (return))
                                  extract-p t)))
                      ;; The pattern struct itself is embedded as a literal in the expansion.
                      `(multiple-value-bind (,result-sym ,match-start-sym ,next-idx-sym)
                           (match-and-extract ,source-sym ,source-idx-sym ,last-match-start-sym
                                              ,pattern ,(pattern-argument pattern) ,extract-p)
                         ,@decl
                         (if ,next-idx-sym
                             ,match-sexp
                             ,unmatch-sexp))))
                  bindings patterns))
    ;; Whole-source case conversions requested via the :UPPER / :LOWER options.
    (loop for option in options
          as transform = (case option
                           (:upper `(setq ,source-sym (string-upcase ,source-sym)))
                           (:lower `(setq ,source-sym (string-downcase ,source-sym))))
          when transform
            do (push transform source-transforms))
    (setq var-initforms 
          `((,source-idx-sym 0) 
            ,match-start-sym
            (,last-match-start-sym 0)
            ,next-idx-sym
            ,result-sym
            (,source-sym ,source)
            (*options* ',options)
            (*string-equality-predicate* ,(if (member :caseless options) 
                                              '#'string-equal ;note leading quote requirement here
                                              '#'string=))
            ,@(mapcar (lambda (symbol) 
                        (list symbol `*unmatched-binding-value*))
                      symbols-to-allocate)))
    ;; With no user body, the expansion returns the bound values as a list.
    (setq body (or body `((list ,@symbols-to-return))))
    `(let (,@var-initforms)
       (declare (ignorable ,source-idx-sym ,next-idx-sym
                           ,match-start-sym ,last-match-start-sym
                           ,result-sym ,source-sym 
                           ,@symbols-to-allocate) 
                (fixnum ,source-idx-sym)) ;also last-match-start-sym?
       (check-type ,source-sym string)  ;condition you can fix
       (locally (declare (string ,source-sym))
         ,@source-transforms
         (block nil ,@match-forms)
         (locally ,@body)))))
850 | 
(defun coalesce-using-options (options)
  "Merge repeated :USING and :USING-VECTOR directives found in OPTIONS.

OPTIONS is an option list as built by `parse-options`, i.e. after `:using x` has
been rewritten to `(:using x)`: bare keywords for value-less options and lists
such as (:USING a b) for options that carry symbols.  All :USING directives are
folded into one directive, and likewise all :USING-VECTOR directives, with
duplicate symbols removed (REMOVE-DUPLICATES keeps the last occurrence of each).

Returns a list holding the merged :USING directive (if any), then the merged
:USING-VECTOR directive (if any), then every other option in its original
relative order.  E.g.

(:upper (:using a b c) (:using a d e)) => ((:using b c a d e) :upper)"
  (labels ((tagged-p (item tag)
             ;; True when ITEM is a list directive headed by TAG.
             (and (listp item) (eq (car item) tag)))
           (collapse (tag)
             ;; Zero or one directive for TAG, wrapped in a list so the
             ;; results can be spliced together below.
             (let ((directives (remove-if-not (lambda (item) (tagged-p item tag))
                                              options)))
               (cond ((null directives) nil)
                     ;; A lone directive is only de-duplicated.
                     ((null (rest directives))
                      (list (remove-duplicates (first directives))))
                     ;; ((:using x y) (:using y z)) => ((:using x y z))
                     (t
                      (list (cons tag
                                  (remove-duplicates
                                   (mapcan (lambda (directive)
                                             (copy-list (cdr directive)))
                                           directives)))))))))
    (let ((leftovers (remove-if (lambda (item)
                                  (or (tagged-p item :using)
                                      (tagged-p item :using-vector)))
                                options)))
      ;; Both COLLAPSE results are freshly consed, so NCONC is safe here.
      (nconc (collapse :using)
             (collapse :using-vector)
             (copy-list leftovers)))))
872 | 
(defun parse-options (args)
  "Collect the leading options from ARGS, the raw argument list given to PARSE.

Elements are consumed from the front of ARGS until one is reached that is neither
a keyword nor a list whose CAR is a keyword.  Value-less options (:UPPER, :LOWER,
:CASELESS) are represented by their keyword.  :USING and :USING-VECTOR, which may
be written `:using (x y)` or `(:using x y)`, are represented by a list whose CAR
is the keyword and whose CDR is the symbols given.  Repeated :USING or
:USING-VECTOR options are merged by `coalesce-using-options`, which also places
them ahead of the value-less options.

Signals an error for an unrecognized option keyword, for a :USING[-VECTOR] value
that is not a list or whose elements do not all satisfy PLAIN-SYMBOL-P, when both
:UPPER and :LOWER are present, when the null binding symbol _ is listed in a
:USING[-VECTOR] option, and when a symbol is listed under both :USING and
:USING-VECTOR.

Return two values, the options list, and the cons in ARGS following the options
(whose CAR would be the SOURCE argument).

E.g.  `(parse :UPPER :USING (a b) \"abc\" (d e))`
      => ((:USING a b) :UPPER)
         (\"abc\" (d e))
"
  (labels ((option-keyword (item)
             ;; The keyword naming ITEM: ITEM itself, or the CAR of a list ITEM.
             ;; NIL when ITEM is not an option at all.
             (cond ((keywordp item) item)
                   ((and (listp item) (keywordp (car item)))
                    (car item))))
           (check-using (directive)
             ;; DIRECTIVE is (:using[-vector] . names); hand it back once vetted.
             (if (every #'plain-symbol-p (rest directive))
                 directive
                 (error "~s expects a list of non-keyword symbols, got ~s instead."
                        (car directive)
                        (cdr directive))))
           (names-for (kw options)
             ;; Symbols listed under the first KW directive in OPTIONS, if any.
             (cdr (find kw options :key #'option-keyword)))
           (check-combination (options)
             ;; Cross-option checks, run on the coalesced list; returns OPTIONS.
             (when (and (member :upper options)
                        (member :lower options))
               (error ":UPPER and :LOWER options are mutually exclusive."))
             (let ((using (names-for :using options))
                   (using-vector (names-for :using-vector options)))
               ;; STRING= compares symbol names, so _ is caught whatever its package.
               (when (or (find '_ using :test #'string=)
                         (find '_ using-vector :test #'string=))
                 (error "The null binding symbol '_ is not supported in :USING or :USING-VECTOR declarations."))
               (let ((overlap (intersection using using-vector)))
                 (when overlap
                   (error "These symbol~P were listed in both :USING and :USING-VECTOR declarations, and such declarations are mutually exclusive: ~s"
                          (length overlap) overlap))))
             options))
    (let ((collected '())
          (remaining args))
      (loop
        (let* ((item (car remaining))
               (kw (option-keyword item)))
          ;; The first non-option element ends the option prefix.
          (unless kw
            (return (values (check-combination (coalesce-using-options collected))
                            remaining)))
          (cond ((member kw '(:upper :lower :caseless))
                 (push kw collected)
                 (setf remaining (cdr remaining)))
                ((not (member kw '(:using :using-vector)))
                 (error "~s is not a valid PARSE option." item))
                ;; Input was (:using[-vector] x ...): one element consumed.
                ((listp item)
                 (push (check-using item) collected)
                 (setf remaining (cdr remaining)))
                ;; Input was hopefully :using[-vector] (x ...): two elements consumed.
                (t
                 (let ((value (cadr remaining)))
                   (if (listp value)
                       (push (check-using (cons item value)) collected)
                       (error "Expected a list as the value of the ~s option, received ~s"
                              item value))
                   (setf remaining (cddr remaining))))))))))
934 | 
(defmacro parse (&rest args)
  "PARSE [option ...] source template [body]
Match text literals (or more complex patterns) in TEMPLATE against the text in
SOURCE, and bind the text which exists _between_ the matched literals to variables
named in the template.  BODY is then executed with the bound variables.  If no BODY
is specified, the effect is as if all bound variables are returned in a list in order
of their binding.

If there are more variables than text to match, unused variables are bound to 
*UNMATCHED-BINDING-VALUE* which defaults to REXXPARSE:+EMPTY-STRING+.

If there is more text than variables to match it, the last variable is bound to all
remaining text and there is no trimming of spaces. When a match for a pattern cannot
be found, it matches the end of the string.

Variables without patterns between them have an implicit word-split behavior, which
isn't quite the same as scanning for a single space pattern.

Patterns without variables between them discard the matched text, but each pattern
is evaluated for its impact on advancing the position within the source string.

Example:

  (parse \"2024-JAN-12: George Washington slept here.\"
    (year \"-\" month \"-\" day \":\" first last did-what)
    (format t \"Yoda says: ~a ~a ~a did, ~a of ~a ~a, it was.~%\"
      did-what first last day month year))

   Yoda says: slept here George Washington did, 12 of JAN 2024, it was.
   => NIL

Supported options include:
:UPPER -- source string will be uppercased before parsing.
:LOWER -- source string will be lowercased before parsing.
:CASELESS -- text comparisons ignore case (STRING-EQUAL rather than STRING=).
:USING (vars) -- vars in list will not have binding allocation in macroexpansion
:USING-VECTOR (vars) -- like :USING, but vars must name fill-pointered vectors and will be
                        assigned using VECTOR-PUSH.

An error is signalled at macroexpansion time when no SOURCE form is supplied, when
TEMPLATE is not a list, or when the options are invalid (see `parse-options`).

See README.md for examples of more complex matching behaviors as well as the ability
to transform matched text before binding."
  (multiple-value-bind (options tail)
      (parse-options args)
    ;; Test for the *presence* of a source form, not its truth value: a literal
    ;; NIL source is supplied-but-wrong, and is left for the expansion's
    ;; CHECK-TYPE to report like any other non-string source.
    (when (endp tail)
      (error "Missing SOURCE argument."))
    (destructuring-bind (source &optional template &rest body) tail
      (unless (listp template)
        (error "TEMPLATE must be a list, not a ~s." (type-of template)))
      (parse-aux options source template body))))
987 | 
988 | #|
989 | - Might be nice to accept `:using[-vector] x` as well when there's only one var (no list required)
990 | 
991 | - Allow characters as pattern literals too?
992 | |#
993 | 


--------------------------------------------------------------------------------