├── UNLICENSE
├── README.md
└── rdp.el
/UNLICENSE:
--------------------------------------------------------------------------------
1 | This is free and unencumbered software released into the public domain.
2 |
3 | Anyone is free to copy, modify, publish, use, compile, sell, or
4 | distribute this software, either in source code form or as a compiled
5 | binary, for any purpose, commercial or non-commercial, and by any
6 | means.
7 |
8 | In jurisdictions that recognize copyright laws, the author or authors
9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 |
24 | For more information, please refer to
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Recursive Descent Parser for Emacs Lisp
2 |
3 | This package provides a recursive descent parser for Elisp
4 | programs. Grammars are specified using a pure Elisp s-expression. A
5 | set of matching functions can be provided to the parser to manipulate
6 | the parse tree at read time. These functions can even be used to
7 | [implement a compiler](https://github.com/skeeto/psl-mode/blob/master/psl-compile.el).
8 |
9 | The parser will indicate where the point is in the parse tree, for use
10 | in automatic indentation for major modes. This is no silver bullet:
11 | automatic indentation is
12 | [still a black art](http://www.gnu.org/software/emacs/manual/html_node/elisp/Auto_002dIndentation.html)].
13 | SMIE is probably better suited if that's your goal -- to implement
14 | indentation but not fully parse a language as rdp would.
15 |
16 | Full documentation can be found in the [commentary section of
17 | `rdp.el`](https://github.com/skeeto/rdp/blob/master/rdp.el).
18 |
19 | ## Example
20 |
21 | This is an interpreter for simple arithmetic expressions, including
22 | operator precedence and grouping.
23 |
24 | ```el
25 | (defvar arith-tokens
26 | '((sum prod [([+ -] sum) no-sum])
27 | (prod value [([* /] prod) no-prod])
28 | (num . "-?[0-9]+\\(\\.[0-9]*\\)?")
29 | (+ . "\\+")
30 | (- . "-")
31 | (* . "\\*")
32 | (/ . "/")
33 | (pexpr "(" [sum prod num pexpr] ")")
34 | (value . [pexpr num])
35 | (no-prod . "")
36 | (no-sum . "")))
37 |
38 | (defun arith-op (expr)
39 | (destructuring-bind (a (op b)) expr
40 | (funcall op a b)))
41 |
42 | (defvar arith-funcs
43 | `((sum . ,#'arith-op)
44 | (prod . ,#'arith-op)
45 | (num . ,#'string-to-number)
46 | (+ . ,#'intern)
47 | (- . ,#'intern)
48 | (* . ,#'intern)
49 | (/ . ,#'intern)
50 | (pexpr . ,#'cadr)
51 | (value . ,#'identity)
52 | (no-prod . ,(lambda (e) '(* 1)))
53 | (no-sum . ,(lambda (e) '(+ 0)))))
54 |
55 | (defun arith (string)
56 | (rdp-parse-string string arith-tokens arith-funcs))
57 | ```
58 |
59 | Usage:
60 |
61 | ```el
62 | (arith "(1 + 2 + 3 * 4)*-3/4.0")
63 | ```
64 |
65 | This evaluates to -11.25, as would be expected.
66 |
67 | ## See Also
68 |
69 | * [SMIE](http://www.gnu.org/software/emacs/manual/html_node/elisp/SMIE.html)
70 | * [peg.el](http://emacswiki.org/emacs/peg.el)
71 | * [Semantic Bovinator](http://emacswiki.org/emacs/SemanticBovinator)
72 |
--------------------------------------------------------------------------------
/rdp.el:
--------------------------------------------------------------------------------
1 | ;;; rdp.el --- Recursive Descent Parser library
2 |
3 | ;; This is free and unencumbered software released into the public domain.
4 |
5 | ;; Author: Christopher Wellons
6 | ;; URL: https://github.com/skeeto/rdp
7 | ;; Version: 1.0
8 |
9 | ;;; Commentary:
10 |
11 | ;; This library provides a recursive descent parser for parsing
12 | ;; languages in buffers. Some support is provided for implementing
13 | ;; automatic indentation based on the parser.
14 |
15 | ;; In general, the only two functions you need to worry about are:
16 |
17 | ;; * `rdp-parse' -- parse the current buffer
18 | ;; * `rdp-parse-string' -- parse a string (in a temp buffer)
19 |
20 | ;; A grammar is provided to the parser as an alist of patterns.
21 | ;; Patterns are named by symbols, which can reference other
22 | ;; patterns. The lisp object type indicates the type of the pattern:
23 |
24 | ;; * string -- an Emacs regular expression
25 | ;; * list -- "and" relationship, each pattern must match in order
26 | ;; * vector -- "or" relationship, one of the patterns must match
27 | ;; * symbol -- recursive reference to another pattern in the alist
28 |
29 | ;; The global variable `rdp-best' indicates the furthest point reached
30 | ;; in the buffer by the parser. If parsing failed (i.e. `rdp-best' is
31 | ;; not at the end of the buffer), this is likely to be the position of
32 | ;; the syntax error.
33 |
34 | ;; For example, this grammar parses simple arithmetic with operator
35 | ;; precedence and grouping.
36 |
37 | ;; (defvar arith-tokens
38 | ;; '((sum prod [([+ -] sum) no-sum])
39 | ;; (prod value [([* /] prod) no-prod])
40 | ;; (num . "-?[0-9]+\\(\\.[0-9]*\\)?")
41 | ;; (+ . "\\+")
42 | ;; (- . "-")
43 | ;; (* . "\\*")
44 | ;; (/ . "/")
45 | ;; (pexpr "(" [sum prod num pexpr] ")")
46 | ;; (value . [pexpr num])
47 | ;; (no-prod . "")
48 | ;; (no-sum . "")))
49 |
50 | ;; Given just this grammar to `rdp-parse' it will return an
51 | ;; s-expression of the input where each token match is `cons'ed with
52 | ;; the token name. To make this more useful, the s-expression can be
53 | ;; manipulated as it is read using an alist of token names and
54 | ;; functions. This could be used to simplify the s-expression, build
55 | ;; an interpreter that interprets during parsing, or even build a
56 | ;; compiler.
57 |
58 | ;; For example, this function alist evaluates the arithmetic as it is
59 | ;; parsed:
60 |
61 | ;; (defun arith-op (expr)
62 | ;; (destructuring-bind (a (op b)) expr
63 | ;; (funcall op a b)))
64 | ;;
65 | ;; (defvar arith-funcs
66 | ;; `((sum . ,#'arith-op)
67 | ;; (prod . ,#'arith-op)
68 | ;; (num . ,#'string-to-number)
69 | ;; (+ . ,#'intern)
70 | ;; (- . ,#'intern)
71 | ;; (* . ,#'intern)
72 | ;; (/ . ,#'intern)
73 | ;; (pexpr . ,#'cadr)
74 | ;; (value . ,#'identity)
75 | ;; (no-prod . ,(lambda (e) '(* 1)))
76 | ;; (no-sum . ,(lambda (e) '(+ 0)))))
77 |
78 | ;; Putting this all together:
79 |
80 | ;; (defun arith (string)
81 | ;; (rdp-parse-string string arith-tokens arith-funcs))
82 | ;;
83 | ;; (arith "(1 + 2 + 3 + 4 + 5) * -3/4.0")
84 |
85 | ;; Tips:
86 |
87 | ;; Recursive descent parsers *cannot* be left-recursive. It is
88 | ;; important that a pattern does not recurse without first consuming
89 | ;; some input. Any grammar can be made non-left-recursive but not
90 | ;; necessarily simplistically.
91 |
92 | ;; The parser requires a lot of stack! Consider increasing
93 | ;; `max-lisp-eval-depth' by some factor before calling
94 | ;; `rdp-parse'. After increasing it, running out of stack space is
95 | ;; likely an indication of left-recursion somewhere in the grammar.
96 |
97 | ;; Token functions should not have side effects. Due to the
98 | ;; backtracking of the parser, just because the function was called
99 | ;; doesn't mean there was actually a successful match. Also, these
100 | ;; functions are free to return nil or the empty list as such a return
101 | ;; is *not* an indication of failure.
102 |
103 | ;; By default, whitespace is automatically consumed between matches
104 | ;; using the function `rdp-skip-whitespace'. If some kinds of
105 | ;; whitespace are important or if there are other characters that need
106 | ;; to be skipped, temporarily override this function with your own
107 | ;; definition using `flet' when calling `rdp-parse'.
108 |
109 | ;; In general don't try to parse comments in the grammar. Strip them
110 | ;; from the buffer before calling the parser.
111 |
112 | ;; Indentation facilities:
113 |
114 | ;; To find out where in the parse tree a point lies, set `rdp-start'
115 | ;; to the desired point before starting parsing. After parsing, either
116 | ;; successfully or not,`rdp-point-stack' will contain a stack of
117 | ;; tokens indicating roughly where in the parse tree the point
118 | ;; lies.
119 |
120 | ;; To use this for rudimentary indentation, set `rdp-start' to the
121 | ;; `beginning-of-line' of the current point and count how many
122 | ;; indent-worthy tokens are in the stack once parsing is complete.
123 |
124 | ;; See also:
125 |
126 | ;; * http://emacswiki.org/emacs/peg.el
127 | ;; * http://www.gnu.org/software/emacs/manual/html_node/elisp/SMIE.html
128 | ;; * http://cedet.sourceforge.net/semantic.shtml
129 | ;; * http://en.wikipedia.org/wiki/Recursive_descent_parser
130 | ;; * http://en.wikipedia.org/wiki/Parsing_expression_grammar
131 |
132 | ;;; Code:
133 |
134 | (eval-when-compile (require 'cl))
135 |
136 | (defvar rdp-best 0
137 | "The furthest most point that parsing reached. This information
138 | can be used to determine where parsing failed.")
139 |
140 | (defvar rdp-start 0
141 | "Position of point in original source buffer. The purpose is
142 | for auto-indentation.")
143 |
144 | (defvar rdp-point-stack ()
145 | "The token stack that contains the point. This is used for
146 | auto-indentation.")
147 |
148 | (defvar rdp-token-stack ()
149 | "Stack of tokens at this point.")
150 |
151 | (defun rdp-box (value)
152 | "Box a parse return value, allowing nil to be a valid return."
153 | (vector value))
154 |
155 | (defun rdp-unbox (box)
156 | "Unbox a parse return value."
157 | (aref box 0))
158 |
159 | (defun rdp-get-token-func (token funcs)
160 | "Get the manipulation function for the given token."
161 | (cdr (assq token funcs)))
162 |
163 | (defun rdp-parse (tokens &optional funcs pattern)
164 | "Return the next item in the current buffer."
165 | (setq rdp-best 0)
166 | (setq rdp-token-stack ())
167 | (if pattern
168 | (rdp-unbox (rdp-match pattern tokens funcs))
169 | (dolist (token tokens)
170 | (let ((result (rdp-match (car token) tokens funcs)))
171 | (if result (return (rdp-unbox result)))))))
172 |
173 | (defun rdp-parse-string (string tokens &optional funcs pattern)
174 | "Like `rdp-parse' but operates on a string."
175 | (with-temp-buffer
176 | (insert string)
177 | (goto-char (point-min))
178 | (rdp-parse tokens funcs pattern)))
179 |
180 | (defun rdp-match-list (list tokens funcs)
181 | "Match all patterns in a list."
182 | (let ((result (rdp-match (car list) tokens funcs)))
183 | (when result
184 | (if (null (cdr list))
185 | (rdp-box (list (rdp-unbox result)))
186 | (let ((rest (rdp-match-list (cdr list) tokens funcs)))
187 | (when rest
188 | (rdp-box (cons (rdp-unbox result) (rdp-unbox rest)))))))))
189 |
190 | (defun rdp-match-regex (regex tokens funcs)
191 | "Match a regex."
192 | (when (looking-at regex)
193 | (prog1 (rdp-box (buffer-substring-no-properties (point) (match-end 0)))
194 | (goto-char (match-end 0)))))
195 |
196 | (defun rdp-match-token (token tokens funcs)
197 | "Match a token by name (symbol)."
198 | (push token rdp-token-stack)
199 | (let* ((pattern (cdr (assq token tokens)))
200 | (match (rdp-match pattern tokens funcs)))
201 | (pop rdp-token-stack)
202 | (when match
203 | (let ((macro (rdp-get-token-func token funcs)))
204 | (rdp-box (if macro
205 | (funcall macro (rdp-unbox match))
206 | (cons token (rdp-unbox match))))))))
207 |
208 | (defun rdp-match-or (vec tokens funcs)
209 | "Match at least one pattern in the vector."
210 | (dolist (option (mapcar 'identity vec))
211 | (let ((match (rdp-match option tokens funcs)))
212 | (when match (return match)))))
213 |
214 | (defun rdp-skip-whitespace ()
215 | "Skip over all whitespace."
216 | (search-forward-regexp "[[:space:]]*"))
217 |
218 | (defun rdp-match (pattern tokens &optional funcs)
219 | "Match the given pattern object of any type (toplevel)."
220 | (rdp-skip-whitespace)
221 | (let ((start (point))
222 | (result (etypecase pattern
223 | (string (rdp-match-regex pattern tokens funcs))
224 | (list (rdp-match-list pattern tokens funcs))
225 | (symbol (rdp-match-token pattern tokens funcs))
226 | (vector (rdp-match-or pattern tokens funcs)))))
227 | (when (and (<= (length rdp-point-stack) (length rdp-token-stack))
228 | (> rdp-start start)
229 | (> (point) rdp-start))
230 | (setq rdp-point-stack (reverse rdp-token-stack)))
231 | (unless result
232 | (setq rdp-best (max rdp-best (point)))
233 | (goto-char start))
234 | result))
235 |
236 | (provide 'rdp)
237 |
238 | ;;; rdp.el ends here
239 |
--------------------------------------------------------------------------------