├── UNLICENSE ├── README.md └── rdp.el /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to http://unlicense.org/ 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Recursive Descent Parser for Emacs Lisp 2 | 3 | This package provides a recursive descent parser for Elisp 4 | programs. Grammars are specified using a pure Elisp s-expression. A 5 | set of matching functions can be provided to the parser to manipulate 6 | the parse tree at read time. 
These functions can even be used to 7 | [implement a compiler](https://github.com/skeeto/psl-mode/blob/master/psl-compile.el). 8 | 9 | The parser will indicate where the point is in the parse tree, for use 10 | in automatic indentation for major modes. This is no silver bullet: 11 | automatic indentation is 12 | [still a black art](http://www.gnu.org/software/emacs/manual/html_node/elisp/Auto_002dIndentation.html). 13 | SMIE is probably better suited if that's your goal -- to implement 14 | indentation but not fully parse a language as rdp would. 15 | 16 | Full documentation can be found in the [commentary section of 17 | `rdp.el`](https://github.com/skeeto/rdp/blob/master/rdp.el). 18 | 19 | ## Example 20 | 21 | This is an interpreter for simple arithmetic expressions, including 22 | operator precedence and grouping. 23 | 24 | ```el 25 | (defvar arith-tokens 26 | '((sum prod [([+ -] sum) no-sum]) 27 | (prod value [([* /] prod) no-prod]) 28 | (num . "-?[0-9]+\\(\\.[0-9]*\\)?") 29 | (+ . "\\+") 30 | (- . "-") 31 | (* . "\\*") 32 | (/ . "/") 33 | (pexpr "(" [sum prod num pexpr] ")") 34 | (value . [pexpr num]) 35 | (no-prod . "") 36 | (no-sum . ""))) 37 | 38 | (defun arith-op (expr) 39 | (destructuring-bind (a (op b)) expr 40 | (funcall op a b))) 41 | 42 | (defvar arith-funcs 43 | `((sum . ,#'arith-op) 44 | (prod . ,#'arith-op) 45 | (num . ,#'string-to-number) 46 | (+ . ,#'intern) 47 | (- . ,#'intern) 48 | (* . ,#'intern) 49 | (/ . ,#'intern) 50 | (pexpr . ,#'cadr) 51 | (value . ,#'identity) 52 | (no-prod . ,(lambda (e) '(* 1))) 53 | (no-sum . ,(lambda (e) '(+ 0))))) 54 | 55 | (defun arith (string) 56 | (rdp-parse-string string arith-tokens arith-funcs)) 57 | ``` 58 | 59 | Usage: 60 | 61 | ```el 62 | (arith "(1 + 2 + 3 * 4)*-3/4.0") 63 | ``` 64 | 65 | This evaluates to -11.25, as would be expected. 
66 | 67 | ## See Also 68 | 69 | * [SMIE](http://www.gnu.org/software/emacs/manual/html_node/elisp/SMIE.html) 70 | * [peg.el](http://emacswiki.org/emacs/peg.el) 71 | * [Semantic Bovinator](http://emacswiki.org/emacs/SemanticBovinator) 72 | -------------------------------------------------------------------------------- /rdp.el: -------------------------------------------------------------------------------- 1 | ;;; rdp.el --- Recursive Descent Parser library 2 | 3 | ;; This is free and unencumbered software released into the public domain. 4 | 5 | ;; Author: Christopher Wellons 6 | ;; URL: https://github.com/skeeto/rdp 7 | ;; Version: 1.0 8 | 9 | ;;; Commentary: 10 | 11 | ;; This library provides a recursive descent parser for parsing 12 | ;; languages in buffers. Some support is provided for implementing 13 | ;; automatic indentation based on the parser. 14 | 15 | ;; In general, the only two functions you need to worry about are: 16 | 17 | ;; * `rdp-parse' -- parse the current buffer 18 | ;; * `rdp-parse-string' -- parse a string (in a temp buffer) 19 | 20 | ;; A grammar is provided to the parser as an alist of patterns. 21 | ;; Patterns are named by symbols, which can reference other 22 | ;; patterns. The lisp object type indicates the type of the pattern: 23 | 24 | ;; * string -- an Emacs regular expression 25 | ;; * list -- "and" relationship, each pattern must match in order 26 | ;; * vector -- "or" relationship, one of the patterns must match 27 | ;; * symbol -- recursive reference to another pattern in the alist 28 | 29 | ;; The global variable `rdp-best' indicates the furthest point reached 30 | ;; in the buffer by the parser. If parsing failed (i.e. `rdp-best' is 31 | ;; not at the end of the buffer), this is likely to be the position of 32 | ;; the syntax error. 33 | 34 | ;; For example, this grammar parses simple arithmetic with operator 35 | ;; precedence and grouping. 
36 | 37 | ;; (defvar arith-tokens 38 | ;; '((sum prod [([+ -] sum) no-sum]) 39 | ;; (prod value [([* /] prod) no-prod]) 40 | ;; (num . "-?[0-9]+\\(\\.[0-9]*\\)?") 41 | ;; (+ . "\\+") 42 | ;; (- . "-") 43 | ;; (* . "\\*") 44 | ;; (/ . "/") 45 | ;; (pexpr "(" [sum prod num pexpr] ")") 46 | ;; (value . [pexpr num]) 47 | ;; (no-prod . "") 48 | ;; (no-sum . ""))) 49 | 50 | ;; Given just this grammar to `rdp-parse' it will return an 51 | ;; s-expression of the input where each token match is `cons'ed with 52 | ;; the token name. To make this more useful, the s-expression can be 53 | ;; manipulated as it is read using an alist of token names and 54 | ;; functions. This could be used to simplify the s-expression, build 55 | ;; an interpreter that interprets during parsing, or even build a 56 | ;; compiler. 57 | 58 | ;; For example, this function alist evaluates the arithmetic as it is 59 | ;; parsed: 60 | 61 | ;; (defun arith-op (expr) 62 | ;; (destructuring-bind (a (op b)) expr 63 | ;; (funcall op a b))) 64 | ;; 65 | ;; (defvar arith-funcs 66 | ;; `((sum . ,#'arith-op) 67 | ;; (prod . ,#'arith-op) 68 | ;; (num . ,#'string-to-number) 69 | ;; (+ . ,#'intern) 70 | ;; (- . ,#'intern) 71 | ;; (* . ,#'intern) 72 | ;; (/ . ,#'intern) 73 | ;; (pexpr . ,#'cadr) 74 | ;; (value . ,#'identity) 75 | ;; (no-prod . ,(lambda (e) '(* 1))) 76 | ;; (no-sum . ,(lambda (e) '(+ 0))))) 77 | 78 | ;; Putting this all together: 79 | 80 | ;; (defun arith (string) 81 | ;; (rdp-parse-string string arith-tokens arith-funcs)) 82 | ;; 83 | ;; (arith "(1 + 2 + 3 + 4 + 5) * -3/4.0") 84 | 85 | ;; Tips: 86 | 87 | ;; Recursive descent parsers *cannot* be left-recursive. It is 88 | ;; important that a pattern does not recurse without first consuming 89 | ;; some input. Any grammar can be made non-left-recursive but not 90 | ;; necessarily simplistically. 91 | 92 | ;; The parser requires a lot of stack! Consider increasing 93 | ;; `max-lisp-eval-depth' by some factor before calling 94 | ;; `rdp-parse'. 
After increasing it, running out of stack space is 95 | ;; likely an indication of left-recursion somewhere in the grammar. 96 | 97 | ;; Token functions should not have side effects. Due to the 98 | ;; backtracking of the parser, just because the function was called 99 | ;; doesn't mean there was actually a successful match. Also, these 100 | ;; functions are free to return nil or the empty list as such a return 101 | ;; is *not* an indication of failure. 102 | 103 | ;; By default, whitespace is automatically consumed between matches 104 | ;; using the function `rdp-skip-whitespace'. If some kinds of 105 | ;; whitespace are important or if there are other characters that need 106 | ;; to be skipped, temporarily override this function with your own 107 | ;; definition using `flet' when calling `rdp-parse'. 108 | 109 | ;; In general don't try to parse comments in the grammar. Strip them 110 | ;; from the buffer before calling the parser. 111 | 112 | ;; Indentation facilities: 113 | 114 | ;; To find out where in the parse tree a point lies, set `rdp-start' 115 | ;; to the desired point before starting parsing. After parsing, either 116 | ;; successfully or not, `rdp-point-stack' will contain a stack of 117 | ;; tokens indicating roughly where in the parse tree the point 118 | ;; lies. 119 | 120 | ;; To use this for rudimentary indentation, set `rdp-start' to the 121 | ;; `beginning-of-line' of the current point and count how many 122 | ;; indent-worthy tokens are in the stack once parsing is complete. 
;; See also:

;; * http://emacswiki.org/emacs/peg.el
;; * http://www.gnu.org/software/emacs/manual/html_node/elisp/SMIE.html
;; * http://cedet.sourceforge.net/semantic.shtml
;; * http://en.wikipedia.org/wiki/Recursive_descent_parser
;; * http://en.wikipedia.org/wiki/Parsing_expression_grammar

;;; Code:

;; cl-lib is needed at run time as well as compile time: the parser
;; relies on `cl-dolist'/`cl-return', `cl-etypecase' and `cl-some',
;; which must also be available when this file is loaded uncompiled.
(require 'cl-lib)

(defvar rdp-best 0
  "The furthest point that parsing reached.
This information can be used to determine where parsing failed.")

(defvar rdp-start 0
  "Position of point in original source buffer.
The purpose is for auto-indentation.")

(defvar rdp-point-stack ()
  "The token stack that contains the point.
This is used for auto-indentation.")

(defvar rdp-token-stack ()
  "Stack of tokens at this point.")

(defun rdp-box (value)
  "Box the parse return VALUE, allowing nil to be a valid return.
The box is a one-element vector, so a successful match that yields
nil is still distinguishable from a failed match (plain nil)."
  (vector value))

(defun rdp-unbox (box)
  "Return the parse value stored in BOX, as created by `rdp-box'."
  (aref box 0))

(defun rdp-get-token-func (token funcs)
  "Get the manipulation function for TOKEN from the alist FUNCS.
Return nil when FUNCS has no entry for TOKEN."
  (cdr (assq token funcs)))

(defun rdp-parse (tokens &optional funcs pattern)
  "Return the next item in the current buffer.
TOKENS is the grammar alist and FUNCS is the optional alist mapping
token names to functions applied to each match.  When PATTERN is
non-nil it is matched at point; otherwise each token in TOKENS is
tried in order and the first successful match is used.

Return the (unboxed) parse value, or nil when nothing matched.
`rdp-best' and `rdp-token-stack' are reset before parsing;
`rdp-point-stack' is left as it was."
  (setq rdp-best 0)
  (setq rdp-token-stack ())
  (if pattern
      ;; A failed match is nil, not a box, so it must not be unboxed.
      (let ((result (rdp-match pattern tokens funcs)))
        (when result
          (rdp-unbox result)))
    (cl-dolist (token tokens)
      (let ((result (rdp-match (car token) tokens funcs)))
        (when result
          (cl-return (rdp-unbox result)))))))

(defun rdp-parse-string (string tokens &optional funcs pattern)
  "Like `rdp-parse' but operates on STRING.
STRING is inserted into a temporary buffer which is parsed from its
beginning using TOKENS, FUNCS and PATTERN."
  (with-temp-buffer
    (insert string)
    (goto-char (point-min))
    (rdp-parse tokens funcs pattern)))

(defun rdp-match-list (list tokens funcs)
  "Match all patterns in LIST, in order, using TOKENS and FUNCS.
Return a box holding the list of the individual match values, or nil
as soon as one pattern fails.  An empty LIST matches trivially and
yields a boxed empty list."
  ;; Iterate instead of recursing on the tail so long sequences do not
  ;; add to the (already considerable) Lisp stack depth of the parser.
  (let ((values ())
        (failed nil))
    (while (and list (not failed))
      (let ((result (rdp-match (car list) tokens funcs)))
        (if result
            (push (rdp-unbox result) values)
          (setq failed t)))
      (setq list (cdr list)))
    (unless failed
      (rdp-box (nreverse values)))))

(defun rdp-match-regex (regex tokens funcs)
  "Match REGEX at point.
On success move point to the end of the match and return a box
holding the matched text without text properties; otherwise return
nil and leave point alone.  TOKENS and FUNCS are unused and exist
only so every matcher takes the same arguments."
  (when (looking-at regex)
    (let ((end (match-end 0)))
      (prog1 (rdp-box (buffer-substring-no-properties (point) end))
        (goto-char end)))))

(defun rdp-match-token (token tokens funcs)
  "Match TOKEN by name (symbol) using its pattern in TOKENS.
On success return a box holding either the result of calling TOKEN's
function from FUNCS on the match value or, when TOKEN has no such
function, the match value `cons'ed with TOKEN.  Return nil when the
pattern does not match.  Signal an error if TOKEN has no entry in
TOKENS."
  (let ((entry (assq token tokens)))
    ;; Without this check an unknown token would yield a nil pattern,
    ;; which is indistinguishable from an empty list pattern.
    (unless entry
      (error "rdp: undefined token `%s'" token))
    (let ((match
           ;; Dynamically extend the token stack only while matching
           ;; this token's pattern; it is restored even on a non-local
           ;; exit.
           (let ((rdp-token-stack (cons token rdp-token-stack)))
             (rdp-match (cdr entry) tokens funcs))))
      (when match
        (let ((macro (rdp-get-token-func token funcs)))
          (rdp-box (if macro
                       (funcall macro (rdp-unbox match))
                     (cons token (rdp-unbox match)))))))))

(defun rdp-match-or (vec tokens funcs)
  "Match at least one pattern in the vector VEC.
The patterns are tried in order using TOKENS and FUNCS; the boxed
result of the first one that matches is returned, or nil when none
match."
  (cl-some (lambda (option)
             (rdp-match option tokens funcs))
           vec))

(defun rdp-skip-whitespace ()
  "Skip over all whitespace.
Moves point past any run of characters matching `[[:space:]]'.  The
regexp can match the empty string, so the search always succeeds."
  (search-forward-regexp "[[:space:]]*"))

(defun rdp-match (pattern tokens &optional funcs)
  "Match the given PATTERN object of any type (toplevel).
Whitespace is skipped first with `rdp-skip-whitespace', then PATTERN
is dispatched on its type: a string is a regexp, a list is an \"and\"
sequence, a symbol is a token reference into TOKENS and a vector is
an \"or\" of alternatives.  FUNCS is passed through to the matchers.

Return the boxed match value, or nil on failure.  On failure the
furthest point reached is recorded in `rdp-best' and point is
restored to where the match began (after the skipped whitespace)."
  (rdp-skip-whitespace)
  (let* ((start (point))
         ;; The `list' clause precedes `symbol' so that nil is treated
         ;; as an empty list rather than as a token name.
         (result (cl-etypecase pattern
                   (string (rdp-match-regex pattern tokens funcs))
                   (list   (rdp-match-list pattern tokens funcs))
                   (symbol (rdp-match-token pattern tokens funcs))
                   (vector (rdp-match-or pattern tokens funcs)))))
    ;; Record the token stack when this match attempt spans `rdp-start'
    ;; and the current stack is at least as deep as the recorded one.
    (when (and (<= (length rdp-point-stack) (length rdp-token-stack))
               (> rdp-start start)
               (> (point) rdp-start))
      (setq rdp-point-stack (reverse rdp-token-stack)))
    (unless result
      (setq rdp-best (max rdp-best (point)))
      (goto-char start))
    result))

(provide 'rdp)

;;; rdp.el ends here