├── kjlogobgsmall.png ├── ichbins-sexp ├── .gitignore ├── pegcompile.js ├── mkhtml.py ├── style.css ├── Makefile ├── test.html ├── handaxeweb.lua ├── output.js ├── handaxeweb.md └── peg.md /kjlogobgsmall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kragen/peg-bootstrap/HEAD/kjlogobgsmall.png -------------------------------------------------------------------------------- /ichbins-sexp: -------------------------------------------------------------------------------- 1 | (foo 2 | "a string" 3 | some more symbols 4 | 'and 'quoted 'symbols 5 | \s\o\m\e\ \c\h\a\r\s 6 | (a nested list!) 7 | (and (deeper '(with quoting)))) 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.md.html 2 | metacircular.peg 3 | *~ 4 | bootstrap.js 5 | handaxeweb2.lua 6 | handaxeweb3.lua 7 | handaxeweb4.lua 8 | build_handaxeweb.new 9 | crosscompiler.peg 10 | crosscompiler.js 11 | stage2.js 12 | stage3.js 13 | ichbins-parser.peg 14 | ichbins-parser.js 15 | ichbins-sexp.json 16 | -------------------------------------------------------------------------------- /pegcompile.js: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/node 2 | // This is a script to invoke our bootstrap compiler with node.js 3 | // hooking its input up to stdin and its output up to stdout. 
4 | var sys = require('sys'); 5 | var Script = process.binding('evals').Script; 6 | 7 | var compiler_script_file = process.argv[2]; 8 | if (!compiler_script_file) { 9 | sys.debug("Usage: "+process.argv[1]+" bootstrap.js < foo.peg > foo.js"); // XXX sys.debug is the wrong thing 10 | process.exit(1); 11 | } 12 | 13 | var compiler = require('./' + compiler_script_file); 14 | 15 | var stdin = process.openStdin(); 16 | var buf = []; 17 | stdin.on('data', function(data) { buf.push(data) }); 18 | stdin.on('end', function() { 19 | sys.print(compiler.parse_sentence(buf.join(''), 0).val); 20 | stdin.destroy(); 21 | }); 22 | -------------------------------------------------------------------------------- /mkhtml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """Turn Markdown documents into HTML documents. 3 | 4 | Depends on python-markdown and Beautiful Soup. 5 | 6 | Markdown normally generates HTML document content; this generates HTML 7 | documents instead. 8 | 9 | """ 10 | import markdown, BeautifulSoup, sys, os.path 11 | 12 | def render(text): 13 | "Given Markdown input as a string, produce an HTML document as a string." 14 | md = markdown.Markdown() 15 | try: 16 | body = md.convert(text.decode('utf-8')) 17 | except AttributeError: 18 | body = str(md) 19 | 20 | soup = BeautifulSoup.BeautifulSoup(body) 21 | 22 | headers = soup('h1') 23 | if len(headers) > 0: 24 | title = headers[0].renderContents() 25 | else: 26 | title = 'Lame document with no top-level header' 27 | 28 | return '''%s 29 | 30 | 31 | %s''' % (title.decode('utf-8'), body) 32 | 33 | def process(infile): 34 | "Given a filename of Markdown input, create an HTML file as output." 
35 | outfile = infile + '.html' 36 | 37 | if os.path.exists(outfile) and \ 38 | os.stat(outfile).st_mtime > os.stat(infile).st_mtime: 39 | print "`%s` is newer than `%s`, skipping " % (outfile, infile) 40 | return 41 | 42 | outfiletmp = outfile + '.tmp' 43 | fo = file(outfiletmp, 'w') 44 | fo.write(render(file(infile).read()).encode('utf-8')) 45 | fo.close() 46 | 47 | os.rename(outfiletmp, outfile) # atomic replace; won't work on Win32 48 | print "rendered `%s` to `%s` " % (infile, outfile) 49 | 50 | def main(args): 51 | filenames = args[1:] 52 | if filenames: 53 | for filename in filenames: process(filename) 54 | return 0 55 | else: 56 | print ("usage: `%s foo bar baz`; implicitly writes to `foo.html`, etc." 57 | % args[0]) 58 | return 1 59 | 60 | if __name__ == '__main__': 61 | sys.exit(main(sys.argv)) 62 | -------------------------------------------------------------------------------- /style.css: -------------------------------------------------------------------------------- 1 | /* pending changes: 2 | D color headings? 3 | D move logo to upper left-hand corner? 4 | - move logo back to background? otherwise darken? 
5 | */ 6 | 7 | .explanation { background: #f0f0f0; padding: 1ex; } 8 | 9 | /* a lot of the left and right margins are based on the golden ratio (to the 10 | line height 1em) */ 11 | 12 | body { 13 | background-attachment: fixed; 14 | background-image: URL(kjlogobgsmall.png); 15 | background-position: top left; 16 | background-repeat: no-repeat; 17 | margin-left: 7.48em; margin-right: 7.48em; 18 | text-align: justify; 19 | font-family: "URW Palladio L", Palatino, serif; 20 | line-height: 125%; 21 | } 22 | 23 | /* override the normal margin widths of these elements so they don't look funny 24 | with the paragraph indents */ 25 | 26 | ul, ol, menu, dir { -moz-padding-start: 3.24em; } 27 | blockquote { margin: 1em 3.24em; } 28 | 29 | pre { 30 | background-color: #f0f0f0; 31 | border: 1px solid #e3e3e3; 32 | padding: 0.5em; 33 | } 34 | 35 | p { 36 | /* hanging indent for paragraphs */ 37 | /* text-indent: -1.62em; margin-left: 1.62em; */ 38 | } 39 | 40 | p, ul, ol, menu, dir { 41 | margin-top: 0.5em; margin-bottom: 0.5em; 42 | } 43 | 44 | /* pretty headers following the design of 45 | http://www.rotten.com/library/conspiracy/al-qaeda-and-the-assassins/missing-link/ 46 | */ 47 | 48 | h1, h2, h3, h4, h5, h6 { 49 | /* I like Palatino better anyway: font-family: "Arial Narrow", helvetica, sans-serif; */ 50 | font-weight: normal; 51 | font-variant: small-caps; 52 | color: #a4a; 53 | border-bottom: 1px solid #e3e3e3; 54 | /* this contrasts badly with the background logo: background-color: #fafafa; */ 55 | text-align: right; 56 | /* stick out on the right, but we can't use em --- different headers have 57 | * different font sizes, so they wouldn't line up */ 58 | margin-right: -30px; 59 | margin-bottom: 0.25em; margin-top: 0.25em; 60 | } 61 | 62 | h1 { letter-spacing: 6px; padding-bottom: 6px; font-size: 22px; } 63 | h2 { letter-spacing: 6px; padding-bottom: 6px; font-size: 18px; } 64 | h3 { letter-spacing: 4px; padding-bottom: 4px; font-size: 16px; } 65 | h4 { letter-spacing: 
3px; padding-bottom: 3px; font-size: 14px; } 66 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: peg.md.html metacircular.peg bootstrap.js \ 2 | crosscompiler.peg output.js crosscompiler.js stage3.js ichbins-sexp.json 3 | clean: 4 | rm peg.md.html metacircular.peg bootstrap.js \ 5 | crosscompiler.peg crosscompiler.js stage2.js stage3.js \ 6 | ichbins-parser.peg ichbins-parser.js ichbins-sexp.json 7 | 8 | # FWIW note that mkhtml.py has its own Make-like mtime-comparison 9 | # logic internally. 10 | %.md.html: %.md mkhtml.py 11 | ./mkhtml.py $< 12 | 13 | metacircular.peg: peg.md handaxeweb.lua 14 | ./handaxeweb.lua 'the metacircular compiler-compiler' < $< > $@ 15 | 16 | bootstrap.js: peg.md handaxeweb.lua 17 | ./handaxeweb.lua 'the bunch-of-functions version' < $< > $@ 18 | 19 | output.js: metacircular.peg pegcompile.js bootstrap.js 20 | node ./pegcompile.js bootstrap.js < $< > $@ 21 | 22 | stage2.js: metacircular.peg pegcompile.js output.js 23 | node ./pegcompile.js output.js < $< > $@ 24 | 25 | # output.js is the grammar compiled into JS with the bootstrap. 26 | # stage2.js is the grammar compiled into JS with a compiled copy of itself; 27 | # most kinds of errors will tend to cause output.js to fail to run successfully, 28 | # so stage2.js won’t be generated. 29 | # However, some kinds of errors might manifest by generating an output parser 30 | # that either doesn’t work at all or works incorrectly; and it is of course 31 | # possible that output.js will differ from stage2.js in innocuous ways 32 | # because they are the outputs of different programs. 33 | # So we generate a stage3.js using stage2.js: the grammar compiled into JS 34 | # with a compiled version of itself that was itself compiled with itself. 35 | # This should be byte-identical to stage2.js, or there is a bug. 36 | # XXX this Makefile should go into peg.md!
37 | stage3.js: metacircular.peg pegcompile.js stage2.js 38 | node ./pegcompile.js stage2.js < $< > $@ 39 | diff -u stage2.js $@ 40 | 41 | handaxeweb.lua: handaxeweb.md 42 | ./build_handaxeweb 43 | 44 | crosscompiler.peg: peg.md handaxeweb.lua 45 | ./handaxeweb.lua 'the metacircular compiler-compiler' 2 < $< > $@ 46 | 47 | crosscompiler.js: crosscompiler.peg pegcompile.js bootstrap.js 48 | node ./pegcompile.js bootstrap.js < $< > $@ 49 | 50 | ichbins-parser.peg: peg.md 51 | ./handaxeweb.lua $@ < $< > $@ 52 | 53 | ichbins-parser.js: ichbins-parser.peg pegcompile.js stage3.js 54 | node ./pegcompile.js stage3.js < ichbins-parser.peg > $@ 55 | 56 | ichbins-sexp.json: ichbins-parser.js pegcompile.js ichbins-sexp 57 | node ./pegcompile.js ichbins-parser.js < ichbins-sexp > $@ 58 | -------------------------------------------------------------------------------- /test.html: -------------------------------------------------------------------------------- 1 | 2 | test page for metacircular PEG compiler-compiler 3 | 4 | 5 | 6 | 31 | 32 | 33 |

test page for metacircular PEG compiler-compiler

34 | 35 | 36 |

debug output

37 | 38 |

input grammar

39 | 107 | 108 |

output compiler, first stage

109 | go
110 | 111 | 112 |

output compiler, second stage

113 | go
114 | 115 | 116 |

output compiler, third stage

117 | go
118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /handaxeweb.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/lua 2 | function register_chunk(chunks, new_chunk) 3 | if new_chunk.name == nil then return end 4 | 5 | local contents = chunks[new_chunk.name] 6 | if not contents then 7 | contents = {} 8 | chunks[new_chunk.name] = contents 9 | end 10 | 11 | -- If there’s a duplicate, append text to it. 12 | for _, it in ipairs(chunks[new_chunk.name]) do 13 | if it.v == new_chunk.v then 14 | for _, line in ipairs(new_chunk.text) do 15 | table.insert(it.text, line) 16 | end 17 | return 18 | end 19 | end 20 | 21 | -- No duplicate. Add to table. 22 | table.insert(contents, new_chunk) 23 | end 24 | 25 | function is_indented(line) 26 | return string.match(line, "^ ") 27 | end 28 | 29 | assert( is_indented(" hi")) 30 | assert(not is_indented(" hi")) 31 | assert(not is_indented(" hi ")) 32 | 33 | function unindented(line) return string.sub(line, 5) end 34 | assert(unindented(" hi\n") == "hi\n") 35 | 36 | function get_chunk_label(line) 37 | return string.match(line, "^[^%w]*in (.*):[^%w]*$") 38 | end 39 | 40 | assert(get_chunk_label("-- in handaxeweb.lua:") == 41 | "handaxeweb.lua") 42 | assert(get_chunk_label("/* in handaxeweb.c: */") == 43 | "handaxeweb.c") 44 | assert(get_chunk_label("# in a minute: #\n") == 45 | "a minute") 46 | 47 | function parse_chunk_label(label) 48 | local name, version = 49 | string.match(label, "(.*) v(%d+)$") 50 | if name then return name, tonumber(version) 51 | else return label, 0 end 52 | end 53 | 54 | assert(parse_chunk_label("foo") == "foo") 55 | assert(({parse_chunk_label("foo")})[2] == 0) 56 | assert(parse_chunk_label("foo v32") == "foo") 57 | assert(({parse_chunk_label("foo v32")})[2] == 32) 58 | 59 | function parse_input() 60 | local chunks, current_chunk, in_chunk = {}, {text={}}, false 61 | local blank_lines = {} 62 | 63 | for 
line in io.lines() do 64 | if string.match(line, "^%s*$") then -- blank line 65 | if in_chunk then table.insert(blank_lines, "") end 66 | elseif not in_chunk and is_indented(line) then 67 | local label = get_chunk_label(line) 68 | 69 | if label then -- if that succeeded, change chunks 70 | register_chunk(chunks, current_chunk) 71 | local name, ver = parse_chunk_label(label) 72 | current_chunk = {name = name, v = ver, text = {}} 73 | else 74 | -- incorporate any blank lines seen in between indented lines 75 | for _, blank_line in ipairs(blank_lines) do 76 | table.insert(current_chunk.text, blank_line) 77 | end 78 | blank_lines = {} 79 | 80 | table.insert(current_chunk.text, unindented(line)) 81 | end 82 | in_chunk = true 83 | elseif in_chunk and is_indented(line) then 84 | -- incorporate any blank lines seen in between indented lines 85 | for _, blank_line in ipairs(blank_lines) do 86 | table.insert(current_chunk.text, blank_line) 87 | end 88 | blank_lines = {} 89 | 90 | table.insert(current_chunk.text, unindented(line)) 91 | else 92 | blank_lines = {} 93 | in_chunk = false 94 | end 95 | end 96 | register_chunk(chunks, current_chunk) 97 | 98 | return chunks 99 | end 100 | 101 | function list_chunk_names_and_versions(chunks) 102 | io.write("# Listing versions and root chunk names.\n") 103 | io.write("# Version 12 is displayed as:\n") 104 | io.write("# v 12\n") 105 | io.write("# Chunk name foo bar is displayed as:\n") 106 | io.write("# n foo bar\n") 107 | io.write("# To tangle a particular root chunk, run:\n") 108 | io.write("# "..arg[0].." chunkname\n") 109 | io.write("# That tangles version 0 by default; to specify v69:\n") 110 | io.write("# "..arg[0].." 
chunkname 69\n") 111 | 112 | local versions, referenced_chunks = {}, {} 113 | for name, contents in pairs(chunks) do 114 | for _, it in ipairs(contents) do 115 | versions[it.v] = true 116 | 117 | for _, line in ipairs(it.text) do 118 | local _, chunkname = parse_reference(line) 119 | if chunkname ~= nil then 120 | referenced_chunks[chunkname] = true 121 | end 122 | end 123 | end 124 | end 125 | 126 | for version, _ in pairs(versions) do 127 | io.write(string.format("v %d\n", version)) 128 | end 129 | 130 | for name, _ in pairs(chunks) do 131 | if not referenced_chunks[name] then 132 | io.write("n "..name.."\n") 133 | end 134 | end 135 | end 136 | 137 | function get_chunk_text(contents, version) 138 | local best 139 | for _, it in ipairs(contents) do 140 | if it.v <= version and (not best or 141 | it.v > best.v) then 142 | best = it 143 | end 144 | end 145 | if best then return best.text else return nil end 146 | end 147 | 148 | do 149 | local contents = {{v=0, text={"a"}}, 150 | {v=2, text={"b"}}, 151 | {v=1, text={"c"}}} 152 | assert(get_chunk_text(contents, 0)[1] == "a") 153 | assert(get_chunk_text(contents, 1)[1] == "c") 154 | assert(get_chunk_text(contents, 2)[1] == "b") 155 | assert(get_chunk_text(contents, 3)[1] == "b") 156 | assert(get_chunk_text(contents, -1) == nil) 157 | end 158 | 159 | function parse_reference(line) 160 | return string.match(line, "^(%s*)<<(.*)>>(%s*)$") 161 | end 162 | 163 | do 164 | local indent, name = parse_reference(" <>\n") 165 | assert(indent == " ") 166 | assert(name == "foo") 167 | assert(parse_reference("bits << shiftlen >> 1") == nil) 168 | end 169 | 170 | function tangle(chunks, chunkname, version, indent) 171 | if indent == nil then indent = '' end 172 | 173 | local contents = chunks[chunkname] 174 | if contents == nil then 175 | error(string.format("chunk `%s` does not exist", 176 | chunkname)) 177 | end 178 | 179 | local text = get_chunk_text(contents, version) 180 | if text == nil then 181 | error(string.format("chunk 
`%s` has no version `%d`", 182 | chunkname, version)) 183 | end 184 | 185 | for _, line in ipairs(text) do 186 | local nindent, nchunkname = parse_reference(line) 187 | if nindent then 188 | tangle(chunks, nchunkname, version, indent..nindent) 189 | else 190 | io.write(indent..line.."\n") 191 | end 192 | end 193 | end 194 | 195 | local chunks = parse_input() 196 | chunkname, version = ... 197 | if chunkname == nil then 198 | list_chunk_names_and_versions(chunks) 199 | else 200 | if version == nil then version = 0 end 201 | tangle(chunks, chunkname, tonumber(version)) 202 | end 203 | -------------------------------------------------------------------------------- /output.js: -------------------------------------------------------------------------------- 1 | function parse_sp(input, pos) { 2 | var state = { pos: pos }; 3 | var stack = []; 4 | stack.push(state); 5 | state = literal(input, state.pos, ' '); 6 | if (state) { 7 | } 8 | if (!state) { 9 | state = stack.pop(); 10 | stack.push(state); 11 | state = literal(input, state.pos, '\n'); 12 | if (state) { 13 | } 14 | if (!state) { 15 | state = stack.pop(); 16 | state = literal(input, state.pos, '\t'); 17 | if (state) { 18 | } 19 | } else { 20 | stack.pop(); 21 | } 22 | } else { 23 | stack.pop(); 24 | } 25 | return state; 26 | } 27 | 28 | function parse__(input, pos) { 29 | var state = { pos: pos }; 30 | var stack = []; 31 | stack.push(state); 32 | state = parse_sp(input, state.pos); 33 | if (state) { 34 | state = parse__(input, state.pos); 35 | if (state) { 36 | } 37 | } 38 | if (!state) { 39 | state = stack.pop(); 40 | } else { 41 | stack.pop(); 42 | } 43 | return state; 44 | } 45 | 46 | function parse_rule(input, pos) { 47 | var state = { pos: pos }; 48 | var stack = []; 49 | state = parse_name(input, state.pos); 50 | if (state) var n = state.val; 51 | if (state) { 52 | state = parse__(input, state.pos); 53 | if (state) { 54 | state = literal(input, state.pos, '<-'); 55 | if (state) { 56 | state = parse__(input, 
state.pos); 57 | if (state) { 58 | state = parse_choice(input, state.pos); 59 | if (state) var body = state.val; 60 | if (state) { 61 | state = literal(input, state.pos, '.'); 62 | if (state) { 63 | state = parse__(input, state.pos); 64 | if (state) { 65 | if (state) state.val = ["function parse_", n, "(input, pos) {\n", 66 | ' var state = { pos: pos };\n', 67 | ' var stack = [];\n', 68 | body, 69 | ' return state;\n', 70 | "}\n"].join(''); 71 | } 72 | } 73 | } 74 | } 75 | } 76 | } 77 | } 78 | return state; 79 | } 80 | 81 | function parse_sentence(input, pos) { 82 | var state = { pos: pos }; 83 | var stack = []; 84 | stack.push(state); 85 | state = parse__(input, state.pos); 86 | if (state) { 87 | state = parse_rule(input, state.pos); 88 | if (state) var r = state.val; 89 | if (state) { 90 | state = parse_sentence(input, state.pos); 91 | if (state) var g = state.val; 92 | if (state) { 93 | if (state) state.val = r + "\n" + g; 94 | } 95 | } 96 | } 97 | if (!state) { 98 | state = stack.pop(); 99 | state = parse__(input, state.pos); 100 | if (state) { 101 | state = parse_rule(input, state.pos); 102 | if (state) var r = state.val; 103 | if (state) { 104 | if (state) state.val = r + "\n" 105 | + 'function parse_char(input, pos) {\n' 106 | + ' if (pos >= input.length) return null;\n' 107 | + ' return { pos: pos + 1, val: input.charAt(pos) };\n' 108 | + '}\n' 109 | + 'function literal(input, pos, string) {\n' 110 | + ' if (input.substr(pos, string.length) === string) {\n' 111 | + ' return { pos: pos + string.length, val: string };\n' 112 | + ' } else return null;\n' 113 | + '}\n' 114 | + "if (typeof exports !== 'undefined')\n" 115 | + " exports.parse_sentence = parse_sentence;\n" 116 | ; 117 | } 118 | } 119 | } else { 120 | stack.pop(); 121 | } 122 | return state; 123 | } 124 | 125 | function parse_meta(input, pos) { 126 | var state = { pos: pos }; 127 | var stack = []; 128 | stack.push(state); 129 | state = literal(input, state.pos, '!'); 130 | if (state) { 131 | } 132 | 
if (!state) { 133 | state = stack.pop(); 134 | stack.push(state); 135 | state = literal(input, state.pos, '\''); 136 | if (state) { 137 | } 138 | if (!state) { 139 | state = stack.pop(); 140 | stack.push(state); 141 | state = literal(input, state.pos, '<-'); 142 | if (state) { 143 | } 144 | if (!state) { 145 | state = stack.pop(); 146 | stack.push(state); 147 | state = literal(input, state.pos, '/'); 148 | if (state) { 149 | } 150 | if (!state) { 151 | state = stack.pop(); 152 | stack.push(state); 153 | state = literal(input, state.pos, '.'); 154 | if (state) { 155 | } 156 | if (!state) { 157 | state = stack.pop(); 158 | stack.push(state); 159 | state = literal(input, state.pos, '('); 160 | if (state) { 161 | } 162 | if (!state) { 163 | state = stack.pop(); 164 | stack.push(state); 165 | state = literal(input, state.pos, ')'); 166 | if (state) { 167 | } 168 | if (!state) { 169 | state = stack.pop(); 170 | stack.push(state); 171 | state = literal(input, state.pos, ':'); 172 | if (state) { 173 | } 174 | if (!state) { 175 | state = stack.pop(); 176 | state = literal(input, state.pos, '->'); 177 | if (state) { 178 | } 179 | } else { 180 | stack.pop(); 181 | } 182 | } else { 183 | stack.pop(); 184 | } 185 | } else { 186 | stack.pop(); 187 | } 188 | } else { 189 | stack.pop(); 190 | } 191 | } else { 192 | stack.pop(); 193 | } 194 | } else { 195 | stack.pop(); 196 | } 197 | } else { 198 | stack.pop(); 199 | } 200 | } else { 201 | stack.pop(); 202 | } 203 | return state; 204 | } 205 | 206 | function parse_name(input, pos) { 207 | var state = { pos: pos }; 208 | var stack = []; 209 | stack.push(state); 210 | state = parse_namechar(input, state.pos); 211 | if (state) var c = state.val; 212 | if (state) { 213 | state = parse_name(input, state.pos); 214 | if (state) var n = state.val; 215 | if (state) { 216 | if (state) state.val = c + n; 217 | } 218 | } 219 | if (!state) { 220 | state = stack.pop(); 221 | state = parse_namechar(input, state.pos); 222 | if (state) { 223 | } 
224 | } else { 225 | stack.pop(); 226 | } 227 | return state; 228 | } 229 | 230 | function parse_namechar(input, pos) { 231 | var state = { pos: pos }; 232 | var stack = []; 233 | stack.push(state); 234 | state = parse_meta(input, state.pos); 235 | if (state) { 236 | stack.pop(); 237 | state = null; 238 | } else { 239 | state = stack.pop(); 240 | } 241 | if (state) { 242 | stack.push(state); 243 | state = parse_sp(input, state.pos); 244 | if (state) { 245 | stack.pop(); 246 | state = null; 247 | } else { 248 | state = stack.pop(); 249 | } 250 | if (state) { 251 | state = parse_char(input, state.pos); 252 | if (state) { 253 | } 254 | } 255 | } 256 | return state; 257 | } 258 | 259 | function parse_term(input, pos) { 260 | var state = { pos: pos }; 261 | var stack = []; 262 | stack.push(state); 263 | state = parse_labeled(input, state.pos); 264 | if (state) { 265 | } 266 | if (!state) { 267 | state = stack.pop(); 268 | stack.push(state); 269 | state = parse_nonterminal(input, state.pos); 270 | if (state) { 271 | } 272 | if (!state) { 273 | state = stack.pop(); 274 | stack.push(state); 275 | state = parse_string(input, state.pos); 276 | if (state) { 277 | } 278 | if (!state) { 279 | state = stack.pop(); 280 | stack.push(state); 281 | state = parse_negation(input, state.pos); 282 | if (state) { 283 | } 284 | if (!state) { 285 | state = stack.pop(); 286 | state = parse_parenthesized(input, state.pos); 287 | if (state) { 288 | } 289 | } else { 290 | stack.pop(); 291 | } 292 | } else { 293 | stack.pop(); 294 | } 295 | } else { 296 | stack.pop(); 297 | } 298 | } else { 299 | stack.pop(); 300 | } 301 | return state; 302 | } 303 | 304 | function parse_nonterminal(input, pos) { 305 | var state = { pos: pos }; 306 | var stack = []; 307 | state = parse_name(input, state.pos); 308 | if (state) var n = state.val; 309 | if (state) { 310 | state = parse__(input, state.pos); 311 | if (state) { 312 | if (state) state.val = [' state = parse_', n, '(input, state.pos);\n'].join(''); 313 
| } 314 | } 315 | return state; 316 | } 317 | 318 | function parse_labeled(input, pos) { 319 | var state = { pos: pos }; 320 | var stack = []; 321 | state = parse_name(input, state.pos); 322 | if (state) var label = state.val; 323 | if (state) { 324 | state = parse__(input, state.pos); 325 | if (state) { 326 | state = literal(input, state.pos, ':'); 327 | if (state) { 328 | state = parse__(input, state.pos); 329 | if (state) { 330 | state = parse_term(input, state.pos); 331 | if (state) var value = state.val; 332 | if (state) { 333 | if (state) state.val = [value, ' if (state) var ', label, ' = state.val;\n'].join(''); 334 | } 335 | } 336 | } 337 | } 338 | } 339 | return state; 340 | } 341 | 342 | function parse_sequence(input, pos) { 343 | var state = { pos: pos }; 344 | var stack = []; 345 | stack.push(state); 346 | state = parse_term(input, state.pos); 347 | if (state) var foo = state.val; 348 | if (state) { 349 | state = parse_sequence(input, state.pos); 350 | if (state) var bar = state.val; 351 | if (state) { 352 | if (state) state.val = [foo, ' if (state) {\n', bar, ' }\n'].join(''); 353 | } 354 | } 355 | if (!state) { 356 | state = stack.pop(); 357 | stack.push(state); 358 | state = parse_result_expression(input, state.pos); 359 | if (state) { 360 | } 361 | if (!state) { 362 | state = stack.pop(); 363 | if (state) state.val = ''; 364 | } else { 365 | stack.pop(); 366 | } 367 | } else { 368 | stack.pop(); 369 | } 370 | return state; 371 | } 372 | 373 | function parse_string(input, pos) { 374 | var state = { pos: pos }; 375 | var stack = []; 376 | state = literal(input, state.pos, '\''); 377 | if (state) { 378 | state = parse_stringcontents(input, state.pos); 379 | if (state) var s = state.val; 380 | if (state) { 381 | state = literal(input, state.pos, '\''); 382 | if (state) { 383 | state = parse__(input, state.pos); 384 | if (state) { 385 | if (state) state.val = [" state = literal(input, state.pos, '", s, "');\n"].join(''); 386 | } 387 | } 388 | } 389 | } 
390 | return state; 391 | } 392 | 393 | function parse_stringcontents(input, pos) { 394 | var state = { pos: pos }; 395 | var stack = []; 396 | stack.push(state); 397 | stack.push(state); 398 | state = literal(input, state.pos, '\\'); 399 | if (state) { 400 | stack.pop(); 401 | state = null; 402 | } else { 403 | state = stack.pop(); 404 | } 405 | if (state) { 406 | stack.push(state); 407 | state = literal(input, state.pos, '\''); 408 | if (state) { 409 | stack.pop(); 410 | state = null; 411 | } else { 412 | state = stack.pop(); 413 | } 414 | if (state) { 415 | state = parse_char(input, state.pos); 416 | if (state) var c = state.val; 417 | if (state) { 418 | state = parse_stringcontents(input, state.pos); 419 | if (state) var s = state.val; 420 | if (state) { 421 | if (state) state.val = c + s; 422 | } 423 | } 424 | } 425 | } 426 | if (!state) { 427 | state = stack.pop(); 428 | stack.push(state); 429 | state = literal(input, state.pos, '\\'); 430 | if (state) var b = state.val; 431 | if (state) { 432 | state = parse_char(input, state.pos); 433 | if (state) var c = state.val; 434 | if (state) { 435 | state = parse_stringcontents(input, state.pos); 436 | if (state) var s = state.val; 437 | if (state) { 438 | if (state) state.val = b + c + s; 439 | } 440 | } 441 | } 442 | if (!state) { 443 | state = stack.pop(); 444 | if (state) state.val = ''; 445 | } else { 446 | stack.pop(); 447 | } 448 | } else { 449 | stack.pop(); 450 | } 451 | return state; 452 | } 453 | 454 | function parse_choice(input, pos) { 455 | var state = { pos: pos }; 456 | var stack = []; 457 | stack.push(state); 458 | state = parse_sequence(input, state.pos); 459 | if (state) var a = state.val; 460 | if (state) { 461 | state = literal(input, state.pos, '/'); 462 | if (state) { 463 | state = parse__(input, state.pos); 464 | if (state) { 465 | state = parse_choice(input, state.pos); 466 | if (state) var b = state.val; 467 | if (state) { 468 | if (state) state.val = [' stack.push(state);\n', 469 | a, 470 
| ' if (!state) {\n', 471 | ' state = stack.pop();\n', 472 | b, 473 | ' } else stack.pop();\n'].join(''); 474 | } 475 | } 476 | } 477 | } 478 | if (!state) { 479 | state = stack.pop(); 480 | state = parse_sequence(input, state.pos); 481 | if (state) { 482 | } 483 | } else { 484 | stack.pop(); 485 | } 486 | return state; 487 | } 488 | 489 | function parse_negation(input, pos) { 490 | var state = { pos: pos }; 491 | var stack = []; 492 | state = literal(input, state.pos, '!'); 493 | if (state) { 494 | state = parse__(input, state.pos); 495 | if (state) { 496 | state = parse_term(input, state.pos); 497 | if (state) var t = state.val; 498 | if (state) { 499 | if (state) state.val = [' stack.push(state);\n', 500 | t, 501 | ' if (state) {\n', 502 | ' stack.pop();\n', 503 | ' state = null;\n', 504 | ' } else state = stack.pop();\n'].join(''); 505 | } 506 | } 507 | } 508 | return state; 509 | } 510 | 511 | function parse_result_expression(input, pos) { 512 | var state = { pos: pos }; 513 | var stack = []; 514 | state = literal(input, state.pos, '->'); 515 | if (state) { 516 | state = parse__(input, state.pos); 517 | if (state) { 518 | state = parse_expr(input, state.pos); 519 | if (state) var result = state.val; 520 | if (state) { 521 | state = parse__(input, state.pos); 522 | if (state) { 523 | if (state) state.val = [' if (state) state.val = ', result, ';\n'].join(''); 524 | } 525 | } 526 | } 527 | } 528 | return state; 529 | } 530 | 531 | function parse_expr(input, pos) { 532 | var state = { pos: pos }; 533 | var stack = []; 534 | state = literal(input, state.pos, '('); 535 | if (state) { 536 | state = parse__(input, state.pos); 537 | if (state) { 538 | state = parse_exprcontents(input, state.pos); 539 | if (state) var e = state.val; 540 | if (state) { 541 | state = literal(input, state.pos, ')'); 542 | if (state) { 543 | if (state) state.val = '(' + e + ')'; 544 | } 545 | } 546 | } 547 | } 548 | return state; 549 | } 550 | 551 | function parse_exprcontents(input, pos) 
{ 552 | var state = { pos: pos }; 553 | var stack = []; 554 | stack.push(state); 555 | stack.push(state); 556 | stack.push(state); 557 | state = literal(input, state.pos, '('); 558 | if (state) { 559 | stack.pop(); 560 | state = null; 561 | } else { 562 | state = stack.pop(); 563 | } 564 | if (state) { 565 | stack.push(state); 566 | state = literal(input, state.pos, ')'); 567 | if (state) { 568 | stack.pop(); 569 | state = null; 570 | } else { 571 | state = stack.pop(); 572 | } 573 | if (state) { 574 | state = parse_char(input, state.pos); 575 | if (state) { 576 | } 577 | } 578 | } 579 | if (!state) { 580 | state = stack.pop(); 581 | state = parse_expr(input, state.pos); 582 | if (state) { 583 | } 584 | } else { 585 | stack.pop(); 586 | } 587 | if (state) var c = state.val; 588 | if (state) { 589 | state = parse_exprcontents(input, state.pos); 590 | if (state) var e = state.val; 591 | if (state) { 592 | if (state) state.val = c + e; 593 | } 594 | } 595 | if (!state) { 596 | state = stack.pop(); 597 | if (state) state.val = ''; 598 | } else { 599 | stack.pop(); 600 | } 601 | return state; 602 | } 603 | 604 | function parse_parenthesized(input, pos) { 605 | var state = { pos: pos }; 606 | var stack = []; 607 | state = literal(input, state.pos, '('); 608 | if (state) { 609 | state = parse__(input, state.pos); 610 | if (state) { 611 | state = parse_choice(input, state.pos); 612 | if (state) var body = state.val; 613 | if (state) { 614 | state = literal(input, state.pos, ')'); 615 | if (state) { 616 | state = parse__(input, state.pos); 617 | if (state) { 618 | if (state) state.val = body; 619 | } 620 | } 621 | } 622 | } 623 | } 624 | return state; 625 | } 626 | 627 | function parse_char(input, pos) { 628 | if (pos >= input.length) return null; 629 | return { pos: pos + 1, val: input.charAt(pos) }; 630 | } 631 | function literal(input, pos, string) { 632 | if (input.substr(pos, string.length) === string) { 633 | return { pos: pos + string.length, val: string }; 634 | } 
else return null; 635 | } 636 | if (typeof exports !== "undefined") { 637 | exports.parse_sentence = parse_sentence; 638 | } 639 | -------------------------------------------------------------------------------- /handaxeweb.md: -------------------------------------------------------------------------------- 1 | handaxeweb: a minimalist literate-programming system 2 | ==================================================== 3 | 4 | > Let us change our traditional attitude to the construction 5 | > of programs: Instead of imagining that our main task is to 6 | > instruct a computer what to do, let us concentrate rather 7 | > on explaining to humans what we want the computer to do. 8 | 9 | > > — Donald E. Knuth, "Literate Programming", in The Computer 10 | > > Journal, 1984, p.99 11 | 12 | Literate-programming systems are systems for writing programs 13 | that are optimized for readability. This is a very simple 14 | literate-programming system called “handaxeweb” 15 | that supports multiple versions of a program in the same 16 | HTML or Markdown document. 17 | 18 | What literate programming is, and how handaxeweb is related to other such systems 19 | --------------------------------------------------------------------------------- 20 | 21 | Traditionally a literate-programming system contains two 22 | programs: one called `tangle`, to feed the program to the compiler, 23 | and one to 24 | produce a printable version called `weave` (related to a 25 | famous couplet alluding to webs). 26 | 27 | Following noweb, handaxeweb doesn’t make any attempt to produce a 28 | “woven” output for human consumption; it only tangles. 29 | The idea is that you 30 | write your literate program either as a plain ASCII text 31 | document, or in Markdown or something, as long as it permits 32 | you to write segments of your program indented by four 33 | spaces. 
34 | 35 | ### Phil Bewig’s “The Essence of Literate Programming”: the inspiration ### 36 | 37 | handaxeweb is more directly inspired by Phil Bewig’s “The 38 | Essence of Literate Programming”, a post on 39 | comp.programming.literate on 1996-05-27, message-id 40 | ``, containing the following 41 | noweb-like literate-programming system written in awk: 42 | 43 | # in The Essence of Literate Programming: 44 | /^<<.+>>=$/ { 45 | name = substr($0, 3, length($0) - 5) 46 | while (getline > 0) { 47 | if (length($0) == 0) next 48 | chunk[name, ++count[name]] = $0 } } 49 | END { tangle("*", ""); printf "\n" } 50 | function tangle(name, prefix, i, tag, suffix) { 51 | for (i = 1; i <= count[name]; i++) { 52 | if (i == 2) gsub(/[^ \t]/, " ", prefix) 53 | if (match(chunk[name,i], /<<.+>>/)) { 54 | tag = substr(chunk[name,i], RSTART + 2, RLENGTH - 4) 55 | if (tag in count) { 56 | suffix = substr(chunk[name,i], RSTART + RLENGTH) 57 | tangle(tag, prefix substr(chunk[name,i], 1, RSTART - 1)) 58 | printf "%s", suffix } 59 | else printf "%s%s", prefix, chunk[name,i] } 60 | else printf "%s%s", prefix, chunk[name,i] 61 | if (i < count[name]) printf "\n" } } 62 | 63 | He explained: 64 | 65 | > The essence of literate programming is rearranging chunks 66 | > of code, and a dozen and a half lines of awk is all you 67 | > need for that. 68 | > 69 | > Of course, with so little code it's not possible for 70 | > everything to be perfect. … Even so, this microscopic 71 | > system provides a useful tool that encompasses the essence 72 | > of literate programming. 73 | 74 | ### Overview of handaxeweb's features ### 75 | 76 | Unfortunately, handaxeweb is 208 lines of code, twice the 77 | size of the previous Python version, and 78 | more than ten 79 | times the size of The Essence of Literate Programming (a full 80 | sixth of the size of CWEB!). 
But it 81 | solves a couple of other problems that I need for my 82 | purposes: 83 | 84 | * versioning: multiple versions of the same program in the 85 | same version of the same document; 86 | * multiple separate programs in the same document; 87 | * listing the programs and versions in a document; 88 | * indentation (needed for languages like Python); 89 | * support for Markdown, which is how I write most 90 | human-readable documents these days. 91 | 92 | Literate programs may contain multiple versions of the program 93 | -------------------------------------------------------------- 94 | 95 | Versioning is one of the biggest problems I've had with the 96 | previous version of handaxeweb, written in Python. 97 | 98 | When I write a literate program, there are often bits of it 99 | that are present for scaffolding in initial versions which 100 | then should be removed in future versions. This is especially 101 | true with these bootstrapping-compiler things 102 | I've been writing lately, where the 103 | initial version of the bootstrapping compiler supports a 104 | minimal number of features and can barely compile itself, 105 | while later versions share a lot of code with the first 106 | version --- but all the versions coexist simultaneously, and 107 | I want to be able to make a bug-fix in the shared code. 108 | 109 | The programming language itself can provide some support for 110 | this, as e.g. CSS does. But what about the case where the 111 | language itself doesn’t help much? 112 | 113 | One obviously possible approach is to redefine the program 114 | from the root down; e.g., first you say 115 | 116 | in the initial version: 117 | <> 118 | <> 119 | <
> 120 | 121 | And defining each of those pieces: 122 | 123 | in initializations: 124 | <> 125 | <> 126 | 127 | etc., and then for the next version: 128 | 129 | in the new version: 130 | <> 131 | <> 132 | <
> 133 | 134 | with new versions of whatever treenodes have changed, such as: 135 | 136 | in new initializations: 137 | <> 138 | <> 139 | 140 | and “new initialize I/O layer”. 141 | 142 | Obviously this is pretty suboptimal in terms of requiring a 143 | lot of copy-and-pasted text that doesn’t really help the 144 | reader. 145 | 146 | Version numbers on chunks allow such versions gracefully 147 | -------------------------------------------------------- 148 | 149 | Here’s a better idea. Every named chunk can have several 150 | versions, each with a version number. The name of the chunk 151 | when it’s being defined may end with “v312” to indicate that 152 | the text that follows belongs to version 312. Otherwise, it 153 | belongs to version 0. You can tangle any version N of any 154 | chunk; this will use the highest-numbered version <= N of 155 | each referenced chunk. 156 | 157 | This means that you can get the effect of the repetition 158 | above simply by saying: 159 | 160 | in initialize I/O layer v1: 161 | 162 | and then tangling v1 of “initial version”. 163 | 164 | The syntax of handaxeweb 165 | ------------------------ 166 | 167 | The previous version of handaxeweb uses 168 | indented lines of the form “(in foo)” to start new named 169 | chunks. This is pretty reasonable, but it would be better if 170 | the line could be a valid comment in whatever language, to 171 | better support syntax-highlighting. So the right thing to do 172 | is to omit leading and trailing punctuation, but require a 173 | trailing ":", as in the previous examples in this document. 174 | 175 | Beyond that, the syntax of `handaxeweb` is simply that 176 | program code is indented by four spaces, and references to 177 | other chunks are enclosed in `<<>>`. 
178 | 179 | handaxeweb, the program 180 | ----------------------- 181 | 182 | -- in handaxeweb.lua: 183 | #!/usr/bin/lua 184 | <> 185 | 186 | <> 187 | <> 188 | 189 | The main actions desired are to list the possible chunk names 190 | and version numbers, and to tangle a particular chunk with a 191 | particular version number. 192 | 193 | -- in carry out specified action on it: 194 | chunkname, version = ... 195 | if chunkname == nil then 196 | list_chunk_names_and_versions(chunks) 197 | else 198 | if version == nil then version = 0 end 199 | tangle(chunks, chunkname, tonumber(version)) 200 | end 201 | 202 | The problem of reading the input program can be factored into 203 | a third subroutine: 204 | 205 | -- in read input literate program: 206 | local chunks = parse_input() 207 | 208 | So, the definitions so far needed: 209 | 210 | -- in definitions: 211 | <> 212 | 213 | <> 214 | 215 | <> 216 | 217 | These three need to share a common idea of the contents of 218 | the variable `chunks`. I think it should be a hash from chunk 219 | names to lists of chunk versions, where each version contains 220 | a version number and some text, stored as a list of 221 | lines. 222 | 223 | -- in an example of the chunks variable: 224 | {['read input literate program'] = 225 | {{v=0, text={"local chunks = parse_input()", ...}, 226 | {v=1, ...} 227 | ...} 228 | }, 229 | parse_input={{v=0...}, ...}, 230 | ... 231 | } 232 | 233 | ### `parse_input` ### 234 | 235 | The job of `parse_input` is to turn the input file into such 236 | a structure. It looks for sequences of lines indented by at 237 | least four spaces to use as chunks; they may begin with a 238 | header line specifying their name and version, or they may 239 | just be a continuation of some previous chunk with a name and 240 | version. 241 | 242 | We start with a nameless chunk that will be discarded. 
243 | 244 | -- in parse_input: 245 | <> 246 | 247 | function parse_input() 248 | local chunks, current_chunk, in_chunk = {}, {text={}}, false 249 | local blank_lines = {} 250 | 251 | for line in io.lines() do 252 | if string.match(line, "^%s*$") then -- blank line 253 | <> 254 | elseif not in_chunk and is_indented(line) then 255 | <> 256 | in_chunk = true 257 | elseif in_chunk and is_indented(line) then 258 | <> 259 | else 260 | blank_lines = {} 261 | in_chunk = false 262 | end 263 | end 264 | <> 265 | 266 | return chunks 267 | end 268 | 269 | Initially `current_chunk` is `nil`, and we don’t start a 270 | `current_chunk` until we see a header line. After that, 271 | `current_chunk.text` is always a list. 272 | 273 | We need special handling for blank lines because they can 274 | occur inside of an indented region, but not have any spaces 275 | on them, depending on editor settings. So in this case we 276 | leave untouched the `in_chunk` setting, telling us whether we're in the 277 | middle of an indented chunk, and we append the blank 278 | line to a list that gets incorporated only if more nonblank 279 | indented lines appear. 280 | 281 | -- in handle blank line: 282 | if in_chunk then table.insert(blank_lines, "") end 283 | 284 | Handling a normal indented line is very easy. Any parsing 285 | will be handled later by `tangle`. 286 | 287 | -- in handle normal indented line: 288 | -- incorporate any blank lines seen in between indented lines 289 | for _, blank_line in ipairs(blank_lines) do 290 | table.insert(current_chunk.text, blank_line) 291 | end 292 | blank_lines = {} 293 | 294 | table.insert(current_chunk.text, unindented(line)) 295 | 296 | The possible header line may be either a header line (not 297 | included in the chunk itself) or an ordinary chunk line, 298 | possibly adding more lines onto the previous chunk. 
299 | 300 | -- in handle possible header line: 301 | local label = get_chunk_label(line) 302 | 303 | if label then -- if that succeeded, change chunks 304 | register_chunk(chunks, current_chunk) 305 | local name, ver = parse_chunk_label(label) 306 | current_chunk = {name = name, v = ver, text = {}} 307 | else 308 | <> 309 | end 310 | 311 | At the end of input, we just need to handle the last chunk: 312 | 313 | -- in handle last chunk: 314 | register_chunk(chunks, current_chunk) 315 | 316 | So the `parse_input` function itself depends on a few other 317 | functions: 318 | 319 | -- in parse_input definitions: 320 | <> 321 | 322 | <> 323 | 324 | <> 325 | 326 | <> 327 | 328 | <> 329 | 330 | `register_chunk` is the only thing that actually builds the 331 | table `chunks`. It has to deal with questions of 332 | duplicate-handling, and discard the initial nil chunk. 333 | 334 | With regard to duplicate-handling: if there are multiple 335 | chunks with the same name and version, then we concatenate 336 | them. This supports two important uses: 337 | 338 | 1. It allows you to intersperse formatted text with the lines 339 | of a chunk without having to add header lines all over the 340 | place. If you like, you can write your entire program this 341 | way, with just a single header line at the top. 342 | 343 | 2. It allows you to progressively add to multiple sections in 344 | parallel throughout your document. The example given in 345 | the CWEB manual is that you might have one section for all 346 | your global variables, progressively adding things to 347 | it. Some other examples follow: in C, it’s often 348 | convenient to put a declaration into a `.h` file at the 349 | same time as an implementation into a `.c` file; in a 350 | bytecode virtual machine, it may be convenient to put 351 | cases into a centralized `switch` statement at the same 352 | time as defining functions that those cases call. 353 | 354 | However, it may run into some difficulty with versioning. 
If 355 | you define a new version of a chunk, then in that version, it 356 | replaces all of the text in that chunk, not just one 357 | paragraph of it. Clearly if those paragraphs are spread all 358 | over your document, that’s going to be hard to get right. 359 | 360 | -- in register_chunk: 361 | function register_chunk(chunks, new_chunk) 362 | if new_chunk.name == nil then return end 363 | 364 | local contents = chunks[new_chunk.name] 365 | if not contents then 366 | contents = {} 367 | chunks[new_chunk.name] = contents 368 | end 369 | 370 | -- If there’s a duplicate, append text to it. 371 | for _, it in ipairs(chunks[new_chunk.name]) do 372 | if it.v == new_chunk.v then 373 | for _, line in ipairs(new_chunk.text) do 374 | table.insert(it.text, line) 375 | end 376 | return 377 | end 378 | end 379 | 380 | -- No duplicate. Add to table. 381 | table.insert(contents, new_chunk) 382 | end 383 | 384 | The indentation functions are very simple. 385 | 386 | -- in is_indented: 387 | function is_indented(line) 388 | return string.match(line, "^ ") 389 | end 390 | 391 | assert( is_indented(" hi")) 392 | assert(not is_indented(" hi")) 393 | assert(not is_indented(" hi ")) 394 | 395 | The `unindented` function assumes the line is indented. 396 | 397 | -- in unindented: 398 | function unindented(line) return string.sub(line, 5) end 399 | assert(unindented(" hi\n") == "hi\n") 400 | 401 | Recognizing the chunk labels is not too hard with Lua’s 402 | pattern-matching: 403 | 404 | -- in get_chunk_label: 405 | function get_chunk_label(line) 406 | return string.match(line, "^[^%w]*in (.*):[^%w]*$") 407 | end 408 | 409 | assert(get_chunk_label("-- in handaxeweb.lua:") == 410 | "handaxeweb.lua") 411 | assert(get_chunk_label("/* in handaxeweb.c: */") == 412 | "handaxeweb.c") 413 | assert(get_chunk_label("# in a minute: #\n") == 414 | "a minute") 415 | 416 | Pulling the version number out can be done similarly easily. 
417 | 418 | -- in parse_chunk_label: 419 | function parse_chunk_label(label) 420 | local name, version = 421 | string.match(label, "(.*) v(%d+)$") 422 | if name then return name, tonumber(version) 423 | else return label, 0 end 424 | end 425 | 426 | assert(parse_chunk_label("foo") == "foo") 427 | assert(({parse_chunk_label("foo")})[2] == 0) 428 | assert(parse_chunk_label("foo v32") == "foo") 429 | assert(({parse_chunk_label("foo v32")})[2] == 32) 430 | 431 | That covers all that’s needed to parse input. 432 | 433 | ### `tangle` ### 434 | 435 | This is the subroutine whose job it is 436 | to produce a runnable version of a 437 | literate program. 438 | 439 | Our `tangle` routine in this case is passed the name of an 440 | initial chunk and a version number. In order for it to be 441 | able to invoke itself recursively and still produce readable 442 | output (and, in Python, parseable output) it also takes an 443 | indentation parameter. 444 | 445 | -- in tangle: 446 | <> 447 | 448 | function tangle(chunks, chunkname, version, indent) 449 | if indent == nil then indent = '' end 450 | 451 | <> 452 | 453 | for _, line in ipairs(text) do 454 | local nindent, nchunkname = parse_reference(line) 455 | if nindent then 456 | tangle(chunks, nchunkname, version, indent..nindent) 457 | else 458 | io.write(indent..line.."\n") 459 | end 460 | end 461 | end 462 | 463 | This is simple enough: when we encounter a reference, we 464 | recurse, concatenating the indentation; and otherwise we 465 | simply indent the line and output it. (The indentation is 466 | essential for languages like Haskell and Python.) 467 | 468 | The process of getting the text must worry about error 469 | conditions. 
470 | 471 | -- in get the text of the chunk: 472 | local contents = chunks[chunkname] 473 | if contents == nil then 474 | error(string.format("chunk `%s` does not exist", 475 | chunkname)) 476 | end 477 | 478 | local text = get_chunk_text(contents, version) 479 | if text == nil then 480 | error(string.format("chunk `%s` has no version `%d`", 481 | chunkname, version)) 482 | end 483 | 484 | This depends on functions `get_chunk_text` and `parse_reference`. 485 | 486 | -- in tangle definitions: 487 | <> 488 | 489 | <> 490 | 491 | `get_chunk_text` need only walk the relevant part of the 492 | `chunks` table. Recall that the contents for a chunk are 493 | simply stored as a list of `{v=3, text="foo"}` structs, so we 494 | can pull them out as follows: 495 | 496 | -- in get_chunk_text: 497 | function get_chunk_text(contents, version) 498 | local best 499 | for _, it in ipairs(contents) do 500 | if it.v <= version and (not best or 501 | it.v > best.v) then 502 | best = it 503 | end 504 | end 505 | if best then return best.text else return nil end 506 | end 507 | 508 | do 509 | local contents = {{v=0, text={"a"}}, 510 | {v=2, text={"b"}}, 511 | {v=1, text={"c"}}} 512 | assert(get_chunk_text(contents, 0)[1] == "a") 513 | assert(get_chunk_text(contents, 1)[1] == "c") 514 | assert(get_chunk_text(contents, 2)[1] == "b") 515 | assert(get_chunk_text(contents, 3)[1] == "b") 516 | assert(get_chunk_text(contents, -1) == nil) 517 | end 518 | 519 | `parse_reference` just needs to match the `<>` 520 | references and pull out whatever indentation precedes them; 521 | it turns out Lua’s pattern-matching can do this directly. 
522 | 523 | -- in parse_reference: 524 | function parse_reference(line) 525 | return string.match(line, "^(%s*)<<(.*)>>(%s*)$") 526 | end 527 | 528 | do 529 | local indent, name = parse_reference(" <>\n") 530 | assert(indent == " ") 531 | assert(name == "foo") 532 | assert(parse_reference("bits << shiftlen >> 1") == nil) 533 | end 534 | 535 | ### `list_chunk_names_and_versions` ### 536 | 537 | Given this structure, listing either the chunk names or the 538 | versions should be simple. Unfortunately, listing both of 539 | them is a little annoying, because the output then requires 540 | parsing. But we can take advantage of this to be more 541 | explanatory. 542 | 543 | We’d like to only list the names of *root chunks*, that is, 544 | those that aren’t included in any other chunk. Often there 545 | will be only one of them. 546 | 547 | -- in list_chunk_names_and_versions: 548 | function list_chunk_names_and_versions(chunks) 549 | <> 550 | 551 | <> 552 | 553 | <> 554 | 555 | <> 556 | end 557 | 558 | We’ll output one thing per line: 559 | 560 | -- in display help message: 561 | io.write("# Listing versions and root chunk names.\n") 562 | io.write("# Version 12 is displayed as:\n") 563 | io.write("# v 12\n") 564 | io.write("# Chunk name foo bar is displayed as:\n") 565 | io.write("# n foo bar\n") 566 | io.write("# To tangle a particular root chunk, run:\n") 567 | io.write("# "..arg[0].." chunkname\n") 568 | io.write("# That tangles version 0 by default; to specify v69:\n") 569 | io.write("# "..arg[0].." chunkname 69\n") 570 | 571 | We traverse the table to build up information for what we 572 | display later. 
573 | 574 | -- in traverse chunks table: 575 | local versions, referenced_chunks = {}, {} 576 | for name, contents in pairs(chunks) do 577 | for _, it in ipairs(contents) do 578 | versions[it.v] = true 579 | 580 | for _, line in ipairs(it.text) do 581 | local _, chunkname = parse_reference(line) 582 | if chunkname ~= nil then 583 | referenced_chunks[chunkname] = true 584 | end 585 | end 586 | end 587 | end 588 | 589 | Then displaying the versions is easy; we need only to produce 590 | the keys from the versions table: 591 | 592 | -- in display versions: 593 | for version, _ in pairs(versions) do 594 | io.write(string.format("v %d\n", version)) 595 | end 596 | 597 | Displaying the chunk names is almost as easy: 598 | 599 | -- in display chunk names: 600 | for name, _ in pairs(chunks) do 601 | if not referenced_chunks[name] then 602 | io.write("n "..name.."\n") 603 | end 604 | end 605 | 606 | The build script 607 | ---------------- 608 | 609 | Rebuilding handaxeweb from this document by hand is a 610 | little tedious. So here's a shell script that syntax-checks 611 | and double-compile checks. 612 | 613 | # in build_handaxeweb: 614 | #!/bin/sh 615 | set -ve 616 | ./handaxeweb.lua handaxeweb.lua 0 < handaxeweb.md > handaxeweb2.lua 617 | 618 | # test new version 619 | lua handaxeweb2.lua handaxeweb.lua 0 < handaxeweb.md > handaxeweb3.lua 620 | 621 | # try building it with itself: 622 | lua handaxeweb3.lua handaxeweb.lua 0 < handaxeweb.md > handaxeweb4.lua 623 | 624 | # verify output is the same: 625 | diff handaxeweb3.lua handaxeweb4.lua 626 | 627 | # okay, we’ll accept it 628 | cp handaxeweb4.lua handaxeweb.lua 629 | 630 | ./handaxeweb.lua build_handaxeweb 0 < handaxeweb.md > build_handaxeweb.new 631 | cp build_handaxeweb.new build_handaxeweb 632 | 633 | Flaws in handaxeweb 634 | ------------------- 635 | 636 | There are several things I could do to improve this program 637 | without changing its functionality. 
638 | 639 | (in this part of the document there is no code:) 640 | (This note is needed because of how 641 | Markdown structures nested lists, sigh.) 642 | 643 | * The state machine in `parse_input` is obtuse and bug-prone. 644 | 645 | * There are a number of subroutines and abstraction layers 646 | that would simplify the main program logic: 647 | 648 | * Appending one list to another (in two places). 649 | * Some kind of parsing machinery, probably. 650 | * An ordered container supporting insertion and 651 | nearest-match searching. 652 | * Set arithmetic; in particular, set subtraction. 653 | * Collections stuff: keys of a table, mapping a function 654 | over a list, printing all the items in a list. 655 | 656 | * Appending to a versioned chunk is still kind of 657 | inconvenient. If you could say `<>` this 658 | problem would mostly go away. 659 | 660 | * The default to output should probably be the last version, 661 | not version 0. 662 | 663 | * There’s still no syntax highlighting or tables of contents 664 | in the output. 665 | 666 | * Emacs isn’t smart enough to do syntax highlighting in the 667 | input. 668 | 669 | * Compiler error messages are subpar because handaxeweb 670 | doesn’t know enough to generate `#line` directives. (And 671 | for some languages, there is no such thing.) 672 | 673 | Probably the right thing to do for some of these problems is 674 | to use parsing tools to parse the input. 675 | 676 | ### a PEG for handaxeweb’s input ### 677 | 678 | # in a PEG for handaxeweb: 679 | # Top-level constructs, down to the paragraph level: 680 | litprog <- (!chunk (bl / textpara / codepara))* chunk*. 681 | chunk <- header (textpara* !header codepara)*. 682 | codepara <- first: indented+ more: (bl+ indented+)*. 683 | textpara <- bl* unindented+ bl*. 684 | 685 | # Types of lines: 686 | header <- indent nonalnum* "in " defname ":" nonalnum* nl. 687 | indented <- !bl indent (more: wsp* reference / text: normal+) nl. 688 | bl <- wsp* nl. # Blank line. 
689 | unindented <- !indent normal+ nl. 690 | 691 | # Syntax within lines: 692 | defname <- name: (!version normal)* version. 693 | version <- (" v" n: number+ / ) !!":". 694 | reference <- "<<" name: (!">>" normal)* ">>". 695 | indent <- " ". 696 | 697 | # Character classes: 698 | nonalnum <- !alnum normal. 699 | alnum <- uppercase / lowercase / number. 700 | uppercase <- "A" / "B" / "C" / "D" / "E" / "F" / "G" / 701 | "H" / "I" / "J" / "K" / "L" / "M" / "N" / 702 | "O" / "P" / "Q" / "R" / "S" / "T" / "U" / 703 | "V" / "W" / "X" / "Y" / "Z". 704 | lowercase <- "a" / "b" / "c" / "d" / "e" / "f" / "g" / 705 | "h" / "i" / "j" / "k" / "l" / "m" / "n" / 706 | "o" / "p" / "q" / "r" / "s" / "t" / "u" / 707 | "v" / "w" / "x" / "y" / "z". 708 | number <- "0" / "1" / "2" / "3" / "4" / 709 | "5" / "6" / "7" / "8" / "9". 710 | normal <- !nl char. 711 | nl <- "\n". 712 | wsp <- " " / "\t". 713 | 714 | And that pretty much covers the entire deep structure of the 715 | input. All the indentation, logic of blank lines between 716 | other indented lines, parsing of references, version numbers, 717 | carrying chunk headers from one indented region to the next, 718 | and so on, is in there. The only thing that really remains to 719 | be done is specifying what to do with it: concatenate the 720 | `first` and `more` parts of `codepara`s, default version 721 | numbers to zero, dump the codepara parts of chunks into a 722 | dictionary of ordered-search structures, and then run 723 | `tangle`. 724 | 725 | (The grammar is slightly different from the one implemented 726 | by my current implementation: it no longer allows : or >>, in 727 | different contexts, inside of chunk names.) 
728 | 729 | 730 | 731 | -------------------------------------------------------------------------------- /peg.md: -------------------------------------------------------------------------------- 1 | PEGs in a PEG 2 | ============= 3 | 4 | So I was reading [Bryan Ford’s thesis][ford] 5 | about parsing expression grammars and packrat parsers, 6 | and I thought it would be fun to implement them 7 | and see how easy they really were. 8 | 9 | It turns out they’re not that hard; 10 | this document contains a one-page PEG parser generator 11 | that generates PEG parsers in JavaScript, 12 | along with an explanation of how it works, 13 | and some example applications. 14 | If you’ve ever thought 15 | that writing a compiler was deep magic 16 | because parsing would take you way too long to understand, 17 | this should show you 18 | that writing a compiler can be simple! 19 | (At least, 20 | if you already know how to program.) 21 | 22 | [ford]: http://pdos.csail.mit.edu/~baford/packrat/thesis/ "Packrat Parsing: a Practical Linear-Time Algorithm with Backtracking" 23 | 24 | What Are PEGs? 25 | -------------- 26 | 27 | A PEG is a formal language description 28 | which describes how to parse some language — 29 | like a regular expression, 30 | it describes the structure of some set of strings. 31 | 32 | ### A Gentle Introduction by Example ### 33 | 34 | Here’s a simple PEG 35 | which describes simple arithmetic expressions 36 | with no operator precedence: 37 | 38 | # in an example arithmetic parser: 39 | sentence <- ('0' / '1' / '2' / '3' / '4' / '5' / '6' / '7' / '8' / '9')+ 40 | ( ('+' / '-' / '*' / '×' / '/' / '÷') sentence / ). 41 | 42 | This says that a `sentence` is 43 | one or more digits, 44 | followed by either an operator and another `sentence`, 45 | or nothing. 
46 | The parentheses are used for grouping; 47 | apostrophes `''` are used for literal text; 48 | slashes `/` are used for choice 49 | (“try parsing this, and if it doesn’t work out, try that”); 50 | a left arrow `<-` is used to attach a name 51 | (called a “nonterminal”) 52 | to a parsing rule; 53 | and `x+` means “one or more of `x`”. 54 | 55 | (Typically, 56 | each of the strings that belongs to a language, 57 | such as a program in a programming language, 58 | is called a “sentence” of that language; 59 | thus my choice of that nonterminal name.) 60 | 61 | So, to parse `2*30+4` as a `sentence`, 62 | first we try matching a `0` at the beginning, 63 | where there’s a `2`; 64 | that doesn’t work, so we try a `1`; 65 | that doesn't work, so we try a `2`. 66 | That does work, so then we try for repetition, 67 | looking for a second digit 68 | at the `*`. 69 | That doesn’t work out (after ten tries), so we zoom along and look for a `+`. 70 | The `*` isn’t a `+`, so after a couple of tries, 71 | we find out it’s a `*`. 72 | Then we try parsing a nested `sentence` starting at the `3`. 73 | This time, we match the `3` after three tries, 74 | and then when we look for a second digit, we find a `0`; 75 | the third try fails, so we look for a `+`, and find it; 76 | then we look for a second nested `sentence`. 77 | We match a `4` after four tries, 78 | but we don’t find another digit after it 79 | (because there isn’t anything after it), 80 | so we try to find an operator after it, 81 | which doesn’t work, 82 | so we try to find nothing after it 83 | (the emptiness after the `/` after `sentence`) 84 | which works, 85 | and we’re done. 86 | 87 | Notice that this doesn’t respect operator precedence 88 | (it gives `2*(30+4)` rather than `(2*30)+4`), 89 | and also associates to the right. 
90 | 91 | Here’s an example PEG 92 | that handles operator precedence and parentheses, 93 | although not associativity: 94 | 95 | # in an example arithmetic parser with precedence: 96 | sentence <- term ('+'/'-') sentence / term. 97 | term <- atom ('*' / '×' / '/' / '÷') term / atom. 98 | atom <- number / '(' sentence ')'. 99 | number <- ('0' / '1' / '2' / '3' / '4' / '5' / '6' / '7' / '8' / '9')+. 100 | 101 | If we try to parse the same `2*30+4` with this grammar, 102 | we get down to `number` and parse the `2`, 103 | so `atom` succeeds with the `2`, 104 | and then `term` sucks up the `*` 105 | and then looks for an inner `term` at the `3`. 106 | Then `number` parses `30`, 107 | and the inner `term` looks for one of `*×/÷` after it, 108 | which doesn’t work out since what’s after it is a `+`, 109 | so it gives up on its first alternative and tries to parse 110 | just an `atom` starting at the `3`, 111 | rather than an `atom` followed by an operator and another term. 112 | Then `atom` sucks up the `30` just like before, 113 | and the inner `term` finishes, 114 | and then the outer `term` finishes, 115 | and it’s up to `sentence` to deal with the `+4` bit, 116 | which it does in the predictable way. 117 | 118 | It won’t handle `40-1-1` as `(40-1)-1` as you might hope, though. 119 | If you try to rewrite `sentence` to handle this 120 | as `sentence ('+'/'-') term / term`, 121 | you run into trouble — 122 | the first thing `sentence` does 123 | is try to parse a `sentence`, 124 | so you get into an infinite loop. 125 | There are different ways to ameliorate this problem 126 | by enhancing the parser generator, 127 | but in general, 128 | you can always figure out a way 129 | to modify the grammar 130 | to remove this “left recursion”; 131 | it just makes it a little more complicated 132 | to handle the results of the parser. 
133 | 134 | (As an aside, most practical PEG systems 135 | let you abbreviate things like 136 | `('0' / '1' / '2' / '3' / '4' / '5' / '6' / '7' / '8' / '9')` 137 | as `[0-9]`, 138 | but the one in this document doesn’t.) 139 | 140 | That covers most of the stuff PEGs can do. 141 | A few things to notice: 142 | 143 | 1. They’re a little more verbose than regular expressions 144 | but a lot more powerful at understanding structure. 145 | And, like with regexps, 146 | you can do a hell of a lot 147 | in a few lines of code. 148 | 2. The obvious implementation is pretty slow; 149 | it spends a lot of time re-parsing 150 | things it’s already parsed 151 | and playing Cheese Shop with the next character. 152 | (“Have you got a 0?” “No.” 153 | “How about a 1?” “No.” ...) 154 | It turns out there are ways to solve this, 155 | although I don’t explore them in this document. 156 | 3. They have trouble with “left recursion”, 157 | which is where the first thing in a “foo” (say, `sentence`) 158 | can be a smaller “foo”. 159 | 160 | There’s one more big feature of PEGs: 161 | the ability to do negative lookahead, or negation. 162 | As an example, in C, 163 | a comment begins at a `/*` 164 | and continues until the next `*/`. 165 | But you can have `*` and `/` and even `/*` inside the comment, 166 | as long as there isn't a `*/`. 167 | Doing this in a regexp is a real pain and the result is unreadable. 168 | You end up with a regexp like `\/\*([^*]|\*[^/])*\*\/`, 169 | assuming you have to backslash your slashes. 170 | In a PEG, it looks like this: 171 | 172 | # in the C comment example PEG: 173 | comment <- '/*' (!'*/' char)* '*/'. 174 | 175 | That is, to parse a comment, first parse a `/*`, 176 | then as long as the next thing isn’t a `*/`, try to parse a `char`, 177 | and then parse a `*/`. 178 | (The `*` means “zero or more”, 179 | just as `+` means “one or more”.) 
180 | You can write the same thing 181 | with Perl’s enhanced regexp features: `qr|/\*(?:(?!\*/).)*\*/|`, 182 | and it’s only slightly shorter, 183 | but I think it's not as clear. 184 | 185 | You might think that in `!'*/' char`, 186 | the negation of `'*/'` somehow *modifies* `char`. 187 | But it doesn’t, really; 188 | it just means that the parse fails 189 | at points in the input 190 | where `'*/'` can match, 191 | so `char` 192 | doesn’t get a chance to match there. 193 | Instead, we backtrack from matching the `'*/'`, 194 | break out of the loop, 195 | and get a chance to match the `'*/'` on the outside. 196 | 197 | You can use this magic PEG power 198 | for a variety of things that are traditionally painful. 199 | For example, most programming languages have keywords, 200 | which look like variables (or other identifiers) 201 | but are treated differently syntactically. 202 | In a PEG, you can write this: 203 | 204 | # in the keyword example PEG: 205 | keyword = ('if' / 'while' / 'for' / otherkeyword) !idchar. 206 | identifier = !keyword idstartchar idchar*. 207 | 208 | This first specifies that a `keyword` 209 | is one of the specified words 210 | as long as it's not followed by an `idchar`; 211 | then it specifies that when you’re trying to parse an `identifier`, 212 | first try to parse a `keyword`, 213 | and if that succeeds, 214 | then parsing the `identifier` should fail; 215 | but if there's no `keyword`, 216 | go ahead and try to parse an `idstartchar` 217 | followed by zero or more `idchar`s. 218 | 219 | Note that we throw away the results 220 | of trying to parse the `keyword` — 221 | we were only trying it in order to see 222 | if we shouldn’t do something else. 223 | 224 | ### If You’ve Taken a Compilers Class Lately ### 225 | 226 | I thought I’d stick this section in 227 | for the benefit of folks who are all up on the theory. 228 | The rest of the document 229 | doesn’t depend on it. 
230 | 
231 | PEGs specify how to *parse* a language,
232 | by contrast with context-free grammars,
233 | which primarily describe how to *generate* sentences of a language.
234 | This difference
235 | makes it much easier to construct parsers for PEGs;
236 | they can be straightforwardly converted
237 | into simple recursive-descent parsers
238 | performing limited backtracking,
239 | with each nonterminal becoming a parsing function.
240 | It also probably makes it much more difficult
241 | to prove properties of the language recognized by a PEG.
242 | 
243 | PEGs can parse some languages
244 | that context-free grammars can’t,
245 | such as the language `a`<sup>n</sup>`b`<sup>n</sup>`c`<sup>n</sup>,
246 | that is,
247 | some number of `a`s,
248 | followed by the same number of `b`s,
249 | followed by the same number of `c`s.
250 | However, because PEGs can’t handle ambiguity,
251 | and because there’s a linear-time parsing algorithm for them,
252 | it is suspected that PEGs can’t parse
253 | all languages context-free grammars can.
254 | `S → a S a | a S b | b S a | b S b | a`
255 | is a simple context-free language
256 | which Ford conjectured cannot be parsed with a PEG;
257 | it describes strings of odd numbers of `a`s and `b`s
258 | in which the middle letter is an `a`.
259 | PEGs can parse all languages
260 | that can be parsed with LL(k) or LR(k) parsers.
261 | 
262 | PEGs are more composable
263 | than LL(k) or LR(k) CFGs;
264 | because PEGs can’t handle ambiguity,
265 | it’s easy to predict the effect
266 | of adding new parsing rules to the grammar.
267 | 
268 | You can parse general CFGs
269 | with a backtracking approach like the PEG approach;
270 | the difference is that each nonterminal
271 | must be able to succeed multiple times
272 | on the same input
273 | with different possible parses,
274 | in case something that follows it fails.
275 | Definite clause grammars in Prolog
276 | are one example of this strategy.
277 | In PEGs, once a nonterminal succeeds at some position, 278 | it throws away its backtracking state, 279 | so it can only produce at most one result at that position. 280 | As a consequence, 281 | even though there are PEGs that take exponential time to parse 282 | (if implemented the naïve way) 283 | CFGs with exponential-time parsing 284 | (again, if implemented the naïve way, as with DCGs) 285 | are much more common. 286 | 287 | (Allan Schiffman tells me 288 | that all you really need to do to make DCGs perform well 289 | is to put cuts in “the obvious places”, 290 | e.g. between statements. 291 | I haven’t tried it myself.) 292 | 293 | A Minimal PEG Language 294 | ---------------------- 295 | 296 | The expressions in PEGs minimally contain 297 | (using the TDPL notation in the thesis) 298 | negation `!`, 299 | ordered choice or alternation `/`, 300 | concatenation or sequencing (denoted by juxtaposition), 301 | terminal strings (written in single quotes `''`), 302 | and nonterminals (written as bare words `foo`). 303 | (We can leave out repetition `*` and `+`, 304 | because as shown below, 305 | we can synthesize them.) 306 | 307 | Here’s a relatively minimal grammar 308 | describing a notation for a grammar 309 | with these features, 310 | the same one I used in the “Gentle Introduction” section, 311 | written in terms of itself: 312 | 313 | # in a minimal parsing expression grammar: 314 | _ <- sp _ / . 315 | sp <- ' ' / '\n' / '\t'. 316 | sentence <- _ rule sentence / _ rule. 317 | rule <- name _ '<-'_ choice '.'_. 318 | choice <- sequence '/'_ choice / sequence. 319 | sequence <- term sequence / . 320 | term <- '!'_ term / '\'' stringcontents '\''_ / name _. 321 | stringcontents <- stringchar stringcontents / . 322 | stringchar <- !'\\' !'\'' char / '\\' char. 323 | name <- namechar name / namechar. 324 | namechar <- !'!' !'\'' !sp !'<-' !'/' !'.' char. 
325 | 326 | This all depends on the primitive nonterminal `char`, 327 | which I’m assuming matches any character, 328 | for some definition of character. 329 | 330 | The nonterminal `_` consumes any amount of whitespace. 331 | It’s used everywhere we want to consume whitespace, 332 | generally at the lowest possible level of the grammar, 333 | with the exception of `name` 334 | (on the theory that the whitespace 335 | is not really part of the name.) 336 | (Even though it has a funny non-alphabetic name, 337 | the language doesn’t treat it specially. 338 | I used to call it `s` but it was distracting.) 339 | 340 | There are three cases of the pattern `group <- item group / .`, 341 | which means `group` is zero or more things that match `item`. 342 | Because PEGs are greedy and don’t backtrack after returning, 343 | `group` will only ever parse 344 | the maximum possible number of `item` items. 345 | It’s not possible for a parsing failure after the `group` 346 | to cause `group` to backtrack and return a smaller number of `item` objects, 347 | the way it could in a parser for a context-free grammar, 348 | although a parsing failure inside the last `item` will indeed do so. 349 | This allows us to get by 350 | without a separate scanner for this grammar! 351 | One minor variation of this pattern 352 | is found in `sentence` and `name`, 353 | which match *one* or more of their elements, 354 | not *zero* or more. 355 | 356 | Note that the above grammar tells us how to parse the language, 357 | but doesn’t tell us anything about its semantics. 358 | But it’s nice and short. 359 | 360 | Adding Grouping 361 | --------------- 362 | 363 | The PEG language as written above is pretty weak. 364 | It doesn’t have grouping or repetition, 365 | although they can be emulated with the use of extra productions, 366 | as in the `foos` pattern explained above. 
367 | 368 | We can add grouping by redefining `term` like this: 369 | 370 | # in a slightly more powerful parsing expression grammar: 371 | term <- '!'_ term / '\'' stringcontents '\''_ / name _ 372 | / '('_ choice ')'_. 373 | 374 | This simplifies the grammar only slightly; 375 | we can rewrite `stringcontents` as follows: 376 | 377 | stringcontents <- (!'\\' !'\'' char / '\\' char) stringcontents / . 378 | 379 | A Diversion: Adding Repetition 380 | ------------------------------ 381 | 382 | Although it turns out not to be very useful 383 | for what I’ll do next, 384 | adding the capability for repetition to the language 385 | makes it shorter and clearer. 386 | 387 | # in a more powerful PEG: 388 | sp <- ' ' / '\n' / '\t'. 389 | _ <- sp*. 390 | sentence <- _ (name _ '<-'_ choice '.'_)+. 391 | choice <- term* ('/'_ term*)*. 392 | term <- ('!'_ term / string / name / '('_ choice ')')_ ('+' / '*' / )_. 393 | string <- '\'' (!'\\' !'\'' char / '\\' char)* '\''_. 394 | meta <- '!' / '\'' / '<-' / '/' / '.' / '+' / '*' / '(' / ')'. 395 | name <- (!meta !sp char)+. 396 | 397 | That shrinks the grammar considerably, 398 | while significantly expanding 399 | the expressiveness of the grammar language it describes. 400 | 401 | Adding Result Expressions 402 | ------------------------- 403 | 404 | In theory, the grammar as written could be useful. 405 | It’s expressive enough to describe 406 | the tree structure of a language, 407 | such as the PEG language defined above. 408 | So you could use it to parse some string 409 | into a syntax tree. 410 | 411 | However, 412 | it would be even more useful 413 | to have a version of the grammar language 414 | that can include result expressions 415 | written in some programming language 416 | that compute useful things. 417 | For example, 418 | you could use such a system 419 | to write and maintain a working compiler 420 | from PEG grammars to some programming language, 421 | or from some other language. 
422 | 423 | A straightforward and readable way to do this 424 | is to label some parts of a sequence with names, 425 | and then to use those names in a result specification 426 | at the end of the sequence. 427 | 428 | Here’s an extension of the above grammar 429 | that allows for such names and result specifications: 430 | 431 | # in a PEG describing results: 432 | sp <- ' ' / '\n' / '\t'. 433 | _ <- sp _ / . 434 | sentence <- _ rule sentence / _ rule. 435 | rule <- name _ '<-'_ choice '.'_. 436 | choice <- sequence '/'_ choice / sequence. 437 | sequence <- term sequence / '->'_ expr / . 438 | expr <- '('_ exprcontents ')'_. 439 | exprcontents <- (!'(' !')' char / expr) exprcontents / . 440 | term <- name _ ':'_ term / '!'_ term / string / name _ 441 | / '('_ choice ')'_. 442 | string <- '\'' stringcontents '\''_. 443 | stringcontents <- !'\\' !'\'' char stringcontents 444 | / '\\' char stringcontents / . 445 | meta <- '!' / '\'' / '<-' / '/' / '.' / '(' / ')' / ':' / '->'. 446 | name <- namechar name / namechar. 447 | namechar <- !meta !sp char. 448 | 449 | This adds the possibility 450 | that a term may be preceded by a colon and a name, 451 | and that a sequence may end 452 | with a `->` and a parenthesized expression. 453 | 454 | This lets you write things like 455 | `n: expr` 456 | and `expr _ -> (print("got expr"))`. 457 | It doesn’t place strong requirements 458 | on the embedded expression, 459 | so it can be in almost any language, 460 | but it does require that any parentheses inside of it 461 | be balanced. 462 | (If that's difficult in a certain case, 463 | due to embedded strings, 464 | maybe you can incorporate some commented-out parentheses 465 | to balance things.) 466 | 467 | A Metacircular Compiler-Compiler 468 | -------------------------------- 469 | 470 | So let’s suppose that we want to use this result-expression facility 471 | to write a compiler for these grammars, 472 | producing a parser for the specified grammar 473 | in, say, JavaScript. 
474 | We want to translate each parsing expression 475 | in the grammar language 476 | into an expression in the target language 477 | that parses 478 | the sub-language defined by that parsing expression. 479 | For example, 480 | we want to translate 481 | `choice <- sequence '/'_ choice / sequence.` 482 | into a recursive JavaScript function 483 | that parses expressions containing slash-separated `choice`s. 484 | Since it doesn’t specify a result expression, 485 | it’s sort of indeterminate what it should actually do, 486 | other than consume characters from the input stream 487 | until it finds something `choice` can't parse. 488 | 489 | So now we have to figure out 490 | what the semantics are 491 | of each of the various actions. 492 | 493 | I’m going to factor out 494 | the code generation parts 495 | into separate named blocks 496 | so that it’s relatively easy 497 | to have the parser, say, 498 | generate code in some other language, 499 | or just an abstract syntax tree. 500 | 501 | ### Whitespace ### 502 | 503 | Whitespace is fairly easy: 504 | it is a no-op. 505 | 506 | # in the metacircular compiler-compiler: 507 | sp <- ' ' / '\n' / '\t'. 508 | _ <- sp _ / . 509 | 510 | ### Rules ### 511 | 512 | Let’s compile each rule 513 | into a JavaScript function 514 | that parses the language described by that rule, 515 | and the grammar as a whole 516 | into the collection of these functions 517 | plus whatever support code is needed. 518 | (Here I’m going to use double angle-brackets `<<>>` 519 | to name chunks of code that aren’t given until later.) 520 | 521 | rule <- n: name _ '<-'_ body: choice '.'_ -> 522 | <> 523 | . 524 | sentence <- _ r: rule g: sentence -> (r + "\n" + g) 525 | / _ r: rule -> (r + "\n" 526 | <> 527 | ). 
528 | 529 | The code to produce a function 530 | in JavaScript 531 | is quite straightforward: 532 | 533 | # in code to produce a function: 534 | (["function parse_", n, "(input, pos) {\n", 535 | <> 536 | body, 537 | <> 538 | "}\n"].join('')) 539 | 540 | So a grammar nonterminal named `term` 541 | will be compiled into a function called `parse_term`, 542 | whose body will be the value computed by `choice`, 543 | bracketed by some startup and cleanup code, 544 | and therefore `choice` needs to evaluate to 545 | a string of 546 | zero or more valid JavaScript statements. 547 | 548 | These functions 549 | will need to do several things 550 | to implement the semantics of a PEG parser: 551 | 552 | 1. Advance the input position, 553 | starting from the input position the caller passed in, 554 | and in case of success, 555 | communicate the new input position 556 | to the caller. 557 | 2. Save the input position 558 | (and any other state) 559 | in order to backtrack 560 | when a sequence inside a choice fails, 561 | or after testing a negation condition. 562 | They may have to save 563 | several input positions at once 564 | in cases where there is nested alternation. 565 | 3. Compute the value 566 | given by the result expressions in the grammar 567 | and, in case of success, 568 | pass it back to the caller, 569 | along with the new input position. 570 | 571 | In order to avoid global variables, 572 | we’re passing in the input string 573 | (which doesn’t change during a parse) 574 | and the current position in it 575 | as arguments to each parsing function. 576 | 577 | To package the value computed 578 | along with the new input position, 579 | we’ll return a JavaScript object 580 | with `val` and `pos` properties, 581 | like `{val: "foo", pos: 37}`. 582 | In case of failure, 583 | we’ll just return `null`. 584 | 585 | From here we’ll mostly work bottom-up. 
586 | 
587 | ### Names ###
588 | 
589 | Names are used in two contexts:
590 | at the top level of a rule,
591 | they define the name of the nonterminal,
592 | and in a term,
593 | they request a call to that nonterminal.
594 | In both cases,
595 | we basically just need the contents of the name.
596 | 
597 | # in the metacircular compiler-compiler:
598 | meta <- '!' / '\'' / '<-' / '/' / '.' / '(' / ')' / ':' / '->'.
599 | name <- c: namechar n: name -> (c + n) / namechar.
600 | namechar <- !meta !sp char.
601 | 
602 | In this case,
603 | we presume that the value produced by `char`
604 | (and thus the value produced by `namechar`)
605 | is the character it consumed,
606 | and that in the absence of an explicit result expression,
607 | the result of the whole rule
608 | is that same character.
609 | This can be implemented, for example,
610 | by having a sequence return by default
611 | the value of the last term in it.
612 | (I’m not sure that’s a good default,
613 | because it seems a little error-prone,
614 | but I’ll try it.)
615 | 
616 | ### Nonterminals ###
617 | 
618 | A reference to a nonterminal
619 | is compiled as a call to its parsing function,
620 | passing in the current position.
621 | 
622 | # in the metacircular compiler-compiler:
623 | term <- labeled / nonterminal / string / negation / parenthesized.
624 | nonterminal <- n: name _ ->
625 | <<code to parse another nonterminal>>
626 | .
627 | 
628 | Again, the JS implementation
629 | of a subroutine call
630 | is quite simple:
631 | 
632 | # in code to parse another nonterminal:
633 | ([' state = parse_', n, '(input, state.pos);\n'].join(''))
634 | 
635 | This means we need a variable `state`
636 | to store this returned value in,
637 | and it needs to be initialized
638 | with the position passed in by the caller.
639 | 
640 | # in function prologue:
641 | ' var state = { pos: pos };\n',
642 | 
643 | What do we do with `state.val`?
644 | It depends on where the nonterminal is found.
645 | If it’s preceded by a label, 646 | we want to store it in a variable 647 | under that name 648 | for later use, 649 | unless it fails. 650 | Let’s have `term`, 651 | just like `choice`, 652 | return a string of zero or more valid JavaScript statements. 653 | 654 | # in the metacircular compiler-compiler: 655 | labeled <- label: name _ ':'_ value: term -> 656 | <> 657 | . 658 | 659 | We protect this with a conditional on `state` 660 | in case the parse has failed: 661 | 662 | # in code to save a value in a variable: 663 | ([value, ' if (state) var ', label, ' = state.val;\n'].join('')) 664 | 665 | (Ideally we would undo this saving 666 | if the nonterminal is in an alternative 667 | that fails and ends up being backtracked; 668 | but hopefully the result expressions 669 | of later alternatives 670 | will simply not use that variable.) 671 | 672 | Now, 673 | if the nonterminal 674 | was the last thing in a parsing function, 675 | then we want to return the `state.val` it gave us 676 | as our own `state.val`, 677 | and additionally we want to return its `state.pos` 678 | as our `state.pos`; 679 | or, if it failed, 680 | it returned `null`, 681 | in which case we want to return `null`. 682 | 683 | So at the end of the function, 684 | we can just return `state`: 685 | 686 | # in function epilogue: 687 | ' return state;\n', 688 | 689 | Now we just need to ensure 690 | that all of the other expression types 691 | (sequence, terminal strings, ordered choice, negation, parenthesized) 692 | update `state` in a manner analogous 693 | to how calls to nonterminals update `state`. 
694 | 695 | While we're on the topic of nonterminals, 696 | we should probably define the one predefined nonterminal, 697 | `char`: 698 | 699 | # in support code: 700 | + 'function parse_char(input, pos) {\n' 701 | + ' if (pos >= input.length) return null;\n' 702 | + ' return { pos: pos + 1, val: input.charAt(pos) };\n' 703 | + '}\n' 704 | 705 | ### Sequence ### 706 | 707 | Sequences are relatively simple. 708 | Given a sequence of two expressions `foo bar`, 709 | we first parse `foo` from the current position, 710 | and if that succeeded, 711 | we parse `bar` from the new position. 712 | If it fails, 713 | the sequence as a whole fails, 714 | and there is no current position. 715 | 716 | This is one of the things 717 | that is easier to do 718 | if you don’t try to write your grammar with features like `*`, 719 | since it treats sequences of arbitrary numbers of things 720 | as nested sequences of two items, 721 | the innermost of which is empty. 722 | 723 | # in the bare grammar: 724 | sequence <- term sequence / '->'_ expr / . 725 | 726 | The case of an empty sequence 727 | doesn’t update `state` at all. 728 | In the case of a non-empty sequence, 729 | we execute `foo`, 730 | and if `foo` doesn’t set `state` to `null`, 731 | we execute `bar`. 732 | 733 | # in the metacircular compiler-compiler: 734 | sequence <- foo: term bar: sequence -> 735 | <> 736 | / result_expression / -> (''). 737 | 738 | The `result_expression` case 739 | is one of the last things explained, 740 | so ignore it for now. 741 | 742 | This will result in deeply nested if statements 743 | without proper indentation 744 | in the output 745 | when there is a long sequence, 746 | but that’s probably okay: 747 | 748 | # in code to handle a sequence: 749 | ([foo, ' if (state) {\n', bar, ' }\n'].join('')) 750 | 751 | ### Terminal Strings ### 752 | 753 | A “terminal” or literal string like `'->'` 754 | either matches some characters in the input 755 | or fails to do so. 
756 | Rather than inserting code into every parsing function 757 | to compare parts of the input, 758 | making the parsing functions less readable, 759 | we’ll factor this out into a single “literal” function: 760 | 761 | # in support code: 762 | + 'function literal(input, pos, string) {\n' 763 | + ' if (input.substr(pos, string.length) === string) {\n' 764 | + ' return { pos: pos + string.length, val: string };\n' 765 | + ' } else return null;\n' 766 | + '}\n' 767 | 768 | So then we just need to emit code to call this function 769 | and update `state` appropriately 770 | when we encounter a terminal string. 771 | As it happens, 772 | the translation from string syntax in the PEG language 773 | to string syntax in JavaScript 774 | is the null transformation. 775 | If we were compiling to some other language, 776 | such as C, 777 | this might pose some difficulty. 778 | 779 | # in the metacircular compiler-compiler: 780 | string <- '\'' s: stringcontents '\''_ -> 781 | <> 782 | . 783 | stringcontents <- !'\\' !'\'' c: char s: stringcontents -> (c + s) 784 | / b: '\\' c: char s: stringcontents -> (b + c + s) 785 | / -> (''). 786 | 787 | So here’s the function call: 788 | 789 | # in code to match a literal string: 790 | ([" state = literal(input, state.pos, '", s, "');\n"].join('')) 791 | 792 | As we iterate through the characters or backslash-escapes 793 | inside the string, we convert them to strings — 794 | either by default, 795 | or explicitly by concatenating the backslash 796 | to the character that follows it. 797 | Then we call `literal` 798 | with the current position 799 | and it either returns `null` 800 | or gives us the new position and the value it matched 801 | as our new `state`. 802 | 803 | ### Ordered Choice ### 804 | 805 | Two of the remaining expression types 806 | (ordered choice, negation, but not terminal strings and parenthesized) 807 | can require backtracking. 808 | So we have to save a state 809 | and possibly restore that state. 
810 | 811 | Here’s how ordered choice works; 812 | negation is fairly similar. 813 | In ordered choice, 814 | if the first alternative succeeds, 815 | we don’t try the others; 816 | but if it fails, 817 | we restore the previously saved state. 818 | 819 | This is complicated somewhat 820 | by the fact that we might be inside a parenthesized expression, 821 | so there may be a stack of previously saved states, 822 | even inside the same function. 823 | 824 | So on entry to the function, we create a stack: 825 | 826 | # in function prologue: 827 | ' var stack = [];\n', 828 | 829 | The grammar entry treats N-way choices 830 | like `labeled / negation / string / nonterminal / parenthesized` 831 | as nested 2-way choices 832 | like `labeled / (negation / (string / (nonterminal / parenthesized)))`. 833 | This is a little bit needlessly inefficient, 834 | since we’ll be using potentially four stack entries 835 | instead of one, 836 | but it will do for now. 837 | 838 | # in the metacircular compiler-compiler: 839 | choice <- a: sequence '/'_ b: choice -> 840 | <> 841 | / sequence. 842 | 843 | Execution of `b` is conditional on failure of `a`; 844 | if `a` succeeds, 845 | we simply discard the state 846 | we saved before trying it. 847 | 848 | # in code to handle a choice: 849 | ([' stack.push(state);\n', 850 | a, 851 | ' if (!state) {\n', 852 | ' state = stack.pop();\n', 853 | b, 854 | ' } else stack.pop();\n'].join('')) 855 | 856 | It’s only safe to push `state` 857 | rather than a copy of `state` 858 | because we never mutate the existing `state`; 859 | we only make new `state` objects. 860 | 861 | ### Negation ### 862 | 863 | Negation is `!x`: 864 | 865 | # in the metacircular compiler-compiler: 866 | negation <- '!'_ t: term -> 867 | <> 868 | . 869 | 870 | This is implemented by saving the parse state, 871 | trying to parse `x`, 872 | failing if parsing `x` succeeded, 873 | and otherwise proceeding from the saved parse state. 
874 | 875 | # in code to handle negation: 876 | ([' stack.push(state);\n', 877 | t, 878 | ' if (state) {\n', 879 | ' stack.pop();\n', 880 | ' state = null;\n', 881 | ' } else state = stack.pop();\n'].join('')) 882 | 883 | You can use a double negative like `!!'->'` 884 | to write a “zero-width positive lookahead assertion” in Perl lingo. 885 | That compiles into this: 886 | 887 | # in the output of the compiler-compiler: 888 | stack.push(state); 889 | stack.push(state); 890 | state = literal(input, state.pos, '->'); 891 | if (state) { 892 | stack.pop(); 893 | state = null; 894 | } else state = stack.pop(); 895 | if (state) { 896 | stack.pop(); 897 | state = null; 898 | } else state = stack.pop(); 899 | 900 | The initial `state` is assumed to be non-`null`. 901 | So after the call to `literal`, 902 | `state` is non-`null` iff the next couple of characters were `->`. 903 | Then, after the first `if`, 904 | `state` is non-`null` iff the next couple of characters *weren’t* `->`. 905 | Then, after the second `if`, 906 | it is again non-`null` iff the next couple of characters were `->`. 907 | And if it’s non-`null`, 908 | it’s the `state` you started with. 909 | 910 | So that does the right thing, 911 | perhaps a bit verbosely. 912 | 913 | ### Result Expressions ### 914 | 915 | A result expression 916 | gives a JavaScript expression 917 | to evaluate 918 | to get the value that a sequence parses to. 919 | Normally, it uses variable bindings 920 | produced by labels. 921 | The value it returns 922 | may become 923 | the value of the term (if the sequence is inside parentheses) 924 | or the value returned by a whole parsing function. 925 | 926 | # in the metacircular compiler-compiler: 927 | result_expression <- '->'_ result: expr _ -> 928 | <> 929 | . 930 | 931 | Note the `_` to discard whitespace. 
932 | 933 | Of course, 934 | this is conditional 935 | on the parser not being in a failed state: 936 | 937 | # in code to handle result expressions: 938 | ([' if (state) state.val = ', result, ';\n'].join('')) 939 | 940 | The expression is delimited by parentheses `()`. 941 | The outermost pair of parentheses 942 | are kept, 943 | which simplifies the grammar 944 | and avoids tricky problems of operator precedence 945 | when the result expression is copied into the output program 946 | in the `state.val =` context above. 947 | 948 | # in the metacircular compiler-compiler: 949 | expr <- '('_ e: exprcontents ')' -> ('(' + e + ')'). 950 | exprcontents <- c: (!'(' !')' char / expr) e: exprcontents -> (c + e) 951 | / -> (''). 952 | 953 | `result_expression` discards whitespace after the expression 954 | rather than having the expression production do it itself 955 | in order to preserve whitespace after right parens 956 | consumed by recursive calls to the expression production. 957 | 958 | ### Parenthesized Expressions ### 959 | 960 | Parenthesized expressions 961 | don’t need any real special handling; 962 | or, rather, the special handling 963 | consists of the `stack` variable everything uses to backtrack; 964 | the parentheses are only there 965 | to direct the parser how to parse `/` and `!` and so on. 966 | 967 | parenthesized <- '('_ body: choice ')'_ -> (body). 968 | 969 | ### Exporting ### 970 | 971 | We need one more thing 972 | if our grammar is to be loadable as a [CommonJS module][] 973 | by systems like [node.js][]: 974 | 975 | # in support code: 976 | + "if (typeof exports !== 'undefined')\n" 977 | + " exports.parse_sentence = parse_sentence;\n" 978 | 979 | [CommonJS module]: http://wiki.commonjs.org/wiki/Modules/1.1#Module_Context 980 | [node.js]: http://nodejs.org/ 981 | 982 | This assumes that the grammar being processed 983 | has a production called `sentence`, 984 | which is the only thing that will be exported. 
985 | 986 | The Whole Metacircular Compiler-Compiler 987 | ---------------------------------------- 988 | 989 | Here’s the whole thing, 990 | extracted from this document: 991 | 992 | # in the output metacircular compiler-compiler: 993 | sp <- ' ' / '\n' / '\t'. 994 | _ <- sp _ / . 995 | rule <- n: name _ '<-'_ body: choice '.'_ -> 996 | (["function parse_", n, "(input, pos) {\n", 997 | ' var state = { pos: pos };\n', 998 | ' var stack = [];\n', 999 | body, 1000 | ' return state;\n', 1001 | "}\n"].join('')) 1002 | . 1003 | sentence <- _ r: rule g: sentence -> (r + "\n" + g) 1004 | / _ r: rule -> (r + "\n" 1005 | + 'function parse_char(input, pos) {\n' 1006 | + ' if (pos >= input.length) return null;\n' 1007 | + ' return { pos: pos + 1, val: input.charAt(pos) };\n' 1008 | + '}\n' 1009 | + 'function literal(input, pos, string) {\n' 1010 | + ' if (input.substr(pos, string.length) === string) {\n' 1011 | + ' return { pos: pos + string.length, val: string };\n' 1012 | + ' } else return null;\n' 1013 | + '}\n' 1014 | + "if (typeof exports !== 'undefined')\n" 1015 | + " exports.parse_sentence = parse_sentence;\n" 1016 | ). 1017 | meta <- '!' / '\'' / '<-' / '/' / '.' / '(' / ')' / ':' / '->'. 1018 | name <- c: namechar n: name -> (c + n) / namechar. 1019 | namechar <- !meta !sp char. 1020 | term <- labeled / nonterminal / string / negation / parenthesized. 1021 | nonterminal <- n: name _ -> 1022 | ([' state = parse_', n, '(input, state.pos);\n'].join('')) 1023 | . 1024 | labeled <- label: name _ ':'_ value: term -> 1025 | ([value, ' if (state) var ', label, ' = state.val;\n'].join('')) 1026 | . 1027 | sequence <- foo: term bar: sequence -> 1028 | ([foo, ' if (state) {\n', bar, ' }\n'].join('')) 1029 | / result_expression / -> (''). 1030 | string <- '\'' s: stringcontents '\''_ -> 1031 | ([" state = literal(input, state.pos, '", s, "');\n"].join('')) 1032 | . 
1033 | stringcontents <- !'\\' !'\'' c: char s: stringcontents -> (c + s) 1034 | / b: '\\' c: char s: stringcontents -> (b + c + s) 1035 | / -> (''). 1036 | choice <- a: sequence '/'_ b: choice -> 1037 | ([' stack.push(state);\n', 1038 | a, 1039 | ' if (!state) {\n', 1040 | ' state = stack.pop();\n', 1041 | b, 1042 | ' } else stack.pop();\n'].join('')) 1043 | / sequence. 1044 | negation <- '!'_ t: term -> 1045 | ([' stack.push(state);\n', 1046 | t, 1047 | ' if (state) {\n', 1048 | ' stack.pop();\n', 1049 | ' state = null;\n', 1050 | ' } else state = stack.pop();\n'].join('')) 1051 | . 1052 | result_expression <- '->'_ result: expr _ -> 1053 | ([' if (state) state.val = ', result, ';\n'].join('')) 1054 | . 1055 | expr <- '('_ e: exprcontents ')' -> ('(' + e + ')'). 1056 | exprcontents <- c: (!'(' !')' char / expr) e: exprcontents -> (c + e) 1057 | / -> (''). 1058 | parenthesized <- '('_ body: choice ')'_ -> (body). 1059 | 1060 | That’s 66 lines of code, 1061 | constituting a compiler 1062 | that can compile itself into JavaScript, 1063 | if you have a way to execute it. 1064 | 1065 | **XXX: a couple of lines are over 80 chars; fix this!** 1066 | 1067 | Bootstrapping to JavaScript 1068 | --------------------------- 1069 | 1070 | But, to actually execute this compiler-compiler, 1071 | you need a version already running, 1072 | so you can compile the compiler-compiler to JavaScript. 1073 | 1074 | ### Hand-compiling: a blind alley ### 1075 | 1076 | I started by trying to compile it by hand, 1077 | using YASnippet, 1078 | but after not very long, I gave up on that approach. 
1079 | Here are the hand-compiled versions of 1080 | `sp <- ' ' / '\n' / '\t'.` 1081 | and `_ <- sp _ / .` 1082 | 1083 | # in the hand-compiled metacircular compiler-compiler: 1084 | function parse_sp(input, pos) { 1085 | var state = { pos: pos }; 1086 | var stack = []; 1087 | stack.push(state); 1088 | state = literal(input, state.pos, ' '); 1089 | if (!state) { 1090 | state = stack.pop(); 1091 | stack.push(state); 1092 | state = literal(input, state.pos, '\n'); 1093 | if (!state) { 1094 | state = stack.pop(); 1095 | state = literal(input, state.pos, '\t'); 1096 | } else { 1097 | stack.pop(); 1098 | } 1099 | } else { 1100 | stack.pop(); 1101 | } 1102 | return state; 1103 | } 1104 | function parse__(input, pos) { 1105 | var state = { pos: pos }; 1106 | var stack = []; 1107 | stack.push(state); 1108 | state = parse_sp(input, state.pos); 1109 | if (state) { 1110 | state = parse__(input, state.pos); 1111 | } 1112 | if (!state) { 1113 | state = stack.pop(); 1114 | } else { 1115 | stack.pop(); 1116 | } 1117 | return state; 1118 | } 1119 | 1120 | After thus inflating two lines of grammar 1121 | into 35 lines of JavaScript, 1122 | I knew I needed a better way. 1123 | At that rate, 1124 | the whole thing would be about 1200 lines. 1125 | That’s too much to debug, 1126 | even if YASnippet makes it relatively easy to type, 1127 | unless there's no easier way. 1128 | 1129 | But there is. 1130 | 1131 | ### A Bunch of Functions ### 1132 | 1133 | So, instead, 1134 | I'm writing one function 1135 | for each interesting recognition rule from the grammar, 1136 | returning the same result expressions 1137 | that the parsing function will. 1138 | Then I can construct 1139 | a sort of abstract syntax tree of the grammar 1140 | out of calls to these functions, 1141 | and it will only be a little larger than the grammar itself. 
1142 | 1143 | For example, 1144 | the first rule `sp <- ' ' / '\n' / '\t'.` 1145 | will become: 1146 | 1147 | # in the ASTs made of function calls: 1148 | var sp_rule = rule('sp', choice(string(' '), choice(string('\\n'), 1149 | string('\\t')))); 1150 | 1151 | This is a bit of a cheat; 1152 | the innermost choice really parses as 1153 | `choice(sequence(string('\\n'), ''), sequence(string('\\t'), ''))` 1154 | but I'm hoping that doesn’t matter for now. 1155 | 1156 | Then at the end I can combine all of the variables 1157 | into a grammar. 1158 | 1159 | First I need the functions, though. 1160 | 1161 | I’m omitting `sp` 1162 | (likewise `_`, `meta`) 1163 | because they don’t produce interesting values. 1164 | 1165 | # in the bunch-of-functions version: 1166 | function rule(n, body) { 1167 | return (["function parse_", n, "(input, pos) {\n", 1168 | ' var state = { pos: pos };\n', 1169 | ' var stack = [];\n', 1170 | body, 1171 | ' return state;\n', 1172 | "}\n"].join('')); 1173 | } 1174 | 1175 | function sentence2(r, g) { 1176 | return (r + "\n" + g); 1177 | } 1178 | 1179 | function sentence1(r) { 1180 | return (r + "\n" 1181 | <> 1182 | ); 1183 | } 1184 | 1185 | I’m omitting `name` 1186 | (likewise `expr`, `inner`, `exprcontents`, `stringcontents`) 1187 | because it just copies a character string from the input 1188 | into the output. 1189 | I can do that myself. 1190 | And I’m omitting `term` 1191 | because it just returns one of its children's values. 
1192 | 1193 | function nonterminal(n) { 1194 | return [' state = parse_', n, '(input, state.pos);\n'].join(''); 1195 | } 1196 | function labeled(label, value) { 1197 | return [value, ' if (state) var ', label, ' = state.val;\n'].join(''); 1198 | } 1199 | function sequence(foo, bar) { 1200 | return [foo, ' if (state) {\n', bar, ' }\n'].join(''); 1201 | } 1202 | function string(s) { 1203 | return [" state = literal(input, state.pos, '", s, "');\n"].join(''); 1204 | } 1205 | function choice(a, b) { 1206 | return [ 1207 | ' stack.push(state);\n', 1208 | a, 1209 | ' if (!state) {\n', 1210 | ' state = stack.pop();\n', 1211 | b, 1212 | ' } else {\n', 1213 | ' stack.pop();\n', // discard unnecessary saved state 1214 | ' }\n'].join(''); 1215 | } 1216 | function negation(t) { 1217 | return [ 1218 | ' stack.push(state);\n', 1219 | t, 1220 | ' if (state) {\n', 1221 | ' stack.pop();\n', 1222 | ' state = null;\n', 1223 | ' } else {\n', 1224 | ' state = stack.pop();\n', 1225 | ' }\n'].join(''); 1226 | } 1227 | function result_expression(result) { 1228 | return [' state.val = ', result, ';\n'].join(''); 1229 | } 1230 | 1231 | We’ll also need the support code 1232 | from the `sentence` rule, 1233 | except for the exporting of `parse_sentence`. 1234 | 1235 | function parse_char(input, pos) { 1236 | if (pos >= input.length) return null; 1237 | return { pos: pos + 1, val: input.charAt(pos) }; 1238 | } 1239 | function literal(input, pos, string) { 1240 | if (input.substr(pos, string.length) === string) { 1241 | return { pos: pos + string.length, val: string }; 1242 | } else return null; 1243 | } 1244 | 1245 | Then, 1246 | after all those functions are defined, 1247 | we can call them to build up the ASTs. 
1248 | 1249 | <> 1250 | 1251 | The rule for `_` is quite straightforward: 1252 | 1253 | # in the ASTs made of function calls: 1254 | var __rule = rule('_', 1255 | choice(sequence(nonterminal('sp'), nonterminal('_')), 1256 | '')); 1257 | 1258 | The rule for `rule` 1259 | contains a rather long sequence, 1260 | which will be treated 1261 | as a deeply nested bunch 1262 | of two-element sequences. 1263 | But it’s hard to read and write it that way, 1264 | so I’m going to define a helper function `nseq` 1265 | to make a sequence of an arbitrary number 1266 | of sequence elements. 1267 | 1268 | function nseq() { 1269 | var rv = arguments[arguments.length-1]; 1270 | for (var ii = arguments.length-2; ii >= 0; ii--) 1271 | rv = sequence(arguments[ii], rv); 1272 | return rv; 1273 | } 1274 | 1275 | This will fail (returning `null`) 1276 | if we call it with no arguments, 1277 | so let’s be sure not do that. 1278 | Now we can define the rule for `rule`: 1279 | 1280 | var rule_rule = rule('rule', 1281 | nseq(labeled('n', nonterminal('name')), nonterminal('_'), 1282 | string('<-'), nonterminal('_'), 1283 | labeled('body', nonterminal('choice')), 1284 | string('.'), nonterminal('_'), 1285 | result_expression( 1286 | "[\"function parse_\", n, \"(input, pos) {\\n\",\n" + 1287 | " ' var state = { pos: pos };\\n',\n" + 1288 | " ' var stack = [];\\n',\n" + 1289 | " body, \n" + 1290 | " ' return state;\\n',\n" + 1291 | " \"}\\n\"].join('')"))); 1292 | 1293 | `rule_rule` is clearly pretty verbose; 1294 | it's 12 lines, 1295 | and the corresponding `rule` function is 8 lines, 1296 | for a total of 20 lines for the “hand-compiled” version 1297 | of the original 7-line `rule` rule. 1298 | That’s a manageable expansion factor of about 3×. 1299 | 1300 | So, on to `sentence`. 1301 | I’ve played fast and loose 1302 | with leading whitespace here, 1303 | in order to retain some modicum of readability. 
1304 | 1305 | var sentence_rule = rule('sentence', 1306 | choice( 1307 | nseq(nonterminal('_'), 1308 | labeled('r', nonterminal('rule')), 1309 | labeled('g', nonterminal('sentence')), 1310 | result_expression('r + "\\n" + g')), 1311 | nseq(nonterminal('_'), 1312 | labeled('r', nonterminal('rule')), 1313 | result_expression('r + "\\n"\n' + 1314 | "+ 'function parse_char(input, pos) {\\n'\n" + 1315 | "+ ' if (pos >= input.length) return null;\\n'\n" + 1316 | "+ ' return { pos: pos + 1, val: input.charAt(pos) };\\n'\n" + 1317 | "+ '}\\n'\n" + 1318 | "+ 'function literal(input, pos, string) {\\n'\n" + 1319 | "+ ' if (input.substr(pos, string.length) === string) {\\n'\n" + 1320 | "+ ' return { pos: pos + string.length, val: string };\\n'\n" + 1321 | "+ ' } else return null;\\n'\n" + 1322 | "+ '}\\n'\n" + 1323 | "+ 'if (typeof exports !== "+'"undefined"'+") {\\n'\n" + 1324 | "+ ' exports.parse_sentence = parse_sentence;\\n'\n" + 1325 | "+ '}\\n'\n")))); 1326 | 1327 | The quoting of the support code 1328 | is kind of confusing; 1329 | the original is one long string, 1330 | containing a bunch of `\n` newlines, 1331 | broken up into lines for readability, 1332 | joined by the `+` operator. 1333 | This version 1334 | is also one long string, 1335 | containing the lines of the original long string, 1336 | also broken up into lines for readability, 1337 | joined by the `+` operator. 1338 | So there are two levels of quoting. 1339 | The inner level has the `+` on the left and uses single quotes `''`, 1340 | and the outer level has the `+` on the right and uses double quotes `""`. 1341 | 1342 | The next rule is `meta`, 1343 | and it has a lot of `choice`s. 1344 | So we define something like `nseq`, 1345 | but for `choice`s. 
1346 | 1347 | function nchoice() { 1348 | var rv = arguments[arguments.length-1]; 1349 | for (var ii = arguments.length-2; ii >= 0; ii--) 1350 | rv = choice(arguments[ii], rv); 1351 | return rv; 1352 | } 1353 | 1354 | var meta_rule = rule('meta', 1355 | nchoice(string('!'), string('\\\''), string('<-'), string('/'), 1356 | string('.'), string('('), string(')'), string(':'), 1357 | string('->'))); 1358 | 1359 | 1360 | The next few rules 1361 | are straightforward translations from the grammar. 1362 | 1363 | var name_rule = rule('name', 1364 | choice(nseq(labeled('c', nonterminal('namechar')), 1365 | labeled('n', nonterminal('name')), 1366 | result_expression('c + n')), 1367 | nonterminal('namechar'))); 1368 | var namechar_rule = rule('namechar', 1369 | nseq(negation(nonterminal('meta')), 1370 | negation(nonterminal('sp')), nonterminal('char'))); 1371 | var term_rule = rule('term', 1372 | nchoice(nonterminal('labeled'), nonterminal('nonterminal'), 1373 | nonterminal('string'), nonterminal('negation'), 1374 | nonterminal('parenthesized'))); 1375 | var nonterminal_rule = rule('nonterminal', 1376 | nseq(labeled('n', nonterminal('name')), nonterminal('_'), 1377 | result_expression("[' state = parse_', n, " + 1378 | "'(input, state.pos);\\n'].join('')"))); 1379 | var labeled_rule = rule('labeled', 1380 | nseq(labeled('label', nonterminal('name')), nonterminal('_'), 1381 | string(':'), nonterminal('_'), 1382 | labeled('value', nonterminal('term')), 1383 | result_expression("[value, ' if (state) var ', " + 1384 | "label, ' = state.val;\\n'].join('')"))); 1385 | var sequence_rule = rule('sequence', 1386 | nchoice(nseq(labeled('foo', nonterminal('term')), 1387 | labeled('bar', nonterminal('sequence')), 1388 | result_expression("[foo, ' if (state) {\\n', " + 1389 | "bar, ' }\\n'].join('')")), 1390 | nonterminal('result_expression'), 1391 | sequence(result_expression("''")))); 1392 | 1393 | That’s 29 lines, 1394 | transliterating 12 lines from the grammar, 1395 | and now the 
transliteration is halfway done. 1396 | 1397 | var string_rule = rule('string', 1398 | nseq(string("\\'"), labeled('s', nonterminal('stringcontents')), 1399 | string("\\'"), nonterminal('_'), 1400 | result_expression('[" state = literal(input, state.pos, ' + 1401 | '\'", s, "\');\\n"].join(\'\')'))); 1402 | var stringcontents_rule = rule('stringcontents', 1403 | nchoice(nseq(negation(string("\\\\")), negation(string("\\'")), 1404 | labeled('c', nonterminal('char')), 1405 | labeled('s', nonterminal('stringcontents')), 1406 | result_expression('c + s')), 1407 | nseq(labeled('b', string("\\\\")), 1408 | labeled('c', nonterminal('char')), 1409 | labeled('s', nonterminal('stringcontents')), 1410 | result_expression('b + c + s')), 1411 | result_expression("''"))); 1412 | 1413 | For `choice` I’m omitting not only whitespace 1414 | but also a comment. 1415 | 1416 | var choice_rule = rule('choice', 1417 | choice(nseq(labeled('a', nonterminal('sequence')), 1418 | string('/'), nonterminal('_'), 1419 | labeled('b', nonterminal('choice')), 1420 | result_expression( 1421 | "[' stack.push(state);\\n',\n" + 1422 | " a,\n" + 1423 | " ' if (!state) {\\n',\n" + 1424 | " ' state = stack.pop();\\n',\n" + 1425 | " b,\n" + 1426 | " ' } else {\\n',\n" + 1427 | " ' stack.pop();\\n',\n" + 1428 | " ' }\\n'].join('')")), 1429 | nonterminal('sequence'))); 1430 | var negation_rule = rule('negation', 1431 | nseq(string('!'), nonterminal('_'), labeled('t', nonterminal('term')), 1432 | result_expression( 1433 | "[' stack.push(state);\\n',\n" + 1434 | " t,\n" + 1435 | " ' if (state) {\\n',\n" + 1436 | " ' stack.pop();\\n',\n" + 1437 | " ' state = null;\\n',\n" + 1438 | " ' } else {\\n',\n" + 1439 | " ' state = stack.pop();\\n',\n" + 1440 | " ' }\\n'].join('')"))); 1441 | var result_expression_rule = rule('result_expression', 1442 | nseq(string('->'), nonterminal('_'), 1443 | labeled('result', nonterminal('expr')), 1444 | result_expression("[' if (state) state.val = ', " + 1445 | "result, 
';\\n'].join('')"))); 1446 | var expr_rule = rule('expr', 1447 | nseq(string('('), nonterminal('_'), 1448 | labeled('e', nonterminal('exprcontents')), 1449 | string(')'), nonterminal('_'), 1450 | result_expression('e'))); 1451 | var inner_rule = rule('inner', 1452 | nseq(string('('), nonterminal('_'), 1453 | labeled('e', nonterminal('exprcontents')), 1454 | string(')'), 1455 | result_expression("'(' + e + ')'"))); 1456 | var exprcontents_rule = rule('exprcontents', 1457 | choice( 1458 | nseq(labeled('c', 1459 | choice(nseq(negation(string('(')), 1460 | negation(string(')')), 1461 | nonterminal('char')), 1462 | nonterminal('inner'))), 1463 | labeled('e', nonterminal('exprcontents')), 1464 | result_expression('c + e')), 1465 | result_expression("''"))); 1466 | var parenthesized_rule = rule('parenthesized', 1467 | nseq(string('('), nonterminal('_'), 1468 | labeled('body', nonterminal('choice')), 1469 | string(')'), nonterminal('_'), 1470 | result_expression('body'))); 1471 | 1472 | So that’s all the rules. 1473 | Now we just need to assemble them into a sentence, 1474 | using a technique similar to `nseq` and `nchoice`. 1475 | 1476 | function nsentence() { 1477 | var rv = sentence1(arguments[arguments.length-1]); 1478 | for (var ii = arguments.length-2; ii >= 0; ii--) 1479 | rv = sentence2(arguments[ii], rv); 1480 | return rv; 1481 | } 1482 | 1483 | var all_rules = nsentence(sp_rule, __rule, rule_rule, sentence_rule, 1484 | meta_rule, name_rule, namechar_rule, term_rule, 1485 | nonterminal_rule, labeled_rule, sequence_rule, 1486 | string_rule, stringcontents_rule, choice_rule, 1487 | negation_rule, result_expression_rule, expr_rule, 1488 | inner_rule, exprcontents_rule, parenthesized_rule); 1489 | 1490 | Now the variable `all_rules` 1491 | has a working parser in it 1492 | in JavaScript. 
1493 | 1494 | To get a usable `parse_sentence` function, 1495 | we need to `eval` that script: 1496 | 1497 | eval(all_rules); 1498 | 1499 | And then we can export the function: 1500 | 1501 | if (typeof exports !== 'undefined') exports.parse_sentence = parse_sentence; 1502 | 1503 | ### The Output Parser in JavaScript ### 1504 | 1505 | I used to include here 1506 | the contents of `all_rules` 1507 | after a couple of iterations. 1508 | It’s ten pages long (660 lines), 1509 | and the compile takes 1510 | about 3–5 seconds on my machine, 1511 | although it’s under 100ms on modern computers. 1512 | However, 1513 | I decided that it was too much to want to include it here; 1514 | this document is for reading. 1515 | If you `git clone` it, 1516 | it’s in `output.js`. 1517 | 1518 | Cross-Compiling to Lua 1519 | ---------------------- 1520 | 1521 | It was a lot of trouble 1522 | getting the short compiler-compiler above 1523 | to an actually runnable state; 1524 | I had to write and debug, 1525 | basically, 1526 | two copies of the same code. 1527 | It would have been much easier 1528 | if I’d already happened to have such a compiler-compiler around 1529 | that I could use to compile my grammar with. 1530 | 1531 | Well, 1532 | for the program I’m using 1533 | to extract the code from this document, 1534 | which I call HandAxeWeb, 1535 | I would like to have such a compiler-compiler 1536 | to generate code in Lua. 1537 | 1538 | So I’m going to define a “version 2” 1539 | of the compiler-compiler 1540 | which, 1541 | instead of generating JS code, 1542 | generates Lua code. 1543 | (It is still written in JS, though.) 
1544 | 1545 | First, 1546 | instead of producing JS functions for rules, 1547 | we produce Lua functions for rules: 1548 | 1549 | # in code to produce a function v2: 1550 | (['function parse_',n,'(input, pos)\n', 1551 | <<function prologue>> 1552 | body, 1553 | <<function epilogue>> 1554 | 'end\n'].join('')) 1555 | 1556 | Invoking nonterminals needs no change; 1557 | JS and Lua syntax overlap here. 1558 | But local variable declaration 1559 | and finite maps 1560 | look different: 1561 | 1562 | # in function prologue v2: 1563 | ' local state = { pos = pos }\n', 1564 | 1565 | We have to declare variables 1566 | outside their conditional; 1567 | Lua’s scoping rules here 1568 | change the semantics somewhat 1569 | because unless you declare the variables 1570 | at the top of the function 1571 | you can’t write a rule like 1572 | `x <- (bar y: foo / baz y: quux) -> (y)` 1573 | and have it work 1574 | because the inner `y` variables 1575 | are declared in an inner block in Lua, 1576 | while in JS 1577 | they automatically belong to the whole function. 
1578 | 1579 | # in code to save a value in a variable v2: 1580 | ([value, 1581 | ' local ',label,'\n', 1582 | ' if state then ',label,' = state.val end\n'].join('')) 1583 | 1584 | The `parse_char` and `literal` functions 1585 | are a bit different; 1586 | remember, Lua numbers 1587 | character positions in strings 1588 | from 1, 1589 | and the second argument to its `string.sub` 1590 | is not a length but an ending index: 1591 | 1592 | # in support code v2: 1593 | + 'function parse_char(input, pos)\n' 1594 | + ' if pos > #input then return nil end\n' 1595 | + ' return { pos = pos + 1, \n' 1596 | + ' val = string.sub(input, pos, pos) }\n' 1597 | + 'end\n' 1598 | + 'function literal(input, pos, needle)\n' 1599 | + ' if string.sub(input, pos, pos + #needle - 1)\n' 1600 | + ' == needle then\n' 1601 | + ' return { pos = pos + #needle, val = needle }\n' 1602 | + ' else return nil end\n' 1603 | + 'end\n' 1604 | 1605 | The code to invoke `literal` 1606 | doesn’t actually need to change. 1607 | 1608 | Sequence-handling differs only in minor bits of syntax: 1609 | 1610 | # in code to handle a sequence v2: 1611 | ([foo, ' if state then\n', bar, ' end\n'].join('')) 1612 | 1613 | Initializing the stack is a little different: 1614 | 1615 | # in function prologue v2: 1616 | ' local stack = {}\n', 1617 | 1618 | Ordered choice looks quite similar to JS: 1619 | 1620 | # in code to handle a choice v2: 1621 | ([' table.insert(stack, state)\n', 1622 | a, 1623 | ' if not state then\n', 1624 | ' state = table.remove(stack)\n', 1625 | b, 1626 | ' else\n', 1627 | ' table.remove(stack)\n', 1628 | ' end\n'].join('')) 1629 | 1630 | Negation too: 1631 | 1632 | # in code to handle negation v2: 1633 | ([' table.insert(stack, state)\n', 1634 | t, 1635 | ' if state then\n', 1636 | ' table.remove(stack)\n', 1637 | ' state = nil\n', 1638 | ' else\n', 1639 | ' state = table.remove(stack)\n', 1640 | ' end\n'].join('')) 1641 | 1642 | Result expressions too: 1643 | 1644 | # in code to handle result 
expressions v2: 1645 | ([' if state then state.val = ',result,' end\n'].join('')) 1646 | 1647 | And that is sufficient 1648 | to be able to generate compilers in Lua 1649 | from grammars whose result expressions are in Lua. 1650 | Unfortunately, 1651 | it’s still not good enough 1652 | to generate a metacircular compiler-compiler in Lua 1653 | from the grammar given here, 1654 | because that grammar is written in JS, 1655 | even though it generates Lua code. 1656 | 1657 | It would be relatively straightforward 1658 | to make the modification needed to the grammar quite minor: 1659 | all the result expressions 1660 | merely concatenate a bunch of strings, 1661 | and if they did so by calling a function, 1662 | you’d only need to redefine that function 1663 | in the two target languages; 1664 | in JS, something like 1665 | `Array.prototype.slice.apply(arguments).join('')` 1666 | and in Lua, something like 1667 | `table.concat({...})`. 1668 | 1669 | But this is sort of unnecessary. 1670 | Really, we just need to be able to compile our parsers 1671 | using node.js. 1672 | 1673 | TODO 1674 | ---- 1675 | 1676 | - memoization 1677 | - performance measurement: it takes minimally 252ms to compile itself 1678 | on my netbook, wallclock, under whatever version of Node I’m using. 1679 | That's pretty pessimal; it's about 11 or 12 kilobytes per second, 1680 | close to a hundred thousand clock cycles per byte. Follow sets 1681 | may offer a way to improve that by probably an order of magnitude. 1682 | - re-add repetition `+` and `*` (in a later version) 1683 | - factor out loopbody? like, 1684 | loopbody <- term: body -> (loop body code). 1685 | zero_or_more <- loopbody: body -> (body). 1686 | one_or_more <- loopbody: body -> (body + 'if ...'). 1687 | - how about removing `()` grouping? 
It leaves “a PEG describing 1688 | results” (and “the metacircular compiler-compiler”) one line shorter 1689 | and one line longer, but perhaps it could simplify backtracking by 1690 | eliminating the explicit stack? Because then each parsing function 1691 | would only need to contain one level of backtracking for `/` and one 1692 | for `!` — oh, well, hmm, `!` might be tricky if we want to support 1693 | positive lookahead too. Probably better to leave the stack in. 1694 | - Rewrite the Lua handaxeweb to use a PEG parser. 1695 | - maybe: rewrite the Lua handaxeweb to be written in JS with Node? 1696 | The whole Lua story (“in a later version, this program switched to 1697 | generating Lua grammars and lost the ability to compile itself”) 1698 | kind of stinks. And writing 39 to 48 lines of code to "port" a 1699 | 66-line program also seems kind of silly, like it may not justify 1700 | the abstraction overhead that permits it. 1701 | - maybe: reorganize this document, putting bootstrap.js first? Not sure. 1702 | - maybe: write a Markdown parser? 1703 | - move Makefile and pegcompile.js into this document? 1704 | 1705 | Profiling results 1706 | ----------------- 1707 | 1708 | **XXX these are rough notes that should be cleaned up** 1709 | 1710 | I profiled this thing compiling itself in Arora. 1711 | 1712 | It contains 2939 characters, but makes 32370 calls to `literal`, which 1713 | is about 25% of its CPU time, I think (the profile output is a little 1714 | hard to interpret; some of the numbers are over 100%, probably due to 1715 | recursion, and 85% is attributed merely to “program”). `parse_meta` 1716 | takes more than a third of the CPU time, largely by virtue of calling 1717 | `literal` several times. It also makes 41903 calls each to `push` and 1718 | `pop`. 1719 | 1720 | That means it's testing about 11 literals per character, and 1721 | backtracking 14 times. 
I could be wrong but I don’t think much of 1722 | this would be improved by memoizing; computing follow sets is likely 1723 | to make a bigger difference by avoiding the majority of that 1724 | backtracking. 1725 | 1726 | `parse_char` is called 4219 times, mostly from `parse_exprcontents`. 1727 | 1728 | Building up the output tree with `string.join` takes only about 0.6% 1729 | of its time. 1730 | 1731 | I suspect that current WebKit has a much better profiler. 1732 | 1733 | Firebug agrees on most things (23.87% in `literal`), but it has the 1734 | interesting result that actually 17% of the time is in `compile`, 1735 | which was called only once and does little more than call `eval`. So 1736 | apparently the time to generate the output JS was only about 4x the 1737 | time needed for SpiderMonkey to compile it! 1738 | 1739 | Other Interesting PEGs 1740 | ---------------------- 1741 | 1742 | Here’s some nifty stuff you can do 1743 | with the one-page parser generator described above. 1744 | 1745 | ### CSV files ### 1746 | 1747 | [Ierusalimschy][ier] gives this grammar 1748 | for parsing Excel-style CSV files: 1749 | 1750 | # in the LPEG notation with captures: 1751 | record <- (<field> (',' <field>)*)->{} (%nl / !.) 1752 | field <- <escaped> / <nonescaped> 1753 | nonescaped <- { [^,"%nl]* } 1754 | escaped <- '"' {~ ([^"] / '""'->'"')* ~} '"' 1755 | 1756 | The `{}` capture pieces of text 1757 | and `{~ ~}` capture and replace them. 1758 | `*` is for repetition, 1759 | `%nl` is `'\n'`, 1760 | `""` are equivalent to `''`, 1761 | `.` is our `char`, 1762 | `[abc]` is a character class equivalent to `( 'a' / 'b' / 'c' )`, 1763 | and `->{}` means “make a list of the results”. 1764 | In the notation I’ve used for PEGs here, 1765 | without repetition features, 1766 | this looks like this: 1767 | 1768 | # in csv.peg: 1769 | sentence <- d: (f: field ',' r: sentence -> ([f].concat(r)) 1770 | / f: field -> ([f])) ('\n' / !char) 1771 | -> (d). 1772 | field <- escaped / nonescaped. 
1773 | normal_char <- !',' !'"' !'\n' char. 1774 | nonescaped <- c: normal_char s: nonescaped -> (c + s) / normal_char. 1775 | escaped_inner_char <- !'"' char / '""' -> ('"'). 1776 | escaped_inner <- c: escaped_inner_char s: escaped_inner -> (c + s) 1777 | / escaped_inner_char. 1778 | escaped <- '"' s: escaped_inner '"' -> (s). 1779 | 1780 | That’s 2½ times as big, 1781 | which is unreasonable. 1782 | If we have `*` repetition that makes JavaScript Arrays, 1783 | we can write it with only a bit more ugliness 1784 | than in LPEG: 1785 | 1786 | # in csvstar.peg: 1787 | sentence <- h: field t: (',' field)* ('\n' / !char) -> ([h].concat(t)). 1788 | field <- escaped / nonescaped. 1789 | nonescaped <- s: (!',' !'"' !'\n' char)* -> (s.join('')). 1790 | escaped <- '"' s: (!'"' char / '""' -> ('"'))* '"' -> (s.join('')). 1791 | 1792 | [ier]: http://www.inf.puc-rio.br/~roberto/docs/peg.pdf "A Text Pattern-Matching Tool based on Parsing Expression Grammars, 2008, SP&E" 1793 | 1794 | ### ichbins ### 1795 | 1796 | [Darius Bacon’s ichbins] [ichbins] 1797 | is an inspiring small Lisp compiler; 1798 | it can compile itself to C 1799 | with full run-time type-checking, 1800 | even though 1801 | it’s only a bit over six pages of code. 1802 | Its recursive-descent parser 1803 | is a model of clarity, 1804 | as recursive-descent parsers go: 1805 | 1806 | # in the parser in ichbins.scm: 1807 | (define (read) 1808 | (read-dispatch (skip-blanks (read-char)))) 1809 | 1810 | (define (skip-blanks c) 1811 | (cond ((memq? c whitespace-chars) (skip-blanks (read-char))) 1812 | ('t c))) 1813 | 1814 | (define whitespace-chars (cons linefeed " ")) 1815 | (define non-symbol-chars "\"\\(')") 1816 | 1817 | (define eof-object '("eof")) 1818 | 1819 | (define (read-dispatch c) 1820 | (cond ((eq? c 'f) eof-object) 1821 | ((eq? c \\) (read-char-literal (read-char))) 1822 | ((eq? c \") (read-string (read-char))) 1823 | ((eq? c \() (read-list)) 1824 | ((eq? 
c \') (cons 'quote (cons (read) '()))) 1825 | ((eq? c \)) (error "Unbalanced parentheses")) 1826 | ('t (intern (cons c (read-symbol (peek-char))))))) 1827 | 1828 | (define (read-char-literal c) 1829 | (cond ((eq? c 'f) (error "EOF in character literal")) 1830 | ('t c))) 1831 | 1832 | (define (read-string c) 1833 | (cond ((eq? c 'f) (error "Unterminated string literal")) 1834 | ((eq? c \") '()) 1835 | ((eq? c \\) (cons (read-char) (read-string (read-char)))) 1836 | ('t (cons c (read-string (read-char)))))) 1837 | 1838 | (define (read-symbol c) 1839 | (cond ((memq? c whitespace-chars) '()) 1840 | ((memq? c non-symbol-chars) '()) 1841 | ('t (read-char) (cons c (read-symbol (peek-char)))))) 1842 | 1843 | (define (read-list) 1844 | (read-list-dispatch (skip-blanks (read-char)))) 1845 | 1846 | (define (read-list-dispatch c) 1847 | (cond ((eq? c 'f) (error "Unterminated list")) 1848 | ((eq? c \)) '()) 1849 | ('t (cons (read-dispatch c) (read-list))))) 1850 | 1851 | But with a language suited for parsing, 1852 | we can do better. 1853 | Here’s a PEG simply describing the same grammar as the above: 1854 | 1855 | # in ichbins.peg: 1856 | whitespace <- '\n' / ' ' / '\t'. 1857 | _ <- whitespace _ / . 1858 | non-symbol <- '"' / '\\' / '(' / '\'' / ')'. 1859 | sentence <- _ sexp. 1860 | sexp <- '\\' char / '"' string / '(' list / '\'' read / symbol. 1861 | string <- '"' / (!'\\' char / '\\' char) string. 1862 | symbol <- !whitespace !non-symbol char / . 1863 | list <- ')' / read list. 1864 | 1865 | Instead of 33 lines of code, we have 8. 1866 | Note that I’ve followed the kind of weird structure 1867 | of the original parser: 1868 | the closing parenthesis is considered part of the list contents, 1869 | and the closing quote is considered part of the string contents. 
1870 | This simplifies the grammar slightly, 1871 | and eliminates nearly all non-tail calls 1872 | (except inside of `list`, and to `_`, and in distinguishing character categories) 1873 | but I think it makes it a little less clear. 1874 | 1875 | In 16 lines, 1876 | we can get a real parser 1877 | that returns a parse of the code, 1878 | in this case as a JSON string: 1879 | 1880 | # in ichbins-parser.peg: 1881 | sentence <- _ s: sexp -> (JSON.stringify(s, null, 4)). 1882 | 1883 | sexp <- '('_ list 1884 | / '"' string 1885 | / s: symbol -> ({symbol: s}) 1886 | / '\''_ s: sexp -> ([{symbol: 'quote'}, s]) 1887 | / '\\' c: char _ -> ({char: c}). 1888 | 1889 | list <- ')'_ -> ([]) 1890 | / a: sexp b: list -> ([a].concat(b)). 1891 | string <- '"'_ -> ('') 1892 | / a: (!'\\' char / '\\' b: char -> ('\\' + b)) 1893 | t: string -> (a + t). 1894 | symbol <- a: symchar b: symtail -> (a + b). 1895 | symtail <- symbol / _ -> (''). 1896 | 1897 | _ <- whitespace _ / . 1898 | whitespace <- '\n' / ' ' / '\t'. 1899 | symchar <- !( whitespace /'"' / '\\' / '(' / '\'' / ')' ) char. 1900 | 1901 | 1902 | [ichbins]: http://www.accesscom.com/~darius/???XXX "ichbins: I can hardly believe it’s not Scheme" 1903 | 1904 | Thanks 1905 | ------ 1906 | 1907 | Thanks to D. Val Schorre for inventing META-II, 1908 | of which this is a refinement, 1909 | in 1964 or a bit before; 1910 | to Bob M. 
McClure for inventing [TMG](http://www.multicians.org/tmg.html), 1911 | the TransMoGrifier, 1912 | also in 1964, 1913 | and to Doug McIlroy for maintaining it afterwards, 1914 | which not only carried META-II forward, 1915 | but also 1916 | [helped Thompson write B](http://plan9.bell-labs.com/who/dmr/chist.html) 1917 | which became C; 1918 | to Romuald Ireneus 'Scibor-Marchocki, who 1919 | [apparently ported TMG to 1920 | TMGL](http://www.geocities.com/ResearchTriangle/2363/tmg011.html); 1921 | to Bryan Ford for resurrecting TMG’s parsing schema 1922 | and enhancing it into the form of parsing expression grammars, 1923 | in 2002; 1924 | to Alan Kay for bringing META-II back to public attention; 1925 | to Alessandro Warth and Yoshiki Ohshima for developing OMeta 1926 | and showing that PEGs can be extended 1927 | to a wide variety of non-parsing tasks. 1928 | 1929 | To [Aristotle Pagaltzis] (http://plasmasturm.org/) 1930 | for innumerable improvements 1931 | to the readability and correctness 1932 | of this document. 1933 | 1934 | To Andy Isaacson, 1935 | Allan Schiffman, 1936 | [Chris Hibbert] (http://pancrit.org/) 1937 | for further suggestions 1938 | for the readability and content of this document. 1939 | 1940 | 1941 | 1942 | 1945 | 1955 | --------------------------------------------------------------------------------