117 | go
118 |
119 |
120 |
121 |
122 |
--------------------------------------------------------------------------------
/handaxeweb.lua:
--------------------------------------------------------------------------------
1 | #!/usr/bin/lua
--- Record `new_chunk` in the `chunks` table, keyed by chunk name.
-- A chunk without a name (the initial placeholder) is silently
-- discarded.  If a chunk with the same name and version already
-- exists, the new text is appended to it; otherwise the chunk is
-- added as a new version entry.
function register_chunk(chunks, new_chunk)
  local name = new_chunk.name
  if name == nil then return end

  local versions = chunks[name]
  if versions == nil then
    versions = {}
    chunks[name] = versions
  end

  -- Same name and version already present: concatenate the text.
  for i = 1, #versions do
    local existing = versions[i]
    if existing.v == new_chunk.v then
      for j = 1, #new_chunk.text do
        existing.text[#existing.text + 1] = new_chunk.text[j]
      end
      return
    end
  end

  -- First chunk with this name/version pair.
  versions[#versions + 1] = new_chunk
end
24 |
--- A chunk line is one indented by exactly four spaces (the Markdown
-- code-block convention).
-- NOTE(review): the dump this file was recovered from collapsed runs
-- of spaces; the four-space pattern and the test inputs below are
-- reconstructed from `unindented` (which strips 4 columns) and the
-- document text, which says code is "indented by four spaces".
function is_indented(line)
  return string.match(line, "^    ")
end

assert(    is_indented("    hi"))
assert(not is_indented("   hi"))
assert(not is_indented("  hi "))

--- Strip the four-space indentation from a chunk line.
function unindented(line) return string.sub(line, 5) end
assert(unindented("    hi\n") == "hi\n")
35 |
--- Extract a chunk label from a header line such as
-- "-- in foo:" or "/* in foo: */"; returns nil when the line is not
-- a header.  Leading and trailing non-word characters are ignored so
-- the header can be written as a comment in any language.
function get_chunk_label(line)
  local label = string.match(line, "^[^%w]*in (.*):[^%w]*$")
  return label
end

-- Self-tests: comment syntax around the label must not matter.
for input, expected in pairs{
  ["-- in handaxeweb.lua:"] = "handaxeweb.lua",
  ["/* in handaxeweb.c: */"] = "handaxeweb.c",
  ["# in a minute: #\n"] = "a minute",
} do
  assert(get_chunk_label(input) == expected)
end
46 |
--- Split a chunk label into (name, version).  A label ending in
-- " v<digits>" names that version explicitly; anything else names
-- version 0 of the whole label.
function parse_chunk_label(label)
  local name, version = string.match(label, "(.*) v(%d+)$")
  if name == nil then
    return label, 0
  end
  return name, tonumber(version)
end

-- Self-tests covering both the implicit and explicit version forms.
local function check_label(label, want_name, want_version)
  local name, version = parse_chunk_label(label)
  assert(name == want_name and version == want_version)
end
check_label("foo", "foo", 0)
check_label("foo v32", "foo", 32)
58 |
--- Read the literate program from stdin and build the `chunks` table:
-- a map from chunk name to a list of {v = version, text = {lines}}
-- entries (see register_chunk).  Runs of indented lines form chunks;
-- an indented header line ("-- in foo:") switches to a new named
-- chunk.  Blank lines are buffered and only committed to the current
-- chunk when more indented lines follow, so trailing blanks are not
-- kept.
function parse_input()
  -- Start with a nameless chunk; register_chunk discards it.
  local chunks, current_chunk, in_chunk = {}, {text={}}, false
  local blank_lines = {}

  for line in io.lines() do
    if string.match(line, "^%s*$") then -- blank line
      -- Buffer it; committed only if the chunk continues.
      if in_chunk then table.insert(blank_lines, "") end
    elseif not in_chunk and is_indented(line) then
      -- First indented line after prose: may be a header line.
      local label = get_chunk_label(line)

      if label then -- if that succeeded, change chunks
        register_chunk(chunks, current_chunk)
        local name, ver = parse_chunk_label(label)
        current_chunk = {name = name, v = ver, text = {}}
      else
        -- Not a header: this continues the previous chunk.
        -- incorporate any blank lines seen in between indented lines
        for _, blank_line in ipairs(blank_lines) do
          table.insert(current_chunk.text, blank_line)
        end
        blank_lines = {}

        table.insert(current_chunk.text, unindented(line))
      end
      in_chunk = true
    elseif in_chunk and is_indented(line) then
      -- incorporate any blank lines seen in between indented lines
      for _, blank_line in ipairs(blank_lines) do
        table.insert(current_chunk.text, blank_line)
      end
      blank_lines = {}

      table.insert(current_chunk.text, unindented(line))
    else
      -- Unindented prose interrupts the current chunk.
      blank_lines = {}
      in_chunk = false
    end
  end
  register_chunk(chunks, current_chunk) -- flush the final chunk

  return chunks
end
100 |
--- Print (to stdout) every version number in use and the names of all
-- root chunks -- chunks never referenced from inside another chunk --
-- in the "v N" / "n name" format described by the leading banner.
function list_chunk_names_and_versions(chunks)
  local banner = {
    "# Listing versions and root chunk names.\n",
    "# Version 12 is displayed as:\n",
    "# v 12\n",
    "# Chunk name foo bar is displayed as:\n",
    "# n foo bar\n",
    "# To tangle a particular root chunk, run:\n",
    "# "..arg[0].." chunkname\n",
    "# That tangles version 0 by default; to specify v69:\n",
    "# "..arg[0].." chunkname 69\n",
  }
  io.write(table.concat(banner))

  -- Collect every version number seen, and every chunk name that
  -- appears as a reference inside some chunk's text.
  local versions, referenced_chunks = {}, {}
  for _, contents in pairs(chunks) do
    for _, it in ipairs(contents) do
      versions[it.v] = true

      for _, line in ipairs(it.text) do
        local _, chunkname = parse_reference(line)
        if chunkname ~= nil then
          referenced_chunks[chunkname] = true
        end
      end
    end
  end

  for version in pairs(versions) do
    io.write(string.format("v %d\n", version))
  end

  -- Root chunks are the ones nobody references.
  for name in pairs(chunks) do
    if referenced_chunks[name] == nil then
      io.write("n "..name.."\n")
    end
  end
end
136 |
--- From a list of chunk version entries, return the text of the
-- highest-numbered version that does not exceed `version`, or nil
-- when every entry is newer than `version`.
function get_chunk_text(contents, version)
  local best_v, best_text = nil, nil
  for _, entry in ipairs(contents) do
    if entry.v <= version and (best_v == nil or entry.v > best_v) then
      best_v, best_text = entry.v, entry.text
    end
  end
  return best_text
end
147 |
-- Unit tests for get_chunk_text: selection picks the highest version
-- <= the requested one, and yields nil when none qualifies.
do
  local contents = {{v=0, text={"a"}},
                    {v=2, text={"b"}},
                    {v=1, text={"c"}}}
  local expected = {[0] = "a", [1] = "c", [2] = "b", [3] = "b"}
  for version, want in pairs(expected) do
    assert(get_chunk_text(contents, version)[1] == want)
  end
  assert(get_chunk_text(contents, -1) == nil)
end
158 |
--- Parse a line of chunk text as a chunk reference: a line containing
-- only "<<chunk name>>" plus optional surrounding whitespace.
-- On a match returns (indent, name, trailing-space); otherwise nil.
function parse_reference(line)
  local reference_pattern = "^(%s*)<<(.*)>>(%s*)$"
  return string.match(line, reference_pattern)
end
162 |
-- Unit tests for parse_reference.
-- NOTE(review): in the recovered dump the reference literal had been
-- eaten by tag stripping (shown as "<>") and its spaces collapsed;
-- the four-space indent and the chunk name "foo" are reconstructed
-- from the assertions themselves.
do
  local indent, name = parse_reference("    <<foo>>\n")
  assert(indent == "    ")
  assert(name == "foo")
  assert(parse_reference("bits << shiftlen >> 1") == nil)
end
169 |
--- Recursively write version `version` of chunk `chunkname` to
-- stdout.  Reference lines ("<<name>>") are expanded in place with
-- their indentation added to the accumulated `indent` prefix.
-- Raises an error for an unknown chunk or an unavailable version.
function tangle(chunks, chunkname, version, indent)
  indent = indent or ''

  local contents = chunks[chunkname]
  if not contents then
    error(string.format("chunk `%s` does not exist",
                        chunkname))
  end

  local text = get_chunk_text(contents, version)
  if not text then
    error(string.format("chunk `%s` has no version `%d`",
                        chunkname, version))
  end

  for _, line in ipairs(text) do
    local nindent, nchunkname = parse_reference(line)
    if nindent then
      -- A reference: expand it, accumulating its indentation.
      tangle(chunks, nchunkname, version, indent..nindent)
    else
      io.write(indent..line.."\n")
    end
  end
end
194 |
-- Entry point: read the literate program from stdin, then either list
-- the available chunks and versions (no arguments) or tangle the
-- named chunk at the requested version (default 0) to stdout.
local chunks = parse_input()
-- Script arguments.  NOTE(review): chunkname/version are globals
-- here, not locals -- presumably deliberate for this short script.
chunkname, version = ...
if chunkname == nil then
  list_chunk_names_and_versions(chunks)
else
  if version == nil then version = 0 end
  tangle(chunks, chunkname, tonumber(version))
end
203 |
--------------------------------------------------------------------------------
/output.js:
--------------------------------------------------------------------------------
// Rule `sp`: matches a single ' ' / '\n' / '\t'.
// NOTE(review): this whole file appears to be the machine-generated
// output of the PEG-to-JS compiler whose code templates are visible
// in parse_rule/parse_choice below -- confirm before hand-editing.
function parse_sp(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = literal(input, state.pos, ' ');
  if (state) {
  }
  if (!state) {
    state = stack.pop();
    stack.push(state);
    state = literal(input, state.pos, '\n');
    if (state) {
    }
    if (!state) {
      state = stack.pop();
      state = literal(input, state.pos, '\t');
      if (state) {
      }
    } else {
      stack.pop();
    }
  } else {
    stack.pop();
  }
  return state;
}
27 |
// Rule `_`: zero or more `sp` (optional whitespace); always succeeds,
// restoring the saved position when no whitespace is present.
function parse__(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = parse_sp(input, state.pos);
  if (state) {
    state = parse__(input, state.pos);
    if (state) {
    }
  }
  if (!state) {
    state = stack.pop();
  } else {
    stack.pop();
  }
  return state;
}
45 |
// Rule `rule`: n:name _ '<-' _ body:choice '.' _ ; on success the
// value is the JS source of a generated "function parse_<n>" wrapper
// around the compiled rule body.
function parse_rule(input, pos) {
  var state = { pos: pos };
  var stack = [];
  state = parse_name(input, state.pos);
  if (state) var n = state.val;
  if (state) {
    state = parse__(input, state.pos);
    if (state) {
      state = literal(input, state.pos, '<-');
      if (state) {
        state = parse__(input, state.pos);
        if (state) {
          state = parse_choice(input, state.pos);
          if (state) var body = state.val;
          if (state) {
            state = literal(input, state.pos, '.');
            if (state) {
              state = parse__(input, state.pos);
              if (state) {
                if (state) state.val = ["function parse_", n, "(input, pos) {\n",
                  ' var state = { pos: pos };\n',
                  ' var stack = [];\n',
                  body,
                  ' return state;\n',
                  "}\n"].join('');
              }
            }
          }
        }
      }
    }
  }
  return state;
}
80 |
// Rule `sentence` (start symbol): either _ r:rule g:sentence, joining
// the generated rules with "\n", or a final _ r:rule, which appends
// the runtime support (parse_char, literal) and the exports footer.
function parse_sentence(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = parse__(input, state.pos);
  if (state) {
    state = parse_rule(input, state.pos);
    if (state) var r = state.val;
    if (state) {
      state = parse_sentence(input, state.pos);
      if (state) var g = state.val;
      if (state) {
        if (state) state.val = r + "\n" + g;
      }
    }
  }
  if (!state) {
    state = stack.pop();
    state = parse__(input, state.pos);
    if (state) {
      state = parse_rule(input, state.pos);
      if (state) var r = state.val;
      if (state) {
        if (state) state.val = r + "\n"
          + 'function parse_char(input, pos) {\n'
          + ' if (pos >= input.length) return null;\n'
          + ' return { pos: pos + 1, val: input.charAt(pos) };\n'
          + '}\n'
          + 'function literal(input, pos, string) {\n'
          + ' if (input.substr(pos, string.length) === string) {\n'
          + ' return { pos: pos + string.length, val: string };\n'
          + ' } else return null;\n'
          + '}\n'
          + "if (typeof exports !== 'undefined')\n"
          + " exports.parse_sentence = parse_sentence;\n"
          ;
      }
    }
  } else {
    stack.pop();
  }
  return state;
}
124 |
// Rule `meta`: any of the grammar's metacharacter tokens --
// '!' / '\'' / '<-' / '/' / '.' / '(' / ')' / ':' / '->'.
function parse_meta(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = literal(input, state.pos, '!');
  if (state) {
  }
  if (!state) {
    state = stack.pop();
    stack.push(state);
    state = literal(input, state.pos, '\'');
    if (state) {
    }
    if (!state) {
      state = stack.pop();
      stack.push(state);
      state = literal(input, state.pos, '<-');
      if (state) {
      }
      if (!state) {
        state = stack.pop();
        stack.push(state);
        state = literal(input, state.pos, '/');
        if (state) {
        }
        if (!state) {
          state = stack.pop();
          stack.push(state);
          state = literal(input, state.pos, '.');
          if (state) {
          }
          if (!state) {
            state = stack.pop();
            stack.push(state);
            state = literal(input, state.pos, '(');
            if (state) {
            }
            if (!state) {
              state = stack.pop();
              stack.push(state);
              state = literal(input, state.pos, ')');
              if (state) {
              }
              if (!state) {
                state = stack.pop();
                stack.push(state);
                state = literal(input, state.pos, ':');
                if (state) {
                }
                if (!state) {
                  state = stack.pop();
                  state = literal(input, state.pos, '->');
                  if (state) {
                  }
                } else {
                  stack.pop();
                }
              } else {
                stack.pop();
              }
            } else {
              stack.pop();
            }
          } else {
            stack.pop();
          }
        } else {
          stack.pop();
        }
      } else {
        stack.pop();
      }
    } else {
      stack.pop();
    }
  } else {
    stack.pop();
  }
  return state;
}
205 |
// Rule `name`: c:namechar n:name -> c + n, or a single namechar;
// builds the identifier string by right recursion.
function parse_name(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = parse_namechar(input, state.pos);
  if (state) var c = state.val;
  if (state) {
    state = parse_name(input, state.pos);
    if (state) var n = state.val;
    if (state) {
      if (state) state.val = c + n;
    }
  }
  if (!state) {
    state = stack.pop();
    state = parse_namechar(input, state.pos);
    if (state) {
    }
  } else {
    stack.pop();
  }
  return state;
}
229 |
// Rule `namechar`: !meta !sp char -- any character that starts
// neither a metacharacter token nor whitespace.  Each negative
// lookahead succeeds by restoring the saved state from the stack.
function parse_namechar(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = parse_meta(input, state.pos);
  if (state) {
    stack.pop();
    state = null;
  } else {
    state = stack.pop();
  }
  if (state) {
    stack.push(state);
    state = parse_sp(input, state.pos);
    if (state) {
      stack.pop();
      state = null;
    } else {
      state = stack.pop();
    }
    if (state) {
      state = parse_char(input, state.pos);
      if (state) {
      }
    }
  }
  return state;
}
258 |
// Rule `term`: labeled / nonterminal / string / negation /
// parenthesized, tried in that order with backtracking.
function parse_term(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = parse_labeled(input, state.pos);
  if (state) {
  }
  if (!state) {
    state = stack.pop();
    stack.push(state);
    state = parse_nonterminal(input, state.pos);
    if (state) {
    }
    if (!state) {
      state = stack.pop();
      stack.push(state);
      state = parse_string(input, state.pos);
      if (state) {
      }
      if (!state) {
        state = stack.pop();
        stack.push(state);
        state = parse_negation(input, state.pos);
        if (state) {
        }
        if (!state) {
          state = stack.pop();
          state = parse_parenthesized(input, state.pos);
          if (state) {
          }
        } else {
          stack.pop();
        }
      } else {
        stack.pop();
      }
    } else {
      stack.pop();
    }
  } else {
    stack.pop();
  }
  return state;
}
303 |
// Rule `nonterminal`: n:name _ ; compiles to a call of the
// corresponding generated parse_<n> function.
function parse_nonterminal(input, pos) {
  var state = { pos: pos };
  var stack = [];
  state = parse_name(input, state.pos);
  if (state) var n = state.val;
  if (state) {
    state = parse__(input, state.pos);
    if (state) {
      if (state) state.val = [' state = parse_', n, '(input, state.pos);\n'].join('');
    }
  }
  return state;
}
317 |
// Rule `labeled`: label:name _ ':' _ value:term ; compiles the term
// followed by a line that captures its result into a JS var.
function parse_labeled(input, pos) {
  var state = { pos: pos };
  var stack = [];
  state = parse_name(input, state.pos);
  if (state) var label = state.val;
  if (state) {
    state = parse__(input, state.pos);
    if (state) {
      state = literal(input, state.pos, ':');
      if (state) {
        state = parse__(input, state.pos);
        if (state) {
          state = parse_term(input, state.pos);
          if (state) var value = state.val;
          if (state) {
            if (state) state.val = [value, ' if (state) var ', label, ' = state.val;\n'].join('');
          }
        }
      }
    }
  }
  return state;
}
341 |
// Rule `sequence`: foo:term bar:sequence (nesting the rest inside an
// "if (state)" guard), or a result_expression, or empty (compiles to
// the empty string).
function parse_sequence(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = parse_term(input, state.pos);
  if (state) var foo = state.val;
  if (state) {
    state = parse_sequence(input, state.pos);
    if (state) var bar = state.val;
    if (state) {
      if (state) state.val = [foo, ' if (state) {\n', bar, ' }\n'].join('');
    }
  }
  if (!state) {
    state = stack.pop();
    stack.push(state);
    state = parse_result_expression(input, state.pos);
    if (state) {
    }
    if (!state) {
      state = stack.pop();
      if (state) state.val = '';
    } else {
      stack.pop();
    }
  } else {
    stack.pop();
  }
  return state;
}
372 |
// Rule `string`: '\'' s:stringcontents '\'' _ ; compiles to a
// literal() match against the quoted text.
function parse_string(input, pos) {
  var state = { pos: pos };
  var stack = [];
  state = literal(input, state.pos, '\'');
  if (state) {
    state = parse_stringcontents(input, state.pos);
    if (state) var s = state.val;
    if (state) {
      state = literal(input, state.pos, '\'');
      if (state) {
        state = parse__(input, state.pos);
        if (state) {
          if (state) state.val = [" state = literal(input, state.pos, '", s, "');\n"].join('');
        }
      }
    }
  }
  return state;
}
392 |
// Rule `stringcontents`: (!'\\' !'\'' c:char s:stringcontents -> c+s)
// / (b:'\\' c:char s:stringcontents -> b+c+s) / empty -> ''.
// Accumulates the raw text of a quoted string, keeping backslash
// escape pairs intact.
function parse_stringcontents(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  stack.push(state);
  state = literal(input, state.pos, '\\');
  if (state) {
    stack.pop();
    state = null;
  } else {
    state = stack.pop();
  }
  if (state) {
    stack.push(state);
    state = literal(input, state.pos, '\'');
    if (state) {
      stack.pop();
      state = null;
    } else {
      state = stack.pop();
    }
    if (state) {
      state = parse_char(input, state.pos);
      if (state) var c = state.val;
      if (state) {
        state = parse_stringcontents(input, state.pos);
        if (state) var s = state.val;
        if (state) {
          if (state) state.val = c + s;
        }
      }
    }
  }
  if (!state) {
    state = stack.pop();
    stack.push(state);
    state = literal(input, state.pos, '\\');
    if (state) var b = state.val;
    if (state) {
      state = parse_char(input, state.pos);
      if (state) var c = state.val;
      if (state) {
        state = parse_stringcontents(input, state.pos);
        if (state) var s = state.val;
        if (state) {
          if (state) state.val = b + c + s;
        }
      }
    }
    if (!state) {
      state = stack.pop();
      if (state) state.val = '';
    } else {
      stack.pop();
    }
  } else {
    stack.pop();
  }
  return state;
}
453 |
// Rule `choice`: a:sequence '/' _ b:choice -- compiles to the
// stack-push/backtrack template used throughout this file -- or a
// plain sequence.
function parse_choice(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  state = parse_sequence(input, state.pos);
  if (state) var a = state.val;
  if (state) {
    state = literal(input, state.pos, '/');
    if (state) {
      state = parse__(input, state.pos);
      if (state) {
        state = parse_choice(input, state.pos);
        if (state) var b = state.val;
        if (state) {
          if (state) state.val = [' stack.push(state);\n',
            a,
            ' if (!state) {\n',
            ' state = stack.pop();\n',
            b,
            ' } else stack.pop();\n'].join('');
        }
      }
    }
  }
  if (!state) {
    state = stack.pop();
    state = parse_sequence(input, state.pos);
    if (state) {
    }
  } else {
    stack.pop();
  }
  return state;
}
488 |
// Rule `negation`: '!' _ t:term ; compiles to a negative-lookahead
// template: succeed (restoring position) iff the term fails.
function parse_negation(input, pos) {
  var state = { pos: pos };
  var stack = [];
  state = literal(input, state.pos, '!');
  if (state) {
    state = parse__(input, state.pos);
    if (state) {
      state = parse_term(input, state.pos);
      if (state) var t = state.val;
      if (state) {
        if (state) state.val = [' stack.push(state);\n',
          t,
          ' if (state) {\n',
          ' stack.pop();\n',
          ' state = null;\n',
          ' } else state = stack.pop();\n'].join('');
      }
    }
  }
  return state;
}
510 |
// Rule `result_expression`: '->' _ result:expr _ ; compiles to an
// assignment of the expression to state.val.
function parse_result_expression(input, pos) {
  var state = { pos: pos };
  var stack = [];
  state = literal(input, state.pos, '->');
  if (state) {
    state = parse__(input, state.pos);
    if (state) {
      state = parse_expr(input, state.pos);
      if (state) var result = state.val;
      if (state) {
        state = parse__(input, state.pos);
        if (state) {
          if (state) state.val = [' if (state) state.val = ', result, ';\n'].join('');
        }
      }
    }
  }
  return state;
}
530 |
// Rule `expr`: '(' _ e:exprcontents ')' -> '(' + e + ')' ; a
// parenthesized host-language expression, passed through verbatim.
function parse_expr(input, pos) {
  var state = { pos: pos };
  var stack = [];
  state = literal(input, state.pos, '(');
  if (state) {
    state = parse__(input, state.pos);
    if (state) {
      state = parse_exprcontents(input, state.pos);
      if (state) var e = state.val;
      if (state) {
        state = literal(input, state.pos, ')');
        if (state) {
          if (state) state.val = '(' + e + ')';
        }
      }
    }
  }
  return state;
}
550 |
// Rule `exprcontents`: ((!'(' !')' char) / expr) e:exprcontents
// -> c + e, or empty -> ''.  Collects expression text, recursing
// through nested parentheses via parse_expr.
function parse_exprcontents(input, pos) {
  var state = { pos: pos };
  var stack = [];
  stack.push(state);
  stack.push(state);
  stack.push(state);
  state = literal(input, state.pos, '(');
  if (state) {
    stack.pop();
    state = null;
  } else {
    state = stack.pop();
  }
  if (state) {
    stack.push(state);
    state = literal(input, state.pos, ')');
    if (state) {
      stack.pop();
      state = null;
    } else {
      state = stack.pop();
    }
    if (state) {
      state = parse_char(input, state.pos);
      if (state) {
      }
    }
  }
  if (!state) {
    state = stack.pop();
    state = parse_expr(input, state.pos);
    if (state) {
    }
  } else {
    stack.pop();
  }
  if (state) var c = state.val;
  if (state) {
    state = parse_exprcontents(input, state.pos);
    if (state) var e = state.val;
    if (state) {
      if (state) state.val = c + e;
    }
  }
  if (!state) {
    state = stack.pop();
    if (state) state.val = '';
  } else {
    stack.pop();
  }
  return state;
}
603 |
// Rule `parenthesized`: '(' _ body:choice ')' _ -> body ; grouping in
// the grammar notation, transparent in the generated code.
function parse_parenthesized(input, pos) {
  var state = { pos: pos };
  var stack = [];
  state = literal(input, state.pos, '(');
  if (state) {
    state = parse__(input, state.pos);
    if (state) {
      state = parse_choice(input, state.pos);
      if (state) var body = state.val;
      if (state) {
        state = literal(input, state.pos, ')');
        if (state) {
          state = parse__(input, state.pos);
          if (state) {
            if (state) state.val = body;
          }
        }
      }
    }
  }
  return state;
}
626 |
// Runtime primitive: consume one character, yielding it as the value;
// null at end of input.
function parse_char(input, pos) {
  return pos < input.length
    ? { pos: pos + 1, val: input.charAt(pos) }
    : null;
}
// Runtime primitive: match the exact text `string` at `pos`,
// advancing past it on success; null on a mismatch.
function literal(input, pos, string) {
  var matches = input.substr(pos, string.length) === string;
  if (!matches) return null;
  return { pos: pos + string.length, val: string };
}
// Export the grammar's start symbol for CommonJS consumers; in a
// browser (no `exports`) the functions remain plain globals.
if (typeof exports !== "undefined") {
  exports.parse_sentence = parse_sentence;
}
639 |
--------------------------------------------------------------------------------
/handaxeweb.md:
--------------------------------------------------------------------------------
1 | handaxeweb: a minimalist literate-programming system
2 | ====================================================
3 |
4 | > Let us change our traditional attitude to the construction
5 | > of programs: Instead of imagining that our main task is to
6 | > instruct a computer what to do, let us concentrate rather
7 | > on explaining to humans what we want the computer to do.
8 |
9 | > > — Donald E. Knuth, "Literate Programming", in The Computer
10 | > > Journal, 1984, p.99
11 |
12 | Literate-programming systems are systems for writing programs
13 | that are optimized for readability. This is a very simple
14 | literate-programming system called “handaxeweb”
15 | that supports multiple versions of a program in the same
16 | HTML or Markdown document.
17 |
18 | What literate programming is, and how handaxeweb is related to other such systems
19 | ---------------------------------------------------------------------------------
20 |
21 | Traditionally a literate-programming system contains two
22 | programs: one called `tangle`, to feed the program to the compiler,
23 | and one to
24 | produce a printable version called `weave` (related to a
25 | famous couplet alluding to webs).
26 |
27 | Following noweb, handaxeweb doesn’t make any attempt to produce a
28 | “woven” output for human consumption; it only tangles.
29 | The idea is that you
30 | write your literate program either as a plain ASCII text
31 | document, or in Markdown or something, as long as it permits
32 | you to write segments of your program indented by four
33 | spaces.
34 |
35 | ### Phil Bewig’s “The Essence of Literate Programming”: the inspiration ###
36 |
37 | handaxeweb is more directly inspired by Phil Bewig’s “The
38 | Essence of Literate Programming”, a post on
39 | comp.programming.literate on 1996-05-27, message-id
40 | ``, containing the following
41 | noweb-like literate-programming system written in awk:
42 |
43 | # in The Essence of Literate Programming:
44 | /^<<.+>>=$/ {
45 | name = substr($0, 3, length($0) - 5)
46 | while (getline > 0) {
47 | if (length($0) == 0) next
48 | chunk[name, ++count[name]] = $0 } }
49 | END { tangle("*", ""); printf "\n" }
50 | function tangle(name, prefix, i, tag, suffix) {
51 | for (i = 1; i <= count[name]; i++) {
52 | if (i == 2) gsub(/[^ \t]/, " ", prefix)
53 | if (match(chunk[name,i], /<<.+>>/)) {
54 | tag = substr(chunk[name,i], RSTART + 2, RLENGTH - 4)
55 | if (tag in count) {
56 | suffix = substr(chunk[name,i], RSTART + RLENGTH)
57 | tangle(tag, prefix substr(chunk[name,i], 1, RSTART - 1))
58 | printf "%s", suffix }
59 | else printf "%s%s", prefix, chunk[name,i] }
60 | else printf "%s%s", prefix, chunk[name,i]
61 | if (i < count[name]) printf "\n" } }
62 |
63 | He explained:
64 |
65 | > The essence of literate programming is rearranging chunks
66 | > of code, and a dozen and a half lines of awk is all you
67 | > need for that.
68 | >
69 | > Of course, with so little code it's not possible for
70 | > everything to be perfect. … Even so, this microscopic
71 | > system provides a useful tool that encompasses the essence
72 | > of literate programming.
73 |
74 | ### Overview of handaxeweb's features ###
75 |
76 | Unfortunately, handaxeweb is 208 lines of code, twice the
77 | size of the previous Python version, and
78 | more than ten
79 | times the size of The Essence of Literate Programming (a full
80 | sixth of the size of CWEB!). But it
81 | solves a couple of other problems that I need for my
82 | purposes:
83 |
84 | * versioning: multiple versions of the same program in the
85 | same version of the same document;
86 | * multiple separate programs in the same document;
87 | * listing the programs and versions in a document;
88 | * indentation (needed for languages like Python);
89 | * support for Markdown, which is how I write most
90 | human-readable documents these days.
91 |
92 | Literate programs may contain multiple versions of the program
93 | --------------------------------------------------------------
94 |
95 | Versioning is one of the biggest problems I've had with the
96 | previous version of handaxeweb, written in Python.
97 |
98 | When I write a literate program, there are often bits of it
99 | that are present for scaffolding in initial versions which
100 | then should be removed in future versions. This is especially
101 | true with these bootstrapping-compiler things
102 | I've been writing lately, where the
103 | initial version of the bootstrapping compiler supports a
104 | minimal number of features and can barely compile itself,
105 | while later versions share a lot of code with the first
106 | version --- but all the versions coexist simultaneously, and
107 | I want to be able to make a bug-fix in the shared code.
108 |
109 | The programming language itself can provide some support for
110 | this, as e.g. CSS does. But what about the case where the
111 | language itself doesn’t help much?
112 |
113 | One obviously possible approach is to redefine the program
114 | from the root down; e.g., first you say
115 |
116 | in the initial version:
117 | <>
118 | <>
119 | <>
120 |
121 | And defining each of those pieces:
122 |
123 | in initializations:
124 | <>
125 | <>
126 |
127 | etc., and then for the next version:
128 |
129 | in the new version:
130 | <>
131 | <>
132 | <>
133 |
134 | with new versions of whatever treenodes have changed, such as:
135 |
136 | in new initializations:
137 | <>
138 | <>
139 |
140 | and “new initialize I/O layer”.
141 |
142 | Obviously this is pretty suboptimal in terms of requiring a
143 | lot of copy-and-pasted text that doesn’t really help the
144 | reader.
145 |
146 | Version numbers on chunks allow such versions gracefully
147 | --------------------------------------------------------
148 |
149 | Here’s a better idea. Every named chunk can have several
150 | versions, each with a version number. The name of the chunk
151 | when it’s being defined may end with “v312” to indicate that
152 | the text that follows belongs to version 312. Otherwise, it
153 | belongs to version 0. You can tangle any version N of any
154 | chunk; this will use the highest-numbered version <= N of
155 | each referenced chunk.
156 |
157 | This means that you can get the effect of the repetition
158 | above simply by saying:
159 |
160 | in initialize I/O layer v1:
161 |
162 | and then tangling v1 of “initial version”.
163 |
164 | The syntax of handaxeweb
165 | ------------------------
166 |
167 | The previous version of handaxeweb uses
168 | indented lines of the form “(in foo)” to start new named
169 | chunks. This is pretty reasonable, but it would be better if
170 | the line could be a valid comment in whatever language, to
171 | better support syntax-highlighting. So the right thing to do
172 | is to omit leading and trailing punctuation, but require a
173 | trailing ":", as in the previous examples in this document.
174 |
175 | Beyond that, the syntax of `handaxeweb` is simply that
176 | program code is indented by four spaces, and references to
177 | other chunks are enclosed in `<<>>`.
178 |
179 | handaxeweb, the program
180 | -----------------------
181 |
182 | -- in handaxeweb.lua:
183 | #!/usr/bin/lua
184 |     <<definitions>>
185 | 
186 |     <<read input literate program>>
187 |     <<carry out specified action on it>>
188 |
189 | The main actions desired are to list the possible chunk names
190 | and version numbers, and to tangle a particular chunk with a
191 | particular version number.
192 |
193 | -- in carry out specified action on it:
194 | chunkname, version = ...
195 | if chunkname == nil then
196 | list_chunk_names_and_versions(chunks)
197 | else
198 | if version == nil then version = 0 end
199 | tangle(chunks, chunkname, tonumber(version))
200 | end
201 |
202 | The problem of reading the input program can be factored into
203 | a third subroutine:
204 |
205 | -- in read input literate program:
206 | local chunks = parse_input()
207 |
208 | So, the definitions so far needed:
209 |
210 | -- in definitions:
211 | <>
212 |
213 | <>
214 |
215 | <>
216 |
217 | These three need to share a common idea of the contents of
218 | the variable `chunks`. I think it should be a hash from chunk
219 | names to lists of chunk versions, where each version contains
220 | a version number and some text, stored as a list of
221 | lines.
222 |
223 | -- in an example of the chunks variable:
224 | {['read input literate program'] =
225 | {{v=0, text={"local chunks = parse_input()", ...},
226 | {v=1, ...}
227 | ...}
228 | },
229 | parse_input={{v=0...}, ...},
230 | ...
231 | }
232 |
233 | ### `parse_input` ###
234 |
235 | The job of `parse_input` is to turn the input file into such
236 | a structure. It looks for sequences of lines indented by at
237 | least four spaces to use as chunks; they may begin with a
238 | header line specifying their name and version, or they may
239 | just be a continuation of some previous chunk with a name and
240 | version.
241 |
242 | We start with a nameless chunk that will be discarded.
243 |
244 | -- in parse_input:
245 |     <<parse_input definitions>>
246 |
247 | function parse_input()
248 | local chunks, current_chunk, in_chunk = {}, {text={}}, false
249 | local blank_lines = {}
250 |
251 | for line in io.lines() do
252 | if string.match(line, "^%s*$") then -- blank line
253 |     <<handle blank line>>
254 |     elseif not in_chunk and is_indented(line) then
255 |     <<handle possible header line>>
256 |     in_chunk = true
257 |     elseif in_chunk and is_indented(line) then
258 |     <<handle normal indented line>>
259 |     else
260 |     blank_lines = {}
261 |     in_chunk = false
262 |     end
263 |     end
264 |     <<handle last chunk>>
265 |
266 | return chunks
267 | end
268 |
269 | Initially `current_chunk` is `nil`, and we don’t start a
270 | `current_chunk` until we see a header line. After that,
271 | `current_chunk.text` is always a list.
272 |
273 | We need special handling for blank lines because they can
274 | occur inside of an indented region, but not have any spaces
275 | on them, depending on editor settings. So in this case we
276 | leave untouched the `in_chunk` setting, telling us whether we're in the
277 | middle of an indented chunk, and we append the blank
278 | line to a list that gets incorporated only if more nonblank
279 | indented lines appear.
280 |
281 | -- in handle blank line:
282 | if in_chunk then table.insert(blank_lines, "") end
283 |
284 | Handling a normal indented line is very easy. Any parsing
285 | will be handled later by `tangle`.
286 |
287 | -- in handle normal indented line:
288 |         -- incorporate any blank lines seen in between indented lines
289 |         for _, blank_line in ipairs(blank_lines) do
290 |           table.insert(current_chunk.text, blank_line)
291 |         end
292 |         blank_lines = {}
293 | 
294 |         table.insert(current_chunk.text, unindented(line)) -- store the line with its four-space literate indent stripped
295 |
296 | The possible header line may be either a header line (not
297 | included in the chunk itself) or an ordinary chunk line,
298 | possibly adding more lines onto the previous chunk.
299 |
300 | -- in handle possible header line:
301 | local label = get_chunk_label(line)
302 |
303 | if label then -- if that succeeded, change chunks
304 | register_chunk(chunks, current_chunk)
305 | local name, ver = parse_chunk_label(label)
306 | current_chunk = {name = name, v = ver, text = {}}
307 | else
308 |           <<handle normal indented line>>
309 | end
310 |
311 | At the end of input, we just need to handle the last chunk:
312 |
313 | -- in handle last chunk:
314 | register_chunk(chunks, current_chunk)
315 |
316 | So the `parse_input` function itself depends on a few other
317 | functions:
318 |
319 | -- in parse_input definitions:
320 |     <<register_chunk>>
321 |
322 |     <<is_indented>>
323 |
324 |     <<unindented>>
325 |
326 |     <<get_chunk_label>>
327 |
328 |     <<parse_chunk_label>>
329 |
330 | `register_chunk` is the only thing that actually builds the
331 | table `chunks`. It has to deal with questions of
332 | duplicate-handling, and discard the initial nil chunk.
333 |
334 | With regard to duplicate-handling: if there are multiple
335 | chunks with the same name and version, then we concatenate
336 | them. This supports two important uses:
337 |
338 | 1. It allows you to intersperse formatted text with the lines
339 | of a chunk without having to add header lines all over the
340 | place. If you like, you can write your entire program this
341 | way, with just a single header line at the top.
342 |
343 | 2. It allows you to progressively add to multiple sections in
344 | parallel throughout your document. The example given in
345 | the CWEB manual is that you might have one section for all
346 | your global variables, progressively adding things to
347 | it. Some other examples follow: in C, it’s often
348 | convenient to put a declaration into a `.h` file at the
349 | same time as an implementation into a `.c` file; in a
350 | bytecode virtual machine, it may be convenient to put
351 | cases into a centralized `switch` statement at the same
352 | time as defining functions that those cases call.
353 |
354 | However, it may run into some difficulty with versioning. If
355 | you define a new version of a chunk, then in that version, it
356 | replaces all of the text in that chunk, not just one
357 | paragraph of it. Clearly if those paragraphs are spread all
358 | over your document, that’s going to be hard to get right.
359 |
360 | -- in register_chunk:
361 |     function register_chunk(chunks, new_chunk)
362 |       if new_chunk.name == nil then return end -- discard the initial nameless chunk
363 | 
364 |       local contents = chunks[new_chunk.name]
365 |       if not contents then -- first chunk seen under this name
366 |         contents = {}
367 |         chunks[new_chunk.name] = contents
368 |       end
369 | 
370 |       -- If there’s a duplicate, append text to it.
371 |       for _, it in ipairs(chunks[new_chunk.name]) do
372 |         if it.v == new_chunk.v then -- same name and same version: concatenate the texts
373 |           for _, line in ipairs(new_chunk.text) do
374 |             table.insert(it.text, line)
375 |           end
376 |           return
377 |         end
378 |       end
379 | 
380 |       -- No duplicate. Add to table.
381 |       table.insert(contents, new_chunk)
382 |     end
383 |
384 | The indentation functions are very simple.
385 |
386 | -- in is_indented:
387 | function is_indented(line)
388 | return string.match(line, "^ ")
389 | end
390 |
391 | assert( is_indented(" hi"))
392 | assert(not is_indented(" hi"))
393 | assert(not is_indented(" hi "))
394 |
395 | The `unindented` function assumes the line is indented.
396 |
397 | -- in unindented:
398 | function unindented(line) return string.sub(line, 5) end
399 | assert(unindented(" hi\n") == "hi\n")
400 |
401 | Recognizing the chunk labels is not too hard with Lua’s
402 | pattern-matching:
403 |
404 | -- in get_chunk_label:
405 |     function get_chunk_label(line)
406 |       return string.match(line, "^[^%w]*in (.*):[^%w]*$") -- header line: optional comment decoration, then "in NAME:"; returns NAME, or nil if the line is not a header
407 |     end
408 | 
409 |     assert(get_chunk_label("-- in handaxeweb.lua:") ==
410 |            "handaxeweb.lua")
411 |     assert(get_chunk_label("/* in handaxeweb.c: */") ==
412 |            "handaxeweb.c")
413 |     assert(get_chunk_label("# in a minute: #\n") ==
414 |            "a minute")
415 |
416 | Pulling the version number out can be done similarly easily.
417 |
418 | -- in parse_chunk_label:
419 |     function parse_chunk_label(label)
420 |       local name, version =
421 |         string.match(label, "(.*) v(%d+)$") -- a trailing " vN" suffix selects a version
422 |       if name then return name, tonumber(version)
423 |       else return label, 0 end -- no suffix: the whole label is the name, version defaults to 0
424 |     end
425 | 
426 |     assert(parse_chunk_label("foo") == "foo")
427 |     assert(({parse_chunk_label("foo")})[2] == 0)
428 |     assert(parse_chunk_label("foo v32") == "foo")
429 |     assert(({parse_chunk_label("foo v32")})[2] == 32)
430 |
431 | That covers all that’s needed to parse input.
432 |
433 | ### `tangle` ###
434 |
435 | This is the subroutine whose job it is
436 | to produce a runnable version of a
437 | literate program.
438 |
439 | Our `tangle` routine in this case is passed the name of an
440 | initial chunk and a version number. In order for it to be
441 | able to invoke itself recursively and still produce readable
442 | output (and, in Python, parseable output) it also takes an
443 | indentation parameter.
444 |
445 | -- in tangle:
446 |     <<tangle definitions>>
447 |
448 | function tangle(chunks, chunkname, version, indent)
449 | if indent == nil then indent = '' end
450 |
451 |       <<get the text of the chunk>>
452 |
453 | for _, line in ipairs(text) do
454 | local nindent, nchunkname = parse_reference(line)
455 | if nindent then
456 | tangle(chunks, nchunkname, version, indent..nindent)
457 | else
458 | io.write(indent..line.."\n")
459 | end
460 | end
461 | end
462 |
463 | This is simple enough: when we encounter a reference, we
464 | recurse, concatenating the indentation; and otherwise we
465 | simply indent the line and output it. (The indentation is
466 | essential for languages like Haskell and Python.)
467 |
468 | The process of getting the text must worry about error
469 | conditions.
470 |
471 | -- in get the text of the chunk:
472 |     local contents = chunks[chunkname] -- list of all registered versions of this chunk
473 |     if contents == nil then
474 |       error(string.format("chunk `%s` does not exist",
475 |                           chunkname))
476 |     end
477 | 
478 |     local text = get_chunk_text(contents, version) -- best available version not newer than `version`
479 |     if text == nil then
480 |       error(string.format("chunk `%s` has no version `%d`",
481 |                           chunkname, version))
482 |     end
483 |
484 | This depends on functions `get_chunk_text` and `parse_reference`.
485 |
486 | -- in tangle definitions:
487 |     <<get_chunk_text>>
488 |
489 |     <<parse_reference>>
490 |
491 | `get_chunk_text` need only walk the relevant part of the
492 | `chunks` table. Recall that the contents for a chunk are
493 | simply stored as a list of `{v=3, text="foo"}` structs, so we
494 | can pull them out as follows:
495 |
496 | -- in get_chunk_text:
497 |     function get_chunk_text(contents, version)
498 |       local best
499 |       for _, it in ipairs(contents) do
500 |         if it.v <= version and (not best or
501 |                                 it.v > best.v) then -- keep the highest version not exceeding the one requested
502 |           best = it
503 |         end
504 |       end
505 |       if best then return best.text else return nil end -- nil when every stored version is newer than requested
506 |     end
507 | 
508 |     do
509 |       local contents = {{v=0, text={"a"}},
510 |                         {v=2, text={"b"}},
511 |                         {v=1, text={"c"}}}
512 |       assert(get_chunk_text(contents, 0)[1] == "a")
513 |       assert(get_chunk_text(contents, 1)[1] == "c")
514 |       assert(get_chunk_text(contents, 2)[1] == "b")
515 |       assert(get_chunk_text(contents, 3)[1] == "b")
516 |       assert(get_chunk_text(contents, -1) == nil)
517 |     end
518 |
519 | `parse_reference` just needs to match the `<<chunkname>>`
520 | references and pull out whatever indentation precedes them;
521 | it turns out Lua’s pattern-matching can do this directly.
522 |
523 | -- in parse_reference:
524 |     function parse_reference(line)
525 |       return string.match(line, "^(%s*)<<(.*)>>(%s*)$") -- returns leading indent and chunk name (the trailing-whitespace capture is ignored by callers), or nil if not a reference
526 |     end
527 |
528 | do
529 |       local indent, name = parse_reference(" <<foo>>\n")
530 | assert(indent == " ")
531 | assert(name == "foo")
532 | assert(parse_reference("bits << shiftlen >> 1") == nil)
533 | end
534 |
535 | ### `list_chunk_names_and_versions` ###
536 |
537 | Given this structure, listing either the chunk names or the
538 | versions should be simple. Unfortunately, listing both of
539 | them is a little annoying, because the output then requires
540 | parsing. But we can take advantage of this to be more
541 | explanatory.
542 |
543 | We’d like to only list the names of *root chunks*, that is,
544 | those that aren’t included in any other chunk. Often there
545 | will be only one of them.
546 |
547 | -- in list_chunk_names_and_versions:
548 | function list_chunk_names_and_versions(chunks)
549 |       <<display help message>>
550 |
551 |       <<traverse chunks table>>
552 |
553 |       <<display versions>>
554 |
555 |       <<display chunk names>>
556 | end
557 |
558 | We’ll output one thing per line:
559 |
560 | -- in display help message:
561 | io.write("# Listing versions and root chunk names.\n")
562 | io.write("# Version 12 is displayed as:\n")
563 | io.write("# v 12\n")
564 | io.write("# Chunk name foo bar is displayed as:\n")
565 | io.write("# n foo bar\n")
566 | io.write("# To tangle a particular root chunk, run:\n")
567 | io.write("# "..arg[0].." chunkname\n")
568 | io.write("# That tangles version 0 by default; to specify v69:\n")
569 | io.write("# "..arg[0].." chunkname 69\n")
570 |
571 | We traverse the table to build up information for what we
572 | display later.
573 |
574 | -- in traverse chunks table:
575 |     local versions, referenced_chunks = {}, {}
576 |     for name, contents in pairs(chunks) do
577 |       for _, it in ipairs(contents) do
578 |         versions[it.v] = true -- collect the set of version numbers in use
579 | 
580 |         for _, line in ipairs(it.text) do
581 |           local _, chunkname = parse_reference(line)
582 |           if chunkname ~= nil then
583 |             referenced_chunks[chunkname] = true -- referenced by some chunk, hence not a root
584 |           end
585 |         end
586 |       end
587 |     end
588 |
589 | Then displaying the versions is easy; we need only to produce
590 | the keys from the versions table:
591 |
592 | -- in display versions:
593 |     for version, _ in pairs(versions) do -- note: pairs gives no particular order
594 |       io.write(string.format("v %d\n", version))
595 |     end
596 |
597 | Displaying the chunk names is almost as easy:
598 |
599 | -- in display chunk names:
600 |     for name, _ in pairs(chunks) do
601 |       if not referenced_chunks[name] then -- only root chunks are listed
602 |         io.write("n "..name.."\n")
603 |       end
604 |     end
605 |
606 | The build script
607 | ----------------
608 |
609 | Rebuilding handaxeweb from this document by hand is a
610 | little tedious. So here's a shell script that syntax-checks
611 | and double-compile checks.
612 |
613 | # in build_handaxeweb:
614 |     #!/bin/sh
615 |     set -ve # -v: echo each command; -e: abort on the first failure
616 |     ./handaxeweb.lua handaxeweb.lua 0 < handaxeweb.md > handaxeweb2.lua # bootstrap: tangle with the currently-installed tangler
617 | 
618 |     # test new version
619 |     lua handaxeweb2.lua handaxeweb.lua 0 < handaxeweb.md > handaxeweb3.lua
620 | 
621 |     # try building it with itself:
622 |     lua handaxeweb3.lua handaxeweb.lua 0 < handaxeweb.md > handaxeweb4.lua
623 | 
624 |     # verify output is the same:
625 |     diff handaxeweb3.lua handaxeweb4.lua # self-reproduction fixed point reached
626 | 
627 |     # okay, we’ll accept it
628 |     cp handaxeweb4.lua handaxeweb.lua
629 | 
630 |     ./handaxeweb.lua build_handaxeweb 0 < handaxeweb.md > build_handaxeweb.new
631 |     cp build_handaxeweb.new build_handaxeweb
632 |
633 | Flaws in handaxeweb
634 | -------------------
635 |
636 | There are several things I could do to improve this program
637 | without changing its functionality.
638 |
639 | (in this part of the document there is no code:)
640 | (This note is needed because of how
641 | Markdown structures nested lists, sigh.)
642 |
643 | * The state machine in `parse_input` is obtuse and bug-prone.
644 |
645 | * There are a number of subroutines and abstraction layers
646 | that would simplify the main program logic:
647 |
648 | * Appending one list to another (in two places).
649 | * Some kind of parsing machinery, probably.
650 | * An ordered container supporting insertion and
651 | nearest-match searching.
652 | * Set arithmetic; in particular, set subtraction.
653 | * Collections stuff: keys of a table, mapping a function
654 | over a list, printing all the items in a list.
655 |
656 | * Appending to a versioned chunk is still kind of
657 | inconvenient. If you could say `<>` this
658 | problem would mostly go away.
659 |
660 | * The default to output should probably be the last version,
661 | not version 0.
662 |
663 | * There’s still no syntax highlighting or tables of contents
664 | in the output.
665 |
666 | * Emacs isn’t smart enough to do syntax highlighting in the
667 | input.
668 |
669 | * Compiler error messages are subpar because handaxeweb
670 | doesn’t know enough to generate `#line` directives. (And
671 | for some languages, there is no such thing.)
672 |
673 | Probably the right thing to do for some of these problems is
674 | to use parsing tools to parse the input.
675 |
676 | ### a PEG for handaxeweb’s input ###
677 |
678 | # in a PEG for handaxeweb:
679 | # Top-level constructs, down to the paragraph level:
680 | litprog <- (!chunk (bl / textpara / codepara))* chunk*.
681 | chunk <- header (textpara* !header codepara)*.
682 | codepara <- first: indented+ more: (bl+ indented+)*.
683 | textpara <- bl* unindented+ bl*.
684 |
685 | # Types of lines:
686 | header <- indent nonalnum* "in " defname ":" nonalnum* nl.
687 | indented <- !bl indent (more: wsp* reference / text: normal+) nl.
688 | bl <- wsp* nl. # Blank line.
689 | unindented <- !indent normal+ nl.
690 |
691 | # Syntax within lines:
692 | defname <- name: (!version normal)* version.
693 | version <- (" v" n: number+ / ) !!":".
694 | reference <- "<<" name: (!">>" normal)* ">>".
695 | indent <- " ".
696 |
697 | # Character classes:
698 | nonalnum <- !alnum normal.
699 | alnum <- uppercase / lowercase / number.
700 | uppercase <- "A" / "B" / "C" / "D" / "E" / "F" / "G" /
701 | "H" / "I" / "J" / "K" / "L" / "M" / "N" /
702 | "O" / "P" / "Q" / "R" / "S" / "T" / "U" /
703 | "V" / "W" / "X" / "Y" / "Z".
704 | lowercase <- "a" / "b" / "c" / "d" / "e" / "f" / "g" /
705 | "h" / "i" / "j" / "k" / "l" / "m" / "n" /
706 | "o" / "p" / "q" / "r" / "s" / "t" / "u" /
707 | "v" / "w" / "x" / "y" / "z".
708 | number <- "0" / "1" / "2" / "3" / "4" /
709 | "5" / "6" / "7" / "8" / "9".
710 | normal <- !nl char.
711 | nl <- "\n".
712 | wsp <- " " / "\t".
713 |
714 | And that pretty much covers the entire deep structure of the
715 | input. All the indentation, logic of blank lines between
716 | other indented lines, parsing of references, version numbers,
717 | carrying chunk headers from one indented region to the next,
718 | and so on, is in there. The only thing that really remains to
719 | be done is specifying what to do with it: concatenate the
720 | `first` and `more` parts of `codepara`s, default version
721 | numbers to zero, dump the codepara parts of chunks into a
722 | dictionary of ordered-search structures, and then run
723 | `tangle`.
724 |
725 | (The grammar is slightly different from the one implemented
726 | by my current implementation: it no longer allows : or >>, in
727 | different contexts, inside of chunk names.)
728 |
729 |
730 |
731 |
--------------------------------------------------------------------------------
/peg.md:
--------------------------------------------------------------------------------
1 | PEGs in a PEG
2 | =============
3 |
4 | So I was reading [Bryan Ford’s thesis][ford]
5 | about parsing expression grammars and packrat parsers,
6 | and I thought it would be fun to implement them
7 | and see how easy they really were.
8 |
9 | It turns out they’re not that hard;
10 | this document contains a one-page PEG parser generator
11 | that generates PEG parsers in JavaScript,
12 | along with an explanation of how it works,
13 | and some example applications.
14 | If you’ve ever thought
15 | that writing a compiler was deep magic
16 | because parsing would take you way too long to understand,
17 | this should show you
18 | that writing a compiler can be simple!
19 | (At least,
20 | if you already know how to program.)
21 |
22 | [ford]: http://pdos.csail.mit.edu/~baford/packrat/thesis/ "Packrat Parsing: a Practical Linear-Time Algorithm with Backtracking"
23 |
24 | What Are PEGs?
25 | --------------
26 |
27 | A PEG is a formal language description
28 | which describes how to parse some language —
29 | like a regular expression,
30 | it describes the structure of some set of strings.
31 |
32 | ### A Gentle Introduction by Example ###
33 |
34 | Here’s a simple PEG
35 | which describes simple arithmetic expressions
36 | with no operator precedence:
37 |
38 | # in an example arithmetic parser:
39 | sentence <- ('0' / '1' / '2' / '3' / '4' / '5' / '6' / '7' / '8' / '9')+
40 | ( ('+' / '-' / '*' / '×' / '/' / '÷') sentence / ).
41 |
42 | This says that a `sentence` is
43 | one or more digits,
44 | followed by either an operator and another `sentence`,
45 | or nothing.
46 | The parentheses are used for grouping;
47 | apostrophes `''` are used for literal text;
48 | slashes `/` are used for choice
49 | (“try parsing this, and if it doesn’t work out, try that”);
50 | a left arrow `<-` is used to attach a name
51 | (called a “nonterminal”)
52 | to a parsing rule;
53 | and `x+` means “one or more of `x`”.
54 |
55 | (Typically,
56 | each of the strings that belongs to a language,
57 | such as a program in a programming language,
58 | is called a “sentence” of that language;
59 | thus my choice of that nonterminal name.)
60 |
61 | So, to parse `2*30+4` as a `sentence`,
62 | first we try matching a `0` at the beginning,
63 | where there’s a `2`;
64 | that doesn’t work, so we try a `1`;
65 | that doesn't work, so we try a `2`.
66 | That does work, so then we try for repetition,
67 | looking for a second digit
68 | at the `*`.
69 | That doesn’t work out (after ten tries), so we zoom along and look for a `+`.
70 | The `*` isn’t a `+`, so after a couple of tries,
71 | we find out it’s a `*`.
72 | Then we try parsing a nested `sentence` starting at the `3`.
73 | This time, we match the `3` after three tries,
74 | and then when we look for a second digit, we find a `0`;
75 | the third try fails, so we look for a `+`, and find it;
76 | then we look for a second nested `sentence`.
77 | We match a `4` after four tries,
78 | but we don’t find another digit after it
79 | (because there isn’t anything after it),
80 | so we try to find an operator after it,
81 | which doesn’t work,
82 | so we try to find nothing after it
83 | (the emptiness after the `/` after `sentence`)
84 | which works,
85 | and we’re done.
86 |
87 | Notice that this doesn’t respect operator precedence
88 | (it gives `2*(30+4)` rather than `(2*30)+4`),
89 | and also associates to the right.
90 |
91 | Here’s an example PEG
92 | that handles operator precedence and parentheses,
93 | although not associativity:
94 |
95 | # in an example arithmetic parser with precedence:
96 | sentence <- term ('+'/'-') sentence / term.
97 | term <- atom ('*' / '×' / '/' / '÷') term / atom.
98 | atom <- number / '(' sentence ')'.
99 | number <- ('0' / '1' / '2' / '3' / '4' / '5' / '6' / '7' / '8' / '9')+.
100 |
101 | If we try to parse the same `2*30+4` with this grammar,
102 | we get down to `number` and parse the `2`,
103 | so `atom` succeeds with the `2`,
104 | and then `term` sucks up the `*`
105 | and then looks for an inner `term` at the `3`.
106 | Then `number` parses `30`,
107 | and the inner `term` looks for one of `*×/÷` after it,
108 | which doesn’t work out since what’s after it is a `+`,
109 | so it gives up on its first alternative and tries to parse
110 | just an `atom` starting at the `3`,
111 | rather than an `atom` followed by an operator and another term.
112 | Then `atom` sucks up the `30` just like before,
113 | and the inner `term` finishes,
114 | and then the outer `term` finishes,
115 | and it’s up to `sentence` to deal with the `+4` bit,
116 | which it does in the predictable way.
117 |
118 | It won’t handle `40-1-1` as `(40-1)-1` as you might hope, though.
119 | If you try to rewrite `sentence` to handle this
120 | as `sentence ('+'/'-') term / term`,
121 | you run into trouble —
122 | the first thing `sentence` does
123 | is try to parse a `sentence`,
124 | so you get into an infinite loop.
125 | There are different ways to ameliorate this problem
126 | by enhancing the parser generator,
127 | but in general,
128 | you can always figure out a way
129 | to modify the grammar
130 | to remove this “left recursion”;
131 | it just makes it a little more complicated
132 | to handle the results of the parser.
133 |
134 | (As an aside, most practical PEG systems
135 | let you abbreviate things like
136 | `('0' / '1' / '2' / '3' / '4' / '5' / '6' / '7' / '8' / '9')`
137 | as `[0-9]`,
138 | but the one in this document doesn’t.)
139 |
140 | That covers most of the stuff PEGs can do.
141 | A few things to notice:
142 |
143 | 1. They’re a little more verbose than regular expressions
144 | but a lot more powerful at understanding structure.
145 | And, like with regexps,
146 | you can do a hell of a lot
147 | in a few lines of code.
148 | 2. The obvious implementation is pretty slow;
149 | it spends a lot of time re-parsing
150 | things it’s already parsed
151 | and playing Cheese Shop with the next character.
152 | (“Have you got a 0?” “No.”
153 | “How about a 1?” “No.” ...)
154 | It turns out there are ways to solve this,
155 | although I don’t explore them in this document.
156 | 3. They have trouble with “left recursion”,
157 | which is where the first thing in a “foo” (say, `sentence`)
158 | can be a smaller “foo”.
159 |
160 | There’s one more big feature of PEGs:
161 | the ability to do negative lookahead, or negation.
162 | As an example, in C,
163 | a comment begins at a `/*`
164 | and continues until the next `*/`.
165 | But you can have `*` and `/` and even `/*` inside the comment,
166 | as long as there isn't a `*/`.
167 | Doing this in a regexp is a real pain and the result is unreadable.
168 | You end up with a regexp like `\/\*([^*]|\*[^/])*\*\/`,
169 | assuming you have to backslash your slashes.
170 | In a PEG, it looks like this:
171 |
172 | # in the C comment example PEG:
173 | comment <- '/*' (!'*/' char)* '*/'.
174 |
175 | That is, to parse a comment, first parse a `/*`,
176 | then as long as the next thing isn’t a `*/`, try to parse a `char`,
177 | and then parse a `*/`.
178 | (The `*` means “zero or more”,
179 | just as `+` means “one or more”.)
180 | You can write the same thing
181 | with Perl’s enhanced regexp features: `qr|/\*(?:(?!\*/).)*\*/|`,
182 | and it’s only slightly shorter,
183 | but I think it's not as clear.
184 |
185 | You might think that in `!'*/' char`,
186 | the negation of `'*/'` somehow *modifies* `char`.
187 | But it doesn’t, really;
188 | it just means that the parse fails
189 | at points in the input
190 | where `'*/'` can match,
191 | so `char`
192 | doesn’t get a chance to match there.
193 | Instead, we backtrack from matching the `'*/'`,
194 | break out of the loop,
195 | and get a chance to match the `'*/'` on the outside.
196 |
197 | You can use this magic PEG power
198 | for a variety of things that are traditionally painful.
199 | For example, most programming languages have keywords,
200 | which look like variables (or other identifiers)
201 | but are treated differently syntactically.
202 | In a PEG, you can write this:
203 |
204 | # in the keyword example PEG:
205 | keyword = ('if' / 'while' / 'for' / otherkeyword) !idchar.
206 | identifier = !keyword idstartchar idchar*.
207 |
208 | This first specifies that a `keyword`
209 | is one of the specified words
210 | as long as it's not followed by an `idchar`;
211 | then it specifies that when you’re trying to parse an `identifier`,
212 | first try to parse a `keyword`,
213 | and if that succeeds,
214 | then parsing the `identifier` should fail;
215 | but if there's no `keyword`,
216 | go ahead and try to parse an `idstartchar`
217 | followed by zero or more `idchar`s.
218 |
219 | Note that we throw away the results
220 | of trying to parse the `keyword` —
221 | we were only trying it in order to see
222 | if we shouldn’t do something else.
223 |
224 | ### If You’ve Taken a Compilers Class Lately ###
225 |
226 | I thought I’d stick this section in
227 | for the benefit of folks who are all up on the theory.
228 | The rest of the document
229 | doesn’t depend on it.
230 |
231 | PEGs specify how to *parse* a language,
232 | by contrast with context-free grammars,
233 | which primarily describe how to *generate* sentences of a language.
234 | This difference
235 | makes it much easier to construct parsers for PEGs;
236 | they can be straightforwardly converted
237 | into simple recursive-descent parsers
238 | performing limited backtracking,
239 | with each nonterminal becoming a parsing function.
240 | It also probably makes it much more difficult
241 | to prove properties of the language recognized by a PEG.
242 |
243 | PEGs can parse some languages
244 | that context-free grammars can’t,
245 | such as the language `a`n`b`n`c`n,
246 | that is,
247 | some number of `a`s,
248 | followed by the same number of `b`s,
249 | followed by the same number of `c`s.
250 | However, because PEGs can’t handle ambiguity,
251 | and because there’s a linear-time parsing algorithm for them,
252 | it is suspected that PEGs can’t parse
253 | all languages context-free grammars can.
254 | `S → a S a | a S b | b S a | b S b | a`
255 | is a simple context-free language
256 | which Ford conjectured cannot be parsed with a PEG;
257 | it describes strings of odd numbers of `a`s and `b`s
258 | in which the middle letter is an `a`.
259 | PEGs can parse all languages
260 | that can be parsed with LL(k) or LR(k) parsers.
261 |
262 | PEGs are more composable
263 | than LL(k) or LR(k) CFGs;
264 | because PEGs can’t handle ambiguity,
265 | it’s easy to predict the effect
266 | of adding new parsing rules to the grammar.
267 |
268 | You can parse general CFGs
269 | with a backtracking approach like the PEG approach;
270 | the difference is that each nonterminal
271 | must be able to succeed multiple times
272 | on the same input
273 | with different possible parses,
274 | in case something that follows it fails.
275 | Definite clause grammars in Prolog
276 | are one example of this strategy.
277 | In PEGs, once a nonterminal succeeds at some position,
278 | it throws away its backtracking state,
279 | so it can only produce at most one result at that position.
280 | As a consequence,
281 | even though there are PEGs that take exponential time to parse
282 | (if implemented the naïve way)
283 | CFGs with exponential-time parsing
284 | (again, if implemented the naïve way, as with DCGs)
285 | are much more common.
286 |
287 | (Allan Schiffman tells me
288 | that all you really need to do to make DCGs perform well
289 | is to put cuts in “the obvious places”,
290 | e.g. between statements.
291 | I haven’t tried it myself.)
292 |
293 | A Minimal PEG Language
294 | ----------------------
295 |
296 | The expressions in PEGs minimally contain
297 | (using the TDPL notation in the thesis)
298 | negation `!`,
299 | ordered choice or alternation `/`,
300 | concatenation or sequencing (denoted by juxtaposition),
301 | terminal strings (written in single quotes `''`),
302 | and nonterminals (written as bare words `foo`).
303 | (We can leave out repetition `*` and `+`,
304 | because as shown below,
305 | we can synthesize them.)
306 |
307 | Here’s a relatively minimal grammar
308 | describing a notation for a grammar
309 | with these features,
310 | the same one I used in the “Gentle Introduction” section,
311 | written in terms of itself:
312 |
313 | # in a minimal parsing expression grammar:
314 | _ <- sp _ / .
315 | sp <- ' ' / '\n' / '\t'.
316 | sentence <- _ rule sentence / _ rule.
317 | rule <- name _ '<-'_ choice '.'_.
318 | choice <- sequence '/'_ choice / sequence.
319 | sequence <- term sequence / .
320 | term <- '!'_ term / '\'' stringcontents '\''_ / name _.
321 | stringcontents <- stringchar stringcontents / .
322 | stringchar <- !'\\' !'\'' char / '\\' char.
323 | name <- namechar name / namechar.
324 | namechar <- !'!' !'\'' !sp !'<-' !'/' !'.' char.
325 |
326 | This all depends on the primitive nonterminal `char`,
327 | which I’m assuming matches any character,
328 | for some definition of character.
329 |
330 | The nonterminal `_` consumes any amount of whitespace.
331 | It’s used everywhere we want to consume whitespace,
332 | generally at the lowest possible level of the grammar,
333 | with the exception of `name`
334 | (on the theory that the whitespace
335 | is not really part of the name.)
336 | (Even though it has a funny non-alphabetic name,
337 | the language doesn’t treat it specially.
338 | I used to call it `s` but it was distracting.)
339 |
340 | There are three cases of the pattern `group <- item group / .`,
341 | which means `group` is zero or more things that match `item`.
342 | Because PEGs are greedy and don’t backtrack after returning,
343 | `group` will only ever parse
344 | the maximum possible number of `item` items.
345 | It’s not possible for a parsing failure after the `group`
346 | to cause `group` to backtrack and return a smaller number of `item` objects,
347 | the way it could in a parser for a context-free grammar,
348 | although a parsing failure inside the last `item` will indeed do so.
349 | This allows us to get by
350 | without a separate scanner for this grammar!
351 | One minor variation of this pattern
352 | is found in `sentence` and `name`,
353 | which match *one* or more of their elements,
354 | not *zero* or more.
355 |
356 | Note that the above grammar tells us how to parse the language,
357 | but doesn’t tell us anything about its semantics.
358 | But it’s nice and short.
359 |
360 | Adding Grouping
361 | ---------------
362 |
363 | The PEG language as written above is pretty weak.
364 | It doesn’t have grouping or repetition,
365 | although they can be emulated with the use of extra productions,
366 | as in the `foos` pattern explained above.
367 |
368 | We can add grouping by redefining `term` like this:
369 |
370 | # in a slightly more powerful parsing expression grammar:
371 | term <- '!'_ term / '\'' stringcontents '\''_ / name _
372 | / '('_ choice ')'_.
373 |
374 | This simplifies the grammar only slightly;
375 | we can rewrite `stringcontents` as follows:
376 |
377 | stringcontents <- (!'\\' !'\'' char / '\\' char) stringcontents / .
378 |
379 | A Diversion: Adding Repetition
380 | ------------------------------
381 |
382 | Although it turns out not to be very useful
383 | for what I’ll do next,
384 | adding the capability for repetition to the language
385 | makes it shorter and clearer.
386 |
387 | # in a more powerful PEG:
388 | sp <- ' ' / '\n' / '\t'.
389 | _ <- sp*.
390 | sentence <- _ (name _ '<-'_ choice '.'_)+.
391 | choice <- term* ('/'_ term*)*.
392 | term <- ('!'_ term / string / name / '('_ choice ')')_ ('+' / '*' / )_.
393 | string <- '\'' (!'\\' !'\'' char / '\\' char)* '\''_.
394 | meta <- '!' / '\'' / '<-' / '/' / '.' / '+' / '*' / '(' / ')'.
395 | name <- (!meta !sp char)+.
396 |
397 | That shrinks the grammar considerably,
398 | while significantly expanding
399 | the expressiveness of the grammar language it describes.
400 |
401 | Adding Result Expressions
402 | -------------------------
403 |
404 | In theory, the grammar as written could be useful.
405 | It’s expressive enough to describe
406 | the tree structure of a language,
407 | such as the PEG language defined above.
408 | So you could use it to parse some string
409 | into a syntax tree.
410 |
411 | However,
412 | it would be even more useful
413 | to have a version of the grammar language
414 | that can include result expressions
415 | written in some programming language
416 | that compute useful things.
417 | For example,
418 | you could use such a system
419 | to write and maintain a working compiler
420 | from PEG grammars to some programming language,
421 | or from some other language.
422 |
423 | A straightforward and readable way to do this
424 | is to label some parts of a sequence with names,
425 | and then to use those names in a result specification
426 | at the end of the sequence.
427 |
428 | Here’s an extension of the above grammar
429 | that allows for such names and result specifications:
430 |
431 | # in a PEG describing results:
432 | sp <- ' ' / '\n' / '\t'.
433 | _ <- sp _ / .
434 | sentence <- _ rule sentence / _ rule.
435 | rule <- name _ '<-'_ choice '.'_.
436 | choice <- sequence '/'_ choice / sequence.
437 | sequence <- term sequence / '->'_ expr / .
438 | expr <- '('_ exprcontents ')'_.
439 | exprcontents <- (!'(' !')' char / expr) exprcontents / .
440 | term <- name _ ':'_ term / '!'_ term / string / name _
441 | / '('_ choice ')'_.
442 | string <- '\'' stringcontents '\''_.
443 | stringcontents <- !'\\' !'\'' char stringcontents
444 | / '\\' char stringcontents / .
445 | meta <- '!' / '\'' / '<-' / '/' / '.' / '(' / ')' / ':' / '->'.
446 | name <- namechar name / namechar.
447 | namechar <- !meta !sp char.
448 |
449 | This adds the possibility
450 | that a term may be preceded by a colon and a name,
451 | and that a sequence may end
452 | with a `->` and a parenthesized expression.
453 |
454 | This lets you write things like
455 | `n: expr`
456 | and `expr _ -> (print("got expr"))`.
457 | It doesn’t place strong requirements
458 | on the embedded expression,
459 | so it can be in almost any language,
460 | but it does require that any parentheses inside of it
461 | be balanced.
462 | (If that's difficult in a certain case,
463 | due to embedded strings,
464 | maybe you can incorporate some commented-out parentheses
465 | to balance things.)
466 |
467 | A Metacircular Compiler-Compiler
468 | --------------------------------
469 |
470 | So let’s suppose that we want to use this result-expression facility
471 | to write a compiler for these grammars,
472 | producing a parser for the specified grammar
473 | in, say, JavaScript.
474 | We want to translate each parsing expression
475 | in the grammar language
476 | into an expression in the target language
477 | that parses
478 | the sub-language defined by that parsing expression.
479 | For example,
480 | we want to translate
481 | `choice <- sequence '/'_ choice / sequence.`
482 | into a recursive JavaScript function
483 | that parses expressions containing slash-separated `choice`s.
484 | Since it doesn’t specify a result expression,
485 | it’s sort of indeterminate what it should actually do,
486 | other than consume characters from the input stream
487 | until it finds something `choice` can't parse.
488 |
489 | So now we have to figure out
490 | what the semantics are
491 | of each of the various actions.
492 |
493 | I’m going to factor out
494 | the code generation parts
495 | into separate named blocks
496 | so that it’s relatively easy
497 | to have the parser, say,
498 | generate code in some other language,
499 | or just an abstract syntax tree.
500 |
501 | ### Whitespace ###
502 |
503 | Whitespace is fairly easy:
504 | it is a no-op.
505 |
506 | # in the metacircular compiler-compiler:
507 | sp <- ' ' / '\n' / '\t'.
508 | _ <- sp _ / .
509 |
510 | ### Rules ###
511 |
512 | Let’s compile each rule
513 | into a JavaScript function
514 | that parses the language described by that rule,
515 | and the grammar as a whole
516 | into the collection of these functions
517 | plus whatever support code is needed.
518 | (Here I’m going to use double angle-brackets `<<>>`
519 | to name chunks of code that aren’t given until later.)
520 |
521 | rule <- n: name _ '<-'_ body: choice '.'_ ->
522 | <>
523 | .
524 | sentence <- _ r: rule g: sentence -> (r + "\n" + g)
525 | / _ r: rule -> (r + "\n"
526 | <>
527 | ).
528 |
529 | The code to produce a function
530 | in JavaScript
531 | is quite straightforward:
532 |
533 | # in code to produce a function:
534 | (["function parse_", n, "(input, pos) {\n",
535 | <>
536 | body,
537 | <>
538 | "}\n"].join(''))
539 |
540 | So a grammar nonterminal named `term`
541 | will be compiled into a function called `parse_term`,
542 | whose body will be the value computed by `choice`,
543 | bracketed by some startup and cleanup code,
544 | and therefore `choice` needs to evaluate to
545 | a string of
546 | zero or more valid JavaScript statements.
547 |
548 | These functions
549 | will need to do several things
550 | to implement the semantics of a PEG parser:
551 |
552 | 1. Advance the input position,
553 | starting from the input position the caller passed in,
554 | and in case of success,
555 | communicate the new input position
556 | to the caller.
557 | 2. Save the input position
558 | (and any other state)
559 | in order to backtrack
560 | when a sequence inside a choice fails,
561 | or after testing a negation condition.
562 | They may have to save
563 | several input positions at once
564 | in cases where there is nested alternation.
565 | 3. Compute the value
566 | given by the result expressions in the grammar
567 | and, in case of success,
568 | pass it back to the caller,
569 | along with the new input position.
570 |
571 | In order to avoid global variables,
572 | we’re passing in the input string
573 | (which doesn’t change during a parse)
574 | and the current position in it
575 | as arguments to each parsing function.
576 |
577 | To package the value computed
578 | along with the new input position,
579 | we’ll return a JavaScript object
580 | with `val` and `pos` properties,
581 | like `{val: "foo", pos: 37}`.
582 | In case of failure,
583 | we’ll just return `null`.
584 |
585 | From here we’ll mostly work bottom-up.
586 |
587 | ### Names ###
588 |
589 | Names are used in two contexts:
590 | at the top level of a rule,
591 | they define the name of the nonterminal,
592 | and in a term,
593 | they request a call to that nonterminal.
594 | In both cases,
595 | we basically just need the contents of the name.
596 |
597 | # in the metacircular compiler-compiler:
598 | meta <- '!' / '\'' / '<-' / '/' / '.' / '(' / ')' / ':' / '->'.
599 | name <- c: namechar n: name -> (c + n) / namechar.
600 | namechar <- !meta !sp char.
601 |
602 | In this case,
603 | we presume that the value produced by `char`
604 | (and thus the value produced by `namechar`)
605 | is the character it consumed,
606 | and that in the absence of an explicit result expression,
607 | the result of the whole rule
608 | is that same character.
609 | This can be implemented, for example,
610 | by having a sequence return by default
611 | the value of the last term in it.
612 | (I’m not sure that’s a good default,
613 | because it seems a little error-prone,
614 | but I’ll try it.)
615 |
616 | ### Nonterminals ###
617 |
618 | A reference to a nonterminal
619 | is compiled as a call to its parsing function,
620 | passing in the current position.
621 |
622 | # in the metacircular compiler-compiler:
623 | term <- labeled / nonterminal / string / negation / parenthesized.
624 | nonterminal <- n: name _ ->
625 | <>
626 | .
627 |
628 | Again, the JS implementation
629 | of a subroutine call
630 | is quite simple:
631 |
632 | # in code to parse another nonterminal:
633 | ([' state = parse_', n, '(input, state.pos);\n'].join(''))
634 |
635 | This means we need a variable `state`
636 | to store this returned value in,
637 | and it needs to be initialized
638 | with the position passed in by the caller.
639 |
640 | # in function prologue:
641 | ' var state = { pos: pos };\n',
642 |
643 | What do we do with `state.val`?
644 | It depends on where the nonterminal is found.
645 | If it’s preceded by a label,
646 | we want to store it in a variable
647 | under that name
648 | for later use,
649 | unless it fails.
650 | Let’s have `term`,
651 | just like `choice`,
652 | return a string of zero or more valid JavaScript statements.
653 |
654 | # in the metacircular compiler-compiler:
655 | labeled <- label: name _ ':'_ value: term ->
656 | <>
657 | .
658 |
659 | We protect this with a conditional on `state`
660 | in case the parse has failed:
661 |
662 | # in code to save a value in a variable:
663 | ([value, ' if (state) var ', label, ' = state.val;\n'].join(''))
664 |
665 | (Ideally we would undo this saving
666 | if the nonterminal is in an alternative
667 | that fails and ends up being backtracked;
668 | but hopefully the result expressions
669 | of later alternatives
670 | will simply not use that variable.)
671 |
672 | Now,
673 | if the nonterminal
674 | was the last thing in a parsing function,
675 | then we want to return the `state.val` it gave us
676 | as our own `state.val`,
677 | and additionally we want to return its `state.pos`
678 | as our `state.pos`;
679 | or, if it failed,
680 | it returned `null`,
681 | in which case we want to return `null`.
682 |
683 | So at the end of the function,
684 | we can just return `state`:
685 |
686 | # in function epilogue:
687 | ' return state;\n',
688 |
689 | Now we just need to ensure
690 | that all of the other expression types
691 | (sequence, terminal strings, ordered choice, negation, parenthesized)
692 | update `state` in a manner analogous
693 | to how calls to nonterminals update `state`.
694 |
695 | While we're on the topic of nonterminals,
696 | we should probably define the one predefined nonterminal,
697 | `char`:
698 |
699 | # in support code:
700 | + 'function parse_char(input, pos) {\n'
701 | + ' if (pos >= input.length) return null;\n'
702 | + ' return { pos: pos + 1, val: input.charAt(pos) };\n'
703 | + '}\n'
704 |
705 | ### Sequence ###
706 |
707 | Sequences are relatively simple.
708 | Given a sequence of two expressions `foo bar`,
709 | we first parse `foo` from the current position,
710 | and if that succeeded,
711 | we parse `bar` from the new position.
712 | If it fails,
713 | the sequence as a whole fails,
714 | and there is no current position.
715 |
716 | This is one of the things
717 | that is easier to do
718 | if you don’t try to write your grammar with features like `*`,
719 | since it treats sequences of arbitrary numbers of things
720 | as nested sequences of two items,
721 | the innermost of which is empty.
722 |
723 | # in the bare grammar:
724 | sequence <- term sequence / '->'_ expr / .
725 |
726 | The case of an empty sequence
727 | doesn’t update `state` at all.
728 | In the case of a non-empty sequence,
729 | we execute `foo`,
730 | and if `foo` doesn’t set `state` to `null`,
731 | we execute `bar`.
732 |
733 | # in the metacircular compiler-compiler:
734 | sequence <- foo: term bar: sequence ->
735 | <>
736 | / result_expression / -> ('').
737 |
738 | The `result_expression` case
739 | is one of the last things explained,
740 | so ignore it for now.
741 |
742 | This will result in deeply nested if statements
743 | without proper indentation
744 | in the output
745 | when there is a long sequence,
746 | but that’s probably okay:
747 |
748 | # in code to handle a sequence:
749 | ([foo, ' if (state) {\n', bar, ' }\n'].join(''))
750 |
751 | ### Terminal Strings ###
752 |
753 | A “terminal” or literal string like `'->'`
754 | either matches some characters in the input
755 | or fails to do so.
756 | Rather than inserting code into every parsing function
757 | to compare parts of the input,
758 | making the parsing functions less readable,
759 | we’ll factor this out into a single “literal” function:
760 |
761 | # in support code:
762 | + 'function literal(input, pos, string) {\n'
763 | + ' if (input.substr(pos, string.length) === string) {\n'
764 | + ' return { pos: pos + string.length, val: string };\n'
765 | + ' } else return null;\n'
766 | + '}\n'
767 |
768 | So then we just need to emit code to call this function
769 | and update `state` appropriately
770 | when we encounter a terminal string.
771 | As it happens,
772 | the translation from string syntax in the PEG language
773 | to string syntax in JavaScript
774 | is the null transformation.
775 | If we were compiling to some other language,
776 | such as C,
777 | this might pose some difficulty.
778 |
779 | # in the metacircular compiler-compiler:
780 | string <- '\'' s: stringcontents '\''_ ->
781 | <>
782 | .
783 | stringcontents <- !'\\' !'\'' c: char s: stringcontents -> (c + s)
784 | / b: '\\' c: char s: stringcontents -> (b + c + s)
785 | / -> ('').
786 |
787 | So here’s the function call:
788 |
789 | # in code to match a literal string:
790 | ([" state = literal(input, state.pos, '", s, "');\n"].join(''))
791 |
792 | As we iterate through the characters or backslash-escapes
793 | inside the string, we convert them to strings —
794 | either by default,
795 | or explicitly by concatenating the backslash
796 | to the character that follows it.
797 | Then we call `literal`
798 | with the current position
799 | and it either returns `null`
800 | or gives us the new position and the value it matched
801 | as our new `state`.
802 |
803 | ### Ordered Choice ###
804 |
805 | Two of the remaining expression types
806 | (ordered choice, negation, but not terminal strings and parenthesized)
807 | can require backtracking.
808 | So we have to save a state
809 | and possibly restore that state.
810 |
811 | Here’s how ordered choice works;
812 | negation is fairly similar.
813 | In ordered choice,
814 | if the first alternative succeeds,
815 | we don’t try the others;
816 | but if it fails,
817 | we restore the previously saved state.
818 |
819 | This is complicated somewhat
820 | by the fact that we might be inside a parenthesized expression,
821 | so there may be a stack of previously saved states,
822 | even inside the same function.
823 |
824 | So on entry to the function, we create a stack:
825 |
826 | # in function prologue:
827 | ' var stack = [];\n',
828 |
829 | The grammar entry treats N-way choices
830 | like `labeled / negation / string / nonterminal / parenthesized`
831 | as nested 2-way choices
832 | like `labeled / (negation / (string / (nonterminal / parenthesized)))`.
833 | This is a little bit needlessly inefficient,
834 | since we’ll be using potentially four stack entries
835 | instead of one,
836 | but it will do for now.
837 |
838 | # in the metacircular compiler-compiler:
839 | choice <- a: sequence '/'_ b: choice ->
840 | <>
841 | / sequence.
842 |
843 | Execution of `b` is conditional on failure of `a`;
844 | if `a` succeeds,
845 | we simply discard the state
846 | we saved before trying it.
847 |
848 | # in code to handle a choice:
849 | ([' stack.push(state);\n',
850 | a,
851 | ' if (!state) {\n',
852 | ' state = stack.pop();\n',
853 | b,
854 | ' } else stack.pop();\n'].join(''))
855 |
856 | It’s only safe to push `state`
857 | rather than a copy of `state`
858 | because we never mutate the existing `state`;
859 | we only make new `state` objects.
860 |
861 | ### Negation ###
862 |
863 | Negation is `!x`:
864 |
865 | # in the metacircular compiler-compiler:
866 | negation <- '!'_ t: term ->
867 | <>
868 | .
869 |
870 | This is implemented by saving the parse state,
871 | trying to parse `x`,
872 | failing if parsing `x` succeeded,
873 | and otherwise proceeding from the saved parse state.
874 |
875 | # in code to handle negation:
876 | ([' stack.push(state);\n',
877 | t,
878 | ' if (state) {\n',
879 | ' stack.pop();\n',
880 | ' state = null;\n',
881 | ' } else state = stack.pop();\n'].join(''))
882 |
883 | You can use a double negative like `!!'->'`
884 | to write a “zero-width positive lookahead assertion” in Perl lingo.
885 | That compiles into this:
886 |
887 | # in the output of the compiler-compiler:
888 | stack.push(state);
889 | stack.push(state);
890 | state = literal(input, state.pos, '->');
891 | if (state) {
892 | stack.pop();
893 | state = null;
894 | } else state = stack.pop();
895 | if (state) {
896 | stack.pop();
897 | state = null;
898 | } else state = stack.pop();
899 |
900 | The initial `state` is assumed to be non-`null`.
901 | So after the call to `literal`,
902 | `state` is non-`null` iff the next couple of characters were `->`.
903 | Then, after the first `if`,
904 | `state` is non-`null` iff the next couple of characters *weren’t* `->`.
905 | Then, after the second `if`,
906 | it is again non-`null` iff the next couple of characters were `->`.
907 | And if it’s non-`null`,
908 | it’s the `state` you started with.
909 |
910 | So that does the right thing,
911 | perhaps a bit verbosely.
912 |
913 | ### Result Expressions ###
914 |
915 | A result expression
916 | gives a JavaScript expression
917 | to evaluate
918 | to get the value that a sequence parses to.
919 | Normally, it uses variable bindings
920 | produced by labels.
921 | The value it returns
922 | may become
923 | the value of the term (if the sequence is inside parentheses)
924 | or the value returned by a whole parsing function.
925 |
926 | # in the metacircular compiler-compiler:
927 | result_expression <- '->'_ result: expr _ ->
928 | <>
929 | .
930 |
931 | Note the `_` to discard whitespace.
932 |
933 | Of course,
934 | this is conditional
935 | on the parser not being in a failed state:
936 |
937 | # in code to handle result expressions:
938 | ([' if (state) state.val = ', result, ';\n'].join(''))
939 |
940 | The expression is delimited by parentheses `()`.
941 | The outermost pair of parentheses
942 | are kept,
943 | which simplifies the grammar
944 | and avoids tricky problems of operator precedence
945 | when the result expression is copied into the output program
946 | in the `state.val =` context above.
947 |
948 | # in the metacircular compiler-compiler:
949 | expr <- '('_ e: exprcontents ')' -> ('(' + e + ')').
950 | exprcontents <- c: (!'(' !')' char / expr) e: exprcontents -> (c + e)
951 | / -> ('').
952 |
953 | `result_expression` discards whitespace after the expression
954 | rather than having the expression production do it itself
955 | in order to preserve whitespace after right parens
956 | consumed by recursive calls to the expression production.
957 |
958 | ### Parenthesized Expressions ###
959 |
960 | Parenthesized expressions
961 | don’t need any real special handling;
962 | or, rather, the special handling
963 | consists of the `stack` variable everything uses to backtrack;
964 | the parentheses are only there
965 | to direct the parser how to parse `/` and `!` and so on.
966 |
967 | parenthesized <- '('_ body: choice ')'_ -> (body).
968 |
969 | ### Exporting ###
970 |
971 | We need one more thing
972 | if our grammar is to be loadable as a [CommonJS module][]
973 | by systems like [node.js][]:
974 |
975 | # in support code:
976 | + "if (typeof exports !== 'undefined')\n"
977 | + " exports.parse_sentence = parse_sentence;\n"
978 |
979 | [CommonJS module]: http://wiki.commonjs.org/wiki/Modules/1.1#Module_Context
980 | [node.js]: http://nodejs.org/
981 |
982 | This assumes that the grammar being processed
983 | has a production called `sentence`,
984 | which is the only thing that will be exported.
985 |
986 | The Whole Metacircular Compiler-Compiler
987 | ----------------------------------------
988 |
989 | Here’s the whole thing,
990 | extracted from this document:
991 |
992 | # in the output metacircular compiler-compiler:
993 | sp <- ' ' / '\n' / '\t'.
994 | _ <- sp _ / .
995 | rule <- n: name _ '<-'_ body: choice '.'_ ->
996 | (["function parse_", n, "(input, pos) {\n",
997 | ' var state = { pos: pos };\n',
998 | ' var stack = [];\n',
999 | body,
1000 | ' return state;\n',
1001 | "}\n"].join(''))
1002 | .
1003 | sentence <- _ r: rule g: sentence -> (r + "\n" + g)
1004 | / _ r: rule -> (r + "\n"
1005 | + 'function parse_char(input, pos) {\n'
1006 | + ' if (pos >= input.length) return null;\n'
1007 | + ' return { pos: pos + 1, val: input.charAt(pos) };\n'
1008 | + '}\n'
1009 | + 'function literal(input, pos, string) {\n'
1010 | + ' if (input.substr(pos, string.length) === string) {\n'
1011 | + ' return { pos: pos + string.length, val: string };\n'
1012 | + ' } else return null;\n'
1013 | + '}\n'
1014 | + "if (typeof exports !== 'undefined')\n"
1015 | + " exports.parse_sentence = parse_sentence;\n"
1016 | ).
1017 | meta <- '!' / '\'' / '<-' / '/' / '.' / '(' / ')' / ':' / '->'.
1018 | name <- c: namechar n: name -> (c + n) / namechar.
1019 | namechar <- !meta !sp char.
1020 | term <- labeled / nonterminal / string / negation / parenthesized.
1021 | nonterminal <- n: name _ ->
1022 | ([' state = parse_', n, '(input, state.pos);\n'].join(''))
1023 | .
1024 | labeled <- label: name _ ':'_ value: term ->
1025 | ([value, ' if (state) var ', label, ' = state.val;\n'].join(''))
1026 | .
1027 | sequence <- foo: term bar: sequence ->
1028 | ([foo, ' if (state) {\n', bar, ' }\n'].join(''))
1029 | / result_expression / -> ('').
1030 | string <- '\'' s: stringcontents '\''_ ->
1031 | ([" state = literal(input, state.pos, '", s, "');\n"].join(''))
1032 | .
1033 | stringcontents <- !'\\' !'\'' c: char s: stringcontents -> (c + s)
1034 | / b: '\\' c: char s: stringcontents -> (b + c + s)
1035 | / -> ('').
1036 | choice <- a: sequence '/'_ b: choice ->
1037 | ([' stack.push(state);\n',
1038 | a,
1039 | ' if (!state) {\n',
1040 | ' state = stack.pop();\n',
1041 | b,
1042 | ' } else stack.pop();\n'].join(''))
1043 | / sequence.
1044 | negation <- '!'_ t: term ->
1045 | ([' stack.push(state);\n',
1046 | t,
1047 | ' if (state) {\n',
1048 | ' stack.pop();\n',
1049 | ' state = null;\n',
1050 | ' } else state = stack.pop();\n'].join(''))
1051 | .
1052 | result_expression <- '->'_ result: expr _ ->
1053 | ([' if (state) state.val = ', result, ';\n'].join(''))
1054 | .
1055 | expr <- '('_ e: exprcontents ')' -> ('(' + e + ')').
1056 | exprcontents <- c: (!'(' !')' char / expr) e: exprcontents -> (c + e)
1057 | / -> ('').
1058 | parenthesized <- '('_ body: choice ')'_ -> (body).
1059 |
1060 | That’s 66 lines of code,
1061 | constituting a compiler
1062 | that can compile itself into JavaScript,
1063 | if you have a way to execute it.
1064 |
1065 | **XXX: a couple of lines are over 80 chars; fix this!**
1066 |
1067 | Bootstrapping to JavaScript
1068 | ---------------------------
1069 |
1070 | But, to actually execute this compiler-compiler,
1071 | you need a version already running,
1072 | so you can compile the compiler-compiler to JavaScript.
1073 |
1074 | ### Hand-compiling: a blind alley ###
1075 |
1076 | I started by trying to compile it by hand,
1077 | using YASnippet,
1078 | but after not very long, I gave up on that approach.
1079 | Here are the hand-compiled versions of
1080 | `sp <- ' ' / '\n' / '\t'.`
1081 | and `_ <- sp _ / .`
1082 |
1083 | # in the hand-compiled metacircular compiler-compiler:
1084 | function parse_sp(input, pos) {
1085 | var state = { pos: pos };
1086 | var stack = [];
1087 | stack.push(state);
1088 | state = literal(input, state.pos, ' ');
1089 | if (!state) {
1090 | state = stack.pop();
1091 | stack.push(state);
1092 | state = literal(input, state.pos, '\n');
1093 | if (!state) {
1094 | state = stack.pop();
1095 | state = literal(input, state.pos, '\t');
1096 | } else {
1097 | stack.pop();
1098 | }
1099 | } else {
1100 | stack.pop();
1101 | }
1102 | return state;
1103 | }
1104 | function parse__(input, pos) {
1105 | var state = { pos: pos };
1106 | var stack = [];
1107 | stack.push(state);
1108 | state = parse_sp(input, state.pos);
1109 | if (state) {
1110 | state = parse__(input, state.pos);
1111 | }
1112 | if (!state) {
1113 | state = stack.pop();
1114 | } else {
1115 | stack.pop();
1116 | }
1117 | return state;
1118 | }
1119 |
1120 | After thus inflating two lines of grammar
1121 | into 35 lines of JavaScript,
1122 | I knew I needed a better way.
1123 | At that rate,
1124 | the whole thing would be about 1200 lines.
1125 | That’s too much to debug,
1126 | even if YASnippet makes it relatively easy to type,
1127 | unless there's no easier way.
1128 |
1129 | But there is.
1130 |
1131 | ### A Bunch of Functions ###
1132 |
1133 | So, instead,
1134 | I'm writing one function
1135 | for each interesting recognition rule from the grammar,
1136 | returning the same result expressions
1137 | that the parsing function will.
1138 | Then I can construct
1139 | a sort of abstract syntax tree of the grammar
1140 | out of calls to these functions,
1141 | and it will only be a little larger than the grammar itself.
1142 |
1143 | For example,
1144 | the first rule `sp <- ' ' / '\n' / '\t'.`
1145 | will become:
1146 |
1147 | # in the ASTs made of function calls:
1148 | var sp_rule = rule('sp', choice(string(' '), choice(string('\\n'),
1149 | string('\\t'))));
1150 |
1151 | This is a bit of a cheat;
1152 | the innermost choice really parses as
1153 | `choice(sequence(string('\\n'), ''), sequence(string('\\t'), ''))`
1154 | but I'm hoping that doesn’t matter for now.
1155 |
1156 | Then at the end I can combine all of the variables
1157 | into a grammar.
1158 |
1159 | First I need the functions, though.
1160 |
1161 | I’m omitting `sp`
1162 | (likewise `_`, `meta`)
1163 | because they don’t produce interesting values.
1164 |
1165 | # in the bunch-of-functions version:
1166 | function rule(n, body) {
1167 | return (["function parse_", n, "(input, pos) {\n",
1168 | ' var state = { pos: pos };\n',
1169 | ' var stack = [];\n',
1170 | body,
1171 | ' return state;\n',
1172 | "}\n"].join(''));
1173 | }
1174 |
1175 | function sentence2(r, g) {
1176 | return (r + "\n" + g);
1177 | }
1178 |
1179 | function sentence1(r) {
1180 | return (r + "\n"
1181 | <>
1182 | );
1183 | }
1184 |
1185 | I’m omitting `name`
1186 | (likewise `expr`, `inner`, `exprcontents`, `stringcontents`)
1187 | because it just copies a character string from the input
1188 | into the output.
1189 | I can do that myself.
1190 | And I’m omitting `term`
1191 | because it just returns one of its children's values.
1192 |
1193 | function nonterminal(n) {
1194 | return [' state = parse_', n, '(input, state.pos);\n'].join('');
1195 | }
1196 | function labeled(label, value) {
1197 | return [value, ' if (state) var ', label, ' = state.val;\n'].join('');
1198 | }
1199 | function sequence(foo, bar) {
1200 | return [foo, ' if (state) {\n', bar, ' }\n'].join('');
1201 | }
1202 | function string(s) {
1203 | return [" state = literal(input, state.pos, '", s, "');\n"].join('');
1204 | }
1205 | function choice(a, b) {
1206 | return [
1207 | ' stack.push(state);\n',
1208 | a,
1209 | ' if (!state) {\n',
1210 | ' state = stack.pop();\n',
1211 | b,
1212 | ' } else {\n',
1213 | ' stack.pop();\n', // discard unnecessary saved state
1214 | ' }\n'].join('');
1215 | }
1216 | function negation(t) {
1217 | return [
1218 | ' stack.push(state);\n',
1219 | t,
1220 | ' if (state) {\n',
1221 | ' stack.pop();\n',
1222 | ' state = null;\n',
1223 | ' } else {\n',
1224 | ' state = stack.pop();\n',
1225 | ' }\n'].join('');
1226 | }
1227 | function result_expression(result) {
1228 | return [' state.val = ', result, ';\n'].join('');
1229 | }
1230 |
1231 | We’ll also need the support code
1232 | from the `sentence` rule,
1233 | except for the exporting of `parse_sentence`.
1234 |
1235 | function parse_char(input, pos) {
1236 | if (pos >= input.length) return null;
1237 | return { pos: pos + 1, val: input.charAt(pos) };
1238 | }
1239 | function literal(input, pos, string) {
1240 | if (input.substr(pos, string.length) === string) {
1241 | return { pos: pos + string.length, val: string };
1242 | } else return null;
1243 | }
1244 |
1245 | Then,
1246 | after all those functions are defined,
1247 | we can call them to build up the ASTs.
1248 |
1249 | <>
1250 |
1251 | The rule for `_` is quite straightforward:
1252 |
1253 | # in the ASTs made of function calls:
1254 | var __rule = rule('_',
1255 | choice(sequence(nonterminal('sp'), nonterminal('_')),
1256 | ''));
1257 |
1258 | The rule for `rule`
1259 | contains a rather long sequence,
1260 | which will be treated
1261 | as a deeply nested bunch
1262 | of two-element sequences.
1263 | But it’s hard to read and write it that way,
1264 | so I’m going to define a helper function `nseq`
1265 | to make a sequence of an arbitrary number
1266 | of sequence elements.
1267 |
1268 | function nseq() {
1269 | var rv = arguments[arguments.length-1];
1270 | for (var ii = arguments.length-2; ii >= 0; ii--)
1271 | rv = sequence(arguments[ii], rv);
1272 | return rv;
1273 | }
1274 |
1275 | This will fail (returning `undefined`)
1276 | if we call it with no arguments,
1277 | so let’s be sure not to do that.
1278 | Now we can define the rule for `rule`:
1279 |
1280 | var rule_rule = rule('rule',
1281 | nseq(labeled('n', nonterminal('name')), nonterminal('_'),
1282 | string('<-'), nonterminal('_'),
1283 | labeled('body', nonterminal('choice')),
1284 | string('.'), nonterminal('_'),
1285 | result_expression(
1286 | "[\"function parse_\", n, \"(input, pos) {\\n\",\n" +
1287 | " ' var state = { pos: pos };\\n',\n" +
1288 | " ' var stack = [];\\n',\n" +
1289 | " body, \n" +
1290 | " ' return state;\\n',\n" +
1291 | " \"}\\n\"].join('')")));
1292 |
1293 | `rule_rule` is clearly pretty verbose;
1294 | it's 12 lines,
1295 | and the corresponding `rule` function is 8 lines,
1296 | for a total of 20 lines for the “hand-compiled” version
1297 | of the original 7-line `rule` rule.
1298 | That’s a manageable expansion factor of about 3×.
1299 |
1300 | So, on to `sentence`.
1301 | I’ve played fast and loose
1302 | with leading whitespace here,
1303 | in order to retain some modicum of readability.
1304 |
1305 | var sentence_rule = rule('sentence',
1306 | choice(
1307 | nseq(nonterminal('_'),
1308 | labeled('r', nonterminal('rule')),
1309 | labeled('g', nonterminal('sentence')),
1310 | result_expression('r + "\\n" + g')),
1311 | nseq(nonterminal('_'),
1312 | labeled('r', nonterminal('rule')),
1313 | result_expression('r + "\\n"\n' +
1314 | "+ 'function parse_char(input, pos) {\\n'\n" +
1315 | "+ ' if (pos >= input.length) return null;\\n'\n" +
1316 | "+ ' return { pos: pos + 1, val: input.charAt(pos) };\\n'\n" +
1317 | "+ '}\\n'\n" +
1318 | "+ 'function literal(input, pos, string) {\\n'\n" +
1319 | "+ ' if (input.substr(pos, string.length) === string) {\\n'\n" +
1320 | "+ ' return { pos: pos + string.length, val: string };\\n'\n" +
1321 | "+ ' } else return null;\\n'\n" +
1322 | "+ '}\\n'\n" +
1323 | "+ 'if (typeof exports !== "+'"undefined"'+") {\\n'\n" +
1324 | "+ ' exports.parse_sentence = parse_sentence;\\n'\n" +
1325 | "+ '}\\n'\n"))));
1326 |
1327 | The quoting of the support code
1328 | is kind of confusing;
1329 | the original is one long string,
1330 | containing a bunch of `\n` newlines,
1331 | broken up into lines for readability,
1332 | joined by the `+` operator.
1333 | This version
1334 | is also one long string,
1335 | containing the lines of the original long string,
1336 | also broken up into lines for readability,
1337 | joined by the `+` operator.
1338 | So there are two levels of quoting.
1339 | The inner level has the `+` on the left and uses single quotes `''`,
1340 | and the outer level has the `+` on the right and uses double quotes `""`.
1341 |
1342 | The next rule is `meta`,
1343 | and it has a lot of `choice`s.
1344 | So we define something like `nseq`,
1345 | but for `choice`s.
1346 |
1347 | function nchoice() {
1348 | var rv = arguments[arguments.length-1];
1349 | for (var ii = arguments.length-2; ii >= 0; ii--)
1350 | rv = choice(arguments[ii], rv);
1351 | return rv;
1352 | }
1353 |
1354 | var meta_rule = rule('meta',
1355 | nchoice(string('!'), string('\\\''), string('<-'), string('/'),
1356 | string('.'), string('('), string(')'), string(':'),
1357 | string('->')));
1358 |
1359 |
1360 | The next few rules
1361 | are straightforward translations from the grammar.
1362 |
1363 | var name_rule = rule('name',
1364 | choice(nseq(labeled('c', nonterminal('namechar')),
1365 | labeled('n', nonterminal('name')),
1366 | result_expression('c + n')),
1367 | nonterminal('namechar')));
1368 | var namechar_rule = rule('namechar',
1369 | nseq(negation(nonterminal('meta')),
1370 | negation(nonterminal('sp')), nonterminal('char')));
1371 | var term_rule = rule('term',
1372 | nchoice(nonterminal('labeled'), nonterminal('nonterminal'),
1373 | nonterminal('string'), nonterminal('negation'),
1374 | nonterminal('parenthesized')));
1375 | var nonterminal_rule = rule('nonterminal',
1376 | nseq(labeled('n', nonterminal('name')), nonterminal('_'),
1377 | result_expression("[' state = parse_', n, " +
1378 | "'(input, state.pos);\\n'].join('')")));
1379 | var labeled_rule = rule('labeled',
1380 | nseq(labeled('label', nonterminal('name')), nonterminal('_'),
1381 | string(':'), nonterminal('_'),
1382 | labeled('value', nonterminal('term')),
1383 | result_expression("[value, ' if (state) var ', " +
1384 | "label, ' = state.val;\\n'].join('')")));
1385 | var sequence_rule = rule('sequence',
1386 | nchoice(nseq(labeled('foo', nonterminal('term')),
1387 | labeled('bar', nonterminal('sequence')),
1388 | result_expression("[foo, ' if (state) {\\n', " +
1389 | "bar, ' }\\n'].join('')")),
1390 | nonterminal('result_expression'),
1391 | sequence(result_expression("''"))));
1392 |
1393 | That’s 29 lines,
1394 | transliterating 12 lines from the grammar,
1395 | and now the transliteration is halfway done.
1396 |
1397 | var string_rule = rule('string',
1398 | nseq(string("\\'"), labeled('s', nonterminal('stringcontents')),
1399 | string("\\'"), nonterminal('_'),
1400 | result_expression('[" state = literal(input, state.pos, ' +
1401 | '\'", s, "\');\\n"].join(\'\')')));
1402 | var stringcontents_rule = rule('stringcontents',
1403 | nchoice(nseq(negation(string("\\\\")), negation(string("\\'")),
1404 | labeled('c', nonterminal('char')),
1405 | labeled('s', nonterminal('stringcontents')),
1406 | result_expression('c + s')),
1407 | nseq(labeled('b', string("\\\\")),
1408 | labeled('c', nonterminal('char')),
1409 | labeled('s', nonterminal('stringcontents')),
1410 | result_expression('b + c + s')),
1411 | result_expression("''")));
1412 |
1413 | For `choice` I’m omitting not only whitespace
1414 | but also a comment.
1415 |
1416 | var choice_rule = rule('choice',
1417 | choice(nseq(labeled('a', nonterminal('sequence')),
1418 | string('/'), nonterminal('_'),
1419 | labeled('b', nonterminal('choice')),
1420 | result_expression(
1421 | "[' stack.push(state);\\n',\n" +
1422 | " a,\n" +
1423 | " ' if (!state) {\\n',\n" +
1424 | " ' state = stack.pop();\\n',\n" +
1425 | " b,\n" +
1426 | " ' } else {\\n',\n" +
1427 | " ' stack.pop();\\n',\n" +
1428 | " ' }\\n'].join('')")),
1429 | nonterminal('sequence')));
1430 | var negation_rule = rule('negation',
1431 | nseq(string('!'), nonterminal('_'), labeled('t', nonterminal('term')),
1432 | result_expression(
1433 | "[' stack.push(state);\\n',\n" +
1434 | " t,\n" +
1435 | " ' if (state) {\\n',\n" +
1436 | " ' stack.pop();\\n',\n" +
1437 | " ' state = null;\\n',\n" +
1438 | " ' } else {\\n',\n" +
1439 | " ' state = stack.pop();\\n',\n" +
1440 | " ' }\\n'].join('')")));
1441 | var result_expression_rule = rule('result_expression',
1442 | nseq(string('->'), nonterminal('_'),
1443 | labeled('result', nonterminal('expr')),
1444 | result_expression("[' if (state) state.val = ', " +
1445 | "result, ';\\n'].join('')")));
1446 | var expr_rule = rule('expr',
1447 | nseq(string('('), nonterminal('_'),
1448 | labeled('e', nonterminal('exprcontents')),
1449 | string(')'), nonterminal('_'),
1450 | result_expression('e')));
1451 | var inner_rule = rule('inner',
1452 | nseq(string('('), nonterminal('_'),
1453 | labeled('e', nonterminal('exprcontents')),
1454 | string(')'),
1455 | result_expression("'(' + e + ')'")));
1456 | var exprcontents_rule = rule('exprcontents',
1457 | choice(
1458 | nseq(labeled('c',
1459 | choice(nseq(negation(string('(')),
1460 | negation(string(')')),
1461 | nonterminal('char')),
1462 | nonterminal('inner'))),
1463 | labeled('e', nonterminal('exprcontents')),
1464 | result_expression('c + e')),
1465 | result_expression("''")));
1466 | var parenthesized_rule = rule('parenthesized',
1467 | nseq(string('('), nonterminal('_'),
1468 | labeled('body', nonterminal('choice')),
1469 | string(')'), nonterminal('_'),
1470 | result_expression('body')));
1471 |
1472 | So that’s all the rules.
1473 | Now we just need to assemble them into a sentence,
1474 | using a technique similar to `nseq` and `nchoice`.
1475 |
1476 | function nsentence() {
1477 | var rv = sentence1(arguments[arguments.length-1]);
1478 | for (var ii = arguments.length-2; ii >= 0; ii--)
1479 | rv = sentence2(arguments[ii], rv);
1480 | return rv;
1481 | }
1482 |
1483 | var all_rules = nsentence(sp_rule, __rule, rule_rule, sentence_rule,
1484 | meta_rule, name_rule, namechar_rule, term_rule,
1485 | nonterminal_rule, labeled_rule, sequence_rule,
1486 | string_rule, stringcontents_rule, choice_rule,
1487 | negation_rule, result_expression_rule, expr_rule,
1488 | inner_rule, exprcontents_rule, parenthesized_rule);
1489 |
1490 | Now the variable `all_rules`
1491 | has a working parser in it
1492 | in JavaScript.
1493 |
1494 | To get a usable `parse_sentence` function,
1495 | we need to `eval` that script:
1496 |
1497 | eval(all_rules);
1498 |
1499 | And then we can export the function:
1500 |
1501 | if (typeof exports !== 'undefined') exports.parse_sentence = parse_sentence;
1502 |
1503 | ### The Output Parser in JavaScript ###
1504 |
1505 | I used to include here
1506 | the contents of `all_rules`
1507 | after a couple of iterations.
1508 | It’s ten pages long (660 lines),
1509 | and the compile takes
1510 | about 3–5 seconds on my machine,
1511 | although it’s under 100ms on modern computers.
1512 | However,
1513 | I decided that it was too much to want to include it here;
1514 | this document is for reading.
1515 | If you `git clone` it,
1516 | it’s in `output.js`.
1517 |
1518 | Cross-Compiling to Lua
1519 | ----------------------
1520 |
1521 | It was a lot of trouble
1522 | getting the short compiler-compiler above
1523 | to an actually runnable state;
1524 | I had to write and debug,
1525 | basically,
1526 | two copies of the same code.
1527 | It would have been much easier
1528 | if I’d already happened to have such a compiler-compiler around
1529 | that I could use to compile my grammar with.
1530 |
1531 | Well,
1532 | for the program I’m using
1533 | to extract the code from this document,
1534 | which I call HandAxeWeb,
1535 | I would like to have such a compiler-compiler
1536 | to generate code in Lua.
1537 |
1538 | So I’m going to define a “version 2”
1539 | of the compiler-compiler
1540 | which,
1541 | instead of generating JS code,
1542 | generates Lua code.
1543 | (It is still written in JS, though.)
1544 |
1545 | First,
1546 | instead of producing JS functions for rules,
1547 | we produce Lua functions for rules:
1548 |
1549 | # in code to produce a function v2:
1550 | (['function parse_',n,'(input, pos)\n',
1551 | <>
1552 | body,
1553 | <>
1554 | 'end\n'].join(''))
1555 |
1556 | Invoking nonterminals needs no change;
1557 | JS and Lua syntax overlap here.
1558 | But local variable declaration
1559 | and finite maps
1560 | look different:
1561 |
1562 | # in function prologue v2:
1563 | ' local state = { pos = pos }\n',
1564 |
1565 | We have to declare variables
1566 | outside their conditional;
1567 | Lua’s scoping rules here
1568 | change the semantics somewhat
1569 | because unless you declare the variables
1570 | at the top of the function
1571 | you can’t write a rule like
1572 | `x <- (bar y: foo / baz y: quux) -> (y)`
1573 | and have it work
1574 | because the inner `y` variables
1575 | are declared in an inner block in Lua,
1576 | while in JS
1577 | they automatically belong to the whole function.
1578 |
1579 | # in code to save a value in a variable v2:
1580 | ([value,
1581 | ' local ',label,'\n',
1582 | ' if state then ',label,' = state.val end\n'].join(''))
1583 |
1584 | The `parse_char` and `literal` functions
1585 | are a bit different;
1586 | remember, Lua numbers
1587 | character positions in strings
1588 | from 1,
1589 | and the second argument to its `string.sub`
1590 | is not a length but an ending index:
1591 |
1592 | # in support code v2:
1593 | + 'function parse_char(input, pos)\n'
1594 | + ' if pos > #input then return nil end\n'
1595 | + ' return { pos = pos + 1, \n'
1596 | + ' val = string.sub(input, pos, pos) }\n'
1597 | + 'end\n'
1598 | + 'function literal(input, pos, needle)\n'
1599 | + ' if string.sub(input, pos, pos + #needle - 1)\n'
1600 |     + '      == needle then\n'
1601 | + ' return { pos = pos + #needle, val = needle }\n'
1602 | + ' else return nil end\n'
1603 | + 'end\n'
1604 |
1605 | The code to invoke `literal`
1606 | doesn’t actually need to change.
1607 |
1608 | Sequence-handling differs only in minor bits of syntax:
1609 |
1610 | # in code to handle a sequence v2:
1611 | ([foo, ' if state then\n', bar, ' end\n'].join(''))
1612 |
1613 | Initializing the stack is a little different:
1614 |
1615 | # in function prologue v2:
1616 | ' local stack = {}\n',
1617 |
1618 | Ordered choice looks quite similar to JS:
1619 |
1620 | # in code to handle a choice v2:
1621 | ([' table.insert(stack, state)\n',
1622 | a,
1623 | ' if not state then\n',
1624 | ' state = table.remove(stack)\n',
1625 | b,
1626 | ' else\n',
1627 | ' table.remove(stack)\n',
1628 | ' end\n'].join(''))
1629 |
1630 | Negation too:
1631 |
1632 | # in code to handle negation v2:
1633 | ([' table.insert(stack, state)\n',
1634 | t,
1635 | ' if state then\n',
1636 | ' table.remove(stack)\n',
1637 | ' state = nil\n',
1638 | ' else\n',
1639 | ' state = table.remove(stack)\n',
1640 | ' end\n'].join(''))
1641 |
1642 | Result expressions too:
1643 |
1644 | # in code to handle result expressions v2:
1645 | ([' if state then state.val = ',result,' end\n'].join(''))
1646 |
1647 | And that is sufficient
1648 | to be able to generate compilers in Lua
1649 | from grammars whose result expressions are in Lua.
1650 | Unfortunately,
1651 | it’s still not good enough
1652 | to generate a metacircular compiler-compiler in Lua
1653 | from the grammar given here,
1654 | because that grammar is written in JS,
1655 | even though it generates Lua code.
1656 |
1657 | It would be relatively straightforward
1658 | to make the modification needed to the grammar quite minor:
1659 | all the result expressions
1660 | merely concatenate a bunch of strings,
1661 | and if they did so by calling a function,
1662 | you’d only need to redefine that function
1663 | in the two target languages;
1664 | in JS, something like
1665 | `Array.prototype.slice.apply(arguments).join('')`
1666 | and in Lua, something like
1667 | `table.concat({...})`.
1668 |
1669 | But this is sort of unnecessary.
1670 | Really, we just need to be able to compile our parsers
1671 | using node.js.
1672 |
1673 | TODO
1674 | ----
1675 |
1676 | - memoization
1677 | - performance measurement: it takes minimally 252ms to compile itself
1678 | on my netbook, wallclock, under whatever version of Node I’m using.
1679 | That's pretty pessimal; it's about 11 or 12 kilobytes per second,
1680 | close to a hundred thousand clock cycles per byte. Follow sets
1681 | may offer a way to improve that by probably an order of magnitude.
1682 | - re-add repetition `+` and `*` (in a later version)
1683 | - factor out loopbody? like,
1684 | loopbody <- term: body -> (loop body code).
1685 | zero_or_more <- loopbody: body -> (body).
1686 | one_or_more <- loopbody: body -> (body + 'if ...').
1687 | - how about removing `()` grouping? It leaves “a PEG describing
1688 | results” (and “the metacircular compiler-compiler”) one line shorter
1689 | and one line longer, but perhaps it could simplify backtracking by
1690 | eliminating the explicit stack? Because then each parsing function
1691 | would only need to contain one level of backtracking for `/` and one
1692 | for `!` — oh, well, hmm, `!` might be tricky if we want to support
1693 | positive lookahead too. Probably better to leave the stack in.
1694 | - Rewrite the Lua handaxeweb to use a PEG parser.
1695 | - maybe: rewrite the Lua handaxeweb to be written in JS with Node?
1696 | The whole Lua story (“in a later version, this program switched to
1697 | generating Lua grammars and lost the ability to compile itself”)
1698 | kind of stinks. And writing 39 to 48 lines of code to "port" a
1699 | 66-line program also seems kind of silly, like it may not justify
1700 | the abstraction overhead that permits it.
1701 | - maybe: reorganize this document, putting bootstrap.js first? Not sure.
1702 | - maybe: write a Markdown parser?
1703 | - move Makefile and pegcompile.js into this document?
1704 |
1705 | Profiling results
1706 | -----------------
1707 |
1708 | **XXX these are rough notes that should be cleaned up**
1709 |
1710 | I profiled this thing compiling itself in Arora.
1711 |
1712 | It contains 2939 characters, but makes 32370 calls to `literal`, which
1713 | is about 25% of its CPU time, I think (the profile output is a little
1714 | hard to interpret; some of the numbers are over 100%, probably due to
1715 | recursion, and 85% is attributed merely to “program”). `parse_meta`
1716 | takes more than a third of the CPU time, largely by virtue of calling
1717 | `literal` several times. It also makes 41903 calls each to `push` and
1718 | `pop`.
1719 |
1720 | That means it's testing about 11 literals per character, and
1721 | backtracking 14 times. I could be wrong but I don’t think much of
1722 | this would be improved by memoizing; computing follow sets is likely
1723 | to make a bigger difference by avoiding the majority of that
1724 | backtracking.
1725 |
1726 | `parse_char` is called 4219 times, mostly from `parse_exprcontents`.
1727 |
1728 | Building up the output tree with `string.join` takes only about 0.6%
1729 | of its time.
1730 |
1731 | I suspect that current WebKit has a much better profiler.
1732 |
1733 | Firebug agrees on most things (23.87% in `literal`), but it has the
1734 | interesting result that actually 17% of the time is in `compile`,
1735 | which was called only once and does little more than call `eval`. So
1736 | apparently the time to generate the output JS was only about 4x the
1737 | time needed for SpiderMonkey to compile it!
1738 |
1739 | Other Interesting PEGs
1740 | ----------------------
1741 |
1742 | Here’s some nifty stuff you can do
1743 | with the one-page parser generator described above.
1744 |
1745 | ### CSV files ###
1746 |
1747 | [Ierusalemschy][ier] gives this grammar
1748 | for parsing Excel-style CSV files:
1749 |
1750 | # in the LPEG notation with captures:
1751 |     record <- (field (',' field)*)->{} (%nl / !.)
1752 |     field <- escaped / nonescaped
1753 | nonescaped <- { [^,"%nl]* }
1754 | escaped <- '"' {~ ([^"] / '""'->'"')* ~} '"'
1755 |
1756 | The `{}` capture pieces of text
1757 | and `{~ ~}` capture and replace them.
1758 | `*` is for repetition,
1759 | `%nl` is `'\n'`,
1760 | `""` are equivalent to `''`,
1761 | `.` is our `char`,
1762 | `[abc]` is a character class equivalent to `( 'a' / 'b' / 'c' )`,
1763 | and `->{}` means “make a list of the results”.
1764 | In the notation I’ve used for PEGs here,
1765 | without repetition features,
1766 | this looks like this:
1767 |
1768 | # in csv.peg:
1769 | sentence <- d: (f: field ',' r: sentence -> ([f].concat(r))
1770 | / f: field -> ([f])) ('\n' / !char)
1771 | -> (d).
1772 | field <- escaped / nonescaped.
1773 | normal_char <- !',' !'"' !'\n' char.
1774 | nonescaped <- c: normal_char s: nonescaped -> (c + s) / normal_char.
1775 | escaped_inner_char <- !'"' char / '""' -> ('"').
1776 | escaped_inner <- c: escaped_inner_char s: escaped_inner -> (c + s)
1777 | / escaped_inner_char.
1778 | escaped <- '"' s: escaped_inner '"' -> (s).
1779 |
1780 | That’s 2½ times as big,
1781 | which is unreasonable.
1782 | If we have `*` repetition that makes JavaScript Arrays,
1783 | we can write it with only a bit more ugliness
1784 | than in LPEG:
1785 |
1786 | # in csvstar.peg:
1787 | sentence <- h: field t: (',' field)* ('\n' / !char) -> ([h].concat(t)).
1788 | field <- escaped / nonescaped.
1789 | nonescaped <- s: (!',' !'"' !'\n' char)* -> (s.join('')).
1790 | escaped <- '"' s: (!'"' char / '""' -> ('"'))* '"' -> (s.join('')).
1791 |
1792 | [ier]: http://www.inf.puc-rio.br/~roberto/docs/peg.pdf "A Text Pattern-Matching Tool based on Parsing Expression Grammars, 2008, SP&E"
1793 |
1794 | ### ichbins ###
1795 |
1796 | [Darius Bacon’s ichbins] [ichbins]
1797 | is an inspiring small Lisp compiler;
1798 | it can compile itself to C
1799 | with full run-time type-checking,
1800 | even though
1801 | it’s only a bit over six pages of code.
1802 | Its recursive-descent parser
1803 | is a model of clarity,
1804 | as recursive-descent parsers go:
1805 |
1806 | # in the parser in ichbins.scm:
1807 | (define (read)
1808 | (read-dispatch (skip-blanks (read-char))))
1809 |
1810 | (define (skip-blanks c)
1811 | (cond ((memq? c whitespace-chars) (skip-blanks (read-char)))
1812 | ('t c)))
1813 |
1814 | (define whitespace-chars (cons linefeed " "))
1815 | (define non-symbol-chars "\"\\(')")
1816 |
1817 | (define eof-object '("eof"))
1818 |
1819 | (define (read-dispatch c)
1820 | (cond ((eq? c 'f) eof-object)
1821 | ((eq? c \\) (read-char-literal (read-char)))
1822 | ((eq? c \") (read-string (read-char)))
1823 | ((eq? c \() (read-list))
1824 | ((eq? c \') (cons 'quote (cons (read) '())))
1825 | ((eq? c \)) (error "Unbalanced parentheses"))
1826 | ('t (intern (cons c (read-symbol (peek-char)))))))
1827 |
1828 | (define (read-char-literal c)
1829 | (cond ((eq? c 'f) (error "EOF in character literal"))
1830 | ('t c)))
1831 |
1832 | (define (read-string c)
1833 | (cond ((eq? c 'f) (error "Unterminated string literal"))
1834 | ((eq? c \") '())
1835 | ((eq? c \\) (cons (read-char) (read-string (read-char))))
1836 | ('t (cons c (read-string (read-char))))))
1837 |
1838 | (define (read-symbol c)
1839 | (cond ((memq? c whitespace-chars) '())
1840 | ((memq? c non-symbol-chars) '())
1841 | ('t (read-char) (cons c (read-symbol (peek-char))))))
1842 |
1843 | (define (read-list)
1844 | (read-list-dispatch (skip-blanks (read-char))))
1845 |
1846 | (define (read-list-dispatch c)
1847 | (cond ((eq? c 'f) (error "Unterminated list"))
1848 | ((eq? c \)) '())
1849 | ('t (cons (read-dispatch c) (read-list)))))
1850 |
1851 | But with a language suited for parsing,
1852 | we can do better.
1853 | Here’s a PEG simply describing the same grammar as the above:
1854 |
1855 | # in ichbins.peg:
1856 | whitespace <- '\n' / ' ' / '\t'.
1857 | _ <- whitespace _ / .
1858 | non-symbol <- '"' / '\\' / '(' / '\'' / ')'.
1859 | sentence <- _ sexp.
1860 | sexp <- '\\' char / '"' string / '(' list / '\'' read / symbol.
1861 | string <- '"' / (!'\\' char / '\\' char) string.
1862 | symbol <- !whitespace !non-symbol char / .
1863 | list <- ')' / read list.
1864 |
1865 | Instead of 33 lines of code, we have 8.
1866 | Note that I’ve followed the kind of weird structure
1867 | of the original parser:
1868 | the closing parenthesis is considered part of the list contents,
1869 | and the closing quote is considered part of the string contents.
1870 | This simplifies the grammar slightly,
1871 | and eliminates nearly all non-tail calls
1872 | (except inside of `list`, and to `_`, and in distinguishing character categories)
1873 | but I think it makes it a little less clear.
1874 |
1875 | In 16 lines,
1876 | we can get a real parser
1877 | that returns a parse of the code,
1878 | in this case as a JSON string:
1879 |
1880 | # in ichbins-parser.peg:
1881 | sentence <- _ s: sexp -> (JSON.stringify(s, null, 4)).
1882 |
1883 | sexp <- '('_ list
1884 | / '"' string
1885 | / s: symbol -> ({symbol: s})
1886 | / '\''_ s: sexp -> ([{symbol: 'quote'}, s])
1887 | / '\\' c: char _ -> ({char: c}).
1888 |
1889 | list <- ')'_ -> ([])
1890 | / a: sexp b: list -> ([a].concat(b)).
1891 | string <- '"'_ -> ('')
1892 | / a: (!'\\' char / '\\' b: char -> ('\\' + b))
1893 | t: string -> (a + t).
1894 | symbol <- a: symchar b: symtail -> (a + b).
1895 | symtail <- symbol / _ -> ('').
1896 |
1897 | _ <- whitespace _ / .
1898 | whitespace <- '\n' / ' ' / '\t'.
1899 | symchar <- !( whitespace /'"' / '\\' / '(' / '\'' / ')' ) char.
1900 |
1901 |
1902 | [ichbins]: http://www.accesscom.com/~darius/???XXX "ichbins: I can hardly believe it’s not Scheme"
1903 |
1904 | Thanks
1905 | ------
1906 |
1907 | Thanks to D. Val Schorre for inventing META-II,
1908 | of which this is a refinement,
1909 | in 1964 or a bit before;
1910 | to Bob M. McClure for inventing [TMG](http://www.multicians.org/tmg.html),
1911 | the TransMoGrifier,
1912 | also in 1964,
1913 | and to Doug McIlroy for maintaining it afterwards,
1914 | which not only carried META-II forward,
1915 | but also
1916 | [helped Thompson write B](http://plan9.bell-labs.com/who/dmr/chist.html)
1917 | which became C;
1918 | to Romuald Ireneus 'Scibor-Marchocki, who
1919 | [apparently ported TMG to
1920 | TMGL](http://www.geocities.com/ResearchTriangle/2363/tmg011.html);
1921 | to Bryan Ford for resurrecting TMG’s parsing schema
1922 | and enhancing it into the form of parsing expression grammars,
1923 | in 2002;
1924 | to Alan Kay for bringing META-II back to public attention;
1925 | to Alessandro Warth and Yoshiki Ohshima for developing OMeta
1926 | and showing that PEGs can be extended
1927 | to a wide variety of non-parsing tasks.
1928 |
1929 | To [Aristotle Pagaltzis](http://plasmasturm.org/)
1930 | for innumerable improvements
1931 | to the readability and correctness
1932 | of this document.
1933 |
1934 | To Andy Isaacson,
1935 | Allan Schiffman,
1936 | and [Chris Hibbert](http://pancrit.org/)
1937 | for further suggestions
1938 | for the readability and content of this document.
1939 |
1940 |
1941 |
1942 |
1945 |
1955 |
--------------------------------------------------------------------------------