├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── arithGrammar_Worklist.ml
├── arithGrammar_WorklistWithLookahead.ml
├── arithTags.ml
├── arithTokenizer.ml
├── lookahead.ml
├── pwZ_Worklist.ml
├── pwZ_WorklistWithLookahead.ml
├── pwZ_WorklistWithLookahead_Help.ml
├── pwZ_Worklist_Help.ml
└── worklist.ml


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.annot
 2 | *.cmo
 3 | *.cma
 4 | *.cmi
 5 | *.a
 6 | *.o
 7 | *.cmx
 8 | *.cmxs
 9 | *.cmxa
10 | 
11 | # ocamlbuild working directory
12 | _build/
13 | 
14 | # ocamlbuild targets
15 | *.byte
16 | *.native
17 | 
18 | # oasis generated files
19 | setup.data
20 | setup.log
21 | 
22 | # Merlin configuring file for Vim and Emacs
23 | .merlin
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2018, U Combinator
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | OCAMLC := ocamlc
 2 | 
 3 | WORKLIST_EXEC := worklist
 4 | LOOKAHEAD_EXEC := lookahead
 5 | EXECS := $(WORKLIST_EXEC) $(LOOKAHEAD_EXEC)
 6 | 
 7 | WORKLIST_LIB := worklist.cma
 8 | LOOKAHEAD_LIB := lookahead.cma
 9 | ARITH_LIB := arith.cma
10 | 
11 | .PHONY: all
12 | all: $(WORKLIST_EXEC) $(LOOKAHEAD_EXEC)
13 | 
14 | $(WORKLIST_EXEC): $(WORKLIST_LIB) worklist.ml
15 | 	$(OCAMLC) $^ -o $@
16 | 
17 | $(WORKLIST_LIB): $(ARITH_LIB) pwZ_Worklist.cmo pwZ_Worklist_Help.cmo arithGrammar_Worklist.cmo
18 | 	$(OCAMLC) -a $^ -o $@
19 | 
20 | arithGrammar_Worklist.cmo: $(ARITH_LIB) pwZ_Worklist.cmo arithGrammar_Worklist.ml
21 | 	$(OCAMLC) -c $^ -o $@
22 | 
23 | 
24 | $(LOOKAHEAD_EXEC): $(LOOKAHEAD_LIB) lookahead.ml
25 | 	$(OCAMLC) $^ -o $@
26 | 
27 | $(LOOKAHEAD_LIB): $(ARITH_LIB) pwZ_WorklistWithLookahead.cmo pwZ_WorklistWithLookahead_Help.cmo arithGrammar_WorklistWithLookahead.cmo
28 | 	$(OCAMLC) -a $^ -o $@
29 | 
30 | arithGrammar_WorklistWithLookahead.cmo: $(ARITH_LIB) pwZ_WorklistWithLookahead.cmo arithGrammar_WorklistWithLookahead.ml
31 | 	$(OCAMLC) -c $^ -o $@
32 | 
33 | 
34 | $(ARITH_LIB): arithTags.cmo arithTokenizer.ml
35 | 	$(OCAMLC) -a $^ -o $@
36 | 
37 | %.cmo: %.ml
38 | 	$(OCAMLC) -c $^
39 | 
40 | .PHONY: clean-all
41 | clean-all: clean
42 | 	$(RM) $(EXECS)
43 | 
44 | .PHONY: clean
45 | clean:
46 | 	$(RM) *.cmi *.cmo *.cma *.cmxa
47 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Parsing with Zippers
 2 | 
 3 | Parsing with Zippers (PwZ) is an extension of Parsing with Derivatives (Might
 4 | et al. 2011) using McBride derivatives (McBride 2001) and zippers (Huet 1997).
 5 | 
 6 | This repository is an implementation of two extensions to the
 7 | algorithm presented in our paper, namely:
 8 | 
 9 | 1. A global worklist
10 | 2. One-token downward lookahead
11 | 
12 | ## Building
13 | 
14 | The code is written in OCaml and was tested with OCaml 4.06.1.
15 | 
16 | There are two executables that can be built: `worklist` and `lookahead` which
17 | correspond to the algorithms implemented in `pwZ_Worklist.ml` and
18 | `pwZ_WorklistWithLookahead.ml`, respectively.
19 | 
20 | To build both, simply run `make`. They can also be built independently using
21 | `make worklist` or `make lookahead`. (Note that `lookahead` is simply a short
22 | name for the algorith which uses both a worklist and lookahead.)
23 | 
24 | To clean, run `make clean` or (if you wish to remove the executables)
25 | `make clean-all`.
26 | 
27 | ## Running
28 | 
29 | Once the executables are built, they can be run from the command line. They
30 | each expect a single string representing a sequence of tokens for a simple
31 | arithmetic grammar (detailed below). For example:
32 | 
33 | ```
34 | $ ./lookahead "6 * 9"
35 | ```
36 | 
37 | The program will print the resulting parse tree if the parse is successful or
38 | a blank line if there was not a successful parse.
39 | 
40 | ## Arithmetic Grammar
41 | 
42 | The included grammar is very straightforward. It is:
43 | 
44 | ```
45 | NUM     ::= <INT>
46 | PAREN   ::= '(' EXPR ')'
47 | ATOM    ::= NUM
48 |           | PAREN
49 | 
50 | MULT    ::= TERM '*' TERM
51 | T_ATOM  ::= ATOM
52 | TERM    ::= MULT
53 |           | T_ATOM
54 | 
55 | E_TERM  ::= TERM
56 | ADD     ::= EXPR '+' EXPR
57 | EXPR    ::= E_TERM
58 |           | ADD
59 | ```
60 | 
61 | (`<INT>` represents any positive integer (a terminal), and items in single
62 | quotes are terminals.)
63 | 
64 | Some examples of valid expressions:
65 | 
66 | - 42
67 | - 1 + 3
68 | - 4 * 3 + 2
69 | - (3 + 4) * 7
70 | 
71 | This grammar is written in two files: `arithGrammar_Worklist.ml` and
72 | `arithGrammar_WorklistWithLookahead.ml`. The latter file is identical to the
73 | first except that each expression is also given a `first` array, where each
74 | element corresponds to a token tag in the grammar and determines whether that
75 | token is in the FIRST set of the respective expression.
76 | 
77 | Note that token tag 0 is the EOF, which is reserved for use by the parse
78 | function that wraps the algorithm. It should not be used by user-implemented
79 | tokens.
80 | 
81 | The token tags are defined in `arithTags.ml`.
82 | 


--------------------------------------------------------------------------------
/arithGrammar_Worklist.ml:
--------------------------------------------------------------------------------
 1 | open PwZ_Worklist
 2 | open ArithTags
 3 | 
 4 | (* This format is the construction of our grammar.
 5 |  *
 6 |  * We have a Racket program that can generate these types of files, but this
 7 |  * example was written by hand. We would not recommend handwriting more complex
 8 |  * grammars, so we assume a "full" implementation of PwZ would utilize a similar
 9 |  * program to pre-generate these grammars. *)
10 | 
11 | let rec g_INT       = { m = m_0; e = T ("INT", t_INT) }
12 | and g_OPEN_PAREN    = { m = m_0; e = T ("(", t_OPEN_PAREN) }
13 | and g_CLOSE_PAREN   = { m = m_0; e = T (")", t_CLOSE_PAREN) }
14 | and g_TIMES         = { m = m_0; e = T ("*", t_TIMES) }
15 | and g_PLUS          = { m = m_0; e = T ("+", t_PLUS) }
16 | 
17 | and g_NUM           = { m = m_0; e = Seq ("NUM", [ g_INT ]) }
18 | and g_PAREN         = { m = m_0; e = Seq ("PAREN", [ g_OPEN_PAREN; g_EXPR; g_CLOSE_PAREN ]) }
19 | and g_ATOM          = { m = m_0; e = Alt (ref [ g_NUM; g_PAREN ]) }
20 | 
21 | and g_MULT          = { m = m_0; e = Seq ("MULT", [ g_TERM; g_TIMES; g_TERM ]) }
22 | and g_T_ATOM        = { m = m_0; e = Seq ("T_ATOM", [ g_ATOM ]) }
23 | and g_TERM          = { m = m_0; e = Alt (ref [ g_MULT; g_T_ATOM ]) }
24 | 
25 | and g_E_TERM        = { m = m_0; e = Seq ("E_TERM", [ g_TERM ]) }
26 | and g_ADD           = { m = m_0; e = Seq ("ADD", [ g_EXPR; g_PLUS; g_EXPR ]) }
27 | and g_EXPR          = { m = m_0; e = Alt (ref [ g_E_TERM; g_ADD ]) }
28 | 
29 | 


--------------------------------------------------------------------------------
/arithGrammar_WorklistWithLookahead.ml:
--------------------------------------------------------------------------------
 1 | open PwZ_WorklistWithLookahead
 2 | open ArithTags
 3 | 
 4 | (* This format is the construction of our grammar.
 5 |  *
 6 |  * We have a Racket program that can generate these types of files, but this
 7 |  * example was written by hand. We would not recommend handwriting more complex
 8 |  * grammars, so we assume a "full" implementation of PwZ would utilize a similar
 9 |  * program to pre-generate these grammars. *)
10 | 
11 | let rec g_INT       = { m = m_0; e = T ("INT",  t_INT);                                         first = [| false;  true; false; false; false; false |] }
12 | and g_OPEN_PAREN    = { m = m_0; e = T ("(",    t_OPEN_PAREN);                                  first = [| false; false;  true; false; false; false |] }
13 | and g_CLOSE_PAREN   = { m = m_0; e = T (")",    t_CLOSE_PAREN);                                 first = [| false; false; false;  true; false; false |] }
14 | and g_TIMES         = { m = m_0; e = T ("*",    t_TIMES);                                       first = [| false; false; false; false;  true; false |] }
15 | and g_PLUS          = { m = m_0; e = T ("+",    t_PLUS);                                        first = [| false; false; false; false; false;  true |] }
16 | 
17 | and g_NUM           = { m = m_0; e = Seq ("NUM", [ g_INT ]);                                    first = [| false;  true; false; false; false; false |] }
18 | and g_PAREN         = { m = m_0; e = Seq ("PAREN", [ g_OPEN_PAREN; g_EXPR; g_CLOSE_PAREN ]);    first = [| false; false;  true; false; false; false |] }
19 | and g_ATOM          = { m = m_0; e = Alt (ref [ g_NUM; g_PAREN ]);                              first = [| false;  true;  true; false; false; false |] }
20 | 
21 | and g_MULT          = { m = m_0; e = Seq ("MULT", [ g_TERM; g_TIMES; g_TERM ]);                 first = [| false;  true;  true; false; false; false |] }
22 | and g_T_ATOM        = { m = m_0; e = Seq ("T_ATOM", [ g_ATOM ]);                                first = [| false;  true;  true; false; false; false |] }
23 | and g_TERM          = { m = m_0; e = Alt (ref [ g_MULT; g_T_ATOM ]);                            first = [| false;  true;  true; false; false; false |] }
24 | 
25 | and g_E_TERM        = { m = m_0; e = Seq ("E_TERM", [ g_TERM ]);                                first = [| false;  true;  true; false; false; false |] }
26 | and g_ADD           = { m = m_0; e = Seq ("ADD", [ g_EXPR; g_PLUS; g_EXPR ]);                   first = [| false;  true;  true; false; false; false |] }
27 | and g_EXPR          = { m = m_0; e = Alt (ref [ g_E_TERM; g_ADD ]);                             first = [| false;  true;  true; false; false; false |] }
28 | 
29 | 


--------------------------------------------------------------------------------
/arithTags.ml:
--------------------------------------------------------------------------------
1 | (* These are the token tags. 0 is reserved for the EOF in the parser. *)
2 | 
3 | let t_INT = 1
4 | let t_OPEN_PAREN = 2
5 | let t_CLOSE_PAREN = 3
6 | let t_TIMES = 4
7 | let t_PLUS = 5
8 | 


--------------------------------------------------------------------------------
/arithTokenizer.ml:
--------------------------------------------------------------------------------
 1 | open ArithTags
 2 | 
 3 | exception FailedMatch
 4 | 
 5 | type token = string * int
 6 | 
 7 | let getInt (s : string) : token = (s, t_INT)
 8 | 
 9 | (* We only support the simple tokens for this arithmetic grammar. *)
10 | let getToken (c : char) : token =
11 |   match c with
12 |   | '(' -> ("(", t_OPEN_PAREN)
13 |   | ')' -> (")", t_CLOSE_PAREN)
14 |   | '*' -> ("*", t_TIMES)
15 |   | '+' -> ("+", t_PLUS)
16 |   | _   -> raise FailedMatch
17 | 
18 | (* The integer characters are ASCII values 48 through 57 (inclusive). *)
19 | let isInt (c : char) : bool =
20 |   Char.code c >= 48 && Char.code c <= 57
21 | 
22 | (* Simple tokenizer. Iterates through each character and generates a token if it
23 |  * can. The only complex tokens are integers because they can be made up
24 |  * multiple characters. These are stored into an accumulator list `acc` which is
25 |  * tokenized whenever a non-integer character is encountered. *)
26 | let tokenize (str : string) : token list =
27 |   let acc = ref "" in
28 | 
29 |   let append s n =
30 |     s := !s ^ (String.make 1 n)
31 |   in
32 |   let rest s =
33 |     String.sub s 1 ((String.length s) - 1)
34 |   in
35 |   let accumulate s xs =
36 |     if String.length s > 0
37 |     then (acc := "";
38 |           (getInt s) :: xs)
39 |     else xs
40 |   in
41 | 
42 |   let rec iter s xs =
43 |     match s with
44 |     | "" -> List.rev (accumulate !acc xs)
45 |     | _ ->
46 |         (match s.[0] with
47 |          | ' ' ->
48 |              iter (rest s) (accumulate !acc xs)
49 |          | c when isInt c ->
50 |              (append acc c;
51 |               iter (rest s) xs)
52 |          | c ->
53 |              iter (rest s) ((getToken c) :: (accumulate !acc xs))
54 |         )
55 |   in iter str []
56 | 


--------------------------------------------------------------------------------
/lookahead.ml:
--------------------------------------------------------------------------------
 1 | open PwZ_WorklistWithLookahead
 2 | open PwZ_WorklistWithLookahead_Help
 3 | open ArithGrammar_WorklistWithLookahead
 4 | open ArithTokenizer
 5 | 
 6 | exception Bad_Arg_Count of int
 7 | 
 8 | let parse (s : string) : exp list =
 9 |   parse (tokenize s) g_EXPR
10 | 
11 | let string_list_of_exp_list (es : exp list) : string list =
12 |   List.map string_of_exp es
13 | 
14 | let print_string_list (ss : string list) : unit =
15 |   List.iter print_string (ss @ ["\n"])
16 | 
17 | let () =
18 |   if Array.length Sys.argv != 2
19 |   then raise (Bad_Arg_Count (Array.length Sys.argv))
20 |   else print_string_list (string_list_of_exp_list (parse Sys.argv.(1)))
21 | 


--------------------------------------------------------------------------------
/pwZ_Worklist.ml:
--------------------------------------------------------------------------------
  1 | (* Simple type aliases.
  2 |  * Fig 1. *)
  3 | type lab = string       (* token and sequence labels *)
  4 | type tag = int          (* token tag, used for token comparison *)
  5 | type pos = int          (* token position in input *)
  6 | type tok = lab * tag    (* token *)
  7 | 
  8 | (* An exception for when a match fails. Should never appear.
  9 |  * This is primarily to suppress warnings in a safe manner. *)
 10 | exception FailedMatch
 11 | 
 12 | (* Additional types necessary for using zippers without memoization tables.
 13 |  * Fig 19.
 14 |  * The implementation of m_0 required the definition of a new `undefined` value,
 15 |  * which is given here. It is essentially a placeholder to be discarded. *)
 16 | type exp = { mutable m : mem; e : exp' }
 17 | 
 18 | and exp' = T of tok
 19 |          | Seq of lab * exp list
 20 |          | Alt of (exp list) ref
 21 | 
 22 | and cxt  = Top
 23 |          | SeqC of mem * lab * exp list * exp list
 24 |          | AltC of mem
 25 | 
 26 | and mem  = {
 27 |   start : pos;
 28 |   mutable parents : cxt list;
 29 |   mutable end_ : pos;
 30 |   mutable result : exp }
 31 | 
 32 | type zipper = exp' * mem
 33 | 
 34 | let rec undefined = {
 35 |   m = m_undefined;
 36 |   e = T ("undefined", -1) }
 37 | 
 38 | and m_undefined = {
 39 |   start = -1;
 40 |   parents = [];
 41 |   end_ = -1;
 42 |   result = undefined }
 43 | 
 44 | let m_0 = {
 45 |   start = -1;
 46 |   parents = [];
 47 |   end_ = -1;
 48 |   result = undefined }
 49 | 
 50 | (* A global worklist. This is used for keeping track of what to do next. *)
 51 | let worklist : (zipper list) ref = ref []
 52 | 
 53 | (* A list of "tops", which gives us parse-null of a Top for free. This is useful
 54 |  * so that in the end we can simply return the result. *)
 55 | let tops : exp list ref = ref []
 56 | 
 57 | (* Core algorithm. Similar to Fig 20, but with additional steps taken for
 58 |  * performance. Note that the return type is now `unit`. *)
 59 | let derive (p : pos) ((t, i) : tok) ((e, m) : zipper) : unit = 
 60 | 
 61 |   let rec d_d (c : cxt) (e : exp) : unit =
 62 |     if p == e.m.start
 63 |     then (e.m.parents <- c :: e.m.parents;
 64 |           if p == e.m.end_
 65 |           then d_u' e.m.result c
 66 |           else ())
 67 |     else (let m = { start = p; parents = [c]; end_ = -1; result = undefined } in
 68 |           e.m <- m;
 69 |           d_d' m e.e)
 70 | 
 71 |   and d_d' (m : mem) (e : exp') : unit =
 72 |     match e with
 73 |     | T (t', i') ->
 74 |           if i == i'
 75 |             then worklist := (Seq (t, []), m) :: !worklist
 76 |             else ()
 77 |     | Seq (l, [])                       -> d_u (Seq (l, [])) m
 78 |     | Seq (l, e :: es)                  -> d_d (SeqC (m, l, [], es)) e
 79 |     | Alt es                            -> List.iter
 80 |                                              (fun e -> d_d (AltC m) e)
 81 |                                              !es
 82 | 
 83 |   and d_u (e : exp') (m : mem) : unit =
 84 |     let e' = { m = m_0; e = e } in
 85 |     m.end_ <- p;
 86 |     m.result <- e';
 87 |     List.iter (fun c -> d_u' e' c) m.parents
 88 | 
 89 |   and d_u' (e : exp) (c : cxt) : unit =
 90 |     match c with
 91 |     | Top                               -> tops := e :: !tops
 92 |     | SeqC (m, l, es, [])               -> d_u (Seq (l, List.rev (e :: es))) m
 93 |     | SeqC (m, l, left, e' :: right)    -> d_d (SeqC (m, l, e :: left, right)) e'
 94 |     | AltC m                            -> if p == m.end_
 95 |                                            then match m.result.e with
 96 |                                              | Alt es -> es := e :: !es
 97 |                                              | _ -> raise FailedMatch
 98 |                                            else d_u (Alt (ref [e])) m
 99 | 
100 |   in d_u e m
101 | 
102 | (* Here we construct the initial zipper. This allows us to properly traverse the
103 |  * grammar from the first step. This construction is similar in spirit to the
104 |  * Seq/SeqC pair used on l318 (near the end of Section 4) of the paper. *)
105 | let init_zipper (e : exp) : zipper =
106 |   let e' = Seq ("<init_zipper:Seq>", []) in
107 |   let m_top : mem = { start = 0; parents = [Top]; end_ = -1; result = undefined } in
108 |   let c = SeqC (m_top, "<init_zipper:SeqC>", [], [e]) in
109 |   let m_seq : mem = { start = 0; parents = [c]; end_ = -1; result = undefined } in
110 |   (e', m_seq)
111 | 
112 | (* When a result is produced, it will have some vestigial structure remaining
113 |  * from the initial zipper (see above). This function removes those extra bits
114 |  * so only the important stuff is returned once the parse is complete. *)
115 | let unwrap_top_exp (e : exp) : exp =
116 |   match e.e with
117 |   | Seq (_, [_; e'])    -> e'
118 |   | _                   -> raise FailedMatch
119 | 
120 | (* This is our wrapper/driver function. It initializes blank worklist and tops
121 |  * lists for each element in the worklist. This allows for a generational style
122 |  * of worklist (where "child processes" can each have their own worklist).
123 |  *
124 |  * The token tag 0 is assumed to be reserved for the end of the input. *)
125 | let parse (ts : tok list) (e : exp) : exp list =
126 |   let rec parse (p : pos) (ts : tok list) : exp list =
127 |     (let w = !worklist in
128 |      worklist := [];
129 |      tops := [];
130 |      match ts with
131 |      | [] -> List.iter (fun z -> derive p ("EOF", 0) z) w;
132 |              List.map unwrap_top_exp !tops
133 |      | ((t, s) :: ts') ->
134 |              List.iter (fun z -> derive p (t, s) z) w;
135 |              parse (p + 1) ts')
136 |   in worklist := [init_zipper e];
137 |      parse 0 ts
138 | 


--------------------------------------------------------------------------------
/pwZ_WorklistWithLookahead.ml:
--------------------------------------------------------------------------------
  1 | (* Simple type aliases.
  2 |  * Fig 1. *)
  3 | type lab = string       (* token and sequence labels *)
  4 | type tag = int          (* token tag, used for lookahead and token comparison *)
  5 | type pos = int          (* token position in input *)
  6 | type tok = lab * tag    (* token *)
  7 | 
  8 | (* An exception when a match fails. Should never appear.
  9 |  * This is primarily to suppress warnings in a safe manner. *)
 10 | exception FailedMatch
 11 | 
 12 | (* Additional types necessary for using zippers without memoization tables.
 13 |  * Fig 19.
 14 |  * The implementation of m_0 required the definition of a new `undefined` value,
 15 |  * which is given here. It is essentially a placeholder to be discarded. *)
 16 | type exp = { mutable m : mem; e : exp'; first : bool array }
 17 | 
 18 | and exp' = T of tok
 19 |          | Seq of lab * exp list
 20 |          | Alt of (exp list) ref
 21 | 
 22 | and cxt  = Top
 23 |          | SeqC of mem * lab * exp list * exp list
 24 |          | AltC of mem
 25 | 
 26 | and mem  = {
 27 |   start : pos;
 28 |   mutable parents : cxt list;
 29 |   mutable end_ : pos;
 30 |   mutable result : exp }
 31 | 
 32 | type zipper = exp' * mem
 33 | 
 34 | let rec undefined : exp = {
 35 |   m = m_undefined;
 36 |   e = T ("undefined", -1);
 37 |   first = [| |] }
 38 | 
 39 | and m_undefined : mem = {
 40 |   start = -1;
 41 |   parents = [];
 42 |   end_ = -1;
 43 |   result = undefined }
 44 | 
 45 | let m_0 : mem = {
 46 |   start = -1;
 47 |   parents = [];
 48 |   end_ = -1;
 49 |   result = undefined }
 50 | 
 51 | (* A global worklist. This is used for keeping track of what to do next. *)
 52 | let worklist : (zipper list) ref = ref []
 53 | 
 54 | (* A list of "tops", which gives us parse-null of a Top for free. This is useful
 55 |  * so that in the end we can simply return the result. *)
 56 | let tops : exp list ref = ref []
 57 | 
 58 | (* Core algorithm. Similar to Fig 20, but with additional steps taken for
 59 |  * performance. Note that the return type is now `unit`. *)
 60 | let derive (p : pos) ((t, i) : tok) ((e, m) : zipper) : unit =
 61 | 
 62 |   let rec d_d (c : cxt) (e : exp) : unit =
 63 |     if p == e.m.start
 64 |     then (e.m.parents <- c :: e.m.parents;
 65 |           if p == e.m.end_
 66 |           then d_u' e.m.result c
 67 |           else ())
 68 |     else (let m = { start = p; parents = [c]; end_ = -1; result = undefined } in
 69 |           e.m <- m;
 70 |           d_d' m e.e)
 71 | 
 72 |   and d_d' (m : mem) (e : exp') : unit =
 73 |     match e with
 74 |     | T (t', i') ->
 75 |           if i == i'
 76 |             then worklist := (Seq (t, []), m) :: !worklist
 77 |             else ()
 78 |     | Seq (l, [])                       -> d_u (Seq (l, [])) m
 79 |     | Seq (l, e :: es)                  -> d_d (SeqC (m, l, [], es)) e
 80 |     | Alt es                            -> List.iter
 81 |                                              (fun e ->
 82 |                                                if e.first.(i)
 83 |                                                then d_d (AltC m) e
 84 |                                                else ())
 85 |                                              !es
 86 | 
 87 |   and d_u (e : exp') (m : mem) : unit =
 88 |     let e' = { m = m_0; e = e; first = [| |] } in
 89 |     m.end_ <- p;
 90 |     m.result <- e';
 91 |     List.iter (fun c -> d_u' e' c) m.parents
 92 | 
 93 |   and d_u' (e : exp) (c : cxt) : unit =
 94 |     match c with
 95 |     | Top                               -> tops := e :: !tops
 96 |     | SeqC (m, l, es, [])               -> d_u (Seq (l, List.rev (e :: es))) m
 97 |     | SeqC (m, l, left, e' :: right)    -> d_d (SeqC (m, l, e :: left, right)) e'
 98 |     | AltC m                            -> if p == m.end_
 99 |                                            then match m.result.e with
100 |                                              | Alt es -> es := e :: !es
101 |                                              | _ -> raise FailedMatch
102 |                                            else d_u (Alt (ref [e])) m
103 | 
104 |   in d_u e m
105 | 
106 | (* Here we construct the initial zipper. This allows us to properly traverse the
107 |  * grammar from the first step. This construction is similar in spirit to the
108 |  * Seq/SeqC pair used on l318 (near the end of Section 4) of the paper. *)
109 | let init_zipper (e : exp) : zipper =
110 |   let e' = Seq ("<init_zipper:Seq>", []) in
111 |   let m_top : mem = { start = 0; parents = [Top]; end_ = -1; result = undefined } in
112 |   let c = SeqC (m_top, "<init_zipper:SeqC>", [], [e]) in
113 |   let m_seq : mem = { start = 0; parents = [c]; end_ = -1; result = undefined } in
114 |   (e', m_seq)
115 | 
116 | (* When a result is produced, it will have some vestigial structure remaining
117 |  * from the initial zipper (see above). This function removes those extra bits
118 |  * so only the important stuff is returned once the parse is complete. *)
119 | let unwrap_top_exp (e : exp) : exp =
120 |   match e.e with
121 |   | Seq (_, [_; e'])    -> e'
122 |   | _                   -> raise FailedMatch
123 | 
124 | (* This is our wrapper/driver function. It initializes blank worklist and tops
125 |  * lists for each element in the worklist. This allows for a generational style
126 |  * of worklist (where "child processes" can each have their own worklist).
127 |  *
128 |  * The token tag 0 is assumed to be reserved for the end of the input. *)
129 | let parse (ts : tok list) (e : exp) : exp list =
130 |   let rec parse (p : pos) (ts : tok list) : exp list =
131 |     (let w = !worklist in
132 |      worklist := [];
133 |      tops := [];
134 |      match ts with
135 |      | [] -> List.iter (fun z -> derive p ("EOF", 0) z) w;
136 |              List.map unwrap_top_exp !tops
137 |      | ((t, s) :: ts') ->
138 |              List.iter (fun z -> derive p (t, s) z) w;
139 |              parse (p + 1) ts')
140 |   in worklist := [init_zipper e];
141 |      parse 0 ts
142 | 


--------------------------------------------------------------------------------
/pwZ_WorklistWithLookahead_Help.ml:
--------------------------------------------------------------------------------
 1 | open PwZ_WorklistWithLookahead
 2 | 
 3 | let concat_map (f : 'a -> 'b list) (l : 'a list) : 'b list =
 4 |   List.concat (List.map f l)
 5 | 
 6 | let plug (p : pos) ((e, c) : zipper) : exp list =
 7 |   let rec up (e : exp') (m : mem) : exp list =
 8 |     let e' = { m = m_0; e = e; first = [| |] } in
 9 |     m.end_ <- p;
10 |     m.result <- e';
11 |     concat_map (up' e') m.parents
12 |   and up' (e : exp) (c : cxt) : exp list =
13 |     match c with
14 |     | Top -> [e]
15 |     | SeqC (m, l, left, right) ->
16 |         up (Seq (l, List.append (List.rev left) (e :: right))) m
17 |     | AltC m ->
18 |         if p == m.end_
19 |         then match m.result.e with
20 |              | Alt es -> es := e :: !es; []
21 |              | _ -> raise FailedMatch
22 |         else up (Alt (ref [e])) m
23 |   in up e c
24 | 
25 | let string_of_exp (e : exp) : string =
26 |   let rec make_nice_string (e' : exp') (c : int) : string =
27 |     let indent (i : int) (ss : string list) : string list =
28 |       List.map (fun s -> (String.make i ' ') ^ s) ss
29 |     in
30 |     let join (ss : string list) : string =
31 |       String.concat "\n" ss
32 |     in
33 |     let indent_subexp_strings (es : exp list) (ind : int) : string =
34 |       match es with
35 |       | [] -> ""
36 |       | _ -> "\n" ^ (join (indent ind (List.map (fun e -> make_nice_string e ind) (List.map (fun e -> e.e) es))))
37 |     in
38 | 
39 |     match e' with
40 |     | T (l, t) ->
41 |         "(T " ^ l ^ ")"
42 |     | Seq (l, []) ->
43 |         "(Seq " ^ l ^ ")"
44 |     | Seq (l, e :: es) ->
45 |         (let leader = "(Seq " ^ l ^ " " in
46 |          let ind = c + String.length leader in
47 |          leader ^ (make_nice_string e.e ind) ^ (indent_subexp_strings es ind) ^ ")"
48 |         )
49 |     | Alt (res) ->
50 |         match !res with
51 |         | [] ->
52 |             "(Alt)"
53 |         | e :: es ->
54 |             (let leader = "(Alt " in
55 |              let ind = c + String.length leader in
56 |              leader ^ (make_nice_string e.e ind) ^ (indent_subexp_strings es ind) ^ ")"
57 |             )
58 |   in make_nice_string e.e 0
59 | 


--------------------------------------------------------------------------------
/pwZ_Worklist_Help.ml:
--------------------------------------------------------------------------------
 1 | open PwZ_Worklist
 2 | 
 3 | let concat_map (f : 'a -> 'b list) (l : 'a list) : 'b list =
 4 |   List.concat (List.map f l)
 5 | 
 6 | let plug (p : pos) ((e, c) : zipper) : exp list =
 7 |   let rec up (e : exp') (m : mem) : exp list =
 8 |     let e' = { m = m_0; e = e } in
 9 |     m.end_ <- p;
10 |     m.result <- e';
11 |     concat_map (up' e') m.parents
12 |   and up' (e : exp) (c : cxt) : exp list =
13 |     match c with
14 |     | Top -> [e]
15 |     | SeqC (m, l, left, right) ->
16 |         up (Seq (l, List.append (List.rev left) (e :: right))) m
17 |     | AltC m ->
18 |         if p == m.end_
19 |         then match m.result.e with
20 |              | Alt es -> es := e :: !es; []
21 |              | _ -> raise FailedMatch
22 |         else up (Alt (ref [e])) m
23 |   in up e c
24 | 
25 | let string_of_exp (e : exp) : string =
26 |   let rec make_nice_string (e' : exp') (c : int) : string =
27 |     let indent (i : int) (ss : string list) : string list =
28 |       List.map (fun s -> (String.make i ' ') ^ s) ss
29 |     in
30 |     let join (ss : string list) : string =
31 |       String.concat "\n" ss
32 |     in
33 |     let indent_subexp_strings (es : exp list) (ind : int) : string =
34 |       match es with
35 |       | [] -> ""
36 |       | _ -> "\n" ^ (join (indent ind (List.map (fun e -> make_nice_string e ind) (List.map (fun e -> e.e) es))))
37 |     in
38 | 
39 |     match e' with
40 |     | T (l, t) ->
41 |         "(T " ^ l ^ ")"
42 |     | Seq (l, []) ->
43 |         "(Seq " ^ l ^ ")"
44 |     | Seq (l, e :: es) ->
45 |         (let leader = "(Seq " ^ l ^ " " in
46 |          let ind = c + String.length leader in
47 |          leader ^ (make_nice_string e.e ind) ^ (indent_subexp_strings es ind) ^ ")"
48 |         )
49 |     | Alt (res) ->
50 |         match !res with
51 |         | [] ->
52 |             "(Alt)"
53 |         | e :: es ->
54 |             (let leader = "(Alt " in
55 |              let ind = c + String.length leader in
56 |              leader ^ (make_nice_string e.e ind) ^ (indent_subexp_strings es ind) ^ ")"
57 |             )
58 |   in make_nice_string e.e 0
59 | 


--------------------------------------------------------------------------------
/worklist.ml:
--------------------------------------------------------------------------------
 1 | open PwZ_Worklist
 2 | open PwZ_Worklist_Help
 3 | open ArithGrammar_Worklist
 4 | open ArithTokenizer
 5 | 
 6 | exception Bad_Arg_Count of int
 7 | 
 8 | let parse (s : string) : exp list =
 9 |   parse (tokenize s) g_EXPR
10 | 
11 | let string_list_of_exp_list (es : exp list) : string list =
12 |   List.map string_of_exp es
13 | 
14 | let print_string_list (ss : string list) : unit =
15 |   List.iter print_string (ss @ ["\n"])
16 | 
17 | let () =
18 |   if Array.length Sys.argv != 2
19 |   then raise (Bad_Arg_Count (Array.length Sys.argv))
20 |   else print_string_list (string_list_of_exp_list (parse Sys.argv.(1)))
21 | 


--------------------------------------------------------------------------------