├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── arithGrammar_Worklist.ml ├── arithGrammar_WorklistWithLookahead.ml ├── arithTags.ml ├── arithTokenizer.ml ├── lookahead.ml ├── pwZ_Worklist.ml ├── pwZ_WorklistWithLookahead.ml ├── pwZ_WorklistWithLookahead_Help.ml ├── pwZ_Worklist_Help.ml └── worklist.ml /.gitignore: -------------------------------------------------------------------------------- 1 | *.annot 2 | *.cmo 3 | *.cma 4 | *.cmi 5 | *.a 6 | *.o 7 | *.cmx 8 | *.cmxs 9 | *.cmxa 10 | 11 | # ocamlbuild working directory 12 | _build/ 13 | 14 | # ocamlbuild targets 15 | *.byte 16 | *.native 17 | 18 | # oasis generated files 19 | setup.data 20 | setup.log 21 | 22 | # Merlin configuring file for Vim and Emacs 23 | .merlin 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, U Combinator 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | OCAMLC := ocamlc 2 | 3 | WORKLIST_EXEC := worklist 4 | LOOKAHEAD_EXEC := lookahead 5 | EXECS := $(WORKLIST_EXEC) $(LOOKAHEAD_EXEC) 6 | 7 | WORKLIST_LIB := worklist.cma 8 | LOOKAHEAD_LIB := lookahead.cma 9 | ARITH_LIB := arith.cma 10 | 11 | .PHONY: all 12 | all: $(WORKLIST_EXEC) $(LOOKAHEAD_EXEC) 13 | 14 | $(WORKLIST_EXEC): $(WORKLIST_LIB) worklist.ml 15 | $(OCAMLC) $^ -o $@ 16 | 17 | $(WORKLIST_LIB): $(ARITH_LIB) pwZ_Worklist.cmo pwZ_Worklist_Help.cmo arithGrammar_Worklist.cmo 18 | $(OCAMLC) -a $^ -o $@ 19 | 20 | arithGrammar_Worklist.cmo: $(ARITH_LIB) pwZ_Worklist.cmo arithGrammar_Worklist.ml 21 | $(OCAMLC) -c $^ -o $@ 22 | 23 | 24 | $(LOOKAHEAD_EXEC): $(LOOKAHEAD_LIB) lookahead.ml 25 | $(OCAMLC) $^ -o $@ 26 | 27 | $(LOOKAHEAD_LIB): $(ARITH_LIB) pwZ_WorklistWithLookahead.cmo pwZ_WorklistWithLookahead_Help.cmo arithGrammar_WorklistWithLookahead.cmo 28 | $(OCAMLC) -a $^ -o $@ 29 | 30 | arithGrammar_WorklistWithLookahead.cmo: $(ARITH_LIB) pwZ_WorklistWithLookahead.cmo arithGrammar_WorklistWithLookahead.ml 31 | $(OCAMLC) -c $^ -o $@ 32 | 33 | 34 | $(ARITH_LIB): arithTags.cmo arithTokenizer.ml 35 | $(OCAMLC) -a $^ -o $@ 36 | 37 | %.cmo: %.ml 38 | $(OCAMLC) -c $^ 39 | 40 | .PHONY: clean-all 41 | clean-all: clean 42 | $(RM) $(EXECS) 43 | 44 | .PHONY: clean 45 | clean: 46 | $(RM) *.cmi *.cmo *.cma *.cmxa 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Parsing with Zippers 2 | 3 | Parsing with Zippers (PwZ) is an extension of Parsing with Derivatives (Might 4 | et al. 2011) using McBride derivatives (McBride 2001) and zippers (Huet 1997). 5 | 6 | This repository is an implementation of two extensions to the 7 | algorithm presented in our paper, namely: 8 | 9 | 1. A global worklist 10 | 2. One-token downward lookahead 11 | 12 | ## Building 13 | 14 | The code is written in OCaml and was tested with OCaml 4.06.1. 15 | 16 | There are two executables that can be built: `worklist` and `lookahead` which 17 | correspond to the algorithms implemented in `pwZ_Worklist.ml` and 18 | `pwZ_WorklistWithLookahead.ml`, respectively. 19 | 20 | To build both, simply run `make`. They can also be built independently using 21 | `make worklist` or `make lookahead`. (Note that `lookahead` is simply a short 22 | name for the algorith which uses both a worklist and lookahead.) 23 | 24 | To clean, run `make clean` or (if you wish to remove the executables) 25 | `make clean-all`. 26 | 27 | ## Running 28 | 29 | Once the executables are built, they can be run from the command line. They 30 | each expect a single string representing a sequence of tokens for a simple 31 | arithmetic grammar (detailed below). For example: 32 | 33 | ``` 34 | $ ./lookahead "6 * 9" 35 | ``` 36 | 37 | The program will print the resulting parse tree if the parse is successful or 38 | a blank line if there was not a successful parse. 39 | 40 | ## Arithmetic Grammar 41 | 42 | The included grammar is very straightforward. It is: 43 | 44 | ``` 45 | NUM ::= 46 | PAREN ::= '(' EXPR ')' 47 | ATOM ::= NUM 48 | | PAREN 49 | 50 | MULT ::= TERM '*' TERM 51 | T_ATOM ::= ATOM 52 | TERM ::= MULT 53 | | T_ATOM 54 | 55 | E_TERM ::= TERM 56 | ADD ::= EXPR '+' EXPR 57 | EXPR ::= E_TERM 58 | | ADD 59 | ``` 60 | 61 | (`` represents any positive integer (a terminal), and items in single 62 | quotes are terminals.) 63 | 64 | Some examples of valid expressions: 65 | 66 | - 42 67 | - 1 + 3 68 | - 4 * 3 + 2 69 | - (3 + 4) * 7 70 | 71 | This grammar is written in two files: `arithGrammar_Worklist.ml` and 72 | `arithGrammar_WorklistWithLookahead.ml`. The latter file is identical to the 73 | first except that each expression is also given a `first` array, where each 74 | element corresponds to a token tag in the grammar and determines whether that 75 | token is in the FIRST set of the respective expression. 76 | 77 | Note that token tag 0 is the EOF, which is reserved for use by the parse 78 | function that wraps the algorithm. It should not be used by user-implemented 79 | tokens. 80 | 81 | The token tags are defined in `arithTags.ml`. 82 | -------------------------------------------------------------------------------- /arithGrammar_Worklist.ml: -------------------------------------------------------------------------------- 1 | open PwZ_Worklist 2 | open ArithTags 3 | 4 | (* This format is the construction of our grammar. 5 | * 6 | * We have a Racket program that can generate these types of files, but this 7 | * example was written by hand. We would not recommend handwriting more complex 8 | * grammars, so we assume a "full" implementation of PwZ would utilize a similar 9 | * program to pre-generate these grammars. *) 10 | 11 | let rec g_INT = { m = m_0; e = T ("INT", t_INT) } 12 | and g_OPEN_PAREN = { m = m_0; e = T ("(", t_OPEN_PAREN) } 13 | and g_CLOSE_PAREN = { m = m_0; e = T (")", t_CLOSE_PAREN) } 14 | and g_TIMES = { m = m_0; e = T ("*", t_TIMES) } 15 | and g_PLUS = { m = m_0; e = T ("+", t_PLUS) } 16 | 17 | and g_NUM = { m = m_0; e = Seq ("NUM", [ g_INT ]) } 18 | and g_PAREN = { m = m_0; e = Seq ("PAREN", [ g_OPEN_PAREN; g_EXPR; g_CLOSE_PAREN ]) } 19 | and g_ATOM = { m = m_0; e = Alt (ref [ g_NUM; g_PAREN ]) } 20 | 21 | and g_MULT = { m = m_0; e = Seq ("MULT", [ g_TERM; g_TIMES; g_TERM ]) } 22 | and g_T_ATOM = { m = m_0; e = Seq ("T_ATOM", [ g_ATOM ]) } 23 | and g_TERM = { m = m_0; e = Alt (ref [ g_MULT; g_T_ATOM ]) } 24 | 25 | and g_E_TERM = { m = m_0; e = Seq ("E_TERM", [ g_TERM ]) } 26 | and g_ADD = { m = m_0; e = Seq ("ADD", [ g_EXPR; g_PLUS; g_EXPR ]) } 27 | and g_EXPR = { m = m_0; e = Alt (ref [ g_E_TERM; g_ADD ]) } 28 | 29 | -------------------------------------------------------------------------------- /arithGrammar_WorklistWithLookahead.ml: -------------------------------------------------------------------------------- 1 | open PwZ_WorklistWithLookahead 2 | open ArithTags 3 | 4 | (* This format is the construction of our grammar. 5 | * 6 | * We have a Racket program that can generate these types of files, but this 7 | * example was written by hand. We would not recommend handwriting more complex 8 | * grammars, so we assume a "full" implementation of PwZ would utilize a similar 9 | * program to pre-generate these grammars. *) 10 | 11 | let rec g_INT = { m = m_0; e = T ("INT", t_INT); first = [| false; true; false; false; false; false |] } 12 | and g_OPEN_PAREN = { m = m_0; e = T ("(", t_OPEN_PAREN); first = [| false; false; true; false; false; false |] } 13 | and g_CLOSE_PAREN = { m = m_0; e = T (")", t_CLOSE_PAREN); first = [| false; false; false; true; false; false |] } 14 | and g_TIMES = { m = m_0; e = T ("*", t_TIMES); first = [| false; false; false; false; true; false |] } 15 | and g_PLUS = { m = m_0; e = T ("+", t_PLUS); first = [| false; false; false; false; false; true |] } 16 | 17 | and g_NUM = { m = m_0; e = Seq ("NUM", [ g_INT ]); first = [| false; true; false; false; false; false |] } 18 | and g_PAREN = { m = m_0; e = Seq ("PAREN", [ g_OPEN_PAREN; g_EXPR; g_CLOSE_PAREN ]); first = [| false; false; true; false; false; false |] } 19 | and g_ATOM = { m = m_0; e = Alt (ref [ g_NUM; g_PAREN ]); first = [| false; true; true; false; false; false |] } 20 | 21 | and g_MULT = { m = m_0; e = Seq ("MULT", [ g_TERM; g_TIMES; g_TERM ]); first = [| false; true; true; false; false; false |] } 22 | and g_T_ATOM = { m = m_0; e = Seq ("T_ATOM", [ g_ATOM ]); first = [| false; true; true; false; false; false |] } 23 | and g_TERM = { m = m_0; e = Alt (ref [ g_MULT; g_T_ATOM ]); first = [| false; true; true; false; false; false |] } 24 | 25 | and g_E_TERM = { m = m_0; e = Seq ("E_TERM", [ g_TERM ]); first = [| false; true; true; false; false; false |] } 26 | and g_ADD = { m = m_0; e = Seq ("ADD", [ g_EXPR; g_PLUS; g_EXPR ]); first = [| false; true; true; false; false; false |] } 27 | and g_EXPR = { m = m_0; e = Alt (ref [ g_E_TERM; g_ADD ]); first = [| false; true; true; false; false; false |] } 28 | 29 | -------------------------------------------------------------------------------- /arithTags.ml: -------------------------------------------------------------------------------- 1 | (* These are the token tags. 0 is reserved for the EOF in the parser. *) 2 | 3 | let t_INT = 1 4 | let t_OPEN_PAREN = 2 5 | let t_CLOSE_PAREN = 3 6 | let t_TIMES = 4 7 | let t_PLUS = 5 8 | -------------------------------------------------------------------------------- /arithTokenizer.ml: -------------------------------------------------------------------------------- 1 | open ArithTags 2 | 3 | exception FailedMatch 4 | 5 | type token = string * int 6 | 7 | let getInt (s : string) : token = (s, t_INT) 8 | 9 | (* We only support the simple tokens for this arithmetic grammar. *) 10 | let getToken (c : char) : token = 11 | match c with 12 | | '(' -> ("(", t_OPEN_PAREN) 13 | | ')' -> (")", t_CLOSE_PAREN) 14 | | '*' -> ("*", t_TIMES) 15 | | '+' -> ("+", t_PLUS) 16 | | _ -> raise FailedMatch 17 | 18 | (* The integer characters are ASCII values 48 through 57 (inclusive). *) 19 | let isInt (c : char) : bool = 20 | Char.code c >= 48 && Char.code c <= 57 21 | 22 | (* Simple tokenizer. Iterates through each character and generates a token if it 23 | * can. The only complex tokens are integers because they can be made up 24 | * multiple characters. These are stored into an accumulator list `acc` which is 25 | * tokenized whenever a non-integer character is encountered. *) 26 | let tokenize (str : string) : token list = 27 | let acc = ref "" in 28 | 29 | let append s n = 30 | s := !s ^ (String.make 1 n) 31 | in 32 | let rest s = 33 | String.sub s 1 ((String.length s) - 1) 34 | in 35 | let accumulate s xs = 36 | if String.length s > 0 37 | then (acc := ""; 38 | (getInt s) :: xs) 39 | else xs 40 | in 41 | 42 | let rec iter s xs = 43 | match s with 44 | | "" -> List.rev (accumulate !acc xs) 45 | | _ -> 46 | (match s.[0] with 47 | | ' ' -> 48 | iter (rest s) (accumulate !acc xs) 49 | | c when isInt c -> 50 | (append acc c; 51 | iter (rest s) xs) 52 | | c -> 53 | iter (rest s) ((getToken c) :: (accumulate !acc xs)) 54 | ) 55 | in iter str [] 56 | -------------------------------------------------------------------------------- /lookahead.ml: -------------------------------------------------------------------------------- 1 | open PwZ_WorklistWithLookahead 2 | open PwZ_WorklistWithLookahead_Help 3 | open ArithGrammar_WorklistWithLookahead 4 | open ArithTokenizer 5 | 6 | exception Bad_Arg_Count of int 7 | 8 | let parse (s : string) : exp list = 9 | parse (tokenize s) g_EXPR 10 | 11 | let string_list_of_exp_list (es : exp list) : string list = 12 | List.map string_of_exp es 13 | 14 | let print_string_list (ss : string list) : unit = 15 | List.iter print_string (ss @ ["\n"]) 16 | 17 | let () = 18 | if Array.length Sys.argv != 2 19 | then raise (Bad_Arg_Count (Array.length Sys.argv)) 20 | else print_string_list (string_list_of_exp_list (parse Sys.argv.(1))) 21 | -------------------------------------------------------------------------------- /pwZ_Worklist.ml: -------------------------------------------------------------------------------- 1 | (* Simple type aliases. 2 | * Fig 1. *) 3 | type lab = string (* token and sequence labels *) 4 | type tag = int (* token tag, used for token comparison *) 5 | type pos = int (* token position in input *) 6 | type tok = lab * tag (* token *) 7 | 8 | (* An exception for when a match fails. Should never appear. 9 | * This is primarily to suppress warnings in a safe manner. *) 10 | exception FailedMatch 11 | 12 | (* Additional types necessary for using zippers without memoization tables. 13 | * Fig 19. 14 | * The implementation of m_0 required the definition of a new `undefined` value, 15 | * which is given here. It is essentially a placeholder to be discarded. *) 16 | type exp = { mutable m : mem; e : exp' } 17 | 18 | and exp' = T of tok 19 | | Seq of lab * exp list 20 | | Alt of (exp list) ref 21 | 22 | and cxt = Top 23 | | SeqC of mem * lab * exp list * exp list 24 | | AltC of mem 25 | 26 | and mem = { 27 | start : pos; 28 | mutable parents : cxt list; 29 | mutable end_ : pos; 30 | mutable result : exp } 31 | 32 | type zipper = exp' * mem 33 | 34 | let rec undefined = { 35 | m = m_undefined; 36 | e = T ("undefined", -1) } 37 | 38 | and m_undefined = { 39 | start = -1; 40 | parents = []; 41 | end_ = -1; 42 | result = undefined } 43 | 44 | let m_0 = { 45 | start = -1; 46 | parents = []; 47 | end_ = -1; 48 | result = undefined } 49 | 50 | (* A global worklist. This is used for keeping track of what to do next. *) 51 | let worklist : (zipper list) ref = ref [] 52 | 53 | (* A list of "tops", which gives us parse-null of a Top for free. This is useful 54 | * so that in the end we can simply return the result. *) 55 | let tops : exp list ref = ref [] 56 | 57 | (* Core algorithm. Similar to Fig 20, but with additional steps taken for 58 | * performance. Note that the return type is now `unit`. *) 59 | let derive (p : pos) ((t, i) : tok) ((e, m) : zipper) : unit = 60 | 61 | let rec d_d (c : cxt) (e : exp) : unit = 62 | if p == e.m.start 63 | then (e.m.parents <- c :: e.m.parents; 64 | if p == e.m.end_ 65 | then d_u' e.m.result c 66 | else ()) 67 | else (let m = { start = p; parents = [c]; end_ = -1; result = undefined } in 68 | e.m <- m; 69 | d_d' m e.e) 70 | 71 | and d_d' (m : mem) (e : exp') : unit = 72 | match e with 73 | | T (t', i') -> 74 | if i == i' 75 | then worklist := (Seq (t, []), m) :: !worklist 76 | else () 77 | | Seq (l, []) -> d_u (Seq (l, [])) m 78 | | Seq (l, e :: es) -> d_d (SeqC (m, l, [], es)) e 79 | | Alt es -> List.iter 80 | (fun e -> d_d (AltC m) e) 81 | !es 82 | 83 | and d_u (e : exp') (m : mem) : unit = 84 | let e' = { m = m_0; e = e } in 85 | m.end_ <- p; 86 | m.result <- e'; 87 | List.iter (fun c -> d_u' e' c) m.parents 88 | 89 | and d_u' (e : exp) (c : cxt) : unit = 90 | match c with 91 | | Top -> tops := e :: !tops 92 | | SeqC (m, l, es, []) -> d_u (Seq (l, List.rev (e :: es))) m 93 | | SeqC (m, l, left, e' :: right) -> d_d (SeqC (m, l, e :: left, right)) e' 94 | | AltC m -> if p == m.end_ 95 | then match m.result.e with 96 | | Alt es -> es := e :: !es 97 | | _ -> raise FailedMatch 98 | else d_u (Alt (ref [e])) m 99 | 100 | in d_u e m 101 | 102 | (* Here we construct the initial zipper. This allows us to properly traverse the 103 | * grammar from the first step. This construction is similar in spirit to the 104 | * Seq/SeqC pair used on l318 (near the end of Section 4) of the paper. *) 105 | let init_zipper (e : exp) : zipper = 106 | let e' = Seq ("", []) in 107 | let m_top : mem = { start = 0; parents = [Top]; end_ = -1; result = undefined } in 108 | let c = SeqC (m_top, "", [], [e]) in 109 | let m_seq : mem = { start = 0; parents = [c]; end_ = -1; result = undefined } in 110 | (e', m_seq) 111 | 112 | (* When a result is produced, it will have some vestigial structure remaining 113 | * from the initial zipper (see above). This function removes those extra bits 114 | * so only the important stuff is returned once the parse is complete. *) 115 | let unwrap_top_exp (e : exp) : exp = 116 | match e.e with 117 | | Seq (_, [_; e']) -> e' 118 | | _ -> raise FailedMatch 119 | 120 | (* This is our wrapper/driver function. It initializes blank worklist and tops 121 | * lists for each element in the worklist. This allows for a generational style 122 | * of worklist (where "child processes" can each have their own worklist). 123 | * 124 | * The token tag 0 is assumed to be reserved for the end of the input. *) 125 | let parse (ts : tok list) (e : exp) : exp list = 126 | let rec parse (p : pos) (ts : tok list) : exp list = 127 | (let w = !worklist in 128 | worklist := []; 129 | tops := []; 130 | match ts with 131 | | [] -> List.iter (fun z -> derive p ("EOF", 0) z) w; 132 | List.map unwrap_top_exp !tops 133 | | ((t, s) :: ts') -> 134 | List.iter (fun z -> derive p (t, s) z) w; 135 | parse (p + 1) ts') 136 | in worklist := [init_zipper e]; 137 | parse 0 ts 138 | -------------------------------------------------------------------------------- /pwZ_WorklistWithLookahead.ml: -------------------------------------------------------------------------------- 1 | (* Simple type aliases. 2 | * Fig 1. *) 3 | type lab = string (* token and sequence labels *) 4 | type tag = int (* token tag, used for lookahead and token comparison *) 5 | type pos = int (* token position in input *) 6 | type tok = lab * tag (* token *) 7 | 8 | (* An exception when a match fails. Should never appear. 9 | * This is primarily to suppress warnings in a safe manner. *) 10 | exception FailedMatch 11 | 12 | (* Additional types necessary for using zippers without memoization tables. 13 | * Fig 19. 14 | * The implementation of m_0 required the definition of a new `undefined` value, 15 | * which is given here. It is essentially a placeholder to be discarded. *) 16 | type exp = { mutable m : mem; e : exp'; first : bool array } 17 | 18 | and exp' = T of tok 19 | | Seq of lab * exp list 20 | | Alt of (exp list) ref 21 | 22 | and cxt = Top 23 | | SeqC of mem * lab * exp list * exp list 24 | | AltC of mem 25 | 26 | and mem = { 27 | start : pos; 28 | mutable parents : cxt list; 29 | mutable end_ : pos; 30 | mutable result : exp } 31 | 32 | type zipper = exp' * mem 33 | 34 | let rec undefined : exp = { 35 | m = m_undefined; 36 | e = T ("undefined", -1); 37 | first = [| |] } 38 | 39 | and m_undefined : mem = { 40 | start = -1; 41 | parents = []; 42 | end_ = -1; 43 | result = undefined } 44 | 45 | let m_0 : mem = { 46 | start = -1; 47 | parents = []; 48 | end_ = -1; 49 | result = undefined } 50 | 51 | (* A global worklist. This is used for keeping track of what to do next. *) 52 | let worklist : (zipper list) ref = ref [] 53 | 54 | (* A list of "tops", which gives us parse-null of a Top for free. This is useful 55 | * so that in the end we can simply return the result. *) 56 | let tops : exp list ref = ref [] 57 | 58 | (* Core algorithm. Similar to Fig 20, but with additional steps taken for 59 | * performance. Note that the return type is now `unit`. *) 60 | let derive (p : pos) ((t, i) : tok) ((e, m) : zipper) : unit = 61 | 62 | let rec d_d (c : cxt) (e : exp) : unit = 63 | if p == e.m.start 64 | then (e.m.parents <- c :: e.m.parents; 65 | if p == e.m.end_ 66 | then d_u' e.m.result c 67 | else ()) 68 | else (let m = { start = p; parents = [c]; end_ = -1; result = undefined } in 69 | e.m <- m; 70 | d_d' m e.e) 71 | 72 | and d_d' (m : mem) (e : exp') : unit = 73 | match e with 74 | | T (t', i') -> 75 | if i == i' 76 | then worklist := (Seq (t, []), m) :: !worklist 77 | else () 78 | | Seq (l, []) -> d_u (Seq (l, [])) m 79 | | Seq (l, e :: es) -> d_d (SeqC (m, l, [], es)) e 80 | | Alt es -> List.iter 81 | (fun e -> 82 | if e.first.(i) 83 | then d_d (AltC m) e 84 | else ()) 85 | !es 86 | 87 | and d_u (e : exp') (m : mem) : unit = 88 | let e' = { m = m_0; e = e; first = [| |] } in 89 | m.end_ <- p; 90 | m.result <- e'; 91 | List.iter (fun c -> d_u' e' c) m.parents 92 | 93 | and d_u' (e : exp) (c : cxt) : unit = 94 | match c with 95 | | Top -> tops := e :: !tops 96 | | SeqC (m, l, es, []) -> d_u (Seq (l, List.rev (e :: es))) m 97 | | SeqC (m, l, left, e' :: right) -> d_d (SeqC (m, l, e :: left, right)) e' 98 | | AltC m -> if p == m.end_ 99 | then match m.result.e with 100 | | Alt es -> es := e :: !es 101 | | _ -> raise FailedMatch 102 | else d_u (Alt (ref [e])) m 103 | 104 | in d_u e m 105 | 106 | (* Here we construct the initial zipper. This allows us to properly traverse the 107 | * grammar from the first step. This construction is similar in spirit to the 108 | * Seq/SeqC pair used on l318 (near the end of Section 4) of the paper. *) 109 | let init_zipper (e : exp) : zipper = 110 | let e' = Seq ("", []) in 111 | let m_top : mem = { start = 0; parents = [Top]; end_ = -1; result = undefined } in 112 | let c = SeqC (m_top, "", [], [e]) in 113 | let m_seq : mem = { start = 0; parents = [c]; end_ = -1; result = undefined } in 114 | (e', m_seq) 115 | 116 | (* When a result is produced, it will have some vestigial structure remaining 117 | * from the initial zipper (see above). This function removes those extra bits 118 | * so only the important stuff is returned once the parse is complete. *) 119 | let unwrap_top_exp (e : exp) : exp = 120 | match e.e with 121 | | Seq (_, [_; e']) -> e' 122 | | _ -> raise FailedMatch 123 | 124 | (* This is our wrapper/driver function. It initializes blank worklist and tops 125 | * lists for each element in the worklist. This allows for a generational style 126 | * of worklist (where "child processes" can each have their own worklist). 127 | * 128 | * The token tag 0 is assumed to be reserved for the end of the input. *) 129 | let parse (ts : tok list) (e : exp) : exp list = 130 | let rec parse (p : pos) (ts : tok list) : exp list = 131 | (let w = !worklist in 132 | worklist := []; 133 | tops := []; 134 | match ts with 135 | | [] -> List.iter (fun z -> derive p ("EOF", 0) z) w; 136 | List.map unwrap_top_exp !tops 137 | | ((t, s) :: ts') -> 138 | List.iter (fun z -> derive p (t, s) z) w; 139 | parse (p + 1) ts') 140 | in worklist := [init_zipper e]; 141 | parse 0 ts 142 | -------------------------------------------------------------------------------- /pwZ_WorklistWithLookahead_Help.ml: -------------------------------------------------------------------------------- 1 | open PwZ_WorklistWithLookahead 2 | 3 | let concat_map (f : 'a -> 'b list) (l : 'a list) : 'b list = 4 | List.concat (List.map f l) 5 | 6 | let plug (p : pos) ((e, c) : zipper) : exp list = 7 | let rec up (e : exp') (m : mem) : exp list = 8 | let e' = { m = m_0; e = e; first = [| |] } in 9 | m.end_ <- p; 10 | m.result <- e'; 11 | concat_map (up' e') m.parents 12 | and up' (e : exp) (c : cxt) : exp list = 13 | match c with 14 | | Top -> [e] 15 | | SeqC (m, l, left, right) -> 16 | up (Seq (l, List.append (List.rev left) (e :: right))) m 17 | | AltC m -> 18 | if p == m.end_ 19 | then match m.result.e with 20 | | Alt es -> es := e :: !es; [] 21 | | _ -> raise FailedMatch 22 | else up (Alt (ref [e])) m 23 | in up e c 24 | 25 | let string_of_exp (e : exp) : string = 26 | let rec make_nice_string (e' : exp') (c : int) : string = 27 | let indent (i : int) (ss : string list) : string list = 28 | List.map (fun s -> (String.make i ' ') ^ s) ss 29 | in 30 | let join (ss : string list) : string = 31 | String.concat "\n" ss 32 | in 33 | let indent_subexp_strings (es : exp list) (ind : int) : string = 34 | match es with 35 | | [] -> "" 36 | | _ -> "\n" ^ (join (indent ind (List.map (fun e -> make_nice_string e ind) (List.map (fun e -> e.e) es)))) 37 | in 38 | 39 | match e' with 40 | | T (l, t) -> 41 | "(T " ^ l ^ ")" 42 | | Seq (l, []) -> 43 | "(Seq " ^ l ^ ")" 44 | | Seq (l, e :: es) -> 45 | (let leader = "(Seq " ^ l ^ " " in 46 | let ind = c + String.length leader in 47 | leader ^ (make_nice_string e.e ind) ^ (indent_subexp_strings es ind) ^ ")" 48 | ) 49 | | Alt (res) -> 50 | match !res with 51 | | [] -> 52 | "(Alt)" 53 | | e :: es -> 54 | (let leader = "(Alt " in 55 | let ind = c + String.length leader in 56 | leader ^ (make_nice_string e.e ind) ^ (indent_subexp_strings es ind) ^ ")" 57 | ) 58 | in make_nice_string e.e 0 59 | -------------------------------------------------------------------------------- /pwZ_Worklist_Help.ml: -------------------------------------------------------------------------------- 1 | open PwZ_Worklist 2 | 3 | let concat_map (f : 'a -> 'b list) (l : 'a list) : 'b list = 4 | List.concat (List.map f l) 5 | 6 | let plug (p : pos) ((e, c) : zipper) : exp list = 7 | let rec up (e : exp') (m : mem) : exp list = 8 | let e' = { m = m_0; e = e } in 9 | m.end_ <- p; 10 | m.result <- e'; 11 | concat_map (up' e') m.parents 12 | and up' (e : exp) (c : cxt) : exp list = 13 | match c with 14 | | Top -> [e] 15 | | SeqC (m, l, left, right) -> 16 | up (Seq (l, List.append (List.rev left) (e :: right))) m 17 | | AltC m -> 18 | if p == m.end_ 19 | then match m.result.e with 20 | | Alt es -> es := e :: !es; [] 21 | | _ -> raise FailedMatch 22 | else up (Alt (ref [e])) m 23 | in up e c 24 | 25 | let string_of_exp (e : exp) : string = 26 | let rec make_nice_string (e' : exp') (c : int) : string = 27 | let indent (i : int) (ss : string list) : string list = 28 | List.map (fun s -> (String.make i ' ') ^ s) ss 29 | in 30 | let join (ss : string list) : string = 31 | String.concat "\n" ss 32 | in 33 | let indent_subexp_strings (es : exp list) (ind : int) : string = 34 | match es with 35 | | [] -> "" 36 | | _ -> "\n" ^ (join (indent ind (List.map (fun e -> make_nice_string e ind) (List.map (fun e -> e.e) es)))) 37 | in 38 | 39 | match e' with 40 | | T (l, t) -> 41 | "(T " ^ l ^ ")" 42 | | Seq (l, []) -> 43 | "(Seq " ^ l ^ ")" 44 | | Seq (l, e :: es) -> 45 | (let leader = "(Seq " ^ l ^ " " in 46 | let ind = c + String.length leader in 47 | leader ^ (make_nice_string e.e ind) ^ (indent_subexp_strings es ind) ^ ")" 48 | ) 49 | | Alt (res) -> 50 | match !res with 51 | | [] -> 52 | "(Alt)" 53 | | e :: es -> 54 | (let leader = "(Alt " in 55 | let ind = c + String.length leader in 56 | leader ^ (make_nice_string e.e ind) ^ (indent_subexp_strings es ind) ^ ")" 57 | ) 58 | in make_nice_string e.e 0 59 | -------------------------------------------------------------------------------- /worklist.ml: -------------------------------------------------------------------------------- 1 | open PwZ_Worklist 2 | open PwZ_Worklist_Help 3 | open ArithGrammar_Worklist 4 | open ArithTokenizer 5 | 6 | exception Bad_Arg_Count of int 7 | 8 | let parse (s : string) : exp list = 9 | parse (tokenize s) g_EXPR 10 | 11 | let string_list_of_exp_list (es : exp list) : string list = 12 | List.map string_of_exp es 13 | 14 | let print_string_list (ss : string list) : unit = 15 | List.iter print_string (ss @ ["\n"]) 16 | 17 | let () = 18 | if Array.length Sys.argv != 2 19 | then raise (Bad_Arg_Count (Array.length Sys.argv)) 20 | else print_string_list (string_list_of_exp_list (parse Sys.argv.(1))) 21 | --------------------------------------------------------------------------------