├── .gitignore ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── benchmarks ├── bench.sh ├── calc │ ├── calc.ml │ ├── dune │ ├── ext.ml │ ├── ext2.ml │ ├── lexer.mll │ ├── parser.mly │ ├── prio.ml │ └── simple.ml ├── dune ├── lexer.mll ├── parser.mly ├── seq.ml └── sexp │ ├── ast.ml │ ├── dune │ ├── err.ml │ ├── lexer.mll │ ├── lr.ml │ ├── parser.mly │ ├── rr.ml │ └── sexp.ml ├── dune ├── dune-project ├── examples ├── README.md ├── calc.ml ├── calc_ext.ml ├── calc_ext2.ml ├── calc_prio.ml ├── catalan.ml ├── dune ├── paragraphs.ml ├── sexp.ml ├── test.txt └── test_ext2.txt ├── lib ├── assoc.ml ├── assoc.mli ├── bench.ml ├── blank.ml ├── blank.mli ├── charset.ml ├── charset.mli ├── comb.ml ├── comb.mli ├── container.ml ├── container.mli ├── dune ├── grammar.ml ├── grammar.mli ├── hashtbl_eq.ml ├── hashtbl_eq.mli ├── heap.ml ├── heap.mli ├── index.mld ├── input.ml ├── input.mli ├── interpolate.ml ├── interpolate.mli ├── keywords.ml ├── keywords.mli ├── lex.ml ├── lex.mli ├── pos.ml ├── pos.mli ├── regexp.ml ├── regexp.mli ├── unionFind.ml ├── unionFind.mli ├── utf8.ml ├── utf8.mli ├── word_list.ml └── word_list.mli ├── pacomb.opam ├── ppx ├── dune └── ppx_pacomb.ml ├── tests ├── Break.ml ├── GraphemeBreakTest.txt ├── big_expr.ml ├── calc_factor.ml ├── calc_utf8.ml ├── dseq_test.ml ├── dune ├── hard.ml ├── ppx_test.ml ├── scan.ml └── test.ml └── tools └── sanity_check.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.install 3 | _build 4 | .merlin 5 | attic 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | install: test -e .travis-opam.sh || wget https://raw.githubusercontent.com/ocaml/ocaml-ci-scripts/master/.travis-opam.sh 3 | script: bash -ex .travis-opam.sh 4 | env: 5 | - OCAML_VERSION=4.02 6 | - OCAML_VERSION=4.03 7 | - OCAML_VERSION=4.04 8 | - OCAML_VERSION=4.05 9 | - OCAML_VERSION=4.06 10 | - OCAML_VERSION=4.07 11 | - OCAML_VERSION=4.08 12 | - OCAML_VERSION=4.09 13 | - OCAML_VERSION=4.10 14 | - OCAML_VERSION=4.11 15 | - OCAML_VERSION=4.12 16 | - OCAML_VERSION=4.13 17 | os: 18 | - freebsd 19 | - linux 20 | - osx 21 | arch: 22 | - amd64 23 | - arm64 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Christophe Raffalli & Rodolphe Lepigre 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | .PHONY: all 4 | all: 5 | dune build 6 | 7 | .PHONY: check 8 | check: 9 | tools/sanity_check.sh 10 | 11 | .PHONY: tests 12 | tests: all check 13 | dune runtest 14 | 15 | .PHONY: bench 16 | bench: all check 17 | ./benchmarks/bench.sh 18 | 19 | .PHONY: clean 20 | clean: 21 | dune clean 22 | rm -rf doc 23 | find -name \*.csv -exec rm {} \; 24 | 25 | .PHONY: install 26 | install: all 27 | dune install 28 | 29 | .PHONY: doc 30 | doc: 31 | dune build @doc 32 | 33 | .PHONY: install_doc 34 | install_doc: doc 35 | rsync -r --delete _build/default/_doc/_html/ ${HOME}/WWW2/Raffalli/pacomb/ 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PaComb: an efficient parsing library for OCaml 2 | ============================================== 3 | 4 | PaComb implements a representation of grammars with semantic actions (values 5 | returned as a result of parsing). Parsing is performed by compiling grammars 6 | defined with the `Grammar` module (or indirectly through a PPX extension) to 7 | the combinators of the `Combinator` module. The library offers _scannerless_ 8 | parsing, but the `Lex` module provides a notion of _terminals_ and _blanks_ 9 | that gives a simple way to write grammars in two phases, as usual. 10 | 11 | The main advantage of PaComb and similar solutions, compared to ocamlyacc, is 12 | that grammars (compiled or not) are __first-class values__. This allows using 13 | the full power of OCaml for manipulating grammars. For example, this is very 14 | useful when working with syntax extension mechanisms. 15 | 16 | Importantly, the __performance__ of PaComb is very good: it is only two to 17 | five times slower than parsers generated by ocamlyacc, which is a compiler. 18 | 19 | Defining languages using the `Grammar` module directly is cumbersome. For that 20 | reason, PaComb provides a BNF-like PPX syntax extension (enabled using the 21 | `-ppx pacomb.ppx` compilation flag). 22 | 23 | Complete documentation is available via ocamldoc (`make doc`). 24 | 25 | PaComb also supports self-extensible grammars, ambiguous grammars (with merge), 26 | late rejection of rules by raising exceptions from action code, priorities, and more.
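A grammar defined with the PPX is an ordinary OCaml value that is run through the `Grammar` module. The following minimal sketch reuses the `expr` grammar of the calculator example shown below (the input string is arbitrary), in the style of the programs in the `examples/` directory:

```
open Pacomb

(* treat spaces as the only blank characters *)
let blank = Blank.from_charset (Charset.singleton ' ')

let _ =
  (* Pos.handle_exception reports syntax errors with their position *)
  let v = Pos.handle_exception (Grammar.parse_string (expr Sum) blank) "1 + 2 * (3 - 4)" in
  Printf.printf "%g\n" v
```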
27 | 28 | A complete [documentation is available](https://raffalli.eu/pacomb/pacomb) 29 | 30 | As a teaser, here is the usual calculator example: 31 | 32 | ``` 33 | (* The three levels of priorities *) 34 | type p = Atom | Prod | Sum 35 | 36 | let%parser rec 37 | (* This includes each priority level in the next one *) 38 | expr p = Atom < Prod < Sum 39 | (* all other rules are selected by their priority level *) 40 | ; (p=Atom) (x::FLOAT) => x 41 | ; (p=Atom) '(' (e::expr Sum) ')' => e 42 | ; (p=Prod) (x::expr Prod) '*' (y::expr Atom) => x*.y 43 | ; (p=Prod) (x::expr Prod) '/' (y::expr Atom) => x/.y 44 | ; (p=Sum ) (x::expr Sum ) '+' (y::expr Prod) => x+.y 45 | ; (p=Sum ) (x::expr Sum ) '-' (y::expr Prod) => x-.y 46 | ``` 47 | -------------------------------------------------------------------------------- /benchmarks/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | make 4 | 5 | opts=--no-print-directory 6 | cd benchmarks 7 | 8 | echo "testing seq" 9 | dune exec $opts -- ./seq.exe 10 | 11 | echo "testing sexp" 12 | dune exec $opts -- ./sexp/sexp.exe 13 | 14 | echo "testing calc" 15 | dune exec $opts -- ./calc/calc.exe 16 | 17 | cd ../examples 18 | 19 | echo "testing catalan" 20 | dune exec $opts -- ./catalan.exe 80 2 21 | dune exec $opts -- ./catalan.exe 60 3 22 | -------------------------------------------------------------------------------- /benchmarks/calc/calc.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | let test = Array.length Sys.argv > 1 && Sys.argv.(1) = "--test" 4 | 5 | let rec gen_expr n p s ch = 6 | let print_string s = 7 | output_string ch s; String.length s 8 | in 9 | let rec seq op n atom = 10 | if n <= 0 then atom () 11 | else (atom () + print_string op + seq op (n-1) atom) 12 | in 13 | let paren atom = 14 | print_string ")" + atom () + print_string "(" 15 | in 16 | let int () = 17 | print_string (string_of_int (Random.int 0x20000000)) 18 | in 19 | if n <= 0 then int () 20 | else seq (if Random.bool () then " + " else " - ") 21 | s (fun () -> seq 22 | (if Random.bool () then "*" else "/") 23 | p (fun () -> 24 | if n = 1 then int () else 25 | paren (fun () -> gen_expr (n - 1) p s ch))) 26 | 27 | (* blanks *) 28 | let blank = Blank.from_charset (Charset.singleton ' ') 29 | 30 | let _ = 31 | let bench_simple = Bench.create () in 32 | let bench_prio = Bench.create () in 33 | let bench_ext = Bench.create () in 34 | let bench_ext2 = Bench.create () in 35 | let bench_yacc = Bench.create () in 36 | for n = (if test then 1 else 3) to (if test then 2 else 5) do 37 | for p = 2 to 4 do 38 | for s = 2 to (if n = 6 && p = 4 then 3 else 4) do 39 | let producer ch = let r = gen_expr n p s ch in Printf.fprintf ch "\n%!"; r in 40 | let producer2 ch = 41 | Printf.fprintf ch "rule 1.0 : Exp 1.0 Str \"*\" Exp 0.9 => Op2 \"*\" \n\ 42 | rule 1.0 : Exp 1.0 Str \"/\" Exp 0.9 => Op2 \"/\" \n\ 43 | rule 2.0 : Exp 2.0 Str \"+\" Exp 1.9 => Op2 \"+\" \n\ 44 | rule 2.0 : Exp 2.0 Str \"-\" Exp 1.9 => Op2 \"-\" \n\ 45 | "; 46 | producer ch 47 | in 48 | let size = Bench.size producer in 49 | let ((),ts,w) = Bench.parse_pipe bench_simple Simple.top blank size producer in 50 | Printf.printf "simple %d %d %d %.2f Mb in %.2fms %.2f Mb \n%!" n p s 51 | (float size /. 1024. /. 1024.) 52 | (1000. *. ts) (float w /. 1024. /. 1024. *. float Sys.word_size); 53 | let (_,tp,w) = Bench.parse_pipe bench_prio Prio.top blank size producer in 54 | Printf.printf "prio %d %d %d %.2f Mb in %.2fms %.2f Mb \n%!"
n p s 55 | (float size /. 1024. /. 1024.) 56 | (1000. *. tp) (float w /. 1024. /. 1024. *. float Sys.word_size); 57 | let (_,te,w) = Bench.parse_pipe bench_ext Ext.top blank size producer in 58 | Printf.printf "ext %d %d %d %.2f Mb in %.2fms %.2f Mb \n%!" n p s 59 | (float size /. 1024. /. 1024.) 60 | (1000. *. te) (float w /. 1024. /. 1024. *. float Sys.word_size); 61 | let ((),t2,w) = Bench.parse_pipe bench_ext2 Ext2.top blank size producer2 in 62 | Printf.printf "ext2 %d %d %d %.2f Mb in %.2fms %.2f Mb \n%!" n p s 63 | (float size /. 1024. /. 1024.) 64 | (1000. *. t2) (float w /. 1024. /. 1024. *. float Sys.word_size); 65 | let (_,ty,w) = Bench.yacc_pipe bench_yacc Parser.main Lexer.token size producer in 66 | Printf.printf "yacc %d %d %d %.2f Mb in %.2fms %.2f Mb \n%!" n p s 67 | (float size /. 1024. /. 1024.) 68 | (1000. *. ty) (float w /. 1024. /. 1024. *. float Sys.word_size); 69 | Printf.printf "simple/yacc : %f " (ts /. ty); 70 | Printf.printf "prio/yacc : %f " (tp /. ty); 71 | Printf.printf "ext/yacc : %f " (te /. ty); 72 | Printf.printf "ext2/yacc : %f " (t2 /. ty); 73 | Printf.printf "prio/simple: %f " (tp /. ts); 74 | Printf.printf "ext/simple: %f " (te /. ts); 75 | Printf.printf "ext2/simple: %f\n%!" (t2 /. ts); 76 | done 77 | done 78 | done; 79 | Bench.stats "simple" bench_simple; 80 | Bench.stats "prio " bench_prio; 81 | Bench.stats "ext " bench_ext; 82 | Bench.stats "ext2 " bench_ext2; 83 | Bench.stats "yacc " bench_yacc; 84 | if not test then 85 | begin 86 | Bench.csv bench_simple "simple.csv"; 87 | Bench.csv bench_prio "prio.csv"; 88 | Bench.csv bench_ext "ext.csv"; 89 | Bench.csv bench_ext2 "ext2.csv"; 90 | Bench.csv bench_yacc "yacc.csv" 91 | end 92 | -------------------------------------------------------------------------------- /benchmarks/calc/dune: -------------------------------------------------------------------------------- 1 | (test 2 | (name calc) 3 | (modules :standard) 4 | (preprocess (pps pacomb.ppx)) 5 | (libraries unix pacomb) 6 | (action (run ./calc.exe --test))) 7 | 8 | (rule 9 | (targets lexer.ml) 10 | (deps lexer.mll) 11 | (action (run ocamllex lexer.mll))) 12 | 13 | (rule 14 | (targets parser.ml parser.mli) 15 | (deps parser.mly) 16 | (action (run ocamlyacc parser.mly))) 17 | -------------------------------------------------------------------------------- /benchmarks/calc/ext.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | let eps = 1e-10 4 | 5 | type assoc = RightAssoc | LeftAssoc | NonAssoc 6 | 7 | let cs = Charset.(complement (from_string "0-9()")) 8 | 9 | let bins = Word_list.create ~cs () 10 | 11 | let _ = 12 | Word_list.add_ascii bins "*" (( *. ), 4.0, LeftAssoc); 13 | Word_list.add_ascii bins "/" (( /. ),4.0, LeftAssoc); 14 | Word_list.add_ascii bins "+" (( +. ),6.0, LeftAssoc); 15 | Word_list.add_ascii bins "-" (( -. ),6.0, LeftAssoc) 16 | 17 | let%parser op pmin pmax = 18 | ((f,p,a)::Word_list.word bins) => ( 19 | let good = match a with 20 | | NonAssoc -> pmin < p && p < pmax 21 | | LeftAssoc -> pmin <= p && p < pmax 22 | | RightAssoc -> pmin < p && p <= pmax 23 | in 24 | if not good then Lex.give_up (); 25 | let p = match a with 26 | | RightAssoc -> p 27 | | _ -> p -. 
1e-10 28 | in 29 | (p,f)) 30 | 31 | let%parser rec 32 | expr pmax = ((pe,e1)>:expr pmax) ((pop,b)>:op pe pmax) ((__,e2)::expr pop) 33 | => (pop, b e1 e2) 34 | ; (x::FLOAT) => (0.0,x) 35 | ; '(' (e::expr_top) ')' => (0.0,e) 36 | 37 | and expr_top = ((__,e)::expr 1000.0) => e 38 | 39 | let%parser rec top = () => () ; top (__::expr_top) '\n' => () 40 | -------------------------------------------------------------------------------- /benchmarks/calc/ext2.ml: -------------------------------------------------------------------------------- 1 | (* This code is to benchmark extensible grammar ... 2 | it is not the best code to do a calculator with an extensible grammar *) 3 | 4 | open Pacomb 5 | 6 | let get_op2 = function 7 | | "*" -> ( *. ) 8 | | "+" -> ( +. ) 9 | | "/" -> ( /. ) 10 | | "-" -> ( -. ) 11 | | "^" -> ( ** ) 12 | | _ -> failwith "invalid binary op" 13 | 14 | let get_op1 = function 15 | | "cos" -> cos 16 | | "sin" -> sin 17 | | "tan" -> tan 18 | | "ln" -> log 19 | | "exp" -> exp 20 | | _ -> failwith "invalid unary op" 21 | 22 | (** Bigger float = lower priority, 0.0 is for atomic expresion, 23 | so all priorities must be positive *) 24 | type prio = float 25 | 26 | (** list of all priorities, in decreasing order (lowest priority first). *) 27 | type prios = prio list 28 | 29 | (** a parsing rule: a grammar from an environment *) 30 | type 'a rule = env -> 'a Grammar.t 31 | 32 | and rules = (float * float rule list) list 33 | 34 | (** parsing environment: all rules and all prios sorted in 35 | decreasing order (lowest priority first). *) 36 | and env = { rules : rules; prios : prios } 37 | 38 | let empty_env = { rules = []; prios = [] } 39 | 40 | let add_prio p env = 41 | { env with prios = List.rev (List.sort_uniq compare (p::env.prios)) } 42 | 43 | (** get the next priority *) 44 | let next_prio p env = 45 | let rec fn = function 46 | | x::(y::_) when x = p -> y 47 | | x::l -> assert (x >= p); fn l 48 | | _ -> 0.0 49 | in 50 | fn env.prios 51 | 52 | (** get the priority nearest to p . For associativity, 53 | we will use [p - epsilon] to get the priority below p, 54 | and get_prio ill fetch the priority below p *) 55 | let get_prio p env = 56 | let rec fn = function 57 | | x::_ when x <= p -> x 58 | | _::l -> fn l 59 | | _ -> 0.0 60 | in 61 | fn env.prios 62 | 63 | (** the maximum priority *) 64 | let max_prio env = match env.prios with 65 | | x::_ -> x | [] -> 0.0 66 | 67 | (** add a rule with the given priority *) 68 | let add_rule prio r env = 69 | let old = try List.assoc prio env.rules with Not_found -> [] in 70 | let rules = (List.filter (fun (p,_) -> prio <> p) env.rules) in 71 | { env with rules = (prio, r::old) :: rules } 72 | 73 | (** get all the rule of a given priority *) 74 | let get_rule : prio -> env -> float Grammar.t = fun p env -> 75 | let rules = List.assoc p env.rules in 76 | let rules = List.map (fun r -> r env) rules in 77 | Grammar.alt rules 78 | 79 | let pr (_, p) = Printf.sprintf "%g" p 80 | 81 | (** the parsing for expression *) 82 | let%parser [@print_param pr] rec expr env (prio:prio) = 83 | (* constant *) 84 | (prio = 0.) (x::FLOAT) => x 85 | (* parenthesis, using max_prio *) 86 | ; (prio = 0.) '(' (x::expr env (max_prio env)) ')' => x 87 | (* incluse next priority level *) 88 | ; (prio > 0.) (x::expr env (next_prio prio env)) => x 89 | (* get all the rule for the level *) 90 | ; (prio > 0.) 
(x::get_rule prio env) => x 91 | 92 | (** a type of type *) 93 | type _ ty = 94 | Flt : float ty 95 | | Arr : 'a ty * 'b ty -> ('a -> 'b) ty 96 | 97 | (** the magic parsing : parse a BNF rule and return the parser 98 | for that BNF, parametrized by the current environment *) 99 | (** Remark: we need fake, dependant sequence (<:) because other 100 | with the construction of the grammar loops producing bigger 101 | and bigger types. Dependant sequence builds the grammar lazily *) 102 | let%parser rec rule : type a. a ty -> (env -> a Grammar.t) Grammar.t 103 | = fun t -> 104 | "Exp" (prio<:FLOAT) (r::rule (Arr(Flt,t))) => 105 | (fun env -> (x::expr env (get_prio prio env)) (f::r env) => f x) 106 | ; "Str" (s<:STRING_LIT) (r::rule t) => 107 | (fun env -> (STR s) (x::r env) => x) 108 | ; "=>" (a::action t) => (fun _ -> () => a) 109 | 110 | (** action, syntaxe style calculette HP *) 111 | and action : type a. a ty -> a Grammar.t 112 | = fun t -> 113 | "Cst" (x<:FLOAT) (f::action (Arr(Flt,t))) => f x 114 | ; (t =| Arr(Flt,t1)) "Op1" (s<:STRING_LIT) (f::action (Arr(Flt,t1))) => 115 | (let g = get_op1 s in (fun x -> f (g x) : a)) 116 | ; (t =| Arr(Flt,Arr(Flt,t1))) "Op2" (s<:STRING_LIT) (f::action (Arr(Flt,t1))) => 117 | (let g = get_op2 s in (fun x y -> f (g y x) : a)) 118 | ; (t =| Arr(Flt,Flt)) () => (fun x -> x) 119 | 120 | (** The command parsing a new rule *) 121 | let%parser new_rule env = 122 | "rule" (p::FLOAT) ":" (r::rule Flt) '\n' => 123 | let env = add_rule p r env in 124 | let env = add_prio p env in 125 | (env, ()) 126 | 127 | let%parser top_expr env = 128 | (x::expr env (max_prio env)) '\n' => x 129 | 130 | (** main parsing, right recursion with no action is ok now *) 131 | let%parser rec cmds env = 132 | () => () 133 | ; (top_expr env) (cmds env) => () 134 | ; ((env,()) >: new_rule env) (cmds env) => () 135 | 136 | let top = cmds empty_env 137 | -------------------------------------------------------------------------------- /benchmarks/calc/lexer.mll: -------------------------------------------------------------------------------- 1 | { 2 | open Parser (* The type token is defined in parser.mli *) 3 | exception Eof 4 | } 5 | rule token = parse 6 | [' ''\t''\r'] { token lexbuf } (* skip blanks *) 7 | | '\n' { EOL } 8 | | eof { EOF } 9 | | ['0'-'9']+(['.']['0'-'9']+)?(['e''E']['-']?['0'-'9']+)? as lxm 10 | { FLOAT(float_of_string lxm) } 11 | | '+' { PLUS } 12 | | '-' { MINUS } 13 | | '*' { TIMES } 14 | | '*' '*' { POW } 15 | | '/' { DIV } 16 | | '(' { LPAREN } 17 | | ')' { RPAREN } 18 | -------------------------------------------------------------------------------- /benchmarks/calc/parser.mly: -------------------------------------------------------------------------------- 1 | /* File parser.mly */ 2 | %token FLOAT 3 | %token PLUS MINUS TIMES DIV POW 4 | %token LPAREN RPAREN 5 | %token EOL EOF 6 | %left PLUS MINUS /* lowest precedence */ 7 | %left TIMES DIV /* medium precedence */ 8 | %right POW 9 | %nonassoc UMINUS /* highest precedence */ 10 | %start main /* the entry point */ 11 | %type main 12 | %% 13 | main: 14 | { () } 15 | | main top { () } 16 | ; 17 | top: expr EOL { $1 } 18 | ; 19 | expr: 20 | FLOAT { $1 } 21 | | LPAREN expr RPAREN { $2 } 22 | | expr PLUS expr { $1 +. $3 } 23 | | expr MINUS expr { $1 -. $3 } 24 | | expr TIMES expr { $1 *. $3 } 25 | | expr DIV expr { $1 /. $3 } 26 | | expr POW expr { $1 ** $3 } 27 | | MINUS expr %prec UMINUS { -. 
$2 } 28 | | PLUS expr %prec UMINUS { $2 } 29 | ; 30 | -------------------------------------------------------------------------------- /benchmarks/calc/prio.ml: -------------------------------------------------------------------------------- 1 | 2 | (* classical calculator example, with parameters priority level *) 3 | (* The three levels of priorities *) 4 | type p = Atom | Prod | Sum 5 | 6 | (* for printing, we provide a function to convert priorities to string *) 7 | let%parser rec 8 | (* This includes each priority level in the next one *) 9 | expr p = Atom < Prod < Sum 10 | (* all other rule are selected by their priority level *) 11 | ; (p=Atom) (x::FLOAT) => x 12 | ; (p=Atom) '(' (e::expr Sum) ')' => e 13 | ; (p=Prod) (x::expr Prod) '*' (y::expr Atom) => x*.y 14 | ; (p=Prod) (x::expr Prod) '/' (y::expr Atom) => x/.y 15 | ; (p=Sum ) (x::expr Sum ) '+' (y::expr Prod) => x+.y 16 | ; (p=Sum ) (x::expr Sum ) '-' (y::expr Prod) => x-.y 17 | 18 | let%parser rec top = () => () ; top (__::expr Sum) '\n' => () 19 | -------------------------------------------------------------------------------- /benchmarks/calc/simple.ml: -------------------------------------------------------------------------------- 1 | 2 | (* classical calculator example, with a non terminal for each priority level *) 3 | let%parser rec 4 | atom = (x::FLOAT) => x (* constant *) 5 | ; '(' (e::expr) ')' => e (* rule for parenthesis *) 6 | 7 | (* Here is the grammar for products *) 8 | and prod = (a::atom) => a 9 | ; (x::prod) '*' (y::atom) => x*.y 10 | ; (x::prod) '/' (y::atom) => x/.y 11 | 12 | (* and finally all remaining expressions *) 13 | and expr = (a::prod) => a 14 | ; (x::expr) '+' (y::prod) => x+.y 15 | ; (x::expr) '-' (y::prod) => x-.y 16 | 17 | let%parser rec top = () => () ; top (__::expr) '\n' => () 18 | -------------------------------------------------------------------------------- /benchmarks/dune: -------------------------------------------------------------------------------- 1 | (test 2 | (name seq) 3 | (modules :standard) 4 | (preprocess (pps pacomb.ppx)) 5 | (libraries unix pacomb) 6 | (action (run ./seq.exe --test))) 7 | 8 | (rule 9 | (targets lexer.ml) 10 | (deps lexer.mll) 11 | (action (run ocamllex lexer.mll))) 12 | 13 | (rule 14 | (targets parser.ml parser.mli) 15 | (deps parser.mly) 16 | (action (run ocamlyacc parser.mly))) 17 | -------------------------------------------------------------------------------- /benchmarks/lexer.mll: -------------------------------------------------------------------------------- 1 | { 2 | open Parser (* The type token is defined in parser.mli *) 3 | exception Eof 4 | } 5 | rule token = parse 6 | | 'a' { A } 7 | | eof { EOF } 8 | -------------------------------------------------------------------------------- /benchmarks/parser.mly: -------------------------------------------------------------------------------- 1 | /* File parser.mly */ 2 | %token A 3 | %token EOF 4 | %start lr_top /* the entry point */ 5 | %start rr_top /* the entry point */ 6 | %type lr_top 7 | %type rr_top 8 | %% 9 | lr: 10 | { () } 11 | | lr A { () } 12 | ; 13 | rr: 14 | { () } 15 | | A rr { () } 16 | ; 17 | lr_top: lr EOF { $1 } 18 | ; 19 | rr_top: rr EOF { $1 } 20 | ; 21 | -------------------------------------------------------------------------------- /benchmarks/seq.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | let test = Array.length Sys.argv > 1 && Sys.argv.(1) = "--test" 4 | 5 | let expr n ch = 6 | for _ = 1 to n do 7 | 
output_char ch 'a' 8 | done; 9 | n 10 | 11 | module Lr = struct 12 | let%parser rec lr = () => () 13 | ; lr 'a' => () 14 | 15 | let%parser top = lr EOF => () 16 | end 17 | 18 | module Ld = struct 19 | let%parser rec lr = () => () 20 | ; (__<:lr) 'a' => () 21 | 22 | let%parser top = lr EOF => () 23 | end 24 | 25 | module Rr = struct 26 | let%parser rec rr = () => () 27 | ; 'a' rr => () 28 | 29 | let%parser top = (x::rr) EOF => x 30 | end 31 | 32 | module Rrl = struct 33 | let%parser rec rr = () => lazy 0 34 | ; 'a' (lazy x::rr) => lazy (x+1) 35 | 36 | let%parser top = (x::rr) EOF => Lazy.force x 37 | end 38 | 39 | module Rd = struct 40 | let%parser rec rr = () => () 41 | ; (__<:'a') (__::rr) => () 42 | 43 | let%parser top = (x::rr) EOF => x 44 | end 45 | 46 | module Rdl = struct 47 | let%parser rec rr = () => lazy 0 48 | ; (__<:'a') (lazy x::rr) => lazy (x+1) 49 | 50 | let%parser top = (x::rr) EOF => Lazy.force x 51 | end 52 | 53 | module Lp = struct 54 | let%parser rec lr = () => () 55 | ; lr 'a' => let _ = _pos in () 56 | 57 | let%parser top = lr EOF => () 58 | end 59 | 60 | module Rp = struct 61 | let%parser rec rr = () => lazy () 62 | ; 'a' (lazy __::rr) => lazy (let _ = _pos in ()) 63 | 64 | let%parser top = (x::rr) EOF => Lazy.force x 65 | end 66 | 67 | 68 | let blank = Blank.none 69 | 70 | let _ = 71 | let bench_lr = Bench.create () in 72 | let bench_rr = Bench.create () in 73 | let bench_rrl = Bench.create () in 74 | let bench_ld = Bench.create () in 75 | let bench_rd = Bench.create () in 76 | let bench_rdl = Bench.create () in 77 | let bench_lp = Bench.create () in 78 | let bench_rp = Bench.create () in 79 | let bench_ly = Bench.create () in 80 | let bench_ry = Bench.create () in 81 | let bench_dum = Bench.create () in 82 | let producer0 ch = let r = expr 0 ch in Printf.fprintf ch "%!"; r in 83 | for n = 1 to (if test then 2 else 20) do 84 | let n = n * 200_000 in 85 | let producer ch = let r = expr n ch in Printf.fprintf ch "%!"; r in 86 | let size = Bench.size producer in 87 | let (_,tl,w) = Bench.parse_pipe bench_lr Lr.top blank size producer in 88 | Printf.printf "lr %d %.2f Mb in %.2fms %.2f Mb \n%!" n 89 | (float size /. 1024. /. 1024.) 90 | (1000. *. tl) (float w /. 1024. /. 1024. *. float Sys.word_size); 91 | let (_,tr,w) = Bench.parse_pipe bench_rr Rr.top blank size producer in 92 | Printf.printf "rr %d %.2f Mb in %.2fms %.2f Mb \n%!" n 93 | (float size /. 1024. /. 1024.) 94 | (1000. *. tr) (float w /. 1024. /. 1024. *. float Sys.word_size); 95 | let (_,trl,w) = Bench.parse_pipe bench_rrl Rrl.top blank size producer in 96 | Printf.printf "rrl %d %.2f Mb in %.2fms %.2f Mb \n%!" n 97 | (float size /. 1024. /. 1024.) 98 | (1000. *. trl) (float w /. 1024. /. 1024. *. float Sys.word_size); 99 | let (_,tld,w) = Bench.parse_pipe bench_ld Ld.top blank size producer in 100 | Printf.printf "ld %d %.2f Mb in %.2fms %.2f Mb \n%!" n 101 | (float size /. 1024. /. 1024.) 102 | (1000. *. tld) (float w /. 1024. /. 1024. *. float Sys.word_size); 103 | let (_,trd,w) = Bench.parse_pipe bench_rd Rd.top blank size producer in 104 | Printf.printf "rd %d %.2f Mb in %.2fms %.2f Mb \n%!" n 105 | (float size /. 1024. /. 1024.) 106 | (1000. *. trd) (float w /. 1024. /. 1024. *. float Sys.word_size); 107 | let (_,trdl,w) = Bench.parse_pipe bench_rdl Rdl.top blank size producer in 108 | Printf.printf "rdl %d %.2f Mb in %.2fms %.2f Mb \n%!" n 109 | (float size /. 1024. /. 1024.) 110 | (1000. *. trdl) (float w /. 1024. /. 1024. *. 
float Sys.word_size); 111 | let (_,tlp,w) = Bench.parse_pipe bench_lp Lp.top blank size producer in 112 | Printf.printf "lp %d %.2f Mb in %.2fms %.2f Mb \n%!" n 113 | (float size /. 1024. /. 1024.) 114 | (1000. *. tlp) (float w /. 1024. /. 1024. *. float Sys.word_size); 115 | let (_,trp,w) = Bench.parse_pipe bench_rp Rp.top blank size producer in 116 | Printf.printf "rp %d %.2f Mb in %.2fms %.2f Mb \n%!" n 117 | (float size /. 1024. /. 1024.) 118 | (1000. *. trp) (float w /. 1024. /. 1024. *. float Sys.word_size); 119 | let (_,tly,w) = Bench.yacc_pipe bench_ly Parser.lr_top Lexer.token size producer in 120 | Printf.printf "ly %d %.2f Mb in %.2fms %.2f Mb \n%!" n 121 | (float size /. 1024. /. 1024.) 122 | (1000. *. tly) (float w /. 1024. /. 1024. *. float Sys.word_size); 123 | let _ = Bench.yacc_pipe bench_dum Parser.lr_top Lexer.token 0 producer0 in 124 | let (_,try_,w) = Bench.yacc_pipe bench_ry Parser.rr_top Lexer.token size producer in 125 | Printf.printf "ry %d %.2f Mb in %.2fms %.2f Mb \n%!" n 126 | (float size /. 1024. /. 1024.) 127 | (1000. *. try_) (float w /. 1024. /. 1024. *. float Sys.word_size); 128 | let _ = Bench.yacc_pipe bench_dum Parser.rr_top Lexer.token 0 producer0 in 129 | Printf.printf "lr/ly: %f " (tl /. tly); 130 | Printf.printf "ry/ly: %f " (try_ /. tly); 131 | Printf.printf "rr/ly: %f " (tr /. tly); 132 | Printf.printf "ld/ly: %f " (tld /. tly); 133 | Printf.printf "rd/ly: %f " (trd /. tly); 134 | Printf.printf "lp/ly: %f " (tlp /. tly); 135 | Printf.printf "rp/ly: %f\n%!" (trp /. tly); 136 | Printf.printf "rr/lr: %f " (tr /. tl); 137 | Printf.printf "ld/lr: %f " (tld /. tl); 138 | Printf.printf "rd/lr: %f " (trd /. tl); 139 | Printf.printf "lp/lr: %f " (tlp /. tl); 140 | Printf.printf "rp/lr: %f " (trp /. tl); 141 | Printf.printf "rd/rr: %f " (trd /. tr); 142 | Printf.printf "rp/rr: %f\n%!" (trp /. 
tr); 143 | done; 144 | Bench.stats "lr " bench_lr; 145 | Bench.stats "ld " bench_ld; 146 | Bench.stats "rr " bench_rr; 147 | Bench.stats "rd " bench_rd; 148 | Bench.stats "lp " bench_lp; 149 | Bench.stats "rp " bench_rp; 150 | if not test then 151 | begin 152 | Bench.csv bench_lr "lr.csv"; 153 | Bench.csv bench_rr "rr.csv"; 154 | Bench.csv bench_ld "ld.csv"; 155 | Bench.csv bench_rd "rd.csv"; 156 | Bench.csv bench_lp "lp.csv"; 157 | Bench.csv bench_rp "rp.csv" 158 | end 159 | -------------------------------------------------------------------------------- /benchmarks/sexp/ast.ml: -------------------------------------------------------------------------------- 1 | type 'a sexp = { p: 'a; e : 'a sexp' } 2 | and 'a sexp' = 3 | | Idt of string 4 | | Lst of 'a sexp list 5 | -------------------------------------------------------------------------------- /benchmarks/sexp/dune: -------------------------------------------------------------------------------- 1 | (test 2 | (name sexp) 3 | (modules :standard) 4 | (preprocess (pps pacomb.ppx)) 5 | (libraries unix pacomb) 6 | (action (run ./sexp.exe --test))) 7 | 8 | (rule 9 | (targets lexer.ml) 10 | (deps lexer.mll) 11 | (action (run ocamllex lexer.mll))) 12 | 13 | (rule 14 | (targets parser.ml parser.mli) 15 | (deps parser.mly) 16 | (action (run ocamlyacc parser.mly))) 17 | -------------------------------------------------------------------------------- /benchmarks/sexp/err.ml: -------------------------------------------------------------------------------- 1 | open Ast 2 | 3 | let id = "[a-zA-Z_][a-zA-Z_0-9]*[']*" 4 | 5 | let%parser rec sexp 6 | = ERROR(["id";"("]) 7 | ; (x::RE id) => { p = _pos; e = Idt x } 8 | ; '(' (l::sexps) => (ERROR(")") ; ')' 9 | => { p = _pos; e = Lst l }) 10 | and sexps = () => [] 11 | ; (l::sexps) (e::sexp) => e::l 12 | 13 | let%parser top = (s::sexp) => s 14 | -------------------------------------------------------------------------------- /benchmarks/sexp/lexer.mll: -------------------------------------------------------------------------------- 1 | { 2 | open Parser (* The type token is defined in parser.mli *) 3 | exception Eof 4 | } 5 | rule token = parse 6 | [' ' '\t' '\n' '\r'] { token lexbuf } (* skip blanks *) 7 | | eof { EOL } 8 | | ['a'-'z''A'-'Z''_']['a'-'z''A'-'Z''_''0'-'9']* as lxm { ID lxm } 9 | | '(' { LPAREN } 10 | | ')' { RPAREN } 11 | -------------------------------------------------------------------------------- /benchmarks/sexp/lr.ml: -------------------------------------------------------------------------------- 1 | open Ast 2 | 3 | let id = "[a-zA-Z_][a-zA-Z_0-9]*[']*" 4 | let%parser rec sexp 5 | = (x::RE id) => { p = _pos; e = Idt x } 6 | ; '(' (l::sexps) ')' => { p = _pos; e = Lst (List.rev l) } 7 | and sexps = () => [] 8 | ; (l::sexps) (e::sexp) => e::l 9 | 10 | let%parser top = (s::sexp) => s 11 | -------------------------------------------------------------------------------- /benchmarks/sexp/parser.mly: -------------------------------------------------------------------------------- 1 | /* File parser.mly */ 2 | %{ 3 | open Ast 4 | let mkloc e = { p= (Parsing.symbol_start_pos (), Parsing.symbol_end_pos ()) 5 | ; e} 6 | %} 7 | %token ID 8 | %token LPAREN RPAREN 9 | %token EOL 10 | %start main /* the entry point */ 11 | %type <(Lexing.position*Lexing.position) Ast.sexp> main 12 | %% 13 | 14 | main: 15 | sexp EOL { $1 } 16 | ; 17 | sexp: 18 | ID { mkloc (Ast.Idt $1) } 19 | | LPAREN sexps RPAREN { mkloc (Ast.Lst (List.rev $2)) } 20 | sexps: 21 | { [] } 22 | | sexps sexp { $2 :: $1 } 23 | 
-------------------------------------------------------------------------------- /benchmarks/sexp/rr.ml: -------------------------------------------------------------------------------- 1 | open Ast 2 | 3 | let id = "[a-zA-Z_][a-zA-Z_0-9]*[']*" 4 | 5 | let%parser rec sexp = (x::RE id) => { p = _pos; e = Idt x } 6 | ; '(' (lazy l::sexps) ')' => { p = _pos; e = Lst l } 7 | and sexps = () => lazy [] 8 | ; (e::sexp) (lazy l::sexps) => lazy (e::l) 9 | 10 | let%parser top = (s::sexp) => s 11 | -------------------------------------------------------------------------------- /benchmarks/sexp/sexp.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | let test = Array.length Sys.argv > 1 && Sys.argv.(1) = "--test" 4 | 5 | let random_id () = 6 | let c = Char.chr (Char.code 'a' + Random.int 26) in 7 | let n = string_of_int (Random.int 1000) in 8 | String.make 1 c ^ n 9 | 10 | let print_spaces ch lvl = 11 | output_string ch "\n"; 12 | for _ = 1 to lvl do 13 | output_string ch "\t" 14 | done; 15 | 1+lvl 16 | 17 | let sqrt n = Random.int (truncate ((float n) ** 0.7) + 1) 18 | 19 | let rec expr lvl n ch = 20 | let print_string s = 21 | output_string ch s; String.length s 22 | in 23 | if n <= 0 then 0 24 | else if n = 1 then print_string (random_id ()) 25 | else 26 | begin 27 | let x = print_spaces ch lvl in 28 | let lvl = lvl + 1 in 29 | let y = print_string "(" in 30 | let s = sqrt n in 31 | let n = n - 1 in 32 | let rec fn s n = 33 | if s <= 1 then expr lvl n ch 34 | else 35 | begin 36 | let q = n/2 in 37 | let n0 = n/s - q/2 + Random.int(min 1 q) in 38 | let n0 = max 1 (min n n0) in 39 | let x = expr lvl n0 ch in 40 | let y = print_string " " in 41 | let z = fn (s-1) (n - n0) in 42 | x + y + z 43 | end 44 | in 45 | let z = fn s n in 46 | let t = print_string ")" in 47 | x + y + z + t 48 | end 49 | 50 | (* blanks *) 51 | let blank = Blank.from_charset (Charset.from_string " \t\n") 52 | 53 | let _ = 54 | let bench_lr = Bench.create () in 55 | let bench_rr = Bench.create () in 56 | let bench_err = Bench.create () in 57 | let bench_yacc = Bench.create () in 58 | for n = 1 to (if test then 2 else 20) do 59 | let n = n * 50_000 in 60 | let producer ch = let r = expr 0 n ch in Printf.fprintf ch "\n%!"; r in 61 | let size = Bench.size producer in 62 | let (_,ts,w) = Bench.parse_pipe bench_lr Lr.top blank size producer in 63 | Printf.printf "lr %d %.2f Mb in %.2fms %.2f Mb \n%!" n 64 | (float size /. 1024. /. 1024.) 65 | (1000. *. ts) (float w /. 1024. /. 1024. *. float Sys.word_size); 66 | let (_,tr,w) = Bench.parse_pipe bench_rr Rr.top blank size producer in 67 | Printf.printf "rr %d %.2f Mb in %.2fms %.2f Mb \n%!" n 68 | (float size /. 1024. /. 1024.) 69 | (1000. *. tr) (float w /. 1024. /. 1024. *. float Sys.word_size); 70 | let (_,te,w) = Bench.parse_pipe bench_err Err.top blank size producer in 71 | Printf.printf "err %d %.2f Mb in %.2fms %.2f Mb \n%!" n 72 | (float size /. 1024. /. 1024.) 73 | (1000. *. te) (float w /. 1024. /. 1024. *. float Sys.word_size); 74 | let (_,ty,w) = Bench.yacc_pipe bench_yacc Parser.main Lexer.token size producer in 75 | Printf.printf "yacc %d %.2f Mb in %.2fms %.2f Mb \n%!" n 76 | (float size /. 1024. /. 1024.) 77 | (1000. *. ty) (float w /. 1024. /. 1024. *. float Sys.word_size); 78 | Printf.printf "lr/yacc : %f " (ts /. ty); 79 | Printf.printf "rr/yacc : %f " (tr /. ty); 80 | Printf.printf "err/yacc : %f " (te /. ty); 81 | Printf.printf "rr/lr: %f " (tr /. ts); 82 | Printf.printf "err/lr: %f\n%!" (te /. 
ts); 83 | done; 84 | Bench.stats "lr " bench_lr; 85 | Bench.stats "rr " bench_rr; 86 | Bench.stats "err " bench_err; 87 | Bench.stats "yacc " bench_yacc; 88 | if not test then 89 | begin 90 | Bench.csv bench_lr "lr.csv"; 91 | Bench.csv bench_rr "rr.csv"; 92 | Bench.csv bench_err "err.csv"; 93 | Bench.csv bench_yacc "yacc.csv" 94 | end 95 | -------------------------------------------------------------------------------- /dune: -------------------------------------------------------------------------------- 1 | (env 2 | (dev (flags :standard -w -9 -I +unix)) 3 | (release (flags :standard -w -9 -I +unix))) 4 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 1.9) 2 | (name pacomb) 3 | (allow_approximate_merlin) 4 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | This folder contains some useful examples: 2 | 3 | - calc_ext.ml : an extensible calculator where new operators can be defined with 4 | their priorities. 5 | -------------------------------------------------------------------------------- /examples/calc.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | (* classical calculator example, with a non-terminal for each priority level *) 4 | 5 | (* Here is the definition of the parser with the ppx syntax extension described 6 | in the documentation. 7 | 8 | Here, we deal with priorities by manually defining three different grammars. 9 | Starting with the grammar for atomic expressions. *) 10 | let%parser rec 11 | atom = (x::FLOAT) => x (* constant *) 12 | ; '(' (e::expr) ')' => e (* rule for parentheses *) 13 | 14 | (* Here is the grammar for products *) 15 | and prod = (a::atom) => a 16 | ; (x::prod) '*' (y::atom) => x*.y 17 | ; (x::prod) '/' (y::atom) => x/.y 18 | 19 | (* and finally all remaining expressions *) 20 | and expr = (a::prod) => a 21 | ; (x::expr) '+' (y::prod) => x+.y 22 | ; (x::expr) '-' (y::prod) => x-.y 23 | 24 | (* A subtlety: we want to parse expressions one by one and print the 25 | result. Pacomb needs to do things that require buffer examination after 26 | each token. So printing after parsing the newline does not work. 27 | A trick that works is to test for the newline without parsing it, 28 | using Grammar.test_after. Another solution would be to read each 29 | line with input_line and use Grammar.parse_string on the result. 30 | *) 31 | let nl _ b i _ _ = 32 | let (c,_,_) = Input.read b i in c = '\n' 33 | let%parser rec top = (t::Grammar.test_after nl expr) => Printf.printf "%g\n=> %!" t 34 | let%parser rec exprs = () => () ; exprs top '\n' => () 35 | 36 | (* parsing command line arguments, illustrating grammar printing *) 37 | let usage_msg = Printf.sprintf "%s [options]" Sys.argv.(0) 38 | 39 | let rec help () = 40 | Arg.usage spec usage_msg; 41 | Printf.eprintf "\nParsing with:\n\n%a\n%!"
42 | (fun ch -> Grammar.print_grammar ch) exprs 43 | 44 | and spec = [( "-help", Arg.Unit help, "print help message") 45 | ;("--help", Arg.Unit help, "print help message")] 46 | 47 | let _ = Arg.parse spec (fun s -> raise (Arg.Bad s)) usage_msg 48 | 49 | (* blanks *) 50 | let blank = Blank.from_charset (Charset.singleton ' ') 51 | 52 | let _ = 53 | try 54 | while true do 55 | let f () = 56 | Printf.printf "=> %!"; (* initial prompt *) 57 | (* no need to stack the buffer of in_channel and those of Pacomb. So 58 | file desciptor are preferred. *) 59 | Grammar.parse_fd exprs blank Unix.stdin; 60 | raise End_of_file 61 | in 62 | (* [Pos] module provides a function to handle exception with 63 | an optional argument to call for error (default is to exit with 64 | code 1 *) 65 | Pos.handle_exception ~error:(fun _ -> ()) f () 66 | done 67 | with 68 | End_of_file -> () 69 | -------------------------------------------------------------------------------- /examples/calc_ext.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | open Lex 3 | open Pos 4 | open Grammar 5 | (* this example illustrates various features: 6 | - extensible grammars 7 | - dependant sequences to deal with priorities (necessary for extensibility 8 | - error reporting 9 | 10 | 11 | Here is an example of accepted input: 12 | 13 | x ++ y priority 6 left associative = (x+y)/2 14 | -- x priority 1 = x - 1 15 | ++ x priority 1 = x + 1 16 | a = 2 17 | b(x) = sin(x) + cos(x) 18 | c = a ++ b(a) 19 | c + 5 20 | *) 21 | 22 | (* type of expressions *) 23 | type expr = 24 | | Cst of float 25 | | Idt of string * expr array 26 | 27 | (* and the values inthe environment *) 28 | type func = 29 | | Def of (expr*string array) 30 | | Op0 of float 31 | | Op1 of (float -> float) 32 | | Op2 of (float -> float -> float) 33 | | Op3 of (float -> float -> float -> float) 34 | 35 | (* The initial environment *) 36 | let init_env =[(("^",2) , (Op2 ( ** ))) 37 | ;(("*",2) , (Op2 ( *. ))) 38 | ;(("/",2) , (Op2 ( /. ))) 39 | ;(("+",2) , (Op2 ( +. ))) 40 | ;(("-",2) , (Op2 ( -. ))) 41 | ;(("-",1) , (Op1 (fun x -> -. 
x))) 42 | ;(("+",1) , (Op1 (fun x -> x))) 43 | ;(("e",0) , (Op0 (exp 1.0))) 44 | ;(("exp",1), (Op1 (exp ))) 45 | ;(("log",1), (Op1 (log ))) 46 | ;(("pi",0) , (Op0 (acos(-1.0)))) 47 | ;(("cos",1), (Op1 (cos ))) 48 | ;(("sin",1), (Op1 (sin ))) 49 | ;(("tan",1), (Op1 (tan ))) 50 | ] 51 | 52 | let env = 53 | let e = Hashtbl.create 32 in 54 | List.iter (fun (k,d) -> Hashtbl.add e k d) init_env; 55 | e 56 | 57 | exception Unbound of string * int 58 | 59 | (* and the evaluation function *) 60 | let rec eval env = function 61 | | Cst x -> x 62 | | Idt(id,args) -> 63 | try 64 | let args = Array.map (eval env) args in 65 | let f = Hashtbl.find env (id,Array.length args) in 66 | match f with 67 | | Def(f,params) -> 68 | let add i id = Hashtbl.add env (id,0) (Op0 args.(i)) in 69 | let remove _ id = Hashtbl.remove env (id,0) in 70 | Array.iteri add params; 71 | (try 72 | let r = eval env f in 73 | Array.iteri remove params; r 74 | with 75 | e -> Array.iteri remove params; raise e) 76 | | Op0(x) -> x 77 | | Op1(f) -> f args.(0) 78 | | Op2(f) -> f args.(0) args.(1) 79 | | Op3(f) -> f args.(0) args.(1) args.(2) 80 | 81 | with Not_found -> raise (Unbound (id, Array.length args)) 82 | 83 | (* parser for identifier, notice the error construct to report error messages, 84 | using regular expressions *) 85 | let%parser ident = (id::RE("[a-zA-Z][a-zA-Z0-9_]*")) => id 86 | ; ERROR("ident") 87 | 88 | (* tables giving the syntax class of each symbol *) 89 | type assoc = RightAssoc | LeftAssoc | NonAssoc 90 | 91 | let cs = Charset.(from_string "-&~^+=*/\\$!:") 92 | 93 | let infix_tbl = 94 | let t = Word_list.create ~cs () in 95 | Word_list.add_ascii t "^" ("^", 2.0, RightAssoc); 96 | Word_list.add_ascii t "*" ("*", 4.0, LeftAssoc); 97 | Word_list.add_ascii t "/" ("/", 4.0, LeftAssoc); 98 | Word_list.add_ascii t "+" ("+", 6.0, LeftAssoc); 99 | Word_list.add_ascii t "-" ("-", 6.0, LeftAssoc); 100 | t 101 | 102 | let prefix_tbl = 103 | let t = Word_list.create ~cs () in 104 | Word_list.add_ascii t "+" ("+", 5.0); 105 | Word_list.add_ascii t "-" ("-", 5.0); 106 | t 107 | 108 | (* parser for symbols, it uses Lex.give_up to reject some rule from the action 109 | code. *) 110 | let%parser op = 111 | (c::RE("[-&~^+=*/\\$!:]+\\(_[a-zA-Z0-9_]+\\)?")) 112 | => (if c = "=" then give_up ~msg:"= not valid as op bin" (); c) 113 | ; (ERROR "symbol") 114 | 115 | (* parser for infix symbol parametrized with the maximum and minimum 116 | priority. It returns the actual priority, minum eps for left and non 117 | associative symbols *) 118 | 119 | let eps = 1e-10 120 | 121 | let%parser infix pmin pmax = 122 | ((c,p,a)::Word_list.word infix_tbl) => 123 | let good = match a with 124 | | NonAssoc -> pmin < p && p < pmax 125 | | LeftAssoc -> pmin <= p && p < pmax 126 | | RightAssoc -> pmin < p && p <= pmax 127 | in 128 | if not good then give_up (); 129 | let p = match a with 130 | | RightAssoc -> p 131 | | _ -> p -. 1e-10 132 | in 133 | (p,c) 134 | 135 | (* parser for prefix symbol *) 136 | let%parser prefix pmax = 137 | ((c,p)::Word_list.word prefix_tbl) => 138 | let good = p <= pmax in 139 | if not good then give_up (); 140 | (p,c) 141 | 142 | (* some keywords *) 143 | let%parser opening = '(' => (); ERROR "closing parenthesis" 144 | let%parser closing = ')' => (); ERROR "closing parenthesis" 145 | let%parser comma = ',' => (); ERROR "comma" 146 | 147 | (* parser for expressions, using dependant sequence. 
when writing 148 | (p,x)>:grammar in a rule, p can be used in the rest of the rule while x can 149 | only be used in the rest of the grammar. 150 | 151 | Like infix and prefix, expression are parametrized with a priority (here a 152 | maximum and we use the priorities returned by the parsing of expressions, 153 | infix and prefix to deal with priority of whats coming next. 154 | *) 155 | let%parser rec 156 | expr pmax = ((pe,e1)>:expr pmax) ((pop,b)>:infix pe pmax) 157 | ((__,e2)::expr pop) => (pop, Idt(b,[|e1;e2|])) 158 | ; ((pop,b)>:prefix pmax) 159 | ((__,e1)::expr pop) => (pop, Idt(b,[|e1|])) 160 | ; (x::FLOAT) => (0.0,Cst x) 161 | ; opening (e::expr_top) closing => (0.0,e) 162 | ; (id::ident) (args::args) => (0.0, Idt(id,args)) 163 | 164 | (* the ppx extension has syntactic sugar for option and sequences. 165 | ~+ [comma] grammar denotes non empty lists with separator. *) 166 | and args = 167 | () => [||] 168 | ; opening (l:: ~+ [comma] expr_top) closing => Array.of_list l 169 | 170 | and expr_top = ((__,e)::expr 1000.0) => e 171 | 172 | (* here we define the keywords for parsing symbol definitions *) 173 | let%parser assoc = "none" => NonAssoc 174 | ; "left" => LeftAssoc 175 | ; "right" => RightAssoc 176 | ; ERROR("assoc: none | left | right") 177 | 178 | let%parser priority = (x::FLOAT) => x 179 | ; ERROR("float") 180 | 181 | let%parser eq = '=' => () ; ERROR("=") 182 | let%parser priority_kwd = "priority" => (); ERROR("priority keyword") 183 | let%parser assoc_kwd = "associative" => (); ERROR("associative keyword") 184 | 185 | (* list of parameters for definition of functions *) 186 | let%parser params = 187 | () => [||] 188 | ; opening (l:: ~+ [comma] ident) closing => Array.of_list l 189 | 190 | (* toplevel commands *) 191 | let%parser cmd = 192 | (e::expr_top) EOF 193 | => (Printf.printf "%f\n%!" (eval env e)) 194 | ; (id::ident) (params::params) eq (e::expr_top) EOF 195 | => Hashtbl.add env (id,Array.length params) (Def(e,params)) 196 | ; (id::op) (a1::ident) 197 | priority_kwd (p::priority) 198 | eq (e::expr_top) EOF 199 | => (let params = [|a1|] in 200 | Hashtbl.add env (id,Array.length params) (Def(e,params)); 201 | Word_list.add_ascii prefix_tbl id (id,p)) 202 | ; (a1::ident) (id::op) (a2::ident) 203 | priority_kwd (p::priority) 204 | (a::assoc) assoc_kwd 205 | eq (e::expr_top) EOF 206 | => (let params = [|a1;a2|] in 207 | Hashtbl.add env (id,Array.length params) (Def(e,params)); 208 | Word_list.add_ascii infix_tbl id (id,p,a)) 209 | 210 | (* blanks *) 211 | let blank = Blank.from_charset (Charset.singleton ' ') 212 | 213 | (* main loop *) 214 | let _ = 215 | try 216 | while true do 217 | let f () = 218 | try 219 | Printf.printf "=> %!"; 220 | let line = input_line stdin in 221 | parse_string cmd blank line 222 | with Unbound(s,n) -> 223 | Printf.eprintf "unbound %s with arity %d\n%!" s n 224 | in handle_exception ~error:(fun _ -> ()) f () 225 | done 226 | with 227 | End_of_file -> () 228 | -------------------------------------------------------------------------------- /examples/calc_ext2.ml: -------------------------------------------------------------------------------- 1 | (* This code is to benchmark extensible grammar ... 2 | it is not the best code to do a calculator with an extensible grammar *) 3 | 4 | open Pacomb 5 | 6 | (** Access to float functions for the action of rules *) 7 | let get_op2 = function 8 | | "*" -> ( *. ) 9 | | "+" -> ( +. ) 10 | | "/" -> ( /. ) 11 | | "-" -> ( -. 
) 12 | | "^" -> ( ** ) 13 | | "atan2" -> atan2 14 | | _ -> failwith "invalid binary op" 15 | 16 | let get_op1 = function 17 | | "sqrt" -> sqrt 18 | | "cos" -> cos 19 | | "acos" -> acos 20 | | "sin" -> sin 21 | | "asin" -> asin 22 | | "tan" -> tan 23 | | "atan" -> atan 24 | | "ln" -> log 25 | | "log" -> log10 26 | | "exp" -> exp 27 | | _ -> failwith "invalid unary op" 28 | 29 | (** Bigger float = lower priority, 0.0 is for atomic expresion, 30 | so all priorities must be positive *) 31 | type prio = float 32 | 33 | (** list of all priorities, in decreasing order (lowest priority first). *) 34 | type prios = prio list 35 | 36 | (** a parsing rule: a grammar from an environment *) 37 | type 'a rule = env -> 'a Grammar.t 38 | 39 | (** list of rule, with a name for each rule *) 40 | and rules = (float * (string * float rule) list) list 41 | 42 | (** parsing environment: all rules and all prios sorted in 43 | decreasing order (lowest priority first). *) 44 | and env = { rules : rules; prios : prios } 45 | 46 | let empty_env = { rules = []; prios = [] } 47 | 48 | let add_prio p env = 49 | { env with prios = List.rev (List.sort_uniq compare (p::env.prios)) } 50 | 51 | (** get the next priority *) 52 | let next_prio p env = 53 | let rec fn = function 54 | | x::(y::_) when x = p -> y 55 | | x::l -> assert (x >= p); fn l 56 | | _ -> 0.0 57 | in 58 | fn env.prios 59 | 60 | (** get the priority nearest to p . For associativity, we will use [p] of [ x | [] -> 0.0 67 | 68 | (** add a rule with the given priority *) 69 | let add_rule name prio r env = 70 | let old = try List.assoc prio env.rules with Not_found -> [] in 71 | let rules = (List.filter (fun (p,_) -> prio <> p) env.rules) in 72 | let env = add_prio prio env in 73 | { env with rules = (prio, (name,r)::old) :: rules } 74 | 75 | let rm_rule name env = 76 | let count = ref 0 in 77 | let rm l = List.filter (fun (n,_) -> if n = name then incr count; 78 | n <> name) l 79 | in 80 | let rules = List.map (fun (p,l) -> (p, rm l)) env.rules in 81 | let rules = List.filter (fun (_,l) -> l <> []) rules in 82 | let prios = List.rev (List.sort_uniq compare (List.map fst rules)) in 83 | { rules; prios }, !count 84 | 85 | (** get all the rule of a given priority *) 86 | let get_rule : prio -> env -> float Grammar.t = fun p env -> 87 | let rules = try List.assoc p env.rules with Not_found -> [] in 88 | let rules = List.map (fun (_,r) -> r env) rules in 89 | Grammar.alt rules 90 | 91 | let pr (_, p) = Printf.sprintf "%g" p 92 | 93 | (** the parsing for expression *) 94 | let%parser [@print_param pr] rec expr env (prio:prio) = 95 | (* constant *) 96 | (prio = 0.) (x::FLOAT) => x 97 | (* parenthesis, using max_prio *) 98 | ; (prio = 0.) '(' (x::expr env (max_prio env)) ')' => x 99 | (* incluse next priority level *) 100 | ; (prio > 0.) (x::expr env (next_prio prio env)) => x 101 | (* get all the rule for the level (including 0.0) *) 102 | ; (x::get_rule prio env) => x 103 | 104 | (** a type of type *) 105 | type _ ty = 106 | Flt : float ty 107 | | Arr : 'a ty * 'b ty -> ('a -> 'b) ty 108 | 109 | (** action of a given type , syntaxe, HP style *) 110 | (** Remark: we need fake, dependant sequence (<:) because otherwise 111 | the construction of the grammar loops producing bigger 112 | and bigger types. Dependant sequence builds the grammar lazily *) 113 | let%parser rec action : type a. 
a ty -> a Grammar.t 114 | = fun t -> 115 | "Cst" (x<:FLOAT) (f::action (Arr(Flt,t))) => f x 116 | ; (t =| Arr(Flt,t1)) "Op1" (s<:STRING_LIT) (f::action (Arr(Flt,t1))) => 117 | (let g = get_op1 s in (fun x -> f (g x) : a)) 118 | ; (t =| Arr(Flt,Arr(Flt,t1))) "Op2" (s<:STRING_LIT) (f::action (Arr(Flt,t1))) => 119 | (let g = get_op2 s in (fun x y -> f (g y x) : a)) 120 | ; (t =| Arr(Flt,Flt)) () => (fun x -> x) 121 | 122 | (** the magic parsing : parse a BNF rule and return the parser 123 | for that BNF, parametrized by the current environment *) 124 | let%parser rec rule : type a. a ty -> (env -> a Grammar.t) Grammar.t 125 | = fun t -> 126 | "Exp" (leq:: ~? [true] ("<" => false)) (prio<:FLOAT) (r::rule (Arr(Flt,t))) => 127 | (fun env -> (x::expr env (get_prio leq prio env)) (f::r env) => f x) 128 | ; "Str" (s<:STRING_LIT) (r::rule t) => 129 | (fun env -> (STR s) (x::r env) => x) 130 | ; "=>" (a::action t) => (fun _ -> () => a) 131 | 132 | (* A subtlety : we want to parse expression, one by one and print the 133 | result. Pacomb needs to do things that require buffer examination after 134 | each token. So printing after parsing the newline does not work. 135 | A trick that works is to test for the newline, not parsing it, 136 | using Grammar.test_after. Another solution would be to read each 137 | line with input_line and use Grammar.parse_string on the result. 138 | *) 139 | let nl _ b i _ _ = 140 | let (c,_,_) = Input.read b i in c = '\n' 141 | 142 | (** the command parsing an expression and printing it *) 143 | let%parser top_expr env = 144 | (x::Grammar.test_after nl (expr env (max_prio env))) => 145 | Printf.printf "%g\n=> %!" x 146 | 147 | (** reference only uses to keep the env in case of parse error *) 148 | let env_ref = ref empty_env 149 | 150 | (** The command parsing a new rule *) 151 | let%parser new_rule env = 152 | "rule" (n::STRING_LIT) (p::FLOAT) ":" (r::Grammar.test_after nl (rule Flt)) => 153 | let env = add_rule n p r env in 154 | env_ref := env; 155 | Printf.printf "new rule accepted\n=> %!"; 156 | (env, ()) 157 | 158 | (** The command parsing removing all rules with a given name *) 159 | let%parser rem_rule env = 160 | "remove" "rule" (n::STRING_LIT) => 161 | let (env, nb) = rm_rule n env in 162 | env_ref := env; 163 | Printf.printf "%d rule(s) removed\n=> %!" nb; 164 | (env, ()) 165 | 166 | (** main parsing, right recursion with no action is ok now *) 167 | let%parser rec cmds env = 168 | () => () 169 | ; (top_expr env) '\n' (cmds env) => () 170 | ; ((env,()) >: new_rule env) '\n' (cmds env) => () 171 | ; ((env,()) >: rem_rule env) '\n' (cmds env) => () 172 | 173 | let top = cmds empty_env 174 | 175 | (* blanks *) 176 | let blank = Blank.from_charset (Charset.singleton ' ') 177 | 178 | let _ = 179 | try 180 | while true do 181 | let f () = 182 | Printf.printf "=> %!"; (* initial prompt *) 183 | (* no need to stack the buffer of in_channel and those of Pacomb. So 184 | file desciptor are preferred. 
*) 185 | (* as we parse stdin, we need to keed the whole buffer in memory 186 | to have line and column number, ~rescan:false only give byte 187 | position *) 188 | Grammar.parse_fd (cmds !env_ref) blank Unix.stdin; 189 | raise End_of_file 190 | in 191 | (* [Pos] module provides a function to handle exception with 192 | an optional argument to call for error (default is to exit with 193 | code 1 *) 194 | Pos.handle_exception ~error:(fun _ -> ()) f () 195 | done 196 | with 197 | End_of_file -> () 198 | -------------------------------------------------------------------------------- /examples/calc_prio.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | (* This example (read calc.ml first) illustrates another way to handle 4 | priorities with parametric grammars. *) 5 | 6 | (* The three levels of priorities *) 7 | type p = Atom | Prod | Sum 8 | 9 | let string_of_prio = function 10 | | Atom -> "A" 11 | | Prod -> "P" 12 | | Sum -> "S" 13 | 14 | (* for printing, we provide a function to convert priorities to string *) 15 | let%parser [@print_param string_of_prio] rec 16 | (* This includes each priority level in the next one *) 17 | expr p = Atom < Prod < Sum 18 | (* all other rule are selected by their priority level *) 19 | ; (p=Atom) (x::FLOAT) => x 20 | ; (p=Atom) '(' (e::expr Sum) ')' => e 21 | ; (p=Prod) (x::expr Prod) '*' (y::expr Atom) => x*.y 22 | ; (p=Prod) (x::expr Prod) '/' (y::expr Atom) => x/.y 23 | ; (p=Sum ) (x::expr Sum ) '+' (y::expr Prod) => x+.y 24 | ; (p=Sum ) (x::expr Sum ) '-' (y::expr Prod) => x-.y 25 | 26 | (* A subtlety : we want to parse expression, one by one and print the 27 | result. Pacomb needs to do things that require buffer examination after 28 | each token. So printing after parsing the newline does not work. 29 | A trick that works is to test for the newline, not parsing it, 30 | using Grammar.test_after. Another solution would be to read each 31 | line with input_line and use Grammar.parse_string on the result. 32 | *) 33 | let nl _ b i _ _ = 34 | let (c,_,_) = Input.read b i in c = '\n' 35 | let%parser rec top = 36 | (t::Grammar.test_after nl (expr Sum)) => Printf.printf "%g\n=> %!" t 37 | let%parser rec exprs = 38 | () => () ; exprs top '\n' => () 39 | 40 | (* parsing command line arguments, illustrating grammar printing *) 41 | let usage_msg = Printf.sprintf "%s [options]" Sys.argv.(0) 42 | 43 | let rec help () = 44 | Arg.usage spec usage_msg; 45 | Printf.eprintf "\nParsing with:\n\n%a\n%!" 46 | (fun ch -> Grammar.print_grammar ch) exprs 47 | 48 | and spec = [( "-help", Arg.Unit help, "print help message") 49 | ;("--help", Arg.Unit help, "print help message")] 50 | 51 | let _ = Arg.parse spec (fun s -> raise (Arg.Bad s)) usage_msg 52 | 53 | (* blanks *) 54 | let blank = Blank.from_charset (Charset.singleton ' ') 55 | 56 | let _ = 57 | try 58 | while true do 59 | let f () = 60 | Printf.printf "=> %!"; (* initial prompt *) 61 | (* no need to stack the buffer of in_channel and those of Pacomb. 
So 62 | file desciptor are preferred *) 63 | Grammar.parse_fd exprs blank Unix.stdin; 64 | raise End_of_file 65 | in 66 | (* [Pos] module provides a function to handle exception with 67 | an optional argument to call for error (default is to exit with 68 | code 1 *) 69 | Pos.handle_exception ~error:(fun _ -> ()) f () 70 | done 71 | with 72 | End_of_file -> () 73 | -------------------------------------------------------------------------------- /examples/catalan.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | (* This (useless) example illustrate the use of merge on ambiguous grammars. We 4 | define two ambiguous grammars parsing sequences of characters to generate all 5 | binary and ternary trees. 6 | 7 | "catalan 10 2" will compute number of binary trees of size 0 to 10 8 | "catalan 10 3" will compute number of ternary trees of size 0 to 10 9 | 10 | *) 11 | 12 | (* A parser parsing arbitrary sequence of charaters 'a' as a set of binary 13 | trees. We use merge in case of ambiguity and, as [@merge ...] implies 14 | [@cache] pacomb parses only once each part of the input. We just return the 15 | number of trees. *) 16 | let%parser [@merge (+.)] rec bin_seq = 17 | () => 1.0 18 | ; (t1::bin_seq) 'a' (t2::bin_seq) => t1 *. t2 19 | 20 | let _ = Grammar.print_grammar stdout ~def:false bin_seq; print_newline () 21 | 22 | (* Idem for ternary tree, we need an internal cache to achieve polynomial 23 | complexity. *) 24 | let%parser [@merge (+.)] rec ter_seq = 25 | () => 1.0 26 | ; (t1::ter_seq) 'a' (t2t3::ter_seq_aux) => t1 *. t2t3 27 | 28 | and [@merge (+.)] ter_seq_aux = 29 | (t2::ter_seq) (t3::ter_seq) => t2 *. t3 30 | 31 | let _ = Grammar.print_grammar stdout ~def:false ter_seq; print_newline () 32 | 33 | (* To test, here is Catalan number, i.e. the number of binary trees of a given size *) 34 | let catalan = 35 | let memo = Hashtbl.create 128 in 36 | let rec fn n = 37 | if n <= 1 then 1.0 else 38 | try Hashtbl.find memo n 39 | with Not_found -> 40 | let r = ref 0.0 in 41 | for i = 0 to n-1 do 42 | r := fn i *. fn (n - i - 1) +. !r 43 | done; 44 | Hashtbl.add memo n !r; 45 | !r 46 | in 47 | fn 48 | 49 | (* idem for ternary trees *) 50 | let catalan3 = 51 | let memo = Hashtbl.create 128 in 52 | let rec fn n = 53 | if n <= 1 then 1.0 else 54 | try Hashtbl.find memo n 55 | with Not_found -> 56 | let r = ref 0.0 in 57 | for i = 0 to n-1 do 58 | for j = 0 to n-i-1 do 59 | r := fn i *. fn j *. fn (n - i - j - 1) +. !r 60 | done 61 | done; 62 | Hashtbl.add memo n !r; 63 | !r 64 | in 65 | fn 66 | 67 | (* parsing command line *) 68 | let catalan_max, branches = 69 | try 70 | if Array.length Sys.argv <> 3 then raise Not_found; 71 | int_of_string Sys.argv.(1), int_of_string Sys.argv.(2) 72 | with _ -> 73 | Printf.eprintf "usage: %s max_len [2|3]\n%!" Sys.argv.(0); 74 | exit 1 75 | 76 | let _ = 77 | Printf.printf "checking the number of parsetrees on an ambiguous example,\ 78 | using merge and cache\n%!"; 79 | let (p,f) = if branches = 2 then bin_seq, catalan 80 | else ter_seq, catalan3 81 | in 82 | let bench = Bench.create () in 83 | for i = 0 to catalan_max do 84 | let s = String.make i 'a' in 85 | let (t,dt,w) = Bench.parse_string bench p Blank.none s in 86 | let k = f i in 87 | Printf.printf "catalan(%d): %d => %e=%e in %.2fms %.2f Mb \n%!" 88 | branches i t k (1000. *. dt) (float w /. 1024. /. 1024. *. 
float Sys.word_size); 89 | done; 90 | Bench.stats "catalan " bench; 91 | Bench.csv bench (Printf.sprintf "catalan%d.csv" branches) 92 | -------------------------------------------------------------------------------- /examples/dune: -------------------------------------------------------------------------------- 1 | (test 2 | (name calc) 3 | (modules calc) 4 | (preprocess (pps pacomb.ppx)) 5 | (libraries unix pacomb) 6 | (deps ../tests/big_expr.exe calc.exe) 7 | (action 8 | (system "../tests/big_expr.exe 4 4 4 | ./calc.exe"))) 9 | 10 | (test 11 | (name calc_prio) 12 | (modules calc_prio) 13 | (preprocess (pps pacomb.ppx)) 14 | (libraries unix pacomb) 15 | (deps ../tests/big_expr.exe calc_prio.exe) 16 | (action 17 | (system "../tests/big_expr.exe 4 4 4 | ./calc_prio.exe"))) 18 | 19 | (test 20 | (name calc_ext) 21 | (modules calc_ext) 22 | (preprocess (pps pacomb.ppx)) 23 | (libraries unix pacomb) 24 | (deps ../tests/big_expr.exe calc_ext.exe) 25 | (action 26 | (system "../tests/big_expr.exe 3 4 4 | ./calc_ext.exe"))) 27 | 28 | (test 29 | (name calc_ext2) 30 | (modules calc_ext2) 31 | (preprocess (pps pacomb.ppx)) 32 | (libraries unix pacomb) 33 | (deps ../tests/big_expr.exe calc_ext2.exe test_ext2.txt) 34 | (action 35 | (system "(cat test_ext2.txt && ../tests/big_expr.exe 3 4 4) | ./calc_ext2.exe"))) 36 | 37 | (test 38 | (name sexp) 39 | (modules sexp) 40 | (preprocess (pps pacomb.ppx)) 41 | (libraries unix pacomb) 42 | (deps sexp.exe) 43 | (action 44 | (system "echo 'a b (c (d e) f) (g h)' | ./sexp.exe"))) 45 | 46 | (test 47 | (name paragraphs) 48 | (modules paragraphs) 49 | (preprocess (pps pacomb.ppx)) 50 | (libraries unix pacomb) 51 | (deps paragraphs.exe test.txt) 52 | (action 53 | (system "./paragraphs.exe < test.txt"))) 54 | 55 | (test 56 | (name catalan) 57 | (modules catalan) 58 | (preprocess (pps pacomb.ppx)) 59 | (libraries unix pacomb) 60 | (deps catalan.exe) 61 | (action 62 | (progn 63 | (system "./catalan.exe 30 2") 64 | (system "./catalan.exe 30 3")))) 65 | -------------------------------------------------------------------------------- /examples/paragraphs.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | let%parser word = 4 | (w::RE"[-a-zA-Z0-9']+") => w 5 | ; (p::RE"[.;,:!?]") => p 6 | 7 | (* blank with at most one newline for paragraphs *) 8 | let blank1 = Regexp.blank_regexp "[ \t\r]*\n?[ \t\r]*" 9 | 10 | (* general blank, no newline, we parse them as separator. *) 11 | let blank2 = Regexp.blank_regexp "[ \t\r]*" 12 | 13 | (* for paragraph, we use [Grammar.layout] to change the blank from [blank2] to 14 | [blank1] and we parse with blank1 after the paragraph to read the last 15 | newline at the end of each paragraph, the default would be to parse with 16 | [blank2] only after the paragraph. This way, the minimum number of newline left to 17 | parse as paragraph separation is 1. *) 18 | let%parser 19 | [@layout blank1 ~config:Blank.{ default_layout_config with 20 | new_blanks_after = true 21 | ; old_blanks_after = false }] 22 | paragraph = ((p:: ~+ word) => (p, p_pos)) 23 | 24 | (* paragraph separator, at least one newline *) 25 | let%parser sep = (~+ (CHAR('\n'))) => () 26 | 27 | (* text are list separated by sep and may have initial and final newlines *) 28 | let%parser text = (~? sep) (t:: ~* [sep] paragraph) (~? sep) => t 29 | 30 | (* we call the parser *) 31 | let _ = 32 | let t = Pos.handle_exception (Grammar.parse_channel text blank2) stdin in 33 | Format.printf "%d paragraphs\n%!" 
(List.length t); 34 | List.iteri (fun i (p,pos) -> Format.printf " paragraph %d at %a: %d word(s)\n%!" 35 | i 36 | (Pos.print_pos ~style:Short ()) 37 | pos 38 | (List.length p)) t 39 | -------------------------------------------------------------------------------- /examples/sexp.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | 3 | type sexp = { p: Pos.pos; e : sexp' } 4 | and sexp' = 5 | | Idt of string 6 | | Lst of sexp list 7 | 8 | let id = "[a-zA-Z_][a-zA-Z_0-9]*[']*" 9 | 10 | let%parser rec sexp 11 | = ERROR(["id";"("]) 12 | ; (x::RE id) => { p = _pos; e = Idt x } 13 | ; '(' (l::sexps) => (ERROR(")") ; ')' 14 | => { p = _pos; e = Lst (List.rev l) }) 15 | and sexps = () => [] 16 | ; (l::sexps) (e::sexp) => e::l 17 | 18 | (* blank c++ like end of line comments *) 19 | let blank = Blank.line_comments "\\\\" 20 | 21 | let _ = 22 | let f () = Grammar.parse_channel sexps blank stdin in 23 | Pos.handle_exception f () 24 | -------------------------------------------------------------------------------- /examples/test.txt: -------------------------------------------------------------------------------- 1 | Un permier paragraph; 2 | coucou. 3 | 4 | Puis un second. 5 | Avec deux phrases. 6 | 7 | Et 8 | un 9 | dernier. 10 | -------------------------------------------------------------------------------- /examples/test_ext2.txt: -------------------------------------------------------------------------------- 1 | 2 2 | 3 3 | (4) 4 | 5 5 | rule "suc" 1 : Str "S" Exp 1 => Cst 1 Op2 "+" 6 | rule "pre" 1 : Str "P" Exp 1 => Cst 1 Op2 "-" 7 | SPPS6 8 | rule "pow" 2 : Exp <2 Str "^" Exp 2 => Op2 "^" 9 | rule "mul" 3 : Exp 3 Str "*" Exp <3 => Op2 "*" 10 | rule "div" 3 : Exp 3 Str "/" Exp <3 => Op2 "/" 11 | rule "add" 4 : Exp 4 Str "+" Exp <4 => Op2 "+" 12 | rule "sub" 4 : Exp 4 Str "-" Exp <4 => Op2 "-" 13 | 3*3 14 | 2+2 15 | 2+3*3 16 | 2+3*5/2-1 17 | (5+5) 18 | SPS6 * 3 + S1 19 | remove rule "suc" 20 | remove rule "pre" 21 | 2^2^2 22 | 2^3^2 23 | (2^3)^2 24 | 2^(3^2) 25 | -------------------------------------------------------------------------------- /lib/assoc.ml: -------------------------------------------------------------------------------- 1 | type _ token = .. 2 | 3 | type ('a, 'b) eq = 4 | | Eq : ('a, 'a) eq 5 | | NEq : ('a, 'b) eq 6 | 7 | type 'a key = { tok : 'a token ; uid : int; eq : 'b. 'b token -> ('a, 'b) eq } 8 | 9 | (** To store keys in lists *) 10 | type any_key = K : 'a key -> any_key [@@unboxed] 11 | 12 | let key_count = ref 0 13 | let new_key : type a. unit -> a key = fun () -> 14 | let module M = struct type _ token += T : a token end in let open M in 15 | let eq : type b. b token -> (a, b) eq = function T -> Eq | _ -> NEq in 16 | let uid = !key_count in incr key_count; 17 | { tok = T ; uid; eq } 18 | 19 | let compare k1 k2 = compare k1.uid k2.uid 20 | 21 | module Make(T:sig type 'a data end) = struct 22 | open T 23 | 24 | type t = 25 | | Nil : t 26 | | Cns : 'a key * 'a data * t -> t 27 | 28 | let empty : t = Nil 29 | 30 | let length : t -> int = 31 | let rec length acc l = 32 | match l with 33 | | Nil -> acc 34 | | Cns(_,_,l) -> length (acc+1) l 35 | in 36 | length 0 37 | 38 | let add : type a. a key -> a data -> t -> t = fun k x l -> 39 | Cns(k,x,l) 40 | 41 | type iter = { f : 'a. 'a key -> 'a data -> unit } 42 | let rec iter : iter -> t -> unit = fun f l -> 43 | match l with 44 | | Nil -> () 45 | | Cns(j,x,l) -> f.f j x; iter f l 46 | 47 | let add_key : type a. 
a data -> t -> a key * t = fun x l -> 48 | let k = new_key () in (k, Cns(k,x,l)) 49 | 50 | let rec find : type a. a key -> t -> a data = fun k l -> 51 | match l with 52 | | Nil -> raise Not_found 53 | | Cns(j,x,l) -> match j.eq k.tok with Eq -> x | NEq -> find k l 54 | 55 | let rec mem : type a. a key -> t -> bool = fun k l -> 56 | match l with 57 | | Nil -> false 58 | | Cns(j,_,l) -> match k.eq j.tok with Eq -> true | NEq -> mem k l 59 | 60 | let rec remove : type a. a key -> t -> t = fun k l -> 61 | match l with 62 | | Nil -> raise Not_found 63 | | Cns(j,v,l) -> match k.eq j.tok with Eq -> l | NEq -> Cns(j,v,remove k l) 64 | 65 | let rec replace : type a. a key -> a data -> t -> t = fun k x l -> 66 | match l with 67 | | Nil -> Cns(k,x,Nil) 68 | | Cns(j,v,l) -> match k.eq j.tok with Eq -> Cns(j,x,l) 69 | | NEq -> Cns(j,v,replace k x l) 70 | 71 | let rec append : t -> t -> t = fun l1 l2 -> 72 | match l1 with 73 | | Nil -> l2 74 | | Cns(j,v,l) -> Cns(j,v,append l l2) 75 | end 76 | 77 | include Make(struct type 'a data = 'a end) 78 | -------------------------------------------------------------------------------- /lib/assoc.mli: -------------------------------------------------------------------------------- 1 | (** Dependant association lists. *) 2 | 3 | (** Standard equality type using a GADT. *) 4 | type ('a, 'b) eq = 5 | | Eq : ('a, 'a) eq 6 | | NEq : ('a, 'b) eq 7 | 8 | (** Type of tokens used to make keys unique, and carrying a type. This type is 9 | not intended to be extended by the user, hence it is private... but not 10 | declared priveta as it fails if 4.04 *) 11 | type _ token = .. 12 | 13 | (** Type of a key for a value of type ['a]. It contains a unique token and the 14 | corresponding (very efficient) equality test. *) 15 | type 'a key = { tok : 'a token ; uid:int; eq : 'b. 'b token -> ('a, 'b) eq } 16 | 17 | (** To store keys in lists *) 18 | type any_key = K : 'a key -> any_key [@@unboxed] 19 | 20 | (** [new_key ()] generates a new unique key for a value of type ['a]. *) 21 | val new_key : unit -> 'a key 22 | 23 | (** Type of an association list, where items may have different types *) 24 | type t 25 | 26 | (** [empty] is the empty association list. *) 27 | val empty : t 28 | 29 | (** compare keys by uid *) 30 | val compare : 'a key -> 'b key -> int 31 | 32 | (** [add k v l] inserts a new binding of [k] to [v] at the head of [l]. A 33 | previous binding of [k] will not be removed. Hence removing [k] will uncover 34 | a previous binding. *) 35 | val add : 'a key -> 'a -> t -> t 36 | 37 | (** [length l] returns the size of the association list [l]. *) 38 | val length : t -> int 39 | 40 | (** [add_key v l] is equivalent to [let k = new_key () in (k, add k v l)]. *) 41 | val add_key : 'a -> t -> 'a key * t 42 | 43 | (** [find k l] returns the latest inserted value with key [k] in list [l]. The 44 | exception [Not_found] is raised if there is none. *) 45 | val find : 'a key -> t -> 'a 46 | 47 | (** [mem k l] tells whether an element is mapped to [k] in the list [l]. *) 48 | val mem : 'a key -> t -> bool 49 | 50 | (** [remove k l] removes the latest inserted binding of the key [k] in [l]. If 51 | there is no such binding, then [Not_found] is raised. *) 52 | val remove : 'a key -> t -> t 53 | 54 | (** [replace k l] replaces a previous binding if it exists. If two bindings 55 | existed, only the first is removed to be replaced. *) 56 | val replace : 'a key -> 'a -> t -> t 57 | 58 | (** [append l1 l2] concatenate the two association lists. Duplicated are not 59 | removed. 
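    For a quick picture of how the whole interface fits together, here is a
    minimal sketch (the keys and the stored values are purely illustrative):
    {[
      let ki : int Assoc.key = Assoc.new_key () in
      let ks : string Assoc.key = Assoc.new_key () in
      let l = Assoc.add ki 42 (Assoc.add ks "foo" Assoc.empty) in
      assert (Assoc.find ki l = 42);
      assert (Assoc.find ks l = "foo");
      assert (not (Assoc.mem ki (Assoc.remove ki l)))
    ]}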
*) 60 | val append : t -> t -> t 61 | 62 | (** Iterator *) 63 | type iter = { f : 'a. 'a key -> 'a -> unit } 64 | val iter : iter -> t -> unit 65 | 66 | (** Variation on the above to associate value of type ['a data] to key of type 67 | ['a key]. The abobe function are obatained with [Make(struct type 'a data = 68 | 'a end)] *) 69 | module Make(T:sig type 'a data end) : 70 | sig 71 | type t 72 | val empty : t 73 | 74 | val add : 'a key -> 'a T.data -> t -> t 75 | val length : t -> int 76 | val add_key : 'a T.data -> t -> 'a key * t 77 | val find : 'a key -> t -> 'a T.data 78 | val mem : 'a key -> t -> bool 79 | val remove : 'a key -> t -> t 80 | val replace : 'a key -> 'a T.data -> t -> t 81 | val append : t -> t -> t 82 | 83 | type iter = { f : 'a. 'a key -> 'a T.data -> unit } 84 | val iter : iter -> t -> unit 85 | end 86 | -------------------------------------------------------------------------------- /lib/bench.ml: -------------------------------------------------------------------------------- 1 | 2 | let create () = 3 | (* when bench are created, no more compaction to keep it manual *) 4 | Gc.(set { (get ()) with max_overhead = 1_000_000 }); 5 | ref [] 6 | 7 | let parse_string bench gram blank s = 8 | let _ = Grammar.compile gram in 9 | let _ = Gc.compact () in 10 | let size = String.length s in 11 | let t0 = Unix.gettimeofday () in 12 | let r = Grammar.parse_string gram blank s in 13 | let t1 = Unix.gettimeofday () in 14 | let dw = (Gc.quick_stat ()).heap_words in 15 | let dt = t1 -. t0 in 16 | bench := (size, dt, float dw) :: !bench; 17 | (r, dt, dw) 18 | 19 | let parse_ch bench gram blank size ch = 20 | let _ = Grammar.compile gram in 21 | let _ = Gc.compact () in 22 | let t0 = Unix.gettimeofday () in 23 | let r = Grammar.parse_channel gram blank ch in 24 | let t1 = Unix.gettimeofday () in 25 | let dw = (Gc.quick_stat ()).heap_words in 26 | let dt = t1 -. t0 in 27 | bench := (size, dt, float dw) :: !bench; 28 | (r, dt, dw) 29 | 30 | let parse_fd bench gram blank size fd = 31 | let _ = Grammar.compile gram in 32 | let _ = Gc.compact () in 33 | let t0 = Unix.gettimeofday () in 34 | let r = Grammar.parse_fd gram blank fd in 35 | let t1 = Unix.gettimeofday () in 36 | let dw = (Gc.quick_stat ()).heap_words in 37 | let dt = t1 -. t0 in 38 | bench := (size, dt, float dw) :: !bench; 39 | (r, dt, dw) 40 | 41 | let parse_yacc main lex ch = 42 | let _ = Gc.compact () in 43 | let t0 = Unix.gettimeofday () in 44 | let lexbuf = Lexing.from_channel ch in 45 | let r = main lex lexbuf in 46 | let t1 = Unix.gettimeofday () in 47 | let dw = (Gc.quick_stat ()).heap_words in 48 | let dt = t1 -. t0 in 49 | (r, dt, dw) 50 | 51 | let parse_pipe bench gram blank size producer = 52 | let (fd_in, fd_out) = Unix.pipe () in 53 | let pid = Unix.fork () in 54 | if pid = 0 then 55 | (Unix.close fd_in; 56 | let ch_out = Unix.out_channel_of_descr fd_out in 57 | let _ = producer ch_out in 58 | close_out ch_out; 59 | exit 0) 60 | else 61 | (Unix.close fd_out; 62 | (parse_fd bench gram blank size fd_in)) 63 | 64 | let yacc_pipe bench entry lex size producer = 65 | let (fd_in, fd_out) = Unix.pipe () in 66 | let pid = Unix.fork () in 67 | if pid = 0 then 68 | (Unix.close fd_in; 69 | let ch_out = Unix.out_channel_of_descr fd_out in 70 | let _ = producer ch_out in 71 | close_out ch_out; 72 | exit 0) 73 | else 74 | (Unix.close fd_out; 75 | let ch_in = Unix.in_channel_of_descr fd_in in 76 | let (res_in, res_out) = Unix.pipe () in 77 | (* Parsing.clear_parser () does not seem to work! we fork again! 
*) 78 | let pid = Unix.fork () in 79 | if pid = 0 then 80 | (Unix.close res_in; 81 | let ch_out = Unix.out_channel_of_descr res_out in 82 | let res = parse_yacc entry lex ch_in in 83 | output_value ch_out res; 84 | flush ch_out; 85 | close_out ch_out; exit 0) 86 | else 87 | (Unix.close res_out; 88 | let ch_in = Unix.in_channel_of_descr res_in in 89 | let ((_,dt,dw) as r) = input_value ch_in in 90 | bench := (size, dt, float dw) :: !bench; 91 | close_in ch_in; 92 | r)) 93 | 94 | let size producer = 95 | let null = open_out "/dev/null" in 96 | let size = producer null in 97 | close_out null; 98 | size 99 | 100 | 101 | (* ln(C N^e) = c + e ln(N) + f ln(ln(N)) *) 102 | module Base = struct 103 | type input = int 104 | let base = [| (fun _ -> 1.0) 105 | ; (fun n -> log (float (n + 1))) 106 | |] 107 | end 108 | 109 | module Interpolate = Interpolate.Make(Base) 110 | 111 | let corr samples = 112 | let nb = float (Array.length samples) in 113 | let sx, sy = 114 | Array.fold_left (fun (sx,sy) (n,y) -> 115 | let x = log (float (1+n)) in 116 | (sx +. x, sy +. y)) 117 | (0.,0.) samples 118 | in 119 | let ax = sx /. nb in 120 | let ay = sy /. nb in 121 | let dxy, dx2, dy2 = 122 | Array.fold_left (fun (dxy,dx2,dy2) (n,y) -> 123 | let x = log (float (1+n)) in 124 | ( dxy +. (x -. ax) *. (y -. ay) 125 | , dx2 +. (x -. ax) *. (x -. ax) 126 | , dy2 +. (y -. ay) *. (y -. ay))) 127 | (0.,0.,0.) samples 128 | in 129 | dxy /. sqrt (dx2 *. dy2) 130 | 131 | 132 | let stats msg bench = 133 | let open Interpolate in 134 | let time = Array.of_list (List.map (fun (n,t,_) -> (n, log(t))) !bench) in 135 | let coefs = compute_coefs time in 136 | let c = corr time in 137 | let coefs = get coefs in 138 | Printf.printf "%s Time ~ %g N^%g (corr: %.2f)\n%!" 139 | msg (exp coefs.(0)) coefs.(1) c; 140 | let space = Array.of_list (List.map (fun (n,_,s) -> (n, log(s))) !bench) in 141 | let coefs = compute_coefs space in 142 | let c = corr space in 143 | let coefs = get coefs in 144 | Printf.printf "%s Space ~ %g N^%g (corr: %.2f)\n%!" 145 | msg (exp coefs.(0)) coefs.(1) c 146 | 147 | let csv bench file = 148 | let replace = String.map (function '.' -> ',' | c -> c) in 149 | let ch = open_out file in 150 | let prf ch x = 151 | output_string ch (replace (Printf.sprintf "%f" x)) 152 | in 153 | let m = List.length !bench + 3 in 154 | Printf.fprintf ch "pente(t) corr(t) pente(s) corr(s)\n%!"; 155 | Printf.fprintf ch "N ln(1+N) T ln(T) S ln(S)\n%!"; 156 | Printf.fprintf ch "=pente($D$4:$D$%d;$B$4:$B$%d) " 157 | m m; 158 | Printf.fprintf ch "=coefficient.correlation($D$4:$D$%d;$B$4:$B$%d) " 159 | m m; 160 | Printf.fprintf ch "=pente($F$4:$F$%d;$B$4:$B$%d) " 161 | m m; 162 | Printf.fprintf ch "=coefficient.correlation($F$4:$F$%d;$B$4:$B$%d)\n%!" 163 | m m; 164 | List.iter (fun (n, t, s) -> 165 | Printf.fprintf ch "%d %a %a %a %a %a\n%!" 
n 166 | prf (log (float (1 + n))) 167 | prf t 168 | prf (log t) 169 | prf s 170 | prf (log s)) (List.rev !bench); 171 | close_out ch 172 | -------------------------------------------------------------------------------- /lib/blank.ml: -------------------------------------------------------------------------------- 1 | (** Functions managing blanks *) 2 | 3 | type buf = Input.buffer 4 | type idx = Input.idx 5 | 6 | (** A blank function is just a function progressing in a buffer *) 7 | type blank = buf -> idx -> buf * idx 8 | type t = blank 9 | 10 | (** Use when you have no blank chars *) 11 | let none : blank = fun s n -> (s,n) 12 | 13 | (** Blank from a charset *) 14 | let from_charset : Charset.t -> blank = 15 | fun cs s n -> 16 | let rec fn s n = 17 | let (c,s',n') = Input.read s n in 18 | if Charset.mem cs c then fn s' n' else (s,n) 19 | in 20 | fn s n 21 | 22 | (** Blank from a terminal *) 23 | let from_terminal : 'a Lex.t -> blank = 24 | fun t s n -> 25 | try 26 | let (_,s,n) = t.f s n in 27 | (s,n) 28 | with Lex.NoParse -> (s,n) 29 | 30 | (** blank with c++/lisp/shell like comments *) 31 | let line_comments : ?cs:Charset.t -> string -> blank = 32 | fun ?(cs=Charset.from_string " \t\n\r") start_comment -> 33 | let start_comment = (Lex.string start_comment).f in 34 | fun s n -> 35 | let rec fn s n = 36 | let (c,s',n') = Input.read s n in 37 | if Charset.mem cs c then fn s' n' else 38 | try 39 | let (_,s,n) = start_comment s n in 40 | let rec gn s n = 41 | let (c,s',n') = Input.read s n in 42 | if c <> '\n' && c <> '\r' then gn s' n' 43 | else fn s n 44 | in 45 | gn s n 46 | with Lex.NoParse -> (s,n) 47 | in 48 | fn s n 49 | 50 | (** Layout records *) 51 | 52 | type layout_config = 53 | { old_blanks_before : bool 54 | (** Ignoring blanks with the old blank function before parsing? *) 55 | ; new_blanks_before : bool 56 | (** Then ignore blanks with the new blank function (before parsing)? *) 57 | ; new_blanks_after : bool 58 | (** Use the new blank function one last time before resuming old layout? *) 59 | ; old_blanks_after : bool 60 | (** Use then the old blank function one last time as well? *) } 61 | 62 | let default_layout_config : layout_config = 63 | { old_blanks_before = true 64 | ; new_blanks_before = false 65 | ; new_blanks_after = false 66 | ; old_blanks_after = true } 67 | -------------------------------------------------------------------------------- /lib/blank.mli: -------------------------------------------------------------------------------- 1 | (* {1 Blank: ignoring insignificant characters} 2 | 3 | It provides functions to eliminate "blank" characteres. *) 4 | 5 | 6 | (** Position in a buffer is a [Input.buffer] together with an index 7 | [Input.pos]. *) 8 | type buf = Input.buffer 9 | type idx = Input.idx 10 | 11 | (** A blank function is just a function progressing in a buffer *) 12 | type blank = buf -> idx -> buf * idx 13 | type t = blank 14 | 15 | (** {2 Functions managing blanks} *) 16 | 17 | (** Use when you have no blank chars *) 18 | val none : blank 19 | 20 | (** Blank from a charset *) 21 | val from_charset : Charset.t -> blank 22 | 23 | (** Blank from a terminal *) 24 | val from_terminal : 'a Lex.t -> blank 25 | 26 | (** [line_comments s] Blank with standard spaces and line starting with [s]. 27 | [cs] defaults to [Charset.from_string " \t\n\r"] *) 28 | val line_comments : ?cs:Charset.t -> string -> blank 29 | 30 | (* {2 records for layout (i.e. 
blank function) change} *) 31 | 32 | type layout_config = 33 | { old_blanks_before : bool 34 | (** Ignoring blanks with the old blank function before parsing? *) 35 | ; new_blanks_before : bool 36 | (** Then ignore blanks with the new blank function (before parsing)? *) 37 | ; new_blanks_after : bool 38 | (** Use the new blank function one last time before resuming old layout? *) 39 | ; old_blanks_after : bool 40 | (** Use then the old blank function one last time as well? *) } 41 | 42 | (** Default configuration, parsing with the old blanks before (i.e., the field 43 | [old_blanks_before] is [true]), and the new blanks after (i.e., the field 44 | [old_blanks_after] is also [true]). The other two fields are [false]. *) 45 | val default_layout_config : layout_config 46 | -------------------------------------------------------------------------------- /lib/charset.ml: -------------------------------------------------------------------------------- 1 | type charset = int array 2 | type t = charset 3 | 4 | let mask, shift, size = 5 | match Sys.word_size with 6 | | 32 -> 15, 4, 256 / 16 7 | | 64 -> 31, 5, 256 / 32 8 | | _ -> assert false (* Cannot happen... *) 9 | 10 | let compare = Stdlib.compare 11 | let equal = (=) 12 | 13 | let empty = Array.make size 0 14 | let full = Array.make size (-1) 15 | 16 | let complement = Array.map ((lxor) (-1)) 17 | 18 | let mem cs c = 19 | let i = Char.code c in 20 | cs.(i lsr shift) land (1 lsl (i land mask)) <> 0 21 | 22 | let addq cs c = 23 | let i = Char.code c in 24 | cs.(i lsr shift) <- cs.(i lsr shift) lor (1 lsl (i land mask)) 25 | 26 | let add cs c = 27 | let i = Char.code c in 28 | let cs = Array.copy cs in 29 | cs.(i lsr shift) <- cs.(i lsr shift) lor (1 lsl (i land mask)); 30 | cs 31 | 32 | let range cmin cmax = 33 | let res = ref empty in 34 | for i = Char.code cmin to Char.code cmax do 35 | res := add !res (Char.chr i) 36 | done; !res 37 | 38 | let delq cs c = 39 | let i = Char.code c in 40 | cs.(i lsr shift) <- cs.(i lsr shift) land (lnot (1 lsl (i land mask))) 41 | 42 | let del cs c = 43 | let i = Char.code c in 44 | let cs = Array.copy cs in 45 | cs.(i lsr shift) <- cs.(i lsr shift) land (lnot (1 lsl (i land mask))); 46 | cs 47 | 48 | let union cs1 cs2 = 49 | Array.mapi (fun i x -> x lor cs2.(i)) cs1 50 | 51 | let singleton = 52 | let tbl = Array.init 256 (fun i -> add empty (Char.chr i)) in 53 | fun c -> tbl.(Char.code c) 54 | 55 | let copy = Array.copy 56 | 57 | let from_string s = 58 | let rec build cs l = 59 | match l with 60 | | [] -> cs 61 | | '-' :: '-' :: _ -> invalid_arg "bad charset description" 62 | | '-' :: l -> build (add cs '-') l 63 | | _ :: '-' :: '-' :: _ -> invalid_arg "bad charset description" 64 | | c1 :: '-' :: c2 :: l -> build (union cs (range c1 c2)) l 65 | | c :: l -> build (add cs c) l 66 | in 67 | let string_to_list s = 68 | let l = ref [] in 69 | String.iter (fun c -> l := c :: !l) s; 70 | List.rev !l 71 | in 72 | build empty (string_to_list s) 73 | 74 | let show cs = 75 | let has_range min max = 76 | let has_all = ref true in 77 | for i = (Char.code min) to (Char.code max) do 78 | if not (mem cs (Char.chr i)) then has_all := false 79 | done; !has_all 80 | in 81 | if cs = full then "" 82 | else if cs = empty then "" 83 | else 84 | let res = ref "" in 85 | let add_all min max = 86 | for i = min to max do 87 | if mem cs (Char.chr i) then 88 | res := !res ^ (Char.escaped (Char.chr i)) 89 | done 90 | in 91 | let has_all_nums = has_range '0' '9' in 92 | let has_all_upper = has_range 'A' 'Z' in 93 | let has_all_lower = 
has_range 'a' 'z' in 94 | (* Before character '0' *) 95 | add_all 0 (Char.code '0' - 1); 96 | (* Numbers. *) 97 | if has_all_nums then res := !res ^ "0-9" 98 | else add_all (Char.code '0') (Char.code '9'); 99 | (* Before character 'A' *) 100 | add_all (Char.code '9' + 1) (Char.code 'A' - 1); 101 | (* Uppercase letters. *) 102 | if has_all_upper then res := !res ^ "A-Z" 103 | else add_all (Char.code 'A') (Char.code 'Z'); 104 | (* Before character 'a' *) 105 | add_all (Char.code 'Z' + 1) (Char.code 'a' - 1); 106 | (* Lowercase letters. *) 107 | if has_all_lower then res := !res ^ "a-z" 108 | else add_all (Char.code 'a') (Char.code 'z'); 109 | (* After character 'z'. *) 110 | add_all (Char.code 'z' + 1) 255; 111 | !res 112 | 113 | let pp ff cs = 114 | Format.pp_print_string ff (show cs) 115 | 116 | let show_full cs = 117 | let res = ref "" in 118 | for i = 0 to 255 do 119 | if mem cs (Char.chr i) then 120 | res := !res ^ (Char.escaped (Char.chr i)) 121 | done; !res 122 | 123 | let pp_full ff cs = 124 | Format.pp_print_string ff (show_full cs) 125 | -------------------------------------------------------------------------------- /lib/charset.mli: -------------------------------------------------------------------------------- 1 | (** A module providing efficient character sets. *) 2 | 3 | (** {2 Type} *) 4 | 5 | (** The abstract type for a character set. *) 6 | type charset 7 | 8 | (** Synonym of [charset]. *) 9 | type t = charset 10 | 11 | (** {2 Charset construction} *) 12 | 13 | (** The empty character set. *) 14 | val empty : charset 15 | 16 | (** The full character set. *) 17 | val full : charset 18 | 19 | (** [singleton c] returns a charset containing only [c]. *) 20 | val singleton : char -> charset 21 | 22 | (** [range cmin cmax] returns the charset containing all the characters 23 | between [cmin] and [cmax]. *) 24 | val range : char -> char -> charset 25 | 26 | (** [from_string s] returns the charset corresponding to the description 27 | string [s], which may contain standalone characters (different from 28 | ['-'], which is only allowed as first character) or ranges. They are 29 | build of start and end characters, separated by ['-']. An example of 30 | a valid description is ["-_a-zA-Z0-9"]. Note that [Invalid_argument] 31 | is raised in case of ill-formed description. *) 32 | val from_string : string -> charset 33 | 34 | (** [union cs1 cs2] builds a new charset that contins the union of the 35 | characters of [cs1] and [cs2]. *) 36 | val union : charset -> charset -> charset 37 | 38 | (** [complement cs] returns a new charset containing exactly characters 39 | that are not in [cs]. *) 40 | val complement : charset -> charset 41 | 42 | (** [add cs c] returns a new charset containing the characters of [cs] 43 | and the character [c]. *) 44 | val add : charset -> char -> charset 45 | 46 | (** [del cs c] returns a new charset containing the characters of [cs] 47 | but not the character [c]. *) 48 | val del : charset -> char -> charset 49 | 50 | (** {2 Membership test} *) 51 | 52 | (** [mem cs c] tests whether the charset [cs] contains [c]. *) 53 | val mem : charset -> char -> bool 54 | 55 | (** {2 Printing and string representation} *) 56 | 57 | (** [pp ff cs] prints the charset [cs] to output formatter [ff]. Compact 58 | format is used for printing: ranges, full and empty charsets are not 59 | given in full, but abbreviated. 
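    For instance, a small sketch of how construction, membership and the
    compact representation interact (the output of [show] given below is an
    expectation based on this compact format, shown only for illustration):
    {[
      let cs = Charset.from_string "a-zA-Z_" in
      assert (Charset.mem cs 'q');
      assert (not (Charset.mem cs '-'))
      (* [Charset.show cs] should then read as "A-Z_a-z" *)
    ]}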
*) 60 | val pp : Format.formatter -> charset -> unit 61 | 62 | (** [pp_full ff cs] is similar to [pp ff cs], but it does not abbreviate 63 | ranges, full and empty charsets. *) 64 | val pp_full : Format.formatter -> charset -> unit 65 | 66 | (** [show oc cs] builds a string representing the charset [cs] using the 67 | same compact format as [print]. *) 68 | val show : charset -> string 69 | 70 | (** [show_full oc cs] is the same as [show oc cs] but it does not use 71 | abreviations (i.e. all characters appear). *) 72 | val show_full : charset -> string 73 | 74 | (** {2 Manipulating charsets imperatively} *) 75 | 76 | (** [copy cs] make a copy of the charset [cs]. *) 77 | val copy : charset -> charset 78 | 79 | (** [addq cs c] adds the character [c] to the charset [cs]. Users must 80 | be particularly careful when using this function. In particular, it 81 | should not be used directly on [empty], [full] or the result of the 82 | [singleton] function as it would change their value permanently. It 83 | is advisable to prefer the use of [add] or to work on a [copy]. *) 84 | val addq : charset -> char -> unit 85 | 86 | (** [delq cs c] deletes the character [c] from the charset [cs]. Similar 87 | recomendatiosn as for [addq] apply. *) 88 | val delq : charset -> char -> unit 89 | 90 | (** {2 Comparison and equality test} *) 91 | 92 | (** [compare cs1 cs2] compares the charsets [cs1] and [cs2] according to 93 | some (unspecified) total order. *) 94 | val compare : charset -> charset -> int 95 | 96 | (** [equal cs1 cs2] tests the equality of charsets [cs1] and [cs2]. *) 97 | val equal : charset -> charset -> bool 98 | -------------------------------------------------------------------------------- /lib/comb.mli: -------------------------------------------------------------------------------- 1 | (** {1 Combinator library, using continuation} 2 | 3 | As usual left recursion is not supported, but the library is intended to be 4 | used through the [Grammar] module that provides elimination of left 5 | recursion. However, a cache combinatr is supported to overcome the cost of 6 | backtracking. *) 7 | 8 | (** {2 function and type usefull to the end-user} *) 9 | 10 | (** The type of combinator *) 11 | type 'a t 12 | 13 | (** Partial parsing. Beware, the returned position is not the maximum position 14 | that can be reached by the grammar. *) 15 | val partial_parse_buffer : 'a t -> Blank.t -> ?blank_after:bool -> 16 | Lex.buf -> Lex.idx -> 'a * Lex.buf * Lex.idx 17 | 18 | (** Returns all possible parse trees. Usefull for natural languages but also to 19 | debug ambiguity in a supposed non ambiguous grammar. If end of input is not 20 | parsed in some ways, some value may correspond to only the beginning of the 21 | input. Except when debugging or testing, you should rather use cache/merge 22 | anyway. *) 23 | val parse_all_buffer : 'a t -> Blank.t -> Lex.buf -> Lex.idx -> 'a list 24 | 25 | (** {2 combinator constructors, normally not needed by the casual user } *) 26 | 27 | (** [fail] is a parser rejecting every input (it always fails). *) 28 | val fail : 'a t 29 | 30 | (** Fails and report an error *) 31 | val error : string list -> 'a t 32 | 33 | val assert_false : 'a t 34 | 35 | (** [empty v] is a parser that only accepts the empty input and returns [v] as 36 | its semantic value. *) 37 | val empty : 'a -> 'a t 38 | 39 | (** [lexeme l] is a parser accepting the lexeme (or terminal) [l], and returns 40 | the corresponding semantic value. 
*) 41 | val lexeme : 'a Lex.lexeme -> 'a t 42 | 43 | (** [seq g1 g2] sequences the parsers [g1] and [g2]. The resulting parser 44 | starts by parsing using [g1], and then parses the rest of the input using 45 | [g2]. The result of parsing with [g2] is then apply to the result of [g1]. 46 | *) 47 | val seq : 'a t -> Charset.t -> ('a -> 'b) t -> 'b t 48 | 49 | (** optimisation of the above with no argument, allows costless right recursion. 50 | with no semantics *) 51 | val iseq : 'a t -> Charset.t -> 'b t -> 'b t 52 | 53 | (** [dseq c1 c2] is a dependant sequence, contrary to [seq c1 c2], the 54 | combinator used to parse after [c1] depends upon the first value returned by 55 | [c1]. It is a good idea to memoize the function c2. The separation of ['a] 56 | and ['b] in the smeantics of [g1] allows to depend on the smallest set of 57 | possible vaue which is important in case of memoisation. *) 58 | val dseq: ('a * 'b) t -> ('a -> ('b -> 'c) t) -> 'c t 59 | 60 | (** optimisation of the above with no argument, allows costless right recursion. 61 | with no semantics *) 62 | val diseq: 'a t -> ('a -> 'b t) -> 'b t 63 | 64 | (** Combinator parsing with the first combinator and in case of failure with the 65 | second from the same position. The optionnal charset corresponds to the 66 | charaters accepted at the beginning of the input for each combinators. The 67 | charset must be Charset.full if the corresponding combinator accept the 68 | empty input *) 69 | val alt : (Charset.t * 'a t) list -> 'a t 70 | 71 | (** [option a ~cs c] is an optimisation for [alt (empty a) ~cs c]. In fact it 72 | is better to use [alt] with grammar not accepting empty and use [option] to 73 | deal with an empty case *) 74 | val option: 'a -> Charset.t -> 'a t -> 'a t 75 | 76 | (** Parses with the given combinator and transforms the semantics with the given 77 | function *) 78 | val appl : 'a t -> ('a -> 'b) -> 'b t 79 | 80 | (** Replace the semantics of the given combinator *) 81 | val repl : 'a t -> 'b -> 'b t 82 | 83 | val lazy_ : 'a t -> 'a lazy_t t 84 | val force : 'a lazy_t t -> 'a t 85 | 86 | (** unmerge a merged ambiguous grammar, typically if the rest of the parsing 87 | uses dependant sequences. Allows for ambiguous terminals by return a list and 88 | the using unmerge. *) 89 | val unmerge : 'a list t -> 'a t 90 | 91 | (** Parses as the given combinator and give the position to the left of the 92 | parsing input as argument to the action *) 93 | val left_pos : (Pos.spos -> 'a) t -> 'a t 94 | 95 | (** Same as above with the position to the right *) 96 | val right_pos : (Pos.spos -> 'a) t -> 'a t 97 | 98 | (** To eliminate left recursion, lpos has to be left factored. if lpos is one 99 | single combinator, this adds a lot of closures in action code. To solve this 100 | problem, lpos is splitted in two combinators, one that pushes the position 101 | to a stack and pops after parsing and another that reads the position. *) 102 | val read_pos : (Pos.spos -> 'a) t -> 'a t 103 | 104 | (** key used by lr below *) 105 | type 'a key = 'a Assoc.key 106 | 107 | (** [lr c1 v c2] is an optimized version of [let rec r = seq c1 (seq r c2)] 108 | which is illegal as it is left recursive and loops. The optional charset 109 | indicates the characteres accepted by [c2] at the beginning of input. 
[v] is 110 | like variable bound in [c2], see [read_tbl] below *) 111 | val lr : 'a t -> 'a key -> Charset.t -> 'a t -> 'a t 112 | 113 | (** Same as above, but also store the position *) 114 | val lr_pos : 'a t -> 'a key -> Charset.t -> 'a t -> 'a t 115 | 116 | (** type to represent the left prefix of a mutually recursive grammar. 117 | the key represents the produced grammar for each left prefix. *) 118 | type mlr_left = 119 | LNil : mlr_left 120 | | LCns : 'a key * Charset.t * 'a t * mlr_left -> mlr_left 121 | 122 | (** type of the suffix to be repeted in a mutually recursive grammar. 123 | the first key represents the grammar that parsed the input before 124 | the second key represents the produced grammar. 125 | 126 | Somehow, mlr_right is a matrix R, the two keys being the index of 127 | the coefficient and mlr_left is a vector L. Parsing, will somehow use 128 | L R^n for n large enough; 129 | *) 130 | type mlr_right = 131 | RNil : mlr_right 132 | | RCns : 'a key * 'b key * Charset.t * 'b t * mlr_right -> mlr_right 133 | 134 | (** The combinator itself. The optionnal argument indicated that we need 135 | the position before parsing *) 136 | val mlr : ?lpos:bool -> mlr_left -> mlr_right -> 'a key -> 'a t 137 | 138 | (** combinator to access the value stored by lr. It must be uses as prefix of 139 | [c2] in [lr c1 c2]. For instance, the coding of [let rec r = seq c1 (seq r 140 | c2)] is [let k = Assoc.new_key () in lr c1 k (seq (read_tbl k) c2)]. Here we 141 | ommited the actions. This way of coding left recursion avoids to transform 142 | the action and produce closure. The code for elimination of left recursion 143 | is also much simpler *) 144 | val read_tbl : 'a key -> 'a t 145 | 146 | (** Allow to test the blank characteres before a grammar and more *) 147 | val test_before : (Lex.buf -> Lex.idx -> Lex.buf -> Lex.idx -> bool) 148 | -> 'a t -> 'a t 149 | 150 | (** Allow to test the blank characteres after a grammar and more *) 151 | val test_after : ('a -> Lex.buf -> Lex.idx -> Lex.buf -> Lex.idx -> bool) 152 | -> 'a t -> 'a t 153 | 154 | (** Access to a reference to a combinator, used by Grammar.compile for recursive 155 | grammars (not for left recursion *) 156 | val deref : 'a t ref -> 'a t 157 | 158 | (** Change the blank function used to parse with the given combinator. we can 159 | choose which blank to use at the boundary with the optional parameters. *) 160 | val change_layout : ?config:Blank.layout_config -> Blank.t -> 'a t -> 'a t 161 | 162 | (** Combinator that caches a grammar to avoid exponential behavior. parsing 163 | with the grammar from each position is memoized to avoid parsing twice the 164 | same sequence with the same grammar. *) 165 | type 'a merge = infos:Input.infos -> start:Input.byte_pos -> end_:Input.byte_pos 166 | -> 'a -> 'a -> 'a 167 | 168 | val cache : ?merge:'a merge -> 'a t -> 'a t 169 | -------------------------------------------------------------------------------- /lib/container.ml: -------------------------------------------------------------------------------- 1 | (** Standard eq-type. *) 2 | type ('a,'b) eq = 3 | | Y : ('a,'a) eq 4 | | N : ('a,'b) eq 5 | 6 | (** GADT to represent types in the syntax (extended when needed). *) 7 | type _ tag = .. 8 | 9 | module Make(V : sig type ('a,'b) elt end) = struct 10 | include V 11 | 12 | (** Non-uniform list (containing elements of possibly different types). 
*) 13 | type ptag = T : 'a tag -> ptag [@@unboxed] 14 | 15 | type 'b etag = E : 'a tag * ('a,'b) elt -> 'b etag 16 | 17 | type 'b nu_list = (ptag, 'b etag) Hashtbl.t 18 | 19 | 20 | (** Actual container. *) 21 | type 'a container = 22 | { mutable data : 'a nu_list (** Contents for each table. *) 23 | ; uid : 'a tag (** Unique identifier. *) 24 | ; eq : 'b. 'b tag -> ('a,'b) eq} 25 | 26 | (** Creation function for containers. *) 27 | let create : type a. unit -> a container = 28 | fun () -> 29 | let module M = struct type _ tag += T : a tag end in 30 | let eq : type b. b tag -> (a, b) eq = function M.T -> Y | _ -> N in 31 | {data = Hashtbl.create 16; uid = M.T; eq } 32 | 33 | (** Obtain the UID of a container. *) 34 | let address : 'b container -> 'b tag = fun c -> c.uid 35 | 36 | (** Equality on containers *) 37 | let eq : 'a container -> 'b container -> ('a, 'b) eq = 38 | fun c1 c2 -> c1.eq c2.uid 39 | 40 | (** unboxed mandatory for weak hashtbl to work, from 4.04.0 *) 41 | type any = C : 'b container -> any [@@unboxed] 42 | 43 | (** Container table. *) 44 | type 'a table = 45 | { tag : 'a tag (** Unique tag for this table. *) 46 | ; eq : 'b. 'b tag -> ('a,'b) eq (** Equality to the table's tag. *) 47 | ; mutable elts : any list 48 | } 49 | 50 | (* Find the value associated to the given table and container. *) 51 | let find : type a b. a table -> b container -> (a, b) elt = 52 | fun tab c -> 53 | match Hashtbl.find c.data (T tab.tag) with E(t,v) -> 54 | match tab.eq t with 55 | | Y -> v 56 | | N -> assert false 57 | 58 | (** Insert a new value associated to the given table and container. If a 59 | value is already present, it is overwriten. *) 60 | let add : type a b. a table -> b container -> (a, b) elt -> unit = 61 | fun tab c v -> 62 | let mem = Hashtbl.mem c.data (T tab.tag) in 63 | Hashtbl.replace c.data (T tab.tag) (E(tab.tag, v)); 64 | if not mem then tab.elts <- C c :: tab.elts 65 | 66 | let clear : type a. a table -> unit = fun tab -> 67 | List.iter (fun (C c) -> Hashtbl.remove c.data (T tab.tag)) tab.elts; 68 | tab.elts <- [] 69 | 70 | let create_table : type a. unit -> a table = fun () -> 71 | let module M = struct type _ tag += T : a tag end in 72 | let eq : type b. b tag -> (a, b) eq = function M.T -> Y | _ -> N in 73 | let res = { tag = M.T ; eq ; elts = [] } in 74 | Gc.finalise clear res; 75 | res 76 | 77 | let length : type a. a table -> int = fun tab -> 78 | let n = ref 0 in 79 | let fn : type b. b nu_list -> unit = fun data -> 80 | Hashtbl.iter (fun (T t) _ -> 81 | match tab.eq t with 82 | | Y -> incr n; 83 | | N -> assert false) data 84 | in 85 | List.iter (fun (C c) -> fn c.data) tab.elts; 86 | !n 87 | 88 | type 'a iter = { f : 'b.('a, 'b) elt -> unit } 89 | 90 | let iter : type a. a iter -> a table -> unit = fun f tab -> 91 | let fn : type b. b nu_list -> unit = fun data -> 92 | Hashtbl.iter (fun _ (E (t,v)) -> 93 | match tab.eq t with 94 | | Y -> f.f v; 95 | | N -> assert false) data 96 | in 97 | List.iter (fun (C c) -> fn c.data) tab.elts 98 | 99 | type ('a,'c) fold = { f : 'b.('a, 'b) elt -> 'c -> 'c } 100 | 101 | let fold : type a c. (a, c) fold -> a table -> c -> c = fun f tab acc -> 102 | let fn : type b. 
b nu_list -> c -> c = fun data acc -> 103 | Hashtbl.fold (fun _ (E (t,v)) acc -> 104 | match tab.eq t with 105 | | Y -> f.f v acc 106 | | N -> assert false) data acc 107 | in 108 | List.fold_left (fun acc (C c) -> fn c.data acc) acc tab.elts 109 | 110 | end 111 | 112 | module type Param = sig 113 | type 'a table 114 | type 'b container 115 | type ('a, 'b) elt 116 | val create : unit -> 'b container 117 | val create_table : unit -> 'a table 118 | val address : 'b container -> 'b tag 119 | val eq : 'a container -> 'b container -> ('a, 'b) eq 120 | val add : 'a table -> 'b container -> ('a, 'b) elt -> unit 121 | val find : 'a table -> 'b container -> ('a, 'b) elt 122 | val clear : 'a table -> unit 123 | val length : 'a table -> int 124 | type 'a iter = { f : 'b.('a, 'b) elt -> unit } 125 | val iter : 'a iter -> 'a table -> unit 126 | type ('a,'c) fold = { f : 'b.('a, 'b) elt -> 'c -> 'c } 127 | val fold : ('a, 'c) fold -> 'a table -> 'c -> 'c 128 | end 129 | 130 | type ('a, 'b) el = 'a 131 | include Make(struct type ('a, 'b) elt = ('a, 'b) el end) 132 | 133 | (* redefine iter and fold, to avoid the useless record but also to avoid 134 | https://caml.inria.fr/mantis/view.php?id=7636 135 | *) 136 | let iter : type a. (a -> unit) -> a table -> unit = fun f tab -> 137 | let fn : type b. b nu_list -> unit = fun data -> 138 | Hashtbl.iter (fun _ (E (t,v)) -> 139 | match tab.eq t with 140 | | Y -> f v; 141 | | N -> assert false) data 142 | in 143 | List.iter (fun (C c) -> fn c.data) tab.elts 144 | 145 | let fold : type a c. (a -> c -> c) -> a table -> c -> c = fun f tab acc -> 146 | let fn : type b. b nu_list -> c -> c = fun data acc -> 147 | Hashtbl.fold (fun _ (E (t,v)) acc -> 148 | match tab.eq t with 149 | | Y -> f v acc 150 | | N -> assert false) data acc 151 | in 152 | List.fold_left (fun acc (C c) -> fn c.data acc) acc tab.elts 153 | 154 | type ('a, 'b) le = 'b 155 | module Ref = Make(struct type ('a, 'b) elt = ('a, 'b) le end) 156 | 157 | (** Exported name for [container]. *) 158 | type t = unit container 159 | 160 | let eq c1 c2 = match eq c1 c2 with Y -> true | N -> false 161 | 162 | (* This does not work ! 163 | let iter : type a.(a -> unit) -> a table -> unit = 164 | fun f tabl -> 165 | iter { f = (let f : type b.(a, b) el -> unit = f in f) } tab 166 | *) 167 | -------------------------------------------------------------------------------- /lib/container.mli: -------------------------------------------------------------------------------- 1 | (** This library provide a type [Container.t], which can be used as a 2 | list of polymorphic references. 3 | 4 | Another way to see it, is map with access time in O(N) where 5 | N is the number of tables! Note: O(1) is possible, but usually 6 | there are very few tables at the same time. 7 | 8 | The typical use case is to have a record with a field of type 9 | [Container.t]. Then when you want to store some information in 10 | that field of type [a], you create with [Container.create_table], 11 | a value of type [a Container.table]. Then with [Container.add] 12 | and [Container.find], you can store value in your field of 13 | type [Container.t]. 
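    As a minimal sketch of that pattern (the stored value is purely
    illustrative):
    {[
      let cell : Container.t = Container.create () in
      let tbl : int Container.table = Container.create_table () in
      Container.add tbl cell 42;
      assert (Container.find tbl cell = 42)
    ]}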
14 | 15 | More precisely, consider the following type for oriented graphs: 16 | {[ 17 | type node = { name: string; 18 | mutable next: node list; 19 | ptrs : Container.t } 20 | type graph = node list (* at least on node per component *) 21 | ]} 22 | If you want to traverse the graphe, you create a table 23 | to associate a boolean to each node: 24 | {[ 25 | let iter graph f = 26 | let visited : bool Container.table = Container.create_table 101 in 27 | .... 28 | (* the table is automatically freed when visited is collected *) 29 | ]} 30 | If you want to compute the distance between two nodes: 31 | {[ 32 | let distance a b = 33 | let distance_to_a : int Container.table = Container.create_table 101 in 34 | .... 35 | ]} 36 | 37 | The functorial interface is useful when you have a parametric 38 | type. Considier a record type [ 'b t ] to which you want to 39 | associate values of type [ ('a,'b) v ]. It is enough for this to 40 | call the functor with 41 | 42 | [ module M = Constainer.Make(struct type ('a,'b) v end) ] 43 | 44 | and have a field of type [ 'b M.container ] inside your record. 45 | Then, you use the type [ 'a M.table ] when you want to start to 46 | associate values of type [ ('a,'b) v ] to the record. The same 47 | module [ M ] can be used for many types [ 'a ]. 48 | 49 | Remark: the non funtorial version is just defined by: 50 | [ Container.Make(struct type ('a, 'b) elt = 'a end ] 51 | *) 52 | 53 | (** Type of a container cell *) 54 | type t 55 | 56 | (** equality on container *) 57 | val eq : t -> t -> bool 58 | 59 | (** Type of a container table. You must create a container table of 60 | type [ a table ] to store value of type [ a ] in a container 61 | cell. Many table can have the same type, the value are associated 62 | to the pair (cell, table), not just to the cell. *) 63 | type 'a table 64 | 65 | (** [ create () ] creates a new container cell *) 66 | val create : unit -> t 67 | 68 | (** [ add tab cell v ] associates the value v to the pair (tab, cell). 69 | complexity if O(N) where N is the number of tables with a value 70 | associated to this cell. *) 71 | val add : 'a table -> t -> 'a -> unit 72 | 73 | (** [ find tab cell ] return the value associated to (tab, cell). 74 | raises Not_found if the are no such value *) 75 | val find : 'a table -> t -> 'a 76 | 77 | (** [ clear tab ] removed all value associated to a table. *) 78 | val clear : 'a table -> unit 79 | 80 | (** [ create_table () ] creates a new table *) 81 | val create_table : unit -> 'a table 82 | 83 | (** GADT to represent types in the syntax (extended when needed). *) 84 | type _ tag = .. 85 | 86 | (** [ address n ] return a unique id of each cell *) 87 | val address : t -> unit tag 88 | 89 | (** iterator as in List module *) 90 | val iter : ('a -> unit) -> 'a table -> unit 91 | 92 | (** fold as in List module *) 93 | val fold : ('a -> 'c -> 'c) -> 'a table -> 'c -> 'c 94 | 95 | (** definition of the [elt] type *) 96 | type ('a, 'b) elt = 'a 97 | 98 | (** Standard eq-type. 
*) 99 | type ('a,'b) eq = 100 | | Y : ('a,'a) eq 101 | | N : ('a,'b) eq 102 | 103 | (** functorial verstion storing element of type ('a, 'b) elt for container of 104 | the 'b and table of type 'a *) 105 | module type Param = sig 106 | type 'a table 107 | type 'b container 108 | type ('a, 'b) elt 109 | val create : unit -> 'b container 110 | val create_table : unit -> 'a table 111 | val address : 'b container -> 'b tag 112 | val eq : 'a container -> 'b container -> ('a, 'b) eq 113 | val add : 'a table -> 'b container -> ('a, 'b) elt -> unit 114 | val find : 'a table -> 'b container -> ('a, 'b) elt 115 | val clear : 'a table -> unit 116 | val length : 'a table -> int 117 | type 'a iter = { f : 'b.('a, 'b) elt -> unit } 118 | val iter : 'a iter -> 'a table -> unit 119 | type ('a,'c) fold = { f : 'b.('a, 'b) elt -> 'c -> 'c } 120 | val fold : ('a, 'c) fold -> 'a table -> 'c -> 'c 121 | end 122 | 123 | module Make(T : sig type ('a,'b) elt end) : Param 124 | with type ('a, 'b) elt = ('a,'b) T.elt 125 | 126 | (** useful particular case, when container are used for graphs or union 127 | find and you associate containers to containers *) 128 | module Ref : Param with type ('a,'b) elt = 'b 129 | -------------------------------------------------------------------------------- /lib/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name pacomb) 3 | (public_name pacomb) 4 | (modules :standard) 5 | (preprocess (pps pacomb.ppx)) 6 | (libraries stdlib-shims unix) 7 | (private_modules container hashtbl_eq assoc comb unionFind heap interpolate)) 8 | 9 | (documentation 10 | (package pacomb)) 11 | -------------------------------------------------------------------------------- /lib/grammar.mli: -------------------------------------------------------------------------------- 1 | (** {1 Main module of Pacomb} *) 2 | 3 | (** {2 Type} *) 4 | 5 | (** type of a grammar with semantical action of type ['a ].*) 6 | type 'a grammar 7 | 8 | (** An abbreviation *) 9 | type 'a t = 'a grammar 10 | 11 | type name_kind = Created | Inherited | Given 12 | 13 | type name = string * name_kind 14 | 15 | (** {2 Grammar contructors} *) 16 | 17 | (** All construœqctors can be given an optional [name] argument that is used 18 | when printing the grammar. *) 19 | 20 | (** [print_grammar ch g] prints the grammar [g] of the given output channel. if 21 | [def=false] (the default is [true]) it will print the transformed grammar 22 | prior to compilation. *) 23 | val print_grammar : ?no_other:bool -> ?def:bool -> out_channel -> 24 | 'a grammar -> unit 25 | 26 | (** [fail ()] is a grammar that parses nothing (always fails) *) 27 | val fail : ?name:string -> unit -> 'a grammar 28 | 29 | (** fails reporting an error *) 30 | val error : ?name:string -> string list -> 'a grammar 31 | 32 | (** [empty a] accepts the empty input and returns [a] *) 33 | val empty : ?name:string -> 'a -> 'a grammar 34 | 35 | (** [cond b] is [if b then empty () else fail ()]. Very usefull in grammar 36 | family at the beginning of a rule. The test is done at grammar construction, 37 | not at parsing time (except if it is used in a dependant grammar). *) 38 | val cond : ?name:string -> bool -> unit grammar 39 | 40 | (** [term t] accepts the terminal [t] and returns its semantics. 
See module 41 | [Lex] *) 42 | val term : ?name:string -> 'a Lex.terminal -> 'a grammar 43 | 44 | (** [appl g f] parses with [g] and apply [f] to the resulting semantics *) 45 | val appl : ?name:string -> 'a grammar -> ('a -> 'b) -> 'b grammar 46 | 47 | (** [unmerge g] introduce multiple parse branch from a list of semantics. 48 | Allows to create ambiguous terminals and allows fr unmerge to continue 49 | a dependent parsing *) 50 | val unmerge : ?name:string -> 'a list grammar -> 'a grammar 51 | 52 | val lazy_ : ?name:string -> 'a grammar -> 'a lazy_t grammar 53 | val force : ?name:string -> 'a lazy_t grammar -> 'a grammar 54 | 55 | (** [alt [g1;g2;...;gn]] parses with [g1] and if it fails then [g2] and so on *) 56 | val alt : ?name:string -> 'a grammar list -> 'a grammar 57 | 58 | (** [seq g1 g2] parses with [g1] and then with [g2] for the rest of the input, 59 | combine both semantics by apply the semantics of [g2] to [g1] *) 60 | val seq : ?name:string -> 'a grammar -> ('a -> 'b) grammar -> 'b grammar 61 | 62 | (** variation of the abover when we do not use all semantics, it allows cost 63 | less right recursion with no semantics *) 64 | val iseq : ?name:string -> 'a grammar -> 'b grammar -> 'b grammar 65 | 66 | (** [dseq g1 g2)] is a dependant sequence, the grammar [g2] used after [g1] 67 | may depend upon the semantics of [g1]. This is not very efficient as the 68 | grammar [g2] must be compiled at parsing time. [g2] is memoized by default 69 | to partially overcome this fact. *) 70 | val dseq : ?name:string -> ('a * 'b) grammar 71 | -> ('a -> ('b -> 'c) grammar) -> 'c grammar 72 | 73 | (** variation of the abover when we do not use all semantics, it allows cost 74 | less right recursion with no semantics *) 75 | val diseq : ?name:string -> 'a grammar -> ('a -> 'b grammar) -> 'b grammar 76 | 77 | (** [lpos g] is identical to [g] but passes the position just before parsing 78 | with [g] to the semantical action of [g] *) 79 | val lpos : ?name:string -> (Pos.spos -> 'a) grammar -> 'a grammar 80 | 81 | (** [rpos g] is identical to [g] but passes the position just after parsing with 82 | [g] to the semantical action of [g] *) 83 | val rpos : ?name:string -> (Pos.spos -> 'a) grammar -> 'a grammar 84 | 85 | val mk_pos : ?name:string -> (Pos.pos -> 'a) grammar -> 'a grammar 86 | 87 | (** variants of seq with the position of the first iterm *) 88 | val seq_pos : ?name:string -> 'a grammar -> (Pos.pos * 'a -> 'b) grammar 89 | -> 'b grammar 90 | 91 | (** variants of dseq with the position of the first iterm *) 92 | val dseq_pos : ?name:string -> ('a * 'b) grammar 93 | -> ('a -> (Pos.pos * 'b -> 'c) grammar) -> 'c grammar 94 | 95 | (** type for a merge function *) 96 | type 'a merge = infos:Input.infos -> start:Input.byte_pos -> end_:Input.byte_pos 97 | -> 'a -> 'a -> 'a 98 | 99 | (** [cache g] avoids to parse twice the same input with [g] by memoizing the 100 | result of the first parsing. The optional [merge] parameter is applied to 101 | group semantics corresponding to the same part of the input. Using [cache] 102 | with [merge] allows to recover a polynomial time complexity (cubic at worst) 103 | and a quadratic space (in the size of the input) *) 104 | val cache : ?name:string -> ?merge:'a merge -> 'a grammar -> 'a grammar 105 | 106 | (** [set_debug_merge fmt] Will produce a debugging message for all ambiguities 107 | including the possition. 
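    As an illustration of the kind of ambiguity being reported, here is a
    sketch of [cache] used with a [merge] function of the type above (the
    grammar [g] is left abstract; counting parse trees by addition is the
    same idea as [@merge (+.)] in examples/catalan.ml):
    {[
      let count_trees (g : float Grammar.t) : float Grammar.t =
        Grammar.cache ~merge:(fun ~infos:_ ~start:_ ~end_:_ x y -> x +. y) g
    ]}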
*) 108 | val set_debug_merge : Format.formatter -> unit 109 | val unset_debug_merge : unit -> unit 110 | 111 | (** allows to perform a test, the test function receive the position before 112 | and after the blanks *) 113 | val test_before : ?name:string 114 | -> (Lex.buf -> Lex.idx -> Lex.buf -> Lex.idx -> bool) 115 | -> 'a grammar -> 'a grammar 116 | 117 | val test_after : ?name:string 118 | -> ('a -> Lex.buf -> Lex.idx -> Lex.buf -> Lex.idx -> bool) 119 | -> 'a grammar -> 'a grammar 120 | 121 | (** particular cases of the above testing the absence of blanks. *) 122 | val no_blank_before : ?name:string -> 'a grammar -> 'a grammar 123 | val no_blank_after : ?name:string -> 'a grammar -> 'a grammar 124 | 125 | (** [layout b g] changes the blank function to parse the input with the grammar 126 | [g]. The optional parameters allow to control which blanks are used at the 127 | boundary. Both can be used in which case the new blanks are used second 128 | before parsing with [g] and first after. *) 129 | val layout : ?name:string -> ?config:Blank.layout_config 130 | -> Blank.t -> 'a grammar -> 'a grammar 131 | 132 | (** usual option/star/plus combinator *) 133 | val option : ?name:string -> 'a grammar -> 'a option grammar 134 | val default_option : ?name:string -> 'a -> 'a grammar -> 'a grammar 135 | val star : ?name:string -> 'a grammar -> 'a list grammar 136 | val plus : ?name:string -> 'a grammar -> 'a list grammar 137 | val star_sep : ?name:string -> 'b grammar -> 'a grammar -> 'a list grammar 138 | val plus_sep : ?name:string -> 'b grammar -> 'a grammar -> 'a list grammar 139 | 140 | (** {2 Definition of recursive grammars } *) 141 | 142 | (** to define recursive grammars, one may declare the grammar first and then 143 | gives its value. [declare_grammar name] creates an undefined grammar with 144 | the given name *) 145 | val declare_grammar : string -> 'a grammar 146 | 147 | (** [set_grammar g1 g2] set the value of [g1] declared with [declare_grammar]. 148 | will raise [Invalid_argument] if [g1] was not defined using 149 | [declare_grammar] or if it was already set.*) 150 | val set_grammar : 'a grammar -> 'a grammar -> unit 151 | 152 | (** [fixpoint g] compute the fixpoint of [g], that is a grammar [g0] such that 153 | [g0 = g g0] *) 154 | val fixpoint : ?name:string -> ('a grammar -> 'a grammar) -> 'a grammar 155 | 156 | (** [grammar_family to_str name] returns a pair [(gs, set_gs)], where [gs] is a 157 | finite family of grammars parametrized by a value of type ['a]. A name 158 | [name] is to be provided for the family, and an optional function [to_str] 159 | can be provided to print the parameter and display better error messages. *) 160 | val grammar_family : ?param_to_string:('a -> string) -> string 161 | -> ('a -> 'b grammar) * (('a -> 'b grammar) -> unit) 162 | 163 | (** 164 | {[ 165 | (* Declare the grammar family *) 166 | let (gr, set_gr) = grammar_family to_str name in 167 | 168 | ... code using grammars of gr to define mutually recursive grammars ... 169 | 170 | (* Define the grammar family *) 171 | let _ = set_gr the_grammars 172 | 173 | ... now the new family can be used ... 
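     (* A hypothetical concrete instance (illustrative only): a family of
        integer grammars indexed by a boolean parameter. *)
     let (atom, set_atom) =
       grammar_family ~param_to_string:string_of_bool "atom" in
     let _ = set_atom (fun allow_empty ->
         if allow_empty then alt [term (Lex.nat ()); empty 0]
         else term (Lex.nat ()))
     in
     ... atom true is now an ordinary int grammar ...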
174 | ]} 175 | *) 176 | 177 | (** {2 Compilation of a grammar and various} *) 178 | 179 | (** [compile g] produces a combinator that can be used to actually do the 180 | parsing see the [Comb] module *) 181 | val compile : 'a grammar -> 'a Comb.t 182 | 183 | (** gives the grammar name *) 184 | val grammar_name : 'a grammar -> string 185 | 186 | (** allows to rename a grammar *) 187 | val give_name : string -> 'a grammar -> 'a grammar 188 | 189 | (** Parse a whole input buffer. the eof combinator is added at 190 | the end of the given combinator *) 191 | val parse_buffer : 'a grammar -> Blank.t -> ?offset:Lex.idx -> Lex.buf -> 'a 192 | 193 | (** Partial parsing. Beware, the returned position is not the maximum position 194 | that can be reached by the grammar if the grammar is ambiguous. In this 195 | case, a message is printed on stderr. The charset is the character accepted 196 | at the end of input. Mainly useful with 'eof' when [blank_after] is 197 | [true]. *) 198 | val partial_parse_buffer : 'a grammar -> Blank.t -> ?blank_after:bool -> 199 | ?offset:Lex.idx -> Lex.buf -> 'a * Lex.buf * Lex.idx 200 | 201 | (** Returns all possible parse trees. Usefull for natural languages but also to 202 | debug ambiguity in a supposed non ambiguous grammar. *) 203 | val parse_all_buffer : 'a grammar -> Blank.t -> 204 | ?offset:Lex.idx -> Lex.buf -> 'a list 205 | 206 | (** Parse a whole string, reporting position according to utf8 if optional 207 | argument [utf8] is given and [Utf8.UTF8 or Utf8.CJK_UTF8] *) 208 | val parse_string : ?utf8:Utf8.context -> 'a grammar -> Blank.t -> string -> 'a 209 | 210 | (** Parse a whole input channel, reporting postiion according to utf8. 211 | After closing the file position reporting by parsing cannot be transformed 212 | bash to line/column number. *) 213 | val parse_channel : ?utf8:Utf8.context -> ?filename:string -> 214 | 'a grammar -> Blank.t -> in_channel -> 'a 215 | 216 | (** Parse a whole Unix.file_desc, reporting postiion according to utf8. 217 | After closing the file position reporting by parsing cannot be transformed 218 | bash to line/column number. *) 219 | val parse_fd : ?utf8:Utf8.context -> ?filename:string -> 220 | 'a grammar -> Blank.t -> Unix.file_descr -> 'a 221 | 222 | (** Parse a whole file, reporting postiion according to utf8. 223 | File is reopen to read position. So the file should not change on disk *) 224 | val parse_file : ?utf8:Utf8.context -> 225 | 'a grammar -> Blank.t -> string -> 'a 226 | -------------------------------------------------------------------------------- /lib/hashtbl_eq.ml: -------------------------------------------------------------------------------- 1 | (** [eq_closure] is an alternative to the polymorphic equality function [(=)], 2 | that compares closures using [(==)] instead of failing. Note that equality 3 | testing is consequently not perfect. *) 4 | let eq_closure : type a. a -> a -> bool = fun v1 v2 -> 5 | (* We remember encountered values in [eq_done] to handle cyclicity. *) 6 | let eq_done : (Obj.t * Obj.t) list ref = ref [] in 7 | let rec eq : Obj.t -> Obj.t -> bool = fun v1 v2 -> 8 | (* Physical equality is tested first. *) 9 | v1 == v2 || 10 | (* We then look at tags, and unfold potential forward blocks. *) 11 | let t1 = Obj.tag v1 in 12 | if t1 = Obj.forward_tag then eq (Obj.field v1 0) v2 else 13 | let t2 = Obj.tag v2 in 14 | if t2 = Obj.forward_tag then eq v1 (Obj.field v2 0) else 15 | (* Tags must otherwise be the same to have equality. 
*) 16 | t1 == t2 && 17 | (* Strings, doubles and arrays of doubles are compared using [=]. *) 18 | if t1 = Obj.string_tag then v1 = v2 else 19 | if t1 = Obj.double_tag then v1 = v2 else 20 | if t1 = Obj.double_array_tag then v1 = v2 else 21 | (* For everything that is not a non-constant constructors, equality failed 22 | at this point (e.g., for closures or sealed values). Such values are only 23 | considered equal if physical equality succeeds (it was tested already). *) 24 | Obj.first_non_constant_constructor_tag <= t1 && 25 | t1 <= Obj.last_non_constant_constructor_tag && 26 | Obj.size v1 == Obj.size v2 && (* Number of fields should be equal. *) 27 | (* We recursively explore the fields. *) 28 | let rec fn = function 29 | | (v1', v2')::l -> 30 | begin 31 | match (v1 == v1', v2 == v2') with 32 | | (true , true ) -> true 33 | | (true , false) -> false 34 | | (false, true ) -> false 35 | | (_ , _ ) -> fn l 36 | end 37 | | [] -> 38 | let rec gn i = 39 | i < 0 || (eq (Obj.field v1 i) (Obj.field v2 i) && gn (i-1)) 40 | in 41 | eq_done := (v1, v2) :: !eq_done; gn (Obj.size v1 - 1) 42 | in 43 | fn !eq_done 44 | in 45 | eq (Obj.repr v1) (Obj.repr v2) 46 | 47 | type ('a, 'b) t = 48 | { eq_key : 'a -> 'a -> bool 49 | (** Equality function for keys. *) 50 | ; mutable nb_buckets : int 51 | (** Number of buckets. *) 52 | ; mutable buckets : ('a * 'b) list array 53 | (** Array of buckets. *) 54 | ; mutable max_size : int 55 | (** Current maximum bucket size. *) 56 | ; mutable size_limit : int 57 | (** Maximum size allowed for a bucket. *) } 58 | 59 | (** Create an empty hash table. *) 60 | let create : ?eq_key:('a -> 'a -> bool) -> int -> ('a, 'b) t = 61 | fun ?(eq_key=eq_closure) nb_buckets -> 62 | let rec log2 n = if n <= 0 then 0 else 1 + log2 (n lsr 1) in 63 | let nb_buckets = max nb_buckets 8 in 64 | let buckets = Array.make nb_buckets [] in 65 | let size_limit = log2 nb_buckets + 7 in 66 | { eq_key ; nb_buckets ; buckets ; max_size = 0 ; size_limit } 67 | 68 | (** Iterates a function over the bindings of the given hash table. *) 69 | let iter : ('a -> 'b -> unit) -> ('a, 'b) t -> unit = fun f htbl -> 70 | Array.iter (List.iter (fun (k,v) -> f k v)) htbl.buckets 71 | 72 | (** Finds the bucket corresponding to the given key in the hash table. *) 73 | let find_bucket : ('a, 'b) t -> 'a -> int = fun htbl k -> 74 | Hashtbl.hash k mod htbl.nb_buckets 75 | 76 | (** Lookup function. *) 77 | let find : ('a, 'b) t -> 'a -> 'b = fun h k -> 78 | let i = find_bucket h k in 79 | let rec find = function 80 | | [] -> raise Not_found 81 | | (kv, v) :: xs -> if h.eq_key k kv then v else find xs 82 | in 83 | find h.buckets.(i) 84 | 85 | (** Insertion function (replacing existing binding). *) 86 | let rec add : ('a, 'b) t -> 'a -> 'b -> unit = fun h k v -> 87 | (* Doubles the size of the hash table. *) 88 | let grow : ('a, 'b) t -> unit = fun h -> 89 | let old_tbl = h.buckets in 90 | h.nb_buckets <- h.nb_buckets * 2; 91 | h.buckets <- Array.make h.nb_buckets []; 92 | h.size_limit <- h.size_limit + 1; 93 | h.max_size <- 0; 94 | Array.iter (List.iter (fun (k,v) -> add h k v)) old_tbl 95 | in 96 | (* Removes existing binding, or returns size of bucket with exception. *) 97 | let exception Size_is of int in 98 | let rec remove sz l = 99 | match l with 100 | | [] -> raise (Size_is sz) 101 | | b :: l -> if h.eq_key k (fst b) then l else b :: remove (sz+1) l 102 | in 103 | (* Find the right bucket and replace the binding (if any). 
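     If the key is not bound, [remove] raises [Size_is] with the length of the
     bucket; the handler below then inserts the new binding, records the bucket
     size in [max_size], and grows the table when it gets too large.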
*) 104 | let i = find_bucket h k in 105 | try h.buckets.(i) <- (k,v) :: remove 0 h.buckets.(i) with Size_is(sz) -> 106 | (* Otherwise insert the new binding. *) 107 | h.buckets.(i) <- (k,v) :: h.buckets.(i); 108 | h.max_size <- max h.max_size sz; 109 | (* Grow the table if the bucket is too large. *) 110 | if h.max_size > h.size_limit then grow h 111 | -------------------------------------------------------------------------------- /lib/hashtbl_eq.mli: -------------------------------------------------------------------------------- 1 | (** Custom hash-table module. *) 2 | 3 | (** The [Hashtbl] module (of the standard library) does not work when keys can 4 | contain closures, since they are compared with polymorphic equality [(=)]. 5 | For memoization though, using a perfect equality test is not important. In 6 | other words, it does not really matter if the equality test produces false 7 | negatives when comparing closures. We thus use an alternative, polymorphic 8 | equality function that behaves as [(=)], but compares closures with using 9 | [Marshall.to_string] instead of failing. *) 10 | 11 | (** Representation of a hash table with keys of type ['a] and elements of type 12 | ['b], with a custom equality test. *) 13 | type ('a, 'b) t 14 | 15 | (** [create ~eq_key n] creates an empty hash table with initial size n (if [n] 16 | is not reasonable then a default value is used), and using [eq_key] as an 17 | equality test for keys. If no [eq_key] is given, the function described in 18 | the preamble of this module is used. *) 19 | val create : ?eq_key:('a -> 'a -> bool) -> int -> ('a, 'b) t 20 | 21 | (** [add htbl k v] extends the hash table [htbl] with a new binding of key [k] 22 | to value [v]. Any previous binding for key [k] is removed. *) 23 | val add : ('a, 'b) t -> 'a -> 'b -> unit 24 | 25 | (** [find htbl k] returns the value maped to [k] in the hash table [htbl]. The 26 | exception [Not_found] is raised if there is no such binding. *) 27 | val find : ('a, 'b) t -> 'a -> 'b 28 | 29 | (** [iter f htbl] calls [f k v] for every binding of a key [k] to a value [v], 30 | in the hash table [htbl]. The order of iteration is unspecified. *) 31 | val iter : ('a -> 'b -> unit) -> ('a, 'b) t -> unit 32 | -------------------------------------------------------------------------------- /lib/heap.ml: -------------------------------------------------------------------------------- 1 | 2 | let empty = [] 3 | 4 | type 'a t = 'a list list 5 | 6 | let add cmp x h = 7 | let rec fn acc h = match h with 8 | | [] -> List.rev_append acc [[x]] 9 | | []::_ -> assert false 10 | | (y::_ as l)::h' -> 11 | match cmp x y with 12 | | 0 -> List.rev_append acc ((x::l) :: h') 13 | | n when n < 0 -> List.rev_append acc ([x]:: h) 14 | | _ -> fn (l::acc) h' 15 | in 16 | fn [] h 17 | 18 | let remove l = match l with 19 | | [] -> raise Not_found 20 | | [x]::h -> x, h 21 | | (x::l)::h -> (x, (l::h)) 22 | | []::_ -> assert false 23 | -------------------------------------------------------------------------------- /lib/heap.mli: -------------------------------------------------------------------------------- 1 | 2 | (** type of a priority heap, holding elements of type 'a *) 3 | type 'a t 4 | 5 | (** empty heap *) 6 | val empty : 'a t 7 | 8 | (** [add cmp x h] add [x] to the heap [h] with the priority [p] *) 9 | val add : ('a -> 'a -> int) -> 'a -> 'a t -> 'a t 10 | 11 | (** [remove h] returns an element with the least priority from the heap and the 12 | resulting heap. 
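    A sketch of the intended use (the element returned first is minimal for the
    comparison function given to [add]):
    {[
      let h = add compare 3 (add compare 1 (add compare 2 empty)) in
      let (x, h) = remove h in   (* x = 1 *)
      let (y, _) = remove h in   (* y = 2 *)
      ...
    ]}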
Semantics is unspecified for equal priorities *) 13 | val remove : 'a t -> ('a * 'a t) 14 | -------------------------------------------------------------------------------- /lib/input.ml: -------------------------------------------------------------------------------- 1 | type context = Utf8.context 2 | 3 | type stream_infos = 4 | | File of { name : string 5 | ; length : int 6 | ; date : float 7 | ; close : unit -> unit 8 | } 9 | | String of string 10 | | Stream 11 | 12 | let stream_infos_of_fd file_name fd close_file = 13 | let open Unix in 14 | let s = fstat fd in 15 | File { name = file_name 16 | ; length = s.st_size 17 | ; date = s.st_mtime 18 | ; close = close_file } 19 | 20 | let stream_infos_of_ch file_name ch = 21 | stream_infos_of_fd file_name (Unix.descr_of_in_channel ch) 22 | (fun () -> close_in ch) 23 | 24 | let stream_infos_of_str str = 25 | String str 26 | 27 | type infos = 28 | { utf8 : Utf8.context (** Uses utf8 for positions *) 29 | ; stream_infos : stream_infos (** The name of the buffer (e.g. file name) *) 30 | ; uid : int (** Unique identifier *) 31 | } 32 | 33 | let stream_infos infos = infos.stream_infos 34 | 35 | let filename infos = match infos.stream_infos with 36 | | File { name } -> name 37 | | _ -> "" 38 | 39 | let utf8 infos = infos.utf8 40 | 41 | type buffer = 42 | { boff : int (* Offset to the line ( bytes ) *) 43 | ; data : string (* Contents of the buffer *) 44 | ; mutable next : buffer Lazy.t (* Following line *) 45 | ; mutable ctnr : Container.t option array 46 | (* for map table, initialized if used *) 47 | ; infos : infos (* infos common to the whole file *) 48 | } 49 | 50 | let close buffer = 51 | match buffer.infos.stream_infos with 52 | | File f -> f.close () 53 | | _ -> () 54 | 55 | 56 | (* Generate a unique identifier. *) 57 | let new_uid = 58 | let c = ref 0 in 59 | fun () -> let uid = !c in incr c; uid 60 | 61 | (** infos function *) 62 | let infos b = b.infos 63 | 64 | let phantom_infos = 65 | { utf8 = Utf8.ASCII 66 | ; stream_infos = Stream 67 | ; uid = new_uid () 68 | } 69 | 70 | 71 | (** idx type and constant *) 72 | type idx = int 73 | 74 | let init_idx = 0 75 | 76 | (** byte_pos type and constant *) 77 | type byte_pos = int 78 | 79 | let int_of_byte_pos x = x 80 | 81 | let init_byte_pos = 0 82 | 83 | let phantom_byte_pos = -1 84 | 85 | (** spos type and constant *) 86 | type spos = infos * byte_pos 87 | 88 | let phantom_spos = (phantom_infos, phantom_byte_pos) 89 | 90 | (* Emtpy buffer. *) 91 | let empty_buffer infos boff = 92 | let rec line = lazy 93 | { boff; data = "" ; next = line ; infos ; ctnr = [||] } 94 | in Lazy.force line 95 | 96 | let is_eof b = b.data = "" 97 | 98 | let llen b = String.length b.data 99 | 100 | (* Test if a buffer is empty. *) 101 | let rec is_empty l idx = 102 | if idx < llen l then false 103 | else if idx = 0 then is_eof l 104 | else is_empty (Lazy.force l.next) (idx - llen l) 105 | 106 | (* Read the character at the given position in the given buffer. *) 107 | let [@inline] rec read l i = 108 | if i < llen l then (l.data.[i], l , i+1) 109 | else if is_eof l then ('\255', l, 0) 110 | else read (Lazy.force l.next) (i - llen l) 111 | 112 | (* Get the character at the given position in the given buffer. 
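   Unlike [read], only the character is returned, not the advanced buffer and
   index.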
*) 113 | let [@nline] rec get l i = 114 | if i < llen l then l.data.[i] else 115 | if is_eof l then '\255' else 116 | get (Lazy.force l.next) (i - llen l) 117 | 118 | (* substring of a buffer *) 119 | let sub b i len = 120 | let s = Bytes.create len in 121 | let rec fn b i j = 122 | if j = len then Bytes.unsafe_to_string s 123 | else 124 | let (c,b,i) = read b i in 125 | Bytes.set s j c; 126 | fn b i (j+1) 127 | in 128 | fn b i 0 129 | 130 | (* byte position *) 131 | let [@inline] byte_pos b p = b.boff + p 132 | 133 | (* short position *) 134 | let [@inline] spos b p = (b.infos, b.boff + p) 135 | 136 | (* Equality of buffers. *) 137 | let buffer_equal b1 b2 = 138 | b1.infos.uid = b2.infos.uid && b1.boff = b2.boff 139 | 140 | (* Comparison of buffers. *) 141 | let buffer_compare b1 b2 = 142 | match b1.boff - b2.boff with 143 | | 0 -> b1.infos.uid - b2.infos.uid 144 | | c -> c 145 | 146 | (* Get the unique identifier of the buffer. *) 147 | let buffer_uid b = b.infos.uid 148 | 149 | 150 | exception NoLineNorColumnNumber 151 | 152 | 153 | let buf_size = 0x10000 154 | 155 | (* returns [(s,nl)] with [nl = true] iff there is a newline at the end of [s] *) 156 | let input_buffer ch = 157 | let res = Bytes.create buf_size in 158 | let n = input ch res 0 buf_size in 159 | if n = 0 then (* n = 0: we are at EOF *) 160 | raise End_of_file 161 | else if n = buf_size then 162 | Bytes.unsafe_to_string res 163 | else 164 | Bytes.sub_string res 0 n 165 | 166 | let fd_buffer fd = 167 | let res = Bytes.create buf_size in 168 | let n = Unix.read fd res 0 buf_size in 169 | if n = 0 then (* n = 0: we are at EOF *) 170 | raise End_of_file 171 | else if n = buf_size then 172 | Bytes.unsafe_to_string res 173 | else 174 | Bytes.sub_string res 0 n 175 | 176 | let from_fun utf8 stream_infos get_line file = 177 | let infos = { utf8; stream_infos; uid = new_uid () } in 178 | let cont boff = 179 | empty_buffer infos boff 180 | in 181 | let rec fn boff = 182 | begin 183 | (* Tail rec exception trick to avoid stack overflow. 
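       The [try ... with] block returns a thunk that is applied outside of the
       handler (the [end ()] below), so the buffer record is only built once the
       exception handler has been popped.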
*) 184 | try 185 | let data = get_line file in 186 | let llen = String.length data in 187 | fun () -> 188 | { boff; data ; infos 189 | ; next = lazy (fn (boff + llen)) 190 | ; ctnr = [||] } 191 | with End_of_file -> 192 | fun () -> cont boff 193 | end () 194 | in 195 | fn 0 196 | 197 | let from_channel 198 | : ?utf8:context -> ?filename:string -> in_channel -> buffer = 199 | fun ?(utf8=Utf8.ASCII) ?(filename="") ch -> 200 | let filename = stream_infos_of_ch filename ch in 201 | from_fun utf8 filename input_buffer ch 202 | 203 | let from_fd 204 | : ?utf8:context -> ?filename:string -> Unix.file_descr -> buffer = 205 | fun ?(utf8=Utf8.ASCII) ?(filename="") fd -> 206 | let filename = stream_infos_of_fd filename fd (fun () -> Unix.close fd) in 207 | from_fun utf8 filename fd_buffer fd 208 | 209 | let from_file : ?utf8:context -> string -> buffer = 210 | fun ?(utf8=Utf8.ASCII) filename -> 211 | let fd = Unix.(openfile filename [O_RDONLY] 0) in 212 | let filename = stream_infos_of_fd filename fd (fun () -> Unix.close fd) in 213 | from_fun utf8 filename fd_buffer fd 214 | 215 | let from_string : ?utf8:context -> string -> buffer = 216 | fun ?(utf8=Utf8.ASCII) str -> 217 | let stream_infos = stream_infos_of_str str in 218 | let b = ref true in 219 | let string_buffer () = 220 | if !b then (b := false; str) else raise End_of_file 221 | in 222 | from_fun utf8 stream_infos string_buffer () 223 | 224 | let leq_buf {boff = b1} i1 {boff = b2} i2 = 225 | b1 < b2 || (b1 = b2 && (i1 <= i2)) 226 | 227 | let buffer_before b1 i1 b2 i2 = leq_buf b1 i1 b2 i2 228 | 229 | (** Table to associate value to positions in input buffers *) 230 | module Tbl = struct 231 | type 'a t = 'a Container.table 232 | 233 | let create = Container.create_table 234 | 235 | let ctnr buf idx = 236 | if buf.ctnr = [||] then 237 | buf.ctnr <- Array.make (llen buf + 1) None; 238 | let a = buf.ctnr.(idx) in 239 | match a with 240 | | None -> let c = Container.create () in buf.ctnr.(idx) <- Some c; c 241 | | Some c -> c 242 | 243 | let add tbl buf idx x = 244 | Container.add tbl (ctnr buf idx) x 245 | 246 | let find tbl buf idx = 247 | Container.find tbl (ctnr buf idx) 248 | 249 | let clear = Container.clear 250 | 251 | let iter : type a. a t -> (a -> unit) -> unit = fun tbl f -> 252 | Container.iter f tbl 253 | 254 | end 255 | -------------------------------------------------------------------------------- /lib/input.mli: -------------------------------------------------------------------------------- 1 | (** A module providing efficient input buffers with preprocessing. *) 2 | 3 | (** {2 Type} *) 4 | 5 | (** Information for a file for reopening *) 6 | type stream_infos = 7 | (* regular file, that can be reopened *) 8 | | File of { name : string 9 | ; length : int 10 | ; date : float 11 | ; close : unit -> unit 12 | } 13 | (* string *) 14 | | String of string 15 | (* stream: no detailed position available *) 16 | | Stream 17 | 18 | (** The abstract type for an input buffer. 
*) 19 | type buffer 20 | 21 | (** Type of fixed data attached to the buffer (like file name) *) 22 | type infos 23 | 24 | (** return the [infos] associated to a buffer *) 25 | val infos : buffer -> infos 26 | 27 | (** dummy infos *) 28 | val phantom_infos : infos 29 | 30 | (** returns the [stream_infos] stored in [infos] *) 31 | val stream_infos : infos -> stream_infos 32 | 33 | (** returns the filename if it exists, the empty string otherwise *) 34 | val filename : infos -> string 35 | 36 | (** [utf8 infos] return the unicode context in use for this file *) 37 | val utf8 : infos -> Utf8.context 38 | 39 | (** The abstract type position relative to the current buffer *) 40 | type idx 41 | 42 | (** position at the beginning of a buffer *) 43 | val init_idx : idx 44 | 45 | (** The abstract position relative to the beginning of buffer *) 46 | type byte_pos 47 | 48 | (** convert byte_pos to natural number *) 49 | val int_of_byte_pos : byte_pos -> int 50 | 51 | (** zero *) 52 | val init_byte_pos : byte_pos 53 | 54 | (** dummy value, to initiaize references for instance *) 55 | val phantom_byte_pos : byte_pos 56 | 57 | (** Short (and quick) type for positions *) 58 | type spos = infos * byte_pos 59 | 60 | (** dummy value, to initiaize references for instance *) 61 | val phantom_spos : spos 62 | 63 | (** {2 Reading from a buffer} *) 64 | 65 | (** [read buf idx] returns the character at position [idx] in the buffer [buf], 66 | together with the new buffer and position. Read infinitely many '\255' at end 67 | of buffer *) 68 | val read : buffer -> idx -> char * buffer * idx 69 | 70 | (** [sub b i len] returns [len] characters from position [idx]. If the end of 71 | buffer is reached, the string is filed with eof '\255' *) 72 | val sub : buffer -> idx -> int -> string 73 | 74 | (** [get buf idx] returns the character at position [idx] in the buffer 75 | [buf]. *) 76 | val get : buffer -> idx -> char 77 | 78 | (** {2 Creating a buffer} *) 79 | 80 | type context = Utf8.context 81 | 82 | (** [from_file fn] returns a buffer constructed using the file [fn]. 83 | 84 | If [utf8] is [Utf8.UTF8] or [Utf8.CJK_UTF8] ([Utf8.ASCII] is the default), 85 | positions are reported according to [utf8]. [read] is still reading bytes. 86 | 87 | Getting line number and column number requires rescanning the file and if 88 | the file is not a regular file, it is kept in memory. Setting [rescan] to 89 | false avoid this, but only byte position and file name will be available. *) 90 | val from_file : ?utf8:context -> string -> buffer 91 | 92 | (** [from_channel ~filename ch] returns a buffer constructed using the channel 93 | [ch]. The optional [filename] is only used as a reference to the channel in 94 | error messages. 95 | 96 | [uft8] and [rescan] as in [from_file]. *) 97 | val from_channel : ?utf8:context -> ?filename:string 98 | -> in_channel -> buffer 99 | 100 | (** Same as above for file descriptor *) 101 | val from_fd : ?utf8:context -> ?filename:string 102 | -> Unix.file_descr -> buffer 103 | 104 | (** For buffer represented by file, this will close the file *) 105 | val close: buffer -> unit 106 | 107 | (** [from_string ~filename str] returns a buffer constructed using the string 108 | [str]. The optional [filename] is only used as a reference to the channel in 109 | error messages. *) 110 | val from_string : ?utf8:context -> string -> buffer 111 | 112 | (** {2 Buffer manipulation functions} *) 113 | 114 | (** [is_empty buf] test whether the buffer [buf] is empty. 
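    Characters are pulled one at a time with [read] above; for example (a
    sketch, past the end of input ['\255'] is returned forever):
    {[
      let buf = from_string "ab" in
      let (c1, buf, idx) = read buf init_idx in   (* c1 = 'a' *)
      let (c2, _, _) = read buf idx in            (* c2 = 'b' *)
      ...
    ]}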
*) 115 | val is_empty : buffer -> int -> bool 116 | 117 | exception NoLineNorColumnNumber 118 | 119 | (** position in bytes, regardless to utf8 *) 120 | val byte_pos : buffer -> idx -> byte_pos 121 | 122 | (** get spos from buffer and idx, to get line_num and col_num if needed 123 | later. *) 124 | val spos : buffer -> idx -> spos 125 | 126 | (** [buffer_uid buf] returns a unique identifier. [Input.read] does not change 127 | the uid. The uid is created when creating the initial buffer. *) 128 | val buffer_uid : buffer -> int 129 | 130 | (** [buffer_eq b1 b2] tests the equality of [b1] and [b2]. *) 131 | val buffer_equal : buffer -> buffer -> bool 132 | 133 | (** [buffer_compare b1 b2] compares [b1] and [b2]. *) 134 | val buffer_compare : buffer -> buffer -> int 135 | 136 | (** [buffer_before b1 i1 b2 i2] returns true if the position [b1, i1] is before 137 | [b2, i2]. Gives meaningless result if [b1] and [b2] do not refer to the same 138 | file, i.e. do not have the same uid. *) 139 | val buffer_before : buffer -> int -> buffer -> int -> bool 140 | 141 | (** Table to associate value to positions in input buffers. The complexity of 142 | access in the table is O(ln(N)) where N is the number of tables. *) 143 | module Tbl : sig 144 | type 'a t 145 | 146 | val create : unit -> 'a t 147 | 148 | val add : 'a t -> buffer -> idx -> 'a -> unit 149 | 150 | val find : 'a t -> buffer -> idx -> 'a 151 | 152 | val clear : 'a t -> unit 153 | 154 | val iter : 'a t -> ('a -> unit) -> unit 155 | end 156 | -------------------------------------------------------------------------------- /lib/interpolate.ml: -------------------------------------------------------------------------------- 1 | 2 | let solve mat vector = 3 | let dim = Array.length vector in 4 | 5 | for i = 0 to dim - 2 do 6 | 7 | let pivot, pivot_val = 8 | let r = ref (-1, 0.0) in 9 | for j = i to dim - 1 do 10 | let v = abs_float mat.(j).(i) in 11 | if v > snd !r then r := (j, v) 12 | done; 13 | !r 14 | in 15 | if pivot = -1 then failwith "non invertible"; 16 | for j = i to dim-1 do 17 | let v = mat.(pivot).(j) in 18 | mat.(pivot).(j) <- mat.(i).(j); 19 | mat.(i).(j) <- v 20 | done; 21 | let v = vector.(pivot) in 22 | vector.(pivot) <- vector.(i); 23 | vector.(i) <- v; 24 | 25 | for j = i+1 to dim-1 do 26 | let v = mat.(j).(i) in 27 | mat.(j).(i) <- 0.0; 28 | for k = i+1 to dim-1 do 29 | mat.(j).(k) <- mat.(j).(k) -. v *. mat.(i).(k) /. pivot_val 30 | done; 31 | vector.(j) <- vector.(j) -. v *. vector.(i) /. pivot_val 32 | done; 33 | 34 | done; 35 | 36 | let r = Array.copy vector in 37 | 38 | for i = dim - 1 downto 0 do 39 | for j = i + 1 to dim - 1 do 40 | r.(i) <- r.(i) -. r.(j) *. mat.(i).(j) 41 | done; 42 | r.(i) <- r.(i) /. 
mat.(i).(i) 43 | done; 44 | 45 | r 46 | 47 | type 'a base = ('a -> float) array 48 | 49 | module type Base = sig 50 | type input 51 | val base : input base 52 | end 53 | 54 | module type Interpolation = sig 55 | type input 56 | type interpolation 57 | 58 | val get : interpolation -> float array 59 | 60 | val zero : interpolation 61 | 62 | val compute : interpolation -> input -> float 63 | 64 | val compute_coefs : (input * float) array -> interpolation 65 | 66 | val error : (input * float) array -> interpolation -> float 67 | 68 | val print : out_channel -> interpolation -> unit 69 | end 70 | 71 | module Make(B:Base) = struct 72 | 73 | let funs = B.base 74 | let dim = Array.length funs 75 | 76 | let get x = x 77 | 78 | let zero = Array.make dim 0.0 79 | 80 | type input = B.input 81 | type interpolation = float array 82 | 83 | let compute (coefs:float array) (x:input) = 84 | let r = ref 0.0 in 85 | for i = 0 to dim - 1 do 86 | r := !r +. coefs.(i) *. funs.(i)(x) 87 | done; 88 | !r 89 | 90 | let compute_coefs (samples:(input * float) array) = 91 | let ns = Array.length samples in 92 | let m i j = funs.(i)(fst samples.(j)) in 93 | let v j = snd samples.(j) in 94 | let a = 95 | Array.init dim (fun i -> 96 | Array.init dim (fun i' -> 97 | let r = ref 0.0 in 98 | for j = 0 to ns - 1 do 99 | r := !r +. m i j *. m i' j 100 | done; 101 | !r)) 102 | in 103 | let b = 104 | Array.init dim (fun i -> 105 | let r = ref 0.0 in 106 | for j = 0 to ns - 1 do 107 | r := !r +. m i j *. v j 108 | done; 109 | !r) 110 | in 111 | solve a b 112 | 113 | let print ch a = 114 | Printf.fprintf ch "("; 115 | for i = 0 to dim - 1 do 116 | Printf.fprintf ch "%s%e" (if i > 0 then ", " else "") a.(i) 117 | done; 118 | Printf.fprintf ch ")" 119 | 120 | let error (samples:(input * float) array) coefs = 121 | let nb = float (Array.length samples) in 122 | let error = Array.fold_left (fun e (n, x) -> 123 | let dx = (x -. compute coefs n) /. x in 124 | e +. dx *. dx) 0.0 samples 125 | in 126 | sqrt (error /. 
nb) 127 | 128 | end 129 | -------------------------------------------------------------------------------- /lib/interpolate.mli: -------------------------------------------------------------------------------- 1 | 2 | type 'a base = ('a -> float) array 3 | 4 | module type Base = sig 5 | type input 6 | val base : input base 7 | end 8 | 9 | module type Interpolation = sig 10 | type input 11 | type interpolation 12 | 13 | val get : interpolation -> float array 14 | 15 | val zero : interpolation 16 | 17 | val compute : interpolation -> input -> float 18 | 19 | val compute_coefs : (input * float) array -> interpolation 20 | 21 | val error : (input * float) array -> interpolation -> float 22 | 23 | val print : out_channel -> interpolation -> unit 24 | end 25 | 26 | module Make(B:Base) : Interpolation with type input = B.input 27 | -------------------------------------------------------------------------------- /lib/keywords.ml: -------------------------------------------------------------------------------- 1 | module type Spec = 2 | sig 3 | val id_charset : Charset.t 4 | val reserved : string list 5 | end 6 | 7 | let keyword_uid = ref 0 8 | 9 | module Make(S : Spec) = 10 | struct 11 | let reserved = Word_list.create () 12 | 13 | let uid = incr keyword_uid; !keyword_uid 14 | 15 | let mem : string -> bool = fun s -> 16 | Word_list.mem_ascii reserved s 17 | 18 | let reserve : string -> unit = fun s -> 19 | try Word_list.add_ascii reserved s () 20 | with Word_list.Already_bound -> invalid_arg "already reserved" 21 | 22 | let _ = List.iter reserve S.reserved 23 | 24 | let check : string -> unit = fun s -> 25 | if mem s then Lex.give_up () 26 | 27 | let special : string -> unit Grammar.t = fun s -> 28 | if s = "" then invalid_arg "empty word"; 29 | let fn str pos = 30 | let str = ref str in 31 | let pos = ref pos in 32 | for i = 0 to String.length s - 1 do 33 | let (c, str', pos') = Input.read !str !pos in 34 | if c <> s.[i] then Lex.give_up (); 35 | str := str'; pos := pos' 36 | done; 37 | let c = Input.get !str !pos in 38 | if Charset.mem S.id_charset c then Lex.give_up (); 39 | ((), !str, !pos) 40 | in 41 | let n = Printf.sprintf "%S" s in 42 | Grammar.term { n; f = fn ; a = Keyword(s,uid); 43 | c = Charset.singleton s.[0] } 44 | 45 | let create : string -> unit Grammar.t = fun s -> 46 | if mem s then invalid_arg "keyword already defined"; 47 | reserve s; special s 48 | end 49 | -------------------------------------------------------------------------------- /lib/keywords.mli: -------------------------------------------------------------------------------- 1 | (** Signature of a module for keyword properties. *) 2 | module type Spec = 3 | sig 4 | (** [id_charset] contains the characters that are not allowed to directly 5 | follow a keyword (because it would make it an identifier for example). 6 | In general, it should correspond to characters that are can be used in 7 | identifiers. *) 8 | val id_charset : Charset.t 9 | 10 | (** [reserved] is a liste of words that must be rejected as identifiers as 11 | if they were keywords. Note that if a word is in the list, it will not 12 | be possible to define it as a keyword. *) 13 | val reserved : string list 14 | end 15 | 16 | module Make(S : Spec) : 17 | sig 18 | (** [reserve s] reserves the word [s] as if it was a keyword, although it 19 | will not be defined as one. The exception [Invalid_argument] is raised 20 | if the word [s] is already reserved. *) 21 | val reserve : string -> unit 22 | 23 | (** [mem s] tests whether [s] is a reserved word or not. 
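    A typical instantiation looks like this (a sketch; [id_chars] stands for a
    [Charset.t] describing the characters allowed in identifiers):
    {[
      module K = Keywords.Make(struct
          let id_charset = id_chars
          let reserved = []   (* extra words to reject as identifiers, if any *)
        end)

      let if_kw : unit Grammar.t = K.create "if"
      let then_kw : unit Grammar.t = K.create "then"
    ]}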
*) 24 | val mem : string -> bool 25 | 26 | (** [check s] calls [Earley.give_up ()] if [s] is a reserved word. It can 27 | be used to reject keywords while parsing identifiers for example. *) 28 | val check : string -> unit 29 | 30 | (** [create s] reserves the keyword [s] and returns a parser accepting the 31 | string [s], not followed by a character of [S.id_charset]. In the case 32 | where [s] is already reserved, [Invalid_argument] is raised. *) 33 | val create : string -> unit Grammar.t 34 | 35 | (** [special s] accpets the same input as [create s], but the word [s] is 36 | not reserved. *) 37 | val special : string -> unit Grammar.t 38 | end 39 | -------------------------------------------------------------------------------- /lib/lex.mli: -------------------------------------------------------------------------------- 1 | (** {1 Lexing: grouping characters before parsing} 2 | 3 | It is traditionnal to do parsing in two phases (scanning/parsing). This is 4 | not necessary with combinators in general (scannerless). This is still true 5 | with Pacomb. However, this makes the grammar more readable to use a lexing 6 | phase. 7 | 8 | Moreover, lexing is often done with a longuest match rule that is not 9 | semantically equivalent to the semantics of context free grammar. 10 | 11 | This modules provide combinator to create terminals that the parser will 12 | call. 13 | 14 | *) 15 | 16 | (** {2 Types and exception} *) 17 | 18 | (** Position in a buffer is a [Input.buffer] together with an index 19 | [Input.pos]. *) 20 | type buf = Input.buffer 21 | type idx = Input.idx 22 | 23 | (** Type of terminal function, similar to blank, but with a returned value *) 24 | type 'a lexeme = buf -> idx -> 'a * buf * idx 25 | 26 | (** ast for terminals, needed for equality *) 27 | type _ ast = 28 | | Any : char ast 29 | | Any_utf8 : Uchar.t ast 30 | | Any_grapheme : string ast 31 | | Eof : unit ast 32 | | Char : char -> unit ast 33 | | Grapheme : string -> unit ast 34 | | String : string -> unit ast 35 | | Nat : int ast 36 | | Int : int ast 37 | | Float : float ast 38 | | CharLit : char ast 39 | | StringLit : string ast 40 | | Test : (char -> bool) -> char ast 41 | | NotTest : (char -> bool) -> unit ast 42 | | Seq : 'a t * 'b t * ('a -> 'b -> 'c) * 'c Assoc.key -> 'c ast 43 | | Alt : 'a t * 'a t -> 'a ast 44 | | Save : 'a t * (string -> 'a -> 'b) * 'b Assoc.key -> 'b ast 45 | | Option : 'a * 'a t -> 'a ast 46 | | Appl : 'a t * ('a -> 'b) * 'b Assoc.key -> 'b ast 47 | | Star : 'a t * (unit -> 'b) * ('b -> 'a -> 'b) * 'b Assoc.key -> 'b ast 48 | | Plus : 'a t * (unit -> 'b) * ('b -> 'a -> 'b) * 'b Assoc.key -> 'b ast 49 | | Sub : 'a t * ('a -> bool) * 'a Assoc.key -> 'a ast 50 | | Keyword : string * int -> unit ast 51 | | Custom : 'a lexeme * 'a Assoc.key -> 'a ast 52 | 53 | (** The previous types encapsulated in a record *) 54 | and 'a terminal = { n : string (** name *) 55 | ; f : 'a lexeme (** the terminal itself *) 56 | ; a : 'a ast 57 | ; c : Charset.t (** the set of characters accepted 58 | at the beginning of input *) } 59 | 60 | (** Abbreviation *) 61 | and 'a t = 'a terminal 62 | 63 | (** exception when failing, 64 | - can be raised (but not captured) by terminals 65 | - can be raised (but not captured) by action code in the grammar, see 66 | [Combinator.give_up] 67 | - will be raised and captured by [Combinator.parse_buffer] that will give 68 | the most advanced position *) 69 | exception NoParse 70 | 71 | (** from action ony may give an error message when rejecting a rule *) 72 | exception Give_up 
of string 73 | 74 | (** [give_up ()] rejects parsing from a corresponding semantic action. An error 75 | message can be provided. Can be used both in the semantics of terminals and 76 | parsing rules. *) 77 | val give_up : ?msg:string -> unit -> 'a 78 | 79 | (** {2 Combinators to create terminals} *) 80 | 81 | (** accept any character, except eof*) 82 | val any : ?name:string -> unit -> char t 83 | 84 | (** Terminal accepting the end of a buffer only. remark: [eof] is automatically 85 | added at the end of a grammar by [Combinator.parse_buffer]. 86 | [name] default is ["EOF"] *) 87 | val eof : ?name:string -> unit -> unit t 88 | 89 | (** Terminal accepting a given char, remark: [char '\255'] is equivalent to 90 | [eof]. 91 | [name] default is the given charater. *) 92 | val char : ?name:string -> char -> unit t 93 | 94 | (** Accept any character for which the test returns [true]. 95 | [name] default to the result of [Charset.show]. *) 96 | val test : ?name:string -> (char -> bool) -> char t 97 | 98 | (** Accept a character in the given charset. [name] default as in [test] *) 99 | val charset : ?name:string -> Charset.t -> char t 100 | 101 | (** Reject the input (raises [Noparse]) if the first character of the input 102 | passed the test. Does not read the character if the test fails. 103 | [name] default to ["^"] prepended to the result of [Charset.show]. *) 104 | val not_test : ?name:string -> (char -> bool) -> unit t 105 | 106 | (** Reject the input (raises [Noparse]) if the first character of the input is 107 | in the charset. Does not read the character if not in the charset. 108 | [name] default as in [not_test] *) 109 | val not_charset : ?name:string -> Charset.t -> unit t 110 | 111 | (** Does a test on the result of a given lexer and reject if it returns 112 | false. You may provide a restricted charset for the set of charaters 113 | accepted in the initial position. *) 114 | val sub : ?name:string -> ?charset:Charset.t -> 'a t -> ('a -> bool) -> 'a t 115 | 116 | (** Compose two terminals in sequence. [name] default is the concatenation of 117 | the two names. *) 118 | val seq : ?name:string -> 'a t -> 'b t -> ('a -> 'b -> 'c) -> 'c t 119 | 120 | (** variation on the above *) 121 | val seq1 : ?name:string -> 'a t -> 'b t -> 'a t 122 | val seq2 : ?name:string -> 'a t -> 'b t -> 'b t 123 | val seqs : 'a t list -> ('a -> 'a -> 'a) -> 'a t 124 | 125 | (** [save t f] save the part of the input parsed by the terminal [t] and combine 126 | it with its semantics using [f] *) 127 | val save : ?name:string -> 'a t -> (string -> 'a -> 'b) -> 'b t 128 | 129 | (** [alt t1 t2] parses the input with [t1] or [t2]. Contrary to grammars, 130 | terminals does not use continuations, if [t1] succeds, no backtrack will be 131 | performed to try [t2]. For instance, 132 | {[seq1 (alt (char 'a' ()) 133 | (seq1 (char 'a' ()) (char 'b' ()))) 134 | (char 'b' ())]} 135 | will reject "ab". 136 | If both [t1] and [t2] accept the input, longuest match is selected. 137 | [name] default to [sprintf "(%s)|(%s)" t1.n t2.n]. *) 138 | val alt : ?name:string -> 'a t -> 'a t -> 'a t 139 | val alts : 'a t list -> 'a t 140 | 141 | (** [option x t] parses the given terminal 0 or 1 time. [x] is returned if 0. 142 | [name] defaults to [sprintf "(%s)?" t.n]. *) 143 | val option : ?name:string -> 'a -> 'a t -> 'a t 144 | 145 | (** Applies a function to the result of the given terminal. 146 | [name] defaults to the terminal name. 
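    These combinators compose: for instance, a terminal for identifiers that
    returns the matched text can be sketched as follows ([first_chars] and
    [rest_chars] stand for suitable [Charset.t] values):
    {[
      let ident : string t =
        save
          (seq (charset first_chars)
             (star (charset rest_chars) (fun () -> ()) (fun () _ -> ()))
             (fun _ _ -> ()))
          (fun text () -> text)
    ]}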
*) 147 | val appl : ?name:string -> ('a -> 'b) -> 'a t -> 'b t 148 | 149 | (** [star t a f] Repetition of a given terminal 0,1 or more times. The type of 150 | function to compose the action allows for ['b = Buffer.t] for 151 | efficiency. The returned value is [f ( ... (f(f (a ()) x_1) x_2) ...) x_n] 152 | if [t] returns [x_1] ... [x_n]. 153 | [name] defaults to [sprintf "(%s)*" t.n] *) 154 | val star : ?name:string -> 'a t -> (unit -> 'b) -> ('b -> 'a -> 'b) -> 'b t 155 | 156 | (** Same as above but parses at least once.*) 157 | val plus : ?name:string -> 'a t -> (unit -> 'b) -> ('b -> 'a -> 'b) -> 'b t 158 | 159 | (** [string s] Accepts only the given string. 160 | Raises [Invalid_argument] if [s = ""]. 161 | [name] defaults to [sprintf "%S" s]. *) 162 | val string : ?name:string -> string -> unit t 163 | 164 | (** Parses an natural in base 10. ["-42"] and ["-42"] are not accepted. 165 | [name] defaults to ["NAT"] *) 166 | val nat : ?name:string -> unit -> int t 167 | 168 | (** Parses an integer in base 10. ["+42"] is accepted. 169 | [name] defaults to ["INT"] *) 170 | val int : ?name:string -> unit -> int t 171 | 172 | (** Parses a float in base 10. [".1"] is accepted as ["0.1"] 173 | [name] defaults to ["FLOAT"] *) 174 | val float : ?name:string -> unit -> float t 175 | 176 | (** Parses a char litteral 'c' using ocaml escaping convention 177 | [name] defaults to ["CHARLIT"] *) 178 | val char_lit : ?name:string -> unit -> char t 179 | 180 | (** Parses a string litteral "cccc" using ocaml escaping convention 181 | [name] defaults to ["STRINGLIT"] *) 182 | val string_lit : ?name:string -> unit -> string t 183 | 184 | (** Parses a unicode UTF8 char 185 | [name] defaults to ["UTF8"] *) 186 | val any_utf8 : ?name:string -> unit -> Uchar.t t 187 | 188 | (** [utf8 c] parses a specific unicode char and returns [()], 189 | [name] defaults to the string representing the char *) 190 | val utf8 : ?name:string -> Uchar.t -> unit t 191 | 192 | (** Parses any utf8 grapheme. 193 | [name] defaults to ["GRAPHEME"] *) 194 | val any_grapheme : ?name:string -> unit -> string t 195 | 196 | (** [grapheme s] parses the given utf8 grapheme and return [()]. 197 | The difference with [string s x] is that if the input starts 198 | with a grapheme [s'] such that [s] is a strict prefix of [s'], 199 | parsing will fail. 200 | [name] defaults to ["GRAPHEME("^s^")"] *) 201 | val grapheme : ?name:string -> string -> unit t 202 | 203 | (** Test wether a terminal accept the empty string. Such a terminal are illegal 204 | in a grammar, but may be used in combinator below to create terminals *) 205 | val accept_empty : 'a t -> bool 206 | 207 | (** Test constructor for the test constructor in [Grammar] *) 208 | val test_from_lex : bool t -> buf -> idx -> buf -> idx -> bool 209 | val blank_test_from_lex : bool t -> buf -> idx -> buf -> idx -> bool 210 | 211 | (** equality, incomplete in particular for "alt" *) 212 | val eq : 'a t -> 'b t -> ('a,'b) Assoc.eq 213 | 214 | (** If you build custom lexeme, you need to use this to fill the 215 | [a] field of the record *) 216 | val custom : 'a lexeme -> 'a ast 217 | 218 | (** where to put it ... 
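    As an illustration of [custom] above, a hand-written terminal accepting a
    single ['a'] character could be built directly as a record (a sketch):
    {[
      let a_term : unit terminal =
        let f buf idx =
          let (c, buf, idx) = Input.read buf idx in
          if c <> 'a' then give_up ();
          ((), buf, idx)
        in
        { n = "'a'"; f; a = custom f; c = Charset.singleton 'a' }
    ]}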
*) 219 | val default : 'a -> 'a option -> 'a 220 | -------------------------------------------------------------------------------- /lib/pos.ml: -------------------------------------------------------------------------------- 1 | (** Functions managing positions *) 2 | 3 | (** Position in AST resulting from parsetree may be a non negligible part of 4 | parsing complexity, both in speed and space. For instance: - computing column 5 | number in Unicode is costly - all information about a position takes a lot of 6 | machine words (up to 10) 7 | 8 | Moreover, in most cases, position are only used in error messages! 9 | 10 | Pacomb proposes to use three levels of positions to minimise the costs: 11 | 12 | 1°) the type Pos.t = Input.spos is the type return to action. It is a 13 | position in bytes together with some information about the file. 14 | 15 | 2°) type type 'a located (or pos = unit located) is some data decorated with 16 | a start and end position. This is the type recommended to be used in AST. it 17 | can be constructed by the in_pos, no_pos and mk_pos functions below. Each 18 | AST node located with a position costs 5 words. 19 | 20 | 3°) the type pos_info contain all the information about a position. It is 21 | produced by rescanning the file (or the string). If one parses a stream which 22 | is not a regular file, or if the regular file is no more available, the 23 | position is not really useful and pacomb will still print the byte position. 24 | *) 25 | 26 | (** byte position from input, type type returned by parsing combinator *) 27 | type spos = Input.spos 28 | 29 | (** Type to represent data with a position *) 30 | type pos = 31 | { offset_start : int 32 | ; offset_end : int 33 | ; infos : Input.infos } 34 | 35 | let mk_pos s e infos = 36 | { offset_start = Input.int_of_byte_pos s 37 | ; offset_end = Input.int_of_byte_pos e 38 | ; infos } 39 | 40 | (** merging of two positions, create the smallest position pair containing both 41 | *) 42 | let merge : pos -> pos -> pos = fun p1 p2 -> 43 | { p1 with 44 | offset_start = min p1.offset_start p2.offset_end 45 | ; offset_end = max p1.offset_end p2.offset_end } 46 | 47 | 48 | let phantom_pos = 49 | { offset_start = -1 50 | ; offset_end = -1 51 | ; infos = Input.phantom_infos } 52 | let no_pos = phantom_pos 53 | 54 | let has_pos p = 55 | p.offset_start <> -1 && 56 | p.offset_end <> -1 57 | 58 | let file_cache = Hashtbl.create 32 59 | let str_cache = Hashtbl.create 32 60 | 61 | (* The original file is not available *) 62 | exception No_detailed_position 63 | 64 | (* The file is available but has changed *) 65 | exception File_changed 66 | 67 | let init_file_cache file_name length date = 68 | try 69 | let ch = open_in file_name in 70 | let open Unix in 71 | let s = fstat (descr_of_in_channel ch) in 72 | if s.st_size <> length || s.st_mtime <> date then raise File_changed; 73 | let r =(ch, ref []) in 74 | Hashtbl.add file_cache file_name r; 75 | r 76 | with 77 | | Sys_error _ -> raise No_detailed_position 78 | 79 | let init_str_cache str = 80 | let r =ref [] in 81 | Hashtbl.add str_cache str r; 82 | r 83 | 84 | type pos_info = 85 | { start_line : int 86 | ; start_col : int 87 | ; start_line_offset : int 88 | ; start_byte : int 89 | ; end_line : int 90 | ; end_col : int 91 | ; end_line_offset : int 92 | ; end_byte : int 93 | ; file_name : string 94 | ; text : string } 95 | 96 | let cache_interval = 1024 97 | 98 | let ch_sub_string ch n p = 99 | seek_in ch n; 100 | let s = p-n in 101 | let buf = Bytes.create s in 102 | really_input ch 
buf 0 s; 103 | Bytes.unsafe_to_string buf 104 | 105 | let ch_col_num ch utf8 n p = 106 | if utf8 = Utf8.ASCII then p - n 107 | else 108 | begin 109 | let str = ch_sub_string ch n p in 110 | (*Printf.printf "text: %s\n%!" str;*) 111 | Utf8.length utf8 str 112 | end 113 | 114 | let str_col_num str utf8 n p = 115 | if utf8 = Utf8.ASCII then p - n 116 | else 117 | begin 118 | let str = String.sub str n (p-n) in 119 | Utf8.length utf8 str 120 | end 121 | 122 | let pos_info 123 | : ?relocate:(string -> string) -> ?text:bool -> pos -> pos_info 124 | = fun ?(relocate=fun x -> x) ?(text=false) pos -> 125 | let file_name, cache, seek_in, input_char, col_num, sub_string = 126 | match Input.stream_infos pos.infos with 127 | | File { name; length; date } -> 128 | let name0 = relocate name in 129 | let (ch, cache) = try Hashtbl.find file_cache name0 130 | with Not_found -> init_file_cache name0 length date 131 | in 132 | let seek_in = seek_in ch in 133 | let input_char () = input_char ch in 134 | (name, cache, seek_in, input_char, ch_col_num ch, ch_sub_string ch) 135 | | String str -> 136 | let cache = try Hashtbl.find str_cache str 137 | with Not_found -> init_str_cache str 138 | in 139 | let pos = ref 0 in 140 | let seek_in n = pos := n in 141 | let input_char () = 142 | if !pos >= String.length str then raise End_of_file; 143 | let x = !pos in incr pos; 144 | str.[x] 145 | in 146 | ("", cache, seek_in, input_char, str_col_num str, String.sub str) 147 | | Stream -> 148 | raise No_detailed_position 149 | in 150 | let n0 = pos.offset_start in 151 | let n1 = pos.offset_end in 152 | if n0 = -1 || n1 = -1 then raise No_detailed_position; 153 | let utf8 = Input.utf8 pos.infos in 154 | let rec fn n = function 155 | | (p, _ , _)::ls when p > n -> fn n ls 156 | | c ::_ -> c 157 | | [] -> (0, 0, 1) 158 | in 159 | let rec gn n p lo ln = 160 | if p mod cache_interval = 0 then cache := (p, lo, ln) :: !cache; 161 | if p = n then (lo, ln) else 162 | begin 163 | assert(p < n); 164 | let c = try input_char () with End_of_file -> assert false in 165 | (*Printf.printf "%d %d %d %d %C\n" n p lo ln c;*) 166 | let p = p + 1 in 167 | if c = '\n' then gn n p p (ln + 1) 168 | else gn n p lo ln 169 | end 170 | in 171 | let to_eol n = 172 | let rec fn n = 173 | try 174 | let c = input_char () in 175 | if c = '\n' then n else fn (n+1) 176 | with 177 | End_of_file -> n 178 | in 179 | seek_in n; 180 | fn n 181 | in 182 | let (start_line_offset, start_line) = 183 | let (p, lo, ln) = fn n0 !cache in 184 | seek_in p; 185 | gn n0 p lo ln 186 | in 187 | let (end_line_offset , end_line) = 188 | let (p, lo, ln) = fn n1 !cache in 189 | seek_in p; 190 | gn n1 p lo ln 191 | in 192 | let start_col = col_num utf8 start_line_offset n0 in 193 | let end_col = col_num utf8 end_line_offset n1 in 194 | let text = if text then sub_string start_line_offset (to_eol n1) 195 | else "" 196 | in 197 | { start_line; start_col; start_line_offset 198 | ; end_line ; end_col ; end_line_offset 199 | ; start_byte = n0; end_byte = n1 200 | ; file_name ; text } 201 | 202 | (* quote functions *) 203 | type quote = 204 | { numbers : bool 205 | ; prefix : string 206 | ; header : string 207 | ; footer : string 208 | ; enlight : string -> string } 209 | 210 | type style = 211 | | OCaml 212 | | Short 213 | 214 | let decorate : int -> string -> string = fun width s -> 215 | if s = "" then 216 | String.make width '=' 217 | else 218 | let n = width - String.length s in 219 | let n1 = n / 2 in 220 | let n2 = n - n1 - 2 in 221 | String.make n1 '=' ^ " " ^ s ^ " " ^ 
String.make n2 '=' 222 | 223 | let _red : string -> string = 224 | fun s -> "\027[0m\027[31m" ^ s ^ "\027[0m" 225 | 226 | let ulined : string -> string = 227 | fun s -> "\027[0m\027[4m" ^ s ^ "\027[0m" 228 | 229 | let default_quote = 230 | { numbers = true 231 | ; prefix = "" 232 | ; header = "" 233 | ; footer = "" 234 | ; enlight = ulined } 235 | 236 | let quote_text : quote -> Format.formatter -> pos_info -> unit = 237 | fun quote ch pos -> 238 | let open Format in 239 | if pos.text = "" then () else 240 | let lines = String.split_on_char '\n' pos.text in 241 | let start = pos.start_line in 242 | (*Printf.printf "start %d offset %d text: %s\n%!" start offset pos.text;*) 243 | let max_num = 244 | String.length (string_of_int (List.length lines + start - 1)) 245 | in 246 | let last = List.length lines - 1 in 247 | let print i line = 248 | let line = 249 | if i = 0 && i = last then 250 | begin 251 | let byte_offset1 = pos.start_byte - pos.start_line_offset in 252 | let byte_offset2 = pos.end_byte - pos.end_line_offset in 253 | let s1 = byte_offset2 - byte_offset1 in 254 | let s2 = String.length line - byte_offset2 in 255 | String.sub line 0 byte_offset1 ^ 256 | quote.enlight (String.sub line byte_offset1 s1) ^ 257 | String.sub line byte_offset2 s2 258 | 259 | end 260 | else if i = 0 then 261 | begin 262 | let byte_offset = pos.start_byte - pos.start_line_offset in 263 | let s = String.length line - byte_offset in 264 | String.sub line 0 byte_offset ^ 265 | quote.enlight (String.sub line byte_offset s) 266 | end 267 | else if i = last then 268 | begin 269 | let byte_offset = pos.end_byte - pos.end_line_offset in 270 | let s = String.length line - byte_offset in 271 | quote.enlight (String.sub line 0 byte_offset) ^ 272 | String.sub line byte_offset s 273 | end 274 | else quote.enlight line 275 | in 276 | let number = 277 | if quote.numbers then 278 | let num = string_of_int (i + start) in 279 | let pad = String.make (max_num - String.length num) ' ' in 280 | pad ^ num ^ "|" 281 | else "" 282 | in 283 | fprintf ch "%s%s%s\n" quote.prefix number line 284 | in 285 | if quote.header <> "" then fprintf ch "%s\n" quote.header; 286 | List.iteri print lines; 287 | if quote.footer <> "" then 288 | fprintf ch "%s%s" quote.prefix quote.footer 289 | 290 | let print_spos ?(style=OCaml) () ch ((infos,n):spos) = 291 | let open Format in 292 | if n = Input.phantom_byte_pos then 293 | fprintf ch "NO POSITION" 294 | else 295 | let n = Input.int_of_byte_pos n in 296 | let name = Input.filename infos in 297 | if name = "" then 298 | let format : (_,_,_) format = match style with 299 | | OCaml -> "character %d" 300 | | Short -> "%d" 301 | in 302 | fprintf ch format n 303 | else 304 | let format : (_,_,_) format = match style with 305 | | OCaml -> "File %S, character %d" 306 | | Short -> "%S:%d" 307 | in 308 | fprintf ch format name n 309 | 310 | let print_pos ?(style=OCaml) () ch pos = 311 | let open Format in 312 | let n1 = pos.offset_start in 313 | let n2 = pos.offset_end in 314 | if n1 = -1 || n2 = -1 then 315 | fprintf ch "NO POSITION" 316 | else 317 | let name = Input.filename pos.infos in 318 | if name = "" then 319 | let format : (_,_,_) format = match style with 320 | | OCaml -> "character %d-%d" 321 | | Short -> "%d-%d" 322 | in 323 | fprintf ch format n1 n2 324 | else 325 | let format : (_,_,_) format = match style with 326 | | OCaml -> "File %S, character %d to %d" 327 | | Short -> "%S:%d:%d" 328 | in 329 | fprintf ch format name n1 n2 330 | 331 | let print_pos_info ?(style=OCaml) ?quote () ch 
(pos:pos_info) = 332 | let open Format in 333 | let str_pos = 334 | if pos.file_name = "" then 335 | if pos.start_line = pos.end_line then 336 | if pos.start_col = pos.end_col then 337 | let format : (_,_,_) format = match style with 338 | | OCaml -> "line %d, characters %d" 339 | | Short -> "%d:%d" 340 | in 341 | sprintf format pos.start_line pos.start_col 342 | else 343 | let format : (_,_,_) format = match style with 344 | | OCaml -> "line %d, characters %d-%d" 345 | | Short -> "%d:%d-%d" 346 | in 347 | sprintf format pos.start_line pos.start_col pos.end_col 348 | else 349 | let format : (_,_,_) format = match style with 350 | | OCaml -> "line %d, character %d - line %d, character %d" 351 | | Short -> "%d:%d-%d:%d" 352 | in 353 | sprintf format pos.start_line 354 | pos.start_col pos.end_line pos.end_col 355 | else 356 | if pos.start_line = pos.end_line then 357 | if pos.start_col = pos.end_col then 358 | let format : (_,_,_) format = match style with 359 | | OCaml -> "File %S, line %d, characters %d" 360 | | Short -> "%S:%d:%d" 361 | in 362 | sprintf format pos.file_name pos.start_line pos.start_col 363 | else 364 | let format : (_,_,_) format = match style with 365 | | OCaml -> "File %S, line %d, characters %d-%d" 366 | | Short -> "%S:%d:%d-%d" 367 | in 368 | sprintf format pos.file_name pos.start_line 369 | pos.start_col pos.end_col 370 | else 371 | let format : (_,_,_) format = match style with 372 | | OCaml -> "File %S, line %d, character %d - line %d, character %d" 373 | | Short -> "%S:%d:%d-%d:%d" 374 | in 375 | sprintf format pos.file_name pos.start_line 376 | pos.start_col pos.end_line pos.end_col 377 | in 378 | match quote with 379 | | None -> fprintf ch "%s" str_pos 380 | | Some q -> quote_text { q with header = decorate 79 str_pos; 381 | footer = decorate 70 "" } 382 | ch pos 383 | 384 | let print_spos ?(style=OCaml) ?quote () ch (infos,offset as p) = 385 | try 386 | let p2 = mk_pos offset offset infos in 387 | print_pos_info ~style ?quote () ch (pos_info ~text:(quote<>None) p2) 388 | with No_detailed_position -> 389 | print_spos ~style () ch p 390 | 391 | let print_pos ?(style=OCaml) ?quote () ch p = 392 | try 393 | print_pos_info ~style ?quote () ch (pos_info ~text:(quote<>None) p) 394 | with No_detailed_position -> 395 | print_pos ~style () ch p 396 | 397 | let print_buf_pos ?(style=OCaml) ?quote () ch (buf,idx) = 398 | print_spos ~style ?quote () ch (Input.spos buf idx) 399 | 400 | (** exception returned by the parser *) 401 | exception Parse_error of Input.buffer * Input.idx * string list 402 | 403 | let fail_no_parse (_:exn) = exit 1 404 | 405 | (** A helper to handle exceptions *) 406 | let handle_exception ?(error=fail_no_parse) ?(style=OCaml) f a = 407 | try f a with Parse_error(buf, pos, msgs) as e -> 408 | let red fmt = "\027[31m" ^^ fmt ^^ "\027[0m%!" in 409 | Format.eprintf (red "Parse error: %a.\n%!") 410 | (print_buf_pos ~style ()) (buf, pos); 411 | if msgs <> [] then 412 | begin 413 | let open Format in 414 | let prl ch l = List.iter (fprintf ch "%s@ ") l in 415 | eprintf "@[expecting:@ %a@]@." 
prl msgs 416 | end; 417 | error e 418 | -------------------------------------------------------------------------------- /lib/pos.mli: -------------------------------------------------------------------------------- 1 | (** {1 Functions managing positions} *) 2 | 3 | (** short position *) 4 | type spos = Input.spos 5 | 6 | (** Type to represent an interval between two positions *) 7 | type pos = 8 | { offset_start : int 9 | ; offset_end : int 10 | ; infos : Input.infos } 11 | 12 | val mk_pos : Input.byte_pos -> Input.byte_pos -> Input.infos -> pos 13 | 14 | (** a phantom position, used for grammars accepting the empty input, and other 15 | reference initialisation *) 16 | val phantom_pos : pos 17 | val no_pos : pos 18 | 19 | (** returns false on [phantom_pos/no_pos] *) 20 | val has_pos : pos -> bool 21 | 22 | (** build the smallest position containing both positions *) 23 | val merge : pos -> pos -> pos 24 | 25 | (** Fully informed positions *) 26 | type pos_info = 27 | { start_line : int 28 | ; start_col : int 29 | ; start_line_offset : int 30 | ; start_byte : int 31 | ; end_line : int 32 | ; end_col : int 33 | ; end_line_offset : int 34 | ; end_byte : int 35 | ; file_name : string 36 | ; text : string } 37 | 38 | (** Function to recover the full position from the information stored in located 39 | data. 40 | 41 | The optional parameter [~relocate], which is the identity by default, allows to 42 | deal with situations like a file moved since the position was produced, or a file 43 | name stored as a relative path. The file_name in the pos_info is not 44 | affected by [relocate] which is only used to reopen the file. 45 | 46 | The optional parameter [~text] (false by default) will fill the [text] 47 | field within the position if it is true. Otherwise the [text] field is set to 48 | the empty string. 49 | 50 | This function may raise [No_detailed_position] if the file cannot be 51 | reopened or [File_changed] if the file is present but its last 52 | modification time or size changed. *) 53 | exception No_detailed_position 54 | exception File_changed 55 | 56 | val pos_info 57 | : ?relocate:(string -> string) -> ?text:bool -> pos -> pos_info 58 | 59 | (** configuration for quoting files *) 60 | type quote = 61 | { numbers : bool (** include line numbers *) 62 | ; prefix : string (** prefix added after each newline but not added for the 63 | first printed line *) 64 | ; header : string (** header, added as first line if non empty *) 65 | ; footer : string (** footer, added as last line if non empty *) 66 | ; enlight : string -> string 67 | (** used to transform the quoted part of the printed 68 | lines *) 69 | } 70 | 71 | (** default quote: 72 | {[ 73 | let ulined : string -> string = 74 | fun s -> "\027[0m\027[4m" ^ s ^ "\027[0m" 75 | 76 | let default_quote = 77 | { numbers = true 78 | ; prefix = "" 79 | ; header = "" 80 | ; footer = "" 81 | ; enlight = ulined } 82 | ]} 83 | *) 84 | val default_quote : quote 85 | 86 | (** Style for printing positions: *) 87 | type style = OCaml (** like OCaml *) 88 | | Short (** like gcc *) 89 | 90 | (** printing for the three kinds of positions, and the current position 91 | of a buffer. *) 92 | val print_pos_info : ?style:style -> ?quote:quote -> unit 93 | -> Format.formatter -> pos_info -> unit 94 | (** The three functions below will print only the byte_pos if the file 95 | cannot be reopened. The exception File_changed is raised if the file 96 | changed since last opening. 97 | 98 | If quote is given, the file is quoted.
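    As a minimal sketch (assuming a value [pos : pos] obtained from a previous
    parse, printed to [Format.std_formatter]):
    {[
      (* OCaml-style location, with the source text quoted and underlined *)
      print_pos ~style:OCaml ~quote:default_quote () Format.std_formatter pos
    ]}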
99 | *) 100 | val print_pos : ?style:style -> ?quote:quote -> unit 101 | -> Format.formatter -> pos -> unit 102 | val print_spos : ?style:style -> ?quote:quote -> unit 103 | -> Format.formatter -> spos -> unit 104 | val print_buf_pos : ?style:style -> ?quote:quote -> unit 105 | -> Format.formatter -> (Input.buffer * Input.idx) -> unit 106 | 107 | (** Exception raised when parsing fails *) 108 | exception Parse_error of Input.buffer * Input.idx * string list 109 | 110 | (** [handle_exception fn v] applies the function [fn] to [v] and handles the 111 | [Parse_error] exception. In particular, a parse error message is presented 112 | to the user in case of a failure, then [error e] is called where e is the 113 | raised exception. The default [error] is [fun _ -> exit 1]. [raise] is 114 | another possibility. *) 115 | val handle_exception : ?error:(exn -> 'b) -> ?style:style 116 | -> ('a -> 'b) -> 'a -> 'b 117 | -------------------------------------------------------------------------------- /lib/regexp.ml: -------------------------------------------------------------------------------- 1 | (** Type of a regular expression. *) 2 | type regexp = 3 | | Chr of char (** Single character. *) 4 | | Set of Charset.t (** Any character in a charset. *) 5 | | Seq of regexp list (** Sequence of regexps. *) 6 | | Alt of regexp list (** Alternative between regexps. *) 7 | | Opt of regexp (** Optional regexp. *) 8 | | Str of regexp (** Zero or more times the regexp. *) 9 | | Pls of regexp (** One or more times the regexp. *) 10 | | Sav of regexp (** Save the matching string. *) 11 | 12 | (** Short synonym of {!type:regexp}. *) 13 | type t = regexp 14 | 15 | (** [pp ff re] outputs the regexp [re] to the formatter [ff]. *) 16 | let rec pp : Format.formatter -> t -> unit = fun ff re -> 17 | let pp_sep ff _ = Format.pp_print_string ff ";" in 18 | let pp_list ff = Format.pp_print_list ~pp_sep pp ff in 19 | match re with 20 | | Chr(c) ->Format.fprintf ff "Chr(%C)" c 21 | | Set(s) ->Format.fprintf ff "Set(%a)" Charset.pp s 22 | | Seq(l) ->Format.fprintf ff "Seq([%a])" pp_list l 23 | | Alt(l) ->Format.fprintf ff "Alt([%a])" pp_list l 24 | | Opt(r) ->Format.fprintf ff "Opt(%a)" pp r 25 | | Str(r) ->Format.fprintf ff "Str(%a)" pp r 26 | | Pls(r) ->Format.fprintf ff "Pls(%a)" pp r 27 | | Sav(r) ->Format.fprintf ff "Sav(%a)" pp r 28 | 29 | (** [accepts_empty re] tells whether the empty input is valid for [re]. *) 30 | let rec accepts_empty : regexp -> bool = fun re -> 31 | match re with 32 | | Chr(_) -> false 33 | | Set(s) -> Charset.equal s Charset.empty 34 | | Seq(l) -> List.for_all accepts_empty l 35 | | Alt(l) -> List.exists accepts_empty l 36 | | Opt(_) -> true 37 | | Str(_) -> true 38 | | Pls(r) -> accepts_empty r 39 | | Sav(r) -> accepts_empty r 40 | 41 | (** [accepted_first_chars re] returns the set of characters that are possible, 42 | valid first characters for matching [re]. 
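    For instance, [accepted_first_chars (Seq [Opt (Chr 'a'); Chr 'b'])] contains
    both ['a'] and ['b'], since the optional ['a'] may match the empty input.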
*) 43 | let rec accepted_first_chars : regexp -> Charset.t = fun re -> 44 | let rec aux_seq l = 45 | match l with 46 | | [] -> Charset.empty 47 | | r::l -> let cs = accepted_first_chars r in 48 | if accepts_empty r then Charset.union cs (aux_seq l) else cs 49 | in 50 | match re with 51 | | Chr(c) -> Charset.singleton c 52 | | Set(s) -> s 53 | | Seq(l) -> aux_seq l 54 | | Alt(l) -> let fn cs r = Charset.union cs (accepted_first_chars r) in 55 | List.fold_left fn Charset.empty l 56 | | Opt(r) -> accepted_first_chars r 57 | | Str(r) -> accepted_first_chars r 58 | | Pls(r) -> accepted_first_chars r 59 | | Sav(r) -> accepted_first_chars r 60 | 61 | module Pacomb = struct 62 | module Lex = Lex 63 | module Grammar = Grammar 64 | end 65 | 66 | let%parser atom_charset first = 67 | (c1::CHAR) '-' (c2::CHAR) => (if c1 = '-' || (not first && c1 = ']') || 68 | (first && c1 = '^') || 69 | c2 = '-' then Lex.give_up (); 70 | Charset.range c1 c2) 71 | ; (c1::CHAR) => (if (not first && (c1 = '-' || c1 = ']')) || 72 | (first && c1 = '^') then Lex.give_up (); 73 | Charset.singleton c1) 74 | 75 | let%parser p_charset = 76 | (cs1::atom_charset true) (cs2:: ~* (atom_charset false)) => 77 | List.fold_left Charset.union cs1 cs2 78 | 79 | let is_spe c = List.mem c ['\\';'.';'*';'+';'?';'[';']'] 80 | 81 | let%parser rec atom_regexp = 82 | '[' (cpl:: ~? '^') (cs::p_charset) ']' => 83 | begin 84 | let cs = if cpl <> None then Charset.complement (Charset.add cs '\255') 85 | else cs 86 | in 87 | Set cs 88 | end 89 | ; (c::CHAR) => 90 | begin 91 | if is_spe c then Lex.give_up () else Chr c 92 | end 93 | ; '\\' (c::CHAR) => 94 | begin 95 | if is_spe c then Chr c else Lex.give_up () 96 | end 97 | ; '.' => Set (Charset.del Charset.full '\255') 98 | ; "\\(" (r::regexp) "\\)" => Sav r 99 | ; (r::atom_regexp) '?' => Opt r 100 | ; (r::atom_regexp) '*' => Str r 101 | ; (r::atom_regexp) '+' => Pls r 102 | 103 | and seq_regexp = 104 | (rs :: ~+ atom_regexp) => Seq rs 105 | 106 | and regexp = 107 | (rs :: ~+ ["\\|"] seq_regexp) => Alt rs 108 | 109 | (* Exception raised when a regexp cannot be parsed. *) 110 | exception Regexp_error of Input.buffer * Input.idx 111 | 112 | let from_string : string -> regexp = fun s -> 113 | try Grammar.parse_string regexp Blank.none s 114 | with Pos.Parse_error(b,s,_) -> raise (Regexp_error(b,s)) 115 | 116 | open Lex 117 | 118 | let from_regexp_grps : ?grps:int list -> regexp -> string list Lex.t = 119 | fun ?grps r -> 120 | let n = ref 0 in 121 | let do_save fn r = 122 | let n0 = !n in 123 | incr n; 124 | let r = fn r in 125 | match grps with 126 | | None -> save r (fun s l -> s :: l) 127 | | Some l -> 128 | if List.mem n0 l then save r (fun s l -> s :: l) 129 | else r 130 | in 131 | let rec fn = function 132 | | Chr c -> appl (fun _ -> []) (char c) 133 | | Set s -> appl (fun _ -> []) (charset s) 134 | | Alt l -> alts (List.map fn l) 135 | | Seq l -> seqs (List.map fn l) (@) 136 | | Opt r -> option [] (fn r) 137 | | Str r -> star (fn r) (fun () -> []) (@) 138 | | Pls r -> plus (fn r) (fun () -> []) (@) 139 | | Sav r -> do_save fn r 140 | in 141 | let r = do_save fn r in 142 | begin 143 | match grps with 144 | | None -> () 145 | | Some l -> if List.exists (fun g -> g < 0 || g >= !n) l 146 | then invalid_arg "from_regexp_grps" 147 | end; 148 | r 149 | 150 | let from_regexp : regexp -> string Lex.t = fun r -> 151 | Lex.appl 152 | (function [s] -> s | _ -> assert false) 153 | (from_regexp_grps ~grps:[0] r) 154 | 155 | (** create a terminal from a regexp. 
Returns the groups list, last to finish 156 | to be parsed is first in the result *) 157 | let regexp_grps : ?name:string -> ?grps:int list -> regexp -> string list t = 158 | fun ?name ?grps r -> 159 | let r = from_regexp_grps ?grps r in 160 | { r with n = default r.n name } 161 | 162 | let regexp : ?name:string -> regexp -> string t = fun ?name r -> 163 | let r = from_regexp r in 164 | { r with n = default r.n name } 165 | 166 | let blank_regexp s = Blank.from_terminal (regexp (from_string s)) 167 | -------------------------------------------------------------------------------- /lib/regexp.mli: -------------------------------------------------------------------------------- 1 | (** A small module for efficient regular expressions. *) 2 | 3 | (** Type of a regular expression. *) 4 | type regexp = 5 | | Chr of char (* Single character. *) 6 | | Set of Charset.t (* Any character in a charset. *) 7 | | Seq of regexp list (* Sequence of regular expressions. *) 8 | | Alt of regexp list (* Alternative between regexps. *) 9 | | Opt of regexp (* Optional regexp. *) 10 | | Str of regexp (* Zero or more times the regexp. *) 11 | | Pls of regexp (* One or more times the regexp. *) 12 | | Sav of regexp (* Save the matching string. *) 13 | 14 | (** Short synonym of {!type:regexp}. *) 15 | type t = regexp 16 | 17 | (** [pp ff re] outputs the regexp [re] to the formatter [ff]. *) 18 | val pp : Format.formatter -> regexp -> unit 19 | 20 | (** [accepts_empty re] tells whether the empty input is valid for [re]. *) 21 | val accepts_empty : regexp -> bool 22 | 23 | (** [accepted_first_chars re] returns the set of characters that are possible, 24 | valid first characters for matching [re]. *) 25 | val accepted_first_chars : regexp -> Charset.t 26 | 27 | (** Exception raised when a regexp does not match. Note that the given buffer 28 | and position correspond to the first character that cannot be matched. *) 29 | exception Regexp_error of Input.buffer * Input.idx 30 | 31 | (** [from_string s] convert a string into a regexp following [Str] syntax. *) 32 | val from_string : string -> regexp 33 | 34 | (** create a terminal from a regexp. Returns the whole matched string *) 35 | val regexp : ?name:string -> regexp -> string Lex.t 36 | 37 | (** create a terminal from a regexp. Returns the groups list, last to finish to 38 | be parsed is first in the result. The optional argument grps allows 39 | selection of the produced groups. As usual, 0 means the whole regexp and n > 40 | 0 the sub string corresponding to the nth opening parenthesis. 
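    A possible use, as a sketch (the regexp follows the [Str]-like syntax accepted
    by [from_string]):
    {[
      (* a terminal matching "HH:MM" and returning the two digit groups *)
      let time : string list Lex.t =
        regexp_grps ~name:"time" ~grps:[1;2] (from_string "\\([0-9]+\\):\\([0-9]+\\)")
    ]}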
*) 41 | val regexp_grps : ?name:string -> ?grps:int list -> regexp -> string list Lex.t 42 | 43 | (** create a blank function from a string representing a regexp *) 44 | val blank_regexp : string -> Blank.t 45 | -------------------------------------------------------------------------------- /lib/unionFind.ml: -------------------------------------------------------------------------------- 1 | 2 | type 'a t = 'a cell ref 3 | 4 | and 'a cell = Link of 'a t | Root of 'a 5 | 6 | let root x = ref (Root x) 7 | 8 | let rec find x = 9 | match !x with 10 | | Root r -> (r, x) 11 | | Link y -> let (_, y' as c) = find y in 12 | if y != y' then x := Link y'; 13 | c 14 | 15 | let union fn x y = 16 | let (rx,x) = find x in 17 | let (ry,y) = find y in 18 | if x != y then 19 | begin 20 | let r = fn rx ry in 21 | x := Link y; 22 | y := Root r 23 | end 24 | 25 | let set_root x r = 26 | match !x with 27 | | Root _ -> x := Root r 28 | | Link _ -> invalid_arg "set_root on non root" 29 | -------------------------------------------------------------------------------- /lib/unionFind.mli: -------------------------------------------------------------------------------- 1 | (* type of union-find equivalence classes with data of type 'a attached *) 2 | type 'a t 3 | 4 | (* creation of a new equivalence class *) 5 | val root : 'a -> 'a t 6 | 7 | (* get the value and root of an equivalence class *) 8 | val find : 'a t -> ('a * 'a t) 9 | 10 | (* merge two equivalence classes *) 11 | val union : ('a -> 'a -> 'a) -> 'a t -> 'a t -> unit 12 | 13 | (* set the value of a root element of an equivalence class. 14 | raise Invalid_argument "set_root on non root" if the 15 | first argument is not a root (i.e. the result of find) *) 16 | val set_root : 'a t -> 'a -> unit 17 | -------------------------------------------------------------------------------- /lib/utf8.mli: -------------------------------------------------------------------------------- 1 | 2 | (* possible contexts for column numbering in a file *) 3 | type context = 4 | ASCII | UTF8 | CJK_UTF8 5 | 6 | (* [width c] returns the column width of the unicode character [c] *) 7 | val width : ?context:context -> Uchar.t -> int 8 | 9 | type grapheme_break_property = 10 | | Other 11 | | CR 12 | | LF 13 | | Prepend 14 | | Control 15 | | Extend 16 | | SpacingMark 17 | | L 18 | | V 19 | | T 20 | | LV 21 | | LVT 22 | | ZWJ 23 | | RegionalIndicator 24 | | ExtPict 25 | 26 | (* Give the grapheme break property of a character *) 27 | val gbp : Uchar.t -> grapheme_break_property 28 | 29 | type previous_chars = 30 | EvenRegionalIndicator | ExtPictExtendStar | NoPrevious 31 | 32 | val encode : Uchar.t -> string 33 | 34 | val decode : string -> int -> Uchar.t * int 35 | 36 | val look : string -> int -> Uchar.t 37 | 38 | val next : string -> int -> int 39 | 40 | val prev : string -> int -> int 41 | 42 | val of_list : Uchar.t list -> string 43 | 44 | val to_list : string -> Uchar.t list 45 | 46 | val fold : ('a -> Uchar.t -> 'a) -> 'a -> string -> 'a 47 | 48 | val length : context -> string -> int 49 | 50 | val sub : string -> int -> int -> string 51 | 52 | val grapheme_break : string -> int -> bool 53 | 54 | val grapheme_break_after : Uchar.t list -> Uchar.t -> bool 55 | 56 | val next_grapheme : string -> int -> int 57 | 58 | val prev_grapheme : string -> int -> int 59 | 60 | val fold_grapheme : ('a -> string -> 'a) -> 'a -> string -> 'a 61 | -------------------------------------------------------------------------------- /lib/word_list.ml:
-------------------------------------------------------------------------------- 1 | module Pacomb = struct 2 | module Lex = Lex 3 | module Grammar = Grammar 4 | end 5 | 6 | type 'a data = 7 | { mutable leafs : 'a list 8 | ; mutable next : 'a data option array } 9 | 10 | type ('a,'b) t = 11 | { data : 'b data 12 | ; uniq : bool 13 | ; map : 'a -> 'a 14 | ; cs : Charset.t 15 | ; finl : Input.buffer -> Input.idx -> bool } 16 | 17 | let create_data () = { leafs = []; next = Array.make 256 None } 18 | 19 | let idt x = x 20 | 21 | let create ?(unique=true) ?(map=idt) 22 | ?(cs=Charset.full) ?(final_test=fun _ _ -> true) () = 23 | { data = create_data () ; uniq = unique; map; cs; finl = final_test } 24 | 25 | let reset t = t.data.leafs <- []; t.data.next <- Array.make 256 None 26 | 27 | let save t = { leafs = t.data.leafs; next = t.data.next } 28 | 29 | let save_and_reset t = let s = save t in reset t; s 30 | 31 | let restore t s = t.data.leafs <- s.leafs; t.data.next <- s.next 32 | 33 | let size { data = {leafs; next}; _} = 34 | let res = ref 0 in 35 | let rec fn {leafs; next} = 36 | res := !res + List.length leafs; 37 | Array.iter gn next 38 | and gn = function 39 | | None -> () 40 | | Some d -> fn d 41 | in 42 | res := !res + List.length leafs; 43 | Array.iter gn next; 44 | !res 45 | 46 | exception Already_bound 47 | 48 | let next tbl c = tbl.next.(Char.code c) 49 | 50 | let advance : bool -> (char -> char) -> 'b data -> string -> 'b data = 51 | fun add map tbl s -> 52 | let r = ref tbl in 53 | for i = 0 to String.length s - 1 do 54 | let c = map s.[i] in 55 | match !r.next.(Char.code c) with 56 | | Some tbl -> r := tbl 57 | | None -> 58 | if add then 59 | let tbl = create_data () in 60 | !r.next.(Char.code c) <- Some tbl; 61 | r := tbl 62 | else raise Not_found 63 | done; 64 | !r 65 | 66 | let add_ascii : (char,'b) t -> string -> 'b -> unit = 67 | fun { data; uniq; map; cs } s v -> 68 | if s = "" then invalid_arg "Word_list.add_ascii: empty word"; 69 | if not (Charset.mem cs s.[0]) then 70 | invalid_arg "Word_list.add: charset mismatch"; 71 | let data = advance true map data s in 72 | if uniq && data.leafs <> [] then raise Already_bound; 73 | data.leafs <- v :: data.leafs 74 | 75 | let mem_ascii : (char,'b) t -> string -> bool = 76 | fun { data; map; _ } s -> 77 | try 78 | let data = advance false map data s in 79 | data.leafs <> [] 80 | with 81 | Not_found -> false 82 | 83 | let add_utf8 : (string, 'b) t -> string -> 'b -> unit = 84 | fun { data; map; uniq; cs } s v -> 85 | if s = "" then invalid_arg "Word_list.add_utf8: empty word"; 86 | if not (Charset.mem cs s.[0]) then 87 | invalid_arg "Word_list.add: charset mismatch"; 88 | let fn data s = advance true (fun c -> c) data (map s) in 89 | let data = Utf8.fold_grapheme fn data s in 90 | if uniq && data.leafs <> [] then raise Already_bound; 91 | data.leafs <- v :: data.leafs 92 | 93 | let mem_utf8 : (string, 'b) t -> string -> bool = 94 | fun { data; map; _ } s -> 95 | try 96 | let fn data s = advance false (fun c -> c) data (map s) in 97 | let data = Utf8.fold_grapheme fn data s in 98 | data.leafs <> [] 99 | with 100 | Not_found -> false 101 | 102 | let word : ?name:string -> (char, 'a) t -> 'a Grammar.t = 103 | fun ?name { data = tbl; map; cs; finl; uniq } -> 104 | let n = Lex.default "WORD" name in 105 | if uniq then 106 | let rec f tbl s0 n0 = 107 | let (c,s,n) = Input.read s0 n0 in 108 | let c = map c in 109 | match next tbl c with 110 | | Some t -> f t s n 111 | | None -> 112 | if finl s0 n0 && tbl.leafs <> [] then (List.hd tbl.leafs, 
s0, n0) 113 | else (raise Lex.NoParse) 114 | in 115 | let f = f tbl in 116 | let lex = Lex.{ n; f; a = Custom(f,Assoc.new_key ()); c = cs } 117 | in 118 | Grammar.term ?name lex 119 | else 120 | let rec f tbl s0 n0 = 121 | let (c,s,n) = Input.read s0 n0 in 122 | let c = map c in 123 | match next tbl c with 124 | | Some t -> f t s n 125 | | None -> 126 | if finl s0 n0 && tbl.leafs <> [] then (tbl.leafs, s0, n0) 127 | else raise Lex.NoParse 128 | in 129 | let f = f tbl in 130 | let lex = Lex.{ n; f; a = Custom(f,Assoc.new_key ()); c = cs } 131 | in 132 | Grammar.unmerge ?name (Grammar.term lex) 133 | 134 | let utf8_word : ?name:string -> (string, 'a) t -> 'a Grammar.t = 135 | fun ?name { data = tbl; map; finl; cs; uniq } -> 136 | let n = Lex.default "UTF8_WORD" name in 137 | if uniq then 138 | let rec f tbl s n = 139 | try 140 | let (g,s,n) = Lex.((any_grapheme ()).f s n) in 141 | let g = map g in 142 | f (advance false (fun c -> c) tbl g) s n 143 | with 144 | Not_found -> 145 | if finl s n && tbl.leafs <> [] then(List.hd tbl.leafs, s, n) 146 | else raise Lex.NoParse 147 | in 148 | let f = f tbl in 149 | let lex = Lex.{ n; f; a = Custom(f,Assoc.new_key ()); c = cs } 150 | in 151 | Grammar.term ?name lex 152 | else 153 | let rec f tbl s n = 154 | try 155 | let (g,s,n) = Lex.((any_grapheme ()).f s n) in 156 | let g = map g in 157 | f (advance false (fun c -> c) tbl g) s n 158 | with 159 | Not_found -> 160 | if finl s n && tbl.leafs <> [] then (tbl.leafs, s, n) 161 | else raise Lex.NoParse 162 | in 163 | let f = f tbl in 164 | let lex = Lex.{ n; f; a = Custom(f,Assoc.new_key ()); c = cs } 165 | in 166 | Grammar.unmerge ?name (Grammar.term lex) 167 | -------------------------------------------------------------------------------- /lib/word_list.mli: -------------------------------------------------------------------------------- 1 | (** {1 Module to build and parse list of words} *) 2 | 3 | (** Type of a word list with 4 | 'a : the type of characters (typically, char for ascii or string for utf8) 5 | 'b : a value associated to each word *) 6 | type ('a,'b) t 7 | 8 | (** exception raise when multiple binding are added and not allowed *) 9 | exception Already_bound 10 | 11 | (** Create a new empty table. The optional parameter [ unique ] defaults to 12 | true. Setting it to false with allow multiple identical bindings, creating 13 | ambiguous grammars. If [ unique ] is true, then adding multiple bindings will 14 | raise the exception [ Already_bound ]. 15 | 16 | [map] is a function transforming character before addition (typically a case 17 | transformer or a unicode normalisation). (defaults to identity). 18 | 19 | [final_test] will be called after parsing. It may be used typically to 20 | ensure that the next character is not alphanumeric. Defaults to an always 21 | passing test. 22 | 23 | [cs] can be given as an optimisation. All words added should start with 24 | characters in this set. *) 25 | val create : ?unique:bool -> 26 | ?map:('a -> 'a) -> 27 | ?cs:Charset.t -> 28 | ?final_test:(Input.buffer -> Input.idx -> bool) 29 | -> unit -> ('a,'b) t 30 | 31 | (** Returns the number of bindings in the table *) 32 | val size : ('a,'b) t -> int 33 | 34 | (** empty a table *) 35 | val reset : ('a,'b) t -> unit 36 | 37 | (** [add_ascii tbl s v] adds a binding from [s] to [v] in [tbl], keep all 38 | previous bindings. *) 39 | 40 | val add_ascii : (char,'b) t -> string -> 'b -> unit 41 | 42 | (** [mem_ascii tbl s] tells if [s] if present in [tbl]. 
Typically used to reject 43 | identifiers that are keywords *) 44 | val mem_ascii : (char,'b) t -> string -> bool 45 | 46 | (** Same as above for unicode strings, which are split into graphemes *) 47 | val add_utf8 : (string, 'b) t -> string -> 'b -> unit 48 | 49 | val mem_utf8 : (string,'b) t -> string -> bool 50 | 51 | (** Parses a word from the dictionary, returning as action all the associated 52 | values (it is an ambiguous grammar if there is more than one value). *) 53 | val word : ?name:string -> (char, 'a) t -> 'a Grammar.t 54 | 55 | val utf8_word : ?name:string -> (string, 'a) t -> 'a Grammar.t 56 | 57 | type 'a data 58 | val save : ('a,'b) t -> 'b data 59 | val save_and_reset : ('a,'b) t -> 'b data 60 | val restore : ('a,'b) t -> 'b data -> unit 61 | -------------------------------------------------------------------------------- /pacomb.opam: -------------------------------------------------------------------------------- 1 | synopsis: "Parsing library based on combinators and ppx extension to write languages" 2 | description: 3 | """ 4 | Pacomb is a parsing library that compiles grammars to combinators prior to 5 | parsing together with a PPX extension to write parsers inside OCaml files. 6 | 7 | The advantages of Pacomb are 8 | 9 | - Grammars as first class values defined in your OCaml files. This is 10 | an example from the distribution: 11 | 12 | (* The three levels of priorities *) 13 | type p = Atom | Prod | Sum 14 | let%parser rec 15 | (* This includes each priority level in the next one *) 16 | expr p = Atom < Prod < Sum 17 | (* all other rules are selected by their priority level *) 18 | ; (p=Atom) (x::FLOAT) => x 19 | ; (p=Atom) '(' (e::expr Sum) ')' => e 20 | ; (p=Prod) (x::expr Prod) '*' (y::expr Atom) => x*.y 21 | ; (p=Prod) (x::expr Prod) '/' (y::expr Atom) => x/.y 22 | ; (p=Sum ) (x::expr Sum ) '+' (y::expr Prod) => x+.y 23 | ; (p=Sum ) (x::expr Sum ) '-' (y::expr Prod) => x-.y 24 | 25 | - Good performances: 26 | - on non-ambiguous grammars, 2 to 3 times slower than ocamlyacc 27 | - on ambiguous grammars O(N^3 ln(N)) can be achieved. 28 | 29 | - Parsing from left to right (despite the use of combinators), which avoids 30 | keeping the whole input in memory and allows parsing streams. 31 | 32 | - Dependent sequences allowing for self-extensible grammars (like adding a new 33 | infix operator with a given priority, as in one of the examples). 34 | 35 | - Management of blanks that, for instance, allows for nested languages using 36 | different kinds of comments or blanks. 37 | 38 | - Support for cache and merge for ambiguous grammars (to get O(N^3 ln(N))) 39 | 40 | - Enough support for utf8 to write parsers for languages using utf8. 41 | 42 | - Comes with documentation and various examples illustrating most possibilities. 43 | 44 | All this makes Pacomb a promising solution for writing languages in OCaml.
45 | """ 46 | 47 | opam-version: "2.0" 48 | maintainer: "Christophe Raffalli " 49 | bug-reports: "https://github.com/craff/pacomb/issues" 50 | homepage: "https://github.com/craff/pacomb" 51 | dev-repo: "git+https://github.com/craff/pacomb.git" 52 | authors: [ 53 | "Christophe Raffalli " 54 | "Rodolphe Lepigre " ] 55 | license: "MIT" 56 | 57 | depends: [ 58 | "ocaml" { >= "4.04.1" } 59 | "dune" { >= "1.9.0" } 60 | "ppxlib" { >= "0.10.0" } 61 | "stdlib-shims" 62 | ] 63 | 64 | build: [ [ "dune" "build" "-p" name "-j" jobs ] ] 65 | run-test: [ [ "dune" "runtest" "-p" name "-j" jobs ] ] 66 | -------------------------------------------------------------------------------- /ppx/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name pacomb_ppx) 3 | (public_name pacomb.ppx) 4 | (preprocess (pps ppxlib.metaquot)) 5 | (modules :standard) 6 | (kind ppx_rewriter) 7 | (libraries ppxlib stdlib-shims)) 8 | -------------------------------------------------------------------------------- /tests/Break.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | open Grammar 3 | 4 | (* Blank function *) 5 | let blank = Regexp.blank_regexp "\\(\\([#][^\n]*\\)\\|[ \r\t\026]+\\)*" 6 | (* bug: "\\([ \r\t\026]\\|\\(\\(#[^\n]*\\)\\)*" *) 7 | 8 | (* Parser for hexadecimal integers *) 9 | let%parser char = 10 | (i::RE"[0-9A-Fa-F]+") => Uchar.of_int (int_of_string ("0x" ^ i)) 11 | 12 | let%parser sep = "÷" => true ; "×" => false 13 | 14 | let%parser rec sample_aux = (l::sample_aux) (c::char) (s::sep) => (c,s) :: l 15 | ; (c::char) (s::sep) => [(c,s)] 16 | 17 | let%parser sample = sep (l::sample_aux) => List.rev l 18 | 19 | let%parser rec break = () => [] 20 | ; (g::GRAPHEME) (l::break) => g::l 21 | 22 | let good = ref true 23 | 24 | let test pos l0 = 25 | try 26 | let chars = List.map fst l0 in 27 | let s = Utf8.of_list chars in 28 | let rec fn = function 29 | | [] -> [] 30 | | []::_ -> Format.eprintf "unexpected empty at %a\n%!" 31 | (Pos.print_pos ()) pos; 32 | good := false; 33 | raise Exit 34 | | [x]::l -> (x,true)::fn l 35 | | (x::l1)::l -> (x,false)::fn (l1::l) 36 | in 37 | let l = parse_string ~utf8:Utf8.UTF8 break Blank.none s in 38 | let l = List.map (fun s -> Utf8.to_list s) l in 39 | let l = fn l in 40 | if l <> l0 then 41 | begin 42 | Format.eprintf "break fail at %a\n%!" 
(Pos.print_pos ()) pos; 43 | List.iter (fun (l,b) -> Printf.eprintf "%x %b " (Uchar.to_int l) b) l; 44 | Format.eprintf " <> "; 45 | List.iter (fun (l,b) -> Printf.eprintf "%x %b " (Uchar.to_int l) b) l0; 46 | Format.eprintf "\n%!"; 47 | good := false; 48 | raise Exit 49 | end 50 | with Exit -> () 51 | 52 | (* Single mapping parser *) 53 | let%parser test = (l::sample) (~+ '\n' => ()) => test l_pos l 54 | 55 | let%parser tests = 56 | (star ('\n' => ())) (star test) => () 57 | 58 | let parse = parse_channel ~utf8:Utf8.UTF8 tests blank 59 | 60 | let _ = 61 | (* Command line args *) 62 | if Array.length Sys.argv != 2 then 63 | begin 64 | let pn = Sys.argv.(0) in 65 | Printf.eprintf "Usage: %s " pn; 66 | exit 1 67 | end; 68 | let infile = Sys.argv.(1) in 69 | 70 | (* Parsing and preparing the data *) 71 | let infile = open_in infile in 72 | let _ = Pos.handle_exception parse infile in 73 | 74 | close_in infile; 75 | if not !good then exit 1 76 | -------------------------------------------------------------------------------- /tests/big_expr.ml: -------------------------------------------------------------------------------- 1 | 2 | let rec seq op n atom = 3 | if n <= 0 then atom () 4 | else (atom (); print_string op; seq op (n-1) atom) 5 | 6 | let paren atom = 7 | print_string "("; atom (); print_string ")" 8 | 9 | let int () = 10 | print_string (string_of_int (Random.int 0x20000000)) 11 | 12 | let rec expr n p s = 13 | if n <= 0 then int () 14 | else seq (if Random.bool () then " + " else " - ") 15 | s (fun () -> seq 16 | (if Random.bool () then "*" else "/") 17 | p (fun () -> 18 | if n = 1 then int () else 19 | paren (fun () -> expr (n - 1) p s))) 20 | 21 | let n = int_of_string (Sys.argv.(1)) 22 | let p = int_of_string (Sys.argv.(2)) 23 | let s = int_of_string (Sys.argv.(3)) 24 | 25 | let _ = expr n p s; print_newline () 26 | -------------------------------------------------------------------------------- /tests/calc_factor.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | open Pos 3 | open Grammar 4 | 5 | (* factorisation ... just a test for the ppx, 6 | here left factorisation is done by the elimination of left recursion, 7 | to the result is the same as with calc_prio.ml *) 8 | type p = Atom | Prod | Sum 9 | let%parser rec 10 | expr p = Atom < Prod < Sum 11 | ; (p=Atom) (x::FLOAT) => x 12 | ; (p=Atom) '(' (e::expr Sum) ')' => e 13 | ; (p=Prod) (x::expr Prod) => ( '*' (y::expr Atom) => x*.y 14 | ; '/' (y::expr Atom) => x/.y) 15 | ; (p=Sum ) (x::expr Sum ) => ('+' (y::expr Prod) => x+.y 16 | ; '-' (y::expr Prod) => x-.y) 17 | 18 | let top = expr Sum 19 | 20 | let blank = Blank.from_charset (Charset.singleton ' ') 21 | 22 | let _ = 23 | try 24 | while true do 25 | let f () = 26 | Printf.printf "=> %!"; 27 | let line = input_line stdin in 28 | let n = parse_string top blank line in 29 | Printf.printf "%f\n%!" n 30 | in 31 | handle_exception ~error:(fun _ -> ()) f () 32 | done 33 | with 34 | End_of_file -> () 35 | -------------------------------------------------------------------------------- /tests/calc_utf8.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | open Pos 3 | open Grammar 4 | 5 | (* If -v option is given, the value of expression between parenthesis is 6 | printed *) 7 | let show_sub = Array.length Sys.argv > 1 && Sys.argv.(1) = "-v" 8 | 9 | (* Here is the definition of the parser with the ppx syntax extension described 10 | in the documentation. 
11 | 12 | Here, we deal with priorities by manually defining three different grammars. 13 | Starting with the grammar for atomic expressions. *) 14 | let%parser rec 15 | atom = (x::FLOAT) => x (* constant *) 16 | ; (show_sub=false) '(' (e::expr) ')' => e (* rule for parenthesis 17 | when show_sub is false *) 18 | ; (show_sub=true) '(' (e::expr) ')' =>(* idem with show_sub true *) 19 | 20 | (Format.printf "%a: %f\n" 21 | (Pos.print_pos ()) _pos e; 22 | (* ^^^^^^ to access position of l *) 23 | e) 24 | 25 | (* Here is the grammar for products *) 26 | and prod = (a::atom) => a 27 | ; (x::prod) '*' (y::atom) => x*.y 28 | ; (x::prod) '/' (y::atom) => x/.y 29 | 30 | (* and finally all remaining expressions *) 31 | and expr = (a::prod) => a 32 | ; (x::expr) '+' (y::prod) => x+.y 33 | ; (x::expr) '-' (y::prod) => x-.y 34 | 35 | (* A subtlety : we want to parse expression, on by one and print the 36 | result. Pacomb evaluates action after the next token to avoid some useless 37 | evaluation but still make give_up usable. So we evaluate after newline... 38 | But if we where parsing blank after newline, we still would have to wait 39 | the input after the newline. 40 | 41 | To solve this, we parse line with no blank, use [Grammar.layout] to accept 42 | blank inside expression. 43 | 44 | A simpler solution (used in calc_ext.ml) is to read the input line 45 | by line and parse each line using [Grammar.parse_string]. 46 | *) 47 | 48 | let config = 49 | Blank.{ default_layout_config with 50 | new_blanks_before = true 51 | ; new_blanks_after = true} 52 | 53 | (* The parsing calling expression and changing the blank, 54 | printing the result and the next prompt. *) 55 | let%parser rec exprs = 56 | () => () 57 | ; exprs (e::expr) '\n' => Format.printf "%f\n=> %!" e 58 | 59 | (* we define the characters to be ignored, here space only *) 60 | let blank = Blank.from_charset (Charset.singleton ' ') 61 | 62 | let _ = 63 | try 64 | while true do 65 | let f () = 66 | Format.printf "=> %!"; (* initial prompt *) 67 | parse_channel ~utf8:Utf8.UTF8 exprs blank stdin; 68 | print_grammar ~def:false stdout exprs; 69 | raise End_of_file 70 | in 71 | (* [Pos] module provides a function to handle exception with 72 | an optional argument to call for error (default is to exit with 73 | code 1 *) 74 | handle_exception ~error:(fun _ -> ()) f (); 75 | done 76 | with 77 | End_of_file -> () 78 | -------------------------------------------------------------------------------- /tests/dseq_test.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | open Pos 3 | open Grammar 4 | 5 | let bspace = Blank.from_charset (Charset.singleton ' ') 6 | 7 | let test ?(blank=bspace) g s r = 8 | assert (parse_string g blank s = r) 9 | 10 | let tests ?(blank=bspace) g l = 11 | List.iter (fun (s,r) -> test ~blank g s r) l 12 | 13 | let test_fail ?(blank=bspace) g s = 14 | try 15 | let _ = parse_string g blank s in assert false 16 | with Parse_error _ -> () 17 | 18 | let tests_fail ?(blank=bspace) g l = 19 | List.iter (test_fail ~blank g) l 20 | 21 | let%parser [@cache] rec seq (n:int) = 22 | (n = 0) () => [] 23 | ; (n > 0) 'a' (l::seq (n-1)) => ('a' :: l) 24 | 25 | let%parser [@cache] seqf = 26 | ((n,__)>:(n::INT => (n,()))) (l::seq n) => l 27 | 28 | let _ = test seqf "0" [] 29 | let _ = test seqf "1 a" ['a'] 30 | let _ = test seqf "2 a a" ['a';'a'] 31 | let _ = test seqf "3 a a a" ['a';'a';'a'] 32 | 33 | let _ = test_fail seqf "1" 34 | let _ = test_fail seqf "1 a a" 35 | 36 | let%parser infix = '+' 
=> (2,'+') 37 | ; '*' => (1,'*') 38 | 39 | type t = L of int | N of t * char * t 40 | 41 | let%parser [@cache] rec term = 42 | (n::INT) => (0, L n) 43 | ; '(' ((__,t) :: term) ')' => (0,t) 44 | ; (t::term_infix) => t 45 | 46 | and [@cache] term_infix = 47 | ((pl,tl)>:term) 48 | ((ps,s)::(((ps,__) = c) :: infix => (if pl > ps then Lex.give_up (); c))) 49 | ((pr,tr)::term) 50 | => (if pr >= ps then Lex.give_up (); (ps,N(tl,s,tr))) 51 | 52 | let _ = test term "1" (0, L 1) 53 | let _ = test term "1 + 1" (2, N(L 1,'+', L 1)) 54 | let _ = test term "1 + 1 + 1" (2, N(N(L 1,'+',L 1),'+', L 1)) 55 | let _ = test term "1 * 1" (1, N(L 1,'*', L 1)) 56 | let _ = test term "1 + 1 * 1 + 1" (2, N(N(L 1,'+', N(L 1,'*',L 1)),'+',L 1)) 57 | -------------------------------------------------------------------------------- /tests/dune: -------------------------------------------------------------------------------- 1 | (test 2 | (name test) 3 | (modules test) 4 | (libraries unix pacomb) 5 | (action (run ./test.exe))) 6 | 7 | (test 8 | (name hard) 9 | (modules hard) 10 | (preprocess (pps pacomb.ppx)) 11 | (libraries unix pacomb) 12 | (action (run ./hard.exe 1000))) 13 | 14 | (test 15 | (name ppx_test) 16 | (modules ppx_test) 17 | (preprocess (pps pacomb.ppx)) 18 | (libraries unix pacomb) 19 | (action (run ./ppx_test.exe))) 20 | 21 | (test 22 | (name dseq_test) 23 | (modules dseq_test) 24 | (preprocess (pps pacomb.ppx)) 25 | (libraries unix pacomb) 26 | (action (run ./dseq_test.exe))) 27 | 28 | (executable 29 | (name big_expr) 30 | (modules big_expr) 31 | (libraries)) 32 | 33 | (executable 34 | (name scan) 35 | (modules scan) 36 | (libraries unix pacomb)) 37 | 38 | (test 39 | (name Break) 40 | (modules Break) 41 | (preprocess (pps pacomb.ppx)) 42 | (libraries unix pacomb) 43 | (deps Break.exe GraphemeBreakTest.txt) 44 | (action 45 | (run ./Break.exe ./GraphemeBreakTest.txt))) 46 | 47 | (test 48 | (name calc_factor) 49 | (modules calc_factor) 50 | (preprocess (pps pacomb.ppx)) 51 | (libraries unix pacomb) 52 | (deps big_expr.exe calc_factor.exe) 53 | (action 54 | (system "./big_expr.exe 4 4 4 | ./calc_factor.exe"))) 55 | 56 | (test 57 | (name calc_utf8) 58 | (modules calc_utf8) 59 | (preprocess (pps pacomb.ppx)) 60 | (libraries unix pacomb) 61 | (deps big_expr.exe calc_utf8.exe) 62 | (action 63 | (system "./big_expr.exe 4 4 4 | ./calc_utf8.exe"))) 64 | -------------------------------------------------------------------------------- /tests/hard.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | open Grammar 3 | 4 | 5 | let gamma_gen n = 6 | let b = Buffer.create (2*n) in 7 | let rec gen n = 8 | if n <= 0 then () else 9 | match Random.int 3 with 10 | | 0 -> 11 | Buffer.add_string b "a"; 12 | gen (n-1); 13 | Buffer.add_string b "b" 14 | | 1 -> 15 | Buffer.add_string b "a"; 16 | Buffer.add_string b "b"; 17 | gen (n-1); 18 | Buffer.add_string b "c" 19 | | 2 -> 20 | Buffer.add_string b "a"; 21 | Buffer.add_string b "b"; 22 | Buffer.add_string b "c"; 23 | gen (n-1); 24 | Buffer.add_string b "d" 25 | | _ -> assert false 26 | in 27 | gen n; 28 | Buffer.contents b 29 | 30 | let parse_string c = parse_string c (Blank.from_charset (Charset.singleton ' ')) 31 | 32 | let%parser [@cache] rec g = () => () 33 | ; 'a' g 'b' => () 34 | ; 'a' g 'b' g 'c' => () 35 | ; 'a' g 'b' g 'c' g 'd' => () 36 | 37 | let n = int_of_string Sys.argv.(1) 38 | 39 | let chrono_parse g s = 40 | let n = String.length s in 41 | Printf.printf "parsing %d chars in %!" 
n; 42 | let t0 = Unix.gettimeofday () in 43 | let r = parse_string g s in 44 | let t1 = Unix.gettimeofday () in 45 | Printf.printf "%f seconds\n%!" (t1 -. t0); 46 | r 47 | 48 | let _ = 49 | for i = 1 to 4 do 50 | let str = gamma_gen (n*i) in 51 | chrono_parse g str 52 | done 53 | -------------------------------------------------------------------------------- /tests/ppx_test.ml: -------------------------------------------------------------------------------- 1 | open Pacomb 2 | open Pos 3 | open Grammar 4 | 5 | let ps pos = (pos.offset_start, pos.offset_end) 6 | 7 | let bspace = Blank.from_charset (Charset.singleton ' ') 8 | 9 | let test ?(blank=bspace) g s r = 10 | assert (parse_string g blank s = r) 11 | 12 | let tests ?(blank=bspace) g l = 13 | List.iter (fun (s,r) -> test ~blank g s r) l 14 | 15 | let test_fail ?(blank=bspace) g s = 16 | try 17 | let _ = parse_string g blank s in assert false 18 | with Parse_error _ -> () 19 | 20 | let tests_fail ?(blank=bspace) g l = 21 | List.iter (test_fail ~blank g) l 22 | 23 | (* test syntax for terminal *) 24 | let%parser g : unit grammar = () => () 25 | let _ = test g "" () 26 | let%parser g : unit grammar = 'a' => () 27 | let _ = test g "a" () 28 | let%parser g : unit grammar = "a" => () 29 | let _ = test g "a" () 30 | let%parser g : int grammar = (x::INT) => x 31 | let _ = test g "42" 42 32 | let%parser g : float grammar = (x::FLOAT) => x 33 | let _ = test g "42.42E-42" 42.42E-42 34 | 35 | let%parser g : string grammar = (x::RE "\\([a-zA-Z_][a-zA-Z_0-9]*\\)") => x 36 | let _ = test g "toto_x3" "toto_x3" 37 | let%parser g : 'a -> 'a -> float grammar = 38 | fun x y -> (x=y) (z::INT) => float_of_int z 39 | ; (x<>y) (z::FLOAT) => z 40 | let _ = test (g 0 0) "42" 42.0 41 | let _ = test (g 0 1) "42.0" 42.0 42 | 43 | 44 | (* test patterns in terminals *) 45 | let%parser g0 : (int * int) grammar = 46 | (x::INT) => ps x_pos 47 | let _ = test g0 " 123 " (1,4) 48 | 49 | let%parser g : (int * int) grammar = ((x,y)::g0) => (y,x) 50 | let _ = test g " 123 " (4,1) 51 | let%parser g : (int * int * int) grammar = 52 | (((x,y)=z)::g0) => (y,x,snd (ps z_pos)) 53 | let _ = test g " 123 " (4,1,4) 54 | let%parser g : (int * int * int) grammar = 55 | ((((x:int),(y:int))=z)::g0) => (y,x,snd (ps z_pos)) 56 | let _ = test g " 123 " (4,1,4) 57 | 58 | (* test rules and sequences *) 59 | type op = Add | Sub | Mul | Div 60 | let%parser bin = (x::INT) 61 | (op::('+'=>Add ; '-'=>Sub; '*'=>Mul; '/'=>Div)) 62 | (y::INT) => (x,op,y) 63 | let _ = test bin "42 + 73" (42,Add,73) 64 | let%parser g = (x::INT) 'a' 'b' => x 65 | ; 'a' (x::INT) 'b' => x 66 | ; 'a' 'b' (x::INT) => x 67 | let _ = tests g [("42 a b",42); ("a 42 b",42); ("a b 42",42)] 68 | 69 | (* test condiiton *) 70 | let%parser g = (true && true) 'a' => () 71 | let _ = test g "a" () 72 | let%parser g = (=) true true 'a' => () 73 | let _ = test g "a" () 74 | let%parser g = (true = true) 'a' => () 75 | let _ = test g "a" () 76 | let%parser g = (true <= true) 'a' => () 77 | let _ = test g "a" () 78 | let%parser g = (not false) 'a' => () 79 | let _ = test g "a" () 80 | 81 | (* test positions *) 82 | let%parser g = 83 | (x::INT) 'a' 'b' => (fst (ps x_pos),x,snd (ps x_pos)) 84 | ; 'a' (x::INT) (b::'b') => (fst (ps x_pos),x,snd (ps b_pos)) 85 | ; (a::'a') 'b' (x::INT) => (fst (ps a_pos),x,snd (ps x_pos)) 86 | let _ = tests g [("42 a b ",(0,42,2)) 87 | ; ("a 42 b ",(2,42,6)) 88 | ; ("a b 42 ",(0,42,6))] 89 | let%parser g = 90 | (x::bin) 'a' 'b' => (fst (ps x_pos),x,snd (ps x_pos)) 91 | ; 'a' (x::bin) (b::'b') => (fst (ps 
x_pos),x,snd (ps b_pos)) 92 | ; (a::'a') 'b' (x::bin) => (fst (ps a_pos),x,snd (ps x_pos)) 93 | let _ = tests g [("42+13 a b ",(0,(42,Add,13),5)) 94 | ; ("a 42 * 4 b ",(2,(42,Mul,4),10)) 95 | ; ("a b 42 / 2 ",(0,(42,Div,2),10))] 96 | let%parser g = 'a' 'b' => Pos.(_pos.offset_start,_pos.offset_end) 97 | let _ = tests g [("a b ",(0,3))] 98 | 99 | (* test recursion *) 100 | let%parser rec g = (y::g) (x::INT) => x+y 101 | ; (x::INT) => x 102 | let _ = tests g [("42", 42); ("1 2 3",6)] 103 | let%parser rec g = (x::INT) (y::g) => x+y 104 | ; (x::INT) => x 105 | let _ = tests g [("42", 42); ("1 2 3",6)] 106 | let%parser rec g = (x::INT) '+' (y::g) => x+y 107 | ; (x::g) '-' (y::INT) => x-y 108 | ; (x::INT) => x 109 | (* This is ambiguous !!! *) 110 | let _ = tests g [("42", 42); ("1 + 2 - 3",0); ("1 - 2 - 3",-4)] 111 | let%parser rec g1 = (x::g3) 'a' 'b' => x+1 112 | ; 'c' (x::g1) 'd' => x-1 113 | ; 'e' 'f' (x::g2) => x 114 | ; () => 0 115 | and g2 = (x::g1) 'b' 'a' => x+1 116 | ; 'c' (x::g2) 'd' => x-1 117 | ; 'f' 'e' (x::g3) => x 118 | ; () => 0 119 | and g3 = (x::g2) 'a' 'b' => x+1 120 | ; 'd' (x::g3) 'c' => x-1 121 | ; 'e' 'f' (x::g1) => x 122 | ; () => 0 123 | let _ = tests g1 [("", 0); ("ab",1); ("cd",-1); ("ef", 0)] 124 | let _ = tests g2 [("", 0); ("ba",1); ("cd",-1); ("fe", 0)] 125 | let _ = tests g3 [("", 0); ("ab",1); ("dc",-1); ("ef", 0)] 126 | let _ = tests g1 [("cdefefcfedcabd",-2)] 127 | 128 | (* test right recursion and lazy *) 129 | let n = ref 0 130 | let%parser rec g = (c::CHAR) (l::force g) => lazy (incr n; String.make 1 c :: l) 131 | ; () => lazy [] 132 | let%parser h = (lazy l::g) EOF => String.concat "" l 133 | 134 | let _ = 135 | let nb = 1_000_000 in 136 | let s = (String.make nb 'a') in test h s s; assert (!n = nb) 137 | 138 | (* test parameters *) 139 | let%parser rec g g0 n = (n=0) () => 0 140 | ; (n>0) (x::g g0 (n-1)) g0 => x+1 141 | let%parser _ = 142 | for i = 0 to 10 do 143 | test (g ('a' => ()) i) (String.make i 'a') i; 144 | test_fail (g ('a' => ()) i) (String.make (i+1) 'a') 145 | done 146 | 147 | let%parser rec g g0 n = (n=0) () => 0 148 | ; (n>0) g0 (x::g g0 (n-1)) => x+1 149 | let%parser _ = 150 | for i = 0 to 10 do 151 | test (g ('a' => ()) i) (String.make i 'a') i; 152 | test_fail (g ('a' => ()) i) (String.make (i+1) 'a') 153 | done 154 | 155 | let%parser rec g ~g:g0 ?(n=0) () = (n=0) () => 0 156 | ; (n>0) g0 (x::g ~g:g0 ~n:(n-1) ()) => x+1 157 | let%parser _ = 158 | test (g ~g:('a' => ()) ()) (String.make 0 'a') 0; 159 | test_fail (g ~g:('a' => ()) ()) (String.make 1 'a'); 160 | for i = 0 to 10 do 161 | test (g ~g:('a' => ()) ~n:i ()) (String.make i 'a') i; 162 | test_fail (g ~g:('a' => ()) ~n:i ()) (String.make (i+1) 'a') 163 | done 164 | 165 | 166 | (* test grammar under sub expressions or sub modules *) 167 | let noblank = layout Blank.none 168 | let%parser f = (x::(noblank ('a' 'a' => 2))) => x 169 | 170 | module%parser H = 171 | struct 172 | let f = 'b' (x::(noblank ('a' 'a' => 2))) => x 173 | end 174 | let _ = test f "aa" 2 175 | let _ = test_fail f "a a" 176 | 177 | let _ = test H.f "b aa " 2 178 | let _ = test_fail H.f "b a a " 179 | 180 | type tree = Nil of int | Bin of tree * tree | Alt of tree * tree 181 | let rec nb_tree = function 182 | | Nil _ -> 1 183 | | Bin(t1,t2) -> nb_tree t1 * nb_tree t2 184 | | Alt(t1,t2) -> nb_tree t1 + nb_tree t2 185 | 186 | let size t = 187 | let adone = ref [] in 188 | let rec fn t = 189 | if List.memq t !adone then 0 else 190 | begin 191 | adone := t :: !adone; 192 | match t with 193 | | Nil _ -> 0 194 | | 
Bin(t1,t2) -> fn t1 + fn t2 + 1 195 | | Alt(t1,t2) -> fn t1 + fn t2 + 1 196 | end 197 | in fn t 198 | 199 | let catalan_fn = 200 | let memo = Hashtbl.create 128 in 201 | let rec fn n = 202 | if n = 0 then 1 else if n = 1 then 1 else 203 | try Hashtbl.find memo n 204 | with Not_found -> 205 | let r = ref 0 in 206 | for i = 0 to n-1 do 207 | r := fn i * fn (n - i - 1) + !r 208 | done; 209 | Hashtbl.add memo n !r; 210 | !r 211 | in 212 | fn 213 | (* test cache and merge attribute *) 214 | let%parser [@merge (fun x y -> Alt(x,y))] rec catalan = 215 | (x::INT) => Nil x 216 | ; (t1::catalan) ',' (t2::catalan) => Bin(t1,t2) 217 | 218 | let _ = nb_tree (parse_string catalan bspace "0,0,0,0") = catalan_fn 4 219 | let _ = nb_tree (parse_string catalan bspace "0,0,0,0,0,0") = catalan_fn 6 220 | -------------------------------------------------------------------------------- /tests/scan.ml: -------------------------------------------------------------------------------- 1 | 2 | (* This test is for measuring the performance of Input *) 3 | open Pacomb 4 | 5 | let utf8 = ref Utf8.ASCII 6 | 7 | let files = ref [] 8 | 9 | let spec = Arg.[("--utf8", Unit(fun () -> utf8 := Utf8.UTF8), "use utf8 mode")] 10 | 11 | let _ = Arg.parse spec 12 | (fun x -> files := x :: ! files) 13 | "scan [--utf8] files ('-' to read stdin)" 14 | 15 | let utf8 = !utf8 16 | let files = if !files = [] then ["-"] else !files 17 | 18 | let do_file file = 19 | let buf = 20 | if file = "-" then 21 | Input.from_channel ~utf8 stdin 22 | else Input.from_file ~utf8 file 23 | in 24 | let t0 = Unix.gettimeofday () in 25 | let rec fn n buf pos = 26 | let (c,buf,pos) = Input.read buf pos in 27 | if c = '\255' then Input.line_num buf, n else fn (n+1) buf pos 28 | in 29 | let l, n = fn 0 buf Input.init_pos in 30 | let t1 = Unix.gettimeofday () in 31 | let d = t1 -. t0 in 32 | let s = (float) n /. d /. (1024. *. 1024.) in 33 | let m = (float n) /. (1024. *. 1024.) in 34 | Printf.printf 35 | "%s: %.2f Mo/%d lines read in %f.3s (%.2f Mo/s) with utf8 mode %b\n" 36 | file m l d s (utf8 = Utf8.UTF8) 37 | 38 | let _ = List.iter do_file files 39 | -------------------------------------------------------------------------------- /tools/sanity_check.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | 4 | # Sanity checks for source files in directory [src]. 5 | 6 | ML_FILES=`find lib -name "*.ml" -or -name "*.mli"` 7 | TESTS_FILES=`find tests -name "*.ml" -or -name "*.mli" -or -name "*.mll" -or -name "*.mly"` 8 | FILES="ppx/ppx_pacomb.ml $ML_FILES $TESTS_FILES" 9 | 10 | awk 'length>80 {print FILENAME ", line " FNR ": more than 80 characters..."}' $FILES 11 | awk '/.*\s$/ {print FILENAME ", line " FNR ": trailing spaces..."} ' $FILES 12 | awk '/.*\t.*/ {print FILENAME ", line " FNR ": contains tabs..."} ' $FILES 13 | 14 | # Check for issue number of FIXMEs and TODOs in source files and PML files. 15 | 16 | awk '/FIXME:? [^#]/ {print FILENAME ", line " FNR ": FIXME without issue number"}' $FILES 17 | awk '/TODO:? [^#]/ {print FILENAME ", line " FNR ": TODO without issue number"}' $FILES 18 | --------------------------------------------------------------------------------