├── dune-project
├── .gitattributes
├── .merlin
├── .gitmodules
├── src
    ├── lib
    │   ├── dune
    │   ├── sedlexing.mli
    │   └── sedlexing.ml
    └── syntax
    │   ├── dune
    │   ├── sedlex_cset.mli
    │   ├── sedlex.mli
    │   ├── unicode63.mli
    │   ├── sedlex.ml
    │   ├── sedlex_cset.ml
    │   └── ppx_sedlex.ml
├── .github
    └── CODEOWNERS
├── examples
    ├── complement.ml
    ├── bsconfig.json
    ├── subtraction.ml
    ├── performance.ml
    ├── repeat.ml
    ├── dune
    └── tokenizer.ml
├── bsconfig.json
├── .gitignore
├── Makefile
├── LICENSE
├── sedlex.opam
├── scripts
    ├── preversion.sh
    ├── package.js
    ├── release-ppx.sh
    ├── travis.sh
    └── version.sh
├── package.json
├── CHANGES
├── .travis.yml
└── README.md


/dune-project:
--------------------------------------------------------------------------------
1 | (lang dune 1.0)
2 | (name sedlex)
3 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | package-lock.json       linguist-generated -diff
2 | 


--------------------------------------------------------------------------------
/.merlin:
--------------------------------------------------------------------------------
1 | PKG compiler-libs ppx_tools ppx_tools.metaquot
2 | 
3 | SRC src/**
4 | 
5 | B src/**
6 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "ppx-sedlex"]
2 | 	path = ppx-sedlex
3 | 	url = https://github.com/ELLIOTTCABLE/ppx-sedlex.git
4 | 


--------------------------------------------------------------------------------
/src/lib/dune:
--------------------------------------------------------------------------------
1 | (library
2 |  (name sedlex)
3 |  (public_name sedlex)
4 |  (wrapped false)
5 |  (libraries gen uchar)
6 |  (flags :standard -w +A-4-9 -safe-string))
7 | 


--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # These are the default owners for everything in the repo. They will
2 | # be requested for review when someone opens a pull request.
3 | * @alainfrisch @Drup @pmetzger
4 | 


--------------------------------------------------------------------------------
/src/syntax/dune:
--------------------------------------------------------------------------------
 1 | (library
 2 |  (name sedlex_ppx)
 3 |  (public_name sedlex.ppx)
 4 |  (kind ppx_rewriter)
 5 |  (libraries ppx_tools_versioned.metaquot_405 ocaml-migrate-parsetree sedlex)
 6 |  (ppx_runtime_libraries sedlex)
 7 |  (preprocess
 8 |   (pps ppx_tools_versioned.metaquot_405))
 9 |  (flags (:standard -w -9)))
10 | 


--------------------------------------------------------------------------------
/examples/complement.ml:
--------------------------------------------------------------------------------
 1 | let ucase = [%sedlex.regexp? 'A'..'Z']
 2 | let lcase = [%sedlex.regexp? 'a'..'z']
 3 | 
 4 | let rec token buf =
 5 |   match%sedlex buf with
 6 |   | lcase -> print_char 'L';token buf
 7 |   | Compl (ucase | lcase) -> print_char '?'; token buf
 8 |   | ucase -> print_char 'U';token buf
 9 |   | eof -> print_endline "."
10 |   | _ -> assert false
11 | 
12 | let () =
13 |   let lexbuf = Sedlexing.Latin1.from_string "Abc::DefG" in
14 |   token lexbuf
15 | 


--------------------------------------------------------------------------------
/bsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "bs-sedlex",
 3 |   "version": "20.0.0",
 4 |   "sources": [
 5 |     {
 6 |       "dir": "src/lib"
 7 |     }
 8 |   ],
 9 |   "bs-dependencies": [
10 |     "bs-uchar",
11 |     "bs-gen"
12 |   ],
13 |   "package-specs": {
14 |     "module": "commonjs",
15 |     "in-source": true
16 |   },
17 |   "suffix": ".bs.js",
18 |   "bsc-flags": [
19 |     "-open UcharShim",
20 |     "-bs-super-errors",
21 |     "-bs-no-version-header"
22 |   ],
23 |   "refmt": 3
24 | }
25 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.annot
 2 | *.cmo
 3 | *.cma
 4 | *.cmi
 5 | *.cmt
 6 | *.cmti
 7 | *.a
 8 | *.o
 9 | *.cmx
10 | *.cmxs
11 | *.cmxa
12 | *.exe
13 | *.install
14 | **/*.merlin
15 | _build
16 | libdoc
17 | src/syntax/ppx_sedlex
18 | examples/complement
19 | examples/tokenizer
20 | examples/subtraction
21 | 
22 | _opam
23 | 
24 | # BuckleScript
25 | node_modules
26 | **/lib/bs
27 | **/lib/ocaml
28 | *.cmj
29 | *.bs.js
30 | *.mlast*
31 | *.mliast*
32 | 
33 | ppx/
34 | dist/ppx-sedlex-*.zip
35 | .deploy-status.env
36 | 


--------------------------------------------------------------------------------
/examples/bsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |    "name": "bs-sedlex-examples",
 3 |    "sources": [
 4 |       {
 5 |          "dir": "../src/lib"
 6 |       },
 7 |       {
 8 |          "dir": "./",
 9 |          "type": "dev"
10 |       }
11 |    ],
12 |    "ppx-flags": ["ppx-sedlex/ppx.js"],
13 |    "bs-dependencies": ["bs-uchar", "bs-gen"],
14 |    "package-specs": {
15 |       "module": "commonjs",
16 |       "in-source": true
17 |    },
18 |    "suffix": ".bs.js",
19 |    "bsc-flags": ["-open UcharShim", "-bs-super-errors", "-bs-no-version-header"],
20 |    "refmt": 3
21 | }
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # The package sedlex is released under the terms of an MIT-like license.
 2 | # See the attached LICENSE file.
 3 | # Copyright 2005, 2013 by Alain Frisch and LexiFi.
 4 | 
 5 | INSTALL_ARGS := $(if $(PREFIX),--prefix $(PREFIX),)
 6 | 
 7 | .PHONY: build install uninstall clean doc test all
 8 | 
 9 | build:
10 | 	dune build @install
11 | 
12 | install:
13 | 	dune install $(INSTALL_ARGS)
14 | 
15 | uninstall:
16 | 	dune uninstall $(INSTALL_ARGS)
17 | 
18 | clean:
19 | 	dune clean
20 | 
21 | doc:
22 | 	dune build @doc
23 | 
24 | test:
25 | 	dune build @runtest
26 | 
27 | all: build test doc
28 | 


--------------------------------------------------------------------------------
/examples/subtraction.ml:
--------------------------------------------------------------------------------
 1 | let rec token buf =
 2 |   match%sedlex buf with
 3 |     | white_space -> print_endline "\tWhitespace"; token buf
 4 |     | Sub (Chars "ab","b") -> print_endline "a"; token buf
 5 |     | (Chars "ab"|"c") -> print_endline "abc"; token buf
 6 |     | Intersect ("d", Chars "abd") -> print_endline "d"; token buf
 7 |     | eof -> print_endline "\tEnd"
 8 |     | any -> print_endline "Other"; token buf
 9 |     | _ -> failwith "Internal failure: Reached impossible place"
10 | 
11 | 
12 | let () =
13 |   let lexbuf = Sedlexing.Utf8.from_string "a b c d e" in
14 |   token lexbuf
15 | 


--------------------------------------------------------------------------------
/examples/performance.ml:
--------------------------------------------------------------------------------
 1 | let rec token buf =
 2 |   match%sedlex buf with
 3 |   | any -> token buf
 4 |   | eof -> ()
 5 |   | _ -> assert false
 6 | 
 7 | let time f x =
 8 |   let rec acc f x = function
 9 |     | 0 -> f x
10 |     | n -> f x|>ignore; acc f x (n-1) in
11 |   let t = Sys.time() in
12 |   let fx = acc f x 10 in
13 |   Printf.printf "Execution time: %fs\n" (Sys.time() -. t);
14 |   fx
15 | 
16 | let () =
17 |   let long_str = String.make 1000000 '\n' in
18 |   let token_from _ =
19 |     let lexbuf = Sedlexing.Latin1.from_string long_str in
20 |     (* let () = Sedlexing.set_curr_p lexbuf Lexing.dummy_pos in *)
21 |     token lexbuf
22 |   in time token_from long_str
23 | 


--------------------------------------------------------------------------------
/examples/repeat.ml:
--------------------------------------------------------------------------------
 1 | let rec token buf =
 2 |   match%sedlex buf with
 3 |     | white_space -> print_endline "\tWhitespace"; token buf
 4 |     | 'a', Rep(white_space, 1) -> print_endline "a\n\tWhitespace"; token buf
 5 |     | Rep("bc", 2) -> print_endline "bcbc"; token buf
 6 |     | Rep("d", 1 .. 1) -> print_endline "d"; token buf
 7 |     | Rep("ef", 1 .. 3) -> Printf.printf "%s\n" (Sedlexing.Utf8.lexeme buf); token buf 
 8 |     | eof -> print_endline "\tEnd"
 9 |     | any -> print_endline "Other"; token buf
10 |     | _ -> failwith "Internal failure: Reached impossible place"
11 | 
12 | 
13 | let () =
14 |   let lexbuf = Sedlexing.Utf8.from_string "a bcbc d ef efef efefef" in
15 |   token lexbuf
16 | 


--------------------------------------------------------------------------------
/examples/dune:
--------------------------------------------------------------------------------
 1 | (executables
 2 |  (names tokenizer complement subtraction repeat performance)
 3 |  (libraries sedlex)
 4 |  (preprocess
 5 |   (pps sedlex.ppx))
 6 |   (flags :standard -w -39))
 7 | 
 8 | (alias
 9 |  (name runtest)
10 |  (deps
11 |   (:< tokenizer.exe))
12 |  (action
13 |   (run %{<})))
14 | 
15 | (alias
16 |  (name runtest)
17 |  (deps
18 |   (:< complement.exe))
19 |  (action
20 |   (run %{<})))
21 | 
22 | (alias
23 |  (name runtest)
24 |  (deps
25 |   (:< subtraction.exe))
26 |  (action
27 |   (run %{<})))
28 | 
29 | (alias
30 |  (name runtest)
31 |  (deps
32 |   (:< repeat.exe))
33 |  (action
34 |   (run %{<})))
35 | 
36 | (alias
37 |  (name runtest)
38 |  (deps
39 |   (:< performance.exe))
40 |  (action
41 |   (run %{<})))
42 | 


--------------------------------------------------------------------------------
/examples/tokenizer.ml:
--------------------------------------------------------------------------------
 1 | let digit = [%sedlex.regexp? '0'..'9']
 2 | let number = [%sedlex.regexp? Plus digit]
 3 | 
 4 | let rec token buf =
 5 |   let letter = [%sedlex.regexp? 'a'..'z'|'A'..'Z'] in
 6 |   match%sedlex buf with
 7 |   | number -> Printf.printf "Number %s\n" (Sedlexing.Latin1.lexeme buf); token buf
 8 |   | letter, Star ('A'..'Z' | 'a'..'z' | digit) -> Printf.printf "Ident %s\n" (Sedlexing.Latin1.lexeme buf); token buf
 9 |   | Plus xml_blank -> token buf
10 |   | Plus (Chars "+*-/") -> Printf.printf "Op %s\n" (Sedlexing.Latin1.lexeme buf); token buf
11 |   | 128 .. 255 -> print_endline "Non ASCII"
12 |   | eof -> print_endline "EOF"
13 |   | _ -> failwith "Unexpected character"
14 | 
15 | let () =
16 |   let lexbuf = Sedlexing.Latin1.from_string "foobar A123Bfoo  ++123Xbar/foo" in
17 |   token lexbuf
18 | 


--------------------------------------------------------------------------------
/src/syntax/sedlex_cset.mli:
--------------------------------------------------------------------------------
 1 | (* The package sedlex is released under the terms of an MIT-like license. *)
 2 | (* See the attached LICENSE file.                                         *)
 3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi.                       *)
 4 | 
 5 | (** Representation of sets of unicode code points. *)
 6 | 
 7 | type t = (int * int) list
 8 | 
 9 | val min_code: int
10 | val max_code: int
11 | 
12 | val empty: t
13 | val any: t
14 | val union: t -> t -> t
15 | val difference: t -> t -> t
16 | val intersection: t -> t -> t
17 | val is_empty: t -> bool
18 | val eof: t
19 | val singleton: int -> t
20 | val interval: int -> int -> t
21 | 
22 | val letter: t
23 | val digit: t
24 | val extender: t
25 | val base_char: t
26 | val ideographic: t
27 | val combining_char: t
28 | val blank: t
29 | val tr8876_ident_char: t
30 | 
31 | 


--------------------------------------------------------------------------------
/src/syntax/sedlex.mli:
--------------------------------------------------------------------------------
 1 | (* The package sedlex is released under the terms of an MIT-like license. *)
 2 | (* See the attached LICENSE file.                                         *)
 3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi.                       *)
 4 | 
 5 | type regexp
 6 | 
 7 | val chars: Sedlex_cset.t -> regexp
 8 | val seq: regexp -> regexp -> regexp
 9 | val alt: regexp -> regexp -> regexp
10 | val rep: regexp -> regexp
11 | val plus: regexp -> regexp
12 | val eps: regexp
13 | 
14 | val compl: regexp -> regexp option
15 |    (* If the argument is a single [chars] regexp, returns a regexp
16 |       which matches the complement set.  Otherwise returns [None]. *)
17 | val subtract: regexp -> regexp -> regexp option
18 |    (* If each argument is a single [chars] regexp, returns a regexp
19 |       which matches the set (arg1 - arg2).  Otherwise returns [None]. *)
20 | val intersection: regexp -> regexp -> regexp option
21 |    (* If each argument is a single [chars] regexp, returns a regexp
22 |       which matches the intersection set.  Otherwise returns [None]. *)
23 | 
24 | val compile: regexp array -> ((Sedlex_cset.t * int) array * bool array) array
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright 2005, 2014 by Alain Frisch and LexiFi.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/sedlex.opam:
--------------------------------------------------------------------------------
 1 | opam-version: "2.0"
 2 | name: "sedlex"
 3 | synopsis: "An OCaml lexer generator for Unicode"
 4 | description: "
 5 | sedlex is a lexer generator for OCaml. It is similar to ocamllex, but supports
 6 | Unicode. Unlike ocamllex, sedlex allows lexer specifications within regular
 7 | OCaml source files. Lexing specific constructs are provided via a ppx syntax
 8 | extension.
 9 | "
10 | version: "2.0"
11 | license: "MIT"
12 | doc: "https://ocaml-community.github.io/sedlex/index.html"
13 | maintainer: "Alain Frisch <alain.frisch@lexifi.com>"
14 | authors: [
15 |   "Alain Frisch <alain.frisch@lexifi.com>"
16 |   "https://github.com/ocaml-community/sedlex/graphs/contributors"
17 | ]
18 | homepage: "https://github.com/ocaml-community/sedlex"
19 | dev-repo: "git+https://github.com/ocaml-community/sedlex.git"
20 | bug-reports: "https://github.com/ocaml-community/sedlex/issues"
21 | build: [
22 |   ["dune" "subst"] {pinned}
23 |   ["dune" "build" "-p" name "-j" jobs]
24 |   ["dune" "runtest" "-p" name "-j" jobs] {with-test}
25 |   ["dune" "build" "@doc" "-p" name "-j" jobs] {with-doc}
26 | ]
27 | depends: [
28 |   "ocaml" {build & >= "4.02.3"}
29 |   "dune" {build & >= "1.0"}
30 |   "ppx_tools_versioned"
31 |   "ocaml-migrate-parsetree"
32 |   "gen"
33 |   "uchar"
34 | ]
35 | 


--------------------------------------------------------------------------------
/scripts/preversion.sh:
--------------------------------------------------------------------------------
 1 | puts() { printf %s\\n "$@" ;}
 2 | pute() { printf %s\\n "!! $*" >&2 ;}
 3 | argq() { [ $# -gt 0 ] && printf "'%s' " "$@" ;}
 4 | 
 5 | 
 6 | if ! command -v sponge >/dev/null; then
 7 |    pute 'You need `sponge` to use this script!'
 8 |    pute 'Try: `brew install moreutils`'
 9 |    puts '' >&2
10 |    missing_dep=true
11 | fi
12 | 
13 | if [ -n "$missing_dep" ]; then exit 3; fi
14 | 
15 | (
16 |    cd ./ppx-sedlex;
17 |    if ! git diff --exit-code >/dev/null || ! git diff --cached --exit-code >/dev/null; then
18 |       pute 'There appear to be uncommitted changes in `ppx-sedlex`. You should ensure the'
19 |       pute 'submodule is entirely committed and clean before attempting to version-bump'
20 |       pute 'the superproject.'
21 |       puts '' >&2
22 | 
23 |       pute 'Try:'
24 |       puts '' >&2
25 |       puts '      (cd ./ppx-sedlex && git stash save)' >&2
26 |       puts '' >&2
27 |       exit 2
28 |    fi
29 | )
30 | 
31 | branch="$(git rev-parse --abbrev-ref HEAD)"
32 | if [ "$branch" != "master" ]; then
33 |    pute "You're on a topic branch, '$branch'."
34 |    puts '' >&2
35 | 
36 |    pute "You need to merge your work into 'master' before you can version-bump,"
37 |    pute 'commit, and push.'
38 |    puts '' >&2
39 | 
40 |    pute 'Try:'
41 |    puts '' >&2
42 |    puts '      git checkout master' >&2
43 |    puts "      git merge --ff '$branch'" >&2
44 |    puts '' >&2
45 |    exit 2
46 | fi
47 | 


--------------------------------------------------------------------------------
/src/syntax/unicode63.mli:
--------------------------------------------------------------------------------
 1 | module Categories : sig
 2 | 
 3 |   val cc : Sedlex_cset.t
 4 |   val cf : Sedlex_cset.t
 5 |   val cn : Sedlex_cset.t
 6 |   val co : Sedlex_cset.t
 7 |   val cs : Sedlex_cset.t
 8 |   val ll : Sedlex_cset.t
 9 |   val lm : Sedlex_cset.t
10 |   val lo : Sedlex_cset.t
11 |   val lt : Sedlex_cset.t
12 |   val lu : Sedlex_cset.t
13 |   val mc : Sedlex_cset.t
14 |   val me : Sedlex_cset.t
15 |   val mn : Sedlex_cset.t
16 |   val nd : Sedlex_cset.t
17 |   val nl : Sedlex_cset.t
18 |   val no : Sedlex_cset.t
19 |   val pc : Sedlex_cset.t
20 |   val pd : Sedlex_cset.t
21 |   val pe : Sedlex_cset.t
22 |   val pf : Sedlex_cset.t
23 |   val pi : Sedlex_cset.t
24 |   val po : Sedlex_cset.t
25 |   val ps : Sedlex_cset.t
26 |   val sc : Sedlex_cset.t
27 |   val sk : Sedlex_cset.t
28 |   val sm : Sedlex_cset.t
29 |   val so : Sedlex_cset.t
30 |   val zl : Sedlex_cset.t
31 |   val zp : Sedlex_cset.t
32 |   val zs : Sedlex_cset.t
33 | 
34 | end
35 | 
36 | module Properties : sig
37 | 
38 |   val alphabetic       : Sedlex_cset.t
39 |   val ascii_hex_digit  : Sedlex_cset.t
40 |   val hex_digit        : Sedlex_cset.t
41 |   val id_continue      : Sedlex_cset.t
42 |   val id_start         : Sedlex_cset.t
43 |   val lowercase        : Sedlex_cset.t
44 |   val math             : Sedlex_cset.t
45 |   val other_alphabetic : Sedlex_cset.t
46 |   val other_lowercase  : Sedlex_cset.t
47 |   val other_math       : Sedlex_cset.t
48 |   val other_uppercase  : Sedlex_cset.t
49 |   val uppercase        : Sedlex_cset.t
50 |   val white_space      : Sedlex_cset.t
51 |   val xid_continue     : Sedlex_cset.t
52 |   val xid_start        : Sedlex_cset.t
53 | 
54 | end
55 | 


--------------------------------------------------------------------------------
/scripts/package.js:
--------------------------------------------------------------------------------
 1 | // This is run on Travis, and presumably eventually AppVeyor, builds. It's supposed to be as
 2 | // cross-platform as possible, unlike the adjacent release-script, which I only care if *I* can run.
 3 | const current_ppx_sedlex_id = require("ppx-sedlex/identify"),
 4 |    path = require("path"),
 5 |    fs = require("fs"),
 6 |    cpy = require("cpy"),
 7 |    makeDir = require("make-dir"),
 8 |    archiver = require("archiver")
 9 | 
10 | const zipfile = `ppx-sedlex-${current_ppx_sedlex_id}.zip`,
11 |    dist_dir = "dist/",
12 |    submodule_ppx_dir = `ppx-sedlex/ppx-sedlex-${current_ppx_sedlex_id}/`,
13 |    build_dir = "_build/install/default/lib/sedlex/ppx/",
14 |    zip_dir = "ppx/",
15 |    exe = "ppx.exe"
16 | 
17 | // FIXME: Does any of these even work on Windows
18 | ;(async () => {
19 |    // Copy the executable to the submodule,
20 |    console.log(`Copy: ${path.join(build_dir, exe)} -> ${submodule_ppx_dir}`)
21 |    await cpy(path.join(build_dir, exe), submodule_ppx_dir)
22 | 
23 |    // ... and again to the zip-directory,
24 |    console.log(`Copy: ${path.join(build_dir, exe)} -> ${zip_dir}`)
25 |    await cpy(path.join(build_dir, exe), zip_dir)
26 | 
27 |    // Create a zip-archive,
28 |    const dist = await makeDir(dist_dir)
29 |    console.log(`Zip: ${path.join(zip_dir, exe)} >>> ${path.join(dist, zipfile)}`)
30 |    const output = fs.createWriteStream(path.join(dist, zipfile)),
31 |       archive = archiver("zip", { zlib: { level: 9 } })
32 | 
33 |    output.on("close", function() {
34 |       console.log(`>> Zipped: ${archive.pointer()} total bytes`)
35 |    })
36 | 
37 |    archive.pipe(output)
38 |    archive.directory(zip_dir, false)
39 |    archive.finalize()
40 | })()
41 | 


--------------------------------------------------------------------------------
/scripts/release-ppx.sh:
--------------------------------------------------------------------------------
 1 | puts() { printf %s\\n "$@" ;}
 2 | pute() { printf %s\\n "~~ $*" >&2 ;}
 3 | argq() { [ $# -gt 0 ] && printf "'%s' " "$@" ;}
 4 | 
 5 | alias jq="$(npm prefix)/node_modules/node-jq/bin/jq"
 6 | pj() { jq -er "$1" package.json ;}
 7 | 
 8 | 
 9 | # for EXTDEP in jq; do
10 | #    if ! command -v $EXTDEP >/dev/null; then
11 | #       pute 'You need `'"$EXTDEP"'` to use this script!'
12 | #       pute 'Try: `brew install '"$EXTDEP"'`'
13 | #       printf \\n >&2
14 | #       missing_dep=true
15 | #    fi
16 | # done
17 | 
18 | if ! command -v sponge >/dev/null; then
19 |    pute 'You need `sponge` to use this script!'
20 |    pute 'Try: `brew install moreutils`'
21 |    puts '' >&2
22 |    missing_dep=true
23 | fi
24 | 
25 | if [ -n "$missing_dep" ]; then exit 3; fi
26 | 
27 | 
28 | new_version="$(pj .version)"
29 | puts 'New version (from `bs-sedlex/package.json`):  '"v$new_version"
30 | 
31 | old_version="$(cd ./ppx-sedlex; pj .version)"
32 | puts 'Old version (from `ppx-sedlex/package.json`): '"v$old_version"
33 | puts '' >&2
34 | 
35 | if [ "$new_version" = "$old_version" ]; then
36 |    pute 'The version in `bs-sedlex` doesn'\''t appear to have been bumped!'
37 |    puts '' >&2
38 | 
39 |    pute 'This script is intended to be used *after* Travis builds and packages binary'
40 |    pute 'artifacts; try something like the following:'
41 |    puts '' >&2
42 | 
43 |    puts '      git stash save' >&2
44 |    puts '      git checkout master' >&2
45 |    puts '      npm version patch --no-git-tag-version' >&2
46 |    puts '      git add --all' >&2
47 |    puts '      git commit "(rel) Bump to v`jq -r .version package.json`"' >&2
48 |    puts '      git push' >&2
49 |    puts '' >&2
50 |    exit 2
51 | fi
52 | 
53 | puts 'yay!'
54 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "bs-sedlex",
 3 |   "version": "20.0.0",
 4 |   "description": "An OCaml lexer generator for Unicode",
 5 |   "main": "index.js",
 6 |   "directories": {
 7 |     "example": "examples"
 8 |   },
 9 |   "dependencies": {
10 |     "bs-gen": "^0.5.2",
11 |     "bs-uchar": "^2.0.0"
12 |   },
13 |   "devDependencies": {
14 |     "archiver": "^3.1.1",
15 |     "bs-platform": "^5.2.1",
16 |     "cpy": "^7.3.0",
17 |     "foreach-cli": "^1.8.1",
18 |     "make-dir": "^1.3.0",
19 |     "node-jq": "^1.10.3",
20 |     "ppx-sedlex": "=20.0.0"
21 |   },
22 |   "peerDependencies": {
23 |     "bs-platform": ">=5.0.0"
24 |   },
25 |   "scripts": {
26 |     "clean:examples": "npm run clean:ppx && cd examples && bsb -clean-world",
27 |     "build:examples": "npm run build:ppx && cd examples && bsb -make-world",
28 |     "clean:runtime": "bsb -clean-world",
29 |     "build:runtime": "bsb -make-world",
30 |     "clean:ppx": "dune clean",
31 |     "build:ppx": "dune build && node scripts/package.js",
32 |     "clean": "npm run clean:runtime && npm run clean:ppx",
33 |     "build": "npm run build:runtime && npm run build:ppx",
34 |     "prepare": "npm run clean && npm run build:runtime",
35 |     "preversion": "./scripts/preversion.sh",
36 |     "version": "./scripts/version.sh",
37 |     "test": "foreach -g 'examples/*.bs.js' -x 'node #{path}'",
38 |     "travis": "./scripts/travis.sh"
39 |   },
40 |   "files": [
41 |     "bsconfig.json",
42 |     "sedlex.opam",
43 |     "examples/**/*.ml",
44 |     "examples/**/*.mli",
45 |     "examples/**/*.bs.js",
46 |     "src/**/*.ml",
47 |     "src/**/*.mli",
48 |     "src/**/*.bs.js",
49 |     "!**/dune",
50 |     "!**/.merlin"
51 |   ],
52 |   "repository": {
53 |     "type": "git",
54 |     "url": "git+https://github.com/ELLIOTTCABLE/bs-sedlex.git"
55 |   },
56 |   "author": "Alain Frisch <alain.frisch@lexifi.com>",
57 |   "license": "MIT",
58 |   "bugs": {
59 |     "url": "https://github.com/ELLIOTTCABLE/bs-sedlex/issues"
60 |   },
61 |   "homepage": "https://github.com/ELLIOTTCABLE/bs-sedlex#readme"
62 | }
63 | 


--------------------------------------------------------------------------------
/CHANGES:
--------------------------------------------------------------------------------
 1 | 2.0
 2 |      * GPR#70: Switch to dune, opam v2
 3 |      * GPR#60: Breaking change: switch from int codepoints to Uchar.t
 4 |        codepoints
 5 |      * GPR#59: Track lexing position
 6 | 
 7 | -------------------------------------------------------------------------------
 8 | 
 9 | 1.99.4
10 |      * GPR#47: Switch to ocaml-migrate-parsetree (contributed by Adrien Guatto)
11 |      * GPR#42: Added 'Rep' (repeat operator) (contributed by jpathy)
12 | 
13 | 1.99.3
14 |      * Update to work with 4.03 (4.02 still supported)
15 | 
16 | 1.99.2
17 |      * First official release of sedlex
18 | 
19 | 1.99.1
20 |      * Support for new Ast_mapper registration API, follow OCaml trunk after
21 |        the inclusion of the extension_point branch
22 | 
23 | 1.99
24 |      * First version of sedlex.  The history below refers to ulex, the ancestor
25 |        or sedlex implemented with Camlp4.
26 | 
27 | -------------------------------------------------------------------------------
28 | 
29 | 1.1
30 |      * Generate (more) globally unique identifiers to avoid conflicts when open'ing another module
31 |        processed by ulex (issue reported by Gerd Stolpmann)
32 | 
33 | 1.0
34 |      * Update to the new Camlp4 and to ocamlbuild (release for OCaml 3.10
35 |        only), by Nicolas Pouillard.
36 | 
37 | 0.8
38 |      * Really make it work with OCaml 3.09.
39 |      * Support for Utf-16.
40 | 
41 | 0.7 released May 24 2005
42 |      * Bug fixes
43 |      * Update to OCaml 3.09 (currently CVS). Still works with OCaml 3.08.
44 |      * MIT-like license (used to LGPL)
45 | 
46 | 0.5 release Jul. 8 2004
47 |      * Document how to use a custom implementation for lex buffers
48 |      * Update to OCaml 3.08
49 | 
50 | 0.4 released Jan. 10 2004
51 |      * Bug fix (accept 1114111 as valid Unicode code point)
52 |      * Add the rollback function
53 | 
54 | 0.3 released Oct. 8 2003
55 |      * Bug fix
56 |      * Add a new predefined class for ISO identifiers
57 | 
58 | 0.2 released Sep. 22 2003
59 |      * Changed the names of predefined regexp
60 |      * Fix max_code = 0x10ffff
61 |      * Lexers that changes encoding on the fly
62 |      * Documentation of the interface Ulexing
63 | 
64 | 0.1 released Sep. 20 2003
65 |      * Initial release
66 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: required
 2 | git:
 3 |    depth: 3
 4 | 
 5 | language: node_js
 6 | node_js:
 7 |    - "10"
 8 | 
 9 | cache:
10 |    directories:
11 |       - $HOME/.opam
12 |       - $HOME/.npm
13 | 
14 | install: STAGE=install npm run travis
15 | 
16 | script: STAGE=test npm run travis
17 | 
18 | jobs:
19 |    include:
20 |       - stage: deploy
21 |         os: linux
22 |         env: COMPONENT=ppx-examples-test
23 |         script: STAGE=deploy npm run travis
24 |         deploy: &gh-release
25 |            provider: releases
26 |            on:
27 |               tags: true
28 |            skip_cleanup: true
29 |            api_key:
30 |               secure: FXlYB0nsXDE2DXf+8zAEH7j2pnRxRIHiHZFN0pCDQqTkfOSKHIfhZ1+L0JD8vkV7GtXDBYAiFmVlYZyPTOWSDHKTWROdpe6qsJsV5EJXmREIjmgMUI14SDJITtyXSE7cezPVtKdjwprJtZUHIvKPbFYscM0DF0ayT0NwFqz01aE/DIQk2AWrzcaDzNSnCsYYoJsDtetOLmWXeABHV/ohfrEF1bfurqk8hSLdhDirE7xZm2gr+KgY8tYP3geU4hqVX4tk9xydnxiWGj3N7Z7CBmOLAXDm2sXJT8fjAN/DaorO7EPEZQw4tScmTigGQKrJ0jvNtJ6QpvG3sthFyxpkuhvW0+YGNdco9PcK8aoJGztwbi0+Fc54CfIUGtN/hxTsGWopUOGAAV3R0o91TXB25oNZlzzeSVZx2tbyDHqeP1w3l/9/bLbejBQcgBh0vEx6QDsCZT/c8i/QVOozZD6y50BmaSRdPu/QtAs3IU7cQLCz42l2IYH3gDTjpy1zpblZaHabQ6xfzl95PiodXKekHblypgOWgc+5QNuUo1+kXqIa84KlV4G67EOFKCR+QvnHpVgX9PAnw5K9G4Fp6Uc5og13verN/phSTeW5R08AGx9T5ieEz39g4FrwZSnQzqzbWBwygt9NHHiauUxETI44n3haLgFj0gQVnshZWn4ZSsg=
31 |            file_glob: true
32 |            file: dist/*
33 |            overwrite: true
34 |            draft: true
35 |       - os: linux
36 |         env: COMPONENT=ppx-examples-test NPM_TAG=dev
37 |         script: STAGE=deploy npm run travis
38 |         deploy: *gh-release
39 |       - os: osx
40 |         env: COMPONENT=ppx-examples-test
41 |         script: STAGE=deploy npm run travis
42 |         deploy: *gh-release
43 |       - os: osx
44 |         env: COMPONENT=ppx-examples-test NPM_TAG=dev
45 |         script: STAGE=deploy npm run travis
46 |         deploy: *gh-release
47 | 
48 | env:
49 |    - COMPONENT=ppx-examples-test
50 |    - COMPONENT=ppx-examples-test NPM_TAG=latest
51 |    - COMPONENT=ppx-examples-test NPM_TAG=dev
52 |    - COMPONENT=runtime
53 |    - COMPONENT=runtime NPM_TAG=latest
54 |    - COMPONENT=runtime NPM_TAG=dev
55 | 
56 | # Allow failures on the upcoming major version of BuckleScript
57 | #matrix:
58 | #   fast_finish: true
59 | #   allow_failures:
60 | #      - env: COMPONENT=ppx-examples-test NPM_TAG=dev
61 | #      - env: COMPONENT=runtime NPM_TAG=dev
62 | 
63 | os:
64 |    - linux
65 |    - osx
66 | 


--------------------------------------------------------------------------------
/scripts/travis.sh:
--------------------------------------------------------------------------------
  1 | set -e
  2 | 
  3 | puts() { printf %s\\n "$@" ;}
  4 | pute() { printf %s\\n "~~ $*" >&2 ;}
  5 | argq() { [ $# -gt 0 ] && printf "'%s' " "$@" ;}
  6 | 
  7 | # An alternative to `set +o xtrace` that doesn't print the unset line.
  8 | x () { puts '`` '"$*" >&2 ; "$@" || exit $? ;}
  9 | 
 10 | if [ -z "$STAGE" ] || [ -z "$COMPONENT" ]; then
 11 |    pute 'This script is intended to be called in a CI environment with a current $STAGE,'
 12 |    pute 'and a $COMPONENT to build. e.g.'
 13 | 
 14 |    puts '' >&2
 15 |    puts '      STAGE=install COMPONENT=runtime npm run travis' >&2
 16 |    puts '' >&2
 17 |    exit 2
 18 | fi
 19 | 
 20 | # Helpers
 21 | # -------
 22 | install_matching_ocaml() {
 23 |    x wget https://raw.githubusercontent.com/ocaml/ocaml-ci-scripts/master/.travis-ocaml.sh
 24 | 
 25 |    BSC_VERSION="$(bsc -vnum)"
 26 | 
 27 |    case "$BSC_VERSION" in
 28 |       *OCaml*4.02.3*) export OCAML_VERSION=4.02 OPAM_SWITCH=ocaml-base-compiler.4.02.3;;
 29 |       *OCaml*4.06.1*) export OCAML_VERSION=4.06 OPAM_SWITCH=ocaml-base-compiler.4.06.1;;
 30 |       *)
 31 |          pute 'Unrecognized `bsc` version: '"$BSC_VERSION"
 32 |          exit 10
 33 |       ;;
 34 |    esac
 35 | 
 36 |    puts 'export OPAM_SWITCH='"$OPAM_SWITCH"
 37 |    x bash -ex .travis-ocaml.sh
 38 |    x opam pin -y -n add sedlex .
 39 |    x opam install -y --deps-only sedlex
 40 |    eval `opam config env`
 41 | 
 42 |    if [ -n "$VERBOSE" ]; then
 43 |       x opam config env
 44 |       x opam list
 45 |    fi
 46 | }
 47 | 
 48 | # Stages
 49 | # ------
 50 | stage_install() {
 51 |    # Enable configurable debugging without adding new commits. (If something goes wrong,
 52 |    # you can set $VERBOSE to some value inside Travis's configuration, and then hit
 53 |    # "rebuild".)
 54 |    if [ -n "$VERBOSE" ]; then
 55 |       x npm config set loglevel verbose
 56 |    fi
 57 | 
 58 |    x git fetch --tags --no-recurse-submodules
 59 | 
 60 |    # Install npm dependencies, but avoid invoking our own `prepare` script
 61 |    x npm ci --ignore-scripts
 62 | 
 63 |    # A horrible hack is necessary here, with which we *manually* apply any `postinstall`
 64 |    # scripts of transitive dependencies. Yuck.
 65 |    #
 66 |    # Remember to add any new dependencies! No, I don't know an easy way to remember to do
 67 |    # this, and I'm probably going to forget. Sorry, future-me, when you figure out that
 68 |    # this is what's screwing up the build.
 69 |    #
 70 |    # See: <https://npm.community/t/ignore-scripts-ignores-all-scripts/1403/10>
 71 |    x npm rebuild node-jq
 72 | 
 73 |    # Now we either select a particular `bs-platform` to install, or manually process the
 74 |    # `postinstall` script of the one we installed above.
 75 |    if [ -n "$NPM_TAG" ]; then
 76 |       x npm install "bs-platform@$NPM_TAG"
 77 |    else
 78 |       x npm rebuild bs-platform
 79 |    fi
 80 | 
 81 |    # ‘Install’ our own local `ppx-sedlex` version.
 82 |    x npm link ./ppx-sedlex
 83 | 
 84 |    # Finally, for everything except the `runtime`, we need a working OCaml installation of
 85 |    # the same version as the BuckleScript we just built.
 86 |    if [ "$COMPONENT" != "runtime" ]; then install_matching_ocaml; fi
 87 | }
 88 | 
 89 | stage_test() {
 90 |    case "$COMPONENT" in
 91 |       runtime)
 92 |          x npm run --silent build:runtime
 93 |       ;;
 94 |       ppx-examples-test)
 95 |          eval `opam config env`
 96 | 
 97 |          x npm run --silent build:ppx
 98 |          (cd examples && x bsb -make-world)
 99 |          x npm run --silent test
100 |       ;;
101 |    esac
102 | }
103 | 
104 | stage_deploy() {
105 |    eval `opam config env`
106 | 
107 |    x npm run --silent clean:ppx
108 |    x npm run --silent build:ppx
109 | }
110 | 
111 | # Invocation
112 | # ----------
113 | case "$STAGE" in
114 |    install) stage_install ;;
115 |    test) stage_test ;;
116 |    deploy) stage_deploy ;;
117 | esac
118 | 


--------------------------------------------------------------------------------
/src/syntax/sedlex.ml:
--------------------------------------------------------------------------------
  1 | (* The package sedlex is released under the terms of an MIT-like license. *)
  2 | (* See the attached LICENSE file.                                         *)
  3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi.                       *)
  4 | 
  5 | module Cset = Sedlex_cset
  6 | 
  7 | (* NFA *)
  8 | 
  9 | type node = {
 10 |   id : int;
 11 |   mutable eps : node list;
 12 |   mutable trans : (Cset.t * node) list;
 13 | }
 14 | 
 15 | (* Compilation regexp -> NFA *)
 16 | 
 17 | type regexp = node -> node
 18 | 
 19 | let cur_id = ref 0
 20 | let new_node () =
 21 |   incr cur_id;
 22 |   { id = !cur_id; eps = []; trans = [] }
 23 | 
 24 | let seq r1 r2 succ = r1 (r2 succ)
 25 | 
 26 | let is_chars final = function
 27 |   | {eps = []; trans = [c, f]} when f == final -> Some c
 28 |   | _ -> None
 29 | 
 30 | let chars c succ =
 31 |   let n = new_node () in
 32 |   n.trans <- [c,succ];
 33 |   n
 34 | 
 35 | let alt r1 r2 succ =
 36 |   let nr1 = r1 succ and nr2 = r2 succ in
 37 |   match is_chars succ nr1, is_chars succ nr2 with
 38 |   | Some c1, Some c2 -> chars (Cset.union c1 c2) succ
 39 |   | _ ->
 40 |     let n = new_node () in
 41 |     n.eps <- [r1 succ; r2 succ];
 42 |     n
 43 | 
 44 | let rep r succ =
 45 |   let n = new_node () in
 46 |   n.eps <- [r n; succ];
 47 |   n
 48 | 
 49 | let plus r succ =
 50 |   let n = new_node () in
 51 |   let nr = r n in
 52 |   n.eps <- [nr; succ];
 53 |   nr
 54 | 
 55 | let eps succ = succ (* eps for epsilon *)
 56 | 
 57 | let compl r =
 58 |   let n = new_node () in
 59 |   match is_chars n (r n) with
 60 |   | Some c ->
 61 |     Some (chars (Cset.difference Cset.any c))
 62 |   | _ ->
 63 |     None
 64 | 
 65 | let pair_op f r0 r1 = (* Construct subtract or intersection *)
 66 |   let n = new_node () in
 67 |   let to_chars r = is_chars n (r n) in
 68 |   match to_chars r0, to_chars r1 with
 69 |   | Some c0, Some c1 ->
 70 |     Some (chars (f c0 c1))
 71 |   | _ ->
 72 |     None
 73 | 
 74 | let subtract = pair_op Cset.difference
 75 | 
 76 | let intersection = pair_op Cset.intersection
 77 | 
 78 | let compile_re re =
 79 |   let final = new_node () in
 80 |   (re final, final)
 81 | 
 82 | (* Determinization *)
 83 | 
 84 | type state = node list
 85 |       (* A state of the DFA corresponds to a set of nodes in the NFA. *)
 86 | 
 87 | let rec add_node state node =
 88 |   if List.memq node state then state else add_nodes (node::state) node.eps
 89 | and add_nodes state nodes =
 90 |   List.fold_left add_node state nodes
 91 | 
 92 | 
 93 | let transition (state : state) =
 94 |   (* Merge transition with the same target *)
 95 |   let rec norm = function
 96 |     | (c1, n1)::((c2, n2)::q as l) ->
 97 | 	if n1 == n2 then norm ((Cset.union c1 c2, n1)::q)
 98 | 	else (c1, n1)::(norm l)
 99 |     | l -> l in
100 |   let t = List.concat (List.map (fun n -> n.trans) state) in
101 |   let t = norm (List.sort (fun (_, n1) (_, n2) -> n1.id - n2.id) t) in
102 | 
103 |   (* Split char sets so as to make them disjoint *)
104 |   let split (all, t) (c0, n0) =
105 |     let t =
106 |       (Cset.difference c0 all, [n0]) ::
107 |       List.map (fun (c, ns) -> (Cset.intersection c c0, n0::ns)) t @
108 |       List.map (fun (c, ns) -> (Cset.difference c c0, ns)) t
109 |     in
110 |     Cset.union all c0,
111 |     List.filter (fun (c, _) -> not (Cset.is_empty c)) t
112 |   in
113 | 
114 |   let (_,t) = List.fold_left split (Cset.empty,[]) t in
115 | 
116 |   (* Epsilon closure of targets *)
117 |   let t = List.map (fun (c, ns) -> (c, add_nodes [] ns)) t in
118 | 
119 |   (* Canonical ordering *)
120 |   let t = Array.of_list t in
121 |   Array.sort (fun (c1, _) (c2, _) -> compare c1 c2) t;
122 |   t
123 | 
124 | let compile rs =
125 |   let rs = Array.map compile_re rs in
126 |   let counter = ref 0 in
127 |   let states = Hashtbl.create 31 in
128 |   let states_def = Hashtbl.create 31 in
129 |   let rec aux state =
130 |     try Hashtbl.find states state
131 |     with Not_found ->
132 |       let i = !counter in
133 |       incr counter;
134 |       Hashtbl.add states state i;
135 |       let trans = transition state in
136 |       let trans = Array.map (fun (p, t) -> (p, aux t)) trans in
137 |       let finals = Array.map (fun (_, f) -> List.memq f state) rs in
138 |       Hashtbl.add states_def i (trans, finals);
139 |       i
140 |   in
141 |   let init = ref [] in
142 |   Array.iter (fun (i,_) -> init := add_node !init i) rs;
143 |   let i = aux !init in
144 |   assert(i = 0);
145 |   Array.init !counter (Hashtbl.find states_def)
146 | 


--------------------------------------------------------------------------------
/scripts/version.sh:
--------------------------------------------------------------------------------
  1 | puts() { printf %s\\n "$@" ;}
  2 | pute() { printf %s\\n "!! $*" >&2 ;}
  3 | argq() { [ $# -gt 0 ] && printf "'%s' " "$@" ;}
  4 | 
  5 | jq_bin="$(npm prefix)/node_modules/node-jq/bin/jq"
  6 | pj() {
  7 |    "$jq_bin" -e "$@" package.json
  8 | }
  9 | jqinplace() {
 10 |    file="$2"
 11 |    "$jq_bin" -e "$@" | sponge "$file"
 12 | }
 13 | 
 14 | dump_status() {
 15 |    cat <<EOF > .deploy-status.env
 16 | STEP="$STEP"
 17 | NEW_VERSION="$NEW_VERSION"
 18 | OLD_VERSION="$OLD_VERSION"
 19 | EOF
 20 | }
 21 | 
 22 | 
 23 | # Script setup & resumption
 24 | # ------ ----- - ----------
 25 | 
 26 | if [ -r .deploy-status.env ]; then
 27 |    puts ''
 28 |    puts 'These values were found in `.deploy-status.env`:'
 29 |    puts ''
 30 |    cat .deploy-status.env
 31 |    puts ''
 32 | 
 33 |    printf %s 'Load them and attempt to continue an aborted deployment? [Y/n] '; read yn
 34 |    if [ "$yn" != "${yn#[Yy]}" ]; then
 35 |       source .deploy-status.env
 36 |       export STEP NEW_VERSION OLD_VERSION
 37 |    else
 38 |       rm .deploy-status.env
 39 |       exit 3
 40 |    fi
 41 | fi
 42 | 
 43 | if [ -z "$STEP" ]; then
 44 |    if [ $# -ge 1 ]; then
 45 |       STEP="$1"
 46 |    else
 47 |       STEP=1
 48 |    fi
 49 | fi
 50 | 
 51 | if [ -z "$NEW_VERSION" ]; then
 52 |    if [ $# -ge 2 ]; then
 53 |       NEW_VERSION="$2"
 54 |       puts 'New version: '"v$NEW_VERSION"
 55 |    else
 56 |       NEW_VERSION="$(pj -r .version)"
 57 |       puts 'New version (from `bs-sedlex/package.json`):  '"v$NEW_VERSION"
 58 |    fi
 59 | fi
 60 | 
 61 | if [ -z "$OLD_VERSION" ]; then
 62 |    if [ $# -ge 3 ]; then
 63 |       OLD_VERSION="$2"
 64 |       puts 'Old version: '"v$OLD_VERSION"
 65 |    else
 66 |       OLD_VERSION="$(cd ./ppx-sedlex; pj -r .version)"
 67 |       puts 'Old version (from `ppx-sedlex/package.json`): '"v$OLD_VERSION"
 68 |    fi
 69 | fi
 70 | 
 71 | if [ "$NEW_VERSION" = "$OLD_VERSION" ]; then
 72 |    pute 'The version in `bs-sedlex` doesn'\''t appear to have been bumped!'
 73 |    puts '' >&2
 74 | 
 75 |    pute 'You may have called this directly by accident; this script is intended to be'
 76 |    pute 'called from the `version` lifecycle. Try the following, instead:'
 77 |    puts '' >&2
 78 |    puts '      npm version patch --no-git-tag-version' >&2
 79 |    puts '' >&2
 80 |    exit 2
 81 | fi
 82 | 
 83 | 
 84 | # Steps
 85 | # -----
 86 | step_1() {
 87 |    STEP=1 && dump_status
 88 | 
 89 |    puts ''
 90 |    printf %s 'Update bsconfig.json to match? [Y/n] '; read yn
 91 |    if [ "$yn" != "${yn#[Yy]}" ]; then
 92 |       jqinplace ".version = \"$NEW_VERSION\"" bsconfig.json || exit 127
 93 |    else
 94 |       exit 4
 95 |    fi
 96 | 
 97 |    step_2
 98 | }
 99 | 
100 | 
101 | step_2() {
102 |    STEP=2 && dump_status
103 | 
104 |    puts ''
105 |    printf %s 'Update ppx-sedlex/package.json to match? [Y/n] '; read yn
106 |    if [ "$yn" != "${yn#[Yy]}" ]; then
107 |       (  cd ./ppx-sedlex && \
108 |          npm version --no-git-tag-version "$NEW_VERSION" && \
109 |          git add package.json bsconfig.json package-lock.json) || exit 127
110 |    else
111 |       exit 4
112 |    fi
113 | 
114 |    step_3
115 | }
116 | 
117 | 
118 | step_3() {
119 |    STEP=3 && dump_status
120 | 
121 |    puts ''
122 |    printf %s 'Remove previous artifacts and checksums from ppx-sedlex? [Y/n] '; read yn
123 |    if [ "$yn" != "${yn#[Yy]}" ]; then
124 |       (  cd ./ppx-sedlex &&
125 |          git clean -Xdf;
126 |       git rm SHASUM256.txt )
127 |    else
128 |       exit 4
129 |    fi
130 | 
131 |    step_4
132 | }
133 | 
134 | 
135 | step_4() {
136 |    STEP=4 && dump_status
137 | 
138 |    puts ''
139 |    (  cd ./ppx-sedlex && \
140 |       git status )
141 |    printf %s 'Commit new, unpopulated version of ppx-sedlex with these changes? [Y/n] '; read yn
142 |    if [ "$yn" != "${yn#[Yy]}" ]; then
143 |       (  cd ./ppx-sedlex && \
144 |          git commit -m "(- rel NF) Prep for v$NEW_VERSION" ) || exit 127
145 |       git add ./ppx-sedlex || exit 127
146 |    else
147 |       exit 4
148 |    fi
149 | 
150 |    step_5
151 | }
152 | 
153 | 
154 | step_5() {
155 |    STEP=5 && dump_status
156 | 
157 |    puts ''
158 |    git add package.json package-lock.json bsconfig.json || exit 127
159 |    git status
160 |    printf %s 'Further, commit these changes to a new version of bs-sedlex as well? [Y/n] '; read yn
161 |    if [ "$yn" != "${yn#[Yy]}" ]; then
162 |       git commit -m "(rel) v$NEW_VERSION" || exit 127
163 |    else
164 |       exit 4
165 |    fi
166 | 
167 |    step_6
168 | }
169 | 
170 | 
171 | step_6() {
172 |    STEP=6 && dump_status
173 | 
174 |    puts ''
175 |    git --no-pager show
176 |    printf %s "Tag this changeset as 'v$NEW_VERSION'? [Y/n] "; read yn
177 |    if [ "$yn" != "${yn#[Yy]}" ]; then
178 |       git tag "v$NEW_VERSION" --file=- <<MSG || exit 127
179 | Version $NEW_VERSION of Sedlexing, packaged for BuckleScript.
180 | 
181 | Of note, the corresponding version of \`ppx-sedlex\` is *not* included, due to a recursively mutual dependency. These packages are, however, properly packaged on npm:
182 | 
183 |     npm install --save bs-sedlex@$NEW_VERSION
184 |     npm install --save-dev ppx-sedlex@$NEW_VERSION
185 | 
186 | If you wish to acquire them via git, ensure you manually pull the matching version of my \`ppx-sedlex\` package! The submodule linkage recorded in the repository will be perpetually one release behind.
187 | MSG
188 |    else
189 |       exit 4
190 |    fi
191 | 
192 |    step_7
193 | }
194 | 
195 | 
196 | step_7() {
197 |    STEP=7 && dump_status
198 | 
199 |    puts ''
200 |    printf %s 'Push these changes to GitHub, so Travis can start building the artifacts? [Y/n] '; read yn
201 |    if [ "$yn" != "${yn#[Yy]}" ]; then
202 |       (  cd ./ppx-sedlex && \
203 |          git push ) || exit 127
204 |       git push --tags || exit 127
205 |       git push || exit 127
206 |    else
207 |       exit 4
208 |    fi
209 | 
210 |    step_8
211 | }
212 | 
213 | 
214 | step_8() {
215 |    STEP=8 && dump_status
216 | 
217 |    puts ''
218 |    puts "When the Travis build is done, if successful, it'll upload the artifacts to a GitHub 'draft.'"
219 |    puts "You'll need to visit the Releases page and hit 'Publish release' on that draft, before continuing:"
220 |    puts '   https://github.com/ELLIOTTCABLE/bs-deriving/releases'
221 |    puts ''
222 | 
223 |    printf %s 'Start checking for those build artifacts on GitHub? [Y/n] '; read yn
224 |    if [ "$yn" != "${yn#[Yy]}" ]; then
225 |       until (
226 |          cd ./ppx-sedlex && \
227 |          make SHASUM256.txt
228 |       ); do puts ''; sleep 15; done
229 |    else
230 |       exit 4
231 |    fi
232 | 
233 |    step_9
234 | }
235 | 
236 | 
237 | step_9() {
238 |    STEP=9 && dump_status
239 | 
240 |    puts ''
241 |    (  cd ./ppx-sedlex && \
242 |       git add SHASUM256.txt ) || exit 127
243 |    (  cd ./ppx-sedlex && \
244 |       git --no-pager diff --cached )
245 |    printf %s 'Success! Now, commit the updated checksum to ppx-sedlex, and tag? [Y/n] '; read yn
246 |    if [ "$yn" != "${yn#[Yy]}" ]; then
247 |       (  cd ./ppx-sedlex && \
248 |          git commit -m "(rel) v$NEW_VERSION" && \
249 |          git tag "v$NEW_VERSION" ) || exit 127
250 |       git add ./ppx-sedlex || exit 127
251 |    else
252 |       exit 4
253 |    fi
254 | 
255 |    step_10
256 | }
257 | 
258 | 
259 | step_10() {
260 |    STEP=10 && dump_status
261 | 
262 |    puts ''
263 |    (  cd ./ppx-sedlex && \
264 |       npm pack && \
265 |       rm "ppx-sedlex-$NEW_VERSION.tgz" ) || exit 127
266 |    printf %s "Publish 'ppx-sedlex@$NEW_VERSION' to npm? [Y/n] "; read yn
267 |    if [ "$yn" != "${yn#[Yy]}" ]; then
268 |       (  cd ./ppx-sedlex && \
269 |          npm publish ) || exit 127
270 |    else
271 |       exit 4
272 |    fi
273 | 
274 |    step_11
275 | }
276 | 
277 | 
278 | step_11() {
279 |    STEP=11 && dump_status
280 | 
281 |    puts ''
282 |    printf %s "Update superproject's dependency on ppx-sedlex? [Y/n] "; read yn
283 |    if [ "$yn" != "${yn#[Yy]}" ]; then
284 |       jqinplace '.devDependencies["ppx-sedlex"] = "=" + .version' package.json || exit 127
285 |       rm -rf node_modules/ppx-sedlex && \
286 |          npm install --package-lock-only && \
287 |          ln -s ../ppx-sedlex node_modules/ppx-sedlex || exit 127
288 |       git add package.json package-lock.json || exit 127
289 |    else
290 |       exit 4
291 |    fi
292 | 
293 |    step_12
294 | }
295 | 
296 | 
297 | step_12() {
298 |    STEP=12 && dump_status
299 | 
300 |    puts ''
301 |    git status
302 |    printf %s 'Commit updated dependency and checksummed ppx to superproject? [Y/n] '; read yn
303 |    if [ "$yn" != "${yn#[Yy]}" ]; then
304 |       git commit -m "(- rel up sub) Fast-forward to artifact-checksummed v$NEW_VERSION of ppx" || exit 127
305 |    else
306 |       exit 4
307 |    fi
308 | 
309 |    step_13
310 | }
311 | 
312 | 
313 | step_13() {
314 |    STEP=13 && dump_status
315 | 
316 |    puts ''
317 |    printf %s 'Push the final, linked commits upstream? [Y/n] '; read yn
318 |    if [ "$yn" != "${yn#[Yy]}" ]; then
319 |       (  cd ./ppx-sedlex && \
320 |          git push && \
321 |          git push --tags ) || exit 127
322 |       git push || exit 127
323 |    else
324 |       exit 4
325 |    fi
326 | 
327 |    step_14
328 | }
329 | 
330 | 
331 | step_14() {
332 |    STEP=14 && dump_status
333 | 
334 |    puts ''
335 |    npm pack && \
336 |       rm "bs-sedlex-$NEW_VERSION.tgz" || exit 127
337 |    printf %s "Publish 'bs-sedlex@$NEW_VERSION' to npm? [Y/n] "; read yn
338 |    if [ "$yn" != "${yn#[Yy]}" ]; then
339 |       npm publish || exit 127
340 |    else
341 |       exit 4
342 |    fi
343 | 
344 |    # there is no step 15! ... lol.
345 |    rm .deploy-status.env
346 | }
347 | 
348 | 
349 | # Step dispatch
350 | # ---- --------
351 | eval "'step_$STEP'"
352 | 


--------------------------------------------------------------------------------
/src/lib/sedlexing.mli:
--------------------------------------------------------------------------------
  1 | (* The package sedlex is released under the terms of an MIT-like license. *)
  2 | (* See the attached LICENSE file.                                         *)
  3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi.                       *)
  4 | 
  5 | (** Runtime support for lexers generated by [sedlex]. *)
  6 | 
  7 | (** This module is roughly equivalent to the module Lexing from the
  8 |     OCaml standard library, except that its lexbuffers handle Unicode
  9 |     code points (OCaml type: {!Uchar.t} in the range [0..0x10ffff])
 10 |     instead of bytes (OCaml type: [char]).
 11 | 
 12 |     It is possible to have sedlex-generated lexers work on a custom
 13 |     implementation for lex buffers. To do this, define a module [L]
 14 |     which implements the [start], [next], [mark] and [backtrack]
 15 |     functions (See the Internal Interface section below for a
 16 |     specification).  They need not work on a type named [lexbuf]: you
 17 |     can use the type name you want. Then, just do in your
 18 |     sedlex-processed source, bind this module to the name [Sedlexing]
 19 |     (for instance, with a local module definition: [let module Sedlexing
 20 |     = L in ...].
 21 | 
 22 |     Of course, you'll probably want to define functions like [lexeme] to
 23 |     be used in the lexers semantic actions.  *)
 24 | 
 25 | type lexbuf
 26 |       (** The type of lexer buffers. A lexer buffer is the argument passed
 27 |           to the scanning functions defined by the generated lexers.
 28 |           The lexer buffer holds the internal information for the
 29 |           scanners, including the code points of the token currently scanned,
 30 |           its position from the beginning of the input stream,
 31 |           and the current position of the lexer. *)
 32 | 
 33 | exception InvalidCodepoint of int
 34 |     (** Raised by some functions to signal that some code point is not
 35 |         compatible with a specified encoding. *)
 36 | 
 37 | exception MalFormed
 38 |     (** Raised by functions in the [Utf8] and [Utf16] modules to report
 39 |         strings which do not comply to the encoding. *)
 40 | 
 41 | (** {6 Creating generic lexbufs} *)
 42 | 
 43 | val create: (Uchar.t array -> int -> int -> int) -> lexbuf
 44 |     (** Create a generic lexer buffer.  When the lexer needs more
 45 |         characters, it will call the given function, giving it an array of
 46 |         Uchars [a], a position [pos] and a code point count [n].  The
 47 |         function should put [n] code points or less in [a], starting at
 48 |         position [pos], and return the number of characters provided. A
 49 |         return value of 0 means end of input. *)
 50 | 
 51 | val set_position: lexbuf -> Lexing.position -> unit
 52 |     (** set the initial tracked input position for [lexbuf].
 53 |         If set to [Lexing.dummy_pos], Sedlexing will not track position
 54 |         information for you. *)
 55 | 
 56 | val set_filename: lexbuf -> string -> unit
 57 |     (** [set_filename lexbuf file] sets the filename to [file] in
 58 |         [lexbuf]. It also sets the {!Lexing.pos_fname} field in
 59 |         returned {!Lexing.position} records. *)
 60 | 
 61 | val from_gen: Uchar.t Gen.t -> lexbuf
 62 |     (** Create a lexbuf from a stream of Unicode code points. *)
 63 | 
 64 | val from_stream: Uchar.t Stream.t -> lexbuf
 65 |     [@@ocaml.deprecated "Use [Sedlexing.from_gen] instead."]
 66 |     (** Create a lexbuf from a stream of Unicode code points. *)
 67 | 
 68 | val from_int_array: int array -> lexbuf
 69 |     (** Create a lexbuf from an array of Unicode code points. *)
 70 | 
 71 | val from_uchar_array: Uchar.t array -> lexbuf
 72 |     (** Create a lexbuf from an array of Unicode code points. *)
 73 | 
 74 | (** {6 Interface for lexers semantic actions} *)
 75 | 
 76 | (** The following functions can be called from the semantic actions of
 77 |     lexer definitions.  They give access to the character string matched
 78 |     by the regular expression associated with the semantic action. *)
 79 | 
 80 | val lexeme_start: lexbuf -> int
 81 |     (** [Sedlexing.lexeme_start lexbuf] returns the offset in the
 82 |         input stream of the first code point of the matched string.
 83 |         The first code point of the stream has offset 0. *)
 84 | 
 85 | val lexeme_end: lexbuf -> int
 86 |     (** [Sedlexing.lexeme_end lexbuf] returns the offset in the input
 87 |         stream of the character following the last code point of the
 88 |         matched string. The first character of the stream has offset
 89 |         0. *)
 90 | 
 91 | val loc: lexbuf -> int * int
 92 |     (** [Sedlexing.loc lexbuf] returns the pair
 93 |         [(Sedlexing.lexeme_start lexbuf,Sedlexing.lexeme_end
 94 |         lexbuf)]. *)
 95 | 
 96 | val lexeme_length: lexbuf -> int
 97 |     (** [Sedlexing.loc lexbuf] returns the difference
 98 |         [(Sedlexing.lexeme_end lexbuf) - (Sedlexing.lexeme_start
 99 |         lexbuf)], that is, the length (in code points) of the matched
100 |         string. *)
101 | 
102 | val lexing_positions : lexbuf -> Lexing.position*Lexing.position
103 |     (** [Sedlexing.lexing_positions lexbuf] returns the start and end
104 |         positions of the current token, using a record of type
105 |         [Lexing.position]. This is intended for consumption
106 |         by parsers like those generated by [Menhir]. *)
107 | 
108 | val new_line: lexbuf -> unit
109 |     (** [Sedlexing.new_line lexbuf] increments the line count and
110 |         sets the beginning of line to the current position, as though
111 |         a newline character had been encountered in the input. *)
112 | 
113 | val lexeme: lexbuf -> Uchar.t array
114 |     (** [Sedlexing.lexeme lexbuf] returns the string matched by the
115 |         regular expression as an array of Unicode code point. *)
116 | 
117 | val lexeme_char: lexbuf -> int -> Uchar.t
118 |     (** [Sedlexing.lexeme_char lexbuf pos] returns code point number [pos] in
119 |         the matched string. *)
120 | 
121 | val sub_lexeme: lexbuf -> int -> int -> Uchar.t array
122 | (** [Sedlexing.sub_lexeme lexbuf pos len] returns a substring of the string
123 |     matched by the regular expression as an array of Unicode code point. *)
124 | 
125 | val rollback: lexbuf -> unit
126 | (** [Sedlexing.rollback lexbuf] puts [lexbuf] back in its configuration before
127 |     the last lexeme was matched. It is then possible to use another
128 |     lexer to parse the same characters again. The other functions
129 |     above in this section should not be used in the semantic action
130 |     after a call to [Sedlexing.rollback]. *)
131 | 
132 | (** {6 Internal interface} *)
133 | 
134 | (** These functions are used internally by the lexers. They could be used
135 |     to write lexers by hand, or with a lexer generator different from
136 |     [sedlex]. The lexer buffers have a unique internal slot that can store
137 |     an integer. They also store a "backtrack" position.
138 |  *)
139 | 
140 | val start: lexbuf -> unit
141 | (** [start t] informs the lexer buffer that any
142 |     code points until the current position can be discarded.
143 |     The current position become the "start" position as returned
144 |     by [Sedlexing.lexeme_start]. Moreover, the internal slot is set to
145 |     [-1] and the backtrack position is set to the current position.
146 |  *)
147 | 
148 | val next: lexbuf -> Uchar.t option
149 | (** [next lexbuf] extracts the next code point from the
150 |     lexer buffer and increments to current position. If the input stream
151 |     is exhausted, the function returns [None].
152 |     If a ['\n'] is encountered, the tracked line number is incremented. *)
153 | 
154 | val mark: lexbuf -> int -> unit
155 | (** [mark lexbuf i] stores the integer [i] in the internal
156 |     slot. The backtrack position is set to the current position. *)
157 | 
158 | val backtrack: lexbuf -> int
159 | (** [backtrack lexbuf] returns the value stored in the
160 |     internal slot of the buffer, and performs backtracking
161 |     (the current position is set to the value of the backtrack position). *)
162 | 
163 | val with_tokenizer: (lexbuf -> 'token) -> lexbuf -> (unit -> 'token * Lexing.position * Lexing.position)
164 | (** [with_tokenizer tokenizer lexbuf] given a lexer and a lexbuf,
165 |     returns a generator of tokens annotated with positions.
166 |     This generator can be used with the Menir parser generator's
167 |     incremental API. *)
168 | 
169 | (** {6 Support for common encodings} *)
170 | 
171 | module Latin1: sig
172 |   val from_gen: char Gen.t -> lexbuf
173 |       (** Create a lexbuf from a Latin1 encoded stream (ie a stream
174 |           of Unicode code points in the range [0..255]) *)
175 | 
176 |   val from_stream: char Stream.t -> lexbuf
177 |       [@@ocaml.deprecated "Use [Sedlexing.Latin1.from_gen] instead."]
178 |       (** Create a lexbuf from a Latin1 encoded stream (ie a stream
179 |           of Unicode code points in the range [0..255]) *)
180 | 
181 |   val from_channel: in_channel -> lexbuf
182 |       (** Create a lexbuf from a Latin1 encoded input channel.
183 |           The client is responsible for closing the channel. *)
184 | 
185 |   val from_string: string -> lexbuf
186 |       (** Create a lexbuf from a Latin1 encoded string. *)
187 | 
188 | 
189 |   val lexeme: lexbuf -> string
190 |       (** As [Sedlexing.lexeme] with a result encoded in Latin1.  This
191 |           function throws an exception [InvalidCodepoint] if it is not
192 |           possible to encode the result in Latin1. *)
193 | 
194 |   val sub_lexeme: lexbuf -> int -> int -> string
195 |       (** As [Sedlexing.sub_lexeme] with a result encoded in Latin1.
196 |           This function throws an exception [InvalidCodepoint] if it
197 |           is not possible to encode the result in Latin1. *)
198 | 
199 |   val lexeme_char: lexbuf -> int -> char
200 |       (** As [Sedlexing.lexeme_char] with a result encoded in Latin1.
201 |           This function throws an exception [InvalidCodepoint] if it
202 |           is not possible to encode the result in Latin1. *)
203 | end
204 | 
205 | 
206 | module Utf8: sig
207 |   val from_gen: char Gen.t -> lexbuf
208 |       (** Create a lexbuf from a UTF-8 encoded stream. *)
209 | 
210 |   val from_stream: char Stream.t -> lexbuf
211 |       [@@ocaml.deprecated "Use [Sedlexing.Utf8.from_gen] instead."]
212 |       (** Create a lexbuf from a UTF-8 encoded stream. *)
213 | 
214 |   val from_channel: in_channel -> lexbuf
215 |       (** Create a lexbuf from a UTF-8 encoded input channel. *)
216 | 
217 |   val from_string: string -> lexbuf
218 |       (** Create a lexbuf from a UTF-8 encoded string. *)
219 | 
220 |   val lexeme: lexbuf -> string
221 |    (** As [Sedlexing.lexeme] with a result encoded in UTF-8. *)
222 | 
223 |   val sub_lexeme: lexbuf -> int -> int -> string
224 |       (** As [Sedlexing.sub_lexeme] with a result encoded in UTF-8. *)
225 | end
226 | 
227 | 
228 | module Utf16: sig
229 |   type byte_order = Little_endian | Big_endian
230 | 
231 |   val from_gen: char Gen.t -> byte_order option -> lexbuf
232 |       (** [Utf16.from_gen s opt_bo] creates a lexbuf from an UTF-16
233 |           encoded stream. If [opt_bo] matches with [None] the function
234 |           expects a BOM (Byte Order Mark), and takes the byte order as
235 |           [Utf16.Big_endian] if it cannot find one. When [opt_bo]
236 |           matches with [Some bo], [bo] is taken as byte order. In this
237 |           case a leading BOM is kept in the stream - the lexer has to
238 |           ignore it and a `wrong' BOM ([0xfffe]) will raise
239 |           Utf16.InvalidCodepoint.  *)
240 | 
241 |   val from_stream: char Stream.t -> byte_order option -> lexbuf
242 |       [@@ocaml.deprecated "Use [Sedlexing.Utf16.from_gen] instead."]
243 |       (** Works as [Utf16.from_gen] with a [stream]. *)
244 | 
245 |   val from_channel: in_channel -> byte_order option-> lexbuf
246 |       (** Works as [Utf16.from_gen] with an [in_channel]. *)
247 | 
248 |   val from_string: string -> byte_order option -> lexbuf
249 |       (** Works as [Utf16.from_gen] with a [string]. *)
250 | 
251 |   val lexeme: lexbuf -> byte_order -> bool -> string
252 |       (** [utf16_lexeme lb bo bom] as [Sedlexing.lexeme] with a result
253 |           encoded in UTF-16 in byte_order [bo] and starting with a BOM
254 |           if [bom = true].  *)
255 | 
256 |   val sub_lexeme: lexbuf -> int -> int -> byte_order -> bool -> string
257 |       (** [sub_lexeme lb pos len bo bom] as
258 |           [Sedlexing.sub_lexeme] with a result encoded in UTF-16 with
259 |           byte order [bo] and starting with a BOM if [bom=true] *)
260 | end
261 | 


--------------------------------------------------------------------------------
/src/syntax/sedlex_cset.ml:
--------------------------------------------------------------------------------
  1 | (* The package sedlex is released under the terms of an MIT-like license. *)
  2 | (* See the attached LICENSE file.                                         *)
  3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi.                       *)
  4 | 
  5 | (* Character sets are represented as lists of intervals.  The
  6 |    intervals must be non-overlapping and not collapsable, and the list
  7 |    must be ordered in increasing order. *)
  8 | 
  9 | type t = (int * int) list
 10 | 
 11 | let max_code = 0x10ffff  (* must be < max_int *)
 12 | let min_code = -1
 13 | 
 14 | let empty = []
 15 | let singleton i = [i,i]
 16 | let is_empty = function [] -> true | _ -> false
 17 | let interval i j = if i <= j then [i,j] else [j,i]
 18 | let eof = singleton (-1)
 19 | let any = interval 0 max_code
 20 | 
 21 | let rec union c1 c2 =
 22 |   match c1,c2 with
 23 |   | [], _ -> c2
 24 |   | _, [] -> c1
 25 |   | ((i1, j1) as s1)::r1, (i2, j2)::r2 ->
 26 | 	if (i1 <= i2) then
 27 | 	  if j1 + 1 < i2 then s1::(union r1 c2)
 28 | 	  else if (j1 < j2) then union r1 ((i1, j2)::r2)
 29 | 	  else union c1 r2
 30 | 	else union c2 c1
 31 | 
 32 | let complement c =
 33 |   let rec aux start = function
 34 |     | [] -> if start <= max_code then [start,max_code] else []
 35 |     | (i, j)::l -> (start, i-1)::(aux (succ j) l)
 36 |   in
 37 |   match c with
 38 |     | (-1,j)::l -> aux (succ j) l
 39 |     | l -> aux (-1) l
 40 | 
 41 | let intersection c1 c2 =
 42 |   complement (union (complement c1) (complement c2))
 43 | 
 44 | let difference c1 c2 =
 45 |   complement (union (complement c1) c2)
 46 | 
 47 | 
 48 | (* Unicode classes from XML *)
 49 | 
 50 | let base_char =
 51 |   [ 0x0041,0x005A; 0x0061,0x007A; 0x00C0,0x00D6; 0x00D8,0x00F6;
 52 |     0x00F8,0x00FF; 0x0100,0x0131; 0x0134,0x013E; 0x0141,0x0148;
 53 |     0x014A,0x017E; 0x0180,0x01C3; 0x01CD,0x01F0; 0x01F4,0x01F5;
 54 |     0x01FA,0x0217; 0x0250,0x02A8; 0x02BB,0x02C1; 0x0386,0x0386;
 55 |     0x0388,0x038A; 0x038C,0x038C; 0x038E,0x03A1; 0x03A3,0x03CE;
 56 |     0x03D0,0x03D6; 0x03DA,0x03DA; 0x03DC,0x03DC; 0x03DE,0x03DE;
 57 |     0x03E0,0x03E0; 0x03E2,0x03F3;
 58 |     0x0401,0x040C; 0x040E,0x044F; 0x0451,0x045C; 0x045E,0x0481;
 59 |     0x0490,0x04C4; 0x04C7,0x04C8; 0x04CB,0x04CC; 0x04D0,0x04EB;
 60 |     0x04EE,0x04F5; 0x04F8,0x04F9; 0x0531,0x0556; 0x0559,0x0559;
 61 |     0x0561,0x0586; 0x05D0,0x05EA; 0x05F0,0x05F2; 0x0621,0x063A;
 62 |     0x0641,0x064A; 0x0671,0x06B7; 0x06BA,0x06BE; 0x06C0,0x06CE;
 63 |     0x06D0,0x06D3; 0x06D5,0x06D5; 0x06E5,0x06E6; 0x0905,0x0939;
 64 |     0x093D,0x093D;
 65 |     0x0958,0x0961; 0x0985,0x098C; 0x098F,0x0990; 0x0993,0x09A8;
 66 |     0x09AA,0x09B0; 0x09B2,0x09B2; 0x09B6,0x09B9; 0x09DC,0x09DD;
 67 |     0x09DF,0x09E1; 0x09F0,0x09F1; 0x0A05,0x0A0A; 0x0A0F,0x0A10;
 68 |     0x0A13,0x0A28; 0x0A2A,0x0A30; 0x0A32,0x0A33; 0x0A35,0x0A36;
 69 |     0x0A38,0x0A39; 0x0A59,0x0A5C; 0x0A5E,0x0A5E; 0x0A72,0x0A74;
 70 |     0x0A85,0x0A8B; 0x0A8D,0x0A8D; 0x0A8F,0x0A91; 0x0A93,0x0AA8;
 71 |     0x0AAA,0x0AB0; 0x0AB2,0x0AB3; 0x0AB5,0x0AB9; 0x0ABD,0x0ABD;
 72 |     0x0AE0,0x0AE0;
 73 |     0x0B05,0x0B0C; 0x0B0F,0x0B10; 0x0B13,0x0B28; 0x0B2A,0x0B30;
 74 |     0x0B32,0x0B33; 0x0B36,0x0B39; 0x0B3D,0x0B3D; 0x0B5C,0x0B5D;
 75 |     0x0B5F,0x0B61; 0x0B85,0x0B8A; 0x0B8E,0x0B90; 0x0B92,0x0B95;
 76 |     0x0B99,0x0B9A; 0x0B9C,0x0B9C; 0x0B9E,0x0B9F; 0x0BA3,0x0BA4;
 77 |     0x0BA8,0x0BAA; 0x0BAE,0x0BB5; 0x0BB7,0x0BB9; 0x0C05,0x0C0C;
 78 |     0x0C0E,0x0C10; 0x0C12,0x0C28; 0x0C2A,0x0C33; 0x0C35,0x0C39;
 79 |     0x0C60,0x0C61; 0x0C85,0x0C8C; 0x0C8E,0x0C90; 0x0C92,0x0CA8;
 80 |     0x0CAA,0x0CB3; 0x0CB5,0x0CB9; 0x0CDE,0x0CDE; 0x0CE0,0x0CE1;
 81 |     0x0D05,0x0D0C; 0x0D0E,0x0D10; 0x0D12,0x0D28; 0x0D2A,0x0D39;
 82 |     0x0D60,0x0D61; 0x0E01,0x0E2E; 0x0E30,0x0E30; 0x0E32,0x0E33;
 83 |     0x0E40,0x0E45; 0x0E81,0x0E82; 0x0E84,0x0E84; 0x0E87,0x0E88;
 84 |     0x0E8A,0x0E8A;
 85 |     0x0E8D,0x0E8D; 0x0E94,0x0E97; 0x0E99,0x0E9F; 0x0EA1,0x0EA3;
 86 |     0x0EA5,0x0EA5;
 87 |     0x0EA7,0x0EA7; 0x0EAA,0x0EAB; 0x0EAD,0x0EAE; 0x0EB0,0x0EB0;
 88 |     0x0EB2,0x0EB3;
 89 |     0x0EBD,0x0EBD; 0x0EC0,0x0EC4; 0x0F40,0x0F47; 0x0F49,0x0F69;
 90 |     0x10A0,0x10C5; 0x10D0,0x10F6; 0x1100,0x1100; 0x1102,0x1103;
 91 |     0x1105,0x1107; 0x1109,0x1109; 0x110B,0x110C; 0x110E,0x1112;
 92 |     0x113C,0x113C;
 93 |     0x113E,0x113E; 0x1140,0x1140; 0x114C,0x114C; 0x114E,0x114E;
 94 |     0x1150,0x1150; 0x1154,0x1155; 0x1159,0x1159;
 95 |     0x115F,0x1161; 0x1163,0x1163; 0x1165,0x1165; 0x1167,0x1167;
 96 |     0x1169,0x1169; 0x116D,0x116E;
 97 |     0x1172,0x1173; 0x1175,0x1175; 0x119E,0x119E; 0x11A8,0x11A8;
 98 |     0x11AB,0x11AB; 0x11AE,0x11AF;
 99 |     0x11B7,0x11B8; 0x11BA,0x11BA; 0x11BC,0x11C2; 0x11EB,0x11EB;
100 |     0x11F0,0x11F0; 0x11F9,0x11F9;
101 |     0x1E00,0x1E9B; 0x1EA0,0x1EF9; 0x1F00,0x1F15; 0x1F18,0x1F1D;
102 |     0x1F20,0x1F45; 0x1F48,0x1F4D; 0x1F50,0x1F57; 0x1F59,0x1F59;
103 |     0x1F5B,0x1F5B;
104 |     0x1F5D,0x1F5D; 0x1F5F,0x1F7D; 0x1F80,0x1FB4; 0x1FB6,0x1FBC;
105 |     0x1FBE,0x1FBE;
106 |     0x1FC2,0x1FC4; 0x1FC6,0x1FCC; 0x1FD0,0x1FD3; 0x1FD6,0x1FDB;
107 |     0x1FE0,0x1FEC; 0x1FF2,0x1FF4; 0x1FF6,0x1FFC; 0x2126,0x2126;
108 |     0x212A,0x212B; 0x212E,0x212E; 0x2180,0x2182; 0x3041,0x3094;
109 |     0x30A1,0x30FA; 0x3105,0x312C; 0xAC00,0xD7A3 ]
110 | 
111 | let ideographic =
112 |   [ 0x3007,0x3007; 0x3021,0x3029; 0x4E00,0x9FA5 ]
113 | 
114 | let combining_char =
115 |   [ 0x0300,0x0345; 0x0360,0x0361; 0x0483,0x0486; 0x0591,0x05A1;
116 |     0x05A3,0x05B9; 0x05BB,0x05BD; 0x05BF,0x05BF; 0x05C1,0x05C2;
117 |     0x05C4,0x05C4; 0x064B,0x0652; 0x0670,0x0670; 0x06D6,0x06DC;
118 |     0x06DD,0x06DF; 0x06E0,0x06E4; 0x06E7,0x06E8; 0x06EA,0x06ED;
119 |     0x0901,0x0903; 0x093C,0x093C; 0x093E,0x094C; 0x094D,0x094D;
120 |     0x0951,0x0954; 0x0962,0x0963; 0x0981,0x0983; 0x09BC,0x09BC;
121 |     0x09BE,0x09BE; 0x09BF,0x09BF; 0x09C0,0x09C4; 0x09C7,0x09C8;
122 |     0x09CB,0x09CD; 0x09D7,0x09D7; 0x09E2,0x09E3; 0x0A02,0x0A02;
123 |     0x0A3C,0x0A3C; 0x0A3E,0x0A3E; 0x0A3F,0x0A3F; 0x0A40,0x0A42;
124 |     0x0A47,0x0A48; 0x0A4B,0x0A4D; 0x0A70,0x0A71; 0x0A81,0x0A83;
125 |     0x0ABC,0x0ABC; 0x0ABE,0x0AC5; 0x0AC7,0x0AC9; 0x0ACB,0x0ACD;
126 |     0x0B01,0x0B03; 0x0B3C,0x0B3C; 0x0B3E,0x0B43; 0x0B47,0x0B48;
127 |     0x0B4B,0x0B4D; 0x0B56,0x0B57; 0x0B82,0x0B83; 0x0BBE,0x0BC2;
128 |     0x0BC6,0x0BC8; 0x0BCA,0x0BCD; 0x0BD7,0x0BD7; 0x0C01,0x0C03;
129 |     0x0C3E,0x0C44; 0x0C46,0x0C48; 0x0C4A,0x0C4D; 0x0C55,0x0C56;
130 |     0x0C82,0x0C83; 0x0CBE,0x0CC4; 0x0CC6,0x0CC8; 0x0CCA,0x0CCD;
131 |     0x0CD5,0x0CD6; 0x0D02,0x0D03; 0x0D3E,0x0D43; 0x0D46,0x0D48;
132 |     0x0D4A,0x0D4D; 0x0D57,0x0D57; 0x0E31,0x0E31; 0x0E34,0x0E3A;
133 |     0x0E47,0x0E4E; 0x0EB1,0x0EB1; 0x0EB4,0x0EB9; 0x0EBB,0x0EBC;
134 |     0x0EC8,0x0ECD; 0x0F18,0x0F19; 0x0F35,0x0F35; 0x0F37,0x0F37;
135 |     0x0F39,0x0F39; 0x0F3E,0x0F3E; 0x0F3F,0x0F3F; 0x0F71,0x0F84;
136 |     0x0F86,0x0F8B; 0x0F90,0x0F95; 0x0F97,0x0F97; 0x0F99,0x0FAD;
137 |     0x0FB1,0x0FB7; 0x0FB9,0x0FB9; 0x20D0,0x20DC; 0x20E1,0x20E1;
138 |     0x302A,0x302F; 0x3099,0x3099; 0x309A,0x309A ]
139 | 
140 | let digit =
141 |   [ 0x0030,0x0039;
142 |     0x0660,0x0669; 0x06F0,0x06F9; 0x0966,0x096F; 0x09E6,0x09EF;
143 |     0x0A66,0x0A6F; 0x0AE6,0x0AEF; 0x0B66,0x0B6F; 0x0BE7,0x0BEF;
144 |     0x0C66,0x0C6F; 0x0CE6,0x0CEF; 0x0D66,0x0D6F; 0x0E50,0x0E59;
145 |     0x0ED0,0x0ED9; 0x0F20,0x0F29 ]
146 | 
147 | let extender =
148 |   [ 0x00B7,0x00B7; 0x02D0,0x02D1; 0x0387,0x0387; 0x0640,0x0640;
149 |     0x0E46,0x0E46; 0x0EC6,0x0EC6; 0x3005,0x3005; 0x3031,0x3035;
150 |     0x309D,0x309E; 0x30FC,0x30FE ]
151 | 
152 | let blank =
153 |   [ 0x0009,0x000A; 0x000D,0x000D; 0x0020,0x0020 ]
154 | 
155 | let letter = union base_char ideographic
156 | 
157 | 
158 | (* Letters to be used in identifiers, as specified
159 |    by ISO ....
160 |    Data provided by John M. Skaller *)
161 | let tr8876_ident_char = [
162 |   (* ASCII *)
163 |   (0x0041,0x005a);
164 |   (0x0061,0x007a);
165 | 
166 |   (* Latin *)
167 |   (0x00c0,0x00d6);
168 |   (0x00d8,0x00f6);
169 |   (0x00f8,0x01f5);
170 |   (0x01fa,0x0217);
171 |   (0x0250,0x02a8);
172 | 
173 |   (* Greek *)
174 |   (0x0384,0x0384);
175 |   (0x0388,0x038a);
176 |   (0x038c,0x038c);
177 |   (0x038e,0x03a1);
178 |   (0x03a3,0x03ce);
179 |   (0x03d0,0x03d6);
180 |   (0x03da,0x03da);
181 |   (0x03dc,0x03dc);
182 |   (0x03de,0x03de);
183 |   (0x03e0,0x03e0);
184 |   (0x03e2,0x03f3);
185 | 
186 |   (* Cyrillic *)
187 |   (0x0401,0x040d);
188 |   (0x040f,0x044f);
189 |   (0x0451,0x045c);
190 |   (0x045e,0x0481);
191 |   (0x0490,0x04c4);
192 |   (0x04c7,0x04c4);
193 |   (0x04cb,0x04cc);
194 |   (0x04d0,0x04eb);
195 |   (0x04ee,0x04f5);
196 |   (0x04f8,0x04f9);
197 | 
198 |   (* Armenian *)
199 |   (0x0531,0x0556);
200 |   (0x0561,0x0587);
201 |   (0x04d0,0x04eb);
202 | 
203 |   (* Hebrew *)
204 |   (0x05d0,0x05ea);
205 |   (0x05f0,0x05f4);
206 | 
207 |   (* Arabic *)
208 |   (0x0621,0x063a);
209 |   (0x0640,0x0652);
210 |   (0x0670,0x06b7);
211 |   (0x06ba,0x06be);
212 |   (0x06c0,0x06ce);
213 |   (0x06e5,0x06e7);
214 | 
215 |   (* Devanagari *)
216 |   (0x0905,0x0939);
217 |   (0x0958,0x0962);
218 | 
219 |   (* Bengali *)
220 |   (0x0985,0x098c);
221 |   (0x098f,0x0990);
222 |   (0x0993,0x09a8);
223 |   (0x09aa,0x09b0);
224 |   (0x09b2,0x09b2);
225 |   (0x09b6,0x09b9);
226 |   (0x09dc,0x09dd);
227 |   (0x09df,0x09e1);
228 |   (0x09f0,0x09f1);
229 | 
230 |   (* Gurmukhi *)
231 |   (0x0a05,0x0a0a);
232 |   (0x0a0f,0x0a10);
233 |   (0x0a13,0x0a28);
234 |   (0x0a2a,0x0a30);
235 |   (0x0a32,0x0a33);
236 |   (0x0a35,0x0a36);
237 |   (0x0a38,0x0a39);
238 |   (0x0a59,0x0a5c);
239 |   (0x0a5e,0x0a5e);
240 | 
241 |   (* Gunjarati *)
242 |   (0x0a85,0x0a8b);
243 |   (0x0a8d,0x0a8d);
244 |   (0x0a8f,0x0a91);
245 |   (0x0a93,0x0aa8);
246 |   (0x0aaa,0x0ab0);
247 |   (0x0ab2,0x0ab3);
248 |   (0x0ab5,0x0ab9);
249 |   (0x0ae0,0x0ae0);
250 | 
251 |   (* Oriya *)
252 |   (0x0b05,0x0b0c);
253 |   (0x0b0f,0x0b10);
254 |   (0x0b13,0x0b28);
255 |   (0x0b2a,0x0b30);
256 |   (0x0b32,0x0b33);
257 |   (0x0b36,0x0b39);
258 |   (0x0b5c,0x0b5d);
259 |   (0x0b5f,0x0b61);
260 | 
261 |   (* Tamil *)
262 |   (0x0b85,0x0b8a);
263 |   (0x0b8e,0x0b90);
264 |   (0x0b92,0x0b95);
265 |   (0x0b99,0x0b9a);
266 |   (0x0b9c,0x0b9c);
267 |   (0x0b9e,0x0b9f);
268 |   (0x0ba3,0x0ba4);
269 |   (0x0ba8,0x0baa);
270 |   (0x0bae,0x0bb5);
271 |   (0x0bb7,0x0bb9);
272 | 
273 |   (* Telugu *)
274 |   (0x0c05,0x0c0c);
275 |   (0x0c0e,0x0c10);
276 |   (0x0c12,0x0c28);
277 |   (0x0c2a,0x0c33);
278 |   (0x0c35,0x0c39);
279 |   (0x0c60,0x0c61);
280 | 
281 |   (* Kannada *)
282 |   (0x0c85,0x0c8c);
283 |   (0x0c8e,0x0c90);
284 |   (0x0c92,0x0ca8);
285 |   (0x0caa,0x0cb3);
286 |   (0x0cb5,0x0cb9);
287 |   (0x0ce0,0x0ce1);
288 | 
289 |   (* Malayam *)
290 |   (0x0d05,0x0d0c);
291 |   (0x0d0e,0x0d10);
292 |   (0x0d12,0x0d28);
293 |   (0x0d2a,0x0d39);
294 |   (0x0d60,0x0d61);
295 | 
296 |   (* Thai *)
297 |   (0x0e01,0x0e30);
298 |   (0x0e32,0x0e33);
299 |   (0x0e40,0x0e46);
300 |   (0x0e4f,0x0e5b);
301 | 
302 |   (* Lao *)
303 |   (0x0e81,0x0e82);
304 |   (0x0e84,0x0e84);
305 |   (0x0e87,0x0e88);
306 |   (0x0e8a,0x0e8a);
307 |   (0x0e0d,0x0e0d);
308 |   (0x0e94,0x0e97);
309 |   (0x0e99,0x0e9f);
310 |   (0x0ea1,0x0ea3);
311 |   (0x0ea5,0x0ea5);
312 |   (0x0ea7,0x0ea7);
313 |   (0x0eaa,0x0eab);
314 |   (0x0ead,0x0eb0);
315 |   (0x0eb2,0x0eb3);
316 |   (0x0ebd,0x0ebd);
317 |   (0x0ec0,0x0ec4);
318 |   (0x0ec6,0x0ec6);
319 | 
320 |   (* Georgian *)
321 |   (0x10a0,0x10c5);
322 |   (0x10d0,0x10f6);
323 | 
324 |   (* Hangul Jamo *)
325 |   (0x1100,0x1159);
326 |   (0x1161,0x11a2);
327 |   (0x11a8,0x11f9);
328 |   (0x11d0,0x11f6);
329 | 
330 |   (* Latin extensions *)
331 |   (0x1e00,0x1e9a);
332 |   (0x1ea0,0x1ef9);
333 | 
334 |   (* Greek extended *)
335 |   (0x1f00,0x1f15);
336 |   (0x1f18,0x1f1d);
337 |   (0x1f20,0x1f45);
338 |   (0x1f48,0x1f4d);
339 |   (0x1f50,0x1f57);
340 |   (0x1f59,0x1f59);
341 |   (0x1f5b,0x1f5b);
342 |   (0x1f5d,0x1f5d);
343 |   (0x1f5f,0x1f7d);
344 |   (0x1f80,0x1fb4);
345 |   (0x1fb6,0x1fbc);
346 |   (0x1fc2,0x1fc4);
347 |   (0x1fc6,0x1fcc);
348 |   (0x1fd0,0x1fd3);
349 |   (0x1fd6,0x1fdb);
350 |   (0x1fe0,0x1fec);
351 |   (0x1ff2,0x1ff4);
352 |   (0x1ff6,0x1ffc);
353 | 
354 | 
355 |   (* Hiragana *)
356 |   (0x3041,0x3094);
357 |   (0x309b,0x309e);
358 | 
359 |   (* Katakana *)
360 |   (0x30a1,0x30fe);
361 | 
362 |   (* Bopmofo *)
363 |   (0x3105,0x312c);
364 | 
365 |   (* CJK Unified Ideographs *)
366 |   (0x4e00,0x9fa5);
367 | 
368 |   (* CJK Compatibility Ideographs *)
369 |   (0xf900,0xfa2d);
370 | 
371 |   (* Arabic Presentation Forms *)
372 |   (0xfb1f,0xfb36);
373 |   (0xfb38,0xfb3c);
374 |   (0xfb3e,0xfb3e);
375 |   (0xfb40,0xfb41);
376 |   (0xfb42,0xfb44);
377 |   (0xfb46,0xfbb1);
378 |   (0xfbd3,0xfd35);
379 | 
380 |   (* Arabic Presentation Forms-A *)
381 |   (0xfd50,0xfd85);
382 |   (0xfd92,0xfbc7);
383 |   (0xfdf0,0xfdfb);
384 | 
385 |   (* Arabic Presentation Forms-B *)
386 |   (0xfe70,0xfe72);
387 |   (0xfe74,0xfe74);
388 |   (0xfe76,0xfefc);
389 | 
390 |   (* Half width and Fullwidth Forms *)
391 |   (0xff21,0xff3a);
392 |   (0xff41,0xff5a);
393 |   (0xff66,0xffbe);
394 |   (0xffc2,0xffc7);
395 |   (0xffca,0xffcf);
396 |   (0xffd2,0xffd7);
397 |   (0xffd2,0xffd7);
398 |   (0xffda,0xffdc)
399 | ]
400 | 


--------------------------------------------------------------------------------
/src/lib/sedlexing.ml:
--------------------------------------------------------------------------------
  1 | (* The package sedlex is released under the terms of an MIT-like license. *)
  2 | (* See the attached LICENSE file.                                         *)
  3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi.                       *)
  4 | 
  5 | exception InvalidCodepoint of int
  6 | exception MalFormed
  7 | 
  8 | 
  9 | let gen_of_channel chan =
 10 |   let f () =
 11 |     try Some (input_char chan)
 12 |     with End_of_file -> None
 13 |   in
 14 |   f
 15 | 
 16 | let (>>=) o f = match o with
 17 |   | Some x -> f x
 18 |   | None -> None
 19 | 
 20 | 
 21 | (* For legacy purposes. *)
 22 | let gen_of_stream stream =
 23 |   let f () =
 24 |     try Some (Stream.next stream)
 25 |     with Stream.Failure -> None
 26 |   in f
 27 | 
 28 | (* Absolute position from the beginning of the stream *)
 29 | type apos = int
 30 | 
 31 | type lexbuf = {
 32 |   refill: (Uchar.t array -> int -> int -> int);
 33 |   mutable buf: Uchar.t array;
 34 |   mutable len: int;    (* Number of meaningful char in buffer *)
 35 |   mutable offset: apos; (* Position of the first char in buffer
 36 |                             in the input stream *)
 37 |   mutable pos: int; (* pos is the index in the buffer *)
 38 |   mutable curr_bol: int; (* bol is the index in the input stream but not buffer *)
 39 |   mutable curr_line: int; (* start from 1, if it is 0, we would not track postion info for you *)
 40 |   mutable start_pos: int; (* First char we need to keep visible *)
 41 |   mutable start_bol: int;
 42 |   mutable start_line: int;
 43 | 
 44 |   mutable marked_pos: int;
 45 |   mutable marked_bol: int;
 46 |   mutable marked_line: int;
 47 |   mutable marked_val: int;
 48 | 
 49 |   mutable filename: string;
 50 | 
 51 |   mutable finished: bool;
 52 | }
 53 | 
 54 | let chunk_size = 512
 55 | 
 56 | let empty_lexbuf = {
 57 |   refill = (fun _ _ _ -> assert false);
 58 |   buf = [| |];
 59 |   len = 0;
 60 |   offset = 0;
 61 |   pos = 0;
 62 |   curr_bol = 0;
 63 |   curr_line = 0;
 64 |   start_pos = 0;
 65 |   start_bol = 0;
 66 |   start_line = 0;
 67 |   marked_pos = 0;
 68 |   marked_bol = 0;
 69 |   marked_line = 0;
 70 |   marked_val = 0;
 71 |   filename = "";
 72 |   finished = false;
 73 | }
 74 | 
 75 | let create f = {
 76 |   empty_lexbuf with
 77 |     refill = f;
 78 |     buf = Array.make chunk_size (Uchar.of_int 0);
 79 |     curr_line = 1;
 80 | }
 81 | 
 82 | let set_position lexbuf position =
 83 |   lexbuf.offset <- position.Lexing.pos_cnum - lexbuf.pos;
 84 |   lexbuf.curr_bol <- position.Lexing.pos_bol;
 85 |   lexbuf.curr_line <- position.Lexing.pos_lnum
 86 | 
 87 | let set_filename lexbuf fname =
 88 |   lexbuf.filename <- fname
 89 | 
 90 | let fill_buf_from_gen f gen buf pos len =
 91 |   let rec aux i =
 92 |     if i >= len then len
 93 |     else match gen () with
 94 |       | Some c -> buf.(pos + i) <- f c ; aux (i+1)
 95 |       | None -> i
 96 |   in
 97 |   aux 0
 98 | 
 99 | let from_gen s =
100 |   create (fill_buf_from_gen (fun id -> id) s)
101 | 
102 | let from_stream s = from_gen @@ gen_of_stream s
103 | 
104 | let from_int_array a =
105 |   let len = Array.length a in
106 |   {
107 |     empty_lexbuf with
108 |       buf = Array.init len (fun i -> Uchar.of_int a.(i));
109 |       len = len;
110 |       finished = true;
111 |   }
112 | 
113 | let from_uchar_array a =
114 |   let len = Array.length a in
115 |   {
116 |     empty_lexbuf with
117 |       buf = Array.init len (fun i -> a.(i));
118 |       len = len;
119 |       finished = true;
120 |   }
121 | 
122 | 
123 | let refill lexbuf =
124 |   if lexbuf.len + chunk_size > Array.length lexbuf.buf
125 |   then begin
126 |     let s = lexbuf.start_pos in
127 |     let ls = lexbuf.len - s in
128 |     if ls + chunk_size <= Array.length lexbuf.buf then
129 |       Array.blit lexbuf.buf s lexbuf.buf 0 ls
130 |     else begin
131 |       let newlen = (Array.length lexbuf.buf + chunk_size) * 2 in
132 |       let newbuf = Array.make newlen (Uchar.of_int 0) in
133 |       Array.blit lexbuf.buf s newbuf 0 ls;
134 |       lexbuf.buf <- newbuf
135 |     end;
136 |     lexbuf.len <- ls;
137 |     lexbuf.offset <- lexbuf.offset + s;
138 |     lexbuf.pos <- lexbuf.pos - s;
139 |     lexbuf.marked_pos <- lexbuf.marked_pos - s;
140 |     lexbuf.start_pos <- 0
141 |   end;
142 |   let n = lexbuf.refill lexbuf.buf lexbuf.pos chunk_size in
143 |   if n = 0
144 |   then lexbuf.finished <- true
145 |   else lexbuf.len <- lexbuf.len + n
146 | 
147 | let new_line lexbuf =
148 |   if lexbuf.curr_line != 0 then
149 |   lexbuf.curr_line <- lexbuf.curr_line + 1;
150 |   lexbuf.curr_bol <- lexbuf.pos + lexbuf.offset
151 | 
152 | let next lexbuf =
153 |   if (not lexbuf.finished) && (lexbuf.pos = lexbuf.len) then refill lexbuf;
154 |   if lexbuf.finished && (lexbuf.pos = lexbuf.len) then None
155 |   else begin
156 |     let ret = lexbuf.buf.(lexbuf.pos) in
157 |     lexbuf.pos <- lexbuf.pos + 1;
158 |     if ret = (Uchar.of_int 10) then new_line lexbuf;
159 |     Some ret
160 |   end
161 | 
162 | let mark lexbuf i =
163 |   lexbuf.marked_pos <- lexbuf.pos;
164 |   lexbuf.marked_bol <- lexbuf.curr_bol;
165 |   lexbuf.marked_line <- lexbuf.curr_line;
166 |   lexbuf.marked_val <- i
167 | 
168 | let start lexbuf =
169 |   lexbuf.start_pos <- lexbuf.pos;
170 |   lexbuf.start_bol <- lexbuf.curr_bol;
171 |   lexbuf.start_line <- lexbuf.curr_line;
172 |   mark lexbuf (-1)
173 | 
174 | let backtrack lexbuf =
175 |   lexbuf.pos <- lexbuf.marked_pos;
176 |   lexbuf.curr_bol <- lexbuf.marked_bol;
177 |   lexbuf.curr_line <- lexbuf.marked_line;
178 |   lexbuf.marked_val
179 | 
180 | let rollback lexbuf =
181 |   lexbuf.pos <- lexbuf.start_pos;
182 |   lexbuf.curr_bol <- lexbuf.start_bol;
183 |   lexbuf.curr_line <- lexbuf.start_line
184 | 
185 | let lexeme_start lexbuf = lexbuf.start_pos + lexbuf.offset
186 | let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset
187 | 
188 | let loc lexbuf = (lexbuf.start_pos + lexbuf.offset, lexbuf.pos + lexbuf.offset)
189 | 
190 | let lexeme_length lexbuf = lexbuf.pos - lexbuf.start_pos
191 | 
192 | let sub_lexeme lexbuf pos len =
193 |   Array.sub lexbuf.buf (lexbuf.start_pos + pos) len
194 | 
195 | let lexeme lexbuf =
196 |   Array.sub lexbuf.buf (lexbuf.start_pos) (lexbuf.pos - lexbuf.start_pos)
197 | 
198 | let lexeme_char lexbuf pos =
199 |   lexbuf.buf.(lexbuf.start_pos + pos)
200 | 
201 | let lexing_positions lexbuf =
202 |   let start_p = {
203 |     Lexing.pos_fname = lexbuf.filename;
204 |     pos_lnum = lexbuf.start_line;
205 |     pos_cnum = lexbuf.start_pos + lexbuf.offset;
206 |     pos_bol = lexbuf.start_bol;
207 |   } and curr_p = {
208 |     Lexing.pos_fname = lexbuf.filename;
209 |     pos_lnum = lexbuf.curr_line;
210 |     pos_cnum = lexbuf.pos + lexbuf.offset;
211 |     pos_bol = lexbuf.curr_bol;
212 |   } in
213 |   (start_p, curr_p)
214 | 
215 | let with_tokenizer lexer' lexbuf =
216 |   let lexer () =
217 |     let token = lexer' lexbuf in
218 |     let (start_p, curr_p) = lexing_positions lexbuf in
219 |     (token, start_p, curr_p)
220 |   in lexer
221 | 
222 | module Latin1 = struct
223 |   let from_gen s =
224 |     create (fill_buf_from_gen Uchar.of_char s)
225 | 
226 |   let from_stream s = from_gen @@ gen_of_stream s
227 | 
228 |   let from_string s =
229 |     let len = String.length s in
230 |     {
231 |      empty_lexbuf with
232 |      buf = Array.init len (fun i -> Uchar.of_char s.[i]);
233 |      len = len;
234 |      finished = true;
235 |     }
236 | 
237 |   let from_channel ic =
238 |     from_gen (gen_of_channel ic)
239 | 
240 |   let to_latin1 c =
241 |     if Uchar.is_char c
242 |     then Uchar.to_char c
243 |     else raise (InvalidCodepoint (Uchar.to_int c))
244 | 
245 |   let lexeme_char lexbuf pos =
246 |     to_latin1 (lexeme_char lexbuf pos)
247 | 
248 |   let sub_lexeme lexbuf pos len =
249 |     let s = Bytes.create len in
250 |     for i = 0 to len - 1 do Bytes.set s i (to_latin1 lexbuf.buf.(lexbuf.start_pos + pos + i)) done;
251 |     Bytes.to_string s
252 | 
253 |   let lexeme lexbuf =
254 |     sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start_pos)
255 | end
256 | 
257 | 
258 | module Utf8 = struct
259 |   module Helper = struct
260 |     (* http://www.faqs.org/rfcs/rfc3629.html *)
261 | 
262 |     let width = Array.make 256 (-1)
263 |     let () =
264 |       for i = 0 to 127 do width.(i) <- 1 done;
265 |       for i = 192 to 223 do width.(i) <- 2 done;
266 |       for i = 224 to 239 do width.(i) <- 3 done;
267 |       for i = 240 to 247 do width.(i) <- 4 done
268 | 
269 |     let next s i =
270 |       match s.[i] with
271 |       | '\000'..'\127' as c ->
272 |           Char.code c
273 |       | '\192'..'\223' as c ->
274 | 	  let n1 = Char.code c in
275 | 	  let n2 = Char.code s.[i+1] in
276 |           if (n2 lsr 6 != 0b10) then raise MalFormed;
277 |           ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f)
278 |       | '\224'..'\239' as c ->
279 | 	  let n1 = Char.code c in
280 | 	  let n2 = Char.code s.[i+1] in
281 | 	  let n3 = Char.code s.[i+2] in
282 |           if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed;
283 | 	  let p =
284 |             ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)
285 | 	  in
286 | 	  if (p >= 0xd800) && (p <= 0xdf00) then raise MalFormed;
287 | 	  p
288 |       | '\240'..'\247' as c ->
289 | 	  let n1 = Char.code c in
290 | 	  let n2 = Char.code s.[i+1] in
291 | 	  let n3 = Char.code s.[i+2] in
292 | 	  let n4 = Char.code s.[i+3] in
293 |           if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10)
294 | 	  then raise MalFormed;
295 |           ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor
296 |           ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)
297 |       | _ -> raise MalFormed
298 | 
299 | 
300 |     let from_gen s =
301 |       Gen.next s >>= function
302 |       | '\000'..'\127' as c ->
303 |           Some (Uchar.of_char c)
304 |       | '\192'..'\223' as c ->
305 | 	  let n1 = Char.code c in
306 | 	  Gen.next s >>= fun c2 ->
307 | 	  let n2 = Char.code c2 in
308 |           if (n2 lsr 6 != 0b10) then raise MalFormed;
309 |           Some (Uchar.of_int (((n1 land 0x1f) lsl 6) lor (n2 land 0x3f)))
310 |       | '\224'..'\239' as c ->
311 | 	  let n1 = Char.code c in
312 | 	  Gen.next s >>= fun c2 ->
313 | 	  let n2 = Char.code c2 in
314 | 	  Gen.next s >>= fun c3 ->
315 | 	  let n3 = Char.code c3 in
316 |           if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed;
317 |           Some (Uchar.of_int (((n1 land 0x0f) lsl 12)
318 |                               lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f)))
319 |       | '\240'..'\247' as c ->
320 | 	  let n1 = Char.code c in
321 | 	  Gen.next s >>= fun c2 ->
322 | 	  let n2 = Char.code c2 in
323 | 	  Gen.next s >>= fun c3 ->
324 | 	  let n3 = Char.code c3 in
325 | 	  Gen.next s >>= fun c4 ->
326 | 	  let n4 = Char.code c4 in
327 |           if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10)
328 | 	  then raise MalFormed;
329 |           Some (Uchar.of_int (((n1 land 0x07) lsl 18)
330 |                               lor ((n2 land 0x3f) lsl 12)
331 |                               lor ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f)))
332 |       | _ -> raise MalFormed
333 | 
334 | 
335 | 
336 |     let compute_len s pos bytes =
337 |       let rec aux n i =
338 |         if i >= pos + bytes then if i = pos + bytes then n else raise MalFormed
339 |         else
340 |           let w = width.(Char.code s.[i]) in
341 |           if w > 0 then aux (succ n) (i + w)
342 |           else raise MalFormed
343 |       in
344 |       aux 0 pos
345 | 
346 |     let rec blit_to_int s spos a apos n =
347 |       if n > 0 then begin
348 |         a.(apos) <- next s spos;
349 |         blit_to_int s (spos + width.(Char.code s.[spos])) a (succ apos) (pred n)
350 |       end
351 | 
352 |     let to_int_array s pos bytes =
353 |       let n = compute_len s pos bytes in
354 |       let a = Array.make n 0 in
355 |       blit_to_int s pos a 0 n;
356 |       a
357 | 
358 | (**************************)
359 | 
360 |     let store b p =
361 |       if p <= 0x7f then
362 |         Buffer.add_char b (Char.chr p)
363 |       else if p <= 0x7ff then (
364 |         Buffer.add_char b (Char.chr (0xc0 lor (p lsr 6)));
365 |         Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
366 |        )
367 |       else if p <= 0xffff then (
368 |         if (p >= 0xd800 && p < 0xe000) then raise MalFormed;
369 |         Buffer.add_char b (Char.chr (0xe0 lor (p lsr 12)));
370 |         Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f)));
371 |         Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
372 |        )
373 |       else if p <= 0x10ffff then (
374 |         Buffer.add_char b (Char.chr (0xf0 lor (p lsr 18)));
375 |         Buffer.add_char b (Char.chr (0x80 lor ((p lsr 12) land 0x3f)));
376 |         Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6)  land 0x3f)));
377 |         Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f)))
378 |        )
379 |       else raise MalFormed
380 | 
381 |     let from_uchar_array a apos len =
382 |       let b = Buffer.create (len * 4) in
383 |       let rec aux apos len =
384 |         if len > 0
385 |         then (store b (Uchar.to_int a.(apos)); aux (succ apos) (pred len))
386 |         else Buffer.contents b in
387 |       aux apos len
388 | 
389 |     let gen_from_char_gen s = (fun () -> from_gen s)
390 |   end
391 | 
392 |   let from_channel ic =
393 |     from_gen (Helper.gen_from_char_gen (gen_of_channel ic))
394 | 
395 |   let from_gen s =
396 |     create (fill_buf_from_gen (fun id -> id)
397 |         (Helper.gen_from_char_gen s))
398 | 
399 |   let from_stream s = from_gen @@ gen_of_stream s
400 | 
401 |   let from_string s =
402 |     from_int_array (Helper.to_int_array s 0 (String.length s))
403 | 
404 |   let sub_lexeme lexbuf pos len =
405 |     Helper.from_uchar_array lexbuf.buf (lexbuf.start_pos + pos) len
406 | 
407 |   let lexeme lexbuf =
408 |     sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start_pos)
409 | end
410 | 
411 | 
412 | module Utf16 = struct
413 |   type byte_order = Little_endian | Big_endian
414 |   module Helper = struct
415 |     (* http://www.ietf.org/rfc/rfc2781.txt *)
416 | 
417 |     let number_of_char_pair bo c1 c2 = match bo with
418 |     | Little_endian -> ((Char.code c2) lsl 8) + (Char.code c1)
419 |     | Big_endian -> ((Char.code c1) lsl 8) + (Char.code c2)
420 | 
421 |     let char_pair_of_number bo num = match bo with
422 |     | Little_endian ->
423 |         (Char.chr (num land 0xFF), Char.chr ((num lsr 8) land 0xFF ))
424 |     | Big_endian ->
425 |         (Char.chr ((num lsr 8) land 0xFF), Char.chr (num land 0xFF))
426 | 
427 |     let next_in_gen bo s =
428 |       Gen.next s >>= fun c1 ->
429 |       Gen.next s >>= fun c2 ->
430 |       Some (number_of_char_pair bo c1 c2)
431 | 
432 |     let from_gen bo s w1 =
433 |       if w1 = 0xfffe then raise (InvalidCodepoint w1);
434 |       if w1 < 0xd800 || 0xdfff < w1 then Some (Uchar.of_int w1)
435 |       else if w1 <= 0xdbff
436 |       then
437 |         next_in_gen bo s >>= fun w2 ->
438 |         if w2 < 0xdc00 || w2 > 0xdfff then raise MalFormed;
439 |         let upper10 = (w1 land 0x3ff) lsl 10
440 |         and lower10 = w2 land 0x3ff in
441 |         Some (Uchar.of_int (0x10000 + upper10 + lower10))
442 |       else raise MalFormed
443 | 
444 |     let gen_from_char_gen opt_bo s =
445 |       let bo = ref opt_bo in
446 |       fun () ->
447 |         Gen.next s >>= fun c1 ->
448 |         Gen.next s >>= fun c2 ->
449 |         let o = match !bo with
450 |           | Some o -> o
451 |           | None ->
452 |               let o = match (Char.code c1, Char.code c2) with
453 |                 | (0xff,0xfe) -> Little_endian
454 |                 | _ -> Big_endian in
455 |               bo := Some o;
456 |               o in
457 |         from_gen o s (number_of_char_pair o c1 c2)
458 | 
459 | 
460 |     let compute_len opt_bo str pos bytes =
461 |       let s = gen_from_char_gen opt_bo
462 |           (Gen.init ~limit:(bytes - pos) (fun i -> (str.[i + pos])))
463 |       in
464 |       let l = ref 0 in
465 |       Gen.iter (fun _ -> incr l) s ;
466 |       !l
467 | 
468 |     let blit_to_int opt_bo s spos a apos bytes =
469 |       let s = gen_from_char_gen opt_bo
470 |           (Gen.init ~limit:(bytes - spos) (fun i -> (s.[i + spos]))) in
471 |       let p = ref apos in
472 |       Gen.iter (fun x -> a.(!p) <- x ; incr p) s
473 | 
474 | 
475 |     let to_uchar_array opt_bo s pos bytes =
476 |       let len = compute_len opt_bo s pos bytes in
477 |       let a = Array.make len (Uchar.of_int 0) in
478 |       blit_to_int opt_bo s pos a 0 bytes ;
479 |        a
480 | 
481 |     let store bo buf code =
482 |       if code < 0x10000
483 |       then (
484 |         let (c1,c2) = char_pair_of_number bo code in
485 |         Buffer.add_char buf c1;
486 |         Buffer.add_char buf c2
487 |        ) else (
488 |         let u' = code - 0x10000  in
489 |         let w1 = 0xd800 + (u' lsr 10)
490 |         and w2 = 0xdc00 + (u' land 0x3ff) in
491 |         let (c1,c2) = char_pair_of_number bo w1
492 |         and (c3,c4) = char_pair_of_number bo w2 in
493 |         Buffer.add_char buf c1;
494 |         Buffer.add_char buf c2;
495 |         Buffer.add_char buf c3;
496 |         Buffer.add_char buf c4
497 |        )
498 | 
499 |     let from_uchar_array bo a apos len bom =
500 |       let b = Buffer.create (len * 4 + 2) in (* +2 for the BOM *)
501 |       if bom then store bo b 0xfeff ; (* first, store the BOM *)
502 |       let rec aux apos len =
503 |         if len > 0
504 |         then (store bo b (Uchar.to_int a.(apos)); aux (succ apos) (pred len))
505 |         else Buffer.contents b  in
506 |       aux apos len
507 | end
508 | 
509 | 
510 |   let from_gen s opt_bo =
511 |     from_gen (Helper.gen_from_char_gen opt_bo s)
512 | 
513 |   let from_stream s = from_gen @@ gen_of_stream s
514 | 
515 |   let from_channel ic opt_bo =
516 |     from_gen (gen_of_channel ic) opt_bo
517 | 
518 |   let from_string s opt_bo =
519 |     let a = Helper.to_uchar_array opt_bo s 0 (String.length s) in
520 |     from_uchar_array a
521 | 
522 |   let sub_lexeme lb pos len bo bom  =
523 |     Helper.from_uchar_array bo lb.buf (lb.start_pos + pos) len bom
524 | 
525 |   let lexeme lb bo bom =
526 |     sub_lexeme lb 0 (lb.pos - lb.start_pos) bo bom
527 | end
528 | 


--------------------------------------------------------------------------------
/src/syntax/ppx_sedlex.ml:
--------------------------------------------------------------------------------
  1 | (* The package sedlex is released under the terms of an MIT-like license. *)
  2 | (* See the attached LICENSE file.                                         *)
  3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi.                       *)
  4 | 
  5 | open Longident
  6 | open Migrate_parsetree
  7 | open Ast_405
  8 | open Parsetree
  9 | open Asttypes
 10 | open Ast_helper
 11 | open Ast_convenience_405
 12 | 
 13 | module Ast_mapper_class = Ast_mapper_class_405
 14 | 
 15 | let ocaml_version = Versions.ocaml_405
 16 | 
 17 | module Cset = Sedlex_cset
 18 | 
 19 | (* Decision tree for partitions *)
 20 | 
 21 | type decision_tree =
 22 |   | Lte of int * decision_tree * decision_tree
 23 |   | Table of int * int array
 24 |   | Return of int
 25 | 
 26 | let decision l =
 27 |   let l = List.map (fun (a, b, i) -> (a, b, Return i)) l in
 28 |   let rec merge2 = function
 29 |     | (a1, b1, d1) :: (a2, b2, d2) :: rest ->
 30 |         let x =
 31 |           if b1 + 1 = a2 then d2
 32 |           else Lte (a2 - 1, Return (-1), d2)
 33 |         in
 34 |         (a1, b2, Lte (b1, d1, x)) :: merge2 rest
 35 |     | rest -> rest
 36 |   in
 37 |   let rec aux = function
 38 |     | [(a, b, d)] -> Lte (a - 1, Return (-1), Lte (b, d, Return (-1)))
 39 |     | [] -> Return (-1)
 40 |     | l -> aux (merge2 l)
 41 |   in
 42 |   aux l
 43 | 
 44 | let limit = 8192
 45 | 
 46 | let decision_table l =
 47 |   let rec aux m accu = function
 48 |     | ((a, b, i) as x)::rem when b < limit && i < 255->
 49 |         aux (min a m) (x :: accu) rem
 50 |     | rem -> m, accu, rem
 51 |   in
 52 |   let (min, table, rest) = aux max_int [] l in
 53 |   match table with
 54 |   | [] -> decision l
 55 |   | [(min, max, i)] ->
 56 |       Lte (min - 1, Return (-1), (Lte (max, Return i, decision rest)))
 57 |   | (_, max, _) :: _ ->
 58 |       let arr = Array.make (max - min + 1) 0 in
 59 |       let set (a, b, i) = for j = a to b do arr.(j - min) <- i + 1 done in
 60 |       List.iter set table;
 61 |       Lte (min - 1, Return (-1), Lte (max, Table (min, arr), decision rest))
 62 | 
 63 | let rec simplify min max = function
 64 |   | Lte (i,yes,no) ->
 65 |       if i >= max then simplify min max yes
 66 |       else if i < min then simplify min max no
 67 |       else Lte (i, simplify min i yes, simplify (i+1) max no)
 68 |   | x -> x
 69 | 
 70 | let segments_of_partition p =
 71 |   let seg = ref [] in
 72 |   Array.iteri
 73 |     (fun i c -> List.iter (fun (a, b) -> seg := (a, b, i) :: !seg) c)
 74 |     p;
 75 |   List.sort (fun (a1,_,_) (a2,_,_) -> compare a1 a2) !seg
 76 | 
 77 | let decision_table p =
 78 |   simplify (-1) (Cset.max_code) (decision_table (segments_of_partition p))
 79 | 
 80 | 
 81 | (* Helpers to build AST *)
 82 | 
 83 | let appfun s l = app (evar s) l
 84 | let glb_value name def = Str.value Nonrecursive [Vb.mk (pvar name) def]
 85 | 
 86 | (* Named regexps *)
 87 | 
 88 | module StringMap = Map.Make(struct
 89 |   type t = string
 90 |   let compare = compare
 91 | end)
 92 | 
 93 | let builtin_regexps =
 94 |   List.fold_left (fun acc (n, c) -> StringMap.add n (Sedlex.chars c) acc)
 95 |     StringMap.empty
 96 |     [
 97 |      "any", Cset.any;
 98 |      "eof", Cset.eof;
 99 |      "xml_letter", Cset.letter;
100 |      "xml_digit", Cset.digit;
101 |      "xml_extender", Cset.extender;
102 |      "xml_base_char", Cset.base_char;
103 |      "xml_ideographic", Cset.ideographic;
104 |      "xml_combining_char", Cset.combining_char;
105 |      "xml_blank", Cset.blank;
106 |      "tr8876_ident_char", Cset.tr8876_ident_char;
107 | 
108 |      (* Unicode 6.3 categories *)
109 |      "cc", Unicode63.Categories.cc;
110 |      "cf", Unicode63.Categories.cf;
111 |      "cn", Unicode63.Categories.cn;
112 |      "co", Unicode63.Categories.co;
113 |      "cs", Unicode63.Categories.cs;
114 |      "ll", Unicode63.Categories.ll;
115 |      "lm", Unicode63.Categories.lm;
116 |      "lo", Unicode63.Categories.lo;
117 |      "lt", Unicode63.Categories.lt;
118 |      "lu", Unicode63.Categories.lu;
119 |      "mc", Unicode63.Categories.mc;
120 |      "me", Unicode63.Categories.me;
121 |      "mn", Unicode63.Categories.mn;
122 |      "nd", Unicode63.Categories.nd;
123 |      "nl", Unicode63.Categories.nl;
124 |      "no", Unicode63.Categories.no;
125 |      "pc", Unicode63.Categories.pc;
126 |      "pd", Unicode63.Categories.pd;
127 |      "pe", Unicode63.Categories.pe;
128 |      "pf", Unicode63.Categories.pf;
129 |      "pi", Unicode63.Categories.pi;
130 |      "po", Unicode63.Categories.po;
131 |      "ps", Unicode63.Categories.ps;
132 |      "sc", Unicode63.Categories.sc;
133 |      "sk", Unicode63.Categories.sk;
134 |      "sm", Unicode63.Categories.sm;
135 |      "so", Unicode63.Categories.so;
136 |      "zl", Unicode63.Categories.zl;
137 |      "zp", Unicode63.Categories.zp;
138 |      "zs", Unicode63.Categories.zs;
139 | 
140 |      (* Unicode 6.3 properties *)
141 |      "alphabetic", Unicode63.Properties.alphabetic;
142 |      "ascii_hex_digit", Unicode63.Properties.ascii_hex_digit;
143 |      "hex_digit", Unicode63.Properties.hex_digit;
144 |      "id_continue", Unicode63.Properties.id_continue;
145 |      "id_start", Unicode63.Properties.id_start;
146 |      "lowercase", Unicode63.Properties.lowercase;
147 |      "math", Unicode63.Properties.math;
148 |      "other_alphabetic", Unicode63.Properties.other_alphabetic;
149 |      "other_lowercase", Unicode63.Properties.other_lowercase;
150 |      "other_math", Unicode63.Properties.other_math;
151 |      "other_uppercase", Unicode63.Properties.other_uppercase;
152 |      "uppercase", Unicode63.Properties.uppercase;
153 |      "white_space", Unicode63.Properties.white_space;
154 |      "xid_continue", Unicode63.Properties.xid_continue;
155 |      "xid_start", Unicode63.Properties.xid_start;
156 |     ]
157 | 
158 | (* Tables (indexed mapping: codepoint -> next state) *)
159 | 
160 | let tables = Hashtbl.create 31
161 | let table_counter = ref 0
162 | let get_tables () = Hashtbl.fold (fun key x accu -> (x, key) :: accu) tables []
163 | 
164 | let table_name x =
165 |   try Hashtbl.find tables x
166 |   with Not_found ->
167 |     incr table_counter;
168 |     let s = Printf.sprintf "__sedlex_table_%i" !table_counter in
169 |     Hashtbl.add tables x s;
170 |     s
171 | 
172 | let table (name, v) =
173 |   let n = Array.length v in
174 |   let s = Bytes.create n in
175 |   for i = 0 to n - 1 do Bytes.set s i (Char.chr v.(i)) done;
176 |   glb_value name (str (Bytes.to_string s))
177 | 
178 | (* Partition (function: codepoint -> next state) *)
179 | 
180 | let partitions = Hashtbl.create 31
181 | let partition_counter = ref 0
182 | let get_partitions () = Hashtbl.fold (fun key x accu -> (x, key) :: accu) partitions []
183 | 
184 | let partition_name x =
185 |   try Hashtbl.find partitions x
186 |   with Not_found ->
187 |     incr partition_counter;
188 |     let s = Printf.sprintf "__sedlex_partition_%i" !partition_counter in
189 |     Hashtbl.add partitions x s;
190 |     s
191 | 
192 | (* We duplicate the body for the EOF (-1) case rather than creating
193 |    an interior utility function. *)
194 | let partition (name, p) =
195 |   let rec gen_tree = function
196 |     | Lte (i, yes, no) ->
197 |         [%expr if c <= [%e int i] then [%e gen_tree yes] else [%e gen_tree no]]
198 |     | Return i -> int i
199 |     | Table (offset, t) ->
200 |               let c = if offset = 0 then [%expr c] else [%expr c - [%e int offset]] in
201 |         [%expr Char.code (String.get [%e evar (table_name t)] [%e c]) - 1]
202 |   in
203 |   let body = gen_tree (decision_table p) in
204 |   glb_value name (func [(pconstr "Some" [pvar "uc"],
205 |                          [%expr let c = Uchar.to_int uc in [%e body]]);
206 |                         (pconstr "None" [],
207 |                          [%expr let c = (-1) in [%e body]])])
208 | 
209 | (* Code generation for the automata *)
210 | 
211 | let best_final final =
212 |   let fin = ref None in
213 |   for i = Array.length final - 1 downto 0 do
214 |     if final.(i) then fin := Some i
215 |   done;
216 |   !fin
217 | 
218 | let state_fun state = Printf.sprintf "__sedlex_state_%i" state
219 | 
220 | let call_state lexbuf auto state =
221 |   let (trans, final) = auto.(state) in
222 |   if Array.length trans = 0
223 |   then match best_final final with
224 |   | Some i -> int i
225 |   | None -> assert false
226 |   else appfun (state_fun state) [evar lexbuf]
227 | 
228 | let gen_state lexbuf auto i (trans, final) =
229 |   let partition = Array.map fst trans in
230 |   let cases = Array.mapi (fun i (_, j) -> Exp.case(pint i) (call_state lexbuf auto j)) trans in
231 |   let cases = Array.to_list cases in
232 |   let body () =
233 |     Exp.match_
234 |       (appfun (partition_name partition) [[%expr Sedlexing.next [%e evar lexbuf]]])
235 |       (cases @ [Exp.case [%pat? _] [%expr Sedlexing.backtrack [%e evar lexbuf]]])
236 |   in
237 |   let ret body = [ Vb.mk (pvar (state_fun i)) (func [pvar lexbuf, body]) ] in
238 |   match best_final final with
239 |     | None -> ret (body ())
240 |     | Some _ when Array.length trans = 0 -> []
241 |     | Some i -> ret [%expr Sedlexing.mark [%e evar lexbuf] [%e int i]; [%e body ()]]
242 | 
243 | let gen_definition lexbuf l error =
244 |   let brs = Array.of_list l in
245 |   let auto = Sedlex.compile (Array.map fst brs) in
246 |   let cases = Array.to_list (Array.mapi (fun i (_, e) -> Exp.case (pint i) e) brs) in
247 |   let states = Array.mapi (gen_state lexbuf auto) auto in
248 |   let states = List.flatten (Array.to_list states) in
249 |   Exp.let_ Recursive states
250 |     (Exp.sequence
251 |        [%expr Sedlexing.start [%e evar lexbuf]]
252 |        (Exp.match_ (appfun (state_fun 0) [evar lexbuf])
253 |           (cases @ [Exp.case (Pat.any ()) error])
254 |        )
255 |     )
256 | 
257 | (* Lexer specification parser *)
258 | 
259 | let codepoint i =
260 |   if i < 0 || i > Cset.max_code then
261 |     failwith (Printf.sprintf "Invalid Unicode code point: %i" i);
262 |   i
263 | 
264 | let regexp_for_char c =
265 |   Sedlex.chars (Cset.singleton (Char.code c))
266 | 
267 | let regexp_for_string s =
268 |   let rec aux n =
269 |     if n = String.length s then Sedlex.eps
270 |     else
271 |       Sedlex.seq (regexp_for_char s.[n]) (aux (succ n))
272 |   in aux 0
273 | 
274 | let err loc s =
275 |   raise (Location.Error (Location.error ~loc ("Sedlex: " ^ s)))
276 | 
277 | let rec repeat r = function
278 |   | 0, 0 -> Sedlex.eps
279 |   | 0, m -> Sedlex.alt Sedlex.eps (Sedlex.seq r (repeat r (0, m - 1)))
280 |   | n, m -> Sedlex.seq r (repeat r (n - 1, m - 1))
281 | 
282 | let regexp_of_pattern env =
283 |   let rec char_pair_op func name p tuple = (* Construct something like Sub(a,b) *)
284 |     match tuple with
285 |       | Some {ppat_desc=Ppat_tuple (p0 :: p1 :: [])} ->
286 |         begin match func (aux p0) (aux p1) with
287 |         | Some r -> r
288 |         | None ->
289 |           err p.ppat_loc @@
290 |             "the "^name^" operator can only applied to single-character length regexps"
291 |         end
292 |       | _ -> err p.ppat_loc @@ "the "^name^" operator requires two arguments, like "^name^"(a,b)"
293 |   and aux p = (* interpret one pattern node *)
294 |     match p.ppat_desc with
295 |     | Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2)
296 |     | Ppat_tuple (p :: pl) ->
297 |         List.fold_left (fun r p -> Sedlex.seq r (aux p))
298 |           (aux p)
299 |           pl
300 |     | Ppat_construct ({txt = Lident "Star"}, Some p) ->
301 |         Sedlex.rep (aux p)
302 |     | Ppat_construct ({txt = Lident "Plus"}, Some p) ->
303 |         Sedlex.plus (aux p)
304 |     | Ppat_construct
305 |         ({txt = Lident "Rep"},
306 |          Some {ppat_desc=Ppat_tuple[p0; {ppat_desc=Ppat_constant (i1 as i2)|Ppat_interval(i1, i2)}]}) ->
307 |          begin match Constant.of_constant i1, Constant.of_constant i2 with
308 |          | Pconst_integer(i1,_), Pconst_integer(i2,_) ->
309 |              let i1 = int_of_string i1 in
310 |              let i2 = int_of_string i2 in
311 |              if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2)
312 |              else err p.ppat_loc "Invalid range for Rep operator"
313 |          | _ ->
314 |              err p.ppat_loc "Rep must take an integer constant or interval"
315 |          end
316 |     | Ppat_construct ({txt = Lident "Rep"}, _) ->
317 |         err p.ppat_loc "the Rep operator takes 2 arguments"
318 |     | Ppat_construct ({txt = Lident "Opt"}, Some p) ->
319 |         Sedlex.alt Sedlex.eps (aux p)
320 |     | Ppat_construct ({txt = Lident "Compl"}, arg) ->
321 |         begin match arg with
322 |         | Some p0 ->
323 |             begin match Sedlex.compl (aux p0) with
324 |             | Some r -> r
325 |             | None ->
326 |               err p.ppat_loc
327 |                 "the Compl operator can only applied to a single-character length regexp"
328 |             end
329 |         | _ -> err p.ppat_loc "the Compl operator requires an argument"
330 |         end
331 |     | Ppat_construct ({ txt = Lident "Sub" }, arg) ->
332 |         char_pair_op Sedlex.subtract "Sub" p arg
333 |     | Ppat_construct ({ txt = Lident "Intersect" }, arg) ->
334 |         char_pair_op Sedlex.intersection "Intersect" p arg
335 |     | Ppat_construct ({txt = Lident "Chars"}, arg) ->
336 |         let const = match arg with
337 |           | Some {ppat_desc=Ppat_constant const} ->
338 |               Some (Constant.of_constant const)
339 |           | _ -> None
340 |         in
341 |         begin match const with
342 |         | Some (Pconst_string(s,_))->
343 |             let c = ref Cset.empty in
344 |             for i = 0 to String.length s - 1 do
345 |               c := Cset.union !c (Cset.singleton (Char.code s.[i]))
346 |             done;
347 |             Sedlex.chars !c
348 |         | _ -> err p.ppat_loc "the Chars operator requires a string argument"
349 |         end
350 |     | Ppat_interval (i_start, i_end) ->
351 |         begin match Constant.of_constant i_start, Constant.of_constant i_end with
352 |           | Pconst_char c1, Pconst_char c2 -> Sedlex.chars (Cset.interval (Char.code c1) (Char.code c2))
353 |           | Pconst_integer(i1,_), Pconst_integer(i2,_) ->
354 |               Sedlex.chars (Cset.interval (codepoint (int_of_string i1)) (codepoint (int_of_string i2)))
355 |           | _ -> err p.ppat_loc "this pattern is not a valid interval regexp"
356 |         end
357 |     | Ppat_constant (const) ->
358 |         begin match Constant.of_constant const with
359 |           | Pconst_string (s, _) -> regexp_for_string s
360 |           | Pconst_char c -> regexp_for_char c
361 |           | Pconst_integer(i,_) -> Sedlex.chars (Cset.singleton (codepoint (int_of_string i)))
362 |           | _ -> err p.ppat_loc "this pattern is not a valid regexp"
363 |         end
364 |     | Ppat_var {txt=x} ->
365 |         begin try StringMap.find x env
366 |         with Not_found ->
367 |           err p.ppat_loc (Printf.sprintf "unbound regexp %s" x)
368 |         end
369 |     | _ ->
370 |       err p.ppat_loc "this pattern is not a valid regexp"
371 |   in
372 |   aux
373 | 
374 | 
375 | let mapper cookies =
376 |   object(this)
377 |     inherit Ast_mapper_class.mapper as super
378 | 
379 |     val env = builtin_regexps
380 | 
381 |     method define_regexp name p =
382 |       {< env = StringMap.add name (regexp_of_pattern env p) env >}
383 | 
384 |     method! expr e =
385 |       match e with
386 |       | [%expr [%sedlex [%e? {pexp_desc=Pexp_match (lexbuf, cases)}]]] ->
387 |             let lexbuf =
388 |               match lexbuf with
389 |               | {pexp_desc=Pexp_ident{txt=Lident lexbuf}} -> lexbuf
390 |               | _ ->
391 |                 err lexbuf.pexp_loc "the matched expression must be a single identifier"
392 |             in
393 |             let cases = List.rev cases in
394 |             let error =
395 |               match List.hd cases with
396 |               | {pc_lhs = [%pat? _]; pc_rhs = e; pc_guard = None} -> super # expr e
397 |               | {pc_lhs = p} ->
398 |                 err p.ppat_loc "the last branch must be a catch-all error case"
399 |             in
400 |             let cases = List.rev (List.tl cases) in
401 |             let cases =
402 |               List.map
403 |                 (function
404 |                   | {pc_lhs = p; pc_rhs = e; pc_guard = None} -> regexp_of_pattern env p, super # expr e
405 |                   | {pc_guard = Some e} ->
406 |                     err e.pexp_loc "'when' guards are not supported"
407 |                 ) cases
408 |             in
409 |             gen_definition lexbuf cases error
410 |       | [%expr let [%p? {ppat_desc=Ppat_var{txt=name}}] = [%sedlex.regexp? [%p? p]] in [%e? body]] ->
411 |           (this # define_regexp name p) # expr body
412 |       | [%expr [%sedlex [%e? _]]] ->
413 |         err e.pexp_loc "the %sedlex extension is only recognized on match expressions"
414 |       | _ -> super # expr e
415 | 
416 | 
417 |     val toplevel = true
418 | 
419 |     method structure_with_regexps l =
420 |       let mapper = ref this in
421 |       let regexps = ref [] in
422 |       let l = List.concat
423 |         (List.map
424 |            (function
425 |              | [%stri let [%p? {ppat_desc=Ppat_var{txt=name}}] = [%sedlex.regexp? [%p? p]]] as i ->
426 |                regexps := i :: !regexps;
427 |                mapper := !mapper # define_regexp name p;
428 |                []
429 |              | i ->
430 |                [ !mapper # structure_item i ]
431 |          ) l) in
432 |       (l, List.rev !regexps)
433 | 
434 |     method! structure l =
435 |       if toplevel then
436 |         let sub = {< toplevel = false >} in
437 |         let previous =
438 |           match Driver.get_cookie cookies "sedlex.regexps" ocaml_version with
439 |           | Some {pexp_desc = Pexp_extension (_, PStr l)} -> l
440 |           | Some _ -> assert false
441 |           | None -> []
442 |         in
443 |         let l, regexps = sub # structure_with_regexps (previous @ l) in
444 |         let parts = List.map partition (get_partitions ()) in
445 |         let tables = List.map table (get_tables ()) in
446 |         Driver.set_cookie cookies "sedlex.regexps" ocaml_version (Exp.extension (Location.mknoloc "regexps", PStr regexps));
447 |         tables @ parts @ l
448 |       else
449 |         fst (this # structure_with_regexps l)
450 | 
451 |  end
452 | 
453 | let () =
454 |   Driver.register
455 |     ~name:"sedlex"
456 |     ocaml_version
457 |     (fun _ cookies -> Ast_mapper_class.to_mapper (mapper cookies))
458 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1><img alt='Maintenance status: maintained' src="https://img.shields.io/maintenance/yes/2019.svg?style=popout-square&logo=verizon&logoColor=000000" align=right><a href="https://github.com/ELLIOTTCABLE/bs-sedlex/releases" align=right><img alt='Latest npm release' src="https://img.shields.io/npm/v/bs-sedlex.svg?style=popout-square&logo=npm&label=bs%20version" align=right></a><a target="_blank" href="https://travis-ci.com/ELLIOTTCABLE/bs-sedlex" align=right><img alt='Build status on Travis-CI' src="https://img.shields.io/travis/com/ELLIOTTCABLE/bs-sedlex.svg?style=popout-square&logo=travis&label=bs%20build" align=right></a><a target="_blank" href="https://twitter.com/intent/follow?screen_name=ELLIOTTCABLE" align=right><img alt='Follow my work on Twitter' src="https://img.shields.io/twitter/follow/ELLIOTTCABLE.svg?style=popout-square&logo=twitter&label=%40ELLIOTTCABLE&color=blue" align=right></a>
  2 | <code>bs-sedlex</code></h1>
  3 | 
  4 | > **For details on purpose, usage, and API of sedlex, [scroll down](#sedlex).** These sections I've
  5 | > added at the top are specific to ways that installation and usage of the `bs-sedlex` distribution
  6 | > **differ** from using the upstream release.
  7 | 
  8 | This repository contains a fork of the [sedlex][] lexer-generator tooling for OCaml-family
  9 | languages, packaged for use in projects utilizing [BuckleScript][] (an OCaml-to-JavaScript compiler)
 10 | and [ReasonML][] (an alternative OCaml syntax targeting that compiler.)
 11 | 
 12 | Care is taken in this project to publish pre-compiled binaries of the [ppx
 13 | syntax-extension](#lexer-specifications) component necessary to use sedlex in practice. These are
 14 | published to npm as the separate npm package, [`ppx-sedlex`][ppx-sedlex], versioned in lockstep with
 15 | the parent `bs-sedlex` package. Instructions for *enabling* this extension in your BuckleScript
 16 | configuration-file, `bsconfig.json`, are included below. Don't miss them!
 17 | 
 18 |    [sedlex]: <https://github.com/ocaml-community/sedlex>
 19 |       "The upstream distribution of sedlex, maintained by the OCaml community"
 20 |    [BuckleScript]: <https://bucklescript.github.io/>
 21 |    [ReasonML]: <https://reasonml.github.io/>
 22 |    [ppx-sedlex]: <https://www.npmjs.com/package/ppx-sedlex>
 23 |       "The native syntax-extension component of bs-sedlex, published separately to npm"
 24 | 
 25 | ## Installation in BuckleScript projects
 26 | 
 27 | You can safely ignore the installation instructions in the upstream README reproduced below, when
 28 | compiling to JS using BuckleScript. Instead:
 29 | 
 30 | 1. If you're writing an app or a similar end-consumer project, install BuckleScript compiler (a
 31 |    peerDependency of this project) via [npm][].
 32 | 
 33 |    ```sh
 34 |    $ npm install --save bs-platform
 35 |    ```
 36 | 
 37 |    Worh repeating: *do not add this dependency to a library.* The final application-developer
 38 |    should generally select the version of the BuckleScript compiler; you don't want users having
 39 |    duplicated versions of the compiler in their `node_modules`. Instead, library developers should
 40 |    add `bs-platform` to both `"peerDependencies"` (with a permissive version), and
 41 |    `"devDependencies"` (with a restrictive version):
 42 | 
 43 |    ```sh
 44 |    $ npm install --save-dev bs-platform
 45 |    ```
 46 | 
 47 |    ```diff
 48 |     "devDependencies": {
 49 |       ...
 50 |       "bs-platform": "^5.0.0"
 51 |     },
 52 |     "peerDependencies": {
 53 |    +  "bs-platform": "4.x || 5.x" // example. express the versions of BuckleScript you support here.
 54 |     },
 55 |    ```
 56 | 
 57 | 2. Add the ppx transformer to your `"devDependencies"`:
 58 | 
 59 |    ```sh
 60 |    $ npm install --save-dev ppx-sedlex
 61 |    ```
 62 | 
 63 | 3. Add the runtime package (this one!) to your direct `"dependencies"`, for both libraries and apps:
 64 | 
 65 |    ```sh
 66 |    $ npm install --save bs-sedlex
 67 |    ```
 68 | 
 69 | 4. Manually add it (the runtime package, `bs-sedlex`) to your `bsconfig.json`'s `bs-dependencies`
 70 |    field:
 71 | 
 72 |    ```diff
 73 |     "bs-dependencies": [
 74 |       ...
 75 |    +  "bs-sedlex"
 76 |     ],
 77 |    ```
 78 | 
 79 | 5. Additionally tell BuckleScript to apply the `ppx-sedlex` syntax-transformer over your source-code
 80 |    by adding a `ppx-flags` field at the root level of the same `bsconfig.json`. (Note that,
 81 |    unintuitively, this is *not* a relative path; it follows the format `package-name/file-path`.)
 82 | 
 83 |    ```diff
 84 |     "bs-dependencies": [
 85 |       ...
 86 |       "bs-sedlex"
 87 |     ],
 88 |    +"ppx-flags": [
 89 |    +   "ppx-sedlex/ppx.js"
 90 |    +],
 91 |    ```
 92 | 
 93 | 6. Write blazing-fast, type-safe, and Unicode-aware / multilingual lexers and parsers galore!
 94 | 
 95 |    [npm]: <https://www.npmjs.com/>
 96 |       "npm, the package-manager for the JavaScript ecosystem"
 97 | 
 98 | ## Versioning of this package
 99 | 
100 | Thanks to [SemVer not including a ‘generation’ number][semver-213], there's really no way I can
101 | reasonably tie this project's version on npm to the upstream version of Sedlex as released by the
102 | community maintainers. As ugly as it is, I've opted to pin the *major version* of `bs-sedlex`, to
103 | the *flattened* major and minor versions of the upstream project.
104 | 
105 | I started doing this with Sedlex 2.0; thus, the mapping looks like this:
106 | 
107 | | Sedlex    | `bs-sedlex` |
108 | | --------- | ----------- |
109 | | `v1.99.4` | `v1.99.4`   |
110 | | `v2.0`    | `v20.0.x`   |
111 | 
112 | Correspondingly, this project can't really strictly adhere to SemVer. Tentatively, I intend to use
113 | the ‘minor’ field for breaking changes to the port, and the ‘patch’ field for everything else.
114 | 
115 |    [semver-213]: <https://github.com/semver/semver/issues/213#issuecomment-266914818>
116 |       "A discussion around extending SemVer with an additional, human-focused major component"
117 | 
118 | ## Parser-writing tips from a fellow JavaScripter
119 | 
120 | I'm dogfooding this port on a parsing-project in JavaScript & ML (Excmd.js, <https://excmd.js.org>).
121 | Feel free to refer to that for a real-world example of compiling industrial-strength OCaml parsing
122 | tooling down to JavaScript for the web. Some takeaways follow:
123 | 
124 |  - Use [Menhir][] for parser-generation. Seriously. It's got [spectacularly clear
125 |    docs][menhir-docs], an [entire chapter in Real World OCaml][menhir-rwo] dedicated to it, and a
126 |    laundry-list of advanced features — everything from automated tooling that *explains* reported
127 |    parsing-conflicts to you, neophyte language-developer; to [an incremental-parsing
128 |    API][menhir-incremental] allowing you to implement extremely advanced error-recovery and
129 |    introspection/reporting tools.
130 | 
131 |  - If you want to take that advice, unfortunately, there's no cool, easy port to JavaScript for you,
132 |    like this one for Sedlex. 😉 (Maybe I'll publish one someday!) Until one exists, you'll have to
133 |    maintain a dualistic build-system that uses the standard OCaml tooling and build-system (i.e.
134 |    [opam][] and <https://dune.build>) to produce the `.ml` parsing-automaton, and then feed that
135 |    into the BuckleScript build. Maybe you can [glean][excmd-bsconf-generators] some
136 |    [ideas][excmd-bsconf-sources] from [my experiences][excmd-dune-libraries] here.
137 | 
138 |  - A major selling-point of sedlex is the deep and thorough Unicode compatibility. Use it! [I
139 |    suggest][uax-notes] reading through the Unicode Consortium's documentation on the topics, known
140 |    as Unicode Standard Annex №. 31, or [UAX#31][uax31]. It goes into more detail than you could ever
141 |    want to know about a vast number of topics. Get this stuff right!
142 | 
143 |  - I (ELLIOTTCABLE) am also very happy to help with any of these topics — I spent a lot of time and
144 |    effort figuring this out; and although it'll hopefully improve as the BuckleScript community
145 |    grows, until then, there's a lot of minutae to get just right. I'm active on both the
146 |    [OCaml][ocaml-discord] and [ReasonML][reasonml-discord] Discord servers (why there are *two*, I
147 |    cannot fathom); as well as on the Freenode IRC server, in both `#ocaml` and
148 |    [`#ELLIOTTCABLE`](http://ell.io/irc). Feel free to reach out if you just want to chat about these
149 |    topics, or to get more formal support!
150 | 
151 |    [Menhir]: <http://gallium.inria.fr/~fpottier/menhir/> "The Menhir parser-generator for OCaml"
152 |    [menhir-docs]: <http://gallium.inria.fr/~fpottier/menhir/manual.html>
153 |       "HTML version of Menhir's in-depth documentation"
154 |    [menhir-rwo]: <https://dev.realworldocaml.org/parsing-with-ocamllex-and-menhir.html>
155 |       "Real World OCaml - Chapter 16: Parsing with OCamllex and Menhir"
156 |    [menhir-incremental]: <http://gallium.inria.fr/~fpottier/menhir/manual.html#sec57>
157 |       "Menhir manual: the Incremental API"
158 |    [opam]: <https://opam.ocaml.org/> "opam, the OCaml package-manager"
159 |    [excmd-bsconf-generators]: <https://github.com/ELLIOTTCABLE/excmd.js/blob/63ac20b5/bsconfig.json#L21-L42>
160 |       "Setting up the undocumented 'generators' configuration in bsb to invoke Menhir"
161 |    [excmd-bsconf-sources]: <https://github.com/ELLIOTTCABLE/excmd.js/blob/63ac20b5/bsconfig.json#L5-L13>
162 |       "Using the undocumented 'generators' configuration in bsb to produce a parser-automaton"
163 |    [excmd-dune-libraries]: <https://github.com/ELLIOTTCABLE/excmd.js/blob/63ac20b5/src/dune#L4>
164 |       "Ensuring menhirLib is reachable from the OCaml side"
165 |    [uax-notes]: <https://github.com/ELLIOTTCABLE/excmd.js#internationalization-concerns-wrt-lexing>
166 |       "My own notes on UAX#31 adherence with regards to lexing and sedlex"
167 |    [uax31]: <http://unicode.org/reports/tr31/>
168 |       "Unicode® Standard Annex #31: Unicode Identifier and Pattern Syntax"
169 |    [ocaml-discord]: <https://discord.gg/cCYQbqN>
170 |       "Official OCaml Discord server"
171 |    [reasonml-discord]: <https://discordapp.com/invite/reasonml>
172 |       "Official ReasonML Discord server"
173 | 
174 | # sedlex
175 | 
176 | [![Build Status](https://travis-ci.com/ocaml-community/sedlex.svg?branch=master)](https://travis-ci.com/ocaml-community/sedlex)
177 | 
178 | Unicode-friendly lexer generator for OCaml.
179 | 
180 | This package is licensed by LexiFi under the terms of the MIT license.
181 | 
182 | sedlex was originally written by Alain Frisch
183 | <alain.frisch@lexifi.com> and is now maintained as part of the
184 | ocaml-community repositories on github.
185 | 
186 | ## API
187 | The API is documented [here](https://ocaml-community.github.io/sedlex).
188 | 
189 | ## Overview
190 | 
191 | sedlex is a lexer generator for OCaml, similar to ocamllex, but
192 | supporting Unicode.  Contrary to ocamllex, lexer specifications for
193 | sedlex are embedded in regular OCaml source files.
194 | 
195 | The lexers work with a new kind of "lexbuf", similar to ocamllex
196 | Lexing lexbufs, but designed to support Unicode, and abstracting from
197 | a specific encoding.  A single lexer can work with arbitrary encodings
198 | of the input stream.
199 | 
200 | sedlex is the successor of the ulex project. Contrary to ulex which
201 | was implemented as a Camlp4 syntax extension, sedlex is based on the
202 | new "-ppx" technology of OCaml, which allow rewriting OCaml parse
203 | trees through external rewriters. (And what a better name than "sed"
204 | for a rewriter?)
205 | 
206 | As any -ppx rewriter, sedlex does not touch the concrete syntax of the
207 | language: lexer specifications are written in source file which comply
208 | with the standard grammar of OCaml programs. sedlex reuse the syntax
209 | for pattern matching in order to describe lexers (regular expressions
210 | are encoded within OCaml patterns). A nice consequence is that your
211 | editor (vi, emacs, ...) won't get confused (indentation, coloring) and
212 | you don't need to learn new priority rules. Moreover, sedlex is
213 | compatible with any front-end parsing technology: it works fine even
214 | if you use camlp4 or camlp5, with the standard or revised syntax.
215 | 
216 | 
217 | ## Lexer specifications
218 | 
219 | 
220 | sedlex adds a new kind of expression to OCaml: lexer definitions.
221 | The syntax for the new construction is:
222 | 
223 | ```ocaml
224 |   match%sedlex lexbuf with
225 |   | R1 -> e1
226 |   ...
227 |   | Rn -> en
228 |   | _  -> def
229 | ```
230 | 
231 | or:
232 | 
233 | ```ocaml
234 |   [%sedlex match lexbuf with
235 |   | R1 -> e1
236 |   ...
237 |   | Rn -> en
238 |   | _  -> def
239 |   ]
240 | ```
241 | 
242 | (The first vertical bar is optional as in any OCaml pattern matching.
243 | Guard expressions are not allowed.)
244 | 
245 | where:
246 | - lexbuf is an arbitrary lowercase identifier, which must refer to
247 |   an existing value of type `Sedlexing.lexbuf`.
248 | - the Ri are regular expressions (see below);
249 | - the ei and def are OCaml expressions (called actions) of the same type
250 |   (the type for the whole lexer definition).
251 | 
252 | Unlike ocamllex, lexers work on stream of Unicode codepoints, not
253 | bytes.
254 | 
255 | The actions can call functions from the Sedlexing module to extract
256 | (parts of) the matched lexeme, in the desired encoding.
257 | 
258 | Regular expressions are syntactically OCaml patterns:
259 | 
260 | - `"...."` (string constant): recognize the specified string
261 | - `'....'` (character constant) : recognize the specified character
262 | - `i` (integer constant) : recognize the specified codepoint
263 | - `'...' .. '...'`: character range
264 | - `i1 .. i2`: range between two codepoints
265 | - `R1 | R2` : alternation
266 | - `R, R2, ..., Rn` : concatenation
267 | - `Star R` : Kleene star (0 or more repetition)
268 | - `Plus R` : equivalent to `R, R*`
269 | - `Opt R` : equivalent to `("" | R)`
270 | - `Rep (R, n)` : equivalent to `R{n}`
271 | - `Rep (R, n .. m)` : equivalent to `R{n, m}`
272 | - `Chars "..."` : recognize any character in the string
273 | - `Compl R` : assume that R is a single-character length regexp (see below)
274 |   and recognize the complement set
275 | - `Sub (R1,R2)` : assume that R is a single-character length regexp (see below)
276 |   and recognize the set of items in `R1` but not in `R2` ("subtract")
277 | - `Intersect (R1,R2)` : assume that `R` is a single-character length regexp (see
278 |   below) and recognize the set of items which are in both `R1` and `R2`
279 | - `lid` (lowercase identifier) : reference a named regexp (see below)
280 | 
281 | A single-character length regexp is a regexp which does not contain (after
282 | expansion of references) concatenation, Star, Plus, Opt or string constants
283 | with a length different from one.
284 | 
285 | 
286 | 
287 | Note:
288 |  - The OCaml source is assumed to be encoded in Latin1 (for string
289 |    and character literals).
290 | 
291 | 
292 | It is possible to define named regular expressions with the following
293 | construction, that can appear in place of a structure item:
294 | 
295 | ```ocaml
296 |   let lid = [%sedlex.regexp? R]
297 | ```
298 | 
299 | where lid is the regexp name to be defined and R its definition.  The
300 | scope of the "lid" regular expression is the rest of the structure,
301 | after the definition.
302 | 
303 | The same syntax can be used for local binding:
304 | 
305 | ```ocaml
306 |   let lid = [%sedlex.regexp? R] in
307 |   body
308 | ```
309 | 
310 | The scope of "lid" is the body expression.
311 | 
312 | 
313 | ## Predefined regexps
314 | 
315 | sedlex provides a set of predefined regexps:
316 | - any: any character
317 | - eof: the virtual end-of-file character
318 | - xml_letter, xml_digit, xml_extender, xml_base_char, xml_ideographic,
319 |   xml_combining_char, xml_blank: as defined by the XML recommandation
320 | - tr8876_ident_char: characters names in identifiers from ISO TR8876
321 | - cc, cf, cn, co, cs, ll, lm, lo, lt, lu, mc, me, mn, nd, nl, no, pc, pd,
322 |   pe, pf, pi, po, ps, sc, sk, sm, so, zl, zp, zs: as defined by the
323 |   Unicode standard (categories)
324 | - alphabetic, ascii_hex_digit, hex_digit, id_continue, id_start,
325 |   lowercase, math, other_alphabetic, other_lowercase, other_math,
326 |   other_uppercase, uppercase, white_space, xid_continue, xid_start: as
327 |   defined by the Unicode standard (properties)
328 | 
329 | 
330 | ## Running a lexer
331 | 
332 | See the interface of the Sedlexing module for a description of how to
333 | create lexbuf values (from strings, stream or channels encoded in
334 | Latin1, utf8 or utf16, or from integer arrays or streams representing
335 | Unicode code points).
336 | 
337 | It is possible to work with a custom implementation for lex buffers.
338 | To do this, you just have to ensure that a module called Sedlexing is
339 | in scope of your lexer specifications, and that it defines at least
340 | the following functions: start, next, mark, backtrack.  See the interface
341 | of the Sedlexing module for more information.
342 | 
343 | 
344 | 
345 | ## Using sedlex
346 | 
347 | The quick way:
348 | 
349 | ```
350 |    opam install sedlex
351 | ```
352 | 
353 | 
354 | Otherwise, the first thing to do is to compile and install sedlex.
355 | You need a recent version of OCaml and [dune](https://dune.build/).
356 | 
357 | ```
358 |   make
359 | ```
360 | 
361 | ### With findlib
362 | 
363 | If you have findlib, you can use it to install and use sedlex.
364 | The name of the findlib package is "sedlex".
365 | 
366 | Installation (after "make"):
367 | 
368 | ```
369 |   make install
370 | ```
371 | 
372 | Compilation of OCaml files with lexer specifications:
373 | 
374 | ```
375 |   ocamlfind ocamlc -c -package sedlex my_file.ml
376 | ```
377 | 
378 | When linking, you must also include the sedlex package:
379 | 
380 | ```
381 |   ocamlfind ocamlc -o my_prog -linkpkg -package sedlex my_file.cmo
382 | ```
383 | 
384 | 
385 | There is also a sedlex.ppx subpackage containing the code of the ppx
386 | filter.  This can be used to build custom drivers (combining several ppx
387 | transformations in a single process).
388 | 
389 | 
390 | ### Without findlib
391 | 
392 | You can use sedlex without findlib. To compile, you need to run the
393 | source file through -ppx rewriter ppx_sedlex. Moreover, you need to
394 | link the application with the runtime support library for sedlex
395 | (sedlexing.cma / sedlexing.cmxa).
396 | 
397 | ## Examples
398 | 
399 | The `examples/` subdirectory contains several samples of sedlex in use.
400 | 
401 | ## Contributors
402 | 
403 | - Benus Becker: implementation of Utf16
404 | - sghost: for Unicode 6.3 categories and properties
405 | - Peter Zotov:
406 |   - improvements to the build system
407 |   - switched parts of ppx_sedlex to using concrete syntax (with ppx_metaquot)
408 | - Steffen Smolka: port to dune
409 | - Elliott Cable: publish this BuckleScript-compatible fork on npm
410 | 


--------------------------------------------------------------------------------