├── dune-project ├── .gitattributes ├── .merlin ├── .gitmodules ├── src ├── lib │ ├── dune │ ├── sedlexing.mli │ └── sedlexing.ml └── syntax │ ├── dune │ ├── sedlex_cset.mli │ ├── sedlex.mli │ ├── unicode63.mli │ ├── sedlex.ml │ ├── sedlex_cset.ml │ └── ppx_sedlex.ml ├── .github └── CODEOWNERS ├── examples ├── complement.ml ├── bsconfig.json ├── subtraction.ml ├── performance.ml ├── repeat.ml ├── dune └── tokenizer.ml ├── bsconfig.json ├── .gitignore ├── Makefile ├── LICENSE ├── sedlex.opam ├── scripts ├── preversion.sh ├── package.js ├── release-ppx.sh ├── travis.sh └── version.sh ├── package.json ├── CHANGES ├── .travis.yml └── README.md /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 1.0) 2 | (name sedlex) 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | package-lock.json linguist-generated -diff 2 | -------------------------------------------------------------------------------- /.merlin: -------------------------------------------------------------------------------- 1 | PKG compiler-libs ppx_tools ppx_tools.metaquot 2 | 3 | SRC src/** 4 | 5 | B src/** 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "ppx-sedlex"] 2 | path = ppx-sedlex 3 | url = https://github.com/ELLIOTTCABLE/ppx-sedlex.git 4 | -------------------------------------------------------------------------------- /src/lib/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name sedlex) 3 | (public_name sedlex) 4 | (wrapped false) 5 | (libraries gen uchar) 6 | (flags :standard -w +A-4-9 -safe-string)) 7 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These are the default owners for everything in the repo. They will 2 | # be requested for review when someone opens a pull request. 3 | * @alainfrisch @Drup @pmetzger 4 | -------------------------------------------------------------------------------- /src/syntax/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name sedlex_ppx) 3 | (public_name sedlex.ppx) 4 | (kind ppx_rewriter) 5 | (libraries ppx_tools_versioned.metaquot_405 ocaml-migrate-parsetree sedlex) 6 | (ppx_runtime_libraries sedlex) 7 | (preprocess 8 | (pps ppx_tools_versioned.metaquot_405)) 9 | (flags (:standard -w -9))) 10 | -------------------------------------------------------------------------------- /examples/complement.ml: -------------------------------------------------------------------------------- 1 | let ucase = [%sedlex.regexp? 'A'..'Z'] 2 | let lcase = [%sedlex.regexp? 'a'..'z'] 3 | 4 | let rec token buf = 5 | match%sedlex buf with 6 | | lcase -> print_char 'L';token buf 7 | | Compl (ucase | lcase) -> print_char '?'; token buf 8 | | ucase -> print_char 'U';token buf 9 | | eof -> print_endline "." 10 | | _ -> assert false 11 | 12 | let () = 13 | let lexbuf = Sedlexing.Latin1.from_string "Abc::DefG" in 14 | token lexbuf 15 | -------------------------------------------------------------------------------- /bsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bs-sedlex", 3 | "version": "20.0.0", 4 | "sources": [ 5 | { 6 | "dir": "src/lib" 7 | } 8 | ], 9 | "bs-dependencies": [ 10 | "bs-uchar", 11 | "bs-gen" 12 | ], 13 | "package-specs": { 14 | "module": "commonjs", 15 | "in-source": true 16 | }, 17 | "suffix": ".bs.js", 18 | "bsc-flags": [ 19 | "-open UcharShim", 20 | "-bs-super-errors", 21 | "-bs-no-version-header" 22 | ], 23 | "refmt": 3 24 | } 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.annot 2 | *.cmo 3 | *.cma 4 | *.cmi 5 | *.cmt 6 | *.cmti 7 | *.a 8 | *.o 9 | *.cmx 10 | *.cmxs 11 | *.cmxa 12 | *.exe 13 | *.install 14 | **/*.merlin 15 | _build 16 | libdoc 17 | src/syntax/ppx_sedlex 18 | examples/complement 19 | examples/tokenizer 20 | examples/subtraction 21 | 22 | _opam 23 | 24 | # BuckleScript 25 | node_modules 26 | **/lib/bs 27 | **/lib/ocaml 28 | *.cmj 29 | *.bs.js 30 | *.mlast* 31 | *.mliast* 32 | 33 | ppx/ 34 | dist/ppx-sedlex-*.zip 35 | .deploy-status.env 36 | -------------------------------------------------------------------------------- /examples/bsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bs-sedlex-examples", 3 | "sources": [ 4 | { 5 | "dir": "../src/lib" 6 | }, 7 | { 8 | "dir": "./", 9 | "type": "dev" 10 | } 11 | ], 12 | "ppx-flags": ["ppx-sedlex/ppx.js"], 13 | "bs-dependencies": ["bs-uchar", "bs-gen"], 14 | "package-specs": { 15 | "module": "commonjs", 16 | "in-source": true 17 | }, 18 | "suffix": ".bs.js", 19 | "bsc-flags": ["-open UcharShim", "-bs-super-errors", "-bs-no-version-header"], 20 | "refmt": 3 21 | } 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # The package sedlex is released under the terms of an MIT-like license. 2 | # See the attached LICENSE file. 3 | # Copyright 2005, 2013 by Alain Frisch and LexiFi. 4 | 5 | INSTALL_ARGS := $(if $(PREFIX),--prefix $(PREFIX),) 6 | 7 | .PHONY: build install uninstall clean doc test all 8 | 9 | build: 10 | dune build @install 11 | 12 | install: 13 | dune install $(INSTALL_ARGS) 14 | 15 | uninstall: 16 | dune uninstall $(INSTALL_ARGS) 17 | 18 | clean: 19 | dune clean 20 | 21 | doc: 22 | dune build @doc 23 | 24 | test: 25 | dune build @runtest 26 | 27 | all: build test doc 28 | -------------------------------------------------------------------------------- /examples/subtraction.ml: -------------------------------------------------------------------------------- 1 | let rec token buf = 2 | match%sedlex buf with 3 | | white_space -> print_endline "\tWhitespace"; token buf 4 | | Sub (Chars "ab","b") -> print_endline "a"; token buf 5 | | (Chars "ab"|"c") -> print_endline "abc"; token buf 6 | | Intersect ("d", Chars "abd") -> print_endline "d"; token buf 7 | | eof -> print_endline "\tEnd" 8 | | any -> print_endline "Other"; token buf 9 | | _ -> failwith "Internal failure: Reached impossible place" 10 | 11 | 12 | let () = 13 | let lexbuf = Sedlexing.Utf8.from_string "a b c d e" in 14 | token lexbuf 15 | -------------------------------------------------------------------------------- /examples/performance.ml: -------------------------------------------------------------------------------- 1 | let rec token buf = 2 | match%sedlex buf with 3 | | any -> token buf 4 | | eof -> () 5 | | _ -> assert false 6 | 7 | let time f x = 8 | let rec acc f x = function 9 | | 0 -> f x 10 | | n -> f x|>ignore; acc f x (n-1) in 11 | let t = Sys.time() in 12 | let fx = acc f x 10 in 13 | Printf.printf "Execution time: %fs\n" (Sys.time() -. t); 14 | fx 15 | 16 | let () = 17 | let long_str = String.make 1000000 '\n' in 18 | let token_from _ = 19 | let lexbuf = Sedlexing.Latin1.from_string long_str in 20 | (* let () = Sedlexing.set_curr_p lexbuf Lexing.dummy_pos in *) 21 | token lexbuf 22 | in time token_from long_str 23 | -------------------------------------------------------------------------------- /examples/repeat.ml: -------------------------------------------------------------------------------- 1 | let rec token buf = 2 | match%sedlex buf with 3 | | white_space -> print_endline "\tWhitespace"; token buf 4 | | 'a', Rep(white_space, 1) -> print_endline "a\n\tWhitespace"; token buf 5 | | Rep("bc", 2) -> print_endline "bcbc"; token buf 6 | | Rep("d", 1 .. 1) -> print_endline "d"; token buf 7 | | Rep("ef", 1 .. 3) -> Printf.printf "%s\n" (Sedlexing.Utf8.lexeme buf); token buf 8 | | eof -> print_endline "\tEnd" 9 | | any -> print_endline "Other"; token buf 10 | | _ -> failwith "Internal failure: Reached impossible place" 11 | 12 | 13 | let () = 14 | let lexbuf = Sedlexing.Utf8.from_string "a bcbc d ef efef efefef" in 15 | token lexbuf 16 | -------------------------------------------------------------------------------- /examples/dune: -------------------------------------------------------------------------------- 1 | (executables 2 | (names tokenizer complement subtraction repeat performance) 3 | (libraries sedlex) 4 | (preprocess 5 | (pps sedlex.ppx)) 6 | (flags :standard -w -39)) 7 | 8 | (alias 9 | (name runtest) 10 | (deps 11 | (:< tokenizer.exe)) 12 | (action 13 | (run %{<}))) 14 | 15 | (alias 16 | (name runtest) 17 | (deps 18 | (:< complement.exe)) 19 | (action 20 | (run %{<}))) 21 | 22 | (alias 23 | (name runtest) 24 | (deps 25 | (:< subtraction.exe)) 26 | (action 27 | (run %{<}))) 28 | 29 | (alias 30 | (name runtest) 31 | (deps 32 | (:< repeat.exe)) 33 | (action 34 | (run %{<}))) 35 | 36 | (alias 37 | (name runtest) 38 | (deps 39 | (:< performance.exe)) 40 | (action 41 | (run %{<}))) 42 | -------------------------------------------------------------------------------- /examples/tokenizer.ml: -------------------------------------------------------------------------------- 1 | let digit = [%sedlex.regexp? '0'..'9'] 2 | let number = [%sedlex.regexp? Plus digit] 3 | 4 | let rec token buf = 5 | let letter = [%sedlex.regexp? 'a'..'z'|'A'..'Z'] in 6 | match%sedlex buf with 7 | | number -> Printf.printf "Number %s\n" (Sedlexing.Latin1.lexeme buf); token buf 8 | | letter, Star ('A'..'Z' | 'a'..'z' | digit) -> Printf.printf "Ident %s\n" (Sedlexing.Latin1.lexeme buf); token buf 9 | | Plus xml_blank -> token buf 10 | | Plus (Chars "+*-/") -> Printf.printf "Op %s\n" (Sedlexing.Latin1.lexeme buf); token buf 11 | | 128 .. 255 -> print_endline "Non ASCII" 12 | | eof -> print_endline "EOF" 13 | | _ -> failwith "Unexpected character" 14 | 15 | let () = 16 | let lexbuf = Sedlexing.Latin1.from_string "foobar A123Bfoo ++123Xbar/foo" in 17 | token lexbuf 18 | -------------------------------------------------------------------------------- /src/syntax/sedlex_cset.mli: -------------------------------------------------------------------------------- 1 | (* The package sedlex is released under the terms of an MIT-like license. *) 2 | (* See the attached LICENSE file. *) 3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) 4 | 5 | (** Representation of sets of unicode code points. *) 6 | 7 | type t = (int * int) list 8 | 9 | val min_code: int 10 | val max_code: int 11 | 12 | val empty: t 13 | val any: t 14 | val union: t -> t -> t 15 | val difference: t -> t -> t 16 | val intersection: t -> t -> t 17 | val is_empty: t -> bool 18 | val eof: t 19 | val singleton: int -> t 20 | val interval: int -> int -> t 21 | 22 | val letter: t 23 | val digit: t 24 | val extender: t 25 | val base_char: t 26 | val ideographic: t 27 | val combining_char: t 28 | val blank: t 29 | val tr8876_ident_char: t 30 | 31 | -------------------------------------------------------------------------------- /src/syntax/sedlex.mli: -------------------------------------------------------------------------------- 1 | (* The package sedlex is released under the terms of an MIT-like license. *) 2 | (* See the attached LICENSE file. *) 3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) 4 | 5 | type regexp 6 | 7 | val chars: Sedlex_cset.t -> regexp 8 | val seq: regexp -> regexp -> regexp 9 | val alt: regexp -> regexp -> regexp 10 | val rep: regexp -> regexp 11 | val plus: regexp -> regexp 12 | val eps: regexp 13 | 14 | val compl: regexp -> regexp option 15 | (* If the argument is a single [chars] regexp, returns a regexp 16 | which matches the complement set. Otherwise returns [None]. *) 17 | val subtract: regexp -> regexp -> regexp option 18 | (* If each argument is a single [chars] regexp, returns a regexp 19 | which matches the set (arg1 - arg2). Otherwise returns [None]. *) 20 | val intersection: regexp -> regexp -> regexp option 21 | (* If each argument is a single [chars] regexp, returns a regexp 22 | which matches the intersection set. Otherwise returns [None]. *) 23 | 24 | val compile: regexp array -> ((Sedlex_cset.t * int) array * bool array) array 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright 2005, 2014 by Alain Frisch and LexiFi. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /sedlex.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | name: "sedlex" 3 | synopsis: "An OCaml lexer generator for Unicode" 4 | description: " 5 | sedlex is a lexer generator for OCaml. It is similar to ocamllex, but supports 6 | Unicode. Unlike ocamllex, sedlex allows lexer specifications within regular 7 | OCaml source files. Lexing specific constructs are provided via a ppx syntax 8 | extension. 9 | " 10 | version: "2.0" 11 | license: "MIT" 12 | doc: "https://ocaml-community.github.io/sedlex/index.html" 13 | maintainer: "Alain Frisch " 14 | authors: [ 15 | "Alain Frisch " 16 | "https://github.com/ocaml-community/sedlex/graphs/contributors" 17 | ] 18 | homepage: "https://github.com/ocaml-community/sedlex" 19 | dev-repo: "git+https://github.com/ocaml-community/sedlex.git" 20 | bug-reports: "https://github.com/ocaml-community/sedlex/issues" 21 | build: [ 22 | ["dune" "subst"] {pinned} 23 | ["dune" "build" "-p" name "-j" jobs] 24 | ["dune" "runtest" "-p" name "-j" jobs] {with-test} 25 | ["dune" "build" "@doc" "-p" name "-j" jobs] {with-doc} 26 | ] 27 | depends: [ 28 | "ocaml" {build & >= "4.02.3"} 29 | "dune" {build & >= "1.0"} 30 | "ppx_tools_versioned" 31 | "ocaml-migrate-parsetree" 32 | "gen" 33 | "uchar" 34 | ] 35 | -------------------------------------------------------------------------------- /scripts/preversion.sh: -------------------------------------------------------------------------------- 1 | puts() { printf %s\\n "$@" ;} 2 | pute() { printf %s\\n "!! $*" >&2 ;} 3 | argq() { [ $# -gt 0 ] && printf "'%s' " "$@" ;} 4 | 5 | 6 | if ! command -v sponge >/dev/null; then 7 | pute 'You need `sponge` to use this script!' 8 | pute 'Try: `brew install moreutils`' 9 | puts '' >&2 10 | missing_dep=true 11 | fi 12 | 13 | if [ -n "$missing_dep" ]; then exit 3; fi 14 | 15 | ( 16 | cd ./ppx-sedlex; 17 | if ! git diff --exit-code >/dev/null || ! git diff --cached --exit-code >/dev/null; then 18 | pute 'There appear to be uncommitted changes in `ppx-sedlex`. You should ensure the' 19 | pute 'submodule is entirely committed and clean before attempting to version-bump' 20 | pute 'the superproject.' 21 | puts '' >&2 22 | 23 | pute 'Try:' 24 | puts '' >&2 25 | puts ' (cd ./ppx-sedlex && git stash save)' >&2 26 | puts '' >&2 27 | exit 2 28 | fi 29 | ) 30 | 31 | branch="$(git rev-parse --abbrev-ref HEAD)" 32 | if [ "$branch" != "master" ]; then 33 | pute "You're on a topic branch, '$branch'." 34 | puts '' >&2 35 | 36 | pute "You need to merge your work into 'master' before you can version-bump," 37 | pute 'commit, and push.' 38 | puts '' >&2 39 | 40 | pute 'Try:' 41 | puts '' >&2 42 | puts ' git checkout master' >&2 43 | puts " git merge --ff '$branch'" >&2 44 | puts '' >&2 45 | exit 2 46 | fi 47 | -------------------------------------------------------------------------------- /src/syntax/unicode63.mli: -------------------------------------------------------------------------------- 1 | module Categories : sig 2 | 3 | val cc : Sedlex_cset.t 4 | val cf : Sedlex_cset.t 5 | val cn : Sedlex_cset.t 6 | val co : Sedlex_cset.t 7 | val cs : Sedlex_cset.t 8 | val ll : Sedlex_cset.t 9 | val lm : Sedlex_cset.t 10 | val lo : Sedlex_cset.t 11 | val lt : Sedlex_cset.t 12 | val lu : Sedlex_cset.t 13 | val mc : Sedlex_cset.t 14 | val me : Sedlex_cset.t 15 | val mn : Sedlex_cset.t 16 | val nd : Sedlex_cset.t 17 | val nl : Sedlex_cset.t 18 | val no : Sedlex_cset.t 19 | val pc : Sedlex_cset.t 20 | val pd : Sedlex_cset.t 21 | val pe : Sedlex_cset.t 22 | val pf : Sedlex_cset.t 23 | val pi : Sedlex_cset.t 24 | val po : Sedlex_cset.t 25 | val ps : Sedlex_cset.t 26 | val sc : Sedlex_cset.t 27 | val sk : Sedlex_cset.t 28 | val sm : Sedlex_cset.t 29 | val so : Sedlex_cset.t 30 | val zl : Sedlex_cset.t 31 | val zp : Sedlex_cset.t 32 | val zs : Sedlex_cset.t 33 | 34 | end 35 | 36 | module Properties : sig 37 | 38 | val alphabetic : Sedlex_cset.t 39 | val ascii_hex_digit : Sedlex_cset.t 40 | val hex_digit : Sedlex_cset.t 41 | val id_continue : Sedlex_cset.t 42 | val id_start : Sedlex_cset.t 43 | val lowercase : Sedlex_cset.t 44 | val math : Sedlex_cset.t 45 | val other_alphabetic : Sedlex_cset.t 46 | val other_lowercase : Sedlex_cset.t 47 | val other_math : Sedlex_cset.t 48 | val other_uppercase : Sedlex_cset.t 49 | val uppercase : Sedlex_cset.t 50 | val white_space : Sedlex_cset.t 51 | val xid_continue : Sedlex_cset.t 52 | val xid_start : Sedlex_cset.t 53 | 54 | end 55 | -------------------------------------------------------------------------------- /scripts/package.js: -------------------------------------------------------------------------------- 1 | // This is run on Travis, and presumably eventually AppVeyor, builds. It's supposed to be as 2 | // cross-platform as possible, unlike the adjacent release-script, which I only care if *I* can run. 3 | const current_ppx_sedlex_id = require("ppx-sedlex/identify"), 4 | path = require("path"), 5 | fs = require("fs"), 6 | cpy = require("cpy"), 7 | makeDir = require("make-dir"), 8 | archiver = require("archiver") 9 | 10 | const zipfile = `ppx-sedlex-${current_ppx_sedlex_id}.zip`, 11 | dist_dir = "dist/", 12 | submodule_ppx_dir = `ppx-sedlex/ppx-sedlex-${current_ppx_sedlex_id}/`, 13 | build_dir = "_build/install/default/lib/sedlex/ppx/", 14 | zip_dir = "ppx/", 15 | exe = "ppx.exe" 16 | 17 | // FIXME: Does any of these even work on Windows 18 | ;(async () => { 19 | // Copy the executable to the submodule, 20 | console.log(`Copy: ${path.join(build_dir, exe)} -> ${submodule_ppx_dir}`) 21 | await cpy(path.join(build_dir, exe), submodule_ppx_dir) 22 | 23 | // ... and again to the zip-directory, 24 | console.log(`Copy: ${path.join(build_dir, exe)} -> ${zip_dir}`) 25 | await cpy(path.join(build_dir, exe), zip_dir) 26 | 27 | // Create a zip-archive, 28 | const dist = await makeDir(dist_dir) 29 | console.log(`Zip: ${path.join(zip_dir, exe)} >>> ${path.join(dist, zipfile)}`) 30 | const output = fs.createWriteStream(path.join(dist, zipfile)), 31 | archive = archiver("zip", { zlib: { level: 9 } }) 32 | 33 | output.on("close", function() { 34 | console.log(`>> Zipped: ${archive.pointer()} total bytes`) 35 | }) 36 | 37 | archive.pipe(output) 38 | archive.directory(zip_dir, false) 39 | archive.finalize() 40 | })() 41 | -------------------------------------------------------------------------------- /scripts/release-ppx.sh: -------------------------------------------------------------------------------- 1 | puts() { printf %s\\n "$@" ;} 2 | pute() { printf %s\\n "~~ $*" >&2 ;} 3 | argq() { [ $# -gt 0 ] && printf "'%s' " "$@" ;} 4 | 5 | alias jq="$(npm prefix)/node_modules/node-jq/bin/jq" 6 | pj() { jq -er "$1" package.json ;} 7 | 8 | 9 | # for EXTDEP in jq; do 10 | # if ! command -v $EXTDEP >/dev/null; then 11 | # pute 'You need `'"$EXTDEP"'` to use this script!' 12 | # pute 'Try: `brew install '"$EXTDEP"'`' 13 | # printf \\n >&2 14 | # missing_dep=true 15 | # fi 16 | # done 17 | 18 | if ! command -v sponge >/dev/null; then 19 | pute 'You need `sponge` to use this script!' 20 | pute 'Try: `brew install moreutils`' 21 | puts '' >&2 22 | missing_dep=true 23 | fi 24 | 25 | if [ -n "$missing_dep" ]; then exit 3; fi 26 | 27 | 28 | new_version="$(pj .version)" 29 | puts 'New version (from `bs-sedlex/package.json`): '"v$new_version" 30 | 31 | old_version="$(cd ./ppx-sedlex; pj .version)" 32 | puts 'Old version (from `ppx-sedlex/package.json`): '"v$old_version" 33 | puts '' >&2 34 | 35 | if [ "$new_version" = "$old_version" ]; then 36 | pute 'The version in `bs-sedlex` doesn'\''t appear to have been bumped!' 37 | puts '' >&2 38 | 39 | pute 'This script is intended to be used *after* Travis builds and packages binary' 40 | pute 'artifacts; try something like the following:' 41 | puts '' >&2 42 | 43 | puts ' git stash save' >&2 44 | puts ' git checkout master' >&2 45 | puts ' npm version patch --no-git-tag-version' >&2 46 | puts ' git add --all' >&2 47 | puts ' git commit "(rel) Bump to v`jq -r .version package.json`"' >&2 48 | puts ' git push' >&2 49 | puts '' >&2 50 | exit 2 51 | fi 52 | 53 | puts 'yay!' 54 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bs-sedlex", 3 | "version": "20.0.0", 4 | "description": "An OCaml lexer generator for Unicode", 5 | "main": "index.js", 6 | "directories": { 7 | "example": "examples" 8 | }, 9 | "dependencies": { 10 | "bs-gen": "^0.5.2", 11 | "bs-uchar": "^2.0.0" 12 | }, 13 | "devDependencies": { 14 | "archiver": "^3.1.1", 15 | "bs-platform": "^5.2.1", 16 | "cpy": "^7.3.0", 17 | "foreach-cli": "^1.8.1", 18 | "make-dir": "^1.3.0", 19 | "node-jq": "^1.10.3", 20 | "ppx-sedlex": "=20.0.0" 21 | }, 22 | "peerDependencies": { 23 | "bs-platform": ">=5.0.0" 24 | }, 25 | "scripts": { 26 | "clean:examples": "npm run clean:ppx && cd examples && bsb -clean-world", 27 | "build:examples": "npm run build:ppx && cd examples && bsb -make-world", 28 | "clean:runtime": "bsb -clean-world", 29 | "build:runtime": "bsb -make-world", 30 | "clean:ppx": "dune clean", 31 | "build:ppx": "dune build && node scripts/package.js", 32 | "clean": "npm run clean:runtime && npm run clean:ppx", 33 | "build": "npm run build:runtime && npm run build:ppx", 34 | "prepare": "npm run clean && npm run build:runtime", 35 | "preversion": "./scripts/preversion.sh", 36 | "version": "./scripts/version.sh", 37 | "test": "foreach -g 'examples/*.bs.js' -x 'node #{path}'", 38 | "travis": "./scripts/travis.sh" 39 | }, 40 | "files": [ 41 | "bsconfig.json", 42 | "sedlex.opam", 43 | "examples/**/*.ml", 44 | "examples/**/*.mli", 45 | "examples/**/*.bs.js", 46 | "src/**/*.ml", 47 | "src/**/*.mli", 48 | "src/**/*.bs.js", 49 | "!**/dune", 50 | "!**/.merlin" 51 | ], 52 | "repository": { 53 | "type": "git", 54 | "url": "git+https://github.com/ELLIOTTCABLE/bs-sedlex.git" 55 | }, 56 | "author": "Alain Frisch ", 57 | "license": "MIT", 58 | "bugs": { 59 | "url": "https://github.com/ELLIOTTCABLE/bs-sedlex/issues" 60 | }, 61 | "homepage": "https://github.com/ELLIOTTCABLE/bs-sedlex#readme" 62 | } 63 | -------------------------------------------------------------------------------- /CHANGES: -------------------------------------------------------------------------------- 1 | 2.0 2 | * GPR#70: Switch to dune, opam v2 3 | * GPR#60: Breaking change: switch from int codepoints to Uchar.t 4 | codepoints 5 | * GPR#59: Track lexing position 6 | 7 | ------------------------------------------------------------------------------- 8 | 9 | 1.99.4 10 | * GPR#47: Switch to ocaml-migrate-parsetree (contributed by Adrien Guatto) 11 | * GPR#42: Added 'Rep' (repeat operator) (contributed by jpathy) 12 | 13 | 1.99.3 14 | * Update to work with 4.03 (4.02 still supported) 15 | 16 | 1.99.2 17 | * First official release of sedlex 18 | 19 | 1.99.1 20 | * Support for new Ast_mapper registration API, follow OCaml trunk after 21 | the inclusion of the extension_point branch 22 | 23 | 1.99 24 | * First version of sedlex. The history below refers to ulex, the ancestor 25 | or sedlex implemented with Camlp4. 26 | 27 | ------------------------------------------------------------------------------- 28 | 29 | 1.1 30 | * Generate (more) globally unique identifiers to avoid conflicts when open'ing another module 31 | processed by ulex (issue reported by Gerd Stolpmann) 32 | 33 | 1.0 34 | * Update to the new Camlp4 and to ocamlbuild (release for OCaml 3.10 35 | only), by Nicolas Pouillard. 36 | 37 | 0.8 38 | * Really make it work with OCaml 3.09. 39 | * Support for Utf-16. 40 | 41 | 0.7 released May 24 2005 42 | * Bug fixes 43 | * Update to OCaml 3.09 (currently CVS). Still works with OCaml 3.08. 44 | * MIT-like license (used to LGPL) 45 | 46 | 0.5 release Jul. 8 2004 47 | * Document how to use a custom implementation for lex buffers 48 | * Update to OCaml 3.08 49 | 50 | 0.4 released Jan. 10 2004 51 | * Bug fix (accept 1114111 as valid Unicode code point) 52 | * Add the rollback function 53 | 54 | 0.3 released Oct. 8 2003 55 | * Bug fix 56 | * Add a new predefined class for ISO identifiers 57 | 58 | 0.2 released Sep. 22 2003 59 | * Changed the names of predefined regexp 60 | * Fix max_code = 0x10ffff 61 | * Lexers that changes encoding on the fly 62 | * Documentation of the interface Ulexing 63 | 64 | 0.1 released Sep. 20 2003 65 | * Initial release 66 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | git: 3 | depth: 3 4 | 5 | language: node_js 6 | node_js: 7 | - "10" 8 | 9 | cache: 10 | directories: 11 | - $HOME/.opam 12 | - $HOME/.npm 13 | 14 | install: STAGE=install npm run travis 15 | 16 | script: STAGE=test npm run travis 17 | 18 | jobs: 19 | include: 20 | - stage: deploy 21 | os: linux 22 | env: COMPONENT=ppx-examples-test 23 | script: STAGE=deploy npm run travis 24 | deploy: &gh-release 25 | provider: releases 26 | on: 27 | tags: true 28 | skip_cleanup: true 29 | api_key: 30 | secure: FXlYB0nsXDE2DXf+8zAEH7j2pnRxRIHiHZFN0pCDQqTkfOSKHIfhZ1+L0JD8vkV7GtXDBYAiFmVlYZyPTOWSDHKTWROdpe6qsJsV5EJXmREIjmgMUI14SDJITtyXSE7cezPVtKdjwprJtZUHIvKPbFYscM0DF0ayT0NwFqz01aE/DIQk2AWrzcaDzNSnCsYYoJsDtetOLmWXeABHV/ohfrEF1bfurqk8hSLdhDirE7xZm2gr+KgY8tYP3geU4hqVX4tk9xydnxiWGj3N7Z7CBmOLAXDm2sXJT8fjAN/DaorO7EPEZQw4tScmTigGQKrJ0jvNtJ6QpvG3sthFyxpkuhvW0+YGNdco9PcK8aoJGztwbi0+Fc54CfIUGtN/hxTsGWopUOGAAV3R0o91TXB25oNZlzzeSVZx2tbyDHqeP1w3l/9/bLbejBQcgBh0vEx6QDsCZT/c8i/QVOozZD6y50BmaSRdPu/QtAs3IU7cQLCz42l2IYH3gDTjpy1zpblZaHabQ6xfzl95PiodXKekHblypgOWgc+5QNuUo1+kXqIa84KlV4G67EOFKCR+QvnHpVgX9PAnw5K9G4Fp6Uc5og13verN/phSTeW5R08AGx9T5ieEz39g4FrwZSnQzqzbWBwygt9NHHiauUxETI44n3haLgFj0gQVnshZWn4ZSsg= 31 | file_glob: true 32 | file: dist/* 33 | overwrite: true 34 | draft: true 35 | - os: linux 36 | env: COMPONENT=ppx-examples-test NPM_TAG=dev 37 | script: STAGE=deploy npm run travis 38 | deploy: *gh-release 39 | - os: osx 40 | env: COMPONENT=ppx-examples-test 41 | script: STAGE=deploy npm run travis 42 | deploy: *gh-release 43 | - os: osx 44 | env: COMPONENT=ppx-examples-test NPM_TAG=dev 45 | script: STAGE=deploy npm run travis 46 | deploy: *gh-release 47 | 48 | env: 49 | - COMPONENT=ppx-examples-test 50 | - COMPONENT=ppx-examples-test NPM_TAG=latest 51 | - COMPONENT=ppx-examples-test NPM_TAG=dev 52 | - COMPONENT=runtime 53 | - COMPONENT=runtime NPM_TAG=latest 54 | - COMPONENT=runtime NPM_TAG=dev 55 | 56 | # Allow failures on the upcoming major version of BuckleScript 57 | #matrix: 58 | # fast_finish: true 59 | # allow_failures: 60 | # - env: COMPONENT=ppx-examples-test NPM_TAG=dev 61 | # - env: COMPONENT=runtime NPM_TAG=dev 62 | 63 | os: 64 | - linux 65 | - osx 66 | -------------------------------------------------------------------------------- /scripts/travis.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | puts() { printf %s\\n "$@" ;} 4 | pute() { printf %s\\n "~~ $*" >&2 ;} 5 | argq() { [ $# -gt 0 ] && printf "'%s' " "$@" ;} 6 | 7 | # An alternative to `set +o xtrace` that doesn't print the unset line. 8 | x () { puts '`` '"$*" >&2 ; "$@" || exit $? ;} 9 | 10 | if [ -z "$STAGE" ] || [ -z "$COMPONENT" ]; then 11 | pute 'This script is intended to be called in a CI environment with a current $STAGE,' 12 | pute 'and a $COMPONENT to build. e.g.' 13 | 14 | puts '' >&2 15 | puts ' STAGE=install COMPONENT=runtime npm run travis' >&2 16 | puts '' >&2 17 | exit 2 18 | fi 19 | 20 | # Helpers 21 | # ------- 22 | install_matching_ocaml() { 23 | x wget https://raw.githubusercontent.com/ocaml/ocaml-ci-scripts/master/.travis-ocaml.sh 24 | 25 | BSC_VERSION="$(bsc -vnum)" 26 | 27 | case "$BSC_VERSION" in 28 | *OCaml*4.02.3*) export OCAML_VERSION=4.02 OPAM_SWITCH=ocaml-base-compiler.4.02.3;; 29 | *OCaml*4.06.1*) export OCAML_VERSION=4.06 OPAM_SWITCH=ocaml-base-compiler.4.06.1;; 30 | *) 31 | pute 'Unrecognized `bsc` version: '"$BSC_VERSION" 32 | exit 10 33 | ;; 34 | esac 35 | 36 | puts 'export OPAM_SWITCH='"$OPAM_SWITCH" 37 | x bash -ex .travis-ocaml.sh 38 | x opam pin -y -n add sedlex . 39 | x opam install -y --deps-only sedlex 40 | eval `opam config env` 41 | 42 | if [ -n "$VERBOSE" ]; then 43 | x opam config env 44 | x opam list 45 | fi 46 | } 47 | 48 | # Stages 49 | # ------ 50 | stage_install() { 51 | # Enable configurable debugging without adding new commits. (If something goes wrong, 52 | # you can set $VERBOSE to some value inside Travis's configuration, and then hit 53 | # "rebuild".) 54 | if [ -n "$VERBOSE" ]; then 55 | x npm config set loglevel verbose 56 | fi 57 | 58 | x git fetch --tags --no-recurse-submodules 59 | 60 | # Install npm dependencies, but avoid invoking our own `prepare` script 61 | x npm ci --ignore-scripts 62 | 63 | # A horrible hack is necessary here, with which we *manually* apply any `postinstall` 64 | # scripts of transitive dependencies. Yuck. 65 | # 66 | # Remember to add any new dependencies! No, I don't know an easy way to remember to do 67 | # this, and I'm probably going to forget. Sorry, future-me, when you figure out that 68 | # this is what's screwing up the build. 69 | # 70 | # See: 71 | x npm rebuild node-jq 72 | 73 | # Now we either select a particular `bs-platform` to install, or manually process the 74 | # `postinstall` script of the one we installed above. 75 | if [ -n "$NPM_TAG" ]; then 76 | x npm install "bs-platform@$NPM_TAG" 77 | else 78 | x npm rebuild bs-platform 79 | fi 80 | 81 | # ‘Install’ our own local `ppx-sedlex` version. 82 | x npm link ./ppx-sedlex 83 | 84 | # Finally, for everything except the `runtime`, we need a working OCaml installation of 85 | # the same version as the BuckleScript we just built. 86 | if [ "$COMPONENT" != "runtime" ]; then install_matching_ocaml; fi 87 | } 88 | 89 | stage_test() { 90 | case "$COMPONENT" in 91 | runtime) 92 | x npm run --silent build:runtime 93 | ;; 94 | ppx-examples-test) 95 | eval `opam config env` 96 | 97 | x npm run --silent build:ppx 98 | (cd examples && x bsb -make-world) 99 | x npm run --silent test 100 | ;; 101 | esac 102 | } 103 | 104 | stage_deploy() { 105 | eval `opam config env` 106 | 107 | x npm run --silent clean:ppx 108 | x npm run --silent build:ppx 109 | } 110 | 111 | # Invocation 112 | # ---------- 113 | case "$STAGE" in 114 | install) stage_install ;; 115 | test) stage_test ;; 116 | deploy) stage_deploy ;; 117 | esac 118 | -------------------------------------------------------------------------------- /src/syntax/sedlex.ml: -------------------------------------------------------------------------------- 1 | (* The package sedlex is released under the terms of an MIT-like license. *) 2 | (* See the attached LICENSE file. *) 3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) 4 | 5 | module Cset = Sedlex_cset 6 | 7 | (* NFA *) 8 | 9 | type node = { 10 | id : int; 11 | mutable eps : node list; 12 | mutable trans : (Cset.t * node) list; 13 | } 14 | 15 | (* Compilation regexp -> NFA *) 16 | 17 | type regexp = node -> node 18 | 19 | let cur_id = ref 0 20 | let new_node () = 21 | incr cur_id; 22 | { id = !cur_id; eps = []; trans = [] } 23 | 24 | let seq r1 r2 succ = r1 (r2 succ) 25 | 26 | let is_chars final = function 27 | | {eps = []; trans = [c, f]} when f == final -> Some c 28 | | _ -> None 29 | 30 | let chars c succ = 31 | let n = new_node () in 32 | n.trans <- [c,succ]; 33 | n 34 | 35 | let alt r1 r2 succ = 36 | let nr1 = r1 succ and nr2 = r2 succ in 37 | match is_chars succ nr1, is_chars succ nr2 with 38 | | Some c1, Some c2 -> chars (Cset.union c1 c2) succ 39 | | _ -> 40 | let n = new_node () in 41 | n.eps <- [r1 succ; r2 succ]; 42 | n 43 | 44 | let rep r succ = 45 | let n = new_node () in 46 | n.eps <- [r n; succ]; 47 | n 48 | 49 | let plus r succ = 50 | let n = new_node () in 51 | let nr = r n in 52 | n.eps <- [nr; succ]; 53 | nr 54 | 55 | let eps succ = succ (* eps for epsilon *) 56 | 57 | let compl r = 58 | let n = new_node () in 59 | match is_chars n (r n) with 60 | | Some c -> 61 | Some (chars (Cset.difference Cset.any c)) 62 | | _ -> 63 | None 64 | 65 | let pair_op f r0 r1 = (* Construct subtract or intersection *) 66 | let n = new_node () in 67 | let to_chars r = is_chars n (r n) in 68 | match to_chars r0, to_chars r1 with 69 | | Some c0, Some c1 -> 70 | Some (chars (f c0 c1)) 71 | | _ -> 72 | None 73 | 74 | let subtract = pair_op Cset.difference 75 | 76 | let intersection = pair_op Cset.intersection 77 | 78 | let compile_re re = 79 | let final = new_node () in 80 | (re final, final) 81 | 82 | (* Determinization *) 83 | 84 | type state = node list 85 | (* A state of the DFA corresponds to a set of nodes in the NFA. *) 86 | 87 | let rec add_node state node = 88 | if List.memq node state then state else add_nodes (node::state) node.eps 89 | and add_nodes state nodes = 90 | List.fold_left add_node state nodes 91 | 92 | 93 | let transition (state : state) = 94 | (* Merge transition with the same target *) 95 | let rec norm = function 96 | | (c1, n1)::((c2, n2)::q as l) -> 97 | if n1 == n2 then norm ((Cset.union c1 c2, n1)::q) 98 | else (c1, n1)::(norm l) 99 | | l -> l in 100 | let t = List.concat (List.map (fun n -> n.trans) state) in 101 | let t = norm (List.sort (fun (_, n1) (_, n2) -> n1.id - n2.id) t) in 102 | 103 | (* Split char sets so as to make them disjoint *) 104 | let split (all, t) (c0, n0) = 105 | let t = 106 | (Cset.difference c0 all, [n0]) :: 107 | List.map (fun (c, ns) -> (Cset.intersection c c0, n0::ns)) t @ 108 | List.map (fun (c, ns) -> (Cset.difference c c0, ns)) t 109 | in 110 | Cset.union all c0, 111 | List.filter (fun (c, _) -> not (Cset.is_empty c)) t 112 | in 113 | 114 | let (_,t) = List.fold_left split (Cset.empty,[]) t in 115 | 116 | (* Epsilon closure of targets *) 117 | let t = List.map (fun (c, ns) -> (c, add_nodes [] ns)) t in 118 | 119 | (* Canonical ordering *) 120 | let t = Array.of_list t in 121 | Array.sort (fun (c1, _) (c2, _) -> compare c1 c2) t; 122 | t 123 | 124 | let compile rs = 125 | let rs = Array.map compile_re rs in 126 | let counter = ref 0 in 127 | let states = Hashtbl.create 31 in 128 | let states_def = Hashtbl.create 31 in 129 | let rec aux state = 130 | try Hashtbl.find states state 131 | with Not_found -> 132 | let i = !counter in 133 | incr counter; 134 | Hashtbl.add states state i; 135 | let trans = transition state in 136 | let trans = Array.map (fun (p, t) -> (p, aux t)) trans in 137 | let finals = Array.map (fun (_, f) -> List.memq f state) rs in 138 | Hashtbl.add states_def i (trans, finals); 139 | i 140 | in 141 | let init = ref [] in 142 | Array.iter (fun (i,_) -> init := add_node !init i) rs; 143 | let i = aux !init in 144 | assert(i = 0); 145 | Array.init !counter (Hashtbl.find states_def) 146 | -------------------------------------------------------------------------------- /scripts/version.sh: -------------------------------------------------------------------------------- 1 | puts() { printf %s\\n "$@" ;} 2 | pute() { printf %s\\n "!! $*" >&2 ;} 3 | argq() { [ $# -gt 0 ] && printf "'%s' " "$@" ;} 4 | 5 | jq_bin="$(npm prefix)/node_modules/node-jq/bin/jq" 6 | pj() { 7 | "$jq_bin" -e "$@" package.json 8 | } 9 | jqinplace() { 10 | file="$2" 11 | "$jq_bin" -e "$@" | sponge "$file" 12 | } 13 | 14 | dump_status() { 15 | cat < .deploy-status.env 16 | STEP="$STEP" 17 | NEW_VERSION="$NEW_VERSION" 18 | OLD_VERSION="$OLD_VERSION" 19 | EOF 20 | } 21 | 22 | 23 | # Script setup & resumption 24 | # ------ ----- - ---------- 25 | 26 | if [ -r .deploy-status.env ]; then 27 | puts '' 28 | puts 'These values were found in `.deploy-status.env`:' 29 | puts '' 30 | cat .deploy-status.env 31 | puts '' 32 | 33 | printf %s 'Load them and attempt to continue an aborted deployment? [Y/n] '; read yn 34 | if [ "$yn" != "${yn#[Yy]}" ]; then 35 | source .deploy-status.env 36 | export STEP NEW_VERSION OLD_VERSION 37 | else 38 | rm .deploy-status.env 39 | exit 3 40 | fi 41 | fi 42 | 43 | if [ -z "$STEP" ]; then 44 | if [ $# -ge 1 ]; then 45 | STEP="$1" 46 | else 47 | STEP=1 48 | fi 49 | fi 50 | 51 | if [ -z "$NEW_VERSION" ]; then 52 | if [ $# -ge 2 ]; then 53 | NEW_VERSION="$2" 54 | puts 'New version: '"v$NEW_VERSION" 55 | else 56 | NEW_VERSION="$(pj -r .version)" 57 | puts 'New version (from `bs-sedlex/package.json`): '"v$NEW_VERSION" 58 | fi 59 | fi 60 | 61 | if [ -z "$OLD_VERSION" ]; then 62 | if [ $# -ge 3 ]; then 63 | OLD_VERSION="$2" 64 | puts 'Old version: '"v$OLD_VERSION" 65 | else 66 | OLD_VERSION="$(cd ./ppx-sedlex; pj -r .version)" 67 | puts 'Old version (from `ppx-sedlex/package.json`): '"v$OLD_VERSION" 68 | fi 69 | fi 70 | 71 | if [ "$NEW_VERSION" = "$OLD_VERSION" ]; then 72 | pute 'The version in `bs-sedlex` doesn'\''t appear to have been bumped!' 73 | puts '' >&2 74 | 75 | pute 'You may have called this directly by accident; this script is intended to be' 76 | pute 'called from the `version` lifecycle. Try the following, instead:' 77 | puts '' >&2 78 | puts ' npm version patch --no-git-tag-version' >&2 79 | puts '' >&2 80 | exit 2 81 | fi 82 | 83 | 84 | # Steps 85 | # ----- 86 | step_1() { 87 | STEP=1 && dump_status 88 | 89 | puts '' 90 | printf %s 'Update bsconfig.json to match? [Y/n] '; read yn 91 | if [ "$yn" != "${yn#[Yy]}" ]; then 92 | jqinplace ".version = \"$NEW_VERSION\"" bsconfig.json || exit 127 93 | else 94 | exit 4 95 | fi 96 | 97 | step_2 98 | } 99 | 100 | 101 | step_2() { 102 | STEP=2 && dump_status 103 | 104 | puts '' 105 | printf %s 'Update ppx-sedlex/package.json to match? [Y/n] '; read yn 106 | if [ "$yn" != "${yn#[Yy]}" ]; then 107 | ( cd ./ppx-sedlex && \ 108 | npm version --no-git-tag-version "$NEW_VERSION" && \ 109 | git add package.json bsconfig.json package-lock.json) || exit 127 110 | else 111 | exit 4 112 | fi 113 | 114 | step_3 115 | } 116 | 117 | 118 | step_3() { 119 | STEP=3 && dump_status 120 | 121 | puts '' 122 | printf %s 'Remove previous artifacts and checksums from ppx-sedlex? [Y/n] '; read yn 123 | if [ "$yn" != "${yn#[Yy]}" ]; then 124 | ( cd ./ppx-sedlex && 125 | git clean -Xdf; 126 | git rm SHASUM256.txt ) 127 | else 128 | exit 4 129 | fi 130 | 131 | step_4 132 | } 133 | 134 | 135 | step_4() { 136 | STEP=4 && dump_status 137 | 138 | puts '' 139 | ( cd ./ppx-sedlex && \ 140 | git status ) 141 | printf %s 'Commit new, unpopulated version of ppx-sedlex with these changes? [Y/n] '; read yn 142 | if [ "$yn" != "${yn#[Yy]}" ]; then 143 | ( cd ./ppx-sedlex && \ 144 | git commit -m "(- rel NF) Prep for v$NEW_VERSION" ) || exit 127 145 | git add ./ppx-sedlex || exit 127 146 | else 147 | exit 4 148 | fi 149 | 150 | step_5 151 | } 152 | 153 | 154 | step_5() { 155 | STEP=5 && dump_status 156 | 157 | puts '' 158 | git add package.json package-lock.json bsconfig.json || exit 127 159 | git status 160 | printf %s 'Further, commit these changes to a new version of bs-sedlex as well? [Y/n] '; read yn 161 | if [ "$yn" != "${yn#[Yy]}" ]; then 162 | git commit -m "(rel) v$NEW_VERSION" || exit 127 163 | else 164 | exit 4 165 | fi 166 | 167 | step_6 168 | } 169 | 170 | 171 | step_6() { 172 | STEP=6 && dump_status 173 | 174 | puts '' 175 | git --no-pager show 176 | printf %s "Tag this changeset as 'v$NEW_VERSION'? [Y/n] "; read yn 177 | if [ "$yn" != "${yn#[Yy]}" ]; then 178 | git tag "v$NEW_VERSION" --file=- < int -> int -> int) -> lexbuf 44 | (** Create a generic lexer buffer. When the lexer needs more 45 | characters, it will call the given function, giving it an array of 46 | Uchars [a], a position [pos] and a code point count [n]. The 47 | function should put [n] code points or less in [a], starting at 48 | position [pos], and return the number of characters provided. A 49 | return value of 0 means end of input. *) 50 | 51 | val set_position: lexbuf -> Lexing.position -> unit 52 | (** set the initial tracked input position for [lexbuf]. 53 | If set to [Lexing.dummy_pos], Sedlexing will not track position 54 | information for you. *) 55 | 56 | val set_filename: lexbuf -> string -> unit 57 | (** [set_filename lexbuf file] sets the filename to [file] in 58 | [lexbuf]. It also sets the {!Lexing.pos_fname} field in 59 | returned {!Lexing.position} records. *) 60 | 61 | val from_gen: Uchar.t Gen.t -> lexbuf 62 | (** Create a lexbuf from a stream of Unicode code points. *) 63 | 64 | val from_stream: Uchar.t Stream.t -> lexbuf 65 | [@@ocaml.deprecated "Use [Sedlexing.from_gen] instead."] 66 | (** Create a lexbuf from a stream of Unicode code points. *) 67 | 68 | val from_int_array: int array -> lexbuf 69 | (** Create a lexbuf from an array of Unicode code points. *) 70 | 71 | val from_uchar_array: Uchar.t array -> lexbuf 72 | (** Create a lexbuf from an array of Unicode code points. *) 73 | 74 | (** {6 Interface for lexers semantic actions} *) 75 | 76 | (** The following functions can be called from the semantic actions of 77 | lexer definitions. They give access to the character string matched 78 | by the regular expression associated with the semantic action. *) 79 | 80 | val lexeme_start: lexbuf -> int 81 | (** [Sedlexing.lexeme_start lexbuf] returns the offset in the 82 | input stream of the first code point of the matched string. 83 | The first code point of the stream has offset 0. *) 84 | 85 | val lexeme_end: lexbuf -> int 86 | (** [Sedlexing.lexeme_end lexbuf] returns the offset in the input 87 | stream of the character following the last code point of the 88 | matched string. The first character of the stream has offset 89 | 0. *) 90 | 91 | val loc: lexbuf -> int * int 92 | (** [Sedlexing.loc lexbuf] returns the pair 93 | [(Sedlexing.lexeme_start lexbuf,Sedlexing.lexeme_end 94 | lexbuf)]. *) 95 | 96 | val lexeme_length: lexbuf -> int 97 | (** [Sedlexing.loc lexbuf] returns the difference 98 | [(Sedlexing.lexeme_end lexbuf) - (Sedlexing.lexeme_start 99 | lexbuf)], that is, the length (in code points) of the matched 100 | string. *) 101 | 102 | val lexing_positions : lexbuf -> Lexing.position*Lexing.position 103 | (** [Sedlexing.lexing_positions lexbuf] returns the start and end 104 | positions of the current token, using a record of type 105 | [Lexing.position]. This is intended for consumption 106 | by parsers like those generated by [Menhir]. *) 107 | 108 | val new_line: lexbuf -> unit 109 | (** [Sedlexing.new_line lexbuf] increments the line count and 110 | sets the beginning of line to the current position, as though 111 | a newline character had been encountered in the input. *) 112 | 113 | val lexeme: lexbuf -> Uchar.t array 114 | (** [Sedlexing.lexeme lexbuf] returns the string matched by the 115 | regular expression as an array of Unicode code point. *) 116 | 117 | val lexeme_char: lexbuf -> int -> Uchar.t 118 | (** [Sedlexing.lexeme_char lexbuf pos] returns code point number [pos] in 119 | the matched string. *) 120 | 121 | val sub_lexeme: lexbuf -> int -> int -> Uchar.t array 122 | (** [Sedlexing.sub_lexeme lexbuf pos len] returns a substring of the string 123 | matched by the regular expression as an array of Unicode code point. *) 124 | 125 | val rollback: lexbuf -> unit 126 | (** [Sedlexing.rollback lexbuf] puts [lexbuf] back in its configuration before 127 | the last lexeme was matched. It is then possible to use another 128 | lexer to parse the same characters again. The other functions 129 | above in this section should not be used in the semantic action 130 | after a call to [Sedlexing.rollback]. *) 131 | 132 | (** {6 Internal interface} *) 133 | 134 | (** These functions are used internally by the lexers. They could be used 135 | to write lexers by hand, or with a lexer generator different from 136 | [sedlex]. The lexer buffers have a unique internal slot that can store 137 | an integer. They also store a "backtrack" position. 138 | *) 139 | 140 | val start: lexbuf -> unit 141 | (** [start t] informs the lexer buffer that any 142 | code points until the current position can be discarded. 143 | The current position become the "start" position as returned 144 | by [Sedlexing.lexeme_start]. Moreover, the internal slot is set to 145 | [-1] and the backtrack position is set to the current position. 146 | *) 147 | 148 | val next: lexbuf -> Uchar.t option 149 | (** [next lexbuf] extracts the next code point from the 150 | lexer buffer and increments to current position. If the input stream 151 | is exhausted, the function returns [None]. 152 | If a ['\n'] is encountered, the tracked line number is incremented. *) 153 | 154 | val mark: lexbuf -> int -> unit 155 | (** [mark lexbuf i] stores the integer [i] in the internal 156 | slot. The backtrack position is set to the current position. *) 157 | 158 | val backtrack: lexbuf -> int 159 | (** [backtrack lexbuf] returns the value stored in the 160 | internal slot of the buffer, and performs backtracking 161 | (the current position is set to the value of the backtrack position). *) 162 | 163 | val with_tokenizer: (lexbuf -> 'token) -> lexbuf -> (unit -> 'token * Lexing.position * Lexing.position) 164 | (** [with_tokenizer tokenizer lexbuf] given a lexer and a lexbuf, 165 | returns a generator of tokens annotated with positions. 166 | This generator can be used with the Menir parser generator's 167 | incremental API. *) 168 | 169 | (** {6 Support for common encodings} *) 170 | 171 | module Latin1: sig 172 | val from_gen: char Gen.t -> lexbuf 173 | (** Create a lexbuf from a Latin1 encoded stream (ie a stream 174 | of Unicode code points in the range [0..255]) *) 175 | 176 | val from_stream: char Stream.t -> lexbuf 177 | [@@ocaml.deprecated "Use [Sedlexing.Latin1.from_gen] instead."] 178 | (** Create a lexbuf from a Latin1 encoded stream (ie a stream 179 | of Unicode code points in the range [0..255]) *) 180 | 181 | val from_channel: in_channel -> lexbuf 182 | (** Create a lexbuf from a Latin1 encoded input channel. 183 | The client is responsible for closing the channel. *) 184 | 185 | val from_string: string -> lexbuf 186 | (** Create a lexbuf from a Latin1 encoded string. *) 187 | 188 | 189 | val lexeme: lexbuf -> string 190 | (** As [Sedlexing.lexeme] with a result encoded in Latin1. This 191 | function throws an exception [InvalidCodepoint] if it is not 192 | possible to encode the result in Latin1. *) 193 | 194 | val sub_lexeme: lexbuf -> int -> int -> string 195 | (** As [Sedlexing.sub_lexeme] with a result encoded in Latin1. 196 | This function throws an exception [InvalidCodepoint] if it 197 | is not possible to encode the result in Latin1. *) 198 | 199 | val lexeme_char: lexbuf -> int -> char 200 | (** As [Sedlexing.lexeme_char] with a result encoded in Latin1. 201 | This function throws an exception [InvalidCodepoint] if it 202 | is not possible to encode the result in Latin1. *) 203 | end 204 | 205 | 206 | module Utf8: sig 207 | val from_gen: char Gen.t -> lexbuf 208 | (** Create a lexbuf from a UTF-8 encoded stream. *) 209 | 210 | val from_stream: char Stream.t -> lexbuf 211 | [@@ocaml.deprecated "Use [Sedlexing.Utf8.from_gen] instead."] 212 | (** Create a lexbuf from a UTF-8 encoded stream. *) 213 | 214 | val from_channel: in_channel -> lexbuf 215 | (** Create a lexbuf from a UTF-8 encoded input channel. *) 216 | 217 | val from_string: string -> lexbuf 218 | (** Create a lexbuf from a UTF-8 encoded string. *) 219 | 220 | val lexeme: lexbuf -> string 221 | (** As [Sedlexing.lexeme] with a result encoded in UTF-8. *) 222 | 223 | val sub_lexeme: lexbuf -> int -> int -> string 224 | (** As [Sedlexing.sub_lexeme] with a result encoded in UTF-8. *) 225 | end 226 | 227 | 228 | module Utf16: sig 229 | type byte_order = Little_endian | Big_endian 230 | 231 | val from_gen: char Gen.t -> byte_order option -> lexbuf 232 | (** [Utf16.from_gen s opt_bo] creates a lexbuf from an UTF-16 233 | encoded stream. If [opt_bo] matches with [None] the function 234 | expects a BOM (Byte Order Mark), and takes the byte order as 235 | [Utf16.Big_endian] if it cannot find one. When [opt_bo] 236 | matches with [Some bo], [bo] is taken as byte order. In this 237 | case a leading BOM is kept in the stream - the lexer has to 238 | ignore it and a `wrong' BOM ([0xfffe]) will raise 239 | Utf16.InvalidCodepoint. *) 240 | 241 | val from_stream: char Stream.t -> byte_order option -> lexbuf 242 | [@@ocaml.deprecated "Use [Sedlexing.Utf16.from_gen] instead."] 243 | (** Works as [Utf16.from_gen] with a [stream]. *) 244 | 245 | val from_channel: in_channel -> byte_order option-> lexbuf 246 | (** Works as [Utf16.from_gen] with an [in_channel]. *) 247 | 248 | val from_string: string -> byte_order option -> lexbuf 249 | (** Works as [Utf16.from_gen] with a [string]. *) 250 | 251 | val lexeme: lexbuf -> byte_order -> bool -> string 252 | (** [utf16_lexeme lb bo bom] as [Sedlexing.lexeme] with a result 253 | encoded in UTF-16 in byte_order [bo] and starting with a BOM 254 | if [bom = true]. *) 255 | 256 | val sub_lexeme: lexbuf -> int -> int -> byte_order -> bool -> string 257 | (** [sub_lexeme lb pos len bo bom] as 258 | [Sedlexing.sub_lexeme] with a result encoded in UTF-16 with 259 | byte order [bo] and starting with a BOM if [bom=true] *) 260 | end 261 | -------------------------------------------------------------------------------- /src/syntax/sedlex_cset.ml: -------------------------------------------------------------------------------- 1 | (* The package sedlex is released under the terms of an MIT-like license. *) 2 | (* See the attached LICENSE file. *) 3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) 4 | 5 | (* Character sets are represented as lists of intervals. The 6 | intervals must be non-overlapping and not collapsable, and the list 7 | must be ordered in increasing order. *) 8 | 9 | type t = (int * int) list 10 | 11 | let max_code = 0x10ffff (* must be < max_int *) 12 | let min_code = -1 13 | 14 | let empty = [] 15 | let singleton i = [i,i] 16 | let is_empty = function [] -> true | _ -> false 17 | let interval i j = if i <= j then [i,j] else [j,i] 18 | let eof = singleton (-1) 19 | let any = interval 0 max_code 20 | 21 | let rec union c1 c2 = 22 | match c1,c2 with 23 | | [], _ -> c2 24 | | _, [] -> c1 25 | | ((i1, j1) as s1)::r1, (i2, j2)::r2 -> 26 | if (i1 <= i2) then 27 | if j1 + 1 < i2 then s1::(union r1 c2) 28 | else if (j1 < j2) then union r1 ((i1, j2)::r2) 29 | else union c1 r2 30 | else union c2 c1 31 | 32 | let complement c = 33 | let rec aux start = function 34 | | [] -> if start <= max_code then [start,max_code] else [] 35 | | (i, j)::l -> (start, i-1)::(aux (succ j) l) 36 | in 37 | match c with 38 | | (-1,j)::l -> aux (succ j) l 39 | | l -> aux (-1) l 40 | 41 | let intersection c1 c2 = 42 | complement (union (complement c1) (complement c2)) 43 | 44 | let difference c1 c2 = 45 | complement (union (complement c1) c2) 46 | 47 | 48 | (* Unicode classes from XML *) 49 | 50 | let base_char = 51 | [ 0x0041,0x005A; 0x0061,0x007A; 0x00C0,0x00D6; 0x00D8,0x00F6; 52 | 0x00F8,0x00FF; 0x0100,0x0131; 0x0134,0x013E; 0x0141,0x0148; 53 | 0x014A,0x017E; 0x0180,0x01C3; 0x01CD,0x01F0; 0x01F4,0x01F5; 54 | 0x01FA,0x0217; 0x0250,0x02A8; 0x02BB,0x02C1; 0x0386,0x0386; 55 | 0x0388,0x038A; 0x038C,0x038C; 0x038E,0x03A1; 0x03A3,0x03CE; 56 | 0x03D0,0x03D6; 0x03DA,0x03DA; 0x03DC,0x03DC; 0x03DE,0x03DE; 57 | 0x03E0,0x03E0; 0x03E2,0x03F3; 58 | 0x0401,0x040C; 0x040E,0x044F; 0x0451,0x045C; 0x045E,0x0481; 59 | 0x0490,0x04C4; 0x04C7,0x04C8; 0x04CB,0x04CC; 0x04D0,0x04EB; 60 | 0x04EE,0x04F5; 0x04F8,0x04F9; 0x0531,0x0556; 0x0559,0x0559; 61 | 0x0561,0x0586; 0x05D0,0x05EA; 0x05F0,0x05F2; 0x0621,0x063A; 62 | 0x0641,0x064A; 0x0671,0x06B7; 0x06BA,0x06BE; 0x06C0,0x06CE; 63 | 0x06D0,0x06D3; 0x06D5,0x06D5; 0x06E5,0x06E6; 0x0905,0x0939; 64 | 0x093D,0x093D; 65 | 0x0958,0x0961; 0x0985,0x098C; 0x098F,0x0990; 0x0993,0x09A8; 66 | 0x09AA,0x09B0; 0x09B2,0x09B2; 0x09B6,0x09B9; 0x09DC,0x09DD; 67 | 0x09DF,0x09E1; 0x09F0,0x09F1; 0x0A05,0x0A0A; 0x0A0F,0x0A10; 68 | 0x0A13,0x0A28; 0x0A2A,0x0A30; 0x0A32,0x0A33; 0x0A35,0x0A36; 69 | 0x0A38,0x0A39; 0x0A59,0x0A5C; 0x0A5E,0x0A5E; 0x0A72,0x0A74; 70 | 0x0A85,0x0A8B; 0x0A8D,0x0A8D; 0x0A8F,0x0A91; 0x0A93,0x0AA8; 71 | 0x0AAA,0x0AB0; 0x0AB2,0x0AB3; 0x0AB5,0x0AB9; 0x0ABD,0x0ABD; 72 | 0x0AE0,0x0AE0; 73 | 0x0B05,0x0B0C; 0x0B0F,0x0B10; 0x0B13,0x0B28; 0x0B2A,0x0B30; 74 | 0x0B32,0x0B33; 0x0B36,0x0B39; 0x0B3D,0x0B3D; 0x0B5C,0x0B5D; 75 | 0x0B5F,0x0B61; 0x0B85,0x0B8A; 0x0B8E,0x0B90; 0x0B92,0x0B95; 76 | 0x0B99,0x0B9A; 0x0B9C,0x0B9C; 0x0B9E,0x0B9F; 0x0BA3,0x0BA4; 77 | 0x0BA8,0x0BAA; 0x0BAE,0x0BB5; 0x0BB7,0x0BB9; 0x0C05,0x0C0C; 78 | 0x0C0E,0x0C10; 0x0C12,0x0C28; 0x0C2A,0x0C33; 0x0C35,0x0C39; 79 | 0x0C60,0x0C61; 0x0C85,0x0C8C; 0x0C8E,0x0C90; 0x0C92,0x0CA8; 80 | 0x0CAA,0x0CB3; 0x0CB5,0x0CB9; 0x0CDE,0x0CDE; 0x0CE0,0x0CE1; 81 | 0x0D05,0x0D0C; 0x0D0E,0x0D10; 0x0D12,0x0D28; 0x0D2A,0x0D39; 82 | 0x0D60,0x0D61; 0x0E01,0x0E2E; 0x0E30,0x0E30; 0x0E32,0x0E33; 83 | 0x0E40,0x0E45; 0x0E81,0x0E82; 0x0E84,0x0E84; 0x0E87,0x0E88; 84 | 0x0E8A,0x0E8A; 85 | 0x0E8D,0x0E8D; 0x0E94,0x0E97; 0x0E99,0x0E9F; 0x0EA1,0x0EA3; 86 | 0x0EA5,0x0EA5; 87 | 0x0EA7,0x0EA7; 0x0EAA,0x0EAB; 0x0EAD,0x0EAE; 0x0EB0,0x0EB0; 88 | 0x0EB2,0x0EB3; 89 | 0x0EBD,0x0EBD; 0x0EC0,0x0EC4; 0x0F40,0x0F47; 0x0F49,0x0F69; 90 | 0x10A0,0x10C5; 0x10D0,0x10F6; 0x1100,0x1100; 0x1102,0x1103; 91 | 0x1105,0x1107; 0x1109,0x1109; 0x110B,0x110C; 0x110E,0x1112; 92 | 0x113C,0x113C; 93 | 0x113E,0x113E; 0x1140,0x1140; 0x114C,0x114C; 0x114E,0x114E; 94 | 0x1150,0x1150; 0x1154,0x1155; 0x1159,0x1159; 95 | 0x115F,0x1161; 0x1163,0x1163; 0x1165,0x1165; 0x1167,0x1167; 96 | 0x1169,0x1169; 0x116D,0x116E; 97 | 0x1172,0x1173; 0x1175,0x1175; 0x119E,0x119E; 0x11A8,0x11A8; 98 | 0x11AB,0x11AB; 0x11AE,0x11AF; 99 | 0x11B7,0x11B8; 0x11BA,0x11BA; 0x11BC,0x11C2; 0x11EB,0x11EB; 100 | 0x11F0,0x11F0; 0x11F9,0x11F9; 101 | 0x1E00,0x1E9B; 0x1EA0,0x1EF9; 0x1F00,0x1F15; 0x1F18,0x1F1D; 102 | 0x1F20,0x1F45; 0x1F48,0x1F4D; 0x1F50,0x1F57; 0x1F59,0x1F59; 103 | 0x1F5B,0x1F5B; 104 | 0x1F5D,0x1F5D; 0x1F5F,0x1F7D; 0x1F80,0x1FB4; 0x1FB6,0x1FBC; 105 | 0x1FBE,0x1FBE; 106 | 0x1FC2,0x1FC4; 0x1FC6,0x1FCC; 0x1FD0,0x1FD3; 0x1FD6,0x1FDB; 107 | 0x1FE0,0x1FEC; 0x1FF2,0x1FF4; 0x1FF6,0x1FFC; 0x2126,0x2126; 108 | 0x212A,0x212B; 0x212E,0x212E; 0x2180,0x2182; 0x3041,0x3094; 109 | 0x30A1,0x30FA; 0x3105,0x312C; 0xAC00,0xD7A3 ] 110 | 111 | let ideographic = 112 | [ 0x3007,0x3007; 0x3021,0x3029; 0x4E00,0x9FA5 ] 113 | 114 | let combining_char = 115 | [ 0x0300,0x0345; 0x0360,0x0361; 0x0483,0x0486; 0x0591,0x05A1; 116 | 0x05A3,0x05B9; 0x05BB,0x05BD; 0x05BF,0x05BF; 0x05C1,0x05C2; 117 | 0x05C4,0x05C4; 0x064B,0x0652; 0x0670,0x0670; 0x06D6,0x06DC; 118 | 0x06DD,0x06DF; 0x06E0,0x06E4; 0x06E7,0x06E8; 0x06EA,0x06ED; 119 | 0x0901,0x0903; 0x093C,0x093C; 0x093E,0x094C; 0x094D,0x094D; 120 | 0x0951,0x0954; 0x0962,0x0963; 0x0981,0x0983; 0x09BC,0x09BC; 121 | 0x09BE,0x09BE; 0x09BF,0x09BF; 0x09C0,0x09C4; 0x09C7,0x09C8; 122 | 0x09CB,0x09CD; 0x09D7,0x09D7; 0x09E2,0x09E3; 0x0A02,0x0A02; 123 | 0x0A3C,0x0A3C; 0x0A3E,0x0A3E; 0x0A3F,0x0A3F; 0x0A40,0x0A42; 124 | 0x0A47,0x0A48; 0x0A4B,0x0A4D; 0x0A70,0x0A71; 0x0A81,0x0A83; 125 | 0x0ABC,0x0ABC; 0x0ABE,0x0AC5; 0x0AC7,0x0AC9; 0x0ACB,0x0ACD; 126 | 0x0B01,0x0B03; 0x0B3C,0x0B3C; 0x0B3E,0x0B43; 0x0B47,0x0B48; 127 | 0x0B4B,0x0B4D; 0x0B56,0x0B57; 0x0B82,0x0B83; 0x0BBE,0x0BC2; 128 | 0x0BC6,0x0BC8; 0x0BCA,0x0BCD; 0x0BD7,0x0BD7; 0x0C01,0x0C03; 129 | 0x0C3E,0x0C44; 0x0C46,0x0C48; 0x0C4A,0x0C4D; 0x0C55,0x0C56; 130 | 0x0C82,0x0C83; 0x0CBE,0x0CC4; 0x0CC6,0x0CC8; 0x0CCA,0x0CCD; 131 | 0x0CD5,0x0CD6; 0x0D02,0x0D03; 0x0D3E,0x0D43; 0x0D46,0x0D48; 132 | 0x0D4A,0x0D4D; 0x0D57,0x0D57; 0x0E31,0x0E31; 0x0E34,0x0E3A; 133 | 0x0E47,0x0E4E; 0x0EB1,0x0EB1; 0x0EB4,0x0EB9; 0x0EBB,0x0EBC; 134 | 0x0EC8,0x0ECD; 0x0F18,0x0F19; 0x0F35,0x0F35; 0x0F37,0x0F37; 135 | 0x0F39,0x0F39; 0x0F3E,0x0F3E; 0x0F3F,0x0F3F; 0x0F71,0x0F84; 136 | 0x0F86,0x0F8B; 0x0F90,0x0F95; 0x0F97,0x0F97; 0x0F99,0x0FAD; 137 | 0x0FB1,0x0FB7; 0x0FB9,0x0FB9; 0x20D0,0x20DC; 0x20E1,0x20E1; 138 | 0x302A,0x302F; 0x3099,0x3099; 0x309A,0x309A ] 139 | 140 | let digit = 141 | [ 0x0030,0x0039; 142 | 0x0660,0x0669; 0x06F0,0x06F9; 0x0966,0x096F; 0x09E6,0x09EF; 143 | 0x0A66,0x0A6F; 0x0AE6,0x0AEF; 0x0B66,0x0B6F; 0x0BE7,0x0BEF; 144 | 0x0C66,0x0C6F; 0x0CE6,0x0CEF; 0x0D66,0x0D6F; 0x0E50,0x0E59; 145 | 0x0ED0,0x0ED9; 0x0F20,0x0F29 ] 146 | 147 | let extender = 148 | [ 0x00B7,0x00B7; 0x02D0,0x02D1; 0x0387,0x0387; 0x0640,0x0640; 149 | 0x0E46,0x0E46; 0x0EC6,0x0EC6; 0x3005,0x3005; 0x3031,0x3035; 150 | 0x309D,0x309E; 0x30FC,0x30FE ] 151 | 152 | let blank = 153 | [ 0x0009,0x000A; 0x000D,0x000D; 0x0020,0x0020 ] 154 | 155 | let letter = union base_char ideographic 156 | 157 | 158 | (* Letters to be used in identifiers, as specified 159 | by ISO .... 160 | Data provided by John M. Skaller *) 161 | let tr8876_ident_char = [ 162 | (* ASCII *) 163 | (0x0041,0x005a); 164 | (0x0061,0x007a); 165 | 166 | (* Latin *) 167 | (0x00c0,0x00d6); 168 | (0x00d8,0x00f6); 169 | (0x00f8,0x01f5); 170 | (0x01fa,0x0217); 171 | (0x0250,0x02a8); 172 | 173 | (* Greek *) 174 | (0x0384,0x0384); 175 | (0x0388,0x038a); 176 | (0x038c,0x038c); 177 | (0x038e,0x03a1); 178 | (0x03a3,0x03ce); 179 | (0x03d0,0x03d6); 180 | (0x03da,0x03da); 181 | (0x03dc,0x03dc); 182 | (0x03de,0x03de); 183 | (0x03e0,0x03e0); 184 | (0x03e2,0x03f3); 185 | 186 | (* Cyrillic *) 187 | (0x0401,0x040d); 188 | (0x040f,0x044f); 189 | (0x0451,0x045c); 190 | (0x045e,0x0481); 191 | (0x0490,0x04c4); 192 | (0x04c7,0x04c4); 193 | (0x04cb,0x04cc); 194 | (0x04d0,0x04eb); 195 | (0x04ee,0x04f5); 196 | (0x04f8,0x04f9); 197 | 198 | (* Armenian *) 199 | (0x0531,0x0556); 200 | (0x0561,0x0587); 201 | (0x04d0,0x04eb); 202 | 203 | (* Hebrew *) 204 | (0x05d0,0x05ea); 205 | (0x05f0,0x05f4); 206 | 207 | (* Arabic *) 208 | (0x0621,0x063a); 209 | (0x0640,0x0652); 210 | (0x0670,0x06b7); 211 | (0x06ba,0x06be); 212 | (0x06c0,0x06ce); 213 | (0x06e5,0x06e7); 214 | 215 | (* Devanagari *) 216 | (0x0905,0x0939); 217 | (0x0958,0x0962); 218 | 219 | (* Bengali *) 220 | (0x0985,0x098c); 221 | (0x098f,0x0990); 222 | (0x0993,0x09a8); 223 | (0x09aa,0x09b0); 224 | (0x09b2,0x09b2); 225 | (0x09b6,0x09b9); 226 | (0x09dc,0x09dd); 227 | (0x09df,0x09e1); 228 | (0x09f0,0x09f1); 229 | 230 | (* Gurmukhi *) 231 | (0x0a05,0x0a0a); 232 | (0x0a0f,0x0a10); 233 | (0x0a13,0x0a28); 234 | (0x0a2a,0x0a30); 235 | (0x0a32,0x0a33); 236 | (0x0a35,0x0a36); 237 | (0x0a38,0x0a39); 238 | (0x0a59,0x0a5c); 239 | (0x0a5e,0x0a5e); 240 | 241 | (* Gunjarati *) 242 | (0x0a85,0x0a8b); 243 | (0x0a8d,0x0a8d); 244 | (0x0a8f,0x0a91); 245 | (0x0a93,0x0aa8); 246 | (0x0aaa,0x0ab0); 247 | (0x0ab2,0x0ab3); 248 | (0x0ab5,0x0ab9); 249 | (0x0ae0,0x0ae0); 250 | 251 | (* Oriya *) 252 | (0x0b05,0x0b0c); 253 | (0x0b0f,0x0b10); 254 | (0x0b13,0x0b28); 255 | (0x0b2a,0x0b30); 256 | (0x0b32,0x0b33); 257 | (0x0b36,0x0b39); 258 | (0x0b5c,0x0b5d); 259 | (0x0b5f,0x0b61); 260 | 261 | (* Tamil *) 262 | (0x0b85,0x0b8a); 263 | (0x0b8e,0x0b90); 264 | (0x0b92,0x0b95); 265 | (0x0b99,0x0b9a); 266 | (0x0b9c,0x0b9c); 267 | (0x0b9e,0x0b9f); 268 | (0x0ba3,0x0ba4); 269 | (0x0ba8,0x0baa); 270 | (0x0bae,0x0bb5); 271 | (0x0bb7,0x0bb9); 272 | 273 | (* Telugu *) 274 | (0x0c05,0x0c0c); 275 | (0x0c0e,0x0c10); 276 | (0x0c12,0x0c28); 277 | (0x0c2a,0x0c33); 278 | (0x0c35,0x0c39); 279 | (0x0c60,0x0c61); 280 | 281 | (* Kannada *) 282 | (0x0c85,0x0c8c); 283 | (0x0c8e,0x0c90); 284 | (0x0c92,0x0ca8); 285 | (0x0caa,0x0cb3); 286 | (0x0cb5,0x0cb9); 287 | (0x0ce0,0x0ce1); 288 | 289 | (* Malayam *) 290 | (0x0d05,0x0d0c); 291 | (0x0d0e,0x0d10); 292 | (0x0d12,0x0d28); 293 | (0x0d2a,0x0d39); 294 | (0x0d60,0x0d61); 295 | 296 | (* Thai *) 297 | (0x0e01,0x0e30); 298 | (0x0e32,0x0e33); 299 | (0x0e40,0x0e46); 300 | (0x0e4f,0x0e5b); 301 | 302 | (* Lao *) 303 | (0x0e81,0x0e82); 304 | (0x0e84,0x0e84); 305 | (0x0e87,0x0e88); 306 | (0x0e8a,0x0e8a); 307 | (0x0e0d,0x0e0d); 308 | (0x0e94,0x0e97); 309 | (0x0e99,0x0e9f); 310 | (0x0ea1,0x0ea3); 311 | (0x0ea5,0x0ea5); 312 | (0x0ea7,0x0ea7); 313 | (0x0eaa,0x0eab); 314 | (0x0ead,0x0eb0); 315 | (0x0eb2,0x0eb3); 316 | (0x0ebd,0x0ebd); 317 | (0x0ec0,0x0ec4); 318 | (0x0ec6,0x0ec6); 319 | 320 | (* Georgian *) 321 | (0x10a0,0x10c5); 322 | (0x10d0,0x10f6); 323 | 324 | (* Hangul Jamo *) 325 | (0x1100,0x1159); 326 | (0x1161,0x11a2); 327 | (0x11a8,0x11f9); 328 | (0x11d0,0x11f6); 329 | 330 | (* Latin extensions *) 331 | (0x1e00,0x1e9a); 332 | (0x1ea0,0x1ef9); 333 | 334 | (* Greek extended *) 335 | (0x1f00,0x1f15); 336 | (0x1f18,0x1f1d); 337 | (0x1f20,0x1f45); 338 | (0x1f48,0x1f4d); 339 | (0x1f50,0x1f57); 340 | (0x1f59,0x1f59); 341 | (0x1f5b,0x1f5b); 342 | (0x1f5d,0x1f5d); 343 | (0x1f5f,0x1f7d); 344 | (0x1f80,0x1fb4); 345 | (0x1fb6,0x1fbc); 346 | (0x1fc2,0x1fc4); 347 | (0x1fc6,0x1fcc); 348 | (0x1fd0,0x1fd3); 349 | (0x1fd6,0x1fdb); 350 | (0x1fe0,0x1fec); 351 | (0x1ff2,0x1ff4); 352 | (0x1ff6,0x1ffc); 353 | 354 | 355 | (* Hiragana *) 356 | (0x3041,0x3094); 357 | (0x309b,0x309e); 358 | 359 | (* Katakana *) 360 | (0x30a1,0x30fe); 361 | 362 | (* Bopmofo *) 363 | (0x3105,0x312c); 364 | 365 | (* CJK Unified Ideographs *) 366 | (0x4e00,0x9fa5); 367 | 368 | (* CJK Compatibility Ideographs *) 369 | (0xf900,0xfa2d); 370 | 371 | (* Arabic Presentation Forms *) 372 | (0xfb1f,0xfb36); 373 | (0xfb38,0xfb3c); 374 | (0xfb3e,0xfb3e); 375 | (0xfb40,0xfb41); 376 | (0xfb42,0xfb44); 377 | (0xfb46,0xfbb1); 378 | (0xfbd3,0xfd35); 379 | 380 | (* Arabic Presentation Forms-A *) 381 | (0xfd50,0xfd85); 382 | (0xfd92,0xfbc7); 383 | (0xfdf0,0xfdfb); 384 | 385 | (* Arabic Presentation Forms-B *) 386 | (0xfe70,0xfe72); 387 | (0xfe74,0xfe74); 388 | (0xfe76,0xfefc); 389 | 390 | (* Half width and Fullwidth Forms *) 391 | (0xff21,0xff3a); 392 | (0xff41,0xff5a); 393 | (0xff66,0xffbe); 394 | (0xffc2,0xffc7); 395 | (0xffca,0xffcf); 396 | (0xffd2,0xffd7); 397 | (0xffd2,0xffd7); 398 | (0xffda,0xffdc) 399 | ] 400 | -------------------------------------------------------------------------------- /src/lib/sedlexing.ml: -------------------------------------------------------------------------------- 1 | (* The package sedlex is released under the terms of an MIT-like license. *) 2 | (* See the attached LICENSE file. *) 3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) 4 | 5 | exception InvalidCodepoint of int 6 | exception MalFormed 7 | 8 | 9 | let gen_of_channel chan = 10 | let f () = 11 | try Some (input_char chan) 12 | with End_of_file -> None 13 | in 14 | f 15 | 16 | let (>>=) o f = match o with 17 | | Some x -> f x 18 | | None -> None 19 | 20 | 21 | (* For legacy purposes. *) 22 | let gen_of_stream stream = 23 | let f () = 24 | try Some (Stream.next stream) 25 | with Stream.Failure -> None 26 | in f 27 | 28 | (* Absolute position from the beginning of the stream *) 29 | type apos = int 30 | 31 | type lexbuf = { 32 | refill: (Uchar.t array -> int -> int -> int); 33 | mutable buf: Uchar.t array; 34 | mutable len: int; (* Number of meaningful char in buffer *) 35 | mutable offset: apos; (* Position of the first char in buffer 36 | in the input stream *) 37 | mutable pos: int; (* pos is the index in the buffer *) 38 | mutable curr_bol: int; (* bol is the index in the input stream but not buffer *) 39 | mutable curr_line: int; (* start from 1, if it is 0, we would not track postion info for you *) 40 | mutable start_pos: int; (* First char we need to keep visible *) 41 | mutable start_bol: int; 42 | mutable start_line: int; 43 | 44 | mutable marked_pos: int; 45 | mutable marked_bol: int; 46 | mutable marked_line: int; 47 | mutable marked_val: int; 48 | 49 | mutable filename: string; 50 | 51 | mutable finished: bool; 52 | } 53 | 54 | let chunk_size = 512 55 | 56 | let empty_lexbuf = { 57 | refill = (fun _ _ _ -> assert false); 58 | buf = [| |]; 59 | len = 0; 60 | offset = 0; 61 | pos = 0; 62 | curr_bol = 0; 63 | curr_line = 0; 64 | start_pos = 0; 65 | start_bol = 0; 66 | start_line = 0; 67 | marked_pos = 0; 68 | marked_bol = 0; 69 | marked_line = 0; 70 | marked_val = 0; 71 | filename = ""; 72 | finished = false; 73 | } 74 | 75 | let create f = { 76 | empty_lexbuf with 77 | refill = f; 78 | buf = Array.make chunk_size (Uchar.of_int 0); 79 | curr_line = 1; 80 | } 81 | 82 | let set_position lexbuf position = 83 | lexbuf.offset <- position.Lexing.pos_cnum - lexbuf.pos; 84 | lexbuf.curr_bol <- position.Lexing.pos_bol; 85 | lexbuf.curr_line <- position.Lexing.pos_lnum 86 | 87 | let set_filename lexbuf fname = 88 | lexbuf.filename <- fname 89 | 90 | let fill_buf_from_gen f gen buf pos len = 91 | let rec aux i = 92 | if i >= len then len 93 | else match gen () with 94 | | Some c -> buf.(pos + i) <- f c ; aux (i+1) 95 | | None -> i 96 | in 97 | aux 0 98 | 99 | let from_gen s = 100 | create (fill_buf_from_gen (fun id -> id) s) 101 | 102 | let from_stream s = from_gen @@ gen_of_stream s 103 | 104 | let from_int_array a = 105 | let len = Array.length a in 106 | { 107 | empty_lexbuf with 108 | buf = Array.init len (fun i -> Uchar.of_int a.(i)); 109 | len = len; 110 | finished = true; 111 | } 112 | 113 | let from_uchar_array a = 114 | let len = Array.length a in 115 | { 116 | empty_lexbuf with 117 | buf = Array.init len (fun i -> a.(i)); 118 | len = len; 119 | finished = true; 120 | } 121 | 122 | 123 | let refill lexbuf = 124 | if lexbuf.len + chunk_size > Array.length lexbuf.buf 125 | then begin 126 | let s = lexbuf.start_pos in 127 | let ls = lexbuf.len - s in 128 | if ls + chunk_size <= Array.length lexbuf.buf then 129 | Array.blit lexbuf.buf s lexbuf.buf 0 ls 130 | else begin 131 | let newlen = (Array.length lexbuf.buf + chunk_size) * 2 in 132 | let newbuf = Array.make newlen (Uchar.of_int 0) in 133 | Array.blit lexbuf.buf s newbuf 0 ls; 134 | lexbuf.buf <- newbuf 135 | end; 136 | lexbuf.len <- ls; 137 | lexbuf.offset <- lexbuf.offset + s; 138 | lexbuf.pos <- lexbuf.pos - s; 139 | lexbuf.marked_pos <- lexbuf.marked_pos - s; 140 | lexbuf.start_pos <- 0 141 | end; 142 | let n = lexbuf.refill lexbuf.buf lexbuf.pos chunk_size in 143 | if n = 0 144 | then lexbuf.finished <- true 145 | else lexbuf.len <- lexbuf.len + n 146 | 147 | let new_line lexbuf = 148 | if lexbuf.curr_line != 0 then 149 | lexbuf.curr_line <- lexbuf.curr_line + 1; 150 | lexbuf.curr_bol <- lexbuf.pos + lexbuf.offset 151 | 152 | let next lexbuf = 153 | if (not lexbuf.finished) && (lexbuf.pos = lexbuf.len) then refill lexbuf; 154 | if lexbuf.finished && (lexbuf.pos = lexbuf.len) then None 155 | else begin 156 | let ret = lexbuf.buf.(lexbuf.pos) in 157 | lexbuf.pos <- lexbuf.pos + 1; 158 | if ret = (Uchar.of_int 10) then new_line lexbuf; 159 | Some ret 160 | end 161 | 162 | let mark lexbuf i = 163 | lexbuf.marked_pos <- lexbuf.pos; 164 | lexbuf.marked_bol <- lexbuf.curr_bol; 165 | lexbuf.marked_line <- lexbuf.curr_line; 166 | lexbuf.marked_val <- i 167 | 168 | let start lexbuf = 169 | lexbuf.start_pos <- lexbuf.pos; 170 | lexbuf.start_bol <- lexbuf.curr_bol; 171 | lexbuf.start_line <- lexbuf.curr_line; 172 | mark lexbuf (-1) 173 | 174 | let backtrack lexbuf = 175 | lexbuf.pos <- lexbuf.marked_pos; 176 | lexbuf.curr_bol <- lexbuf.marked_bol; 177 | lexbuf.curr_line <- lexbuf.marked_line; 178 | lexbuf.marked_val 179 | 180 | let rollback lexbuf = 181 | lexbuf.pos <- lexbuf.start_pos; 182 | lexbuf.curr_bol <- lexbuf.start_bol; 183 | lexbuf.curr_line <- lexbuf.start_line 184 | 185 | let lexeme_start lexbuf = lexbuf.start_pos + lexbuf.offset 186 | let lexeme_end lexbuf = lexbuf.pos + lexbuf.offset 187 | 188 | let loc lexbuf = (lexbuf.start_pos + lexbuf.offset, lexbuf.pos + lexbuf.offset) 189 | 190 | let lexeme_length lexbuf = lexbuf.pos - lexbuf.start_pos 191 | 192 | let sub_lexeme lexbuf pos len = 193 | Array.sub lexbuf.buf (lexbuf.start_pos + pos) len 194 | 195 | let lexeme lexbuf = 196 | Array.sub lexbuf.buf (lexbuf.start_pos) (lexbuf.pos - lexbuf.start_pos) 197 | 198 | let lexeme_char lexbuf pos = 199 | lexbuf.buf.(lexbuf.start_pos + pos) 200 | 201 | let lexing_positions lexbuf = 202 | let start_p = { 203 | Lexing.pos_fname = lexbuf.filename; 204 | pos_lnum = lexbuf.start_line; 205 | pos_cnum = lexbuf.start_pos + lexbuf.offset; 206 | pos_bol = lexbuf.start_bol; 207 | } and curr_p = { 208 | Lexing.pos_fname = lexbuf.filename; 209 | pos_lnum = lexbuf.curr_line; 210 | pos_cnum = lexbuf.pos + lexbuf.offset; 211 | pos_bol = lexbuf.curr_bol; 212 | } in 213 | (start_p, curr_p) 214 | 215 | let with_tokenizer lexer' lexbuf = 216 | let lexer () = 217 | let token = lexer' lexbuf in 218 | let (start_p, curr_p) = lexing_positions lexbuf in 219 | (token, start_p, curr_p) 220 | in lexer 221 | 222 | module Latin1 = struct 223 | let from_gen s = 224 | create (fill_buf_from_gen Uchar.of_char s) 225 | 226 | let from_stream s = from_gen @@ gen_of_stream s 227 | 228 | let from_string s = 229 | let len = String.length s in 230 | { 231 | empty_lexbuf with 232 | buf = Array.init len (fun i -> Uchar.of_char s.[i]); 233 | len = len; 234 | finished = true; 235 | } 236 | 237 | let from_channel ic = 238 | from_gen (gen_of_channel ic) 239 | 240 | let to_latin1 c = 241 | if Uchar.is_char c 242 | then Uchar.to_char c 243 | else raise (InvalidCodepoint (Uchar.to_int c)) 244 | 245 | let lexeme_char lexbuf pos = 246 | to_latin1 (lexeme_char lexbuf pos) 247 | 248 | let sub_lexeme lexbuf pos len = 249 | let s = Bytes.create len in 250 | for i = 0 to len - 1 do Bytes.set s i (to_latin1 lexbuf.buf.(lexbuf.start_pos + pos + i)) done; 251 | Bytes.to_string s 252 | 253 | let lexeme lexbuf = 254 | sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start_pos) 255 | end 256 | 257 | 258 | module Utf8 = struct 259 | module Helper = struct 260 | (* http://www.faqs.org/rfcs/rfc3629.html *) 261 | 262 | let width = Array.make 256 (-1) 263 | let () = 264 | for i = 0 to 127 do width.(i) <- 1 done; 265 | for i = 192 to 223 do width.(i) <- 2 done; 266 | for i = 224 to 239 do width.(i) <- 3 done; 267 | for i = 240 to 247 do width.(i) <- 4 done 268 | 269 | let next s i = 270 | match s.[i] with 271 | | '\000'..'\127' as c -> 272 | Char.code c 273 | | '\192'..'\223' as c -> 274 | let n1 = Char.code c in 275 | let n2 = Char.code s.[i+1] in 276 | if (n2 lsr 6 != 0b10) then raise MalFormed; 277 | ((n1 land 0x1f) lsl 6) lor (n2 land 0x3f) 278 | | '\224'..'\239' as c -> 279 | let n1 = Char.code c in 280 | let n2 = Char.code s.[i+1] in 281 | let n3 = Char.code s.[i+2] in 282 | if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed; 283 | let p = 284 | ((n1 land 0x0f) lsl 12) lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f) 285 | in 286 | if (p >= 0xd800) && (p <= 0xdf00) then raise MalFormed; 287 | p 288 | | '\240'..'\247' as c -> 289 | let n1 = Char.code c in 290 | let n2 = Char.code s.[i+1] in 291 | let n3 = Char.code s.[i+2] in 292 | let n4 = Char.code s.[i+3] in 293 | if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10) 294 | then raise MalFormed; 295 | ((n1 land 0x07) lsl 18) lor ((n2 land 0x3f) lsl 12) lor 296 | ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f) 297 | | _ -> raise MalFormed 298 | 299 | 300 | let from_gen s = 301 | Gen.next s >>= function 302 | | '\000'..'\127' as c -> 303 | Some (Uchar.of_char c) 304 | | '\192'..'\223' as c -> 305 | let n1 = Char.code c in 306 | Gen.next s >>= fun c2 -> 307 | let n2 = Char.code c2 in 308 | if (n2 lsr 6 != 0b10) then raise MalFormed; 309 | Some (Uchar.of_int (((n1 land 0x1f) lsl 6) lor (n2 land 0x3f))) 310 | | '\224'..'\239' as c -> 311 | let n1 = Char.code c in 312 | Gen.next s >>= fun c2 -> 313 | let n2 = Char.code c2 in 314 | Gen.next s >>= fun c3 -> 315 | let n3 = Char.code c3 in 316 | if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) then raise MalFormed; 317 | Some (Uchar.of_int (((n1 land 0x0f) lsl 12) 318 | lor ((n2 land 0x3f) lsl 6) lor (n3 land 0x3f))) 319 | | '\240'..'\247' as c -> 320 | let n1 = Char.code c in 321 | Gen.next s >>= fun c2 -> 322 | let n2 = Char.code c2 in 323 | Gen.next s >>= fun c3 -> 324 | let n3 = Char.code c3 in 325 | Gen.next s >>= fun c4 -> 326 | let n4 = Char.code c4 in 327 | if (n2 lsr 6 != 0b10) || (n3 lsr 6 != 0b10) || (n4 lsr 6 != 0b10) 328 | then raise MalFormed; 329 | Some (Uchar.of_int (((n1 land 0x07) lsl 18) 330 | lor ((n2 land 0x3f) lsl 12) 331 | lor ((n3 land 0x3f) lsl 6) lor (n4 land 0x3f))) 332 | | _ -> raise MalFormed 333 | 334 | 335 | 336 | let compute_len s pos bytes = 337 | let rec aux n i = 338 | if i >= pos + bytes then if i = pos + bytes then n else raise MalFormed 339 | else 340 | let w = width.(Char.code s.[i]) in 341 | if w > 0 then aux (succ n) (i + w) 342 | else raise MalFormed 343 | in 344 | aux 0 pos 345 | 346 | let rec blit_to_int s spos a apos n = 347 | if n > 0 then begin 348 | a.(apos) <- next s spos; 349 | blit_to_int s (spos + width.(Char.code s.[spos])) a (succ apos) (pred n) 350 | end 351 | 352 | let to_int_array s pos bytes = 353 | let n = compute_len s pos bytes in 354 | let a = Array.make n 0 in 355 | blit_to_int s pos a 0 n; 356 | a 357 | 358 | (**************************) 359 | 360 | let store b p = 361 | if p <= 0x7f then 362 | Buffer.add_char b (Char.chr p) 363 | else if p <= 0x7ff then ( 364 | Buffer.add_char b (Char.chr (0xc0 lor (p lsr 6))); 365 | Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) 366 | ) 367 | else if p <= 0xffff then ( 368 | if (p >= 0xd800 && p < 0xe000) then raise MalFormed; 369 | Buffer.add_char b (Char.chr (0xe0 lor (p lsr 12))); 370 | Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f))); 371 | Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) 372 | ) 373 | else if p <= 0x10ffff then ( 374 | Buffer.add_char b (Char.chr (0xf0 lor (p lsr 18))); 375 | Buffer.add_char b (Char.chr (0x80 lor ((p lsr 12) land 0x3f))); 376 | Buffer.add_char b (Char.chr (0x80 lor ((p lsr 6) land 0x3f))); 377 | Buffer.add_char b (Char.chr (0x80 lor (p land 0x3f))) 378 | ) 379 | else raise MalFormed 380 | 381 | let from_uchar_array a apos len = 382 | let b = Buffer.create (len * 4) in 383 | let rec aux apos len = 384 | if len > 0 385 | then (store b (Uchar.to_int a.(apos)); aux (succ apos) (pred len)) 386 | else Buffer.contents b in 387 | aux apos len 388 | 389 | let gen_from_char_gen s = (fun () -> from_gen s) 390 | end 391 | 392 | let from_channel ic = 393 | from_gen (Helper.gen_from_char_gen (gen_of_channel ic)) 394 | 395 | let from_gen s = 396 | create (fill_buf_from_gen (fun id -> id) 397 | (Helper.gen_from_char_gen s)) 398 | 399 | let from_stream s = from_gen @@ gen_of_stream s 400 | 401 | let from_string s = 402 | from_int_array (Helper.to_int_array s 0 (String.length s)) 403 | 404 | let sub_lexeme lexbuf pos len = 405 | Helper.from_uchar_array lexbuf.buf (lexbuf.start_pos + pos) len 406 | 407 | let lexeme lexbuf = 408 | sub_lexeme lexbuf 0 (lexbuf.pos - lexbuf.start_pos) 409 | end 410 | 411 | 412 | module Utf16 = struct 413 | type byte_order = Little_endian | Big_endian 414 | module Helper = struct 415 | (* http://www.ietf.org/rfc/rfc2781.txt *) 416 | 417 | let number_of_char_pair bo c1 c2 = match bo with 418 | | Little_endian -> ((Char.code c2) lsl 8) + (Char.code c1) 419 | | Big_endian -> ((Char.code c1) lsl 8) + (Char.code c2) 420 | 421 | let char_pair_of_number bo num = match bo with 422 | | Little_endian -> 423 | (Char.chr (num land 0xFF), Char.chr ((num lsr 8) land 0xFF )) 424 | | Big_endian -> 425 | (Char.chr ((num lsr 8) land 0xFF), Char.chr (num land 0xFF)) 426 | 427 | let next_in_gen bo s = 428 | Gen.next s >>= fun c1 -> 429 | Gen.next s >>= fun c2 -> 430 | Some (number_of_char_pair bo c1 c2) 431 | 432 | let from_gen bo s w1 = 433 | if w1 = 0xfffe then raise (InvalidCodepoint w1); 434 | if w1 < 0xd800 || 0xdfff < w1 then Some (Uchar.of_int w1) 435 | else if w1 <= 0xdbff 436 | then 437 | next_in_gen bo s >>= fun w2 -> 438 | if w2 < 0xdc00 || w2 > 0xdfff then raise MalFormed; 439 | let upper10 = (w1 land 0x3ff) lsl 10 440 | and lower10 = w2 land 0x3ff in 441 | Some (Uchar.of_int (0x10000 + upper10 + lower10)) 442 | else raise MalFormed 443 | 444 | let gen_from_char_gen opt_bo s = 445 | let bo = ref opt_bo in 446 | fun () -> 447 | Gen.next s >>= fun c1 -> 448 | Gen.next s >>= fun c2 -> 449 | let o = match !bo with 450 | | Some o -> o 451 | | None -> 452 | let o = match (Char.code c1, Char.code c2) with 453 | | (0xff,0xfe) -> Little_endian 454 | | _ -> Big_endian in 455 | bo := Some o; 456 | o in 457 | from_gen o s (number_of_char_pair o c1 c2) 458 | 459 | 460 | let compute_len opt_bo str pos bytes = 461 | let s = gen_from_char_gen opt_bo 462 | (Gen.init ~limit:(bytes - pos) (fun i -> (str.[i + pos]))) 463 | in 464 | let l = ref 0 in 465 | Gen.iter (fun _ -> incr l) s ; 466 | !l 467 | 468 | let blit_to_int opt_bo s spos a apos bytes = 469 | let s = gen_from_char_gen opt_bo 470 | (Gen.init ~limit:(bytes - spos) (fun i -> (s.[i + spos]))) in 471 | let p = ref apos in 472 | Gen.iter (fun x -> a.(!p) <- x ; incr p) s 473 | 474 | 475 | let to_uchar_array opt_bo s pos bytes = 476 | let len = compute_len opt_bo s pos bytes in 477 | let a = Array.make len (Uchar.of_int 0) in 478 | blit_to_int opt_bo s pos a 0 bytes ; 479 | a 480 | 481 | let store bo buf code = 482 | if code < 0x10000 483 | then ( 484 | let (c1,c2) = char_pair_of_number bo code in 485 | Buffer.add_char buf c1; 486 | Buffer.add_char buf c2 487 | ) else ( 488 | let u' = code - 0x10000 in 489 | let w1 = 0xd800 + (u' lsr 10) 490 | and w2 = 0xdc00 + (u' land 0x3ff) in 491 | let (c1,c2) = char_pair_of_number bo w1 492 | and (c3,c4) = char_pair_of_number bo w2 in 493 | Buffer.add_char buf c1; 494 | Buffer.add_char buf c2; 495 | Buffer.add_char buf c3; 496 | Buffer.add_char buf c4 497 | ) 498 | 499 | let from_uchar_array bo a apos len bom = 500 | let b = Buffer.create (len * 4 + 2) in (* +2 for the BOM *) 501 | if bom then store bo b 0xfeff ; (* first, store the BOM *) 502 | let rec aux apos len = 503 | if len > 0 504 | then (store bo b (Uchar.to_int a.(apos)); aux (succ apos) (pred len)) 505 | else Buffer.contents b in 506 | aux apos len 507 | end 508 | 509 | 510 | let from_gen s opt_bo = 511 | from_gen (Helper.gen_from_char_gen opt_bo s) 512 | 513 | let from_stream s = from_gen @@ gen_of_stream s 514 | 515 | let from_channel ic opt_bo = 516 | from_gen (gen_of_channel ic) opt_bo 517 | 518 | let from_string s opt_bo = 519 | let a = Helper.to_uchar_array opt_bo s 0 (String.length s) in 520 | from_uchar_array a 521 | 522 | let sub_lexeme lb pos len bo bom = 523 | Helper.from_uchar_array bo lb.buf (lb.start_pos + pos) len bom 524 | 525 | let lexeme lb bo bom = 526 | sub_lexeme lb 0 (lb.pos - lb.start_pos) bo bom 527 | end 528 | -------------------------------------------------------------------------------- /src/syntax/ppx_sedlex.ml: -------------------------------------------------------------------------------- 1 | (* The package sedlex is released under the terms of an MIT-like license. *) 2 | (* See the attached LICENSE file. *) 3 | (* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) 4 | 5 | open Longident 6 | open Migrate_parsetree 7 | open Ast_405 8 | open Parsetree 9 | open Asttypes 10 | open Ast_helper 11 | open Ast_convenience_405 12 | 13 | module Ast_mapper_class = Ast_mapper_class_405 14 | 15 | let ocaml_version = Versions.ocaml_405 16 | 17 | module Cset = Sedlex_cset 18 | 19 | (* Decision tree for partitions *) 20 | 21 | type decision_tree = 22 | | Lte of int * decision_tree * decision_tree 23 | | Table of int * int array 24 | | Return of int 25 | 26 | let decision l = 27 | let l = List.map (fun (a, b, i) -> (a, b, Return i)) l in 28 | let rec merge2 = function 29 | | (a1, b1, d1) :: (a2, b2, d2) :: rest -> 30 | let x = 31 | if b1 + 1 = a2 then d2 32 | else Lte (a2 - 1, Return (-1), d2) 33 | in 34 | (a1, b2, Lte (b1, d1, x)) :: merge2 rest 35 | | rest -> rest 36 | in 37 | let rec aux = function 38 | | [(a, b, d)] -> Lte (a - 1, Return (-1), Lte (b, d, Return (-1))) 39 | | [] -> Return (-1) 40 | | l -> aux (merge2 l) 41 | in 42 | aux l 43 | 44 | let limit = 8192 45 | 46 | let decision_table l = 47 | let rec aux m accu = function 48 | | ((a, b, i) as x)::rem when b < limit && i < 255-> 49 | aux (min a m) (x :: accu) rem 50 | | rem -> m, accu, rem 51 | in 52 | let (min, table, rest) = aux max_int [] l in 53 | match table with 54 | | [] -> decision l 55 | | [(min, max, i)] -> 56 | Lte (min - 1, Return (-1), (Lte (max, Return i, decision rest))) 57 | | (_, max, _) :: _ -> 58 | let arr = Array.make (max - min + 1) 0 in 59 | let set (a, b, i) = for j = a to b do arr.(j - min) <- i + 1 done in 60 | List.iter set table; 61 | Lte (min - 1, Return (-1), Lte (max, Table (min, arr), decision rest)) 62 | 63 | let rec simplify min max = function 64 | | Lte (i,yes,no) -> 65 | if i >= max then simplify min max yes 66 | else if i < min then simplify min max no 67 | else Lte (i, simplify min i yes, simplify (i+1) max no) 68 | | x -> x 69 | 70 | let segments_of_partition p = 71 | let seg = ref [] in 72 | Array.iteri 73 | (fun i c -> List.iter (fun (a, b) -> seg := (a, b, i) :: !seg) c) 74 | p; 75 | List.sort (fun (a1,_,_) (a2,_,_) -> compare a1 a2) !seg 76 | 77 | let decision_table p = 78 | simplify (-1) (Cset.max_code) (decision_table (segments_of_partition p)) 79 | 80 | 81 | (* Helpers to build AST *) 82 | 83 | let appfun s l = app (evar s) l 84 | let glb_value name def = Str.value Nonrecursive [Vb.mk (pvar name) def] 85 | 86 | (* Named regexps *) 87 | 88 | module StringMap = Map.Make(struct 89 | type t = string 90 | let compare = compare 91 | end) 92 | 93 | let builtin_regexps = 94 | List.fold_left (fun acc (n, c) -> StringMap.add n (Sedlex.chars c) acc) 95 | StringMap.empty 96 | [ 97 | "any", Cset.any; 98 | "eof", Cset.eof; 99 | "xml_letter", Cset.letter; 100 | "xml_digit", Cset.digit; 101 | "xml_extender", Cset.extender; 102 | "xml_base_char", Cset.base_char; 103 | "xml_ideographic", Cset.ideographic; 104 | "xml_combining_char", Cset.combining_char; 105 | "xml_blank", Cset.blank; 106 | "tr8876_ident_char", Cset.tr8876_ident_char; 107 | 108 | (* Unicode 6.3 categories *) 109 | "cc", Unicode63.Categories.cc; 110 | "cf", Unicode63.Categories.cf; 111 | "cn", Unicode63.Categories.cn; 112 | "co", Unicode63.Categories.co; 113 | "cs", Unicode63.Categories.cs; 114 | "ll", Unicode63.Categories.ll; 115 | "lm", Unicode63.Categories.lm; 116 | "lo", Unicode63.Categories.lo; 117 | "lt", Unicode63.Categories.lt; 118 | "lu", Unicode63.Categories.lu; 119 | "mc", Unicode63.Categories.mc; 120 | "me", Unicode63.Categories.me; 121 | "mn", Unicode63.Categories.mn; 122 | "nd", Unicode63.Categories.nd; 123 | "nl", Unicode63.Categories.nl; 124 | "no", Unicode63.Categories.no; 125 | "pc", Unicode63.Categories.pc; 126 | "pd", Unicode63.Categories.pd; 127 | "pe", Unicode63.Categories.pe; 128 | "pf", Unicode63.Categories.pf; 129 | "pi", Unicode63.Categories.pi; 130 | "po", Unicode63.Categories.po; 131 | "ps", Unicode63.Categories.ps; 132 | "sc", Unicode63.Categories.sc; 133 | "sk", Unicode63.Categories.sk; 134 | "sm", Unicode63.Categories.sm; 135 | "so", Unicode63.Categories.so; 136 | "zl", Unicode63.Categories.zl; 137 | "zp", Unicode63.Categories.zp; 138 | "zs", Unicode63.Categories.zs; 139 | 140 | (* Unicode 6.3 properties *) 141 | "alphabetic", Unicode63.Properties.alphabetic; 142 | "ascii_hex_digit", Unicode63.Properties.ascii_hex_digit; 143 | "hex_digit", Unicode63.Properties.hex_digit; 144 | "id_continue", Unicode63.Properties.id_continue; 145 | "id_start", Unicode63.Properties.id_start; 146 | "lowercase", Unicode63.Properties.lowercase; 147 | "math", Unicode63.Properties.math; 148 | "other_alphabetic", Unicode63.Properties.other_alphabetic; 149 | "other_lowercase", Unicode63.Properties.other_lowercase; 150 | "other_math", Unicode63.Properties.other_math; 151 | "other_uppercase", Unicode63.Properties.other_uppercase; 152 | "uppercase", Unicode63.Properties.uppercase; 153 | "white_space", Unicode63.Properties.white_space; 154 | "xid_continue", Unicode63.Properties.xid_continue; 155 | "xid_start", Unicode63.Properties.xid_start; 156 | ] 157 | 158 | (* Tables (indexed mapping: codepoint -> next state) *) 159 | 160 | let tables = Hashtbl.create 31 161 | let table_counter = ref 0 162 | let get_tables () = Hashtbl.fold (fun key x accu -> (x, key) :: accu) tables [] 163 | 164 | let table_name x = 165 | try Hashtbl.find tables x 166 | with Not_found -> 167 | incr table_counter; 168 | let s = Printf.sprintf "__sedlex_table_%i" !table_counter in 169 | Hashtbl.add tables x s; 170 | s 171 | 172 | let table (name, v) = 173 | let n = Array.length v in 174 | let s = Bytes.create n in 175 | for i = 0 to n - 1 do Bytes.set s i (Char.chr v.(i)) done; 176 | glb_value name (str (Bytes.to_string s)) 177 | 178 | (* Partition (function: codepoint -> next state) *) 179 | 180 | let partitions = Hashtbl.create 31 181 | let partition_counter = ref 0 182 | let get_partitions () = Hashtbl.fold (fun key x accu -> (x, key) :: accu) partitions [] 183 | 184 | let partition_name x = 185 | try Hashtbl.find partitions x 186 | with Not_found -> 187 | incr partition_counter; 188 | let s = Printf.sprintf "__sedlex_partition_%i" !partition_counter in 189 | Hashtbl.add partitions x s; 190 | s 191 | 192 | (* We duplicate the body for the EOF (-1) case rather than creating 193 | an interior utility function. *) 194 | let partition (name, p) = 195 | let rec gen_tree = function 196 | | Lte (i, yes, no) -> 197 | [%expr if c <= [%e int i] then [%e gen_tree yes] else [%e gen_tree no]] 198 | | Return i -> int i 199 | | Table (offset, t) -> 200 | let c = if offset = 0 then [%expr c] else [%expr c - [%e int offset]] in 201 | [%expr Char.code (String.get [%e evar (table_name t)] [%e c]) - 1] 202 | in 203 | let body = gen_tree (decision_table p) in 204 | glb_value name (func [(pconstr "Some" [pvar "uc"], 205 | [%expr let c = Uchar.to_int uc in [%e body]]); 206 | (pconstr "None" [], 207 | [%expr let c = (-1) in [%e body]])]) 208 | 209 | (* Code generation for the automata *) 210 | 211 | let best_final final = 212 | let fin = ref None in 213 | for i = Array.length final - 1 downto 0 do 214 | if final.(i) then fin := Some i 215 | done; 216 | !fin 217 | 218 | let state_fun state = Printf.sprintf "__sedlex_state_%i" state 219 | 220 | let call_state lexbuf auto state = 221 | let (trans, final) = auto.(state) in 222 | if Array.length trans = 0 223 | then match best_final final with 224 | | Some i -> int i 225 | | None -> assert false 226 | else appfun (state_fun state) [evar lexbuf] 227 | 228 | let gen_state lexbuf auto i (trans, final) = 229 | let partition = Array.map fst trans in 230 | let cases = Array.mapi (fun i (_, j) -> Exp.case(pint i) (call_state lexbuf auto j)) trans in 231 | let cases = Array.to_list cases in 232 | let body () = 233 | Exp.match_ 234 | (appfun (partition_name partition) [[%expr Sedlexing.next [%e evar lexbuf]]]) 235 | (cases @ [Exp.case [%pat? _] [%expr Sedlexing.backtrack [%e evar lexbuf]]]) 236 | in 237 | let ret body = [ Vb.mk (pvar (state_fun i)) (func [pvar lexbuf, body]) ] in 238 | match best_final final with 239 | | None -> ret (body ()) 240 | | Some _ when Array.length trans = 0 -> [] 241 | | Some i -> ret [%expr Sedlexing.mark [%e evar lexbuf] [%e int i]; [%e body ()]] 242 | 243 | let gen_definition lexbuf l error = 244 | let brs = Array.of_list l in 245 | let auto = Sedlex.compile (Array.map fst brs) in 246 | let cases = Array.to_list (Array.mapi (fun i (_, e) -> Exp.case (pint i) e) brs) in 247 | let states = Array.mapi (gen_state lexbuf auto) auto in 248 | let states = List.flatten (Array.to_list states) in 249 | Exp.let_ Recursive states 250 | (Exp.sequence 251 | [%expr Sedlexing.start [%e evar lexbuf]] 252 | (Exp.match_ (appfun (state_fun 0) [evar lexbuf]) 253 | (cases @ [Exp.case (Pat.any ()) error]) 254 | ) 255 | ) 256 | 257 | (* Lexer specification parser *) 258 | 259 | let codepoint i = 260 | if i < 0 || i > Cset.max_code then 261 | failwith (Printf.sprintf "Invalid Unicode code point: %i" i); 262 | i 263 | 264 | let regexp_for_char c = 265 | Sedlex.chars (Cset.singleton (Char.code c)) 266 | 267 | let regexp_for_string s = 268 | let rec aux n = 269 | if n = String.length s then Sedlex.eps 270 | else 271 | Sedlex.seq (regexp_for_char s.[n]) (aux (succ n)) 272 | in aux 0 273 | 274 | let err loc s = 275 | raise (Location.Error (Location.error ~loc ("Sedlex: " ^ s))) 276 | 277 | let rec repeat r = function 278 | | 0, 0 -> Sedlex.eps 279 | | 0, m -> Sedlex.alt Sedlex.eps (Sedlex.seq r (repeat r (0, m - 1))) 280 | | n, m -> Sedlex.seq r (repeat r (n - 1, m - 1)) 281 | 282 | let regexp_of_pattern env = 283 | let rec char_pair_op func name p tuple = (* Construct something like Sub(a,b) *) 284 | match tuple with 285 | | Some {ppat_desc=Ppat_tuple (p0 :: p1 :: [])} -> 286 | begin match func (aux p0) (aux p1) with 287 | | Some r -> r 288 | | None -> 289 | err p.ppat_loc @@ 290 | "the "^name^" operator can only applied to single-character length regexps" 291 | end 292 | | _ -> err p.ppat_loc @@ "the "^name^" operator requires two arguments, like "^name^"(a,b)" 293 | and aux p = (* interpret one pattern node *) 294 | match p.ppat_desc with 295 | | Ppat_or (p1, p2) -> Sedlex.alt (aux p1) (aux p2) 296 | | Ppat_tuple (p :: pl) -> 297 | List.fold_left (fun r p -> Sedlex.seq r (aux p)) 298 | (aux p) 299 | pl 300 | | Ppat_construct ({txt = Lident "Star"}, Some p) -> 301 | Sedlex.rep (aux p) 302 | | Ppat_construct ({txt = Lident "Plus"}, Some p) -> 303 | Sedlex.plus (aux p) 304 | | Ppat_construct 305 | ({txt = Lident "Rep"}, 306 | Some {ppat_desc=Ppat_tuple[p0; {ppat_desc=Ppat_constant (i1 as i2)|Ppat_interval(i1, i2)}]}) -> 307 | begin match Constant.of_constant i1, Constant.of_constant i2 with 308 | | Pconst_integer(i1,_), Pconst_integer(i2,_) -> 309 | let i1 = int_of_string i1 in 310 | let i2 = int_of_string i2 in 311 | if 0 <= i1 && i1 <= i2 then repeat (aux p0) (i1, i2) 312 | else err p.ppat_loc "Invalid range for Rep operator" 313 | | _ -> 314 | err p.ppat_loc "Rep must take an integer constant or interval" 315 | end 316 | | Ppat_construct ({txt = Lident "Rep"}, _) -> 317 | err p.ppat_loc "the Rep operator takes 2 arguments" 318 | | Ppat_construct ({txt = Lident "Opt"}, Some p) -> 319 | Sedlex.alt Sedlex.eps (aux p) 320 | | Ppat_construct ({txt = Lident "Compl"}, arg) -> 321 | begin match arg with 322 | | Some p0 -> 323 | begin match Sedlex.compl (aux p0) with 324 | | Some r -> r 325 | | None -> 326 | err p.ppat_loc 327 | "the Compl operator can only applied to a single-character length regexp" 328 | end 329 | | _ -> err p.ppat_loc "the Compl operator requires an argument" 330 | end 331 | | Ppat_construct ({ txt = Lident "Sub" }, arg) -> 332 | char_pair_op Sedlex.subtract "Sub" p arg 333 | | Ppat_construct ({ txt = Lident "Intersect" }, arg) -> 334 | char_pair_op Sedlex.intersection "Intersect" p arg 335 | | Ppat_construct ({txt = Lident "Chars"}, arg) -> 336 | let const = match arg with 337 | | Some {ppat_desc=Ppat_constant const} -> 338 | Some (Constant.of_constant const) 339 | | _ -> None 340 | in 341 | begin match const with 342 | | Some (Pconst_string(s,_))-> 343 | let c = ref Cset.empty in 344 | for i = 0 to String.length s - 1 do 345 | c := Cset.union !c (Cset.singleton (Char.code s.[i])) 346 | done; 347 | Sedlex.chars !c 348 | | _ -> err p.ppat_loc "the Chars operator requires a string argument" 349 | end 350 | | Ppat_interval (i_start, i_end) -> 351 | begin match Constant.of_constant i_start, Constant.of_constant i_end with 352 | | Pconst_char c1, Pconst_char c2 -> Sedlex.chars (Cset.interval (Char.code c1) (Char.code c2)) 353 | | Pconst_integer(i1,_), Pconst_integer(i2,_) -> 354 | Sedlex.chars (Cset.interval (codepoint (int_of_string i1)) (codepoint (int_of_string i2))) 355 | | _ -> err p.ppat_loc "this pattern is not a valid interval regexp" 356 | end 357 | | Ppat_constant (const) -> 358 | begin match Constant.of_constant const with 359 | | Pconst_string (s, _) -> regexp_for_string s 360 | | Pconst_char c -> regexp_for_char c 361 | | Pconst_integer(i,_) -> Sedlex.chars (Cset.singleton (codepoint (int_of_string i))) 362 | | _ -> err p.ppat_loc "this pattern is not a valid regexp" 363 | end 364 | | Ppat_var {txt=x} -> 365 | begin try StringMap.find x env 366 | with Not_found -> 367 | err p.ppat_loc (Printf.sprintf "unbound regexp %s" x) 368 | end 369 | | _ -> 370 | err p.ppat_loc "this pattern is not a valid regexp" 371 | in 372 | aux 373 | 374 | 375 | let mapper cookies = 376 | object(this) 377 | inherit Ast_mapper_class.mapper as super 378 | 379 | val env = builtin_regexps 380 | 381 | method define_regexp name p = 382 | {< env = StringMap.add name (regexp_of_pattern env p) env >} 383 | 384 | method! expr e = 385 | match e with 386 | | [%expr [%sedlex [%e? {pexp_desc=Pexp_match (lexbuf, cases)}]]] -> 387 | let lexbuf = 388 | match lexbuf with 389 | | {pexp_desc=Pexp_ident{txt=Lident lexbuf}} -> lexbuf 390 | | _ -> 391 | err lexbuf.pexp_loc "the matched expression must be a single identifier" 392 | in 393 | let cases = List.rev cases in 394 | let error = 395 | match List.hd cases with 396 | | {pc_lhs = [%pat? _]; pc_rhs = e; pc_guard = None} -> super # expr e 397 | | {pc_lhs = p} -> 398 | err p.ppat_loc "the last branch must be a catch-all error case" 399 | in 400 | let cases = List.rev (List.tl cases) in 401 | let cases = 402 | List.map 403 | (function 404 | | {pc_lhs = p; pc_rhs = e; pc_guard = None} -> regexp_of_pattern env p, super # expr e 405 | | {pc_guard = Some e} -> 406 | err e.pexp_loc "'when' guards are not supported" 407 | ) cases 408 | in 409 | gen_definition lexbuf cases error 410 | | [%expr let [%p? {ppat_desc=Ppat_var{txt=name}}] = [%sedlex.regexp? [%p? p]] in [%e? body]] -> 411 | (this # define_regexp name p) # expr body 412 | | [%expr [%sedlex [%e? _]]] -> 413 | err e.pexp_loc "the %sedlex extension is only recognized on match expressions" 414 | | _ -> super # expr e 415 | 416 | 417 | val toplevel = true 418 | 419 | method structure_with_regexps l = 420 | let mapper = ref this in 421 | let regexps = ref [] in 422 | let l = List.concat 423 | (List.map 424 | (function 425 | | [%stri let [%p? {ppat_desc=Ppat_var{txt=name}}] = [%sedlex.regexp? [%p? p]]] as i -> 426 | regexps := i :: !regexps; 427 | mapper := !mapper # define_regexp name p; 428 | [] 429 | | i -> 430 | [ !mapper # structure_item i ] 431 | ) l) in 432 | (l, List.rev !regexps) 433 | 434 | method! structure l = 435 | if toplevel then 436 | let sub = {< toplevel = false >} in 437 | let previous = 438 | match Driver.get_cookie cookies "sedlex.regexps" ocaml_version with 439 | | Some {pexp_desc = Pexp_extension (_, PStr l)} -> l 440 | | Some _ -> assert false 441 | | None -> [] 442 | in 443 | let l, regexps = sub # structure_with_regexps (previous @ l) in 444 | let parts = List.map partition (get_partitions ()) in 445 | let tables = List.map table (get_tables ()) in 446 | Driver.set_cookie cookies "sedlex.regexps" ocaml_version (Exp.extension (Location.mknoloc "regexps", PStr regexps)); 447 | tables @ parts @ l 448 | else 449 | fst (this # structure_with_regexps l) 450 | 451 | end 452 | 453 | let () = 454 | Driver.register 455 | ~name:"sedlex" 456 | ocaml_version 457 | (fun _ cookies -> Ast_mapper_class.to_mapper (mapper cookies)) 458 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Maintenance status: maintainedLatest npm releaseBuild status on Travis-CIFollow my work on Twitter 2 | bs-sedlex

3 | 4 | > **For details on purpose, usage, and API of sedlex, [scroll down](#sedlex).** These sections I've 5 | > added at the top are specific to ways that installation and usage of the `bs-sedlex` distribution 6 | > **differ** from using the upstream release. 7 | 8 | This repository contains a fork of the [sedlex][] lexer-generator tooling for OCaml-family 9 | languages, packaged for use in projects utilizing [BuckleScript][] (an OCaml-to-JavaScript compiler) 10 | and [ReasonML][] (an alternative OCaml syntax targeting that compiler.) 11 | 12 | Care is taken in this project to publish pre-compiled binaries of the [ppx 13 | syntax-extension](#lexer-specifications) component necessary to use sedlex in practice. These are 14 | published to npm as the separate npm package, [`ppx-sedlex`][ppx-sedlex], versioned in lockstep with 15 | the parent `bs-sedlex` package. Instructions for *enabling* this extension in your BuckleScript 16 | configuration-file, `bsconfig.json`, are included below. Don't miss them! 17 | 18 | [sedlex]: 19 | "The upstream distribution of sedlex, maintained by the OCaml community" 20 | [BuckleScript]: 21 | [ReasonML]: 22 | [ppx-sedlex]: 23 | "The native syntax-extension component of bs-sedlex, published separately to npm" 24 | 25 | ## Installation in BuckleScript projects 26 | 27 | You can safely ignore the installation instructions in the upstream README reproduced below, when 28 | compiling to JS using BuckleScript. Instead: 29 | 30 | 1. If you're writing an app or a similar end-consumer project, install BuckleScript compiler (a 31 | peerDependency of this project) via [npm][]. 32 | 33 | ```sh 34 | $ npm install --save bs-platform 35 | ``` 36 | 37 | Worh repeating: *do not add this dependency to a library.* The final application-developer 38 | should generally select the version of the BuckleScript compiler; you don't want users having 39 | duplicated versions of the compiler in their `node_modules`. Instead, library developers should 40 | add `bs-platform` to both `"peerDependencies"` (with a permissive version), and 41 | `"devDependencies"` (with a restrictive version): 42 | 43 | ```sh 44 | $ npm install --save-dev bs-platform 45 | ``` 46 | 47 | ```diff 48 | "devDependencies": { 49 | ... 50 | "bs-platform": "^5.0.0" 51 | }, 52 | "peerDependencies": { 53 | + "bs-platform": "4.x || 5.x" // example. express the versions of BuckleScript you support here. 54 | }, 55 | ``` 56 | 57 | 2. Add the ppx transformer to your `"devDependencies"`: 58 | 59 | ```sh 60 | $ npm install --save-dev ppx-sedlex 61 | ``` 62 | 63 | 3. Add the runtime package (this one!) to your direct `"dependencies"`, for both libraries and apps: 64 | 65 | ```sh 66 | $ npm install --save bs-sedlex 67 | ``` 68 | 69 | 4. Manually add it (the runtime package, `bs-sedlex`) to your `bsconfig.json`'s `bs-dependencies` 70 | field: 71 | 72 | ```diff 73 | "bs-dependencies": [ 74 | ... 75 | + "bs-sedlex" 76 | ], 77 | ``` 78 | 79 | 5. Additionally tell BuckleScript to apply the `ppx-sedlex` syntax-transformer over your source-code 80 | by adding a `ppx-flags` field at the root level of the same `bsconfig.json`. (Note that, 81 | unintuitively, this is *not* a relative path; it follows the format `package-name/file-path`.) 82 | 83 | ```diff 84 | "bs-dependencies": [ 85 | ... 86 | "bs-sedlex" 87 | ], 88 | +"ppx-flags": [ 89 | + "ppx-sedlex/ppx.js" 90 | +], 91 | ``` 92 | 93 | 6. Write blazing-fast, type-safe, and Unicode-aware / multilingual lexers and parsers galore! 94 | 95 | [npm]: 96 | "npm, the package-manager for the JavaScript ecosystem" 97 | 98 | ## Versioning of this package 99 | 100 | Thanks to [SemVer not including a ‘generation’ number][semver-213], there's really no way I can 101 | reasonably tie this project's version on npm to the upstream version of Sedlex as released by the 102 | community maintainers. As ugly as it is, I've opted to pin the *major version* of `bs-sedlex`, to 103 | the *flattened* major and minor versions of the upstream project. 104 | 105 | I started doing this with Sedlex 2.0; thus, the mapping looks like this: 106 | 107 | | Sedlex | `bs-sedlex` | 108 | | --------- | ----------- | 109 | | `v1.99.4` | `v1.99.4` | 110 | | `v2.0` | `v20.0.x` | 111 | 112 | Correspondingly, this project can't really strictly adhere to SemVer. Tentatively, I intend to use 113 | the ‘minor’ field for breaking changes to the port, and the ‘patch’ field for everything else. 114 | 115 | [semver-213]: 116 | "A discussion around extending SemVer with an additional, human-focused major component" 117 | 118 | ## Parser-writing tips from a fellow JavaScripter 119 | 120 | I'm dogfooding this port on a parsing-project in JavaScript & ML (Excmd.js, ). 121 | Feel free to refer to that for a real-world example of compiling industrial-strength OCaml parsing 122 | tooling down to JavaScript for the web. Some takeaways follow: 123 | 124 | - Use [Menhir][] for parser-generation. Seriously. It's got [spectacularly clear 125 | docs][menhir-docs], an [entire chapter in Real World OCaml][menhir-rwo] dedicated to it, and a 126 | laundry-list of advanced features — everything from automated tooling that *explains* reported 127 | parsing-conflicts to you, neophyte language-developer; to [an incremental-parsing 128 | API][menhir-incremental] allowing you to implement extremely advanced error-recovery and 129 | introspection/reporting tools. 130 | 131 | - If you want to take that advice, unfortunately, there's no cool, easy port to JavaScript for you, 132 | like this one for Sedlex. 😉 (Maybe I'll publish one someday!) Until one exists, you'll have to 133 | maintain a dualistic build-system that uses the standard OCaml tooling and build-system (i.e. 134 | [opam][] and ) to produce the `.ml` parsing-automaton, and then feed that 135 | into the BuckleScript build. Maybe you can [glean][excmd-bsconf-generators] some 136 | [ideas][excmd-bsconf-sources] from [my experiences][excmd-dune-libraries] here. 137 | 138 | - A major selling-point of sedlex is the deep and thorough Unicode compatibility. Use it! [I 139 | suggest][uax-notes] reading through the Unicode Consortium's documentation on the topics, known 140 | as Unicode Standard Annex №. 31, or [UAX#31][uax31]. It goes into more detail than you could ever 141 | want to know about a vast number of topics. Get this stuff right! 142 | 143 | - I (ELLIOTTCABLE) am also very happy to help with any of these topics — I spent a lot of time and 144 | effort figuring this out; and although it'll hopefully improve as the BuckleScript community 145 | grows, until then, there's a lot of minutae to get just right. I'm active on both the 146 | [OCaml][ocaml-discord] and [ReasonML][reasonml-discord] Discord servers (why there are *two*, I 147 | cannot fathom); as well as on the Freenode IRC server, in both `#ocaml` and 148 | [`#ELLIOTTCABLE`](http://ell.io/irc). Feel free to reach out if you just want to chat about these 149 | topics, or to get more formal support! 150 | 151 | [Menhir]: "The Menhir parser-generator for OCaml" 152 | [menhir-docs]: 153 | "HTML version of Menhir's in-depth documentation" 154 | [menhir-rwo]: 155 | "Real World OCaml - Chapter 16: Parsing with OCamllex and Menhir" 156 | [menhir-incremental]: 157 | "Menhir manual: the Incremental API" 158 | [opam]: "opam, the OCaml package-manager" 159 | [excmd-bsconf-generators]: 160 | "Setting up the undocumented 'generators' configuration in bsb to invoke Menhir" 161 | [excmd-bsconf-sources]: 162 | "Using the undocumented 'generators' configuration in bsb to produce a parser-automaton" 163 | [excmd-dune-libraries]: 164 | "Ensuring menhirLib is reachable from the OCaml side" 165 | [uax-notes]: 166 | "My own notes on UAX#31 adherence with regards to lexing and sedlex" 167 | [uax31]: 168 | "Unicode® Standard Annex #31: Unicode Identifier and Pattern Syntax" 169 | [ocaml-discord]: 170 | "Official OCaml Discord server" 171 | [reasonml-discord]: 172 | "Official ReasonML Discord server" 173 | 174 | # sedlex 175 | 176 | [![Build Status](https://travis-ci.com/ocaml-community/sedlex.svg?branch=master)](https://travis-ci.com/ocaml-community/sedlex) 177 | 178 | Unicode-friendly lexer generator for OCaml. 179 | 180 | This package is licensed by LexiFi under the terms of the MIT license. 181 | 182 | sedlex was originally written by Alain Frisch 183 | and is now maintained as part of the 184 | ocaml-community repositories on github. 185 | 186 | ## API 187 | The API is documented [here](https://ocaml-community.github.io/sedlex). 188 | 189 | ## Overview 190 | 191 | sedlex is a lexer generator for OCaml, similar to ocamllex, but 192 | supporting Unicode. Contrary to ocamllex, lexer specifications for 193 | sedlex are embedded in regular OCaml source files. 194 | 195 | The lexers work with a new kind of "lexbuf", similar to ocamllex 196 | Lexing lexbufs, but designed to support Unicode, and abstracting from 197 | a specific encoding. A single lexer can work with arbitrary encodings 198 | of the input stream. 199 | 200 | sedlex is the successor of the ulex project. Contrary to ulex which 201 | was implemented as a Camlp4 syntax extension, sedlex is based on the 202 | new "-ppx" technology of OCaml, which allow rewriting OCaml parse 203 | trees through external rewriters. (And what a better name than "sed" 204 | for a rewriter?) 205 | 206 | As any -ppx rewriter, sedlex does not touch the concrete syntax of the 207 | language: lexer specifications are written in source file which comply 208 | with the standard grammar of OCaml programs. sedlex reuse the syntax 209 | for pattern matching in order to describe lexers (regular expressions 210 | are encoded within OCaml patterns). A nice consequence is that your 211 | editor (vi, emacs, ...) won't get confused (indentation, coloring) and 212 | you don't need to learn new priority rules. Moreover, sedlex is 213 | compatible with any front-end parsing technology: it works fine even 214 | if you use camlp4 or camlp5, with the standard or revised syntax. 215 | 216 | 217 | ## Lexer specifications 218 | 219 | 220 | sedlex adds a new kind of expression to OCaml: lexer definitions. 221 | The syntax for the new construction is: 222 | 223 | ```ocaml 224 | match%sedlex lexbuf with 225 | | R1 -> e1 226 | ... 227 | | Rn -> en 228 | | _ -> def 229 | ``` 230 | 231 | or: 232 | 233 | ```ocaml 234 | [%sedlex match lexbuf with 235 | | R1 -> e1 236 | ... 237 | | Rn -> en 238 | | _ -> def 239 | ] 240 | ``` 241 | 242 | (The first vertical bar is optional as in any OCaml pattern matching. 243 | Guard expressions are not allowed.) 244 | 245 | where: 246 | - lexbuf is an arbitrary lowercase identifier, which must refer to 247 | an existing value of type `Sedlexing.lexbuf`. 248 | - the Ri are regular expressions (see below); 249 | - the ei and def are OCaml expressions (called actions) of the same type 250 | (the type for the whole lexer definition). 251 | 252 | Unlike ocamllex, lexers work on stream of Unicode codepoints, not 253 | bytes. 254 | 255 | The actions can call functions from the Sedlexing module to extract 256 | (parts of) the matched lexeme, in the desired encoding. 257 | 258 | Regular expressions are syntactically OCaml patterns: 259 | 260 | - `"...."` (string constant): recognize the specified string 261 | - `'....'` (character constant) : recognize the specified character 262 | - `i` (integer constant) : recognize the specified codepoint 263 | - `'...' .. '...'`: character range 264 | - `i1 .. i2`: range between two codepoints 265 | - `R1 | R2` : alternation 266 | - `R, R2, ..., Rn` : concatenation 267 | - `Star R` : Kleene star (0 or more repetition) 268 | - `Plus R` : equivalent to `R, R*` 269 | - `Opt R` : equivalent to `("" | R)` 270 | - `Rep (R, n)` : equivalent to `R{n}` 271 | - `Rep (R, n .. m)` : equivalent to `R{n, m}` 272 | - `Chars "..."` : recognize any character in the string 273 | - `Compl R` : assume that R is a single-character length regexp (see below) 274 | and recognize the complement set 275 | - `Sub (R1,R2)` : assume that R is a single-character length regexp (see below) 276 | and recognize the set of items in `R1` but not in `R2` ("subtract") 277 | - `Intersect (R1,R2)` : assume that `R` is a single-character length regexp (see 278 | below) and recognize the set of items which are in both `R1` and `R2` 279 | - `lid` (lowercase identifier) : reference a named regexp (see below) 280 | 281 | A single-character length regexp is a regexp which does not contain (after 282 | expansion of references) concatenation, Star, Plus, Opt or string constants 283 | with a length different from one. 284 | 285 | 286 | 287 | Note: 288 | - The OCaml source is assumed to be encoded in Latin1 (for string 289 | and character literals). 290 | 291 | 292 | It is possible to define named regular expressions with the following 293 | construction, that can appear in place of a structure item: 294 | 295 | ```ocaml 296 | let lid = [%sedlex.regexp? R] 297 | ``` 298 | 299 | where lid is the regexp name to be defined and R its definition. The 300 | scope of the "lid" regular expression is the rest of the structure, 301 | after the definition. 302 | 303 | The same syntax can be used for local binding: 304 | 305 | ```ocaml 306 | let lid = [%sedlex.regexp? R] in 307 | body 308 | ``` 309 | 310 | The scope of "lid" is the body expression. 311 | 312 | 313 | ## Predefined regexps 314 | 315 | sedlex provides a set of predefined regexps: 316 | - any: any character 317 | - eof: the virtual end-of-file character 318 | - xml_letter, xml_digit, xml_extender, xml_base_char, xml_ideographic, 319 | xml_combining_char, xml_blank: as defined by the XML recommandation 320 | - tr8876_ident_char: characters names in identifiers from ISO TR8876 321 | - cc, cf, cn, co, cs, ll, lm, lo, lt, lu, mc, me, mn, nd, nl, no, pc, pd, 322 | pe, pf, pi, po, ps, sc, sk, sm, so, zl, zp, zs: as defined by the 323 | Unicode standard (categories) 324 | - alphabetic, ascii_hex_digit, hex_digit, id_continue, id_start, 325 | lowercase, math, other_alphabetic, other_lowercase, other_math, 326 | other_uppercase, uppercase, white_space, xid_continue, xid_start: as 327 | defined by the Unicode standard (properties) 328 | 329 | 330 | ## Running a lexer 331 | 332 | See the interface of the Sedlexing module for a description of how to 333 | create lexbuf values (from strings, stream or channels encoded in 334 | Latin1, utf8 or utf16, or from integer arrays or streams representing 335 | Unicode code points). 336 | 337 | It is possible to work with a custom implementation for lex buffers. 338 | To do this, you just have to ensure that a module called Sedlexing is 339 | in scope of your lexer specifications, and that it defines at least 340 | the following functions: start, next, mark, backtrack. See the interface 341 | of the Sedlexing module for more information. 342 | 343 | 344 | 345 | ## Using sedlex 346 | 347 | The quick way: 348 | 349 | ``` 350 | opam install sedlex 351 | ``` 352 | 353 | 354 | Otherwise, the first thing to do is to compile and install sedlex. 355 | You need a recent version of OCaml and [dune](https://dune.build/). 356 | 357 | ``` 358 | make 359 | ``` 360 | 361 | ### With findlib 362 | 363 | If you have findlib, you can use it to install and use sedlex. 364 | The name of the findlib package is "sedlex". 365 | 366 | Installation (after "make"): 367 | 368 | ``` 369 | make install 370 | ``` 371 | 372 | Compilation of OCaml files with lexer specifications: 373 | 374 | ``` 375 | ocamlfind ocamlc -c -package sedlex my_file.ml 376 | ``` 377 | 378 | When linking, you must also include the sedlex package: 379 | 380 | ``` 381 | ocamlfind ocamlc -o my_prog -linkpkg -package sedlex my_file.cmo 382 | ``` 383 | 384 | 385 | There is also a sedlex.ppx subpackage containing the code of the ppx 386 | filter. This can be used to build custom drivers (combining several ppx 387 | transformations in a single process). 388 | 389 | 390 | ### Without findlib 391 | 392 | You can use sedlex without findlib. To compile, you need to run the 393 | source file through -ppx rewriter ppx_sedlex. Moreover, you need to 394 | link the application with the runtime support library for sedlex 395 | (sedlexing.cma / sedlexing.cmxa). 396 | 397 | ## Examples 398 | 399 | The `examples/` subdirectory contains several samples of sedlex in use. 400 | 401 | ## Contributors 402 | 403 | - Benus Becker: implementation of Utf16 404 | - sghost: for Unicode 6.3 categories and properties 405 | - Peter Zotov: 406 | - improvements to the build system 407 | - switched parts of ppx_sedlex to using concrete syntax (with ppx_metaquot) 408 | - Steffen Smolka: port to dune 409 | - Elliott Cable: publish this BuckleScript-compatible fork on npm 410 | --------------------------------------------------------------------------------