├── .gitignore ├── .travis-ci.sh ├── .travis.yml ├── README.md ├── dune-project ├── humane_re.opam ├── lib ├── dune ├── s.mli ├── str.ml └── str.mli └── lib_test ├── dune ├── test1.ml └── test_blog_post.ml /.gitignore: -------------------------------------------------------------------------------- 1 | _build/ 2 | *.install 3 | *.merlin 4 | -------------------------------------------------------------------------------- /.travis-ci.sh: -------------------------------------------------------------------------------- 1 | case "$OCAML_VERSION,$OPAM_VERSION" in 2 | 4.01.0,1.1.0) ppa=avsm/ocaml41+opam11 ;; 3 | *) echo Unknown $OCAML_VERSION,$OPAM_VERSION; exit 1 ;; 4 | esac 5 | 6 | echo "yes" | sudo add-apt-repository ppa:$ppa 7 | sudo apt-get update -qq 8 | sudo apt-get install -qq ocaml ocaml-native-compilers camlp4-extra opam time libssl-dev 9 | 10 | export OPAMYES=1 11 | export OPAMVERBOSE=1 12 | echo OCaml version 13 | ocaml -version 14 | echo OPAM versions 15 | opam --version 16 | opam --git-version 17 | 18 | # opam init git://github.com/ocaml/opam-repository >/dev/null 2>&1 19 | opam init 20 | sh ./deps 21 | 22 | eval `opam config env` 23 | make configure 24 | make all 25 | make test -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | script: bash -ex .travis-ci.sh 3 | env: 4 | - OCAML_VERSION=4.01.0 OPAM_VERSION=1.1.0 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | humane-re 2 | ========= 3 | 4 | Don't have a whole lot of time to read mli's? Do you just want to quickly split a string and get on with your life? 5 | 6 | Well you've come to the right place. 
Humane-re attempts to provide an easy interface for 90% of your regex needs. 7 | Courtesy of ocaml-re, it's also free of: 8 | 9 | * C/C++ dependencies (re2, pcre) 10 | 11 | * Global variables (str) 12 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 1.11) 2 | (name humane_re) 3 | (explicit_js_mode) 4 | -------------------------------------------------------------------------------- /humane_re.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | maintainer: "rudi.grinberg@gmail.com" 3 | authors: ["Rudi Grinberg"] 4 | license: "LGPL-2.0 with OCaml linking exception" 5 | 6 | homepage: "https://github.com/rgrinberg/humane-re" 7 | bug-reports: "https://github.com/rgrinberg/humane-re/issues" 8 | dev-repo: "git+https://github.com/rgrinberg/humane-re.git" 9 | 10 | build: [ 11 | ["dune" "subst"] {pinned} 12 | ["dune" "build" "-p" name "-j" jobs 13 | "@install" 14 | "@runtest" {with-test} 15 | "@doc" {with-doc} 16 | ] 17 | ] 18 | 19 | depends: [ 20 | "ocaml" {>="4.03.0"} 21 | "dune" {>= "1.11.0"} 22 | "re" 23 | "ounit" {with-test} 24 | ] 25 | 26 | synopsis: "The human friendly interface to ocaml-re" 27 | 28 | -------------------------------------------------------------------------------- /lib/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name humane_re) 3 | (public_name humane_re) 4 | (modules s str) 5 | (modules_without_implementation s) 6 | (libraries re re.emacs)) 7 | -------------------------------------------------------------------------------- /lib/s.mli: -------------------------------------------------------------------------------- 1 | (** Signatures for Humane-re modules *) 2 | 3 | (** A slice of a string *) 4 | type 'a substr = < pos : int * int ; str : 'a > 5 | 6 | type 'a split_token = [`Token of 'a
substr | `Delim of 'a substr] 7 | 8 | module type Group = sig 9 | type t 10 | type str 11 | type index (** The type of the group name. Int usually. *) 12 | 13 | (** Extract the group at [index] if it exists. *) 14 | val group : t -> index -> str option 15 | 16 | (** Same as [group] but raises [Not_found] if group is not matched *) 17 | val group_exn : t -> index -> str 18 | 19 | (** Extracts the group position, as [(start, stop)] offsets, if it exists *) 20 | val group_pos : t -> index -> (int * int) option 21 | 22 | (** The most general group extraction. Returns the string and the position 23 | if it exists *) 24 | val group_substr : t -> index -> (str substr) option 25 | 26 | (** Fold over the groups that matched, by increasing group index. In [Str] index 0 (the full match) is not visited; use [full_match] for it *) 27 | val fold_left : t -> init:'a -> f:('a -> str substr -> 'a) -> 'a 28 | 29 | (** Return all matched strings in the order they were matched. In [Str] the full match itself is not part of the list *) 30 | val all : t -> str list 31 | 32 | (** Return all matched strings with the group names that were matched. *) 33 | val alli : t -> (index * str) list 34 | 35 | (** Return the string encompassing the full match *) 36 | val full_match : t -> str 37 | 38 | (** Return the position encompassing the full match *) 39 | val full_match_pos : t -> int * int 40 | end 41 | 42 | module type Re = sig 43 | type t (** The type of regular expression *) 44 | 45 | type str (** The type of string we match on *) 46 | 47 | module Group : Group with type str = str 48 | 49 | (** Compile a regular expression from a string *) 50 | val regexp : string -> t 51 | 52 | (** quote [str] returns a regular expression that matches [str] literally *) 53 | val quote : str -> t 54 | 55 | (** Return true if the string matches the regular expression. In [Str] only the start of the string is anchored, so a matching prefix is enough *) 56 | val matches : t -> str -> bool 57 | (** [split ?max re s] returns the pieces of [s] that lie between the matches of [re]; the matched delimiters are dropped. A positive [max] caps the number of pieces, the last piece being the unsplit remainder *) 58 | val split : ?max:int -> t -> str -> str list 59 | (** Like [split] but the delimiters are kept: [`Text] pieces are interleaved with the [`Delim] strings found between them. In [Str] empty [`Text] pieces are left out *) val split_delim : ?max:int -> t -> str -> [`Text of str | `Delim of str] list 60 | (** Fold [f] over the tokens and delimiters of the string, from left to right. Every piece carries its position together with its text *) 61 | val fold_split : t -> str -> init:'b -> f:('b -> str split_token -> 'b) -> 'b 62 | (** First match found at or after offset [start] (0 when omitted in [Str]), with its position; [None] if there is none *) val search_forward : ?start:int -> t -> str ->
str substr option
63 |
64 |   val fold_left_groups : t -> str -> init:'a -> f:('a -> Group.t -> 'a) ->'a
65 |   val find_groups : t -> str -> Group.t list
66 |   val find_concat_groups : t -> str -> str list
67 |
68 |   val fold_left_match : t -> str -> init:'a -> f:('a -> str substr -> 'a) -> 'a
69 |   val find_matches : t -> str -> string list
70 |
71 |   (* TODO given up on replacement for now. will revisit once I'm content
72 |      with an interface*)
73 |
74 |   module Infix : sig
75 |     val (=~) : str -> t -> bool
76 |   end
77 | end
78 |
--------------------------------------------------------------------------------
/lib/str.ml:
--------------------------------------------------------------------------------
1 | module List = ListLabels
2 | module String = StringLabels
3 | module Array = ArrayLabels
4 |
5 | (* A regular expression plus lazily filled caches of its compiled forms. *)
6 | type t = {
7 |   re: Re.t;
8 |   mutable mtch: Re.re option; (* match re; we prepend [Re.start] *)
9 |   mutable srch: Re.re option; (* search re, stays as is *)
10 | }
11 |
12 | type str = string
13 |
14 | (* [regexp s] parses [s] with [Re.Emacs.re ~case:true]. Nothing is compiled
15 |    until the first match or search. *)
16 | let regexp s =
17 |   { re = Re.Emacs.re ~case:true s;
18 |     mtch = None;
19 |     srch = None }
20 |
21 | (* [quote s] is a regexp matching [s] literally: every character from the
22 |    set below is backslash-escaped before the result goes to [regexp]. *)
23 | let quote s =
24 |   let buf = Buffer.create (2 * String.length s) in
25 |   String.iter s ~f:(function
26 |     | '[' | ']' | '*' | '.' | '\\' | '?' | '+' | '^' | '$' as c ->
27 |       Buffer.add_char buf '\\';
28 |       Buffer.add_char buf c
29 |     | c -> Buffer.add_char buf c);
30 |   buf |> Buffer.contents |> regexp
31 |
32 | (* Compiled form used by [matches]; built on first use, then cached. *)
33 | let get_mtch re =
34 |   match re.mtch with
35 |   | Some r -> r
36 |   | None ->
37 |     let r = Re.compile (Re.seq [Re.start; re.re]) in
38 |     re.mtch <- Some r;
39 |     r
40 |
41 | (* [matches re s] is true when [re] matches [s] starting at position 0.
42 |    Only the start is anchored, so a matching prefix of [s] is enough. *)
43 | let matches re s =
44 |   try
45 |     ignore (Re.exec ~pos:0 (get_mtch re) s);
46 |     true
47 |   with Not_found -> false
48 |
49 | module Infix = struct
50 |   (* [s =~ re] is [matches re s]. *)
51 |   let (=~) s re = matches re s
52 | end
53 |
54 | (* The suffix of [s] that starts at offset [n]. *)
55 | let string_after s n = String.sub s ~pos:n ~len:(String.length s - n)
56 |
57 | (* Compiled form used by every search; built on first use, then cached. *)
58 | let get_srch re =
59 |   match re.srch with
60 |   | Some r -> r
61 |   | None ->
62 |     let r = Re.compile re.re in
63 |     re.srch <- Some r;
64 |     r
65 |
66 | (* [Re.exec] from offset [pos], with "no match" turned into [None]. The
67 |    handler covers this one call only, so a [Not_found] raised by code the
68 |    caller supplied is never mistaken for a failed search. *)
69 | let exec_opt re s pos =
70 |   try Some (Re.exec ~pos (get_srch re) s)
71 |   with Not_found -> None
72 |
73 | (* Offsets [(start, stop)] of the first match found at or after [pos]. *)
74 | let search_opt re s pos =
75 |   match exec_opt re s pos with
76 |   | None -> None
77 |   | Some res -> Some (Re.Group.offset res 0)
78 |
79 | (* Like [search_opt], but the match has to end strictly after [start].
80 |    A match that ends at [start] is empty; the splitters below resume at the
81 |    end of the previous match, so accepting it would leave them at the same
82 |    offset forever. Such a match is skipped by searching again one character
83 |    further on; anything found there ends after [start]. *)
84 | let search_progress re s start =
85 |   match search_opt re s start with
86 |   | Some (_, stop) when stop <= start ->
87 |     if start < String.length s then search_opt re s (start + 1) else None
88 |   | found -> found
89 |
90 | (* [split ?max re text] is the list of pieces of [text] lying between the
91 |    matches of [re]; the matched delimiters are dropped. With [max] > 0 at
92 |    most [max] pieces are returned and the last one is the unsplit rest;
93 |    the default 0 means no limit. A delimiter at the very end produces a
94 |    final "" piece, and the empty string splits to [].
95 |    TODO not tail recursive *)
96 | let split ?(max=0) t text =
97 |   let rec split start n =
98 |     if start > String.length text then []
99 |     else if n = 1 then [string_after text start]
100 |     else
101 |       match search_progress t text start with
102 |       | None -> [string_after text start]
103 |       | Some (pos, match_end) ->
104 |         String.sub text ~pos:start ~len:(pos - start)
105 |         :: split match_end (n - 1)
106 |   in
107 |   if text = "" then [] else split 0 max
108 |
109 | (* Same walk as [split], but the delimiters are kept: the result
110 |    interleaves [`Text] pieces with the [`Delim] strings found between
111 |    them, and empty [`Text] pieces are left out. With [max] > 0 at most
112 |    [max - 1] delimiters are consumed; whatever follows is one [`Text].
113 |    Not tail recursive, since max is assumed to be small *)
114 | let split_delim ?(max=0) t text =
115 |   let rec split start n =
116 |     if start >= String.length text then []
117 |     else if n = 1 then [`Text (string_after text start)]
118 |     else
119 |       match search_progress t text start with
120 |       | None -> [`Text (string_after text start)]
121 |       | Some (pos, match_end) ->
122 |         let delim = `Delim (String.sub text ~pos ~len:(match_end - pos)) in
123 |         let rest = delim :: split match_end (n - 1) in
124 |         if pos > start
125 |         then `Text (String.sub text ~pos:start ~len:(pos - start)) :: rest
126 |         else rest
127 |   in
128 |   split 0 max
129 |
130 | (* [fold_split re text ~init ~f] folds [f] over the tokens and delimiters
131 |    of [text] from left to right. Each piece is an object giving its
132 |    [(start, stop)] offsets in [text] and, on demand, its string. Empty
133 |    tokens are not reported. *)
134 | let fold_split re text ~init ~f =
135 |   let len = String.length text in
136 |   let sub start end_ =
137 |     object
138 |       method pos = (start, end_)
139 |       method str = String.sub text ~pos:start ~len:(end_ - start)
140 |     end in
141 |   let rec split start acc =
142 |     if start >= len then acc
143 |     else
144 |       match search_progress re text start with
145 |       | None -> f acc (`Token (sub start len))
146 |       | Some (pos, match_end) ->
147 |         (* text skipped before the delimiter is reported first *)
148 |         let acc =
149 |           if pos > start then f acc (`Token (sub start pos)) else acc in
150 |         split match_end (f acc (`Delim (sub pos match_end)))
151 |   in
152 |   split 0 init
153 |
154 | module Group = struct
155 |   type str = string
156 |   type t = {
157 |     string: str;
158 |     (* first element is the coordinate of the full match. all groups
159 |        included. This is not really the cleanest representation but we reuse
160 |        it to avoid copying *)
161 |     matches: (int * int) array;
162 |   }
163 |   type index = int
164 |
165 |   (* Pairs the subject string with the offsets of one match. *)
166 |   let of_offsets string matches = { string ; matches }
167 |
168 |   (* [(start, stop)] offsets of group [i], or [None] when [i] is not a
169 |      valid index or the slot holds the -1 marker tested below. The index is
170 |      checked explicitly: an out of range array access raises
171 |      [Invalid_argument], not [Not_found], so a handler cannot be relied on. *)
172 |   let group_pos { matches ; _ } i =
173 |     if i < 0 || i >= Array.length matches then None
174 |     else
175 |       let m = matches.(i) in
176 |       if fst m = -1 then None else Some m
177 |
178 |   (* The text of group [i], if [group_pos] knows the group. *)
179 |   let group t i =
180 |     match group_pos t i with
181 |     | None -> None
182 |     | Some (pos, stop) ->
183 |       Some (String.sub t.string ~pos ~len:(stop - pos))
184 |
185 |   (* Same as [group] but raises [Not_found] instead of returning [None]. *)
186 |   let group_exn t i =
187 |     match group t i with
188 |     | None -> raise Not_found
189 |     | Some x -> x
190 |
191 |   (* Offsets and (on demand) text of group [i] bundled in one object. *)
192 |   let group_substr t i =
193 |     match group_pos t i with
194 |     | None -> None
195 |     | Some (pos, stop) ->
196 |       Some (object
197 |         method pos = (pos, stop)
198 |         method str = String.sub t.string ~pos ~len:(stop - pos)
199 |       end)
200 |
201 |   (* [(index, text)] for every group that matched, by increasing index.
202 |      Index 0 (the full match) is not part of the result. The walk goes
203 |      from the last slot down and stops at [i <= 0], so it also ends on an
204 |      empty offsets array. *)
205 |   let alli t =
206 |     let rec loop acc i =
207 |       if i <= 0 then acc
208 |       else
209 |         match group t i with
210 |         | None -> loop acc (pred i)
211 |         | Some s -> loop ((i, s)::acc) (pred i)
212 |     in loop [] (Array.length t.matches - 1)
213 |
214 |   (* The texts of [alli], without the indices. *)
215 |   let all t = t |> alli |> List.map ~f:snd
216 |
217 |   (* Folds [f] over the matched groups by increasing index, skipping
218 |      index 0 (the full match) and groups that did not match. *)
219 |   let fold_left t ~init ~f =
220 |     let acc = ref init in
221 |     for i = 1 to Array.length t.matches - 1 do
222 |       match group_substr t i with
223 |       | None -> ()
224 |       | Some s -> acc := f !acc s
225 |     done;
226 |     !acc
227 |
228 |   (* Offsets of the whole match (slot 0). *)
229 |   let full_match_pos { matches ; _ } = matches.(0)
230 |
231 |   (* Text of the whole match. *)
232 |   let full_match { string ; matches } =
233 |     let (pos, stop) = matches.(0) in
234 |     String.sub string ~pos ~len:(stop - pos)
235 |
236 | end
237 |
238 | (* [fold_left_groups t str ~init ~f] folds [f] over the successive matches
239 |    of [t] in [str], left to right, each one handed over as a [Group.t].
240 |    The search resumes where the previous match stopped. After an empty
241 |    match it resumes one character further on, otherwise the same empty
242 |    match would be found again without end. Exceptions raised by [f],
243 |    [Not_found] included, propagate to the caller. *)
244 | let fold_left_groups t str ~init ~f =
245 |   let len = String.length str in
246 |   let rec loop acc pos =
247 |     if pos > len then acc
248 |     else
249 |       match exec_opt t str pos with
250 |       | None -> acc
251 |       | Some res ->
252 |         let (start, stop) = Re.Group.offset res 0 in
253 |         let match_t = res |> Re.Group.all_offset |> Group.of_offsets str in
254 |         let next = if stop = start then stop + 1 else stop in
255 |         loop (f acc match_t) next
256 |   in
257 |   loop init 0
258 |
259 | (* Like [fold_left_groups], but [f] only sees the full match of each
260 |    step, as an object with its offsets and text. *)
261 | let fold_left_match t str ~init ~f =
262 |   fold_left_groups t str ~init ~f:(fun acc match_ ->
263 |     f acc @@ object
264 |       method pos = Group.full_match_pos match_
265 |       method str = Group.full_match match_
266 |     end)
267 |
268 | (* All matches of [t] in [str], in order of appearance. *)
269 | let find_groups t str =
270 |   fold_left_groups t str ~init:[] ~f:(fun acc g -> g :: acc)
271 |   |> List.rev
272 |
273 | (* The text of every full match of [t] in [s], in order of appearance. *)
274 | let find_matches t s =
275 |   fold_left_match t s ~init:[]
276 |     ~f:(fun acc match_ -> (match_ # str)::acc)
277 |   |> List.rev
278 |
279 | (* The captured groups ([Group.all]) of every match, concatenated in
280 |    order of appearance. *)
281 | let find_concat_groups t str =
282 |   str
283 |   |> find_groups t
284 |   |> List.map ~f:Group.all
285 |   |> List.concat
286 |
287 | (* First match of [re] at or after offset [start] (default 0): an object
288 |    with the match offsets and, on demand, its text. [None] if [re] does
289 |    not match from there on. *)
290 | let search_forward ?(start=0) re str =
291 |   match exec_opt re str start with
292 |   | None -> None
293 |   | Some res ->
294 |     let (first, stop) as pos = Re.Group.offset res 0 in
295 |     Some (object
296 |       method pos = pos
297 |       method str = String.sub str ~pos:first ~len:(stop - first)
298 |     end)
299 |
--------------------------------------------------------------------------------
/lib/str.mli:
--------------------------------------------------------------------------------
1 | (** See s.mli for documentation *)
2 | module Group : (S.Group with type index = int
3 |                         and type str = string)
4 |
5 | include S.Re with type str = string
6 |               and module Group := Group
7 |
--------------------------------------------------------------------------------
/lib_test/dune:
--------------------------------------------------------------------------------
1 | (test
2 |  (name test1)
3 |  (modules test1)
4 |  (libraries humane_re oUnit))
5 |
6 | (test
7 |  (name test_blog_post)
8 |  (modules test_blog_post)
9 |  (libraries humane_re oUnit))
10 |
--------------------------------------------------------------------------------
/lib_test/test1.ml:
-------------------------------------------------------------------------------- 1 | open OUnit2 2 | 3 | module Str = Humane_re.Str 4 | (* Helpers that render a string list for assertion failure messages *) 5 | let quote s = "\"" ^ s ^ "\"" 6 | let printer strings = "[" ^ (String.concat " " (List.map quote strings)) ^ "]" 7 | 8 | let test_split_simple _ = 9 | let re = Str.regexp "_\\| " in 10 | let str = "test_me one foo bar" in 11 | let s = ["test"; "me"; "one"; "foo"; "bar"] in 12 | let s' = Str.split re str in 13 | assert_equal s s' ~printer 14 | 15 | let test_find_matches _ = 16 | let re = Str.regexp "[0-9]+" in 17 | let str = "123 456 789 testing 000" in 18 | let s = ["123";"456";"789";"000"] in 19 | let s' = Str.find_matches re str in 20 | assert_equal s s' ~printer 21 | 22 | let test_find_concat_groups _ = 23 | let re = Str.regexp "\\([0-9]+\\)_\\([0-9]+\\)" in 24 | let str = "123_789 testin one two 000_111 foobar" in 25 | let s = ["123"; "789"; "000"; "111"] in 26 | let s' = Str.find_concat_groups re str in 27 | assert_equal s s' ~printer 28 | (* Each match must expose its full text plus exactly the two captured groups *) 29 | let test_find_groups _ = 30 | let re = Str.regexp "\\([0-9][0-9]\\) \\([a-z]+\\)" in 31 | let str = "12 fruit 15 apples XXX YYY 19 things" in 32 | let groups = Str.find_groups re str in 33 | assert_equal (List.length groups) 3 ~printer:string_of_int; 34 | let s = [("12 fruit", "12", "fruit"); 35 | ("15 apples", "15", "apples"); 36 | ("19 things", "19", "things")] in 37 | let s' = groups |> List.map (fun g -> 38 | let fm = Str.Group.full_match g in 39 | match Str.Group.all g with 40 | | [x ; y] -> (fm, x, y) 41 | | _ -> assert_failure "did not match 2 elems" 42 | ) in 43 | assert_equal s s' 44 | (* Keeps only the `Token pieces; they are consed onto the accumulator as they arrive, hence the final List.rev *) 45 | let test_fold_split1 _ = 46 | let test_string = "test:123456 one:456 four:xxx" in 47 | let re = Str.regexp "[: ]" in 48 | let all_tokens = ["test"; "123456"; "one"; "456"; "four"; "xxx"] in 49 | let fs = Str.fold_split re test_string ~init:[] 50 | ~f:(fun acc t -> 51 | match t with 52 | | `Delim _ -> acc 53 | | `Token sub -> (sub # str)::acc) |> List.rev in 54 | assert_equal all_tokens
fs ~printer 55 | (* fold_split must also visit the token that follows the last delimiter; the expected list is in reverse order because the accumulator is not reversed here *) 56 | let test_bug_1 _ = 57 | let re = Humane_re.Str.regexp "x" in 58 | let res = 59 | Humane_re.Str.fold_split re "axbxc" ~init:[] ~f:(fun acc -> 60 | function 61 | | `Delim s 62 | | `Token s -> s#str :: acc) in 63 | assert_equal ["c"; "x"; "b"; "x"; "a"] res ~printer 64 | 65 | let test_fixtures = 66 | "test Humane_re.Str" >::: 67 | [ 68 | "test split simple" >:: test_split_simple; 69 | "test find matches" >:: test_find_matches; 70 | "test find concat groups" >:: test_find_concat_groups; 71 | "test find groups" >:: test_find_groups; 72 | "test fold split1" >:: test_fold_split1; 73 | "test bug 1 - fold split" >:: test_bug_1 74 | ] 75 | 76 | let _ = run_test_tt_main test_fixtures 77 | -------------------------------------------------------------------------------- /lib_test/test_blog_post.ml: -------------------------------------------------------------------------------- 1 | open OUnit2 2 | module Str = Humane_re.Str 3 | (* The pattern asks for at least one character on each side of an @ sign *) 4 | let is_valid_email = 5 | let email_re = Str.regexp ".+@.+" in 6 | let open Str.Infix in fun email -> 7 | email =~ email_re 8 | (* Every run of ASCII letters enclosed by word boundaries *) 9 | let extract_words = Str.(find_matches (regexp "\\b\\([A-Za-z]+\\)\\b")) 10 | (* Splits a header at its first colon (plus any blanks after it); ~max:2 keeps later colons inside the value. None unless exactly two pieces result *) 11 | let parse_header = 12 | let open Str in 13 | let re = regexp ":[ \t]*" in 14 | fun header -> 15 | match split ~max:2 re header with 16 | | [name; value] -> Some (name, value) 17 | | _ -> None 18 | 19 | let test_emails _ = 20 | let test_cases = [ 21 | ("", false); 22 | ("dont@spam.me", true); 23 | ("bill@gates.com", true); 24 | ("xxx.yyy@zzz.qqq.com", true); 25 | ("@xxx.com", false); 26 | ("yyy@", false); 27 | ] in 28 | test_cases |> List.iter (fun (email, result) -> 29 | assert_equal (is_valid_email email) result) 30 | (* NOTE(review): the regex below has a single group and the page literal in test_extract contains no URLs, yet the fold matches on two groups and the test expects hrefs. The HTML markup of both literals looks lost in extraction; confirm against the repository before relying on this text *) 31 | let extract_imgur_links page = 32 | let is_imgur s = 33 | let open Str.Infix in 34 | s =~ (Str.regexp ".+\\bimgur\\.com.+") 35 | in 36 | let re = Str.regexp "\\([^<>]+\\)" in 37 | page 38 | |> Str.fold_left_groups re ~init:[] 39 | ~f:(fun acc g -> 40 | match Str.Group.all g with 41 | | [href; text]
when is_imgur href -> (href, text)::acc 42 | | _ -> acc) 43 | |> List.rev 44 | (* Printer for assertion failures: one (href, text) pair per line *) 45 | let list_strings_tuples l = 46 | l 47 | |> List.map (fun (x, y) -> Printf.sprintf "(%s, %s)" x y) 48 | |> String.concat "\n" 49 | 50 | let test_extract _ = 51 | let page = " 52 | text 1 text 2 53 | text 3 54 | random tag 55 | invalid link 56 | imgur" in 57 | let links = extract_imgur_links page in 58 | assert_equal links ~printer:list_strings_tuples 59 | [ 60 | ("http://imgur.com/xxx.jpg", "text 1"); 61 | ("http://imgur.com/gif.jpg", "text 2"); 62 | ("http://imgur.com/png.jpg", "imgur"); 63 | ] 64 | 65 | let test_words _ = 66 | let words = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, 67 | sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." in 68 | let words_num = List.length (extract_words words) in 69 | assert_equal words_num 19 70 | (* Covers a plain header, an empty value after the colon, a value that itself contains a colon, and inputs without any colon *) 71 | let test_headers _ = 72 | let headers = [ 73 | ("User-Agent: firefox", "User-Agent", "firefox"); 74 | ("XXX:", "XXX", ""); 75 | ("Some-Header: one:two", "Some-Header", "one:two"); 76 | ] in 77 | headers |> List.iter (fun (header, k, v) -> 78 | assert_equal (parse_header header) (Some (k, v))); 79 | let none_headers = [ 80 | "User-Agent firefox"; 81 | "User-Agent"; 82 | ""; 83 | ] in 84 | none_headers |> List.iter (fun x -> assert_equal (parse_header x) None) 85 | 86 | let test_fixtures = 87 | "test Humane_re.Str blog examplesk" >::: 88 | [ 89 | "test validate emails" >:: test_emails; 90 | "test extract words" >:: test_words; 91 | "test parse headers" >:: test_headers; 92 | "test extract links" >:: test_extract; 93 | ] 94 | 95 | let _ = run_test_tt_main test_fixtures 96 | --------------------------------------------------------------------------------