├── .gitignore
├── .travis-ci.sh
├── .travis.yml
├── README.md
├── dune-project
├── humane_re.opam
├── lib
│   ├── dune
│   ├── s.mli
│   ├── str.ml
│   └── str.mli
└── lib_test
    ├── dune
    ├── test1.ml
    └── test_blog_post.ml
/.gitignore:
--------------------------------------------------------------------------------
1 | _build/
2 | *.install
3 | *.merlin
4 |
--------------------------------------------------------------------------------
/.travis-ci.sh:
--------------------------------------------------------------------------------
1 | # Travis bootstrap: select the PPA for the requested OCaml/opam pair, install
2 | # the toolchain through apt, then build and test the project.
3 | case "$OCAML_VERSION,$OPAM_VERSION" in
4 | 4.01.0,1.1.0) ppa=avsm/ocaml41+opam11 ;;
5 | *) echo "Unknown $OCAML_VERSION,$OPAM_VERSION" >&2; exit 1 ;;
6 | esac
7 |
8 | echo "yes" | sudo add-apt-repository "ppa:$ppa"
9 | sudo apt-get update -qq
10 | sudo apt-get install -qq ocaml ocaml-native-compilers camlp4-extra opam time libssl-dev
11 |
12 | export OPAMYES=1
13 | export OPAMVERBOSE=1
14 | echo OCaml version
15 | ocaml -version
16 | echo OPAM versions
17 | opam --version
18 | opam --git-version
19 |
20 | # opam init git://github.com/ocaml/opam-repository >/dev/null 2>&1
21 | opam init
22 | # NOTE(review): the repository listing shows neither a ./deps script nor a
23 | # Makefile (the build files present are dune ones) - confirm that this step
24 | # and the make targets below still exist before relying on this script.
25 | sh ./deps
26 |
27 | eval "$(opam config env)"
28 | make configure
29 | make all
30 | make test
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: c
2 | script: bash -ex .travis-ci.sh
3 | env: # the only pair handled by the case statement in .travis-ci.sh; NOTE(review): humane_re.opam requires ocaml >= 4.03.0 - confirm 4.01.0 is still intended
4 | - OCAML_VERSION=4.01.0 OPAM_VERSION=1.1.0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | humane-re
2 | =========
3 |
4 | Don't have a whole lot of time to read mli's? Do you just want to quickly split a string and get on with your life?
5 |
6 | Well, you've come to the right place. Humane-re attempts to provide an easy interface for 90% of your regex needs.
7 | Courtesy of ocaml-re, it's also free of:
8 |
9 | * C/C++ dependencies (re2, pcre)
10 |
11 | * Global variables (str)
12 |
--------------------------------------------------------------------------------
/dune-project:
--------------------------------------------------------------------------------
1 | (lang dune 1.11)
2 | (name humane_re)
3 | (explicit_js_mode)
4 |
--------------------------------------------------------------------------------
/humane_re.opam:
--------------------------------------------------------------------------------
1 | opam-version: "2.0"
2 | maintainer: "rudi.grinberg@gmail.com"
3 | authors: ["Rudi Grinberg"]
4 | license: "LGPL-2.0 with OCaml linking exception"
5 |
6 | homepage: "https://github.com/rgrinberg/humane-re"
7 | bug-reports: "https://github.com/rgrinberg/humane-re/issues"
8 | dev-repo: "git+https://github.com/rgrinberg/humane-re.git"
9 |
10 | build: [
11 | ["dune" "subst"] {pinned}
12 | ["dune" "build" "-p" name "-j" jobs
13 | "@install"
14 | "@runtest" {with-test}
15 | "@doc" {with-doc}
16 | ]
17 | ]
18 |
19 | depends: [
20 | "ocaml" {>="4.03.0"}
21 | "dune" {>= "1.11.0"}
22 | "re"
23 | "ounit" {with-test}
24 | ]
25 |
26 | synopsis: "The human friendly interface to ocaml-re"
27 |
28 |
--------------------------------------------------------------------------------
/lib/dune:
--------------------------------------------------------------------------------
1 | (library
2 | (name humane_re)
3 | (public_name humane_re)
4 | (modules s str)
5 | (modules_without_implementation s)
6 | (libraries re re.emacs))
7 |
--------------------------------------------------------------------------------
/lib/s.mli:
--------------------------------------------------------------------------------
1 | (** Signatures for Humane-re modules *)
2 |
3 | (** A slice of a string: [pos] is the pair of offsets delimiting the slice and [str] is its text *)
4 | type 'a substr = < pos : int * int ; str : 'a >
5 |
6 | type 'a split_token = [`Token of 'a substr | `Delim of 'a substr] (** A piece handed to the [fold_split] callback: text between delimiters, or a delimiter *)
7 |
8 | module type Group = sig
9 | type t (** The groups captured by one match *)
10 | type str (** The type of the captured text *)
11 | type index (** The type of the group name. Int usually. *)
12 |
13 | (** [group t i] is the text captured by group [i], or [None] if that group was not matched. *)
14 | val group : t -> index -> str option
15 |
16 | (** Same as [group] but raises [Not_found] if group is not matched *)
17 | val group_exn : t -> index -> str
18 |
19 | (** Same as [group] but returns the position of the group, as a pair of offsets, instead of its text *)
20 | val group_pos : t -> index -> (int * int) option
21 |
22 | (** The most general group extraction. Returns the string and the position
23 | if it exists *)
24 | val group_substr : t -> index -> (str substr) option
25 |
26 | (** Fold [f] over the groups that were matched; each one is passed with both its text and its position *)
27 | val fold_left : t -> init:'a -> f:('a -> str substr -> 'a) -> 'a
28 |
29 | (** Return all matched strings in the order they were matched *)
30 | val all : t -> str list
31 |
32 | (** Return all matched strings with the group names that were matched. *)
33 | val alli : t -> (index * str) list
34 |
35 | (** Return the string encompassing the full match *)
36 | val full_match : t -> str
37 |
38 | (** Return the position encompassing the full match *)
39 | val full_match_pos : t -> int * int
40 | end
41 |
42 | module type Re = sig
43 |   type t (** The type of regular expression *)
44 |
45 |   type str (** The type of string we match on *)
46 |
47 |   module Group : Group with type str = str
48 |
49 |   (** Compile a regular expression from a string *)
50 |   val regexp : string -> t
51 |
52 |   (** quote [str] returns a regular expression that matches [str] literally *)
53 |   val quote : str -> t
54 |
55 |   (** return true if the string matches the regular expression *)
56 |   val matches : t -> str -> bool
57 |
58 |   (** [split ?max re s] cuts [s] at the matches of [re] and returns the text
59 |       found between them. [max], when given, limits how many times [s] is
60 |       cut. *)
61 |   val split : ?max:int -> t -> str -> str list
62 |
63 |   (** Like [split] but the delimiters are kept: text is tagged [`Text] and
64 |       every match of the expression is tagged [`Delim]. *)
65 |   val split_delim : ?max:int -> t -> str -> [`Text of str | `Delim of str] list
66 |
67 |   (** Fold [f] over the tokens and delimiters of the string, from left to
68 |       right. Each piece carries its text and its position. *)
69 |   val fold_split : t -> str -> init:'b -> f:('b -> str split_token -> 'b) -> 'b
70 |
71 |   (** Search the string for a match, starting at offset [start]. Returns the
72 |       matched text with its position, or [None] if nothing matches. *)
73 |   val search_forward : ?start:int -> t -> str -> str substr option
74 |
75 |   (** Fold [f] over the successive matches found in the string; each match
76 |       is passed with its groups. *)
77 |   val fold_left_groups : t -> str -> init:'a -> f:('a -> Group.t -> 'a) ->'a
78 |
79 |   (** The groups of every match found in the string, in match order. *)
80 |   val find_groups : t -> str -> Group.t list
81 |
82 |   (** The text of the groups of every match, concatenated into one list. *)
83 |   val find_concat_groups : t -> str -> str list
84 |
85 |   (** Fold [f] over the successive matches found in the string; each match
86 |       is passed as its full text and position. *)
87 |   val fold_left_match : t -> str -> init:'a -> f:('a -> str substr -> 'a) -> 'a
88 |
89 |   (** The full text of every match found in the string. The result is a list
90 |       of [str], like every other value of this signature, so that the
91 |       signature does not silently assume [str = string]. *)
92 |   val find_matches : t -> str -> str list
93 |
94 |   (* TODO given up on replacement for now. will revisit once I'm content
95 |      with an interface*)
96 |
97 |   module Infix : sig
98 |     (** [s =~ re] is [matches re s] *)
99 |     val (=~) : str -> t -> bool
100 |   end
101 | end
102 |
--------------------------------------------------------------------------------
/lib/str.ml:
--------------------------------------------------------------------------------
1 | module List = ListLabels
2 | module String = StringLabels
3 | module Array = ArrayLabels
4 |
5 | type t = { (* a parsed pattern together with caches for its two compiled forms *)
6 | re: Re.t;
7 | mutable mtch: Re.re option; (* match re, filled by [get_mtch]: [re] with [Re.start] prepended *)
8 | mutable srch: Re.re option; (* search re, filled by [get_srch]: [re] compiled as is *)
9 | }
10 |
11 | type str = string
12 |
13 | (* Parse [s] as a case-sensitive Emacs-style pattern; both compiled forms start out empty. *)
14 | let regexp s =
15 |   let re = Re.Emacs.re ~case:true s in
16 |   { re; mtch = None; srch = None }
17 |
18 | (* [quote s] is a pattern that matches [s] literally.
19 |    The pattern is built directly with [Re.str] instead of backslash-escaping
20 |    [s] and parsing the result back: the literal never goes through the
21 |    Emacs-syntax parser, so no character of [s] can be read as an operator or
22 |    rejected as an unsupported escape sequence. Matching stays case
23 |    sensitive, as with [regexp]. *)
24 | let quote s =
25 |   { re = Re.str s; mtch = None; srch = None }
26 | ;;
31 |
32 | (* Anchored form of [re] ([Re.start] prepended), compiled on first use and cached in [re.mtch]. *)
33 | let get_mtch re =
34 |   match re.mtch with
35 |   | Some compiled -> compiled
36 |   | None -> let c = Re.compile (Re.seq [Re.start; re.re]) in re.mtch <- Some c; c
37 |
38 | (* [matches re s]: does [re] match [s] from offset 0? The match need not reach the end of [s]. *)
39 | let matches re s =
40 |   match Re.exec ~pos:0 (get_mtch re) s with
41 |   | _ -> true
42 |   | exception Not_found -> false
43 |
44 | module Infix = struct
45 |   let (=~) subject pattern = matches pattern subject (* [s =~ re] is [matches re s] *)
46 | end
47 |
48 | (* [string_after s n] is the suffix of [s] that starts at offset [n]. *)
49 | let string_after s n = let len = String.length s - n in String.sub s ~pos:n ~len
49 |
50 | (* Unanchored form of [re], compiled on first use and cached in [re.srch]. *)
51 | let get_srch re =
52 |   match re.srch with
53 |   | Some compiled -> compiled
54 |   | None -> let c = Re.compile re.re in re.srch <- Some c; c
55 |
56 | (* Offsets of the full match that [Re.exec] finds when searching [s] from
57 |    offset [p]; [Not_found] from [Re.exec] is left to propagate. *)
58 | let search_forward_exn re s p =
59 |   Re.Group.offset (Re.exec ~pos:p (get_srch re) s) 0
60 |
61 | (* [search_forward_exn] with [Not_found] turned into [None]. *)
62 | let search_forward re s p =
63 |   match search_forward_exn re s p with span -> Some span | exception Not_found -> None
64 |
65 | (* [next_delim re s start] is the delimiter at which to cut when scanning [s]
66 |    from offset [start], as [Some (pos, stop)], or [None] when there is none.
67 |    An empty match sitting exactly at [start] is rejected and the search is
68 |    retried one character further: accepting it would leave the scan position
69 |    unchanged, and the splitters below would then never terminate on a
70 |    pattern that can match the empty string. *)
71 | let next_delim re s start =
72 |   match search_forward re s start with
73 |   | Some (_, stop) when stop <= start ->
74 |     if start < String.length s then search_forward re s (start + 1) else None
75 |   | found -> found
76 |
77 | (* [split ?max t text] is the list of the pieces of [text] found between the
78 |    matches of [t]; the empty string gives the empty list. Once [max - 1] cuts
79 |    have been made the rest of [text] is returned as the last piece; the
80 |    default [max = 0] never reaches that limit.
81 |    TODO not tail recursive *)
82 | let split ?(max=0) t text =
83 |   let rec pieces start n =
84 |     if start > String.length text then []
85 |     else if n = 1 then [string_after text start]
86 |     else
87 |       match next_delim t text start with
88 |       | None -> [string_after text start]
89 |       | Some (pos, match_end) ->
90 |         String.sub text ~pos:start ~len:(pos - start) :: pieces match_end (n - 1)
91 |   in
92 |   if text = "" then [] else pieces 0 max
93 | ;;
94 |
95 | (* [split_delim ?max t text] is like [split] but keeps the delimiters: text is
96 |    tagged [`Text], every match of [t] is tagged [`Delim], and no empty
97 |    [`Text] is emitted in front of a delimiter. [max] is counted as in [split].
98 |    Not tail recursive, since max is assumed to be small *)
99 | let split_delim ?(max=0) t text =
100 |   let rec pieces start n =
101 |     if start >= String.length text then []
102 |     else if n = 1 then [`Text (string_after text start)]
103 |     else
104 |       match next_delim t text start with
105 |       | None -> [`Text (string_after text start)]
106 |       | Some (pos, match_end) ->
107 |         let delim = `Delim (String.sub text ~pos ~len:(match_end - pos)) in
108 |         let rest = delim :: pieces match_end (n - 1) in
109 |         if pos > start
110 |         then `Text (String.sub text ~pos:start ~len:(pos - start)) :: rest
111 |         else rest
112 |   in
113 |   pieces 0 max
114 | ;;
115 |
116 | (* [fold_split re text ~init ~f] folds [f] over the pieces of [text], from
117 |    left to right: [`Token] for the text between delimiters and [`Delim] for
118 |    every match of [re]. Each piece is an object giving its offsets ([pos])
119 |    and, computed on demand, its text ([str]). *)
120 | let fold_split re text ~init ~f =
121 |   let sub start end_ =
122 |     object
123 |       method pos = (start, end_)
124 |       method str = String.sub text ~pos:start ~len:(end_ - start)
125 |     end in
126 |   let len = String.length text in
127 |   let rec scan start acc =
128 |     if start >= len then acc
129 |     else
130 |       match next_delim re text start with
131 |       | None -> f acc (`Token (sub start len))
132 |       | Some (pos, match_end) ->
133 |         (* the token in front of the delimiter, if any, is reported first *)
134 |         let acc = if pos > start then f acc (`Token (sub start pos)) else acc in
135 |         scan match_end (f acc (`Delim (sub pos match_end)))
136 |   in scan 0 init
137 | ;;
119 |
120 | (* The groups captured by one match: the subject string together with the
121 |    offsets of every group. *)
122 | module Group = struct
123 |   type str = string
124 |   type t = {
125 |     string: str;
126 |     (* first element is the coordinate of the full match, the following
127 |        ones are the groups. A start offset of -1 is treated as "group not
128 |        matched". Offsets are kept rather than substrings to avoid copying. *)
129 |     matches: (int * int) array;
130 |   }
131 |   type index = int
132 |
133 |   let of_offsets string matches = { string ; matches }
134 |
135 |   (* Text of [t.string] between the offsets [pos] (included) and [stop]
136 |      (excluded). *)
137 |   let slice t (pos, stop) = String.sub t.string ~pos ~len:(stop - pos)
138 |
139 |   (* Offsets of group [i], or [None] when the group was not matched or [i]
140 |      is not a group index at all. The index is range-checked explicitly: an
141 |      out-of-bounds array access raises [Invalid_argument], not [Not_found],
142 |      so catching [Not_found] around it can never produce [None]. *)
143 |   let group_pos { matches ; _ } i =
144 |     if i < 0 || i >= Array.length matches then None
145 |     else
146 |       let (start, _) as span = matches.(i) in
147 |       if start = -1 then None else Some span
148 |
149 |   (* Text of group [i], if it was matched. *)
150 |   let group t i =
151 |     match group_pos t i with
152 |     | None -> None
153 |     | Some span -> Some (slice t span)
154 |
155 |   (* Same as [group] but raises [Not_found] instead of returning [None]. *)
156 |   let group_exn t i =
157 |     match group t i with
158 |     | None -> raise Not_found
159 |     | Some x -> x
160 |
161 |   (* Offsets and text of group [i]; the text is only extracted when the
162 |      [str] method is called. *)
163 |   let group_substr t i =
164 |     match group_pos t i with
165 |     | None -> None
166 |     | Some span ->
167 |       Some (object
168 |         method pos = span
169 |         method str = slice t span
170 |       end)
171 |
172 |   (* [(i, text)] for every matched group, by increasing index; index 0 (the
173 |      full match) is not part of the result. *)
174 |   let alli t =
175 |     let rec loop acc i =
176 |       if i <= 0 then acc
177 |       else
178 |         match group t i with
179 |         | None -> loop acc (pred i)
180 |         | Some s -> loop ((i, s) :: acc) (pred i)
181 |     in loop [] (Array.length t.matches - 1)
182 |
183 |   let all t = t |> alli |> List.map ~f:snd
184 |
185 |   (* Fold [f] over the matched groups by increasing index, skipping index 0
186 |      and the groups that were not matched. *)
187 |   let fold_left t ~init ~f =
188 |     let acc = ref init in
189 |     for i = 1 to Array.length t.matches - 1 do
190 |       match group_substr t i with
191 |       | Some sub -> acc := f !acc sub
192 |       | None -> ()
193 |     done;
194 |     !acc
195 |
196 |   let full_match_pos { matches ; _ } = matches.(0)
197 |
198 |   let full_match t = slice t (full_match_pos t)
199 |
200 | end
197 |
198 | (* [fold_left_groups t str ~init ~f] folds [f] over the successive matches of
199 |    [t] in [str], from left to right, passing each match as a [Group.t].
200 |    - Only the search is guarded against [Not_found]: a [Not_found] raised by
201 |      [f] reaches the caller instead of silently cutting the fold short.
202 |    - After an empty match the scan restarts one character further, so a
203 |      pattern that can match the empty string cannot loop forever.
204 |    - The loop is tail recursive. *)
205 | let fold_left_groups t str ~init ~f =
206 |   let re = get_srch t in
207 |   let len = String.length str in
208 |   let exec_from pos = try Some (Re.exec ~pos re str) with Not_found -> None in
209 |   let rec loop acc pos =
210 |     if pos > len then acc
211 |     else
212 |       match exec_from pos with
213 |       | None -> acc
214 |       | Some res ->
215 |         let (start, stop) = Re.Group.offset res 0 in
216 |         let match_t = res |> Re.Group.all_offset |> Group.of_offsets str in
217 |         let next_pos = if stop > start then stop else stop + 1 in
218 |         loop (f acc match_t) next_pos
219 |   in loop init 0
207 |
208 | (* Fold [f] over the successive matches, each given as its full-match offsets and text. *)
209 | let fold_left_match t str ~init ~f =
210 |   let as_substr g = object
211 |     method pos = Group.full_match_pos g
212 |     method str = Group.full_match g
213 |   end in
214 |   fold_left_groups t str ~init ~f:(fun acc g -> f acc (as_substr g))
214 |
215 | let find_groups t str = (* the groups of every match, in match order *)
216 |   let newest_first = fold_left_groups t str ~init:[] ~f:(fun acc g -> g :: acc) in
217 |   List.rev newest_first
218 |
219 | (* The full text of every match, in match order. *)
220 | let find_matches t s =
221 |   let collect acc m = m#str :: acc in
222 |   List.rev (fold_left_match t s ~init:[] ~f:collect)
223 |
224 | (* The text of the matched groups of every match, flattened into one list. *)
225 | let find_concat_groups t str =
226 |   let per_match = List.map ~f:Group.all (find_groups t str) in
227 |   List.concat per_match
229 |
230 | (* Search [str] from offset [start] (0 by default); the result gives the offsets of the match and, on demand, its text. *)
231 | let search_forward ?(start=0) re str =
232 |   match Re.exec ~pos:start (get_srch re) str with
233 |   | exception Not_found -> None
234 |   | res ->
235 |     let (first, stop) as span = Re.Group.offset res 0 in
236 |     Some (object
237 |       method pos = span
238 |       method str = String.sub str ~pos:first ~len:(stop - first)
239 |     end)
239 |
--------------------------------------------------------------------------------
/lib/str.mli:
--------------------------------------------------------------------------------
1 | (** Regular expressions over [string], with groups indexed by [int]. See s.mli for documentation *)
2 | module Group : (S.Group with type index = int
3 | and type str = string)
4 |
5 | include S.Re with type str = string
6 | and module Group := Group (* substitute the [Group] declared above for the one of [S.Re] *)
7 |
--------------------------------------------------------------------------------
/lib_test/dune:
--------------------------------------------------------------------------------
1 | (test
2 | (name test1)
3 | (modules test1)
4 | (libraries humane_re oUnit))
5 |
6 | (test
7 | (name test_blog_post)
8 | (modules test_blog_post)
9 | (libraries humane_re oUnit))
10 |
--------------------------------------------------------------------------------
/lib_test/test1.ml:
--------------------------------------------------------------------------------
1 | open OUnit2
2 |
3 | module Str = Humane_re.Str
4 |
5 | let quote s = Printf.sprintf "\"%s\"" s (* [s] between double quotes *)
6 | let printer strings = Printf.sprintf "[%s]" (String.concat " " (List.map quote strings)) (* shows a string list in failure messages *)
7 |
8 | (* splitting on underscores and spaces yields the five words *)
9 | let test_split_simple _ =
10 |   let expected = ["test"; "me"; "one"; "foo"; "bar"] in
11 |   let actual = Str.split (Str.regexp "_\\| ") "test_me one foo bar" in
12 |   assert_equal expected actual ~printer
14 |
15 | (* every run of digits is reported, in order of appearance *)
16 | let test_find_matches _ =
17 |   let expected = ["123";"456";"789";"000"] in
18 |   let actual = Str.find_matches (Str.regexp "[0-9]+") "123 456 789 testing 000" in
19 |   assert_equal expected actual ~printer
21 |
22 | (* the two groups of both matches come back as one flat list *)
23 | let test_find_concat_groups _ =
24 |   let re = Str.regexp "\\([0-9]+\\)_\\([0-9]+\\)" in
25 |   let actual = Str.find_concat_groups re "123_789 testin one two 000_111 foobar" in
26 |   assert_equal ["123"; "789"; "000"; "111"] actual ~printer
28 |
29 | (* three matches are found, each with its full text and its two groups *)
30 | let test_find_groups _ =
31 |   let re = Str.regexp "\\([0-9][0-9]\\) \\([a-z]+\\)" in
32 |   let groups = Str.find_groups re "12 fruit 15 apples XXX YYY 19 things" in
33 |   assert_equal (List.length groups) 3 ~printer:string_of_int;
34 |   let to_triple g =
35 |     match Str.Group.all g with
36 |     | [x ; y] -> (Str.Group.full_match g, x, y)
37 |     | _ -> assert_failure "did not match 2 elems" in
38 |   let expected = [("12 fruit", "12", "fruit");
39 |                   ("15 apples", "15", "apples");
40 |                   ("19 things", "19", "things")] in
41 |   assert_equal expected (List.map to_triple groups)
44 |
45 | (* folding while dropping the delimiters leaves the six tokens *)
46 | let test_fold_split1 _ =
47 |   let re = Str.regexp "[: ]" in
48 |   let keep_tokens acc = function
49 |     | `Token sub -> (sub # str) :: acc
50 |     | `Delim _ -> acc in
51 |   let tokens =
52 |     List.rev (Str.fold_split re "test:123456 one:456 four:xxx" ~init:[] ~f:keep_tokens) in
53 |   assert_equal ["test"; "123456"; "one"; "456"; "four"; "xxx"] tokens ~printer
55 |
56 | (* tokens and delimiters are both reported; the accumulator ends up in reverse order *)
57 | let test_bug_1 _ =
58 |   let text_of = function `Delim s | `Token s -> s#str in
59 |   let re = Humane_re.Str.regexp "x" in
60 |   let res =
61 |     Humane_re.Str.fold_split re "axbxc" ~init:[]
62 |       ~f:(fun acc piece -> text_of piece :: acc) in
63 |   assert_equal ["c"; "x"; "b"; "x"; "a"] res ~printer
64 |
65 | (* every test of this file, labelled and gathered into one suite *)
66 | let test_fixtures =
67 |   let cases = [
68 |     "test split simple", test_split_simple;
69 |     "test find matches", test_find_matches;
70 |     "test find concat groups", test_find_concat_groups;
71 |     "test find groups", test_find_groups;
72 |     "test fold split1", test_fold_split1;
73 |     "test bug 1 - fold split", test_bug_1;
74 |   ] in
75 |   "test Humane_re.Str" >::: List.map (fun (label, test) -> label >:: test) cases
76 |
77 | let () = run_test_tt_main test_fixtures
77 |
--------------------------------------------------------------------------------
/lib_test/test_blog_post.ml:
--------------------------------------------------------------------------------
1 | open OUnit2
2 | module Str = Humane_re.Str
3 |
4 | (* an address is accepted when the pattern, built once, matches it *)
5 | let is_valid_email =
6 |   let email_re = Str.regexp ".+@.+" in
7 |   fun email -> Str.matches email_re email
8 |
9 | let extract_words = Str.find_matches (Str.regexp "\\b\\([A-Za-z]+\\)\\b") (* every run of letters between word boundaries *)
10 |
11 | (* cut a header at its first colon into [Some (name, value)]; [None] when that does not give exactly two pieces *)
12 | let parse_header =
13 |   let re = Str.regexp ":[ \t]*" in
14 |   fun header ->
15 |     match Str.split ~max:2 re header with
16 |     | [name; value] -> Some (name, value)
17 |     | _ -> None
18 |
19 | (* each address is paired with the verdict expected from [is_valid_email] *)
20 | let test_emails _ =
21 |   let check (email, result) = assert_equal (is_valid_email email) result in
22 |   List.iter check [
23 |     ("", false);
24 |     ("dont@spam.me", true);
25 |     ("bill@gates.com", true);
26 |     ("xxx.yyy@zzz.qqq.com", true);
27 |     ("@xxx.com", false);
28 |     ("yyy@", false);
29 |   ]
30 |
31 | (* [(href, text)] for every anchor of [page] whose href points at imgur.com,
32 |    in page order. The pattern captures two groups, the href and the link
33 |    text: with a single group the [[href; text]] case below can never match. *)
34 | let extract_imgur_links page =
35 |   let is_imgur s =
36 |     let open Str.Infix in
37 |     s =~ (Str.regexp ".+\\bimgur\\.com.+")
38 |   in
39 |   let re = Str.regexp "<a href=\"\\([^\"]+\\)\">\\([^<>]+\\)</a>" in
40 |   page
41 |   |> Str.fold_left_groups re ~init:[]
42 |     ~f:(fun acc g ->
43 |       match Str.Group.all g with
44 |       | [href; text] when is_imgur href -> (href, text)::acc
45 |       | _ -> acc)
46 |   |> List.rev
47 |
48 | (* one "(href, text)" line per pair, for failure messages *)
49 | let list_strings_tuples l =
50 |   l
51 |   |> List.map (fun (x, y) -> Printf.sprintf "(%s, %s)" x y)
52 |   |> String.concat "\n"
53 |
54 | (* NOTE(review): the anchor markup of this fixture was rebuilt to fit the
55 |    expected links below; confirm it against the upstream test. *)
56 | let test_extract _ =
57 |   let page = "
58 | <a href=\"http://imgur.com/xxx.jpg\">text 1</a> <a href=\"http://imgur.com/gif.jpg\">text 2</a>
59 | <a href=\"http://example.com/page\">text 3</a>
60 | <b>random tag</b>
61 | <a href=\"not-a-link\">invalid link</a>
62 | <a href=\"http://imgur.com/png.jpg\">imgur</a>" in
63 |   let links = extract_imgur_links page in
64 |   assert_equal links ~printer:list_strings_tuples
65 |     [
66 |       ("http://imgur.com/xxx.jpg", "text 1");
67 |       ("http://imgur.com/gif.jpg", "text 2");
68 |       ("http://imgur.com/png.jpg", "imgur");
69 |     ]
64 |
65 | let test_words _ = (* the sample sentence holds 19 words *)
66 |   let text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit,
67 | sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." in
68 |   let count = text |> extract_words |> List.length in
69 |   assert_equal count 19
70 |
71 | (* well-formed headers parse into their name and value; the others give [None] *)
72 | let test_headers _ =
73 |   let expect_parsed (header, k, v) =
74 |     assert_equal (parse_header header) (Some (k, v)) in
75 |   let expect_rejected header = assert_equal (parse_header header) None in
76 |   List.iter expect_parsed [
77 |     ("User-Agent: firefox", "User-Agent", "firefox");
78 |     ("XXX:", "XXX", "");
79 |     ("Some-Header: one:two", "Some-Header", "one:two");
80 |   ];
81 |   List.iter expect_rejected [
82 |     "User-Agent firefox";
83 |     "User-Agent";
84 |     "";
85 |   ]
85 |
86 | (* every test of this file, labelled and gathered into one suite *)
87 | let test_fixtures =
88 |   let cases = [
89 |     "test validate emails", test_emails;
90 |     "test extract words", test_words;
91 |     "test parse headers", test_headers;
92 |     "test extract links", test_extract;
93 |   ] in
94 |   "test Humane_re.Str blog examplesk" >::: List.map (fun (label, test) -> label >:: test) cases
95 |
96 | let () = run_test_tt_main test_fixtures
96 |
--------------------------------------------------------------------------------