├── .gitignore ├── dune-project ├── lib ├── dune ├── naive.ml ├── kmp.ml ├── kmp_online.ml ├── naive2.ml ├── api.ml └── two_way.ml ├── bench ├── dune └── main.ml ├── test ├── dune └── basic.ml ├── bench_scripts ├── human.sh ├── worst.sh ├── cold_small_all.sh ├── bench_random.sh ├── large.sh ├── cold_small.sh └── non_shared.sh └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 3.0) 2 | -------------------------------------------------------------------------------- /lib/dune: -------------------------------------------------------------------------------- 1 | (library (name search)) 2 | -------------------------------------------------------------------------------- /bench/dune: -------------------------------------------------------------------------------- 1 | (executable (name main) (libraries search)) 2 | -------------------------------------------------------------------------------- /test/dune: -------------------------------------------------------------------------------- 1 | (tests (names basic) (libraries search)) 2 | -------------------------------------------------------------------------------- /bench_scripts/human.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | hyperfine --warmup 10 \ 4 | "./bench.exe -search naive -n-iteration 10_000 -data human"\ 5 | "./bench.exe -search kmp -n-iteration 10_000 -data human"\ 6 | "./bench.exe -search two-way -n-iteration 10_000 -data human" 7 | -------------------------------------------------------------------------------- /bench_scripts/worst.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | 4 | hyperfine --warmup 10 \ 5 | "./bench.exe -search naive -n-iteration 10_000 -data worst"\ 6 | "./bench.exe -search kmp -n-iteration 10_000 -data worst"\ 7 | "./bench.exe -search two-way -n-iteration 10_000 -data worst" 8 | -------------------------------------------------------------------------------- /bench_scripts/cold_small_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | 4 | 5 | parameters="-mode replace-all -n-iteration 10_00 -data random -shared false -needle-size 100 " 6 | hyperfine --warmup 10 \ 7 | "./bench.exe -search naive $parameters"\ 8 | "./bench.exe -search two-way $parameters" 9 | -------------------------------------------------------------------------------- /bench_scripts/bench_random.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | 4 | hyperfine --warmup 10 \ 5 | "./bench.exe -search naive -n-iteration 10_000 -data random"\ 6 | "./bench.exe -search kmp -n-iteration 10_000 -data random"\ 7 | "./bench.exe -search two-way -n-iteration 10_000 -data random" 8 | -------------------------------------------------------------------------------- /bench_scripts/large.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | 4 | parameters="-n-iteration 10_000 -data random -shared true -needle-size 100 " 5 | hyperfine --warmup 10 \ 6 | "./bench.exe -search naive $parameters"\ 7 | "./bench.exe -search kmp $parameters"\ 8 | "./bench.exe -search two-way $parameters" 9 | -------------------------------------------------------------------------------- /bench_scripts/cold_small.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | 4 | parameters="-n-iteration 10_000 -data random-small -shared false -needle-size 2 -seed 44" 5 | hyperfine --warmup 10 \ 6 | "./bench.exe -search naive $parameters"\ 7 | "./bench.exe -search kmp $parameters"\ 8 | "./bench.exe -search two-way $parameters" 9 | -------------------------------------------------------------------------------- /bench_scripts/non_shared.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | hyperfine --warmup 10 \ 4 | "./bench.exe -search naive -n-iteration 10_000 -data human -shared=false"\ 5 | "./bench.exe -search kmp -n-iteration 10_000 -data human -shared=false"\ 6 | "./bench.exe -search two-way -n-iteration 10_000 -data human -shared=false" 7 | -------------------------------------------------------------------------------- /lib/naive.ml: -------------------------------------------------------------------------------- 1 | let search needle s = 2 | let nlen = String.length needle in 3 | let slen = String.length s in 4 | let rec search ~npos ~needle ~spos ~s = 5 | if npos >= nlen then Some (spos-nlen) 6 | else if spos >= slen then None 7 | else if s.[spos] = needle.[npos] then 8 | search ~npos:(npos+1) ~needle ~spos:(spos + 1) ~s 9 | else 10 | search ~npos:0 ~needle ~spos:(spos - npos + 1 ) ~s 11 | in 12 | search ~npos:0 ~needle ~spos:0 ~s 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Florian Angeletti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /test/basic.ml: -------------------------------------------------------------------------------- 1 | let implementations = [ 2 | "naive", Search.Naive.search; 3 | "kmp", Search.Kmp.search; 4 | "kmp_online", Search.Kmp_online.search; 5 | (* "kmp_compressed", Search.Kmp_compressed.search; *) 6 | ] 7 | 8 | let pp ppf = function 9 | | None -> Format.fprintf ppf "Not found" 10 | | Some x -> Format.fprintf ppf "%d" x 11 | 12 | let test (name,i) number (needle, s, expected) = 13 | match let r = i needle s in r = expected, r with 14 | | true, _ -> Format.printf "%d. %s:[ok]@." number name 15 | | false, r -> 16 | Format.printf "%d. %s:[ERROR] %a/%a@." 17 | number name 18 | pp r 19 | pp expected 20 | | exception e -> Format.printf "%d. %s:[CRASH]@." number name; raise e 21 | 22 | let tests = [ 23 | "a", "aa", Some 0; 24 | "b", "aaa", None; 25 | "a", "bbba", Some 3; 26 | "abc", "aaabc", Some 2; 27 | "aa", "abaa", Some 2; 28 | "aaba", "aaaaba", Some 2; 29 | "abaa", "ababaae", Some 2; 30 | "abcabcd", "abcabcabcdee", Some 3; 31 | "aababcabcd", "aadabceabcdeabcaabababcfaababcabcdabcdeabcdef", Some 24; 32 | ] 33 | 34 | 35 | let () = 36 | Format.printf "@."; 37 | List.iter (fun i -> 38 | List.iteri (test i) tests 39 | ) implementations; 40 | -------------------------------------------------------------------------------- /lib/kmp.ml: -------------------------------------------------------------------------------- 1 | 2 | let compute_backedges needle = 3 | let nlen = String.length needle in 4 | (* edges.(i) = k means that if we fail to accept a new character at position 5 | i, we should retry at position k *) 6 | let edges = Array.make nlen 0 in 7 | edges.(0) <- -1; 8 | let rec fill ~back_pos ~new_pos = 9 | if new_pos >= nlen then () 10 | else 11 | let new_char = needle.[new_pos] in 12 | if new_char = needle.[back_pos] then begin 13 | edges.(new_pos) <- edges.(back_pos); 14 | fill ~back_pos:(back_pos+1) ~new_pos:(new_pos+1) 15 | end 16 | else begin 17 | edges.(new_pos) <- back_pos; 18 | let back_pos = go_back new_char edges.(back_pos) in 19 | fill ~back_pos ~new_pos:(new_pos+1) 20 | end 21 | and go_back new_char back_pos = 22 | if back_pos <= 0 then 0 23 | else if needle.[back_pos] = new_char then 24 | back_pos 25 | else 26 | go_back new_char edges.(back_pos) 27 | in 28 | fill ~back_pos:0 ~new_pos:1; 29 | edges 30 | 31 | let execute_search edges ~needle s = 32 | let nlen = String.length needle in 33 | let slen = String.length s in 34 | let rec search edges needle nlen slen ~npos pos s = 35 | if npos >= nlen then Some (pos-nlen) 36 | else if pos >= slen then None 37 | else if needle.[npos] = s.[pos] then 38 | search edges needle nlen slen ~npos:(1+npos) (1+pos) s 39 | else if npos = 0 then 40 | search edges needle nlen slen ~npos:0 (1+pos) s 41 | else 42 | let npos = edges.(npos) in 43 | if npos < 0 then 44 | search edges needle nlen slen ~npos:0 (1+pos) s 45 | else 46 | search edges needle nlen slen ~npos pos s 47 | in 48 | search edges needle nlen slen ~npos:0 0 s 49 | 50 | 51 | let search needle = 52 | if String.length needle = 0 then fun _ -> Some 0 53 | else 54 | let edges = compute_backedges needle in 55 | fun s -> execute_search ~needle edges s 56 | 57 | 58 | let find_sub ?start:_ ~sub = 59 | search sub 60 | 61 | let replace_first ?start ~sub:needle ~by s = 62 | match find_sub ~start ~sub:needle s with 63 | | None -> s 64 | | Some i -> 65 | let rest_first = i + String.length needle in 66 | let rest_len = String.length s - i - String.length needle in 67 | String.concat by String.[sub s 0 i; sub s rest_first rest_len] 68 | 69 | let replace_all ?start:_ ~sub:_ ~by:_ _ = failwith "Not implemented" 70 | -------------------------------------------------------------------------------- /lib/kmp_online.ml: -------------------------------------------------------------------------------- 1 | type backedge_state = 2 | | On_going of { 3 | last_pos:int; 4 | back_pos:int; 5 | edges: int array; 6 | } 7 | | Done of int array 8 | 9 | let init_backedges needle = 10 | let edges = Array.make (String.length needle) 0 in 11 | edges.(0) <- -1; 12 | On_going { last_pos = 0; back_pos = 0; edges } 13 | 14 | let rec go_back needle edges new_char back_pos = 15 | if back_pos <= 0 then 0 16 | else if needle.[back_pos] = new_char then 17 | back_pos 18 | else 19 | go_back needle edges new_char edges.(back_pos) 20 | 21 | let execute_update_backedges edges back_pos needle new_pos = 22 | let new_char = needle.[new_pos] in 23 | if new_char = needle.[back_pos] then begin 24 | edges.(new_pos) <- edges.(back_pos); 25 | On_going { edges; back_pos=back_pos+1; last_pos = new_pos } 26 | end 27 | else begin 28 | edges.(new_pos) <- back_pos; 29 | let back_pos = go_back needle edges new_char edges.(back_pos) in 30 | On_going { edges; back_pos; last_pos = new_pos} 31 | end 32 | 33 | let update_backedges state needle new_pos = match state with 34 | | Done _ -> state 35 | | On_going {last_pos;back_pos;edges} -> 36 | if new_pos > last_pos then 37 | execute_update_backedges edges back_pos needle new_pos 38 | else state 39 | 40 | let backedge state pos = match state with 41 | | Done e -> e.(pos) 42 | | On_going r -> r.edges.(pos) 43 | 44 | let search needle s = 45 | let nlen = String.length needle in 46 | let slen = String.length s in 47 | let rec search edges ~npos pos = 48 | if npos >= nlen then Some (pos-nlen) 49 | else if pos >= slen then None 50 | else 51 | let edges = update_backedges edges needle npos in 52 | if needle.[npos] = s.[pos] then 53 | search edges ~npos:(1+npos) (1+pos) 54 | else if npos = 0 then 55 | search edges ~npos:0 (1+pos) 56 | else 57 | let npos = backedge edges npos in 58 | if npos < 0 then 59 | search edges ~npos:0 (1+pos) 60 | else 61 | search edges ~npos pos 62 | in 63 | search (init_backedges needle) ~npos:0 0 64 | 65 | 66 | 67 | let find_sub ?start:_ ~sub = 68 | search sub 69 | 70 | let replace_first ?start ~sub:needle ~by s = 71 | match find_sub ~start ~sub:needle s with 72 | | None -> s 73 | | Some i -> 74 | let rest_first = i + String.length needle in 75 | let rest_len = String.length s - i - String.length needle in 76 | String.concat by String.[sub s 0 i; sub s rest_first rest_len] 77 | 78 | let replace_all ?start:_ ~sub:_ ~by:_ _ = failwith "Not implemented" 79 | -------------------------------------------------------------------------------- /lib/naive2.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2025 Daniel C. Bünzli. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | open String 7 | 8 | 9 | let invalid_start ~start len = 10 | let i = Int.to_string in 11 | invalid_arg @@ concat "" ["start: "; i start; "not in range [0;"; i len; "]"] 12 | 13 | let is_sub ~sublen ~sub s j = 14 | let i = ref 0 in 15 | while !i < sublen && Char.equal (get s (j + !i)) (get sub !i) 16 | do incr i done; 17 | !i = sublen 18 | 19 | let primitive_find_sub ~start ~sub s = 20 | let slen = length s in 21 | if not (0 <= start && start <= slen) then invalid_start ~start slen else 22 | let sublen = length sub in 23 | let smax = slen - sublen in 24 | let j = ref start in 25 | while !j <= smax && not (is_sub ~sublen ~sub s !j) do incr j done; 26 | if !j <= smax then !j else -1 27 | 28 | let primitive_rfind_sub ~start ~sub s = 29 | let slen = length s in 30 | if not (0 <= start && start <= slen) then invalid_start ~start slen else 31 | let sublen = length sub in 32 | let smax = length s - sublen in 33 | let j = ref (if start > smax then smax else start) in 34 | while !j >= 0 && not (is_sub ~sublen ~sub s !j) do decr j done; 35 | if !j >= 0 then !j else -1 36 | 37 | let includes ~affix s = primitive_find_sub ~start:0 ~sub:affix s <> -1 38 | 39 | let find_sub ?(start = 0) ~sub s = 40 | match primitive_find_sub ~start ~sub s with -1 -> None | i -> Some i 41 | 42 | let rfind_sub ?start ~sub s = 43 | let start = match start with None -> length s | Some s -> s in 44 | match primitive_rfind_sub ~start ~sub s with -1 -> None | i -> Some i 45 | 46 | let find_all_sub ?(start = 0) f ~sub s acc = 47 | let rec loop f acc sub s ~start ~slen = 48 | if start > slen then acc else 49 | match primitive_find_sub ~start ~sub s with 50 | | -1 -> acc 51 | | i -> 52 | let acc = f i acc in 53 | let start = i + Int.max (length sub) 1 in 54 | loop f acc sub s ~start ~slen 55 | in 56 | let slen = length s in 57 | if not (0 <= start && start <= slen) then invalid_start ~start slen else 58 | loop f acc sub s ~start ~slen 59 | 60 | let rfind_all_sub ?start f ~sub s acc = 61 | let rec loop f acc sub s ~start ~slen = 62 | if start < 0 then acc else 63 | match primitive_rfind_sub ~start ~sub s with 64 | | -1 -> acc 65 | | i -> 66 | let start = i - Int.max (length sub) 1 in 67 | loop f (f i acc) sub s ~start ~slen 68 | in 69 | let slen = length s in 70 | let start = match start with None -> length s | Some s -> s in 71 | if not (0 <= start && start <= slen) then invalid_start ~start slen else 72 | loop f acc sub s ~start ~slen 73 | 74 | let replace_first ?(start = 0) ~sub:needle ~by s = 75 | match primitive_find_sub ~start ~sub:needle s with 76 | | -1 -> s 77 | | i -> 78 | let rest_first = i + length needle in 79 | let rest_len = length s - i - length needle in 80 | concat by [sub s 0 i; sub s rest_first rest_len] 81 | 82 | let replace_all ?start ~sub:needle ~by s = 83 | let chunk_first = ref 0 in 84 | let add_chunk i acc = 85 | let last_chunk = sub s !chunk_first (i - !chunk_first) in 86 | chunk_first := i + length needle; last_chunk :: acc 87 | in 88 | match find_all_sub ?start add_chunk ~sub:needle s [] with 89 | | [] -> s 90 | | chunks -> 91 | let last_chunk = sub s !chunk_first (length s - !chunk_first) in 92 | concat by (List.rev (last_chunk :: chunks)) 93 | -------------------------------------------------------------------------------- /bench/main.ml: -------------------------------------------------------------------------------- 1 | 2 | let _exp scale () = int_of_float (-. scale *. log (Random.float 1.)) 3 | 4 | let substring len text = 5 | let tlen = String.length text in 6 | let free_len = tlen - len in 7 | if free_len < 0 then failwith "Too large human text size" else 8 | let off0 = free_len/2 in 9 | let offset = off0 + Random.int off0 in 10 | String.sub text offset len 11 | 12 | let multisample_shared f ~haystack:text niter = 13 | for _i = 1 to niter do f text done 14 | 15 | let multisample f ~needle ~haystack niter = 16 | for _i = 0 to niter-1 do f needle haystack done 17 | 18 | 19 | type mode = 20 | | Replace_first 21 | | Replace_all 22 | | Search_first 23 | 24 | type bench_kind = 25 | | Worst 26 | | Random_large 27 | | Random_small 28 | | Human 29 | 30 | 31 | let implems: (string * (module Search.Api.Bench)) list = [ 32 | "naive", (module Search.Naive2); 33 | "kmp", (module Search.Kmp); 34 | "kmp_online", (module Search.Kmp_online); 35 | "two-way", (module Search.Two_way) 36 | ] 37 | 38 | let modes = [ 39 | "search", Search_first; 40 | "replace-first", Replace_first; 41 | "replace-all", Replace_all 42 | ] 43 | 44 | let data_kinds = [ 45 | "worst", Worst; 46 | "human", Human; 47 | "random", Random_large; 48 | "random-small", Random_small 49 | ] 50 | 51 | 52 | let symbol r l = 53 | Arg.Symbol (List.map fst l, fun s -> r := List.assoc s l) 54 | 55 | let implem = ref (module Search.Naive2: Search.Api.Bench) 56 | and shared = ref false 57 | and niter = ref 1000 58 | and size = ref 10 59 | and mode = ref Search_first 60 | and data = ref Human 61 | and haystack_size = ref (4 * 1024) 62 | and seed = ref 45 63 | 64 | let args = [ 65 | "-search", symbol implem implems, "implem"; 66 | "-data", symbol data data_kinds, "benchmark data kind"; 67 | "-mode", symbol mode modes, "function to benchmark"; 68 | "-needle-size", Arg.Int ((:=) size), "needle size"; 69 | "-haystack-size", Arg.Int ((:=) haystack_size), "haystack size"; 70 | "-n-iteration", Arg.Int ((:=) niter), "number of iteration"; 71 | "-shared", Arg.Bool ((:=) shared), "share needle compilation"; 72 | "-seed", Arg.Int ((:=) seed), "random seed" 73 | ] 74 | 75 | let _human_text size = 76 | substring size Text.data 77 | 78 | (* Benchmarking *) 79 | 80 | 81 | 82 | 83 | let random_string n = String.init n (fun _ -> Char.chr (0x20 + Random.int (0x7E-0x20))) 84 | let random_small_string alphasize n = 85 | String.init n (fun _ -> Char.chr (0x61 + Random.int alphasize)) 86 | 87 | 88 | (* This code is Copyright (c) 2025 Daniel C. Bünzli. All rights reserved. 89 | SPDX-License-Identifier: ISC *) 90 | let worst_ab_string n = (* a^(n-1)b *) 91 | String.init n (fun i -> if i = n - 1 then 'b' else 'a') 92 | 93 | let bench_needle needle_size data_kind () = match data_kind with 94 | | Worst -> worst_ab_string needle_size 95 | | Random_large -> random_string needle_size 96 | | Random_small -> random_small_string 2 needle_size 97 | | Human -> substring needle_size Text.data 98 | 99 | let bench_haystack haystack_size data_kind () = match data_kind with 100 | | Worst -> Stdlib.String.make haystack_size 'a' 101 | | Random_large -> random_string haystack_size 102 | | Random_small -> random_small_string 2 haystack_size 103 | | Human -> substring haystack_size Text.data 104 | 105 | 106 | let compile (module String:Search.Api.Bench) mode needle = 107 | let ignored f h = Sys.opaque_identity (ignore (f h)) in 108 | let f = 109 | match mode with 110 | | Replace_first -> ignored @@ String.replace_first ~sub:needle ~by:"" 111 | | Search_first -> ignored @@ String.find_sub ?start:None ~sub:needle 112 | | Replace_all -> ignored @@ String.replace_all ~sub:needle ~by:"" 113 | in 114 | fun h -> ignore (f h) 115 | 116 | 117 | 118 | let () = 119 | Arg.parse args ignore "main -search {kmp,naive}"; 120 | Random.init !seed; 121 | let niter = !niter in 122 | let needle = bench_needle !size !data () in 123 | Format.eprintf "needle=%S@." needle; 124 | let haystack = bench_haystack !haystack_size !data () in 125 | if !shared then 126 | let f = compile !implem !mode needle in 127 | multisample_shared f ~haystack niter 128 | else 129 | multisample (compile !implem !mode) 130 | ~needle 131 | ~haystack 132 | niter 133 | -------------------------------------------------------------------------------- /lib/api.ml: -------------------------------------------------------------------------------- 1 | module type T = sig 2 | val includes : affix:string -> string -> bool 3 | (** [includes ~affix s] is [true] if and only if [s] has [affix] as 4 | a substring. 5 | 6 | @since X.XX *) 7 | 8 | val find_sub : ?start:int -> sub:string -> string -> int option 9 | (** [find_sub ~start ~sub s] is the start position (if any) of the 10 | first occurence of [sub] in [s] after or at position [start] 11 | (which includes index [start] if it exists, defaults to [0]). 12 | 13 | Note if you need to search for [sub] multiple times in [s] use 14 | {!find_all_sub} it is more efficient. 15 | 16 | @raise Invalid_argument if [start] is not a valid position of [s]. 17 | 18 | @since X.XX *) 19 | 20 | val rfind_sub : ?start:int -> sub:string -> string -> int option 21 | (** [rfind_sub ~start ~sub s] is the start position (if any) of the 22 | first occurences of [sub] in [s] before or at position [start] 23 | (which includes index [start] if it exists, defaults to 24 | [String.length s]). 25 | 26 | Note if you need to search for [sub] multiple times in [s] use 27 | {!rfind_all_sub} it is more efficient. 28 | 29 | @raise Invalid_argument if [start] is not a valid position of [s]. 30 | 31 | @since X.XX *) 32 | 33 | val find_all_sub : 34 | ?start:int -> (int -> 'acc -> 'acc) -> sub:string -> string -> 'acc -> 'acc 35 | (** [find_all_sub ~start f ~sub s acc], starting with [acc], folds [f] over 36 | all non-overlapping starting positions of [sub] in [s] after or at 37 | position [start] (which includes index [start] if it exists, defaults 38 | to [0]). This is [acc] if [sub] could not be found in [s]. 39 | 40 | @raise Invalid_argument if [start] is not a valid position of [s]. 41 | 42 | @since X.XX *) 43 | 44 | val rfind_all_sub : 45 | ?start:int -> (int -> 'acc -> 'acc) -> sub:string -> string -> 'acc -> 'acc 46 | (** [rfind_all_sub ~start f ~sub s acc], starting with [acc], folds 47 | [f] over all non-overlapping starting positions of [sub] in [s] 48 | before or at position [start] (which includes index [start] if 49 | it exists, defaults to [String.length s]). This is [acc] if 50 | [sub] could not be found in [s]. 51 | 52 | @raise Invalid_argument if [start] is not a valid position of [s]. 53 | 54 | @since X.XX *) 55 | 56 | val replace_first : ?start:int -> sub:string -> by:string -> string -> string 57 | (** [replace_first ~start ~sub ~by s] replaces in [s] the first occurence 58 | of [sub] at or after position [start] (defaults to [0]) by [by]. 59 | 60 | @raise Invalid_argument if [start] is not a valid position of [s]. 61 | 62 | @since X.XX *) 63 | 64 | val replace_all : ?start:int -> sub:string -> by:string -> string -> string 65 | (** [replace_all ~start ~sub ~by] replaces in [s] all 66 | non-overlapping occurences of [sub] at or after position [start] 67 | (default to [0]) by [by]. 68 | 69 | @raise Invalid_argument if [start] is not a valid position of [s]. 70 | 71 | @since X.XX *) 72 | end 73 | 74 | module type Bench= sig 75 | val find_sub : ?start:int -> sub:string -> string -> int option 76 | (** [find_sub ~start ~sub s] is the start position (if any) of the 77 | first occurence of [sub] in [s] after or at position [start] 78 | (which includes index [start] if it exists, defaults to [0]). 79 | 80 | Note if you need to search for [sub] multiple times in [s] use 81 | {!find_all_sub} it is more efficient. 82 | 83 | @raise Invalid_argument if [start] is not a valid position of [s]. 84 | 85 | @since X.XX *) 86 | 87 | val replace_first : ?start:int -> sub:string -> by:string -> string -> string 88 | (** [replace_first ~start ~sub ~by s] replaces in [s] the first occurence 89 | of [sub] at or after position [start] (defaults to [0]) by [by]. 90 | 91 | @raise Invalid_argument if [start] is not a valid position of [s]. 92 | 93 | @since X.XX *) 94 | 95 | val replace_all : ?start:int -> sub:string -> by:string -> string -> string 96 | (** [replace_all ~start ~sub ~by] replaces in [s] all 97 | non-overlapping occurences of [sub] at or after position [start] 98 | (default to [0]) by [by]. 99 | *) 100 | 101 | end 102 | -------------------------------------------------------------------------------- /lib/two_way.ml: -------------------------------------------------------------------------------- 1 | (*--------------------------------------------------------------------------- 2 | Copyright (c) 2025 Daniel C. Bünzli. All rights reserved. 3 | SPDX-License-Identifier: ISC 4 | ---------------------------------------------------------------------------*) 5 | 6 | (** {1:design_notes Design notes} 7 | 8 | {ul 9 | {- We implement string search with the 10 | {{:https://doi.org/10.1145/116825.116845}two way algorithm}. It 11 | has a good worse case complexity and the result of 12 | pre-preprocessing is only two integers (versus allocating tables 13 | for Knuth-Morris-Pratt or Boyer-Moore).} 14 | {- We raise on invalid position, we could also return [None] 15 | or make the functions behave like the identity (if applicable). But 16 | this is more in the spirit of the current [String] module and it's 17 | also unclear which one is best.} 18 | {- In `rfind_sub`, the [start] argument has a different semantics from 19 | the argument [r*_from] functions. The advantage of the definition 20 | used below is that it coincides with the mental model of indices 21 | which is the one you want to work with.}} 22 | {- The [start] argument remains however a position (vs index) 23 | so that definitions on empty arguments are consistent and well 24 | defined without fuss.}} *) 25 | 26 | (** String search and replace functions. *) 27 | 28 | (** Implement API with two way string search *) 29 | (* See https://doi.org/10.1145/116825.116845 or 30 | http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 *) 31 | 32 | open String 33 | 34 | 35 | let invalid_start ~start len = 36 | let i = Int.to_string in 37 | invalid_arg @@ concat "" ["start: "; i start; "not in range [0;"; i len; "]"] 38 | 39 | let find_maximal_suffix_and_period ~sub = 40 | let sublen = length sub in 41 | let i = ref (-1) and j = ref 0 and k = ref 1 and p = ref 1 in 42 | let[@inline] maximal_suffix ~order = 43 | while (!j + !k < sublen) do 44 | let c = order * Char.compare (get sub (!j + !k)) (get sub (!i + !k)) in 45 | if c < 0 then (j := !j + !k; k := 1; p := !j - !i) else 46 | if c > 0 then (i := !j; j := !i + 1; k := 1; p := 1) else (* c = 0 *) 47 | if !k = !p then (j := !j + !p; k := 1) else incr k 48 | done; 49 | in 50 | (maximal_suffix[@inlined]) ~order:1; 51 | let l0 = !i and p0 = !p in 52 | i := -1; j := 0; k := 1; p := 1; 53 | (maximal_suffix[@inlined]) ~order:(-1); 54 | let l1 = !i and p1 = !p in 55 | if l0 > l1 then (l0, p0) else (l1, p1) 56 | 57 | let periodic_sub ~sub ~sub_lp:(l, p) = 58 | let i = ref 0 in 59 | while !i <= l && Char.equal (get sub !i) (get sub (!i + p)) 60 | do incr i done; 61 | !i > l 62 | 63 | let primitive_find_sub ~start ~sub ~sub_lp:(l, p as sub_lp) s = 64 | let slen = length s in 65 | if not (0 <= start && start <= slen) then invalid_start ~start slen else 66 | let sublen = length sub in 67 | let smax = slen - sublen in 68 | let j = ref start in 69 | try 70 | if periodic_sub ~sub ~sub_lp then begin 71 | let memory = ref (-1) in 72 | while (!j <= smax) do 73 | let i = ref (1 + Int.max l !memory) in 74 | while (!i < sublen && Char.equal (get sub !i) (get s (!i + !j))) 75 | do incr i done; 76 | if !i < sublen then (j := !j + (!i - l); memory := -1) else 77 | begin 78 | i := l; 79 | while (!i > !memory && Char.equal (get sub !i) (get s (!i + !j))) 80 | do decr i done; 81 | if !i <= !memory then raise_notrace Exit else 82 | (j := !j + p; memory := sublen - p - 1) 83 | end 84 | done; 85 | -1 86 | end else begin 87 | let p = 1 + Int.max (l + 1) (sublen - l - 1) in 88 | while (!j <= smax) do 89 | let i = ref (l + 1) in 90 | while (!i < sublen && Char.equal (get sub !i) (get s (!i + !j))) 91 | do incr i done; 92 | if !i < sublen then (j := !j + (!i - l)) else 93 | begin 94 | i := l; 95 | while (!i >= 0 && Char.equal (get sub !i) (get s (!i + !j))) 96 | do decr i done; 97 | if !i < 0 then raise_notrace Exit else (j := !j + p) 98 | end 99 | done; 100 | -1 101 | end 102 | with Exit -> !j 103 | 104 | let primitive_rfind_sub ~start ~sub ~sub_lp:(l, p as sub_lp) s = 105 | (* Note this is the same as above except for the assignement 106 | and test logic on [j] where we move from right to left. *) 107 | let slen = length s in 108 | if not (0 <= start && start <= slen) then invalid_start ~start slen else 109 | let sublen = length sub in 110 | let smax = slen - sublen in 111 | let j = ref (if start > smax then smax else start) in 112 | try 113 | if periodic_sub ~sub ~sub_lp then begin 114 | let memory = ref (-1) in 115 | while (!j >= 0) do 116 | let i = ref (1 + Int.max l !memory) in 117 | while (!i < sublen && Char.equal (get sub !i) (get s (!i + !j))) 118 | do incr i done; 119 | if !i < sublen then (j := !j - (!i - l); memory := -1) else 120 | begin 121 | i := l; 122 | while (!i > !memory && Char.equal (get sub !i) (get s (!i + !j))) 123 | do decr i done; 124 | if !i <= !memory then raise_notrace Exit else 125 | (j := !j - p; memory := sublen - p - 1) 126 | end 127 | done; 128 | -1 129 | end else begin 130 | let p = 1 + Int.max (l + 1) (sublen - l - 1) in 131 | while (!j >= 0) do 132 | let i = ref (l + 1) in 133 | while (!i < sublen && Char.equal (get sub !i) (get s (!i + !j))) 134 | do incr i done; 135 | if !i < sublen then (j := !j - (!i - l)) else 136 | begin 137 | i := l; 138 | while (!i >= 0 && Char.equal (get sub !i) (get s (!i + !j))) 139 | do decr i done; 140 | if !i < 0 then raise_notrace Exit else (j := !j - p) 141 | end 142 | done; 143 | -1 144 | end 145 | with Exit -> !j 146 | 147 | let includes ~affix:sub s = 148 | let sub_lp = find_maximal_suffix_and_period ~sub in 149 | primitive_find_sub ~start:0 ~sub ~sub_lp s <> -1 150 | 151 | let find_sub ?(start = 0) ~sub s = 152 | let sub_lp = find_maximal_suffix_and_period ~sub in 153 | match primitive_find_sub ~start ~sub_lp ~sub s with 154 | | -1 -> None | i -> Some i 155 | 156 | let rfind_sub ?start ~sub s = 157 | let start = match start with None -> length s | Some s -> s in 158 | let sub_lp = find_maximal_suffix_and_period ~sub in 159 | match primitive_rfind_sub ~start ~sub_lp ~sub s with 160 | | -1 -> None | i -> Some i 161 | 162 | let find_all_sub ?(start = 0) f ~sub s acc = 163 | let rec loop f acc sub sub_lp s ~start ~slen = 164 | if start > slen then acc else 165 | match primitive_find_sub ~start ~sub ~sub_lp s with 166 | | -1 -> acc 167 | | i -> 168 | let acc = f i acc in 169 | let start = i + length sub in 170 | let start = if start = i then start + 1 else start in 171 | loop f acc sub sub_lp s ~start ~slen 172 | in 173 | let slen = length s in 174 | if not (0 <= start && start <= slen) then invalid_start ~start slen else 175 | let sub_lp = find_maximal_suffix_and_period ~sub in 176 | loop f acc sub sub_lp s ~start ~slen 177 | 178 | let rfind_all_sub ?start f ~sub s acc = 179 | let rec loop f acc sub sub_lp s ~start ~slen = 180 | if start < 0 then acc else 181 | match primitive_rfind_sub ~start ~sub ~sub_lp s with 182 | | -1 -> acc 183 | | i -> 184 | let start = i - Int.max (length sub) 1 in 185 | loop f (f i acc) sub sub_lp s ~start ~slen 186 | in 187 | let slen = length s in 188 | let start = match start with None -> length s | Some s -> s in 189 | if not (0 <= start && start <= slen) then invalid_start ~start slen else 190 | let sub_lp = find_maximal_suffix_and_period ~sub in 191 | loop f acc sub sub_lp s ~start ~slen 192 | 193 | let replace_first ?(start = 0) ~sub:needle ~by s = 194 | let sub_lp = find_maximal_suffix_and_period ~sub:needle in 195 | match primitive_find_sub ~start ~sub:needle ~sub_lp s with 196 | | -1 -> s 197 | | i -> 198 | let rest_first = i + length needle in 199 | let rest_len = length s - i - length needle in 200 | concat by [sub s 0 i; sub s rest_first rest_len] 201 | 202 | let replace_all ?start ~sub:needle ~by s = 203 | let chunk_first = ref 0 in 204 | let add_chunk i acc = 205 | let acc = sub s !chunk_first (i - !chunk_first) :: acc in 206 | chunk_first := i + length needle; acc 207 | in 208 | match find_all_sub ?start add_chunk ~sub:needle s [] with 209 | | [] -> s 210 | | chunks -> 211 | let chunks = sub s !chunk_first (length s - !chunk_first) :: chunks in 212 | concat by (List.rev chunks) 213 | --------------------------------------------------------------------------------