├── lib ├── internal │ ├── dune │ ├── NFA.ml │ ├── LevBVNFA.ml │ ├── bitVec.ml │ ├── stringOps.ml │ ├── DemarauBVNFA.ml │ ├── matcher.ml │ ├── LevNFA.ml │ ├── DemarauNFA.ml │ └── DFA.ml ├── dune ├── strings.ml ├── match.ml ├── strings.mli ├── index.mld └── match.mli ├── test ├── dune ├── lev_tests.ml └── dem_tests.ml ├── dune-workspace.dev ├── .gitignore ├── dune-project ├── CHANGES.md ├── mula.opam ├── README.md └── LICENSE /lib/internal/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name internal) 3 | (public_name mula.internal) 4 | ) 5 | -------------------------------------------------------------------------------- /test/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name mula_tests) 3 | (inline_tests) 4 | (libraries mula) 5 | (preprocess (pps ppx_inline_test))) 6 | -------------------------------------------------------------------------------- /lib/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name mula) 3 | (public_name mula) 4 | (libraries mula.internal) 5 | ) 6 | (documentation (package mula)) 7 | -------------------------------------------------------------------------------- /dune-workspace.dev: -------------------------------------------------------------------------------- 1 | (lang dune 2.8) 2 | (context default) 3 | (context (opam (switch 4.08.1))) 4 | (context (opam (switch 4.09.1))) 5 | (context (opam (switch 4.10.2))) 6 | (context (opam (switch 4.11.2))) 7 | (context (opam (switch 4.12.0))) 8 | -------------------------------------------------------------------------------- /lib/strings.ml: -------------------------------------------------------------------------------- 1 | module UsualString : Match.S with type ch = char and type t = string = struct 2 | type ch = char 3 | type t = string 4 | 5 | let length = String.length 6 | let get = String.get 7 | 8 | let equal = Char.equal 9 | end 10 | 11 | include Match.Make(UsualString) 12 | -------------------------------------------------------------------------------- /lib/match.ml: -------------------------------------------------------------------------------- 1 | open Internal 2 | 3 | module type S = sig 4 | type ch 5 | type t 6 | 7 | val length : t -> int 8 | val get : t -> int -> ch 9 | val equal : ch -> ch -> bool 10 | end 11 | 12 | module Make (St : S) = struct 13 | module Lev = Matcher.Make (St) (LevBVNFA) 14 | module Dem = Matcher.Make (St) (DemarauBVNFA) 15 | end 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.annot 2 | *.cmo 3 | *.cma 4 | *.cmi 5 | *.a 6 | *.o 7 | *.cmx 8 | *.cmxs 9 | *.cmxa 10 | 11 | # ocamlbuild working directory 12 | _build/ 13 | 14 | # ocamlbuild targets 15 | *.byte 16 | *.native 17 | 18 | # oasis generated files 19 | setup.data 20 | setup.log 21 | 22 | # Merlin configuring file for Vim and Emacs 23 | .merlin 24 | 25 | # Dune generated files 26 | *.install 27 | 28 | # Local OPAM switch 29 | _opam/ 30 | _build 31 | -------------------------------------------------------------------------------- /lib/internal/NFA.ml: -------------------------------------------------------------------------------- 1 | module type NFA_t = sig 2 | module State : sig 3 | type t 4 | val compare : t -> t -> int 5 | end 6 | module StateSet : sig 7 | include Set.S with type elt = State.t 8 | 9 | val start : t 10 | val err : t 11 | 12 | val pp_comma : Format.formatter -> unit -> unit 13 | val pp_states : Format.formatter -> t -> unit 14 | end 15 | module Transitions : sig 16 | val all_transitions : StateSet.t -> BitVec.t -> k:int -> StateSet.t 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 2.8) 2 | (name mula) 3 | 4 | (generate_opam_files true) 5 | (license CC0-1.0) 6 | (authors "Ifaz Kabir") 7 | (maintainers "Ifaz Kabir") 8 | (source (github ifazk/mula)) 9 | (documentation https://ifazk.github.io/mula/) 10 | (version 0.1.2) 11 | 12 | (package 13 | (name mula) 14 | (synopsis "ML's Universal Levenshtein Automata library") 15 | (description 16 | "\| ML's radishal Universal Levenshtein Automata library. 17 | "\| This allows checking if a string is within a given edit 18 | "\| distance of another string. 19 | "\| The library supports both the Levenshtein distance and 20 | "\| the Demarau-Levenshtein. 21 | "\| 22 | "\| The library can be used for fuzzy string searching or 23 | "\| approximate string matching, but does not provide these 24 | "\| facilities out of the box. 25 | ) 26 | (tags ("levenshtein" "demarau-levenshtein" "automaton" "typo" "edit" "distance" "approximate" "fuzzy" "string" "search" "matching")) 27 | (depends (ocaml (>= 4.08.1)) (ppx_inline_test :with-test))) 28 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## 0.1.2 - 2021-12-10 8 | ### Added 9 | - Main page for documentation now explain basic concepts and functionality and 10 | contains examples of using the library. 11 | 12 | ### Changed 13 | - Optimizations 14 | - Now using an NFA that uses bitwise operations for transitions and requires less branching. 15 | 16 | ## 0.1.1 - 2021-06-23 17 | ### Added 18 | - `ppx_inline_test` is now a test dependency, instead of a full dependency. 19 | - Support OCaml > 4.08.1. 20 | - Documentation and internal updates. 21 | 22 | ### Changed 23 | - Optimizations 24 | - Improved subsumption for Demarau-Levenshtein 25 | - Early cutoff based on size difference for `Make.*.get_distance`. 26 | - Bit fiddling optimizations: `snoc_ones`, `snoc_zeros`. 27 | 28 | ## 0.1.0 - 2021-06-20 29 | 30 | Initial release. 31 | -------------------------------------------------------------------------------- /mula.opam: -------------------------------------------------------------------------------- 1 | # This file is generated by dune, edit dune-project instead 2 | opam-version: "2.0" 3 | version: "0.1.2" 4 | synopsis: "ML's Universal Levenshtein Automata library" 5 | description: """ 6 | ML's radishal Universal Levenshtein Automata library. 7 | This allows checking if a string is within a given edit 8 | distance of another string. 9 | The library supports both the Levenshtein distance and 10 | the Demarau-Levenshtein. 11 | 12 | The library can be used for fuzzy string searching or 13 | approximate string matching, but does not provide these 14 | facilities out of the box. 15 | """ 16 | maintainer: ["Ifaz Kabir"] 17 | authors: ["Ifaz Kabir"] 18 | license: "CC0-1.0" 19 | tags: [ 20 | "levenshtein" 21 | "demarau-levenshtein" 22 | "automaton" 23 | "typo" 24 | "edit" 25 | "distance" 26 | "approximate" 27 | "fuzzy" 28 | "string" 29 | "search" 30 | "matching" 31 | ] 32 | homepage: "https://github.com/ifazk/mula" 33 | doc: "https://ifazk.github.io/mula/" 34 | bug-reports: "https://github.com/ifazk/mula/issues" 35 | depends: [ 36 | "dune" {>= "2.8"} 37 | "ocaml" {>= "4.08.1"} 38 | "ppx_inline_test" {with-test} 39 | "odoc" {with-doc} 40 | ] 41 | build: [ 42 | ["dune" "subst"] {dev} 43 | [ 44 | "dune" 45 | "build" 46 | "-p" 47 | name 48 | "-j" 49 | jobs 50 | "@install" 51 | "@runtest" {with-test} 52 | "@doc" {with-doc} 53 | ] 54 | ] 55 | dev-repo: "git+https://github.com/ifazk/mula.git" 56 | -------------------------------------------------------------------------------- /lib/internal/LevBVNFA.ml: -------------------------------------------------------------------------------- 1 | module BV = BitVec 2 | 3 | type states = States of (BV.t array) [@@unboxed] 4 | 5 | module StateSet = struct 6 | type t = states 7 | 8 | let start ~k : t = 9 | let arr = Array.make (k + 1) BV.zero in 10 | let init = BV.snoc_zeros ~m:k (BV.one) in 11 | arr.(0) <- init; 12 | States arr 13 | 14 | let _find_index ~f arr : int option = 15 | let rec find_index f arr n len = 16 | if n < len then 17 | begin if f arr.(n) then 18 | Some n 19 | else 20 | find_index f arr (n + 1) len 21 | end 22 | else 23 | None 24 | in 25 | find_index f arr 0 (Array.length arr) 26 | 27 | let min_cost_opt (States arr) : int option = 28 | _find_index ~f:(fun bv -> BV.non_zero bv) arr 29 | 30 | end 31 | 32 | module Transitions = struct 33 | 34 | let all_transitions (States input : StateSet.t) bv ~k : StateSet.t = 35 | let output = Array.make (k + 1) BV.zero in 36 | let del_mask = ref BV.zero in 37 | let prev = ref BV.zero in 38 | for i = 0 to k do 39 | let prev_bv = !prev in 40 | let ins_subs = (BV.logor (BV.shift_left prev_bv 1) prev_bv) in 41 | let dels = BV.logand bv !del_mask in 42 | let prev_transitions = BV.logor ins_subs dels in 43 | let self_transitions = BV.logand bv input.(i) in 44 | let transitions = BV.logor prev_transitions self_transitions in 45 | del_mask := BV.logor (BV.shift_right_logical input.(i) 1) (BV.shift_right_logical !del_mask 1); 46 | prev := input.(i); 47 | output.(i) <- transitions 48 | done; 49 | States output 50 | 51 | end 52 | -------------------------------------------------------------------------------- /lib/internal/bitVec.ml: -------------------------------------------------------------------------------- 1 | type t = Bits of int [@@unboxed] 2 | 3 | let[@inline] compare (Bits x) (Bits y) = 4 | Int.compare x y 5 | 6 | let get k (Bits n) = 7 | let two_to_k = Int.shift_left Int.one (k - 1) in 8 | if Int.equal (Int.logand two_to_k n) 0 then 9 | false 10 | else 11 | true 12 | 13 | let get_lane ~lane:l ~k bv = 14 | get (k + 1 - l) bv 15 | 16 | let get_right_of_lane ~lane:l ~k ~m bv = 17 | get_lane ~lane:(l + m) ~k bv 18 | 19 | let get_left_of_lane ~lane:l ~k ~m bv = 20 | get_lane ~lane:(l - m) ~k bv 21 | 22 | let rec pos_fold ~f ~init n = 23 | match n with 24 | | 0 -> init 25 | | n -> pos_fold ~f ~init:(f n init) (n - 1) 26 | 27 | let snoc_one (Bits n) = 28 | let n' = Int.logor Int.one (Int.shift_left n 1) in 29 | Bits n' 30 | 31 | let snoc_ones (Bits n) ~m = 32 | if m >= Sys.int_size then 33 | Bits (Int.minus_one) 34 | else 35 | let one_m_zeros = Int.shift_left Int.one m in 36 | let m_ones = one_m_zeros - 1 in 37 | let n' = Int.logor m_ones (Int.shift_left n m) in 38 | Bits n' 39 | 40 | let ones ~m = 41 | snoc_ones (Bits Int.zero) ~m 42 | 43 | let snoc_zero (Bits n) = 44 | Bits (Int.shift_left n 1) 45 | 46 | let snoc_zeros (Bits n) ~m = 47 | if m >= Sys.int_size then 48 | Bits (Int.zero) 49 | else 50 | let n' = Int.shift_left n m in 51 | Bits n' 52 | 53 | let zero = (Bits Int.zero) 54 | 55 | let one = (Bits Int.one) 56 | 57 | let non_zero (Bits x) = not (Int.equal Int.zero x) 58 | 59 | let logor (Bits x) (Bits y) = Bits (Int.logor x y) 60 | 61 | let logand (Bits x) (Bits y) = Bits (Int.logand x y) 62 | 63 | let shift_right_logical (Bits x) n = Bits (Int.shift_right_logical x n) 64 | 65 | let shift_left (Bits x) n = Bits (Int.shift_left x n) 66 | 67 | let pp_bv ppf (Bits n)= 68 | Format.fprintf ppf "%o" n 69 | -------------------------------------------------------------------------------- /lib/internal/stringOps.ml: -------------------------------------------------------------------------------- 1 | module type S = sig 2 | type ch 3 | type t 4 | 5 | val length : t -> int 6 | val get : t -> int -> ch 7 | val equal : ch -> ch -> bool 8 | end 9 | 10 | module BitVecOps (St : S) = struct 11 | 12 | (* note that St.get is zero-indexed, so we define a one-indexed version of St.get *) 13 | let get xs n = St.get xs (n-1) 14 | 15 | (* assumes 1 <= index <= str_len + k *) 16 | let sizes k index str_len = 17 | let prefix = 18 | if index > k then 0 else (k + 1 - index) 19 | in 20 | let start = max 1 (index - k) in 21 | let end_ = min (index + k) str_len in 22 | let suffix = 23 | if index + k <= str_len then 0 else index + k - str_len 24 | in 25 | (prefix,start,end_,suffix) 26 | 27 | (* assumes 1 <= index <= str_len + k *) 28 | let bit_vec_of ch str ~index ~k = 29 | let (_pre_size,start,end_,suf_size) = 30 | sizes k index (St.length str) 31 | in 32 | let prefix_bv = BitVec.zero in 33 | let index_bv_len = (end_ + 1) - start in (* argh one indexing! *) 34 | (* 2 - 4. start = 2, end = 4, len = 3, n = 3,2,1... (end + 1)-n = 2,3,4 *) 35 | let index_bv = 36 | BitVec.pos_fold 37 | ~f:(fun n bv -> if St.equal (get str (end_ + 1 - n)) ch then BitVec.snoc_one bv else BitVec.snoc_zero bv ) 38 | ~init:prefix_bv 39 | index_bv_len 40 | in 41 | let suffix_bv = 42 | BitVec.snoc_zeros index_bv ~m:suf_size 43 | in 44 | suffix_bv 45 | 46 | (* assumes str_len < index <= str_len + k *) 47 | let bit_vec_of_sentinel ~str_len ~index ~k = 48 | let (pre_size,start,end_,suf_size) = 49 | sizes k index str_len 50 | in 51 | let prefix_bv = BitVec.snoc_ones BitVec.zero ~m:pre_size in 52 | let index_bv_len = (end_ + 1) - start in (* argh one indexing! *) 53 | (* 2 - 4. start = 2, end = 4, len = 3, n = 3,2,1... (end + 1)-n = 2,3,4 *) 54 | let index_bv = BitVec.snoc_zeros prefix_bv ~m:index_bv_len in 55 | let suffix_bv = 56 | BitVec.snoc_ones index_bv ~m:suf_size 57 | in 58 | suffix_bv 59 | end 60 | -------------------------------------------------------------------------------- /lib/internal/DemarauBVNFA.ml: -------------------------------------------------------------------------------- 1 | module BV = BitVec 2 | 3 | type states = States of (BV.t array) * (BV.t array) 4 | 5 | module StateSet = struct 6 | type t = states 7 | 8 | let start ~k : t = 9 | let arr = Array.make (k + 1) BV.zero in 10 | (* Using k + 1 just to keep things simple *) 11 | let trans = Array.make (k + 1) BV.zero in 12 | let init = BV.snoc_zeros ~m:k (BV.one) in 13 | arr.(0) <- init; 14 | States (arr, trans) 15 | 16 | let _find_index ~f arr : int option = 17 | let rec find_index f arr n len = 18 | if n < len then 19 | begin if f arr.(n) then 20 | Some n 21 | else 22 | find_index f arr (n + 1) len 23 | end 24 | else 25 | None 26 | in 27 | find_index f arr 0 (Array.length arr) 28 | 29 | let min_cost_opt (States (arr,_)) : int option = 30 | _find_index ~f:(fun bv -> BV.non_zero bv) arr 31 | 32 | 33 | let pp_states ppf (States (arr,trans)) = 34 | Format.fprintf ppf 35 | "@[states@ @[%a@]@ transpose @[%a@]@]" 36 | (Format.pp_print_list ~pp_sep:(fun ppf () -> Format.pp_print_char ppf '|') BV.pp_bv) (Array.to_list arr) 37 | (Format.pp_print_list ~pp_sep:(fun ppf () -> Format.pp_print_char ppf '|') BV.pp_bv) (Array.to_list trans) 38 | end 39 | 40 | module Transitions = struct 41 | 42 | let all_transitions (States (input, trans) : StateSet.t) bv ~k : StateSet.t = 43 | let output = Array.make (k + 1) BV.zero in 44 | let out_trans = Array.make (k + 1) BV.zero in 45 | let del_mask = ref BV.zero in 46 | let prev = ref BV.zero in 47 | for i = 0 to k do 48 | let prev_bv = !prev in 49 | let ins_subs = (BV.logor (BV.shift_left prev_bv 1) prev_bv) in 50 | let dels = BV.logand bv !del_mask in 51 | let transpose_transitions = BV.shift_right_logical (BV.logand bv trans.(i)) 1 in 52 | let prev_transitions = BV.logor ins_subs dels in 53 | let self_transitions = BV.logand bv input.(i) in 54 | let transitions = BV.logor (BV.logor prev_transitions self_transitions) transpose_transitions in 55 | let transpose_intermediate = BV.shift_left (BV.logand bv (BV.shift_right_logical prev_bv 1)) 2 in 56 | del_mask := BV.logor (BV.shift_right_logical input.(i) 1) (BV.shift_right_logical !del_mask 1); 57 | prev := input.(i); 58 | out_trans.(i) <- transpose_intermediate; 59 | output.(i) <- transitions 60 | done; 61 | States (output, out_trans) 62 | 63 | end 64 | -------------------------------------------------------------------------------- /lib/internal/matcher.ml: -------------------------------------------------------------------------------- 1 | module type S = sig 2 | type ch 3 | type t 4 | 5 | val length : t -> int 6 | val get : t -> int -> ch 7 | val equal : ch -> ch -> bool 8 | end 9 | 10 | module type NFA_t = sig 11 | module StateSet : sig 12 | type t 13 | 14 | val start : k:int -> t 15 | 16 | val min_cost_opt : t -> int option 17 | end 18 | module Transitions : sig 19 | val all_transitions : StateSet.t -> BitVec.t -> k:int -> StateSet.t 20 | end 21 | end 22 | 23 | module Make (St : S) (NFA : NFA_t) = struct 24 | 25 | module GBV = StringOps.BitVecOps (St) 26 | 27 | type nfa_state = {nfa : NFA.StateSet.t option; k : int; str: St.t; str_len: int; fed_so_far: int} 28 | 29 | let start ~k ~str = 30 | if (k < 0) then 31 | failwith "the limit k cannot be negative" 32 | else if (k > ((Sys.int_size - 1) / 2)) then 33 | failwith "the limit k cannot be larger than ((int_size - 1) / 2)" 34 | else 35 | {nfa = Some (NFA.StateSet.start ~k); k; str; str_len = St.length str; fed_so_far = 0 } 36 | 37 | let feed {nfa;k;str;str_len;fed_so_far} ~ch = 38 | let index = fed_so_far + 1 in 39 | if Option.is_none nfa 40 | || index > str_len + k then 41 | {nfa = None; k; str; str_len; fed_so_far = index} 42 | else 43 | let bv = GBV.bit_vec_of ch str ~index ~k in 44 | let nfa = NFA.Transitions.all_transitions (Option.get nfa) bv ~k in 45 | {nfa = Some nfa;k;str;str_len;fed_so_far = index} 46 | 47 | let current_error {nfa;_} : int option = 48 | match nfa with 49 | | None -> None 50 | | Some nfa -> NFA.StateSet.min_cost_opt nfa 51 | 52 | let end_input {nfa;k;str=_;str_len;fed_so_far} : int option = 53 | let size_diff = str_len - fed_so_far in 54 | if Option.is_none nfa (* handles over feeding *) 55 | || size_diff > k then (* handle under feeding *) 56 | None 57 | else 58 | (* add (str_len - fed_so_far + k) many sentinels *) 59 | let sentinels = str_len - fed_so_far + k in 60 | let nfa = 61 | BitVec.pos_fold 62 | ~f:(fun n nfa -> 63 | let bv = GBV.bit_vec_of_sentinel ~str_len ~index:(fed_so_far + 1 + sentinels - n) ~k in 64 | NFA.Transitions.all_transitions nfa bv ~k) 65 | ~init:(Option.get nfa) 66 | sentinels 67 | in 68 | NFA.StateSet.min_cost_opt nfa 69 | 70 | let feed_str nfa_state ~str = 71 | (* TODO early exit *) 72 | let len = St.length str in 73 | BitVec.pos_fold 74 | ~f:(fun n nfa -> feed nfa ~ch:(St.get str (len - n))) 75 | ~init:nfa_state 76 | len 77 | 78 | let get_distance ~k str1 str2 = 79 | let len_diff = 80 | let len1 = St.length str1 in 81 | let len2 = St.length str2 in 82 | Int.abs (len1 - len2) 83 | in 84 | if len_diff > k then 85 | None 86 | else 87 | let start = start ~k ~str:str1 in 88 | let end_ = feed_str start ~str:str2 in 89 | let cost = end_input end_ in 90 | cost 91 | end 92 | -------------------------------------------------------------------------------- /test/lev_tests.ml: -------------------------------------------------------------------------------- 1 | open Mula.Strings.Lev 2 | 3 | (* Lev tests *) 4 | let%test "simple delete abc ab" = 5 | get_distance ~k:2 "abc" "ab" = Some 1 6 | 7 | let%test "swap (insert + delete) abc-acb" = 8 | get_distance ~k:2 "abc" "acb" = Some 2 9 | 10 | let%test "simple insert abc-acbc" = 11 | get_distance ~k:2 "abc" "acbc" = Some 1 12 | 13 | let%test "simple substitute abc-acc" = 14 | get_distance ~k:2 "abc" "acc" = Some 1 15 | 16 | let%test "simple substitute insert abc-accc" = 17 | get_distance ~k:2 "abc" "accc" = Some 2 18 | 19 | let%test "large diff abc-def" = 20 | get_distance ~k:2 "abc" "def" = None 21 | 22 | let%test "large diff abcdef-g" = 23 | get_distance ~k:2 "abcdef" "g" = None 24 | 25 | let%test "large diff abcdef-a" = 26 | get_distance ~k:2 "abcdef" "a" = None 27 | 28 | let%test "large diff abcdef-ab" = 29 | get_distance ~k:2 "abcdef" "ab" = None 30 | 31 | let%test "large diff abcdef-abc" = 32 | get_distance ~k:2 "abcdef" "abc" = None 33 | 34 | let%test "large strings abcdef-abcd" = 35 | get_distance ~k:2 "abcdef" "abcd" = Some 2 36 | 37 | let%test "large strings abcdef-abcde" = 38 | get_distance ~k:1 "abcdef" "abcde" = Some 1 39 | 40 | let%test "current distance should be zero abcdef-abcde" = 41 | let nfa = start ~k:2 ~str:"abcdef" in 42 | (feed_str ~str:"abcde" nfa |> current_error) 43 | = Some 0 44 | 45 | let%test "current distance should be zero abcdef-abcde" = 46 | let nfa = start ~k:2 ~str:"abcdef" in 47 | (feed_str ~str:"abcde" nfa |> end_input) 48 | = Some 1 49 | 50 | let%test "current distance should be zero abcdef-ab" = 51 | let nfa = start ~k:2 ~str:"abcdef" in 52 | (feed_str ~str:"ab" nfa |> current_error) 53 | = Some 0 54 | 55 | let%test "short input abcdef-ab" = 56 | let nfa = start ~k:2 ~str:"abcdef" in 57 | (feed_str ~str:"ab" nfa |> end_input) 58 | = None 59 | 60 | let%test "current distance should be one abcdef-abd" = 61 | let nfa = start ~k:2 ~str:"abcdef" in 62 | (feed_str ~str:"abd" nfa |> current_error) 63 | = Some 1 64 | 65 | (* empty *) 66 | let%test "empty-empty" = 67 | get_distance ~k:1 "" "" = Some 0 68 | 69 | let%test "empty-a" = 70 | get_distance ~k:1 "" "a" = Some 1 71 | 72 | let%test "a-empty" = 73 | get_distance ~k:1 "a" "" = Some 1 74 | 75 | let%test "ab-empty" = 76 | get_distance ~k:1 "ab" "" = None 77 | 78 | let%test "empty-ab" = 79 | get_distance ~k:1 "" "ab" = None 80 | 81 | (* k = 0 *) 82 | let%test "k=0 abc-ab" = 83 | get_distance ~k:0 "abc" "ab" = None 84 | 85 | let%test "k=0 abc-abc" = 86 | get_distance ~k:0 "abc" "abc" = Some 0 87 | 88 | let%test "k=0 empty-empty" = 89 | get_distance ~k:0 "" "" = Some 0 90 | 91 | let%test "k=0 empty-a" = 92 | get_distance ~k:0 "" "a" = None 93 | 94 | let%test "k=0 a-empty" = 95 | get_distance ~k:0 "a" "" = None 96 | 97 | let%test "delete match match insert match" = 98 | get_distance ~k:4 "abdc" "bdac" 99 | = Some 2 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mula 2 | ML's radishal library for matching with Universal Levenshtein Automata. 3 | 4 | This library not only computes if strings are within a certain edit distance, 5 | but also computes what the edit distance is. 6 | 7 | You can find documentation for the library [here](https://ifazk.github.io/mula/). 8 | 9 | We support both the standard Levenshtein distance as well as the 10 | Demarau-Levenshtein distance which includes transpositions of two adjacent 11 | characters as a primitive edit operation. 12 | 13 | We can also lazily feed characters into automata and get the current edit 14 | distance. 15 | 16 | For OCaml strings, we offer the `Mula.Strings` module which contains submodules 17 | `Lev` for the standard Levenshtein distance and `Dem` for the (restricted) 18 | Demarau-Levenshtein distance. 19 | 20 | Examples of use: 21 | ```ocaml 22 | # #require "mula";; 23 | # Mula.Strings.Lev.get_distance ~k:2 "abcd" "abdc";; 24 | - : int option = Some 2 25 | # Mula.Strings.Dem.get_distance ~k:2 "abcd" "abdc";; 26 | - : int option = Some 1 27 | # Mula.Strings.Lev.get_distance ~k:2 "abcd" "efgh";; 28 | - : int option = None 29 | ``` 30 | 31 | We can also lazily feed characters and strings into an `nfa` and get live error counts: 32 | ```ocaml 33 | # let lev_nfa = Mula.Strings.Lev.start ~k:2 ~str:"abcd";; 34 | val lev_nfa : Mula.Strings.Lev.nfa_state = 35 | # Mula.Strings.Lev.(feed_str lev_nfa ~str:"ab" |> current_error);; 36 | - : int option = Some 0 37 | # Mula.Strings.Lev.(feed lev_nfa ~ch:'a' |> feed ~ch:'b' |> feed ~ch:'c' |> current_error);; 38 | - : int option = Some 0 39 | # Mula.Strings.Lev.(feed_str lev_nfa ~str:"abd" |> current_error);; 40 | - : int option = Some 1 41 | # Mula.Strings.Lev.(feed_str lev_nfa ~str:"ab" |> feed_str ~str:"dc" |> current_error);; (* counts 'd' as an insert edit *) 42 | - : int option = Some 1 43 | # Mula.Strings.Lev.(feed_str lev_nfa ~str:"ab" |> feed_str ~str:"dc" |> end_input);; 44 | - : int option = Some 2 45 | ``` 46 | 47 | `Mula` also offers a functor if you want to use your own representations of 48 | strings: 49 | ```ocaml 50 | # #require "mula";; 51 | # module St = struct 52 | type ch = int 53 | type t = int array 54 | 55 | let length = Array.length 56 | let get = Array.get 57 | 58 | let equal = Int.equal 59 | end;; 60 | module St : 61 | sig 62 | ... 63 | end 64 | # module M = Mula.Match.Make(St);; 65 | module M : 66 | sig 67 | module Lev : 68 | sig 69 | type nfa_state = Mula.Match.Make(St).Lev.nfa_state 70 | val start : k:int -> str:St.t -> nfa_state 71 | val feed : nfa_state -> ch:int -> nfa_state 72 | val current_error : nfa_state -> int option 73 | val end_input : nfa_state -> int option 74 | val feed_str : nfa_state -> str:St.t -> nfa_state 75 | val get_distance : k:int -> St.t -> St.t -> int option 76 | end 77 | module Dem : 78 | sig 79 | ... 80 | end 81 | end 82 | ``` 83 | 84 | ### About the Name 85 | মুলা (mula/moola) means radish in the author's first language. 86 | -------------------------------------------------------------------------------- /lib/internal/LevNFA.ml: -------------------------------------------------------------------------------- 1 | type lane = Lane of int [@@unboxed] 2 | type errors = Err of int [@@unboxed] 3 | type cost = Cost of int [@@unboxed] 4 | 5 | module State = struct 6 | type t = lane * errors 7 | let compare ((Lane l1, Err e1) : t) ((Lane l2, Err e2) : t) = 8 | match Int.compare l1 l2 with 9 | | 0 -> Int.compare e1 e2 10 | | diff -> diff 11 | 12 | let pp_state ppf (Lane l, Err e) = 13 | Format.fprintf ppf "(@[%d, %d@])" l e 14 | end 15 | 16 | module StateSet = struct 17 | include Set.Make(State) 18 | 19 | let[@inline] subsumed_by ((Lane l1, Err e1) : elt) ((Lane l2, Err e2) : elt) = 20 | if e1 < e2 then 21 | (l1 + e1 - e2 <= l2 && l2 <= l1 + e2 - e1) 22 | else 23 | false 24 | 25 | let not_subsumed_by x y = 26 | not (subsumed_by x y) 27 | 28 | let reduce (states : t) : t = 29 | fold (fun elt acc -> filter (not_subsumed_by elt) acc) states states 30 | 31 | let min_cost (states : t) : int = 32 | fold (fun (_,Err e) acc -> min e acc) states Int.max_int 33 | 34 | let min_cost_opt (states : t) : int option = 35 | let min_cost = min_cost states in 36 | if (Int.equal min_cost Int.max_int) then 37 | None 38 | else 39 | Some min_cost 40 | 41 | let start ~k:_ : t = singleton (Lane 0, Err 0) 42 | 43 | let err : t = empty 44 | 45 | let is_err = is_empty 46 | 47 | let print_states s = 48 | to_seq s 49 | |> Seq.iter (fun (Lane l, Err e) -> print_int l; print_string " "; print_int e; print_newline ()) 50 | 51 | let pp_comma ppf () = 52 | Format.fprintf ppf ",@ " 53 | 54 | let pp_states ppf s = 55 | Format.fprintf ppf "{@[%a@]}" 56 | (Format.pp_print_list ~pp_sep:pp_comma State.pp_state) (to_seq s |> List.of_seq) 57 | 58 | end 59 | 60 | module Transitions = struct 61 | 62 | let get_delete ((Lane l, Err e) : State.t) bv ~k : State.t option = 63 | let err_left = k - e in 64 | let rec delete_state err_left err_visit = 65 | if Int.equal err_left 0 then 66 | None 67 | else if BitVec.get_right_of_lane ~lane:l ~k ~m:err_visit bv then 68 | Some (Lane (l + err_visit), Err (e + err_visit)) 69 | else 70 | delete_state (err_left - 1) (err_visit + 1) 71 | in 72 | delete_state err_left 1 73 | 74 | let get_sub_ins ((Lane l, Err e) : State.t) ~k : (State.t * State.t) option = 75 | let err_left = k - e in 76 | if Int.equal err_left 0 then 77 | None 78 | else 79 | Some (((Lane l, Err (e+1)), (Lane (l-1), Err (e+1)))) 80 | 81 | let transitions bv ~k ((Lane l, _) as x : State.t) : StateSet.t = 82 | if BitVec.get_lane ~lane:l ~k bv then 83 | StateSet.singleton x 84 | else 85 | match get_delete x bv ~k, get_sub_ins x ~k with 86 | | None, None -> StateSet.empty 87 | | Some d, None -> StateSet.singleton d 88 | | Some d, Some (s,i) -> StateSet.of_list [d;s;i] 89 | | None, Some (s,i) -> StateSet.of_list [s;i] 90 | 91 | let all_transitions (xs : StateSet.t) bv ~k : StateSet.t = 92 | StateSet.fold (fun elt acc -> StateSet.union (transitions bv ~k elt) acc) xs StateSet.empty 93 | |> StateSet.reduce 94 | 95 | end 96 | -------------------------------------------------------------------------------- /lib/internal/DemarauNFA.ml: -------------------------------------------------------------------------------- 1 | type cost = Cost of int [@@unboxed] 2 | 3 | module State = struct 4 | type t = 5 | | Std of {lane: int; error: int} 6 | | Trans of {lane: int; error: int} 7 | let compare (s1 : t) s2 = 8 | Stdlib.compare s1 s2 9 | 10 | let pp_state ppf = function 11 | | Std {lane; error} -> Format.fprintf ppf "(@[%d, %d@])" lane error 12 | | Trans {lane; error} -> Format.fprintf ppf "(t@[%d, %d@])" lane error 13 | end 14 | 15 | module StateSet = struct 16 | include Set.Make(State) 17 | 18 | let[@inline] subsumed_by s1 s2 = 19 | let open State in 20 | match s1, s2 with 21 | | Std {lane = l1; error = e1}, Std {lane = l2; error = e2} -> 22 | (e1 < e2) && (l1 + e1 - e2 <= l2 && l2 <= l1 + e2 - e1) 23 | | Std {lane = l1; error = e1}, Trans {lane = l2; error = e2} -> 24 | l2 = l1 + 1 && e2 = e1 + 1 25 | | _ -> false 26 | 27 | let not_subsumed_by x y = 28 | not (subsumed_by x y) 29 | 30 | let reduce (states : t) : t = 31 | fold (fun elt acc -> filter (not_subsumed_by elt) acc) states states 32 | 33 | let min_cost (states : t) : int = 34 | fold (fun (Std {lane = _; error = e} | Trans {lane = _; error = e}) acc -> min e acc) states Int.max_int 35 | 36 | let min_cost_opt (states : t) : int option = 37 | let min_cost = min_cost states in 38 | if (Int.equal min_cost Int.max_int) then 39 | None 40 | else 41 | Some min_cost 42 | 43 | let start ~k:_ : t = singleton (State.Std {lane = 0; error = 0}) 44 | 45 | let err : t = empty 46 | 47 | let is_err = is_empty 48 | 49 | let pp_comma ppf () = 50 | Format.fprintf ppf ",@ " 51 | 52 | let pp_states ppf s = 53 | Format.fprintf ppf "{@[%a@]}" 54 | (Format.pp_print_list ~pp_sep:pp_comma State.pp_state) (to_seq s |> List.of_seq) 55 | 56 | end 57 | 58 | module Transitions = struct 59 | 60 | let get_delete_trans s bv ~k : State.t list = 61 | match s with 62 | | State.Std {lane = l; error = e} -> 63 | let err_left = k - e in 64 | let rec delete_state err_left err_visit = 65 | if Int.equal err_left 0 then 66 | [] 67 | else if BitVec.get_right_of_lane ~lane:l ~k ~m:err_visit bv then 68 | let del = (State.Std {lane = (l + err_visit); error = (e + err_visit)}) in 69 | if err_visit = 1 then 70 | (State.Trans {lane = (l - err_visit); error = (e + err_visit)}) :: [del] 71 | else 72 | [del] 73 | else 74 | delete_state (err_left - 1) (err_visit + 1) 75 | in 76 | delete_state err_left 1 77 | | State.Trans _ -> [] 78 | 79 | let get_sub_ins s ~k : State.t list = 80 | match s with 81 | | State.Std {lane = l; error = e} -> 82 | let err_left = k - e in 83 | if Int.equal err_left 0 then 84 | [] 85 | else 86 | [ State.Std {lane = l; error = (e+1)} 87 | ; State.Std {lane = (l-1); error = (e+1)} 88 | ] 89 | | State.Trans _ -> 90 | [] 91 | 92 | let transitions bv ~k s : StateSet.t = 93 | match s with 94 | | State.Std {lane = l; error = _} as x -> 95 | if BitVec.get_lane ~lane:l ~k bv then 96 | StateSet.singleton x 97 | else 98 | StateSet.of_list (List.concat [get_delete_trans s bv ~k;get_sub_ins s ~k]) 99 | | State.Trans {lane; error} -> 100 | if BitVec.get_lane ~lane ~k bv then 101 | StateSet.singleton (State.Std {lane = lane +1; error}) 102 | else 103 | StateSet.empty 104 | 105 | let all_transitions (xs : StateSet.t) bv ~k : StateSet.t = 106 | StateSet.fold (fun elt acc -> StateSet.union (transitions bv ~k elt) acc) xs StateSet.empty 107 | |> StateSet.reduce 108 | 109 | end 110 | -------------------------------------------------------------------------------- /lib/strings.mli: -------------------------------------------------------------------------------- 1 | (** {1 Universal Levenshtein Automata for OCaml strings}*) 2 | 3 | (** We provide two kinds of Automata: 4 | - {!module:Lev}, for the standard Levenshtein distance. 5 | - {!module:Dem}, for the Demarau-Levenshtein distance which 6 | includes transpositions as a primitve edit operation. 7 | *) 8 | 9 | module Lev : sig 10 | 11 | (** Abstract type for the state of the automata. *) 12 | type nfa_state 13 | 14 | (** [start ~k ~str] produces the starting state of the automaton for edit 15 | distance [k]. Here [k] must not be negative and must not be greater 16 | [(Sys.int_size - 1) / 2]. *) 17 | val start : k:int -> str:string -> nfa_state 18 | 19 | (** [feed nfa ch] produces a new state where the automaton has been fed the 20 | character [ch]. *) 21 | val feed : nfa_state -> ch:char -> nfa_state 22 | 23 | (** [current_error nfa] produces [Some n] if the current error count recorded 24 | by the nfa is a number [n], or [None] if the error count is larger than 25 | the limit [k] that the nfa was started with. [current_error nfa] may be 26 | smaller than [end_input nfa], since it does not account for delete 27 | operations at the end of the fed string. *) 28 | val current_error : nfa_state -> int option 29 | 30 | (** [end_input nfa] computes the edit distance between the starting string and 31 | the string fed to [nfa]. It produces [Some n] if the edit distance in a 32 | number [n] that is less than the limit [k] that the automaton was started 33 | with, and [None] otherwise. *) 34 | val end_input : nfa_state -> int option 35 | 36 | (** [feed_str nfa ~str] produces a new state where the automaton has been fed 37 | the characters from the string [str]. *) 38 | val feed_str : nfa_state -> str:string -> nfa_state 39 | 40 | (** [get_distance ~k str1 str2] computes the edit distance between two 41 | strings. It creates an automaton with limit [k] and [str1], and then feeds 42 | it the string [str2], and thet outputs the result of calling [end_input] 43 | on the nfa. *) 44 | val get_distance : k:int -> string -> string -> int option 45 | end 46 | module Dem : sig 47 | type nfa_state 48 | 49 | (** [start ~k ~str] produces the starting state of the automaton for edit 50 | distance [k]. Here [k] must not be negative and must not be greater 51 | [(Sys.int_size - 1) / 2]. *) 52 | val start : k:int -> str:string -> nfa_state 53 | 54 | (** [feed nfa ch] produces a new state where the automaton has been fed the 55 | character [ch]. *) 56 | val feed : nfa_state -> ch:char -> nfa_state 57 | 58 | (** [current_error nfa] produces [Some n] if the current error count recorded 59 | by the nfa is a number [n], or [None] if the error count is larger than 60 | the limit [k] that the nfa was started with. [current_error nfa] may be 61 | smaller than [end_input nfa], since it does not account for delete 62 | operations at the end of the fed string. *) 63 | val current_error : nfa_state -> int option 64 | 65 | (** [end_input nfa] computes the edit distance between the starting string and 66 | the string fed to [nfa]. It produces [Some n] if the edit distance in a 67 | number [n] that is less than the limit [k] that the automaton was started 68 | with, and [None] otherwise. *) 69 | val end_input : nfa_state -> int option 70 | 71 | (** [feed_str nfa ~str] produces a new state where the automaton has been fed 72 | the characters from the string [str]. *) 73 | val feed_str : nfa_state -> str:string -> nfa_state 74 | 75 | (** [get_distance ~k str1 str2] computes the edit distance between two 76 | strings. It creates an automaton with limit [k] and [str1], and then feeds 77 | it the string [str2], and thet outputs the result of calling [end_input] 78 | on the nfa. *) 79 | val get_distance : k:int -> string -> string -> int option 80 | end 81 | -------------------------------------------------------------------------------- /test/dem_tests.ml: -------------------------------------------------------------------------------- 1 | open Mula.Strings.Dem 2 | 3 | (* Dem tests *) 4 | let%test "simple delete abc ab" = 5 | get_distance ~k:2 "abc" "ab" = Some 1 6 | 7 | let%test "swap (insert + delete) abc-acb" = 8 | get_distance ~k:2 "abc" "acb" = Some 1 9 | 10 | let%test "simple insert abc-acbc" = 11 | get_distance ~k:2 "abc" "acbc" = Some 1 12 | 13 | let%test "simple substitute abc-acc" = 14 | get_distance ~k:2 "abc" "acc" = Some 1 15 | 16 | let%test "simple substitute insert abc-accc" = 17 | get_distance ~k:2 "abc" "accc" = Some 2 18 | 19 | let%test "large diff abc-def" = 20 | get_distance ~k:2 "abc" "def" = None 21 | 22 | let%test "large diff abcdef-g" = 23 | get_distance ~k:2 "abcdef" "g" = None 24 | 25 | let%test "large diff abcdef-a" = 26 | get_distance ~k:2 "abcdef" "a" = None 27 | 28 | let%test "large diff abcdef-ab" = 29 | get_distance ~k:2 "abcdef" "ab" = None 30 | 31 | let%test "large diff abcdef-abc" = 32 | get_distance ~k:2 "abcdef" "abc" = None 33 | 34 | let%test "large strings abcdef-abcd" = 35 | get_distance ~k:2 "abcdef" "abcd" = Some 2 36 | 37 | let%test "large strings abcdef-abcde" = 38 | get_distance ~k:1 "abcdef" "abcde" = Some 1 39 | 40 | let%test "current distance should be zero abcdef-abcde" = 41 | let nfa = start ~k:2 ~str:"abcdef" in 42 | (feed_str ~str:"abcde" nfa |> current_error) 43 | = Some 0 44 | 45 | let%test "current distance should be zero abcdef-abcde" = 46 | let nfa = start ~k:2 ~str:"abcdef" in 47 | (feed_str ~str:"abcde" nfa |> end_input) 48 | = Some 1 49 | 50 | let%test "current distance should be zero abcdef-ab" = 51 | let nfa = start ~k:2 ~str:"abcdef" in 52 | (feed_str ~str:"ab" nfa |> current_error) 53 | = Some 0 54 | 55 | let%test "current distance should be one abcdef-ba" = 56 | let nfa = start ~k:2 ~str:"abcdef" in 57 | (feed_str ~str:"ba" nfa |> current_error) 58 | = Some 1 59 | 60 | let%test "short input abcdef-ab" = 61 | let nfa = start ~k:2 ~str:"abcdef" in 62 | (feed_str ~str:"ab" nfa |> end_input) 63 | = None 64 | 65 | let%test "current distance should be one abcdef-abd" = 66 | let nfa = start ~k:2 ~str:"abcdef" in 67 | (feed_str ~str:"abd" nfa |> current_error) 68 | = Some 1 69 | 70 | let%test "current distance should be one abcdef-abdc" = 71 | let nfa = start ~k:2 ~str:"abcdef" in 72 | (feed_str ~str:"abdc" nfa |> current_error) 73 | = Some 1 74 | 75 | let%test "no triangle inequality" = 76 | get_distance ~k:4 "abcd" "abdc" 77 | = Some 1 78 | && 79 | get_distance ~k:4 "abdc" "bdac" 80 | = Some 2 81 | && 82 | get_distance ~k:4 "abcd" "bdac" 83 | = Some 4 84 | 85 | (* A delete followed by a transposition has the same error count as a substututione *) 86 | 87 | let%test "2 deletes then transposition match" = 88 | get_distance ~k:4 "abcdef" "aedf" 89 | = Some 3 90 | 91 | let%test "2 deletes then transposition" = 92 | get_distance ~k:4 "abcd" "dc" 93 | = Some 3 94 | 95 | let%test "1 delete then transposition" = 96 | get_distance ~k:4 "abc" "cb" 97 | = Some 2 98 | 99 | let%test "1 substitution then 1 delete" = 100 | get_distance ~k:4 "abc" "cb" 101 | = Some 2 102 | 103 | let%test "1 substitution then 1 delete then match" = 104 | get_distance ~k:4 "abcd" "cbd" 105 | = Some 2 106 | 107 | (* empty *) 108 | let%test "empty-empty" = 109 | get_distance ~k:1 "" "" = Some 0 110 | 111 | let%test "empty-a" = 112 | get_distance ~k:1 "" "a" = Some 1 113 | 114 | let%test "a-empty" = 115 | get_distance ~k:1 "a" "" = Some 1 116 | 117 | let%test "ab-empty" = 118 | get_distance ~k:1 "ab" "" = None 119 | 120 | let%test "empty-ab" = 121 | get_distance ~k:1 "" "ab" = None 122 | 123 | (* k = 0 *) 124 | let%test "k=0 abc-ab" = 125 | get_distance ~k:0 "abc" "ab" = None 126 | 127 | let%test "k=0 abc-abc" = 128 | get_distance ~k:0 "abc" "abc" = Some 0 129 | 130 | let%test "k=0 empty-empty" = 131 | get_distance ~k:0 "" "" = Some 0 132 | 133 | let%test "k=0 empty-a" = 134 | get_distance ~k:0 "" "a" = None 135 | 136 | let%test "k=0 a-empty" = 137 | get_distance ~k:0 "a" "" = None 138 | -------------------------------------------------------------------------------- /lib/internal/DFA.ml: -------------------------------------------------------------------------------- 1 | module Make (NFA : NFA.NFA_t) = struct 2 | 3 | module NFAStateSetSet = struct 4 | include Set.Make(NFA.StateSet) 5 | 6 | let pp_states_set ppf s = 7 | Format.fprintf ppf "{@[%a@]}" 8 | (Format.pp_print_list ~pp_sep:NFA.StateSet.pp_comma NFA.StateSet.pp_states) (to_seq s |> List.of_seq) 9 | end 10 | 11 | module NFAStateSetMap = struct 12 | include Map.Make(NFA.StateSet) 13 | 14 | let pp_kv_pair ~pp_val ppf (k,v) = 15 | Format.fprintf ppf "@[%a ->@ %a@]" NFA.StateSet.pp_states k pp_val v 16 | 17 | let pp_map ~pp_val ppf map = 18 | Format.fprintf ppf "{@[%a@]}" 19 | (Format.pp_print_list ~pp_sep:NFA.StateSet.pp_comma (pp_kv_pair ~pp_val)) (to_seq map |> List.of_seq) 20 | end 21 | 22 | module Transitions = struct 23 | include Map.Make(BitVec) 24 | 25 | let pp_kv_pair ~pp_val ppf (k,v) = 26 | Format.fprintf ppf "@[%a ->@ %a@]" BitVec.pp_bv k pp_val v 27 | 28 | let pp_map ~pp_val ppf map = 29 | Format.fprintf ppf "[@[%a@]]" 30 | (Format.pp_print_list ~pp_sep:NFA.StateSet.pp_comma (pp_kv_pair ~pp_val)) (to_seq map |> List.of_seq) 31 | end 32 | 33 | module DFA = struct 34 | 35 | type dfa = (NFA.StateSet.t Transitions.t) NFAStateSetMap.t 36 | 37 | let add_key ~from:(from : NFA.StateSet.t) ~dfa:(dfa : dfa) : dfa = 38 | match NFAStateSetMap.find_opt from dfa with 39 | | None -> NFAStateSetMap.add from (Transitions.empty) dfa 40 | | Some _ -> dfa 41 | 42 | let add_transition ~from:(from : NFA.StateSet.t) (bv : BitVec.t) ~to_:(to_ : NFA.StateSet.t) ~dfa:(dfa : dfa) : dfa = 43 | let dfa = 44 | match NFAStateSetMap.find_opt from dfa with 45 | | None -> NFAStateSetMap.add from (Transitions.singleton bv to_) dfa 46 | | Some trans -> NFAStateSetMap.add from (Transitions.add bv to_ trans) dfa 47 | in 48 | (* make sure the to_ is in the set of keys*) 49 | add_key ~from:to_ ~dfa 50 | 51 | type dula_build = 52 | { marked : NFAStateSetSet.t 53 | ; unmarked : NFAStateSetSet.t 54 | ; k : int 55 | ; dfa : dfa 56 | } 57 | 58 | let build_transitions (dula : dula_build) ~t = 59 | let rec build_transitions ({marked; unmarked; k; dfa} as dula) t n max = 60 | let from = t in 61 | let bv = (BitVec.Bits n) in 62 | let transition : NFA.StateSet.t = NFA.Transitions.all_transitions t bv ~k in 63 | let dfa = add_transition ~from bv ~to_:transition ~dfa in 64 | let unmarked = 65 | if NFAStateSetSet.mem transition marked || NFAStateSetSet.mem transition unmarked then 66 | unmarked 67 | else 68 | NFAStateSetSet.add transition unmarked 69 | in 70 | let dula = { dula with dfa; unmarked } in 71 | if n = max then 72 | dula 73 | else 74 | build_transitions dula t (n + 1) max 75 | in 76 | let BitVec.(Bits max) = 77 | BitVec.ones ~m:(dula.k * 2 + 1) 78 | in 79 | build_transitions dula t 0 max 80 | 81 | let rec build_dfa ({marked; unmarked; k = _; dfa = _} as dula) = 82 | match NFAStateSetSet.max_elt_opt unmarked with 83 | | None -> dula 84 | | Some t -> 85 | let marked = NFAStateSetSet.add t marked in 86 | let unmarked = NFAStateSetSet.remove t unmarked in 87 | let dula = 88 | build_transitions { dula with marked; unmarked } ~t 89 | in 90 | build_dfa dula 91 | 92 | let start = NFAStateSetSet.of_list [NFA.StateSet.start;NFA.StateSet.err] 93 | 94 | let build_dula ~k = 95 | if (k < 1) || k > 3 then 96 | failwith "build_dula can only be called with 1 <= k <= 3" 97 | else 98 | build_dfa { marked = NFAStateSetSet.empty; unmarked = start; k; dfa = NFAStateSetMap.empty } 99 | 100 | let build_and_print_dula ~k = 101 | let {dfa;_} = build_dula ~k in 102 | NFAStateSetMap.pp_map ~pp_val:(Transitions.pp_map ~pp_val:NFA.StateSet.pp_states) Format.std_formatter dfa; 103 | Format.pp_print_newline Format.std_formatter () 104 | end 105 | end 106 | -------------------------------------------------------------------------------- /lib/index.mld: -------------------------------------------------------------------------------- 1 | {0 mula} 2 | ML's radishal library for matching with Universal Levenshtein Automata. 3 | 4 | {1 Library mula} 5 | The entry point of this library is the module: 6 | {!module-Mula}. 7 | 8 | {1 Basic Concepts} 9 | 10 | This library provides functions and functors to quickly compute Levenshtein edit 11 | distances of strings from a base string within a limit [k]. 12 | This can be used for fuzzy-string matching. 13 | 14 | The Levenshtein distance from a string [s1] to a string [s2] is the minimum 15 | number of character edits (insert, delete, substitute) operations needed to 16 | change [s1] into [s2]. 17 | We support both the standard Levenshtein distance as well as the 18 | (restricted) Demarau-Levenshtein distance, which includes transpositions of two 19 | adjacent characters as a edit operation. 20 | 21 | {1 Functionality} 22 | 23 | The {!module-Mula.Strings} module provides functions for working with OCaml 24 | strings directly, and the {!module-Mula.Match} module provides functors for 25 | working with your own representation of strings. 26 | 27 | The libary offers two ways of working with strings. 28 | You can use the [get_distance] function to directly compute edit distances, or 29 | you can create a an automata using the [start] function to create an automata 30 | and feed characters and substrings into it lazily. 31 | The latter approach allows you to get the live minimum error counts. 32 | 33 | The {!module-Mula.Strings} module (and the functors created by 34 | {!module-Mula.Match.Make}) contains submodules {!module-Mula.Strings.Lev} for 35 | the standard Levenshtein distance and {!module-Mula.Strings.Dem} for the 36 | (restricted) Demarau-Levenshtein distance. 37 | If you are unsure of which to use, use {!module-Mula.Strings.Dem}. 38 | 39 | {1 Examples} 40 | 41 | {2 Getting Edit Distances} 42 | 43 | {[ 44 | # #require "mula";; 45 | # Mula.Strings.Dem.get_distance ~k:2 "abcd" "abdc";; 46 | - : int option = Some 1 47 | # Mula.Strings.Lev.get_distance ~k:2 "abcd" "abdc";; 48 | - : int option = Some 2 49 | # Mula.Strings.Lev.get_distance ~k:2 "abcd" "efgh";; 50 | - : int option = None 51 | ]} 52 | 53 | {2 Live Minimal Error Counts} 54 | Examples of lazily feeding characters and into an automaton and getting live 55 | error counts: 56 | 57 | {[ 58 | # #require "mula";; 59 | # (* Create an automaton for a limit and base string *);; 60 | # module Lev = Mula.Strings.Lev;; 61 | # let lev_nfa = Lev.start ~k:2 ~str:"abcd";; 62 | val lev_nfa : Lev.nfa_state = 63 | # (* Get live error counts after feeding some characters into automaton *);; 64 | # Lev.(feed_str lev_nfa ~str:"ab" |> current_error);; 65 | - : int option = Some 0 66 | # Lev.(feed lev_nfa ~ch:'a' |> feed ~ch:'b' |> feed ~ch:'c' |> current_error);; 67 | - : int option = Some 0 68 | # Lev.(feed_str lev_nfa ~str:"abd" |> current_error);; 69 | - : int option = Some 1 70 | # Lev.(feed_str lev_nfa ~str:"ab" |> feed_str ~str:"dc" |> current_error);; 71 | - : int option = Some 1 72 | # (* End input to get edit distance *);; 73 | # Lev.(feed_str lev_nfa ~str:"ab" |> feed_str ~str:"dc" |> end_input);; 74 | - : int option = Some 2 75 | ]} 76 | 77 | The last two examples show that the live error count can be lower than the edit 78 | distance. 79 | In the first of the two examples, ['d'] is counted as a possible insert edit. 80 | In the second of the two examples, ['d'] and ['c'] are both counted as 81 | substitution edits. 82 | 83 | {2 Live Minimal Error Counts} 84 | 85 | Example of using the {!module-Mula.Match.Make} functor: 86 | 87 | {[ 88 | # #require "mula";; 89 | # module St = struct 90 | type ch = int 91 | type t = int array 92 | 93 | let length = Array.length 94 | let get = Array.get 95 | 96 | let equal = Int.equal 97 | end;; 98 | module St : 99 | sig 100 | ... 101 | end 102 | # module M = Mula.Match.Make(St);; 103 | module M : 104 | sig 105 | module Lev : 106 | sig 107 | type nfa_state = Mula.Match.Make(St).Lev.nfa_state 108 | val start : k:int -> str:St.t -> nfa_state 109 | val feed : nfa_state -> ch:int -> nfa_state 110 | val current_error : nfa_state -> int option 111 | val end_input : nfa_state -> int option 112 | val feed_str : nfa_state -> str:St.t -> nfa_state 113 | val get_distance : k:int -> St.t -> St.t -> int option 114 | end 115 | module Dem : 116 | sig 117 | ... 118 | end 119 | end 120 | ]} 121 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /lib/match.mli: -------------------------------------------------------------------------------- 1 | (** {1 Universal Levenshtein Automata} 2 | The {{:https://hal.archives-ouvertes.fr/hal-01360482/file/LATA2016.pdf}paper 3 | by Touzet} details the Universal Levenshtein Automata. 4 | 5 | Some nice computational properties of the (not-deterministic) automata include: 6 | - There are no epsilon transitions. 7 | - The automata are computable on demand, there is no need to store states and transitions in a data structure. 8 | - The states of the automata carry error counts. 9 | - There is a simple subsumption relation that prunes sets of states, so transitions produce small sets of states. 10 | 11 | These allow for efficient implementation together with interesting uses. 12 | {ol 13 | {- We not only know if two strings are within a certain edit distance, but 14 | we also know what the edit distance is if they are within the edit 15 | distance limit.} 16 | {- If several strings are compared, we can rank them by their edit 17 | distance.} 18 | {- We can lazily feed characters and string fragments to the automata and 19 | get the current error count.} 20 | } 21 | *) 22 | 23 | (** This module offers a functor to build matchers for different representations 24 | of strings and characters. For a prebuilt matcher for OCaml strings and 25 | characters, see {!module:Strings}. *) 26 | 27 | (** {2 String Abstraction} 28 | We abstract over strings and characters so that we do not rely on any 29 | specific encoding. 30 | We only need the following: 31 | - a function to compute length of strings, 32 | - a function to fetch a character at an index, and 33 | - a function to check if two characters are equal. 34 | *) 35 | 36 | module type S = 37 | sig 38 | (** {1 String Abstraction} 39 | We abstract over strings and characters so that we do not rely on any 40 | specific encoding. 41 | We only need the following: 42 | - a function to compute length of strings, 43 | - a function to fetch a character at an index, and 44 | - a function to check if two characters are equal. 45 | *) 46 | 47 | (** The type for characters *) 48 | type ch 49 | 50 | (** The type for strings *) 51 | type t 52 | 53 | (** [length s] should compute the length of the string [s] *) 54 | val length : t -> int 55 | 56 | (** [get s i] should fetch the character at index [i] of a string [s] *) 57 | val get : t -> int -> ch 58 | 59 | (** [equal c1 c2] should return true if [c1] and [c2] are equal, and false otherwise. *) 60 | val equal : ch -> ch -> bool 61 | end 62 | 63 | (** {2 Levenshtein Automata} 64 | Given a string representation, we produce two Automata: 65 | - {!module:Make.Lev}, for the standard Levenshtein distance. 66 | - {!module:Make.Dem}, for the Demarau-Levenshtein distance which 67 | includes transpositions as a primitve edit operation. 68 | *) 69 | 70 | module Make (St : S) : 71 | sig 72 | (** {1 Levenshtein Automata} 73 | Given a string representation, we produce two Automata: 74 | - {!module:Lev}, for the standard Levenshtein distance. 75 | - {!module:Dem}, for the Demarau-Levenshtein distance which 76 | includes transpositions as a primitve edit operation. 77 | *) 78 | 79 | module Lev : sig 80 | (** {1 Standard Universal Levenshtein Automaton} *) 81 | 82 | (** Abstract type for the state of the automata. *) 83 | type nfa_state 84 | 85 | (** [start ~k ~str] produces the starting state of the automaton for edit distance [k]. 86 | Here [k] must not be negative and must not be greater [(Sys.int_size - 1) / 2]. *) 87 | val start : k:int -> str:St.t -> nfa_state 88 | 89 | (** [feed nfa ch] produces a new state where the automaton has been fed 90 | the character [ch]. *) 91 | val feed : nfa_state -> ch:St.ch -> nfa_state 92 | 93 | (** [current_error nfa] produces [Some n] if the current error count 94 | recorded by the nfa is a number [n], or [None] if the error count is 95 | larger than the limit [k] that the nfa was started with. 96 | [current_error nfa] may be smaller than [end_input nfa], since it 97 | does not account for delete operations at the end of the fed string. 98 | *) 99 | val current_error : nfa_state -> int option 100 | 101 | (** [end_input nfa] computes the edit distance between the starting 102 | string and the string fed to [nfa]. It produces [Some n] if the edit 103 | distance in a number [n] that is less than the limit [k] that the 104 | automaton was started with, and [None] otherwise. *) 105 | val end_input : nfa_state -> int option 106 | 107 | (** [feed_str nfa ~str] produces a new state where the automaton has been fed 108 | the characters from the string [str]. *) 109 | val feed_str : nfa_state -> str:St.t -> nfa_state 110 | 111 | (** [get_distance ~k str1 str2] computes the edit distance between two 112 | strings. It creates an automaton with limit [k] and [str1], and then 113 | feeds it the string [str2], and thet outputs the result of calling 114 | [end_input] on the nfa. *) 115 | val get_distance : k:int -> St.t -> St.t -> int option 116 | end 117 | module Dem : sig 118 | (** {1 Universal Demarau-Levenshtein Automaton} *) 119 | 120 | (** Abstract type for the state of the automata. *) 121 | type nfa_state 122 | 123 | (** [start ~k ~str] produces the starting state of the automaton for edit distance [k]. 124 | Here [k] must not be negative and must not be greater [(Sys.int_size - 1) / 2]. *) 125 | val start : k:int -> str:St.t -> nfa_state 126 | 127 | (** [feed nfa ch] produces a new state where the automaton has been fed 128 | the character [ch]. *) 129 | val feed : nfa_state -> ch:St.ch -> nfa_state 130 | 131 | (** [current_error nfa] produces [Some n] if the current error count 132 | recorded by the nfa is a number [n], or [None] if the error count is 133 | larger than the limit [k] that the nfa was started with. 134 | [current_error nfa] may be smaller than [end_input nfa], since it 135 | does not account for delete operations at the end of the fed string. 136 | *) 137 | val current_error : nfa_state -> int option 138 | 139 | (** [end_input nfa] computes the edit distance between the starting 140 | string and the string fed to [nfa]. It produces [Some n] if the edit 141 | distance in a number [n] that is less than the limit [k] that the 142 | automaton was started with, and [None] otherwise. *) 143 | val end_input : nfa_state -> int option 144 | 145 | (** [feed_str nfa ~str] produces a new state where the automaton has been fed 146 | the characters from the string [str]. *) 147 | val feed_str : nfa_state -> str:St.t -> nfa_state 148 | 149 | (** [get_distance ~k str1 str2] computes the edit distance between two 150 | strings. It creates an automaton with limit [k] and [str1], and then 151 | feeds it the string [str2], and thet outputs the result of calling 152 | [end_input] on the nfa. *) 153 | val get_distance : k:int -> St.t -> St.t -> int option 154 | end 155 | end 156 | --------------------------------------------------------------------------------