├── .dockerignore ├── .gitignore ├── .ocamlformat ├── LICENSE ├── README.md ├── bin ├── bench.ml └── dune ├── dune-project ├── src ├── cmap.ml ├── dune ├── tdigest.ml └── tdigest.mli ├── tdigest.opam └── test ├── dune ├── shared.ml ├── test_discrete.ml └── test_tdigest.ml /.dockerignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | .vscode 3 | _opam/ 4 | _build/ 5 | **/.merlin 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/.DS_Store 2 | .vscode 3 | _opam/ 4 | _build/ 5 | **/.merlin 6 | -------------------------------------------------------------------------------- /.ocamlformat: -------------------------------------------------------------------------------- 1 | # "Asemio Style" v1.3 2 | # Last updated for ocamlformat 0.25.1 3 | ocaml-version=5.0.0 4 | 5 | # All available options are listed. 6 | # An option is commented out when the value matches the default 7 | # When updating ocamlformat, update this file: 8 | # - add any new options 9 | # - try out new values for existing options 10 | # - if a default changed, see if we should follow it or not 11 | 12 | # assignment-operator = end-line 13 | # break-before-in = fit-or-vertical 14 | break-cases = all 15 | # break-collection-expressions = fit-or-vertical 16 | # break-colon = after 17 | # break-fun-decl = wrap 18 | # break-fun-sig = wrap 19 | break-infix = fit-or-vertical 20 | # break-infix-before-func = false 21 | # break-separators = after 22 | # break-sequences = true 23 | # break-string-literals = auto 24 | # break-struct = force 25 | cases-exp-indent = 2 26 | # cases-matching-exp-indent = normal 27 | disambiguate-non-breaking-match = true 28 | doc-comments = before 29 | # doc-comments-padding = 2 30 | # doc-comments-tag-only = default 31 | # dock-collection-brackets = true 32 | exp-grouping = preserve 33 | # 
extension-indent = 2 34 | field-space = tight-decl 35 | function-indent = 0 36 | # function-indent-nested = never 37 | if-then-else = keyword-first 38 | # indent-after-in = 0 39 | indicate-multiline-delimiters = space 40 | indicate-nested-or-patterns = space 41 | # infix-precedence = indent 42 | # leading-nested-match-parens = false 43 | let-and = sparse 44 | # let-binding-indent = 2 45 | # let-binding-spacing = compact 46 | # let-module = compact 47 | # line-endings = lf 48 | margin = 106 49 | # match-indent = 0 50 | # match-indent-nested = never 51 | max-indent = 2 52 | module-item-spacing = sparse 53 | # nested-match = wrap 54 | parens-ite = true 55 | parens-tuple = multi-line-only 56 | # parens-tuple-patterns = multi-line-only 57 | # parse-docstrings = false 58 | # parse-toplevel-phrases = false 59 | # sequence-blank-line = preserve-one 60 | # sequence-style = terminator 61 | single-case = sparse 62 | # space-around-arrays = true 63 | # space-around-lists = true 64 | # space-around-records = true 65 | # space-around-variants = true 66 | # stritem-extension-indent = 0 67 | type-decl = sparse 68 | # type-decl-indent = 2 69 | # wrap-comments = false 70 | # wrap-fun-args = true 71 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Simon Grondin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | 
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Tdigest 2 | ======= 3 | 4 | OCaml implementation of the T-Digest algorithm. 5 | 6 | ```ocaml 7 | let td = 8 | Tdigest.create () 9 | |> Tdigest.add_list [ 10.0; 11.0; 12.0; 13.0 ] 10 | in 11 | 12 | Tdigest.percentiles td [ 0.; 0.25; 0.5; 0.75; 1. ] 13 | (* [ Some 10; Some 10.5; Some 11.5; Some 12.5; Some 13 ] *) 14 | 15 | Tdigest.p_ranks td [ 9.; 10.; 11.; 12.; 13.; 14. ] 16 | (* [ Some 0; Some 0.125; Some 0.375; Some 0.625; Some 0.875; Some 1 ] *) 17 | ``` 18 | 19 | The T-Digest is a data structure and algorithm for constructing an approximate distribution for a collection of real numbers presented as a stream. 20 | 21 | The T-Digest can estimate percentiles or quantiles extremely accurately even at the tails, while using a fraction of the space of the original data. 22 | 23 | A median of medians is not equal to the median of the whole dataset. Percentiles are critical measures that are expensive to compute due to their requirement of having the entire **sorted** dataset present in one place. These downsides are addressed by using the T-Digest. 24 | 25 | A T-Digest is concatenable, making it a good fit for distributed systems. 
The internal state of a T-Digest can be exported as a binary string, and the concatenation of any number of those strings can then be imported to form a new T-Digest. 26 | 27 | ```ocaml 28 | let combined = Tdigest.merge [ td1; td2; td3 ] in 29 | ``` 30 | 31 | A T-Digest's state can be stored in a database `VARCHAR`/`TEXT` column and multiple such states can be merged by concatenating strings: 32 | ```sql 33 | -- Combine multiple states in the database 34 | SELECT 35 | STRING_AGG(M.tdigest_state) AS concat_state 36 | FROM my_table AS M 37 | ``` 38 | ```ocaml 39 | (* Then load this combined state into a single T-Digest *) 40 | let combined = Tdigest.of_string concat_state in 41 | ``` 42 | 43 | Links: 44 | - [A simple overview of the T-Digest](https://dataorigami.net/blogs/napkin-folding/19055451-percentile-and-quantile-estimation-of-big-data-the-t-digest) 45 | - [A walkthrough of the algorithm by its creator](https://mapr.com/blog/better-anomaly-detection-t-digest-whiteboard-walkthrough/) 46 | - [The white paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) 47 | 48 | This library started off as a port of [Will Welch's JavaScript implementation](https://github.com/welch/tdigest), down to the unit tests. However, some modifications have been made to adapt it to OCaml, the most important one being immutability. As such, almost every function in the `Tdigest` module returns a new `Tdigest.t`, including "reading" ones since they may trigger intermediate computations worth caching. 49 | 50 | ## Usage 51 | 52 | The API is well documented [here](https://github.com/SGrondin/tdigest/blob/master/src/tdigest.mli). 53 | 54 | ```sh 55 | opam install tdigest 56 | ``` 57 | 58 | ### Marshal 59 | 60 | The `Tdigest.t` type cannot be marshalled. 61 | 62 | Use the functions in `Tdigest.Marshallable` if your application requires marshalling a T-Digest data structure. Note that `Tdigest.Marshallable.t` is approximately 5 times slower than `Tdigest.t`. 
63 | 64 | ## Performance 65 | 66 | On an ancient 2015 MacBook Pro, this implementation can incorporate 1,000,000 random floating points in just 770ms. 67 | 68 | Exporting and importing state (`to_string`/`of_string`) is cheap. 69 | -------------------------------------------------------------------------------- /bin/bench.ml: -------------------------------------------------------------------------------- 1 | open! Base 2 | 3 | let print_endline = Stdlib.print_endline 4 | 5 | let sprintf = Printf.sprintf 6 | 7 | let info_to_string x = 8 | let open Tdigest in 9 | sprintf {s|----- 10 | Count: %d 11 | Size: %d 12 | # Cumulates: %d 13 | # Compressions: %d 14 | # Auto Compressions: %d 15 | -----|s} 16 | x.count x.size x.cumulates_count x.compress_count x.auto_compress_count 17 | 18 | let snapshot sf t0 t1 = 19 | print_endline 20 | (sprintf sf Float.((t1 - t0) * 1_000_000. |> round_nearest |> to_string_hum ~delimiter:',')) 21 | 22 | module type S = module type of Tdigest.M 23 | 24 | let test (module M : S) = 25 | let arr = Array.init 2_000_000 ~f:(fun _i -> Random.float 1.) 
in 26 | let td = M.create () in 27 | 28 | let t0 = Unix.gettimeofday () in 29 | let td = Array.fold arr ~init:td ~f:(fun acc data -> M.add acc ~data) in 30 | 31 | let t1 = Unix.gettimeofday () in 32 | let info1 = M.info td in 33 | let td = M.compress td in 34 | 35 | let t2 = Unix.gettimeofday () in 36 | let info2 = M.info td in 37 | 38 | snapshot "Add 2_000_000: %s us" t0 t1; 39 | print_endline (info_to_string info1); 40 | snapshot "Compress: %s us" t1 t2; 41 | print_endline (info_to_string info2); 42 | 43 | let t3 = Unix.gettimeofday () in 44 | let _td, str = M.to_string td in 45 | let t4 = Unix.gettimeofday () in 46 | print_endline (sprintf "Serialized into %d bytes" (String.length str)); 47 | snapshot "Serialization: %s us" t3 t4; 48 | 49 | let t5 = Unix.gettimeofday () in 50 | let td = M.of_string (sprintf "%s%s" str str) in 51 | let t6 = Unix.gettimeofday () in 52 | snapshot "Parsing: %s us" t5 t6; 53 | let info3 = M.info td in 54 | print_endline (info_to_string info3) 55 | 56 | let () = test (module Tdigest) 57 | 58 | let () = test (module Tdigest.Marshallable) 59 | -------------------------------------------------------------------------------- /bin/dune: -------------------------------------------------------------------------------- 1 | (executable 2 | (name bench) 3 | (libraries 4 | tdigest 5 | 6 | base 7 | unix 8 | ) 9 | 10 | (modes exe) 11 | (ocamlopt_flags -O3 -unbox-closures) 12 | ) 13 | 14 | (env 15 | (dev 16 | (flags (:standard -warn-error -A)))) 17 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 1.9) 2 | (name tdigest) 3 | (version 2.2.1) 4 | -------------------------------------------------------------------------------- /src/cmap.ml: -------------------------------------------------------------------------------- 1 | (* Code adapted from 
https://github.com/janestreet/core/blob/e70fdcdaa308dffb1b5bb8bb38acf46c73b40161/core/src/map.ml and map_intf.ml 2 | 3 | The MIT License 4 | 5 | Copyright (c) 2008--2023 Jane Street Group, LLC opensource-contacts@janestreet.com 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 12 | *) 13 | open! Base 14 | module Tree0 = Map.Using_comparator.Tree 15 | 16 | module type Creators_generic = sig 17 | include Map.Creators_generic 18 | 19 | val of_hashtbl_exn : ('k, 'cmp, ('k key, 'v) Hashtbl.t -> ('k, 'v, 'cmp) t) create_options 20 | 21 | (** Never requires a comparator because it can get one from the input [Set.t]. 
*) 22 | val of_key_set : ('k key, 'cmp cmp) Set.t -> f:(('k key -> 'v)[@local]) -> ('k, 'v, 'cmp) t 23 | end 24 | 25 | type ('k, 'cmp) comparator = (module Comparator.S with type t = 'k and type comparator_witness = 'cmp) 26 | 27 | let to_comparator (type k cmp) ((module M) : (k, cmp) Comparator.Module.t) = M.comparator 28 | 29 | let of_comparator (type k cmp) comparator : (k, cmp) Comparator.Module.t = 30 | ( module struct 31 | type t = k 32 | 33 | type comparator_witness = cmp 34 | 35 | let comparator = comparator 36 | end ) 37 | 38 | module Using_comparator = struct 39 | include Map.Using_comparator 40 | 41 | let of_hashtbl_exn ~comparator hashtbl = 42 | match of_iteri ~comparator ~iteri:(Hashtbl.iteri hashtbl) with 43 | | `Ok map -> map 44 | | `Duplicate_key key -> 45 | Error.create "Map.of_hashtbl_exn: duplicate key" key comparator.sexp_of_t |> Error.raise 46 | 47 | let tree_of_hashtbl_exn ~comparator hashtbl = to_tree (of_hashtbl_exn ~comparator hashtbl) 48 | 49 | let key_set ~comparator t = 50 | Set.Using_comparator.of_sorted_array_unchecked ~comparator (List.to_array (keys t)) 51 | 52 | let key_set_of_tree ~comparator t = key_set ~comparator (of_tree ~comparator t) 53 | 54 | let of_key_set key_set ~f = 55 | of_sorted_array_unchecked ~comparator:(Set.comparator key_set) 56 | (Array.map (Set.to_array key_set) ~f:(fun key -> key, f key)) 57 | 58 | let tree_of_key_set key_set ~f = to_tree (of_key_set key_set ~f) 59 | end 60 | 61 | module Accessors = struct 62 | include ( 63 | Map.Using_comparator : 64 | Map.Accessors_generic 65 | with type ('a, 'b, 'c) access_options := ('a, 'b, 'c) Map.Without_comparator.t 66 | with type ('a, 'b, 'c) t := ('a, 'b, 'c) Map.t 67 | with type ('a, 'b, 'c) tree := ('a, 'b, 'c) Tree0.t 68 | with type 'k key := 'k 69 | with type 'c cmp := 'c ) 70 | 71 | let key_set t = Using_comparator.key_set t ~comparator:(Using_comparator.comparator t) 72 | end 73 | 74 | let key_set t = Using_comparator.key_set 
~comparator:(Using_comparator.comparator t) t 75 | 76 | let of_key_set = Using_comparator.of_key_set 77 | 78 | let hash_fold_direct = Using_comparator.hash_fold_direct 79 | 80 | let comparator = Using_comparator.comparator 81 | 82 | let comparator_s = Map.comparator_s 83 | 84 | type 'k key = 'k 85 | 86 | type 'c cmp = 'c 87 | 88 | include ( 89 | struct 90 | include Map 91 | 92 | let of_tree m = Map.Using_comparator.of_tree ~comparator:(to_comparator m) 93 | 94 | let to_tree = Map.Using_comparator.to_tree 95 | end : 96 | sig 97 | type ('a, 'b, 'c) t = ('a, 'b, 'c) Map.t 98 | 99 | include 100 | Map.Creators_and_accessors_generic 101 | with type ('a, 'b, 'c) create_options := ('a, 'b, 'c) Map.With_first_class_module.t 102 | with type ('a, 'b, 'c) access_options := ('a, 'b, 'c) Map.Without_comparator.t 103 | with type ('a, 'b, 'c) t := ('a, 'b, 'c) t 104 | with type ('a, 'b, 'c) tree := ('a, 'b, 'c) Tree0.t 105 | with type 'k key := 'k key 106 | with type 'c cmp := 'c cmp 107 | end ) 108 | 109 | module Empty_without_value_restriction = Using_comparator.Empty_without_value_restriction 110 | 111 | let merge_skewed = Map.merge_skewed 112 | 113 | let of_hashtbl_exn m t = Using_comparator.of_hashtbl_exn ~comparator:(to_comparator m) t 114 | 115 | module Creators (Key : Comparator.S1) : sig 116 | type ('a, 'b, 'c) t_ = ('a Key.t, 'b, Key.comparator_witness) t 117 | 118 | type ('a, 'b, 'c) tree = ('a, 'b, Key.comparator_witness) Tree0.t 119 | 120 | val t_of_sexp : (Sexp.t -> 'a Key.t) -> (Sexp.t -> 'b) -> Sexp.t -> ('a, 'b, _) t_ 121 | 122 | include 123 | Creators_generic 124 | with type ('a, 'b, 'c) t := ('a, 'b, 'c) t_ 125 | with type ('a, 'b, 'c) tree := ('a, 'b, 'c) tree 126 | with type 'a key := 'a Key.t 127 | with type 'a cmp := Key.comparator_witness 128 | with type ('a, 'b, 'c) create_options := ('a, 'b, 'c) Map.Without_comparator.t 129 | with type ('a, 'b, 'c) access_options := ('a, 'b, 'c) Map.Without_comparator.t 130 | end = struct 131 | let comparator = 
Key.comparator 132 | 133 | type ('a, 'b, 'c) t_ = ('a Key.t, 'b, Key.comparator_witness) t 134 | 135 | type ('a, 'b, 'c) tree = ('a, 'b, Key.comparator_witness) Tree0.t 136 | 137 | module M_empty = Empty_without_value_restriction (Key) 138 | 139 | let empty = M_empty.empty 140 | 141 | let of_tree tree = Using_comparator.of_tree ~comparator tree 142 | 143 | let singleton k v = Using_comparator.singleton ~comparator k v 144 | 145 | let of_sorted_array_unchecked array = Using_comparator.of_sorted_array_unchecked ~comparator array 146 | 147 | let of_sorted_array array = Using_comparator.of_sorted_array ~comparator array 148 | 149 | let of_increasing_iterator_unchecked ~len ~f = 150 | Using_comparator.of_increasing_iterator_unchecked ~comparator ~len ~f 151 | 152 | let of_increasing_sequence seq = Using_comparator.of_increasing_sequence ~comparator seq 153 | 154 | let of_sequence seq = Using_comparator.of_sequence ~comparator seq 155 | 156 | let of_sequence_or_error seq = Using_comparator.of_sequence_or_error ~comparator seq 157 | 158 | let of_sequence_exn seq = Using_comparator.of_sequence_exn ~comparator seq 159 | 160 | let of_sequence_multi seq = Using_comparator.of_sequence_multi ~comparator seq 161 | 162 | let of_sequence_fold seq ~init ~f = Using_comparator.of_sequence_fold ~comparator seq ~init ~f 163 | 164 | let of_sequence_reduce seq ~f = Using_comparator.of_sequence_reduce ~comparator seq ~f 165 | 166 | let of_list_with_key list ~get_key = Using_comparator.of_list_with_key ~comparator list ~get_key 167 | 168 | let of_list_with_key_or_error list ~get_key = 169 | Using_comparator.of_list_with_key_or_error ~comparator list ~get_key 170 | 171 | let of_list_with_key_exn list ~get_key = Using_comparator.of_list_with_key_exn ~comparator list ~get_key 172 | 173 | let of_list_with_key_multi list ~get_key = 174 | Using_comparator.of_list_with_key_multi ~comparator list ~get_key 175 | 176 | let of_alist alist = Using_comparator.of_alist ~comparator alist 177 | 178 | let 
of_alist_or_error alist = Using_comparator.of_alist_or_error ~comparator alist 179 | 180 | let of_alist_exn alist = Using_comparator.of_alist_exn ~comparator alist 181 | 182 | let of_hashtbl_exn hashtbl = Using_comparator.of_hashtbl_exn ~comparator hashtbl 183 | 184 | let of_alist_multi alist = Using_comparator.of_alist_multi ~comparator alist 185 | 186 | let of_alist_fold alist ~init ~f = Using_comparator.of_alist_fold ~comparator alist ~init ~f 187 | 188 | let of_alist_reduce alist ~f = Using_comparator.of_alist_reduce ~comparator alist ~f 189 | 190 | let of_iteri ~iteri = Using_comparator.of_iteri ~comparator ~iteri 191 | 192 | let of_iteri_exn ~iteri = Using_comparator.of_iteri_exn ~comparator ~iteri 193 | 194 | let t_of_sexp k_of_sexp v_of_sexp sexp = 195 | Using_comparator.t_of_sexp_direct ~comparator k_of_sexp v_of_sexp sexp 196 | 197 | let of_key_set key_set ~f = Using_comparator.of_key_set key_set ~f 198 | 199 | let map_keys t ~f = Using_comparator.map_keys ~comparator t ~f 200 | 201 | let map_keys_exn t ~f = Using_comparator.map_keys_exn ~comparator t ~f 202 | 203 | let transpose_keys t = Using_comparator.transpose_keys ~comparator t 204 | 205 | let of_list_with_key_fold list ~get_key ~init ~f = 206 | Using_comparator.of_list_with_key_fold ~comparator list ~get_key ~init ~f 207 | 208 | let of_list_with_key_reduce l ~get_key ~f = 209 | Using_comparator.of_list_with_key_reduce ~comparator l ~get_key ~f 210 | end 211 | 212 | module Make_tree_S1 (Key : Comparator.S1) = struct 213 | open Tree0 214 | 215 | let comparator = Key.comparator 216 | 217 | let sexp_of_t = sexp_of_t 218 | 219 | let t_of_sexp a b c = t_of_sexp_direct a b c ~comparator 220 | 221 | let empty = empty_without_value_restriction 222 | 223 | let of_tree tree = tree 224 | 225 | let singleton a = singleton a ~comparator 226 | 227 | let of_sorted_array_unchecked a = of_sorted_array_unchecked a ~comparator 228 | 229 | let of_sorted_array a = of_sorted_array a ~comparator 230 | 231 | let 
of_increasing_iterator_unchecked ~len ~f = of_increasing_iterator_unchecked ~len ~f ~comparator 232 | 233 | let of_increasing_sequence seq = of_increasing_sequence ~comparator seq 234 | 235 | let of_sequence s = of_sequence s ~comparator 236 | 237 | let of_sequence_or_error s = of_sequence_or_error s ~comparator 238 | 239 | let of_sequence_exn s = of_sequence_exn s ~comparator 240 | 241 | let of_sequence_multi s = of_sequence_multi s ~comparator 242 | 243 | let of_sequence_fold s ~init ~f = of_sequence_fold s ~init ~f ~comparator 244 | 245 | let of_sequence_reduce s ~f = of_sequence_reduce s ~f ~comparator 246 | 247 | let of_alist a = of_alist a ~comparator 248 | 249 | let of_alist_or_error a = of_alist_or_error a ~comparator 250 | 251 | let of_alist_exn a = of_alist_exn a ~comparator 252 | 253 | let of_hashtbl_exn a = Using_comparator.tree_of_hashtbl_exn a ~comparator 254 | 255 | let of_alist_multi a = of_alist_multi a ~comparator 256 | 257 | let of_alist_fold a ~init ~f = of_alist_fold a ~init ~f ~comparator 258 | 259 | let of_alist_reduce a ~f = of_alist_reduce a ~f ~comparator 260 | 261 | let of_list_with_key l ~get_key = of_list_with_key l ~get_key ~comparator 262 | 263 | let of_list_with_key_or_error l ~get_key = of_list_with_key_or_error l ~get_key ~comparator 264 | 265 | let of_list_with_key_exn l ~get_key = of_list_with_key_exn l ~get_key ~comparator 266 | 267 | let of_list_with_key_multi l ~get_key = of_list_with_key_multi l ~get_key ~comparator 268 | 269 | let of_iteri ~iteri = of_iteri ~iteri ~comparator 270 | 271 | let of_iteri_exn ~iteri = of_iteri_exn ~iteri ~comparator 272 | 273 | let of_key_set = Using_comparator.tree_of_key_set 274 | 275 | let to_tree t = t 276 | 277 | let invariants a = invariants a ~comparator 278 | 279 | let is_empty a = is_empty a 280 | 281 | let length a = length a 282 | 283 | let set a ~key ~data = set a ~key ~data ~comparator 284 | 285 | let add a ~key ~data = add a ~key ~data ~comparator 286 | 287 | let add_exn a ~key 
~data = add_exn a ~key ~data ~comparator 288 | 289 | let add_multi a ~key ~data = add_multi a ~key ~data ~comparator 290 | 291 | let remove_multi a b = remove_multi a b ~comparator 292 | 293 | let find_multi a b = find_multi a b ~comparator 294 | 295 | let change a b ~f = change a b ~f ~comparator 296 | 297 | let update a b ~f = update a b ~f ~comparator 298 | 299 | let find_exn a b = find_exn a b ~comparator 300 | 301 | let find a b = find a b ~comparator 302 | 303 | let remove a b = remove a b ~comparator 304 | 305 | let mem a b = mem a b ~comparator 306 | 307 | let iter_keys = iter_keys 308 | 309 | let iter = iter 310 | 311 | let iteri = iteri 312 | 313 | let iteri_until = iteri_until 314 | 315 | let iter2 a b ~f = iter2 a b ~f ~comparator 316 | 317 | let map = map 318 | 319 | let mapi = mapi 320 | 321 | let fold = fold 322 | 323 | let fold_until = fold_until 324 | 325 | let fold_right = fold_right 326 | 327 | let fold2 a b ~init ~f = fold2 a b ~init ~f ~comparator 328 | 329 | let filter_keys a ~f = filter_keys a ~f 330 | 331 | let filter a ~f = filter a ~f 332 | 333 | let filteri a ~f = filteri a ~f 334 | 335 | let filter_map a ~f = filter_map a ~f 336 | 337 | let filter_mapi a ~f = filter_mapi a ~f 338 | 339 | let partition_mapi t ~f = partition_mapi t ~f 340 | 341 | let partition_map t ~f = partition_map t ~f 342 | 343 | let partitioni_tf t ~f = partitioni_tf t ~f 344 | 345 | let partition_tf t ~f = partition_tf t ~f 346 | 347 | let combine_errors t = combine_errors t ~comparator 348 | 349 | let compare_direct a b c = compare_direct a b c ~comparator 350 | 351 | let equal a b c = equal a b c ~comparator 352 | 353 | let keys = keys 354 | 355 | let data = data 356 | 357 | let to_alist = to_alist 358 | 359 | let symmetric_diff a b ~data_equal = symmetric_diff a b ~data_equal ~comparator 360 | 361 | let fold_symmetric_diff a b ~data_equal ~init ~f = 362 | fold_symmetric_diff a b ~data_equal ~f ~init ~comparator 363 | 364 | let merge a b ~f = merge a b ~f 
~comparator 365 | 366 | let merge_skewed a b ~combine = merge_skewed a b ~combine ~comparator 367 | 368 | let min_elt = min_elt 369 | 370 | let min_elt_exn = min_elt_exn 371 | 372 | let max_elt = max_elt 373 | 374 | let max_elt_exn = max_elt_exn 375 | 376 | let for_all = for_all 377 | 378 | let for_alli = for_alli 379 | 380 | let exists = exists 381 | 382 | let existsi = existsi 383 | 384 | let count = count 385 | 386 | let counti = counti 387 | 388 | let split a b = split a b ~comparator 389 | 390 | let split_le_gt a b = split_le_gt a b ~comparator 391 | 392 | let split_lt_ge a b = split_lt_ge a b ~comparator 393 | 394 | let append ~lower_part ~upper_part = append ~lower_part ~upper_part ~comparator 395 | 396 | let subrange t ~lower_bound ~upper_bound = subrange t ~lower_bound ~upper_bound ~comparator 397 | 398 | let fold_range_inclusive t ~min ~max ~init ~f = fold_range_inclusive t ~min ~max ~init ~f ~comparator 399 | 400 | let range_to_alist t ~min ~max = range_to_alist t ~min ~max ~comparator 401 | 402 | let closest_key a b c = closest_key a b c ~comparator 403 | 404 | let nth = nth 405 | 406 | let nth_exn = nth_exn 407 | 408 | let rank a b = rank a b ~comparator 409 | 410 | let to_sequence ?order ?keys_greater_or_equal_to ?keys_less_or_equal_to t = 411 | to_sequence ~comparator ?order ?keys_greater_or_equal_to ?keys_less_or_equal_to t 412 | 413 | let binary_search t ~compare how v = binary_search ~comparator t ~compare how v 414 | 415 | let binary_search_segmented t ~segment_of how = binary_search_segmented ~comparator t ~segment_of how 416 | 417 | let binary_search_subrange t ~compare ~lower_bound ~upper_bound = 418 | binary_search_subrange ~comparator t ~compare ~lower_bound ~upper_bound 419 | 420 | module Make_applicative_traversals (A : Applicative.Lazy_applicative) = struct 421 | module Traversals = Make_applicative_traversals (A) 422 | 423 | let mapi = Traversals.mapi 424 | 425 | let filter_mapi = Traversals.filter_mapi 426 | end 427 | 428 | let key_set 
t = Using_comparator.key_set_of_tree ~comparator t 429 | 430 | let map_keys t ~f = map_keys t ~f ~comparator 431 | 432 | let map_keys_exn t ~f = map_keys_exn t ~f ~comparator 433 | 434 | let transpose_keys t = transpose_keys ~comparator ~comparator t 435 | end 436 | 437 | module Make_tree_plain (Key : sig 438 | type t [@@deriving sexp_of] 439 | 440 | include Comparator.S with type t := t 441 | end) = 442 | struct 443 | module Key_S1 = Comparator.S_to_S1 (Key) 444 | include Make_tree_S1 (Key_S1) 445 | 446 | type +'v t = (Key.t, 'v, Key.comparator_witness) Tree0.t 447 | 448 | let sexp_of_t sexp_of_v t = sexp_of_t Key.sexp_of_t sexp_of_v [%sexp_of: _] t 449 | 450 | module Provide_of_sexp 451 | (X : sig 452 | type t [@@deriving of_sexp] 453 | end 454 | with type t := Key.t) = 455 | struct 456 | let t_of_sexp v_of_sexp sexp = t_of_sexp X.t_of_sexp v_of_sexp sexp 457 | end 458 | end 459 | 460 | module Make_tree (Key : sig 461 | type t [@@deriving sexp] 462 | 463 | include Comparator.S with type t := t 464 | end) = 465 | struct 466 | include Make_tree_plain (Key) 467 | include Provide_of_sexp (Key) 468 | end 469 | 470 | module type Key_plain = sig 471 | type t [@@deriving sexp_of] 472 | 473 | val compare : t -> t -> int 474 | end 475 | 476 | module type Key = sig 477 | type t [@@deriving sexp] 478 | 479 | val compare : t -> t -> int 480 | end 481 | 482 | module Make_plain_using_comparator (Key : sig 483 | type t [@@deriving sexp_of] 484 | 485 | include Comparator.S with type t := t 486 | end) = 487 | struct 488 | module Key = Key 489 | module Key_S1 = Comparator.S_to_S1 (Key) 490 | include Creators (Key_S1) 491 | 492 | type key = Key.t 493 | 494 | type ('a, 'b, 'c) map = ('a, 'b, 'c) t 495 | 496 | type 'v t = (key, 'v, Key.comparator_witness) map 497 | 498 | include Accessors 499 | 500 | let compare cmpv t1 t2 = compare_direct cmpv t1 t2 501 | 502 | let sexp_of_t sexp_of_v t = Using_comparator.sexp_of_t Key.sexp_of_t sexp_of_v [%sexp_of: _] t 503 | 504 | module 
Provide_of_sexp 505 | (Key : sig 506 | type t [@@deriving of_sexp] 507 | end 508 | with type t := Key.t) = 509 | struct 510 | let t_of_sexp v_of_sexp sexp = t_of_sexp Key.t_of_sexp v_of_sexp sexp 511 | end 512 | 513 | module Provide_hash (Key' : Hasher.S with type t := Key.t) = struct 514 | let hash_fold_t (type a) hash_fold_data state (t : a t) = 515 | Using_comparator.hash_fold_direct Key'.hash_fold_t hash_fold_data state t 516 | end 517 | end 518 | 519 | module Make_plain (Key : Key_plain) = Make_plain_using_comparator (struct 520 | include Key 521 | include Comparator.Make (Key) 522 | end) 523 | 524 | module Make_using_comparator (Key_sexp : sig 525 | type t [@@deriving sexp] 526 | 527 | include Comparator.S with type t := t 528 | end) = 529 | struct 530 | include Make_plain_using_comparator (Key_sexp) 531 | module Key = Key_sexp 532 | include Provide_of_sexp (Key) 533 | 534 | module _ = struct 535 | include Tree0 536 | include Provide_of_sexp (Key) 537 | end 538 | end 539 | 540 | module Make (Key : Key) = Make_using_comparator (struct 541 | include Key 542 | include Comparator.Make (Key) 543 | end) 544 | 545 | module Tree = struct 546 | include Tree0 547 | 548 | let of_hashtbl_exn = Using_comparator.tree_of_hashtbl_exn 549 | 550 | let key_set = Using_comparator.key_set_of_tree 551 | 552 | let of_key_set = Using_comparator.tree_of_key_set 553 | end 554 | -------------------------------------------------------------------------------- /src/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name tdigest) 3 | (public_name tdigest) 4 | (libraries 5 | base 6 | ) 7 | (preprocess (pps 8 | ppx_sexp_conv 9 | )) 10 | ) 11 | 12 | (env 13 | (dev 14 | (flags (:standard -warn-error -A)))) 15 | -------------------------------------------------------------------------------- /src/tdigest.ml: -------------------------------------------------------------------------------- 1 | open! 
Base 2 | module Map = Cmap 3 | open Option.Monad_infix 4 | 5 | type delta = 6 | | Merging of float 7 | | Discrete 8 | [@@deriving sexp] 9 | 10 | type k = 11 | | Manual 12 | | Automatic of float 13 | [@@deriving sexp] 14 | 15 | type cx = 16 | | Always 17 | | Growth of float 18 | [@@deriving sexp] 19 | 20 | let default_delta = Merging 0.01 21 | 22 | let default_k = Automatic 25.0 23 | 24 | let default_cx = Growth 1.1 25 | 26 | type settings = { 27 | delta: delta; 28 | k: k; 29 | cx: cx; 30 | k_delta: float option; 31 | } 32 | 33 | type centroid = { 34 | mean: float; 35 | cumn: float; 36 | mean_cumn: float; 37 | n: float; 38 | } 39 | 40 | let empty_centroid = { mean = 0.0; n = 0.0; cumn = 0.0; mean_cumn = 0.0 } 41 | 42 | type stats = { 43 | cumulates_count: int; 44 | compress_count: int; 45 | auto_compress_count: int; 46 | } 47 | [@@deriving sexp] 48 | 49 | let empty_stats = { cumulates_count = 0; compress_count = 0; auto_compress_count = 0 } 50 | 51 | type info = { 52 | count: int; 53 | size: int; 54 | cumulates_count: int; 55 | compress_count: int; 56 | auto_compress_count: int; 57 | } 58 | [@@deriving sexp] 59 | 60 | module type M = sig 61 | type 'a t 62 | 63 | val empty : 'a t 64 | 65 | module Map : sig 66 | val is_empty : 'a t -> bool 67 | 68 | val length : 'a t -> int 69 | 70 | val min_elt : 'a t -> (float * 'a) option 71 | 72 | val max_elt : 'a t -> (float * 'a) option 73 | 74 | val iter : 'a t -> f:('a -> unit) -> unit 75 | 76 | val map : 'a t -> f:('a -> 'b) -> 'b t 77 | 78 | val fold : 'a t -> init:'b -> f:(key:float -> data:'a -> 'b -> 'b) -> 'b 79 | 80 | val fold_right : 'a t -> init:'b -> f:(key:float -> data:'a -> 'b -> 'b) -> 'b 81 | 82 | val binary_search : 83 | 'a t -> 84 | compare:(key:float -> data:'a -> float -> int) -> 85 | Binary_searchable.Which_target_by_key.t -> 86 | float -> 87 | (float * 'a) option 88 | 89 | val add_exn : 'a t -> key:float -> data:'a -> 'a t 90 | 91 | val remove : 'a t -> float -> 'a t 92 | end 93 | end 94 | 95 | module Make 
(M : M) = struct 96 | open Float 97 | 98 | type t = { 99 | settings: settings; 100 | centroids: centroid M.t; 101 | mutable min: centroid option; 102 | mutable max: centroid option; 103 | n: float; 104 | last_cumulate: float; 105 | stats: stats; 106 | } 107 | 108 | let get_min = function 109 | | { min = Some _ as x; _ } -> x 110 | | { min = None; n = 0.0; _ } -> None 111 | | { min = None; _ } as td -> 112 | let min = M.Map.min_elt td.centroids >>| snd in 113 | td.min <- min; 114 | min 115 | 116 | let get_max = function 117 | | { max = Some _ as x; _ } -> x 118 | | { max = None; n = 0.0; _ } -> None 119 | | { max = None; _ } as td -> 120 | let max = M.Map.max_elt td.centroids >>| snd in 121 | td.max <- max; 122 | max 123 | 124 | let get_k_delta = function 125 | | Automatic k, Merging delta -> Some (k / delta) 126 | | _ -> None 127 | 128 | let create ?(delta = default_delta) ?(k = default_k) ?(cx = default_cx) () = 129 | let k = 130 | match k with 131 | | Manual -> k 132 | | Automatic x when is_positive x -> k 133 | | Automatic 0.0 -> 134 | invalid_arg 135 | "TDigest.create: k parameter cannot be zero, set to Tdigest.Manual to disable automatic \ 136 | compression." 137 | | Automatic x -> Printf.invalid_argf "TDigest k parameter must be positive, but was %f" x () 138 | in 139 | let cx = 140 | match cx with 141 | | Always -> cx 142 | | Growth x when is_positive x -> cx 143 | | Growth 0.0 -> 144 | invalid_arg 145 | "TDigest.create: cx parameter cannot be zero, set to Tdigest.Always to disable caching of \ 146 | cumulative totals." 
147 | | Growth x -> Printf.invalid_argf "TDigest.create: cx parameter must be positive, but was %f" x () 148 | in 149 | { 150 | settings = { delta; k; cx; k_delta = get_k_delta (k, delta) }; 151 | centroids = M.empty; 152 | min = None; 153 | max = None; 154 | n = 0.0; 155 | last_cumulate = 0.0; 156 | stats = empty_stats; 157 | } 158 | 159 | let is_empty { centroids; _ } = M.Map.is_empty centroids 160 | 161 | let info { centroids; n; stats; _ } = 162 | { 163 | count = to_int n; 164 | size = M.Map.length centroids; 165 | cumulates_count = stats.cumulates_count; 166 | compress_count = stats.compress_count; 167 | auto_compress_count = stats.auto_compress_count; 168 | } 169 | 170 | let find_nearest td mean = 171 | let gt = ref None in 172 | let lte = 173 | M.Map.binary_search td.centroids `Last_less_than_or_equal_to mean 174 | ~compare:(fun ~key ~data against -> 175 | let x = compare key against in 176 | if Int.is_positive x then gt := Some (key, data); 177 | x ) 178 | in 179 | match lte with 180 | | Some (k, v) when mean = k -> Some v 181 | | Some (k1, v1) -> ( 182 | match !gt with 183 | | None -> None 184 | | Some (k2, _v2) when mean - k1 < k2 - mean -> Some v1 185 | | Some (_k2, v2) -> Some v2 ) 186 | | None -> None 187 | 188 | let use_cache = function 189 | | { n; last_cumulate; settings = { cx = Growth cx; _ }; _ } when cx > n / last_cumulate -> true 190 | | _ -> false 191 | 192 | let cumulate td ~exact = 193 | if td.n = td.last_cumulate || ((not exact) && use_cache td) 194 | then td 195 | else ( 196 | let cumn = ref 0.0 in 197 | let centroids = 198 | M.Map.map td.centroids ~f:(fun data -> 199 | let updated = { data with mean_cumn = !cumn + (data.n / 2.); cumn = !cumn + data.n } in 200 | cumn := updated.cumn; 201 | updated ) 202 | in 203 | { 204 | td with 205 | centroids; 206 | min = None; 207 | max = None; 208 | n = !cumn; 209 | last_cumulate = !cumn; 210 | stats = { td.stats with cumulates_count = Int.succ td.stats.cumulates_count }; 211 | } ) 212 | 213 | let 
new_bounds ({ mean; _ } as added) = function 214 | | { n = 0.0; min = None; max = None; _ } -> Some added, Some added 215 | | { min = Some existing; max; _ } when mean < existing.mean -> Some added, max 216 | | { min; max = Some existing; _ } when mean > existing.mean -> min, Some added 217 | | { min; max; _ } -> min, max 218 | 219 | let new_centroid td ~mean ~n ~cumn = 220 | let data = { mean; cumn; n; mean_cumn = n / 2. } in 221 | let centroids = M.Map.add_exn td.centroids ~key:mean ~data in 222 | let min, max = new_bounds data td in 223 | { td with centroids; min; max; n = td.n + n } 224 | 225 | let add_weight td nearest ~mean ~n = 226 | let updated = 227 | { 228 | mean = 229 | ( if nearest.mean = mean 230 | then nearest.mean 231 | else nearest.mean + (n * (mean - nearest.mean) / (nearest.n + n)) ); 232 | cumn = nearest.cumn + n; 233 | mean_cumn = nearest.mean_cumn + (n / 2.); 234 | n = nearest.n + n; 235 | } 236 | in 237 | let centroids = 238 | M.Map.remove td.centroids nearest.mean |> M.Map.add_exn ~key:updated.mean ~data:updated 239 | in 240 | { td with centroids; n = td.n + n; min = None; max = None } 241 | 242 | let internal_digest td ~n ~mean = 243 | let nearest_is_boundary boundary nearest = 244 | Option.value_map boundary ~default:false ~f:(fun { mean; _ } -> mean = nearest.mean) 245 | in 246 | let td = 247 | match find_nearest td mean, td.settings.delta with 248 | | Some nearest, _ when nearest.mean = mean -> add_weight td nearest ~mean ~n 249 | | Some nearest, _ when nearest_is_boundary (get_min td) nearest -> 250 | new_centroid td ~mean ~n ~cumn:0.0 251 | | Some nearest, _ when nearest_is_boundary (get_max td) nearest -> 252 | new_centroid td ~mean ~n ~cumn:td.n 253 | | Some nearest, Discrete -> new_centroid td ~mean ~n ~cumn:nearest.cumn 254 | | Some nearest, Merging delta -> 255 | let p = nearest.mean_cumn / td.n in 256 | let max_n = round_down (4.0 * td.n * delta * p * (1.0 - p)) in 257 | if max_n - nearest.n >= n 258 | then add_weight td nearest 
~mean ~n 259 | else new_centroid td ~mean ~n ~cumn:nearest.cumn 260 | | None, _ -> new_centroid td ~mean ~n ~cumn:0.0 261 | in 262 | cumulate td ~exact:false 263 | 264 | let weights_of_td = function 265 | (* n is out of sync, must check centroids *) 266 | | { centroids; _ } when M.Map.is_empty centroids -> [||] 267 | | { centroids; _ } -> 268 | let arr = Array.create ~len:(M.Map.length centroids) empty_centroid in 269 | let _i = 270 | M.Map.fold centroids ~init:0 ~f:(fun ~key:_ ~data i -> 271 | arr.(i) <- data; 272 | Int.succ i ) 273 | in 274 | arr 275 | 276 | let weights_of_table table = 277 | let arr = Array.create ~len:(Hashtbl.length table) empty_centroid in 278 | let _i = 279 | Hashtbl.fold table ~init:0 ~f:(fun ~key:mean ~data:n i -> 280 | arr.(i) <- { empty_centroid with mean; n }; 281 | Int.succ i ) 282 | in 283 | arr 284 | 285 | let rebuild ~auto settings (stats : stats) arr = 286 | Array.permute arr; 287 | let blank = 288 | { 289 | settings; 290 | centroids = M.empty; 291 | min = None; 292 | max = None; 293 | n = 0.0; 294 | last_cumulate = 0.0; 295 | stats = 296 | { 297 | stats with 298 | compress_count = Int.succ stats.compress_count; 299 | auto_compress_count = (if auto then Int.succ else Fn.id) stats.auto_compress_count; 300 | }; 301 | } 302 | in 303 | let td = Array.fold arr ~init:blank ~f:(fun acc { mean; n; _ } -> internal_digest acc ~n ~mean) in 304 | cumulate td ~exact:true 305 | 306 | let digest ?(n = 1) td ~mean = 307 | let td = internal_digest td ~n:(of_int n) ~mean in 308 | match td.settings with 309 | | { k_delta = Some kd; _ } when M.Map.length td.centroids |> of_int > kd -> 310 | rebuild ~auto:true td.settings td.stats (weights_of_td td) 311 | | _ -> td 312 | 313 | let add ?(n = 1) ~data td = 314 | if Int.(n <= 0) then invalid_arg "Tdigest.add: n <= 0"; 315 | digest td ~n ~mean:data 316 | 317 | let add_list ?(n = 1) xs td = 318 | if Int.(n <= 0) then invalid_arg "Tdigest.add_list: n <= 0"; 319 | List.fold xs ~init:td ~f:(fun acc mean -> 
digest acc ~n ~mean) 320 | 321 | let compress ?delta td = 322 | match delta with 323 | | None -> rebuild ~auto:false td.settings td.stats (weights_of_td td) 324 | | Some delta -> 325 | let settings = td.settings in 326 | let updated = rebuild ~auto:false { td.settings with delta } td.stats (weights_of_td td) in 327 | { updated with settings } 328 | 329 | let to_string td = 330 | let buf = Bytes.create (M.Map.length td.centroids |> Int.( * ) 16) in 331 | let add_float pos ~data:f = 332 | let v = Int64.bits_of_float f in 333 | let rec loop pos = function 334 | | 8 -> pos 335 | | i -> 336 | Bytes.set buf pos Int64.(255L land shift_right v Int.(i * 8) |> to_int_exn |> Char.of_int_exn); 337 | (loop [@tailcall]) (Int.succ pos) (Int.succ i) 338 | in 339 | loop pos 0 340 | in 341 | let _pos = 342 | M.Map.fold td.centroids ~init:0 ~f:(fun ~key:_ ~data:{ mean; n; _ } pos -> 343 | add_float pos ~data:mean |> add_float ~data:n ) 344 | in 345 | td, Bytes.unsafe_to_string ~no_mutation_while_string_reachable:buf 346 | 347 | let parse_float str pos = 348 | let open Int64 in 349 | let next off = String.get str Int.(pos + off) |> Char.to_int |> of_int_exn in 350 | (String.get str pos |> Char.to_int |> of_int_exn) 351 | lor shift_left (next 1) 8 352 | lor shift_left (next 2) 16 353 | lor shift_left (next 3) 24 354 | lor shift_left (next 4) 32 355 | lor shift_left (next 5) 40 356 | lor shift_left (next 6) 48 357 | lor shift_left (next 7) 56 358 | |> float_of_bits 359 | 360 | let of_string ?(delta = default_delta) ?(k = default_k) ?(cx = default_cx) str = 361 | if Int.(String.length str % 16 <> 0) then invalid_arg "Tdigest.of_string: invalid string length"; 362 | let settings = { delta; k; cx; k_delta = get_k_delta (k, delta) } in 363 | let table = Hashtbl.create (module Float) in 364 | let rec loop = function 365 | | pos when Int.(pos = String.length str) -> () 366 | | pos -> 367 | let mean = parse_float str pos in 368 | let n = parse_float str Int.(pos + 8) in 369 | Hashtbl.update 
table mean ~f:(Option.value_map ~default:n ~f:(( + ) n)); 370 | (loop [@tailcall]) Int.(pos + 16) 371 | in 372 | loop 0; 373 | weights_of_table table |> rebuild ~auto:true settings empty_stats 374 | 375 | module Export = struct 376 | type td_t = t 377 | 378 | type settings = { 379 | delta: delta; 380 | k: k; 381 | cx: cx; 382 | } 383 | [@@deriving sexp] 384 | 385 | type t = { 386 | settings: settings; 387 | state: string; 388 | stats: stats; 389 | } 390 | [@@deriving sexp] 391 | 392 | let create ({ settings = { delta; k; cx; _ }; stats; _ } as td : td_t) = 393 | { settings = { delta; k; cx }; state = to_string td |> snd; stats } 394 | end 395 | 396 | let sexp_of_t td = Export.create td |> [%sexp_of: Export.t] 397 | 398 | let t_of_sexp sexp = 399 | let Export.{ settings = { delta; k; cx }; state; stats } = [%of_sexp: Export.t] sexp in 400 | { (of_string ~delta ~k ~cx state) with stats } 401 | 402 | let merge ?(delta = default_delta) ?(k = default_k) ?(cx = default_cx) tds = 403 | let settings = { delta; k; cx; k_delta = get_k_delta (k, delta) } in 404 | let table = Hashtbl.create (module Float) in 405 | List.iter tds ~f:(fun { centroids; _ } -> 406 | M.Map.iter centroids ~f:(fun { mean; n; _ } -> 407 | Hashtbl.update table mean ~f:(Option.value_map ~default:n ~f:(( + ) n)) ) ); 408 | weights_of_table table |> rebuild ~auto:true settings empty_stats 409 | 410 | type bounds = 411 | | Neither 412 | | Both of centroid * centroid 413 | | Equal of centroid 414 | | Lower of centroid 415 | | Upper of centroid 416 | 417 | let bounds td needle lens = 418 | let gt = ref None in 419 | let lte = 420 | M.Map.binary_search td.centroids `Last_less_than_or_equal_to needle 421 | ~compare:(fun ~key ~data against -> 422 | let x = compare (lens data) against in 423 | if Int.is_positive x then gt := Some (key, data); 424 | x ) 425 | in 426 | match lte with 427 | | Some (_k, v) when lens v = needle -> Equal v 428 | | Some (_k1, v1) -> ( 429 | match !gt with 430 | | Some (_k2, v2) -> Both 
(v1, v2) 431 | | None -> Lower v1 ) 432 | | None -> ( 433 | match get_min td with 434 | | Some v -> Upper v 435 | | None -> Neither ) 436 | 437 | let percentile td p = 438 | match td with 439 | | { n = 0.0; _ } -> td, None 440 | | td -> ( 441 | let td = cumulate td ~exact:true in 442 | let h = td.n * p in 443 | match bounds td h (fun { mean_cumn; _ } -> mean_cumn), td.settings.delta with 444 | | Lower x, _ 445 | |Upper x, _ 446 | |Equal x, _ -> 447 | td, Some x.mean 448 | | Both (lower, upper), Merging _ -> 449 | let num = 450 | lower.mean 451 | + ((h - lower.mean_cumn) * (upper.mean - lower.mean) / (upper.mean_cumn - lower.mean_cumn)) 452 | in 453 | td, Some num 454 | | Both (lower, _upper), Discrete when h <= lower.cumn -> td, Some lower.mean 455 | | Both (_lower, upper), Discrete -> td, Some upper.mean 456 | | Neither, _ -> td, None ) 457 | 458 | let percentiles td ps = List.fold_map ps ~init:td ~f:percentile 459 | 460 | let p_rank td p = 461 | match get_min td with 462 | | None -> td, None 463 | | Some v when p < v.mean -> td, Some 0.0 464 | | Some _ -> ( 465 | match get_max td with 466 | | None -> td, None 467 | | Some v when p > v.mean -> td, Some 1.0 468 | | Some _ -> ( 469 | let td = cumulate td ~exact:true in 470 | match bounds td p (fun { mean; _ } -> mean), td.settings.delta with 471 | | Both (lower, _), Discrete 472 | |Lower lower, Discrete 473 | |Equal lower, Discrete -> 474 | td, Some (lower.cumn / td.n) 475 | | Neither, Discrete 476 | |Upper _, Discrete -> 477 | td, None 478 | | Equal x, Merging _ -> td, Some (x.mean_cumn / td.n) 479 | | Both (lower, upper), Merging _ -> 480 | let num = 481 | lower.mean_cumn 482 | + ((p - lower.mean) * (upper.mean_cumn - lower.mean_cumn) / (upper.mean - lower.mean)) 483 | in 484 | td, Some (num / td.n) 485 | | _, Merging _ -> td, None ) ) 486 | 487 | let p_ranks td ps = List.fold_map ps ~init:td ~f:p_rank 488 | 489 | module Private = struct 490 | let centroids td = 491 | M.Map.fold_right td.centroids ~init:[] ~f:(fun 
~key:_ ~data:{ mean; n; _ } acc -> (mean, n) :: acc) 492 | 493 | let min td = get_min td >>| fun { mean; n; _ } -> mean, n 494 | 495 | let max td = get_max td >>| fun { mean; n; _ } -> mean, n 496 | end 497 | end 498 | 499 | module M = Make (struct 500 | type 'a t = (float, 'a, Float.comparator_witness) Map.t 501 | 502 | let empty = Map.empty (module Float) 503 | 504 | module Map = Map 505 | end) 506 | 507 | module Marshallable = Make (struct 508 | type 'a t = (float, 'a, Float.comparator_witness) Map.Tree.t 509 | 510 | module Map = Map.Make_tree (Float) 511 | 512 | let empty = Map.empty 513 | end) 514 | 515 | include M 516 | -------------------------------------------------------------------------------- /src/tdigest.mli: -------------------------------------------------------------------------------- 1 | open! Base 2 | 3 | (** 4 | [delta] is the compression factor, the max fraction of mass that can be owned by one centroid (bigger, up to 1.0, means more compression). 5 | [~delta:Discrete] switches off TDigest behavior and treats the distribution as discrete, with no merging and exact values reported. 6 | *) 7 | type delta = 8 | | Merging of float 9 | | Discrete 10 | [@@deriving sexp] 11 | 12 | (** 13 | [k] is a size threshold that triggers recompression as the TDigest grows during input. 14 | [~k:Manual] disables automatic recompression. 15 | *) 16 | type k = 17 | | Manual 18 | | Automatic of float 19 | [@@deriving sexp] 20 | 21 | (** 22 | [cx] (default: [1.1]) specifies how often to update cached cumulative totals used for quantile estimation during ingest. 23 | This is a tradeoff between performance and accuracy. 24 | [~cx:Always] will recompute cumulatives on every new datapoint, but the performance drops by 15-25x or even more depending on the size of the dataset. 25 | *) 26 | type cx = 27 | | Always 28 | | Growth of float 29 | [@@deriving sexp] 30 | 31 | (** 32 | [count]: sum of all [n] 33 | 34 | [size]: size of the internal B-Tree. 
Calling [Tdigest.compress] will usually reduce this size. 35 | 36 | [cumulates_count]: number of cumulate operations over the life of this Tdigest instance. 37 | 38 | [compress_count]: number of compression operations over the life of this Tdigest instance. 39 | 40 | [auto_compress_count]: number of compression operations over the life of this Tdigest instance that were not triggered by a manual call to [Tdigest.compress]. 41 | *) 42 | type info = { 43 | count: int; 44 | size: int; 45 | cumulates_count: int; 46 | compress_count: int; 47 | auto_compress_count: int; 48 | } 49 | [@@deriving sexp] 50 | 51 | module M : sig 52 | type t [@@deriving sexp] 53 | 54 | (** 55 | [Tdigest.create ?delta ?k ?cx ()] 56 | 57 | Allocate an empty Tdigest instance. 58 | 59 | [delta] (default: [0.01]) is the compression factor, the max fraction of mass that can be owned by one centroid (bigger, up to 1.0, means more compression). 60 | [~delta:Discrete] switches off TDigest behavior and treats the distribution as discrete, with no merging and exact values reported. 61 | 62 | [k] (default: [25]) is a size threshold that triggers recompression as the TDigest grows during input. 63 | [~k:Manual] disables automatic recompression. 64 | 65 | [cx] (default: [1.1]) specifies how often to update cached cumulative totals used for quantile estimation during ingest. 66 | This is a tradeoff between performance and accuracy. 67 | [~cx:Always] will recompute cumulatives on every new datapoint, but the performance drops by 15-25x or even more depending on the size of the dataset. 68 | *) 69 | val create : ?delta:delta -> ?k:k -> ?cx:cx -> unit -> t 70 | 71 | (** 72 | [Tdigest.is_empty td] returns [true] when the T-Digest does not contain any values. 73 | *) 74 | val is_empty : t -> bool 75 | 76 | (** 77 | [Tdigest.info td] returns a record with these fields: 78 | 79 | [count]: sum of all [n] 80 | 81 | [size]: size of the internal B-Tree. Calling [Tdigest.compress] will usually reduce this size. 
82 | 83 | [cumulates_count]: number of cumulate operations over the life of this Tdigest instance. 84 | 85 | [compress_count]: number of compression operations over the life of this Tdigest instance. 86 | 87 | [auto_compress_count]: number of compression operations over the life of this Tdigest instance that were not triggered by a manual call to [Tdigest.compress]. 88 | *) 89 | val info : t -> info 90 | 91 | (** 92 | [Tdigest.add ?n ~data td] 93 | 94 | Incorporate a value ([data]) having count [n] (default: [1]) into a new Tdigest. 95 | *) 96 | val add : ?n:int -> data:float -> t -> t 97 | 98 | (** 99 | [Tdigest.add_list ?n ll td] 100 | 101 | Incorporate a list of values each having count [n] (default: [1]) into a new Tdigest. 102 | *) 103 | val add_list : ?n:int -> float list -> t -> t 104 | 105 | (** 106 | [Tdigest.merge ?delta ?k ?cx tdigests] 107 | 108 | Efficiently combine multiple Tdigests into a new one. 109 | *) 110 | val merge : ?delta:delta -> ?k:k -> ?cx:cx -> t list -> t 111 | 112 | (** 113 | [Tdigest.p_rank td q] 114 | For a value [q] estimate the percentage ([0..1]) of values [<= q]. 115 | 116 | Returns a new Tdigest to reuse intermediate computations. 117 | *) 118 | val p_rank : t -> float -> t * float option 119 | 120 | (** 121 | Same as [Tdigest.p_rank] but for a list of values. 122 | 123 | Returns a new Tdigest to reuse intermediate computations. 124 | *) 125 | val p_ranks : t -> float list -> t * float option list 126 | 127 | (** 128 | [Tdigest.percentile td p] 129 | 130 | For a percentage [p] ([0..1]) estimate the smallest value [q] at which at least [p] percent of the values [<= q]. 131 | 132 | For discrete distributions, this selects q using the Nearest Rank Method 133 | [https://en.wikipedia.org/wiki/Percentile#The_Nearest_Rank_method] 134 | 135 | For continuous distributions, interpolates data values between count-weighted bracketing means. 136 | 137 | Returns a new Tdigest to reuse intermediate computations. 
138 | *) 139 | val percentile : t -> float -> t * float option 140 | 141 | (** 142 | Same as [Tdigest.percentile] but for a list of values. 143 | 144 | Returns a new Tdigest to reuse intermediate computations. 145 | *) 146 | val percentiles : t -> float list -> t * float option list 147 | 148 | (** 149 | [Tdigest.compress ?delta td] 150 | 151 | Manual recompression. Not guaranteed to reduce size further if too few values have been added since the last compression. 152 | 153 | [delta] (default: initial value passed to [Tdigest.create]) The compression level to use for this operation only. This does not alter the [delta] used by the Tdigest going forward. 154 | *) 155 | val compress : ?delta:delta -> t -> t 156 | 157 | (** 158 | [Tdigest.to_string td] 159 | 160 | Serialize the internal state into a binary string that can be stored or concatenated with other such binary strings. 161 | 162 | Use [Tdigest.of_string] to create a new Tdigest instance from it. 163 | 164 | Returns a new Tdigest to reuse intermediate computations. 165 | *) 166 | val to_string : t -> t * string 167 | 168 | (** 169 | [Tdigest.of_string ?delta ?k ?cx str] 170 | 171 | See [Tdigest.create] for the meaning of the optional parameters. 172 | 173 | Allocate a new Tdigest from a string or concatenation of strings originally created by [Tdigest.to_string]. 174 | *) 175 | val of_string : ?delta:delta -> ?k:k -> ?cx:cx -> string -> t 176 | 177 | (** For internal use *) 178 | module Private : sig 179 | (** For internal use *) 180 | val centroids : t -> (float * float) list 181 | 182 | (** For internal use *) 183 | val min : t -> (float * float) option 184 | 185 | (** For internal use *) 186 | val max : t -> (float * float) option 187 | end 188 | end 189 | 190 | (** A marshallable version of this library. Approximately 5x slower than the non-marshallable version. 
*) 191 | module Marshallable : sig 192 | type t 193 | 194 | include module type of M with type t := t 195 | end 196 | 197 | include module type of M 198 | -------------------------------------------------------------------------------- /tdigest.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | maintainer: "Simon Grondin" 3 | authors: [ 4 | "Simon Grondin" 5 | "Will Welch" 6 | ] 7 | synopsis: "OCaml implementation of the T-Digest algorithm" 8 | description: """ 9 | The T-Digest is a data structure and algorithm for constructing an approximate distribution for a collection of real numbers presented as a stream. 10 | 11 | The T-Digest can estimate percentiles or quantiles extremely accurately even at the tails, while using a fraction of the space. 12 | 13 | Additionally, the T-Digest is concatenable, making it a good fit for distributed systems. The internal state of a T-Digest can be exported as a binary string, and the concatenation of any number of those strings can then be imported to form a new T-Digest. 
14 | """ 15 | license: "MIT" 16 | tags: [] 17 | homepage: "https://github.com/SGrondin/tdigest" 18 | dev-repo: "git://github.com/SGrondin/tdigest" 19 | doc: "https://github.com/SGrondin/tdigest" 20 | bug-reports: "https://github.com/SGrondin/tdigest/issues" 21 | depends: [ 22 | "ocaml" { >= "4.10.0" } 23 | "dune" { >= "1.9.0" } 24 | 25 | "base" { >= "v0.17.0" & < "v0.18.0" } 26 | "ppx_sexp_conv" { >= "v0.17.0" } 27 | 28 | "ppx_expect" { >= "v0.17.0" & with-test } 29 | "ppx_custom_printf" { >= "v0.17.0" & with-test } 30 | "ocamlformat" { = "0.25.1" & with-dev-setup } 31 | "ocaml-lsp-server" { with-dev-setup } 32 | ] 33 | build: ["dune" "build" "-p" name "-j" jobs] 34 | -------------------------------------------------------------------------------- /test/dune: -------------------------------------------------------------------------------- 1 | (library 2 | (name test) 3 | (inline_tests) 4 | (libraries 5 | tdigest 6 | ) 7 | (preprocess (pps 8 | ppx_expect 9 | ppx_sexp_conv 10 | ppx_custom_printf 11 | )) 12 | (ocamlopt_flags -Oclassic) 13 | ) 14 | -------------------------------------------------------------------------------- /test/shared.ml: -------------------------------------------------------------------------------- 1 | open! 
Base 2 | 3 | let sprintf = Printf.sprintf 4 | 5 | let print_endline = Stdlib.print_endline 6 | 7 | let render_centroids ll = 8 | List.map ll ~f:(fun (mean, n) -> ("mean", mean), ("n", n)) 9 | |> [%sexp_of: ((string * float) * (string * float)) list] 10 | 11 | let check td = Tdigest.Private.centroids td |> render_centroids |> Sexp.to_string_hum |> print_endline 12 | 13 | let checkm td = 14 | Tdigest.Marshallable.Private.centroids td |> render_centroids |> Sexp.to_string_hum |> print_endline 15 | 16 | let check_size td = (Tdigest.info td).size |> sprintf !"%{sexp: int}" |> print_endline 17 | 18 | let check_min_max td = 19 | (Tdigest.Private.min td, Tdigest.Private.max td) 20 | |> sprintf !"%{sexp: (float * float) option * (float * float) option}" 21 | |> print_endline 22 | 23 | let check_p_rank p td = Tdigest.p_rank td p |> snd |> sprintf !"%{sexp: float option}" |> print_endline 24 | 25 | let check_percentile p td = 26 | Tdigest.percentile td p |> snd |> sprintf !"%{sexp: float option}" |> print_endline 27 | 28 | let check_p_ranks ps td = 29 | Tdigest.p_ranks td ps |> snd |> sprintf !"%{sexp: float option list}" |> print_endline 30 | 31 | let check_percentiles ps td = 32 | Tdigest.percentiles td ps |> snd |> sprintf !"%{sexp: float option list}" |> print_endline 33 | 34 | let identical_sexp ~received ~expected = 35 | if not (Sexp.equal received expected) 36 | then print_endline (sprintf !"Not identical.\nReceived: %{Sexp}\nExpected: %{Sexp}" received expected) 37 | else print_endline "Identical" 38 | 39 | let expected_centroids td = 40 | let received = Tdigest.Private.centroids td |> render_centroids in 41 | let expected = List.init 100 ~f:(fun i -> i * 10 |> Int.to_float, 1.) |> render_centroids in 42 | identical_sexp ~received ~expected 43 | -------------------------------------------------------------------------------- /test/test_discrete.ml: -------------------------------------------------------------------------------- 1 | open! 
Base 2 | open Shared 3 | 4 | let%expect_test "discrete digests" = 5 | (* consumes increasing-valued points *) 6 | let () = 7 | let xs = List.init 100 ~f:(fun i -> i * 10 |> Float.of_int) in 8 | let td = Tdigest.create ~delta:Tdigest.Discrete () |> Tdigest.add_list xs in 9 | expected_centroids td; 10 | [%expect {| Identical |}] 11 | in 12 | 13 | (* consumes decreasing-valued points *) 14 | let () = 15 | let xs = List.init 100 ~f:(fun i -> (99 - i) * 10 |> Float.of_int) in 16 | let td = Tdigest.create ~delta:Tdigest.Discrete () |> Tdigest.add_list xs in 17 | expected_centroids td; 18 | [%expect {| Identical |}] 19 | in 20 | 21 | (* consumes same-valued points into a single point *) 22 | Tdigest.create ~delta:Tdigest.Discrete () |> Fn.apply_n_times ~n:100 (Tdigest.add ~data:1000.) |> check; 23 | [%expect {| (((mean 1000) (n 100))) |}]; 24 | 25 | (* selects a run of duplicates containing the percentile *) 26 | Tdigest.create ~delta:Tdigest.Discrete () 27 | |> Tdigest.add_list 28 | [ 29 | 5.; 30 | 0.; 31 | 0.; 32 | 8.; 33 | 0.; 34 | 0.; 35 | 0.; 36 | 0.; 37 | 0.; 38 | 0.; 39 | 0.; 40 | 0.; 41 | 0.; 42 | 0.; 43 | 0.; 44 | 0.; 45 | 0.; 46 | 0.; 47 | 3.; 48 | 0.; 49 | 0.; 50 | 0.; 51 | 0.; 52 | 6.; 53 | 1.; 54 | 0.; 55 | 6.; 56 | 5.; 57 | 3.; 58 | 6.; 59 | 1.; 60 | 1.; 61 | 0.; 62 | 0.; 63 | 1.; 64 | 1.; 65 | 0.; 66 | 0.; 67 | 1.; 68 | 0.; 69 | ] 70 | |> check_percentile 0.5; 71 | [%expect {| (0) |}]; 72 | 73 | (* handles multiple duplicates *) 74 | Tdigest.create ~delta:Tdigest.Discrete () 75 | |> Fn.apply_n_times ~n:10 (fun td -> 76 | td |> Tdigest.add ~data:0. |> Tdigest.add ~data:1. |> Tdigest.add ~data:0.5 ) 77 | |> check; 78 | [%expect {| (((mean 0) (n 10)) ((mean 0.5) (n 10)) ((mean 1) (n 10))) |}] 79 | 80 | let%expect_test "discrete percentile ranks" = 81 | (* from a single point *) 82 | Tdigest.create ~delta:Tdigest.Discrete () 83 | |> Tdigest.add ~data:0. 
84 | |> check_p_ranks [ -1.5; 0.; 0.5; 1.; 1.5 ]; 85 | [%expect {| ((0) (1) (1) (1) (1)) |}]; 86 | 87 | (* from two points *) 88 | Tdigest.create ~delta:Tdigest.Discrete () 89 | |> Tdigest.add_list [ 0.; 1. ] 90 | |> check_p_ranks [ -1.5; 0.; 0.5; 1.; 1.5 ]; 91 | [%expect {| ((0) (0.5) (0.5) (1) (1)) |}]; 92 | 93 | (* from three points *) 94 | Tdigest.create ~delta:Tdigest.Discrete () 95 | |> Tdigest.add_list [ -1.; 0.; 1. ] 96 | |> check_p_ranks [ -1.5; -1.; -0.5; 0.; 0.5; 1.; 1.5 ]; 97 | [%expect 98 | {| 99 | ((0) (0.33333333333333331) (0.33333333333333331) (0.66666666666666663) 100 | (0.66666666666666663) (1) (1)) |}]; 101 | 102 | (* from three points is same as from multiples of those points *) 103 | let () = 104 | let ps = [ -1.5; -1.; -0.5; 0.; 0.5; 1.; 1.5 ] in 105 | let td1 = Tdigest.create ~delta:Tdigest.Discrete () |> Tdigest.add_list [ 0.; 1.; -1. ] in 106 | check_p_ranks ps td1; 107 | let td2 = td1 |> Tdigest.add_list [ 0.; 1.; -1. ] |> Tdigest.add_list [ 0.; 1.; -1. ] in 108 | check_p_ranks ps td2; 109 | 110 | [%expect 111 | {| 112 | ((0) (0.33333333333333331) (0.33333333333333331) (0.66666666666666663) 113 | (0.66666666666666663) (1) (1)) 114 | ((0) (0.33333333333333331) (0.33333333333333331) (0.66666666666666663) 115 | (0.66666666666666663) (1) (1)) |}] 116 | in 117 | 118 | (* from four points away from the origin *) 119 | Tdigest.create ~delta:Tdigest.Discrete () 120 | |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] 121 | |> check_p_ranks [ 9.; 10.; 11.; 12.; 13.; 14. ]; 122 | [%expect {| ((0) (0.25) (0.5) (0.75) (1) (1)) |}]; 123 | 124 | (* from four points is same as from multiples of those points *) 125 | let () = 126 | let ps = [ 9.; 10.; 11.; 12.; 13.; 14. ] in 127 | let td1 = Tdigest.create ~delta:Tdigest.Discrete () |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] in 128 | check_p_ranks ps td1; 129 | let td2 = td1 |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] |> Tdigest.add_list [ 10.; 11.; 12.; 13. 
] in 130 | check_p_ranks ps td2; 131 | [%expect {| 132 | ((0) (0.25) (0.5) (0.75) (1) (1)) 133 | ((0) (0.25) (0.5) (0.75) (1) (1)) |}] 134 | in 135 | () 136 | 137 | let%expect_test "discrete percentiles" = 138 | (* from a single point *) 139 | Tdigest.create ~delta:Tdigest.Discrete () |> Tdigest.add ~data:0. |> check_percentiles [ 0.; 0.5; 1. ]; 140 | [%expect {| ((0) (0) (0)) |}]; 141 | 142 | (* from two points *) 143 | Tdigest.create ~delta:Tdigest.Discrete () 144 | |> Tdigest.add_list [ 0.; 10. ] 145 | |> check_percentiles [ 0.; 1 // 4; 1 // 2; 3 // 4; 1. ]; 146 | [%expect {| ((0) (0) (0) (10) (10)) |}]; 147 | 148 | (* from three points *) 149 | Tdigest.create ~delta:Tdigest.Discrete () 150 | |> Tdigest.add_list [ 0.; 5.; 10. ] 151 | |> check_percentiles [ 0.; 1 // 4; Float.(1. / 2.9); 1 // 2; 2 // 3; 3 // 4; 1. ]; 152 | [%expect {| ((0) (0) (5) (5) (5) (10) (10)) |}]; 153 | 154 | (* from four points away from the origin *) 155 | Tdigest.create ~delta:Tdigest.Discrete () 156 | |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] 157 | |> check_percentiles [ 0.; 1 // 4; 1 // 2; 3 // 4; 1. ]; 158 | [%expect {| ((10) (10) (11) (12) (13)) |}] 159 | -------------------------------------------------------------------------------- /test/test_tdigest.ml: -------------------------------------------------------------------------------- 1 | open! Base 2 | open Shared 3 | 4 | let%expect_test "T-Digests in which each point becomes a centroid" = 5 | (* consumes a point *) 6 | Tdigest.create () |> Tdigest.add ~data:0. |> check; 7 | [%expect {| (((mean 0) (n 1))) |}]; 8 | 9 | (* consumes two points *) 10 | Tdigest.create () |> Tdigest.add_list [ 0.; 1. ] |> check; 11 | [%expect {| (((mean 0) (n 1)) ((mean 1) (n 1))) |}]; 12 | 13 | (* consumes three points *) 14 | Tdigest.create () |> Tdigest.add_list [ 0.; 1.; -1. 
] |> check; 15 | [%expect {| (((mean -1) (n 1)) ((mean 0) (n 1)) ((mean 1) (n 1))) |}]; 16 | 17 | (* consumes increasing-valued points *) 18 | let () = 19 | let td = 20 | let xs = List.init 100 ~f:(fun i -> i * 10 |> Float.of_int) in 21 | Tdigest.create ~delta:(Tdigest.Merging 0.001) ~k:Tdigest.Manual () |> Tdigest.add_list xs 22 | in 23 | expected_centroids td; 24 | [%expect {| Identical |}] 25 | in 26 | 27 | (* consumes decreasing-valued points *) 28 | let () = 29 | let td = 30 | let xs = List.init 100 ~f:(fun i -> (99 - i) * 10 |> Float.of_int) in 31 | Tdigest.create ~delta:(Tdigest.Merging 0.001) ~k:Tdigest.Manual () |> Tdigest.add_list xs 32 | in 33 | expected_centroids td; 34 | [%expect {| Identical |}] 35 | in 36 | () 37 | 38 | let%expect_test "T-Digests in which points are merged into centroids" = 39 | (* consumes same-valued points into a single point *) 40 | Tdigest.create () |> Fn.apply_n_times ~n:100 (Tdigest.add ~data:1000.) |> check; 41 | [%expect {| (((mean 1000) (n 100))) |}]; 42 | 43 | (* handles multiple duplicates *) 44 | Tdigest.create ~delta:(Tdigest.Merging 1.) ~k:Tdigest.Manual ~cx:Tdigest.Always () 45 | |> Fn.apply_n_times ~n:10 (fun td -> 46 | td |> Tdigest.add ~data:0. |> Tdigest.add ~data:1. 
|> Tdigest.add ~data:0.5 ) 47 | |> check; 48 | [%expect {| (((mean 0) (n 10)) ((mean 0.5) (n 10)) ((mean 1) (n 10))) |}] 49 | 50 | let%expect_test "compress" = 51 | (* compresses points and preserves bounds *) 52 | let () = 53 | let xs = List.init 100 ~f:(fun i -> i * 10 |> Float.of_int) in 54 | let td = Tdigest.create ~delta:(Tdigest.Merging 0.001) ~k:Tdigest.Manual () |> Tdigest.add_list xs in 55 | (* must be 100 *) 56 | check_size td; 57 | 58 | let td = Tdigest.compress ~delta:(Tdigest.Merging 0.1) td in 59 | (* must be < 100 *) 60 | check_size td; 61 | check_min_max td; 62 | [%expect {| 63 | 100 64 | 44 65 | (((0 1)) ((990 1))) |}] 66 | in 67 | 68 | (* K automatically compresses during ingest *) 69 | let () = 70 | let td = 71 | Array.init 10_000 ~f:(fun i -> i * 10 |> Float.of_int) 72 | |> Array.fold ~init:(Tdigest.create ()) ~f:(fun td x -> Tdigest.add td ~data:x) 73 | in 74 | (* must be < 10_000 *) 75 | check_size td; 76 | check_min_max td; 77 | [%expect {| 78 | 2132 79 | (((0 1)) ((99990 1))) |}] 80 | in 81 | () 82 | 83 | let%expect_test "percentile ranks" = 84 | (* reports None when given no points *) 85 | Tdigest.create () |> check_p_rank 1.; 86 | [%expect {| () |}]; 87 | 88 | (* from a single point *) 89 | Tdigest.create () |> Tdigest.add ~data:0. |> check_p_ranks [ -0.5; 0.; 0.5; 1.; 1.5 ]; 90 | [%expect {| ((0) (0.5) (1) (1) (1)) |}]; 91 | 92 | (* from three points *) 93 | Tdigest.create () 94 | |> Tdigest.add_list [ -1.; 0.; 1. ] 95 | |> check_p_ranks [ -1.5; -1.0; -0.5; 0.; 0.5; 1.0; 1.5 ]; 96 | [%expect 97 | {| 98 | ((0) (0.16666666666666666) (0.33333333333333331) (0.5) (0.66666666666666663) 99 | (0.83333333333333337) (1)) |}]; 100 | 101 | (* from three points is same as from multiples of those points *) 102 | let () = 103 | let ps = [ -1.5; -1.0; -0.5; 0.; 0.5; 1.0; 1.5 ] in 104 | let td = Tdigest.create () |> Tdigest.add_list [ 0.; 1.; -1. ] in 105 | check_percentiles ps td; 106 | let td = td |> Tdigest.add_list [ 0.; 1.; -1. 
] |> Tdigest.add_list [ 0.; 1.; -1. ] in 107 | check_percentiles ps td; 108 | [%expect {| 109 | ((-1) (-1) (-1) (-1) (0) (1) (1)) 110 | ((-1) (-1) (-1) (-1) (0) (1) (1)) |}] 111 | in 112 | 113 | (* from four points away from the origin *) 114 | let () = 115 | Tdigest.create () 116 | |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] 117 | |> check_p_ranks [ 9.; 10.; 11.; 12.; 13.; 14. ]; 118 | [%expect {| ((0) (0.125) (0.375) (0.625) (0.875) (1)) |}] 119 | in 120 | 121 | (* from four points is same as from multiples of those points *) 122 | let () = 123 | let ps = [ 9.; 10.; 11.; 12.; 13.; 14. ] in 124 | let td = 125 | Tdigest.create ~delta:(Tdigest.Merging 0.) ~k:Tdigest.Manual () 126 | |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] 127 | in 128 | check_p_ranks ps td; 129 | let td = td |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] in 130 | check_p_ranks ps td; 131 | [%expect 132 | {| 133 | ((0) (0.125) (0.375) (0.625) (0.875) (1)) 134 | ((0) (0.125) (0.375) (0.625) (0.875) (1)) |}] 135 | in 136 | 137 | (* from lots of uniformly distributed points *) 138 | let () = 139 | let td = 140 | Tdigest.create () 141 | |> Fn.apply_n_times ~n:100_000 (fun td -> Tdigest.add td ~data:(Random.float 1.)) 142 | |> Tdigest.compress 143 | in 144 | let _i, max_err, _td = 145 | Fn.apply_n_times ~n:100 146 | (fun (i, max_err, td) -> 147 | let td, q = Tdigest.p_rank td i |> fun (x, y) -> x, Option.value_exn y in 148 | let m = Float.(max max_err (i - q |> abs)) in 149 | Float.(i + 0.01), m, td) 150 | (0.01, 0.0, td) 151 | in 152 | Float.to_string max_err |> print_endline; 153 | (* must be < 0.01 *) 154 | [%expect {| 0.0038269003338100016 |}] 155 | in 156 | 157 | (* from an exact match *) 158 | let () = 159 | Tdigest.create ~delta:(Tdigest.Merging 0.001) ~k:Tdigest.Manual () 160 | |> Fn.apply_n_times ~n:10 (Tdigest.add_list [ 10.; 20.; 30. 
]) 161 | |> check_p_rank 20.; 162 | [%expect {| (0.5) |}] 163 | in 164 | () 165 | 166 | let%expect_test "percentiles" = 167 | (* reports None when given no points *) 168 | Tdigest.create () |> check_percentile 0.5; 169 | [%expect {| () |}]; 170 | 171 | (* from a single point *) 172 | Tdigest.create () |> Tdigest.add ~data:0. |> check_percentiles [ 0.; 0.5; 1. ]; 173 | [%expect {| ((0) (0) (0)) |}]; 174 | 175 | (* from two points *) 176 | Tdigest.create () 177 | |> Tdigest.add_list [ 0.; 1. ] 178 | |> check_percentiles [ -1 // 4; 0.; 1 // 4; 1 // 2; 5 // 8; 3 // 4; 1.; 1.25 ]; 179 | [%expect {| ((0) (0) (0) (0.5) (0.75) (1) (1) (1)) |}]; 180 | 181 | (* from three points *) 182 | Tdigest.create () 183 | |> Tdigest.add_list [ 0.; 0.5; 1. ] 184 | |> check_percentiles [ 0.; 1 // 4; 1 // 2; 3 // 4; 1. ]; 185 | [%expect {| ((0) (0.125) (0.5) (0.875) (1)) |}]; 186 | 187 | (* from four points *) 188 | Tdigest.create () 189 | |> Tdigest.add_list [ 10.; 11.; 12.; 13. ] 190 | |> check_percentiles [ 0.; 1 // 4; 1 // 2; 3 // 4; 1. ]; 191 | [%expect {| ((10) (10.5) (11.5) (12.5) (13)) |}]; 192 | 193 | (* from lots of uniformly distributed points *) 194 | let () = 195 | let td = 196 | Tdigest.create () 197 | |> Fn.apply_n_times ~n:100_000 (fun td -> Tdigest.add td ~data:(Random.float 1.)) 198 | |> Tdigest.compress 199 | in 200 | let _i, max_err, _td = 201 | Fn.apply_n_times ~n:100 202 | (fun (i, max_err, td) -> 203 | let td, q = Tdigest.p_rank td i |> fun (x, y) -> x, Option.value_exn y in 204 | let m = Float.(max max_err (i - q |> abs)) in 205 | Float.(i + 0.01), m, td) 206 | (0.01, 0.0, td) 207 | in 208 | Float.to_string max_err |> print_endline; 209 | (* must be < 0.01 *) 210 | [%expect {| 0.0038269003338100016 |}] 211 | in 212 | () 213 | 214 | let%expect_test "serialization" = 215 | (* identical after recreating (to_string/of_string) *) 216 | let () = 217 | let xs = List.init 10 ~f:(fun _i -> Random.float 1.) 
in 218 | let td = Tdigest.create () |> Tdigest.add_list xs in 219 | let td1, export = Tdigest.to_string td in 220 | if String.length export <> 160 then failwith "export length <> 160"; 221 | let td2 = Tdigest.of_string export in 222 | check td1; 223 | check td2; 224 | [%expect 225 | {| 226 | (((mean 0.0278013320652886) (n 1)) ((mean 0.088701617893937754) (n 1)) 227 | ((mean 0.25409775932960554) (n 1)) ((mean 0.29053618718083241) (n 1)) 228 | ((mean 0.48715518185429985) (n 1)) ((mean 0.55401752229333867) (n 1)) 229 | ((mean 0.63535141231675607) (n 1)) ((mean 0.68236915472644311) (n 1)) 230 | ((mean 0.77208758810861389) (n 1)) ((mean 0.78255046724687982) (n 1))) 231 | (((mean 0.0278013320652886) (n 1)) ((mean 0.088701617893937754) (n 1)) 232 | ((mean 0.25409775932960554) (n 1)) ((mean 0.29053618718083241) (n 1)) 233 | ((mean 0.48715518185429985) (n 1)) ((mean 0.55401752229333867) (n 1)) 234 | ((mean 0.63535141231675607) (n 1)) ((mean 0.68236915472644311) (n 1)) 235 | ((mean 0.77208758810861389) (n 1)) ((mean 0.78255046724687982) (n 1))) |}] 236 | in 237 | 238 | (* identical after recreating (sexp) *) 239 | let () = 240 | let xs = List.init 10 ~f:(fun _i -> Random.float 1.) in 241 | let td1 = Tdigest.create () |> Tdigest.add_list xs in 242 | let td2 = [%sexp_of: Tdigest.t] td1 |> [%of_sexp: Tdigest.t] in 243 | print_endline (sprintf !"%{sexp#hum: Tdigest.t}" td1); 244 | print_endline (sprintf !"%{sexp#hum: Tdigest.t}" td2); 245 | check td1; 246 | check td2; 247 | [%expect 248 | {| 249 | ((settings ((delta (Merging 0.01)) (k (Automatic 25)) (cx (Growth 1.1)))) 250 | (state 251 | "E\160\000 Random.float 1.) 
in 272 | let td1 = Tdigest.Marshallable.create () |> Tdigest.Marshallable.add_list xs in 273 | let td2 : Tdigest.Marshallable.t = 274 | Stdlib.Marshal.to_string td1 [] |> fun s -> 275 | print_endline (sprintf "Length: %d" (String.length s)); 276 | Stdlib.Marshal.from_string s 0 277 | in 278 | print_endline (sprintf !"%{sexp#hum: Tdigest.Marshallable.t}" td1); 279 | print_endline (sprintf !"%{sexp#hum: Tdigest.Marshallable.t}" td2); 280 | checkm td1; 281 | checkm td2; 282 | [%expect 283 | {| 284 | Length: 531 285 | ((settings ((delta (Merging 0.01)) (k (Automatic 25)) (cx (Growth 1.1)))) 286 | (state 287 | "q\13752\204H\161?\000\000\000\000\000\000\240?)'\237\200\164\019\197?\000\000\000\000\000\000\240?\177\177\131SS\168\208?\000\000\000\000\000\000\240?\167\144'\003}/\224?\000\000\000\000\000\000\240?\182\188\187h\203o\225?\000\000\000\000\000\000\240?\133\132\245Gqp\228?\000\000\000\000\000\000\240?\130\246\223Pj\145\230?\000\000\000\000\000\000\240?\178 Tdigest.add_list (xs1 @ xs2) in 311 | let td2 = 312 | let a = Tdigest.create () |> Tdigest.add_list xs1 in 313 | let b = Tdigest.create () |> Tdigest.add_list xs2 in 314 | Tdigest.merge [ a; b ] 315 | in 316 | let ps = [ 0.0; 0.25; 0.50; 0.75; 1.0 ] in 317 | check_percentiles ps td1; 318 | check_percentiles ps td2; 319 | [%expect {| 320 | ((1) (3) (3.75) (6.75) (9)) 321 | ((1) (3) (3.75) (6.75) (9)) |}] 322 | 323 | let%expect_test "is_empty" = 324 | let xs = [ 3.0; 4.0; 3.5; 7.0 ] in 325 | let td = Tdigest.create () in 326 | Tdigest.is_empty td |> Bool.to_string |> print_endline; 327 | let td = Tdigest.add_list xs td in 328 | Tdigest.is_empty td |> Bool.to_string |> print_endline; 329 | [%expect {| 330 | true 331 | false |}] 332 | --------------------------------------------------------------------------------