├── .gitignore ├── CHANGES.md ├── LICENSE ├── README.md ├── bentov.opam ├── dune-project ├── lib ├── bentov.ml ├── bentov.mli └── dune ├── src ├── bt.ml └── dune └── test ├── dune ├── gaussian.ml └── test.ml /.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | *~ 3 | *.native 4 | *.byte 5 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | ## v1 (2020-06-24) 2 | 3 | * first `opam` release 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-present, Mika Illouz 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of EigenDog nor the names of its contributors may 15 | be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bentov 2 | 3 | An OCaml implementation of histogram-sketching algorithm described in 4 | [A Streaming Parallel Decision Tree 5 | Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf) 6 | by Yael Ben-Haim and Elad Tom-Tov. Included is a command-line utility 7 | `bt`, which can read a file (or stdin) containing numbers, one per 8 | line, and output a representation of the approximated distribution. 9 | 10 | For example, to approximate 10 quantiles of 1M data in U(0,1): 11 | 12 | ```sh 13 | echo "" | awk '{ for ( i=0 ; i < 1e6 ; i++ ) { print rand() } }' | bt -n 20 -u 10 14 | ``` 15 | 16 | In this example, the size of the approximating histogram is 20. For 17 | additional details, `bt --help` . 18 | 19 | To install: 20 | 21 | ```sh 22 | opam install bentov 23 | ``` 24 | 25 | # Documentation 26 | 27 | See https://barko.github.io/bentov 28 | 29 | # License 30 | 31 | BSD 32 | -------------------------------------------------------------------------------- /bentov.opam: -------------------------------------------------------------------------------- 1 | opam-version: "2.0" 2 | synopsis: "1D histogram sketching" 3 | description: 4 | "Bentov implements an algorithm which approximates a 1D histogram as data is streamed over it." 5 | maintainer: ["mika@illouz.net"] 6 | authors: ["Mika Illouz"] 7 | license: "BSD" 8 | homepage: "https://github.com/barko/bentov" 9 | doc: "https://barko.github.io/bentov/" 10 | bug-reports: "https://github.com/barko/bentov/issues" 11 | depends: [ 12 | "dune" {> "1.5"} 13 | "cmdliner" {>= "1.0.4"} 14 | "ocaml" {>= "4.08.0"} 15 | ] 16 | build: [ 17 | ["dune" "subst"] {pinned} 18 | [ 19 | "dune" 20 | "build" 21 | "-p" 22 | name 23 | "-j" 24 | jobs 25 | "@install" 26 | "@doc" {with-doc} 27 | ] 28 | ] 29 | 30 | dev-repo: "git+https://github.com/barko/bentov.git" 31 | -------------------------------------------------------------------------------- /dune-project: -------------------------------------------------------------------------------- 1 | (lang dune 2.5) 2 | (name bentov) 3 | -------------------------------------------------------------------------------- /lib/bentov.ml: -------------------------------------------------------------------------------- 1 | type bin = { 2 | center : float; 3 | count : int 4 | } 5 | 6 | type histogram = { 7 | max_bins : int; 8 | num_bins : int; 9 | bins : bin list; 10 | range : (float * float) option; (* (min * max) *) 11 | total_count : int; 12 | } 13 | 14 | let max_bins h = 15 | h.max_bins 16 | 17 | let num_bins h = 18 | h.num_bins 19 | 20 | let bins h = 21 | h.bins 22 | 23 | let range h = 24 | h.range 25 | 26 | let total_count h = 27 | h.total_count 28 | 29 | (* not tail rec! *) 30 | let rec insert value = function 31 | | [] -> [{ center = value ; count = 1 }], true 32 | | h :: t -> 33 | if value < h.center then 34 | { center = value ; count = 1 } :: h :: t, true 35 | else if value = h.center then 36 | { h with count = h.count + 1 } :: t, false 37 | else 38 | let t, num_bins_is_incr = insert value t in 39 | h :: t, num_bins_is_incr 40 | 41 | let rec min_diff_index i index min_diff = function 42 | | a :: b :: t -> 43 | let diff = b.center -. a.center in 44 | assert ( diff > 0. ); 45 | if diff < min_diff then 46 | min_diff_index (i+1) i diff (b :: t) 47 | else 48 | (* no change *) 49 | min_diff_index (i+1) index min_diff (b :: t) 50 | 51 | | [ _ ] -> index 52 | | [] -> assert false 53 | 54 | let min_diff_index = function 55 | | a :: b :: t -> 56 | let diff = b.center -. a.center in 57 | assert ( diff > 0. ); 58 | min_diff_index 1 0 diff (b :: t) 59 | 60 | | [ _ ] 61 | | [] -> assert false 62 | 63 | let merge_bins lo hi = 64 | assert (lo.center < hi.center); 65 | let sum_count = lo.count + hi.count in 66 | let center = 67 | (* average of centers, weighted by their height *) 68 | (lo.center *. (float lo.count) +. hi.center *. (float hi.count)) /. 69 | (float sum_count) 70 | in 71 | { center; count = sum_count } 72 | 73 | (* not tail rec! *) 74 | let merge_bins_at_index = 75 | let rec loop i index = function 76 | | a :: b :: t -> 77 | if i = index then 78 | let bin = merge_bins a b in 79 | bin :: t 80 | else 81 | a :: (loop (i + 1) index (b :: t)) 82 | 83 | | [ _ ] 84 | | [] -> assert false 85 | in 86 | fun index bins -> 87 | loop 0 index bins 88 | 89 | let create max_bins = 90 | if max_bins < 2 then 91 | raise (Invalid_argument (Printf.sprintf "max_bins: %d" max_bins)) 92 | else { 93 | max_bins; 94 | num_bins = 0; 95 | bins = []; 96 | total_count = 0; 97 | range = None 98 | } 99 | 100 | let add value histogram = 101 | let range = 102 | match histogram.range with 103 | | Some (mn, mx) -> Some (min value mn, max value mx) 104 | | None -> Some (value, value) 105 | in 106 | let total_count = histogram.total_count + 1 in 107 | let bins, is_augmented = insert value histogram.bins in 108 | if histogram.num_bins = histogram.max_bins then 109 | if is_augmented then 110 | (* merge bins, so as to keep their number at [max_bins] *) 111 | let index = min_diff_index bins in 112 | let bins = merge_bins_at_index index bins in 113 | { histogram with bins; range; total_count } 114 | else 115 | { histogram with bins; range; total_count } 116 | else 117 | if is_augmented then 118 | { histogram with bins; range; total_count; 119 | num_bins = histogram.num_bins + 1; } 120 | else 121 | { histogram with bins; range; total_count } 122 | 123 | (* merge two sorted bin lists; not tail rec! *) 124 | let rec binary_merge a b = 125 | match a, b with 126 | | a_h :: a_t, b_h :: b_t -> 127 | if a_h.center < b_h.center then 128 | a_h :: (binary_merge a_t b) 129 | else if a_h.center > b_h.center then 130 | b_h :: (binary_merge a b_t) 131 | else 132 | (* a_h.center = b_h.center: merge the two cells into one *) 133 | let merged = { a_h with count = a_h.count + b_h.count } in 134 | merged :: (binary_merge a_t b_t) 135 | 136 | | [], _ :: _ -> b 137 | | _ :: _, [] -> a 138 | | [], [] -> [] 139 | 140 | let rec k_ary_merge_half accu = function 141 | | a :: b :: t -> 142 | let ab = binary_merge a b in 143 | k_ary_merge_half (ab :: accu) t 144 | 145 | | [a] -> (a :: accu) 146 | | [] -> accu 147 | 148 | let rec k_ary_merge t = 149 | match k_ary_merge_half [] t with 150 | | [a] -> a 151 | | t -> k_ary_merge t 152 | 153 | 154 | let rec reduce bins ~num_bins ~max_bins = 155 | if num_bins > max_bins then 156 | let index = min_diff_index bins in 157 | let bins = merge_bins_at_index index bins in 158 | reduce bins ~num_bins:(num_bins - 1) ~max_bins 159 | else 160 | bins 161 | 162 | 163 | 164 | let merge h_list max_bins = 165 | let bins, _, total_count, range = List.fold_left 166 | (fun (t_bins, t_num_bins, t_total_count, t_range) 167 | { bins; num_bins; total_count; range; _} -> 168 | let t_range = 169 | match t_range, range with 170 | | Some (t_mn, t_mx), Some (mn, mx) -> 171 | Some ((min t_mn mn), (max t_mx mx)) 172 | | None, Some _ -> range 173 | | Some _, None -> t_range 174 | | None, None -> None 175 | in 176 | bins :: t_bins, 177 | t_num_bins + num_bins, 178 | t_total_count + total_count, 179 | t_range 180 | ) ([], 0, 0, None) h_list in 181 | 182 | (* even if [num_bins <= output_max_bins], we have to apply 183 | [k_ary_merge] to combine indentical bin centers *) 184 | let merged_bins = k_ary_merge bins in 185 | let num_bins = List.length merged_bins in 186 | let bins = reduce merged_bins ~num_bins ~max_bins in 187 | let num_bins = List.length bins in 188 | { bins; 189 | num_bins; 190 | max_bins; 191 | total_count; 192 | range } 193 | 194 | (* add a value with a count; equivalent to calling [add value hist] 195 | [count] times *) 196 | let addc value count hist = 197 | let singleton = { 198 | bins = [{ center = value ; count }]; 199 | total_count = count; 200 | range = Some (value, value); 201 | num_bins = 1; 202 | max_bins = 1; (* benign *) 203 | } in 204 | merge [hist; singleton] hist.max_bins 205 | 206 | let pos_quadratic_root ~a ~b ~c = 207 | if a = 0.0 then 208 | -.c /. b 209 | else 210 | let discriminant = b *. b -. 4. *. a *. c in 211 | ((sqrt discriminant) -. b) /. (2. *. a) 212 | 213 | exception Empty 214 | 215 | let sum = 216 | let rec find_i b i sum = function 217 | | ({ center = p_i; count = m_i } as bin_i) :: 218 | ({ center = p_i1; _ } as bin_i1) :: t -> 219 | if p_i <= b && b < p_i1 then 220 | bin_i, bin_i1, sum 221 | else 222 | find_i b (i+1) (sum +. (float m_i)) (bin_i1 :: t) 223 | 224 | | _ -> raise Not_found 225 | in 226 | 227 | fun histogram b -> 228 | let {center = p_i; count = m_i}, {center = p_i1; count = m_i1 }, sum_i0 = 229 | find_i b 0 0.0 histogram.bins in 230 | let m_i = float m_i in 231 | let m_i1 = float m_i1 in 232 | let bpp = (b -. p_i) /. (p_i1 -. p_i) in 233 | let m_b = m_i +. (m_i1 -. m_i) *. bpp in 234 | let s = (m_i +. m_b) *. bpp /. 2. in 235 | s +. sum_i0 +. m_i /. 2. 236 | 237 | let uniform = 238 | let rec loop span cum_sum_at_centers j accu = 239 | let s = (float j) *. span in 240 | match cum_sum_at_centers with 241 | | (cum_sum_0, {center = p_0; count = m_0}) :: 242 | ((cum_sum_1, {center = p_1; count = m_1}) as bin_1) :: rest -> 243 | if s < cum_sum_0 then 244 | loop span cum_sum_at_centers (j + 1) accu 245 | else if cum_sum_0 <= s && s < cum_sum_1 then 246 | let d = s -. cum_sum_0 in 247 | let c = -2. *. d in 248 | let b = float (2 * m_0) in 249 | let a = float (m_1 - m_0) in 250 | let z = pos_quadratic_root ~a ~b ~c in 251 | let u = p_0 +. (p_1 -. p_0) *. z in 252 | loop span cum_sum_at_centers (j + 1) ((j, u) :: accu) 253 | else 254 | loop span (bin_1 :: rest) j accu 255 | | [ _ ] -> List.rev accu 256 | | [] -> assert false 257 | in 258 | let cum_sum_at_centers hist = 259 | let bin, hist_rest, cum_sum = 260 | match hist with 261 | | ({count; _} as bin) :: rest -> bin, rest, (float count) /. 2. 262 | | _ -> raise Empty 263 | in 264 | let _, _, cum_sum_at_centers = List.fold_left ( 265 | fun (cum_sum, prev_count, cum_sum_at_centers) ({count; _} as bin) -> 266 | let cum_sum = cum_sum +. (float (prev_count + count)) /. 2. in 267 | let cum_sum_at_centers = (cum_sum, bin) :: cum_sum_at_centers in 268 | cum_sum, count, cum_sum_at_centers 269 | ) (cum_sum, bin.count, [cum_sum, bin]) hist_rest in 270 | List.rev cum_sum_at_centers 271 | in 272 | fun hist b -> 273 | if b < 1 then 274 | raise (Invalid_argument "uniform") 275 | else 276 | let cum_sum_at_centers = cum_sum_at_centers hist.bins in 277 | let span = (float hist.total_count) /. (float b) in 278 | loop span cum_sum_at_centers 0 [] 279 | 280 | 281 | let mean { bins; total_count; _ } = 282 | if total_count = 0 then 283 | raise Empty 284 | else 285 | let m = List.fold_left ( 286 | fun sum { center; count } -> 287 | sum +. center *. (float count) 288 | ) 0.0 bins in 289 | m /. (float total_count) 290 | 291 | let mean_stdev histogram = 292 | if histogram.total_count = 0 then 293 | raise Empty 294 | else 295 | let mean = mean histogram in 296 | let v = List.fold_left ( 297 | fun sum { center; count } -> 298 | let diff = center -. mean in 299 | sum +. diff *. diff *. (float count) 300 | ) 0.0 histogram.bins 301 | in 302 | let stdev = sqrt (v /. (float histogram.total_count)) in 303 | mean, stdev 304 | 305 | -------------------------------------------------------------------------------- /lib/bentov.mli: -------------------------------------------------------------------------------- 1 | type bin = { 2 | center : float; 3 | (** the center of the bin *) 4 | 5 | count : int; 6 | (** the number of values in the bin *) 7 | } 8 | (** [bin] represents one of the bins in a 1D histogram. The bin is 9 | centered in [center] and its mass is [count], half of which is on 10 | either side of [center]. *) 11 | 12 | type histogram 13 | 14 | val bins : histogram -> bin list 15 | (** [bins h] returns the list of bins, sorted by the bin center, 16 | comprising histogram [h] *) 17 | 18 | val num_bins : histogram -> int 19 | (** [num_bins h] returns the size of the histogram [h] in terms of the 20 | number of bins; equivalent to [List.length (bins h)] *) 21 | 22 | val max_bins : histogram -> int 23 | (** [max_bins h] returns the maximum number of bins of this histogram; 24 | when the number of unique values added to the histogram exceeds 25 | this [max_bins h], [h] becomes an approximation. *) 26 | 27 | val total_count : histogram -> int 28 | (** [total_count h] returns the number of values added to histogram [h] *) 29 | 30 | val range : histogram -> (float * float) option 31 | (** [range h] returns the minimum and maximum values seen in the 32 | construction of histogram h, or [None] if no values have yet been 33 | added *) 34 | 35 | val create : int -> histogram 36 | (** [create max_bins] creates a histogram with up to [max_bins] bins *) 37 | 38 | val add : float -> histogram -> histogram 39 | (** [add v h] adds a value to [v] to histogram [h], returning the 40 | updated histogram *) 41 | 42 | val addc : float -> int -> histogram -> histogram 43 | (** [addc v c h] adds a value to [v] to histogram [h] with count [c], 44 | returning the updated histogram. It is equivalent to calling [add 45 | v h] [c] times. *) 46 | 47 | val merge : histogram list -> int -> histogram 48 | (** [merge h_list max_bins] creates a new histogram from the histograms 49 | in [h_list], whose size is no bigger than [max_bins] *) 50 | 51 | val sum : histogram -> float -> float 52 | (** [sum hist b] returns an estimate of the number of points in the 53 | interval [[−infinity,b]]. For an estimate to be feasible, [b] must 54 | be strictly between the left and right bin centers. Otherwise, 55 | [Not_found] is raised. *) 56 | 57 | exception Empty 58 | 59 | val uniform : histogram -> int -> (int * float) list 60 | (** [uniform hist num_intervals] returns estimates of the quantiles of 61 | the distribution represented by histogram [hist]. The quantiles are 62 | associated with the boundaries of [num_interval] intervals, in 63 | ascending order. For example, [uniform hist 4] returns an estimate 64 | of the interquartile range, consisting of the 25-th percentile, 65 | 50-th percentile (median) and 75-th percentile. (The minimum and 66 | maximum of the histogram are seperately available through the 67 | function [range]. [uniform] can return fewer than the desired 68 | quantiles in histograms that have extremely dense regions. Raises 69 | [Empty] if the histogram has not been [add]'d any data.*) 70 | 71 | val mean : histogram -> float 72 | (** [mean hist] returns an estimate of the mean of the distribution 73 | represented by [hist]. Raises [Empty] if the histogram has not been 74 | [add]'d any data. *) 75 | 76 | val mean_stdev : histogram -> float * float 77 | (** [mean_stdev hist] returns estimates of the mean and standard 78 | deviation of the distribution represented bh [hist]. Raises 79 | [Empty] if the histogram has not been [add]'d any data. *) 80 | -------------------------------------------------------------------------------- /lib/dune: -------------------------------------------------------------------------------- 1 | (library (name bentov) (public_name bentov)) 2 | -------------------------------------------------------------------------------- /src/bt.ml: -------------------------------------------------------------------------------- 1 | open Bentov 2 | 3 | let pr = Printf.printf 4 | 5 | let print_histogram h = 6 | let count = total_count h in 7 | pr "total_count=%d\n" count; 8 | pr "max_bins=%d\n" (max_bins h); 9 | pr "num_bins=%d\n" (num_bins h); 10 | (match (range h) with 11 | | Some (mn, mx) -> pr "min=%+.5e\nmax=%+.5e\n" mn mx 12 | | None -> () 13 | ); 14 | let count_f = float count in 15 | List.iter ( 16 | fun bin -> 17 | let frequency = (float bin.count) /. count_f in 18 | pr "%.8f %8d %.3e\n" bin.center bin.count frequency 19 | ) (bins h); 20 | pr "\n" 21 | 22 | let rec fold_lines f ch x0 = 23 | let x, is_done = 24 | try 25 | let line = input_line ch in 26 | let x = f x0 line in 27 | x, false 28 | with End_of_file -> 29 | x0, true 30 | in 31 | if is_done then 32 | x 33 | else 34 | fold_lines f ch x 35 | 36 | 37 | let histogram_of_file path_opt max_bins = 38 | let ch = 39 | match path_opt with 40 | | Some path -> open_in path 41 | | None -> stdin 42 | in 43 | 44 | let hist = fold_lines ( 45 | fun hist line -> 46 | let value = float_of_string line in 47 | add value hist 48 | ) ch (create max_bins) in 49 | 50 | close_in ch; 51 | 52 | hist 53 | 54 | let print_uniform hist num_intervals = 55 | let u = uniform hist num_intervals in 56 | pr "uniform with %d intervals:\n%!" num_intervals; 57 | List.iter ( 58 | fun (rank, quantile) -> 59 | pr "%d/%d[=%.3f] %+.5e\n" rank num_intervals 60 | ((float rank)/.(float num_intervals)) quantile 61 | ) u 62 | 63 | 64 | let print_mean_stdev hist = 65 | let m, sd = mean_stdev hist in 66 | pr "mean=%+.5e\nstdev=%+.5e\n" m sd 67 | 68 | 69 | let main max_bins path_opt p_histogram p_mean_stdev p_uniform = 70 | let hist = histogram_of_file path_opt max_bins in 71 | if p_mean_stdev then ( 72 | print_mean_stdev hist; 73 | pr "\n" 74 | ); 75 | 76 | if p_histogram then ( 77 | print_histogram hist; 78 | pr "\n" 79 | ); 80 | 81 | match p_uniform with 82 | | Some intervals -> 83 | print_uniform hist intervals; 84 | pr "\n" 85 | | None -> () 86 | 87 | open Cmdliner 88 | 89 | let _ = 90 | let command = 91 | let doc = "read a column of numbers from an input file (or stdin), and \ 92 | update an approximate histogram of these numbers. Every so \ 93 | often, print out the centers of the histogram bins, along with \ 94 | their probability mass. Based on the algorithm described in \ 95 | \"A Streaming Parallel Decision Tree Algorithm\" by Yael \ 96 | Ben-Haim and Elad Tom-Tov. \ 97 | http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf" in 98 | 99 | let max_bins = 100 | let doc = "approximate the histogram with this maximum number of bins" in 101 | Arg.(required & opt (some int) (Some 20) & 102 | info ["n"; "max-bins"] ~docv:"INT" ~doc) 103 | in 104 | 105 | let path = 106 | let doc = "path of input file (absent: stdin)" in 107 | Arg.(value & opt (some string) None & 108 | info ["i"; "input"] ~docv:"PATH" ~doc) 109 | in 110 | 111 | let print_mean_stdev = 112 | let doc = "use the histogram to compute and print the mean and \ 113 | standard deviation" in 114 | Arg.(value & flag & info ["s"; "stats"] ~doc) 115 | in 116 | 117 | let print_histogram = 118 | let doc = "print details of the histogram, including its bins" in 119 | Arg.(value & flag & info ["b"; "bins"] ~doc) 120 | in 121 | 122 | let print_uniform = 123 | let doc = "compute and print the value deliniating the interval which 124 | are equally spaced in terms of probability mass." in 125 | Arg.(value & opt (some int) None & info ["u"; "uniform"] ~docv:"INT" ~doc) 126 | in 127 | 128 | Cmd.v (Cmd.info "bt" ~doc) 129 | Term.(const main 130 | $ max_bins 131 | $ path 132 | $ print_histogram 133 | $ print_mean_stdev 134 | $ print_uniform 135 | ) 136 | in 137 | Cmd.eval ~catch:false command 138 | -------------------------------------------------------------------------------- /src/dune: -------------------------------------------------------------------------------- 1 | (executable 2 | (name bt) 3 | (public_name bt) 4 | (libraries cmdliner bentov) 5 | ) 6 | -------------------------------------------------------------------------------- /test/dune: -------------------------------------------------------------------------------- 1 | (test (name test) (libraries bentov)) 2 | -------------------------------------------------------------------------------- /test/gaussian.ml: -------------------------------------------------------------------------------- 1 | open Seq 2 | 3 | (** Knuth algorithm for producing a sample from N(0,1) *) 4 | let random_normal : float Seq.t = 5 | let rec gen phase () = 6 | let u1 = Random.float 1.0 in 7 | let u2 = Random.float 1.0 in 8 | let v1 = 2. *. u1 -. 1.0 in 9 | let v2 = 2. *. u2 -. 1.0 in 10 | let s = v1 *. v1 +. v2 *. v2 in 11 | if s >= 1.0 || s = 0.0 then 12 | gen phase () 13 | else 14 | let v = if phase then v1 else v2 in 15 | let x = v *. sqrt ((-2. *. log s) /. s) in 16 | Cons (x, gen (not phase)) 17 | in 18 | gen true 19 | 20 | (* [random_normal ~mu ~sigma] produces a sequence whose elements 21 | are N([mean],[stddev]) *) 22 | let seq ~mu ~sigma = 23 | if mu = 0. && sigma = 1.0 then 24 | random_normal 25 | else 26 | Seq.map (fun x -> mu +. x *. sigma) random_normal 27 | 28 | (* 29 | let mean_std x = 30 | let n = List.length x in 31 | let sum = List.fold_left ( +. ) 0. x in 32 | let mean = sum /. (float n) in 33 | let rss = List.fold_left (fun rss x -> rss +. (x -. mean)**2.) 0. x in 34 | let std = sqrt (rss /. (float (n-1))) in 35 | mean, std 36 | 37 | 38 | let _ = 39 | let x = take_rev 100_000 (seq ~mu:0. ~stddev:1.) in 40 | let qs = quantiles x 10 in 41 | List.iter (fun (i, q) -> Printf.printf "%d %f\n" i q) qs 42 | *) 43 | -------------------------------------------------------------------------------- /test/test.ml: -------------------------------------------------------------------------------- 1 | (* In this test, we measure the error between the true quantiles of 2 | data, and the quantiles as computed through the [Bentov] histogram 3 | approximation module. The data is drawn from a mixture of two 4 | Gaussians, N(2,1) and N(5,1), where the mixing coefficient is 1/2. 5 | We compute the approximate quantiles in two ways: In the frist, we 6 | simply add each sample into a [Bentov.histogram]. We call this 7 | histogram [mixed]. In the second, we add a datum to one of two 8 | [Bentov.histogram]'s, one associated with each of the Guassians. We 9 | then merge these two histograms using [Bentov.merge]. We call the 10 | result of merging the two sub-histograms [merged]. Finally, we 11 | compute and print the mean-square-error between the true quantiles 12 | and [mixed], and the true quantiles and [merged]. *) 13 | 14 | (* [quantiles list num_intervals] returns the quantiles (true, not 15 | approximated) of list [list] at [num_intervals + 1] points, including 16 | the minimum and maximum values (which are the first and last values of 17 | the result, resp. *) 18 | let quantiles = 19 | let rec loop i j span accu = function 20 | | x0 :: x1 :: rest -> 21 | let s = (float j) *. span in 22 | if i <= s && s < i +. 1. then 23 | let d = s -. i in 24 | let x_d = (x1 -. x0) *. d +. x0 in 25 | loop i (j + 1) span ((j, x_d) :: accu) (x0 :: x1 :: rest) 26 | else 27 | loop (i +. 1.) j span accu (x1 :: rest) 28 | 29 | | [x] -> 30 | let x_d = j, x in 31 | List.rev (x_d :: accu) 32 | 33 | | [] -> assert false 34 | in 35 | fun x m -> 36 | let x_sorted = List.sort Stdlib.compare x in 37 | let n = List.length x_sorted in 38 | let span = (float n) /. (float m) in 39 | loop 0. 0 span [] x_sorted 40 | 41 | 42 | open Seq 43 | 44 | (** [up_to n seq] returns a sequence that echos at most [n] values of 45 | sequence [seq] *) 46 | let up_to = 47 | let rec loop i n seq () = 48 | if i = n then 49 | Nil 50 | else 51 | match seq () with 52 | | Nil -> Nil 53 | | Cons (v, seq) -> 54 | Cons (v, loop (i + 1) n seq) 55 | in 56 | fun n seq -> 57 | if n < 0 then 58 | raise (Invalid_argument "up_to") 59 | else 60 | loop 0 n seq 61 | 62 | module IntMap = Map.Make(Int) 63 | 64 | let map_of_assoc assoc = 65 | List.fold_left ( 66 | fun k_to_v (k, v) -> 67 | IntMap.add k v k_to_v 68 | ) IntMap.empty assoc 69 | 70 | let _ = 71 | (* the number of data to draw *) 72 | let n = 100_000 in 73 | 74 | (* the size of the approximate histograms *) 75 | let q = 20 in 76 | 77 | let rec gaussian_mixture normal_a normal_b () = 78 | if Random.bool () then 79 | match normal_a () with 80 | | Cons (x, normal_a) -> 81 | Cons ((`A, x), gaussian_mixture normal_a normal_b) 82 | | Nil -> Nil 83 | else 84 | match normal_b () with 85 | | Cons (x, normal_b) -> 86 | Cons ((`B, x), gaussian_mixture normal_a normal_b) 87 | | Nil -> Nil 88 | in 89 | 90 | let n1 = Gaussian.seq ~mu:2.0 ~sigma:1.0 in (* N(2,1) *) 91 | let n2 = Gaussian.seq ~mu:5.0 ~sigma:1.0 in (* N(5,1) *) 92 | 93 | let gmm = up_to n (gaussian_mixture n1 n2) in 94 | 95 | let open Bentov in 96 | 97 | let normal_a_h, normal_b_h, mixed_h, data = Seq.fold_left ( 98 | fun (normal_a_h, normal_b_h, mixed_h, data) draw -> 99 | match draw with 100 | | `A, x -> 101 | let normal_a_h = Bentov.add x normal_a_h in 102 | let mixed_h = Bentov.add x mixed_h in 103 | normal_a_h, normal_b_h, mixed_h, x :: data 104 | 105 | | `B, x -> 106 | let normal_b_h = Bentov.add x normal_b_h in 107 | let mixed_h = Bentov.add x mixed_h in 108 | normal_a_h, normal_b_h, mixed_h, x :: data 109 | ) ( 110 | let normal_a_h = create q in 111 | let normal_b_h = create q in 112 | let mixed_h = create q in 113 | normal_a_h, normal_b_h, mixed_h, [] 114 | ) gmm in 115 | 116 | (* merge the two sub-histograms *) 117 | let merged_h = merge [normal_a_h; normal_b_h] q in 118 | 119 | assert (total_count mixed_h = n ); 120 | assert (total_count merged_h = n ); 121 | 122 | (* measure the error between the true quantiles and approximations 123 | on a grid half the size of our approximate histograms *) 124 | let num_intervals = q/2 in 125 | 126 | let error i actual mixed merged = 127 | match IntMap.find_opt i actual with 128 | | None -> None, None 129 | | Some actual -> 130 | (match IntMap.find_opt i mixed with 131 | | Some mixed -> Some (actual -. mixed) 132 | | None -> None 133 | ), 134 | (match IntMap.find_opt i merged with 135 | | Some merged -> Some (actual -. merged) 136 | | None -> None 137 | ) 138 | in 139 | 140 | (* compute sum of squared-errors *) 141 | let rec stats i actual mixed merged mixed_stats merged_stats = 142 | if i < num_intervals then 143 | let mixed_err, merged_err = error i actual mixed merged in 144 | let mixed_stats = 145 | match mixed_err with 146 | | Some err -> 147 | let sum_se_mixed, n_mixed = mixed_stats in 148 | sum_se_mixed +. err *. err, n_mixed + 1 149 | | None -> mixed_stats 150 | in 151 | let merged_stats = 152 | match merged_err with 153 | | Some err -> 154 | let sum_se_merged, n_merged = merged_stats in 155 | sum_se_merged +. err *. err, n_merged + 1 156 | | None -> merged_stats 157 | in 158 | stats (i+1) actual mixed merged mixed_stats merged_stats 159 | else 160 | mixed_stats, merged_stats 161 | in 162 | 163 | let mixed_q = map_of_assoc (uniform mixed_h num_intervals) in 164 | let merged_q = map_of_assoc (uniform merged_h num_intervals) in 165 | let actual_q = map_of_assoc (quantiles data num_intervals) in 166 | 167 | let (sum_se_mixed, n_mixed), (sum_se_merged, n_merged) = 168 | stats 0 actual_q mixed_q merged_q (0., 0) (0., 0) in 169 | 170 | let err_mixed = sqrt ((sum_se_mixed) /. (float n_mixed)) in 171 | let err_merged = sqrt ((sum_se_merged) /. (float n_merged)) in 172 | Printf.printf "err_mixed=%e (n=%d)\nerr_merged=%e (n=%d)\n" 173 | err_mixed n_mixed err_merged n_merged; 174 | assert (err_mixed <= 5e-2 && err_merged <= 5e-2) 175 | 176 | --------------------------------------------------------------------------------