├── .gitignore
├── CHANGES.md
├── LICENSE
├── README.md
├── bentov.opam
├── dune-project
├── lib
    ├── bentov.ml
    ├── bentov.mli
    └── dune
├── src
    ├── bt.ml
    └── dune
└── test
    ├── dune
    ├── gaussian.ml
    └── test.ml


/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 | *~
3 | *.native
4 | *.byte
5 | 


--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
1 | ## v1 (2020-06-24)
2 | 
3 | * first `opam` release
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2016-present, Mika Illouz
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification,
 5 | are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice, this
11 |   list of conditions and the following disclaimer in the documentation and/or
12 |   other materials provided with the distribution.
13 | 
14 | * Neither the name of EigenDog nor the names of its contributors may
15 |   be used to endorse or promote products derived from this software
16 |   without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Bentov
 2 | 
 3 | An OCaml implementation of histogram-sketching algorithm described in
 4 | [A Streaming Parallel Decision Tree
 5 | Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf)
 6 | by Yael Ben-Haim and Elad Tom-Tov. Included is a command-line utility
 7 | `bt`, which can read a file (or stdin) containing numbers, one per
 8 | line, and output a representation of the approximated distribution.
 9 | 
10 | For example, to approximate 10 quantiles of 1M data in U(0,1):
11 | 
12 | ```sh
13 | echo "" | awk '{ for ( i=0 ; i < 1e6 ; i++ ) { print rand() } }' | bt -n 20 -u 10
14 | ```
15 | 
16 | In this example, the size of the approximating histogram is 20.  For
17 | additional details, `bt --help` .
18 | 
19 | To install:
20 | 
21 | ```sh
22 | opam install bentov
23 | ```
24 | 
25 | # Documentation
26 | 
27 | See https://barko.github.io/bentov
28 | 
29 | # License
30 | 
31 | BSD
32 | 


--------------------------------------------------------------------------------
/bentov.opam:
--------------------------------------------------------------------------------
 1 | opam-version: "2.0"
 2 | synopsis: "1D histogram sketching"
 3 | description:
 4 |   "Bentov implements an algorithm which approximates a 1D histogram as data is streamed over it."
 5 | maintainer: ["mika@illouz.net"]
 6 | authors: ["Mika Illouz"]
 7 | license: "BSD"
 8 | homepage: "https://github.com/barko/bentov"
 9 | doc: "https://barko.github.io/bentov/"
10 | bug-reports: "https://github.com/barko/bentov/issues"
11 | depends: [
12 |   "dune" {> "1.5"}
13 |   "cmdliner" {>= "1.0.4"}
14 |   "ocaml" {>= "4.08.0"}
15 | ]
16 | build: [
17 |   ["dune" "subst"] {pinned}
18 |   [
19 |     "dune"
20 |     "build"
21 |     "-p"
22 |     name
23 |     "-j"
24 |     jobs
25 |     "@install"
26 |     "@doc" {with-doc}
27 |   ]
28 | ]
29 | 
30 | dev-repo: "git+https://github.com/barko/bentov.git"
31 | 


--------------------------------------------------------------------------------
/dune-project:
--------------------------------------------------------------------------------
1 | (lang dune 2.5)
2 | (name bentov)
3 | 


--------------------------------------------------------------------------------
/lib/bentov.ml:
--------------------------------------------------------------------------------
  1 | type bin = {
  2 |   center : float;
  3 |   count : int
  4 | }
  5 | 
  6 | type histogram = {
  7 |   max_bins : int;
  8 |   num_bins : int;
  9 |   bins : bin list;
 10 |   range : (float * float) option; (* (min * max) *)
 11 |   total_count : int;
 12 | }
 13 | 
 14 | let max_bins h =
 15 |   h.max_bins
 16 | 
 17 | let num_bins h =
 18 |   h.num_bins
 19 | 
 20 | let bins h =
 21 |   h.bins
 22 | 
 23 | let range h =
 24 |   h.range
 25 | 
 26 | let total_count h =
 27 |   h.total_count
 28 | 
 29 | (* not tail rec! *)
 30 | let rec insert value = function
 31 |   | [] -> [{ center = value ; count = 1 }], true
 32 |   | h :: t ->
 33 |     if value < h.center then
 34 |       { center = value ; count = 1 } :: h :: t, true
 35 |     else if value = h.center then
 36 |       { h with count = h.count + 1 } :: t, false
 37 |     else
 38 |       let t, num_bins_is_incr = insert value t in
 39 |       h :: t, num_bins_is_incr
 40 | 
 41 | let rec min_diff_index i index min_diff = function
 42 |   | a :: b :: t ->
 43 |     let diff = b.center -. a.center in
 44 |     assert ( diff > 0. );
 45 |     if diff < min_diff then
 46 |       min_diff_index (i+1) i diff (b :: t)
 47 |     else
 48 |       (* no change *)
 49 |       min_diff_index (i+1) index min_diff (b :: t)
 50 | 
 51 |   | [ _ ] -> index
 52 |   | [] -> assert false
 53 | 
 54 | let min_diff_index = function
 55 |   | a :: b :: t ->
 56 |     let diff = b.center -. a.center in
 57 |     assert ( diff > 0. );
 58 |     min_diff_index 1 0 diff (b :: t)
 59 | 
 60 |   | [ _ ]
 61 |   | [] -> assert false
 62 | 
 63 | let merge_bins lo hi =
 64 |   assert (lo.center < hi.center);
 65 |   let sum_count = lo.count + hi.count in
 66 |   let center =
 67 |     (* average of centers, weighted by their height *)
 68 |     (lo.center *. (float lo.count) +. hi.center *. (float hi.count)) /.
 69 |     (float sum_count)
 70 |   in
 71 |   { center; count = sum_count }
 72 | 
 73 | (* not tail rec! *)
 74 | let merge_bins_at_index =
 75 |   let rec loop i index = function
 76 |     | a :: b :: t ->
 77 |       if i = index then
 78 |         let bin = merge_bins a b in
 79 |         bin :: t
 80 |       else
 81 |         a :: (loop (i + 1) index (b :: t))
 82 | 
 83 |     | [ _ ]
 84 |     | [] -> assert false
 85 |   in
 86 |   fun index bins ->
 87 |     loop 0 index bins
 88 | 
 89 | let create max_bins =
 90 |   if max_bins < 2 then
 91 |     raise (Invalid_argument (Printf.sprintf "max_bins: %d" max_bins))
 92 |   else {
 93 |     max_bins;
 94 |     num_bins = 0;
 95 |     bins = [];
 96 |     total_count = 0;
 97 |     range = None
 98 |   }
 99 | 
100 | let add value histogram =
101 |   let range =
102 |     match histogram.range with
103 |       | Some (mn, mx) -> Some (min value mn, max value mx)
104 |       | None -> Some (value, value)
105 |   in
106 |   let total_count = histogram.total_count + 1 in
107 |   let bins, is_augmented = insert value histogram.bins in
108 |   if histogram.num_bins = histogram.max_bins then
109 |     if is_augmented then
110 |       (* merge bins, so as to keep their number at [max_bins] *)
111 |       let index = min_diff_index bins in
112 |       let bins = merge_bins_at_index index bins in
113 |       { histogram with bins; range; total_count }
114 |     else
115 |       { histogram with bins; range; total_count }
116 |   else
117 |     if is_augmented then
118 |       { histogram with bins; range; total_count;
119 |                        num_bins = histogram.num_bins + 1; }
120 |     else
121 |       { histogram with bins; range; total_count }
122 | 
123 | (* merge two sorted bin lists; not tail rec! *)
124 | let rec binary_merge a b =
125 |   match a, b with
126 |     | a_h :: a_t, b_h :: b_t ->
127 |       if a_h.center < b_h.center then
128 |         a_h :: (binary_merge a_t b)
129 |       else if a_h.center > b_h.center then
130 |         b_h :: (binary_merge a b_t)
131 |       else
132 |         (* a_h.center = b_h.center: merge the two cells into one *)
133 |         let merged = { a_h with count = a_h.count + b_h.count } in
134 |         merged :: (binary_merge a_t b_t)
135 | 
136 |     | [], _ :: _ -> b
137 |     | _ :: _, [] -> a
138 |     | [], [] -> []
139 | 
140 | let rec k_ary_merge_half accu = function
141 |   | a :: b :: t ->
142 |     let ab = binary_merge a b in
143 |     k_ary_merge_half  (ab :: accu) t
144 | 
145 |   | [a] -> (a :: accu)
146 |   | [] -> accu
147 | 
148 | let rec k_ary_merge t =
149 |   match k_ary_merge_half [] t with
150 |     | [a] -> a
151 |     | t -> k_ary_merge t
152 | 
153 | 
154 | let rec reduce bins ~num_bins ~max_bins =
155 |   if num_bins > max_bins then
156 |     let index = min_diff_index bins in
157 |     let bins = merge_bins_at_index index bins in
158 |     reduce bins ~num_bins:(num_bins - 1) ~max_bins
159 |   else
160 |     bins
161 | 
162 | 
163 | 
164 | let merge h_list max_bins =
165 |   let bins, _, total_count, range = List.fold_left
166 |       (fun (t_bins, t_num_bins, t_total_count, t_range)
167 |         { bins; num_bins; total_count; range; _} ->
168 |         let t_range =
169 |           match t_range, range with
170 |             | Some (t_mn, t_mx), Some (mn, mx) ->
171 |               Some ((min t_mn mn), (max t_mx mx))
172 |             | None, Some _ -> range
173 |             | Some _, None -> t_range
174 |             | None, None -> None
175 |         in
176 |         bins :: t_bins,
177 |         t_num_bins + num_bins,
178 |         t_total_count + total_count,
179 |         t_range
180 |       ) ([], 0, 0, None) h_list in
181 | 
182 |   (* even if [num_bins <= output_max_bins], we have to apply
183 |      [k_ary_merge] to combine indentical bin centers *)
184 |   let merged_bins = k_ary_merge bins in
185 |   let num_bins = List.length merged_bins in
186 |   let bins = reduce merged_bins ~num_bins ~max_bins in
187 |   let num_bins = List.length bins in
188 |   { bins;
189 |     num_bins;
190 |     max_bins;
191 |     total_count;
192 |     range }
193 | 
194 | (* add a value with a count; equivalent to calling [add value hist]
195 |    [count] times *)
196 | let addc value count hist =
197 |   let singleton = {
198 |     bins = [{ center = value ; count }];
199 |     total_count = count;
200 |     range = Some (value, value);
201 |     num_bins = 1;
202 |     max_bins = 1; (* benign *)
203 |   } in
204 |   merge [hist; singleton] hist.max_bins
205 | 
206 | let pos_quadratic_root ~a ~b ~c =
207 |   if a = 0.0 then
208 |     -.c /. b
209 |   else
210 |     let discriminant = b *. b -. 4. *. a *. c in
211 |     ((sqrt discriminant) -. b) /. (2. *. a)
212 | 
213 | exception Empty
214 | 
215 | let sum =
216 |   let rec find_i b i sum = function
217 |     | ({ center = p_i; count = m_i } as bin_i) ::
218 |         ({ center = p_i1; _ } as bin_i1) :: t ->
219 |       if p_i <= b && b < p_i1 then
220 |         bin_i, bin_i1, sum
221 |       else
222 |         find_i b (i+1) (sum +. (float m_i)) (bin_i1 :: t)
223 | 
224 |     | _ -> raise Not_found
225 |   in
226 | 
227 |   fun histogram b ->
228 |     let {center = p_i; count = m_i}, {center = p_i1; count = m_i1 }, sum_i0 =
229 |       find_i b 0 0.0 histogram.bins in
230 |     let m_i = float m_i in
231 |     let m_i1 = float m_i1 in
232 |     let bpp = (b -. p_i) /. (p_i1 -. p_i) in
233 |     let m_b = m_i +. (m_i1 -. m_i) *. bpp in
234 |     let s = (m_i +. m_b) *. bpp /. 2. in
235 |     s +. sum_i0 +. m_i /. 2.
236 | 
237 | let uniform =
238 |   let rec loop span cum_sum_at_centers j accu =
239 |     let s = (float j) *. span in
240 |     match cum_sum_at_centers with
241 |     | (cum_sum_0, {center = p_0; count = m_0}) ::
242 |         ((cum_sum_1, {center = p_1; count = m_1}) as bin_1) :: rest ->
243 |       if s < cum_sum_0 then
244 |         loop span cum_sum_at_centers (j + 1) accu
245 |       else if cum_sum_0 <= s && s < cum_sum_1 then
246 |         let d = s -. cum_sum_0 in
247 |         let c = -2. *. d in
248 |         let b = float (2 * m_0) in
249 |         let a = float (m_1 - m_0) in
250 |         let z = pos_quadratic_root ~a ~b ~c in
251 |         let u = p_0 +. (p_1 -. p_0) *. z in
252 |         loop span cum_sum_at_centers (j + 1) ((j, u) :: accu)
253 |       else
254 |         loop span (bin_1 :: rest) j accu
255 |     | [ _ ] -> List.rev accu
256 |     | [] -> assert false
257 |   in
258 |   let cum_sum_at_centers hist =
259 |     let bin, hist_rest, cum_sum =
260 |       match hist with
261 |       | ({count; _} as bin) :: rest -> bin, rest, (float count) /. 2.
262 |       | _ -> raise Empty
263 |     in
264 |     let _, _, cum_sum_at_centers = List.fold_left (
265 |       fun (cum_sum, prev_count, cum_sum_at_centers) ({count; _} as bin) ->
266 |         let cum_sum = cum_sum +. (float (prev_count + count)) /. 2. in
267 |         let cum_sum_at_centers = (cum_sum, bin) :: cum_sum_at_centers in
268 |         cum_sum, count, cum_sum_at_centers
269 |     ) (cum_sum, bin.count, [cum_sum, bin]) hist_rest in
270 |     List.rev cum_sum_at_centers
271 |   in
272 |   fun hist b ->
273 |     if b < 1 then
274 |       raise (Invalid_argument "uniform")
275 |     else
276 |       let cum_sum_at_centers = cum_sum_at_centers hist.bins in
277 |       let span = (float hist.total_count) /. (float b) in
278 |       loop span cum_sum_at_centers 0 []
279 | 
280 | 
281 | let mean { bins; total_count; _ } =
282 |   if total_count = 0 then
283 |     raise Empty
284 |   else
285 |     let m = List.fold_left (
286 |       fun sum { center; count } ->
287 |         sum +. center *. (float count)
288 |     ) 0.0 bins in
289 |     m /. (float total_count)
290 | 
291 | let mean_stdev histogram =
292 |   if histogram.total_count = 0 then
293 |     raise Empty
294 |   else
295 |     let mean = mean histogram in
296 |     let v = List.fold_left (
297 |         fun sum { center; count } ->
298 |           let diff = center -. mean in
299 |           sum +. diff *. diff *. (float count)
300 |       ) 0.0 histogram.bins
301 |     in
302 |     let stdev = sqrt (v /. (float histogram.total_count)) in
303 |     mean, stdev
304 | 
305 | 


--------------------------------------------------------------------------------
/lib/bentov.mli:
--------------------------------------------------------------------------------
 1 | type bin = {
 2 |   center : float;
 3 |   (** the center of the bin *)
 4 | 
 5 |   count : int;
 6 |   (** the number of values in the bin *)
 7 | }
 8 | (** [bin] represents one of the bins in a 1D histogram. The bin is
 9 |     centered in [center] and its mass is [count], half of which is on
10 |     either side of [center]. *)
11 | 
12 | type histogram
13 | 
14 | val bins : histogram -> bin list
15 | (** [bins h] returns the list of bins, sorted by the bin center,
16 |     comprising histogram [h] *)
17 | 
18 | val num_bins : histogram -> int
19 | (** [num_bins h] returns the size of the histogram [h] in terms of the
20 |     number of bins; equivalent to [List.length (bins h)] *)
21 | 
22 | val max_bins : histogram -> int
23 | (** [max_bins h] returns the maximum number of bins of this histogram;
24 |     when the number of unique values added to the histogram exceeds
25 |     this [max_bins h], [h] becomes an approximation. *)
26 | 
27 | val total_count : histogram -> int
28 | (** [total_count h] returns the number of values added to histogram [h] *)
29 | 
30 | val range : histogram -> (float * float) option
31 | (** [range h] returns the minimum and maximum values seen in the
32 |     construction of histogram h, or [None] if no values have yet been
33 |     added *)
34 | 
35 | val create : int -> histogram
36 | (** [create max_bins] creates a histogram with up to [max_bins] bins *)
37 | 
38 | val add : float -> histogram -> histogram
39 | (** [add v h] adds a value to [v] to histogram [h], returning the
40 |     updated histogram *)
41 | 
42 | val addc : float -> int -> histogram -> histogram
43 | (** [addc v c h] adds a value to [v] to histogram [h] with count [c],
44 |     returning the updated histogram.  It is equivalent to calling [add
45 |     v h] [c] times. *)
46 | 
47 | val merge : histogram list -> int -> histogram
48 | (** [merge h_list max_bins] creates a new histogram from the histograms
49 |     in [h_list], whose size is no bigger than [max_bins] *)
50 | 
51 | val sum : histogram -> float -> float
52 | (** [sum hist b] returns an estimate of the number of points in the
53 |     interval [[−infinity,b]]. For an estimate to be feasible, [b] must
54 |     be strictly between the left and right bin centers. Otherwise,
55 |     [Not_found] is raised. *)
56 | 
57 | exception Empty
58 | 
59 | val uniform : histogram -> int -> (int * float) list
60 | (** [uniform hist num_intervals] returns estimates of the quantiles of
61 |     the distribution represented by histogram [hist]. The quantiles are
62 |     associated with the boundaries of [num_interval] intervals, in
63 |     ascending order. For example, [uniform hist 4] returns an estimate
64 |     of the interquartile range, consisting of the 25-th percentile,
65 |     50-th percentile (median) and 75-th percentile. (The minimum and
66 |     maximum of the histogram are seperately available through the
67 |     function [range]. [uniform] can return fewer than the desired
68 |     quantiles in histograms that have extremely dense regions. Raises
69 |     [Empty] if the histogram has not been [add]'d any data.*)
70 | 
71 | val mean : histogram -> float
72 | (** [mean hist] returns an estimate of the mean of the distribution
73 |     represented by [hist]. Raises [Empty] if the histogram has not been
74 |     [add]'d any data. *)
75 | 
76 | val mean_stdev : histogram -> float * float
77 | (** [mean_stdev hist] returns estimates of the mean and standard
78 |     deviation of the distribution represented bh [hist]. Raises
79 |     [Empty] if the histogram has not been [add]'d any data. *)
80 | 


--------------------------------------------------------------------------------
/lib/dune:
--------------------------------------------------------------------------------
1 | (library (name bentov) (public_name bentov))
2 | 


--------------------------------------------------------------------------------
/src/bt.ml:
--------------------------------------------------------------------------------
  1 | open Bentov
  2 | 
  3 | let pr = Printf.printf
  4 | 
  5 | let print_histogram h =
  6 |   let count = total_count h in
  7 |   pr "total_count=%d\n" count;
  8 |   pr "max_bins=%d\n" (max_bins h);
  9 |   pr "num_bins=%d\n" (num_bins h);
 10 |   (match (range h) with
 11 |     | Some (mn, mx) -> pr "min=%+.5e\nmax=%+.5e\n" mn mx
 12 |     | None -> ()
 13 |   );
 14 |   let count_f = float count in
 15 |   List.iter (
 16 |     fun bin ->
 17 |       let frequency = (float bin.count) /. count_f in
 18 |       pr "%.8f %8d %.3e\n"  bin.center bin.count frequency
 19 |   ) (bins h);
 20 |   pr "\n"
 21 | 
 22 | let rec fold_lines f ch x0 =
 23 |   let x, is_done =
 24 |     try
 25 |       let line = input_line ch in
 26 |       let x = f x0 line in
 27 |       x, false
 28 |     with End_of_file ->
 29 |       x0, true
 30 |   in
 31 |   if is_done then
 32 |     x
 33 |   else
 34 |     fold_lines f ch x
 35 | 
 36 | 
 37 | let histogram_of_file path_opt max_bins =
 38 |   let ch =
 39 |     match path_opt with
 40 |       | Some path -> open_in path
 41 |       | None -> stdin
 42 |   in
 43 | 
 44 |   let hist = fold_lines (
 45 |     fun hist line ->
 46 |       let value = float_of_string line in
 47 |       add value hist
 48 |   ) ch (create max_bins) in
 49 | 
 50 |   close_in ch;
 51 | 
 52 |   hist
 53 | 
 54 | let print_uniform hist num_intervals =
 55 |   let u = uniform hist num_intervals in
 56 |   pr "uniform with %d intervals:\n%!" num_intervals;
 57 |   List.iter (
 58 |     fun  (rank, quantile) ->
 59 |       pr "%d/%d[=%.3f] %+.5e\n" rank num_intervals
 60 |         ((float rank)/.(float num_intervals)) quantile
 61 |   ) u
 62 | 
 63 | 
 64 | let print_mean_stdev hist =
 65 |   let m, sd = mean_stdev hist in
 66 |   pr "mean=%+.5e\nstdev=%+.5e\n" m sd
 67 | 
 68 | 
 69 | let main max_bins path_opt p_histogram p_mean_stdev p_uniform =
 70 |   let hist = histogram_of_file path_opt max_bins in
 71 |   if p_mean_stdev then (
 72 |     print_mean_stdev hist;
 73 |     pr "\n"
 74 |   );
 75 | 
 76 |   if p_histogram then (
 77 |     print_histogram hist;
 78 |     pr "\n"
 79 |   );
 80 | 
 81 |   match p_uniform with
 82 |     | Some intervals ->
 83 |       print_uniform hist intervals;
 84 |       pr "\n"
 85 |     | None -> ()
 86 | 
 87 | open Cmdliner
 88 | 
 89 | let _ =
 90 |   let command =
 91 |     let doc = "read a column of numbers from an input file (or stdin), and \
 92 |                update an approximate histogram of these numbers.  Every so \
 93 |                often, print out the centers of the histogram bins, along with \
 94 |                their probability mass.  Based on the algorithm described in \
 95 |                \"A Streaming Parallel Decision Tree Algorithm\" by Yael \
 96 |                Ben-Haim and Elad Tom-Tov. \
 97 |                http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf" in
 98 | 
 99 |     let max_bins =
100 |       let doc = "approximate the histogram with this maximum number of bins" in
101 |       Arg.(required & opt (some int) (Some 20) &
102 |            info ["n"; "max-bins"] ~docv:"INT" ~doc)
103 |     in
104 | 
105 |     let path =
106 |       let doc = "path of input file (absent: stdin)" in
107 |       Arg.(value & opt (some string) None &
108 |            info ["i"; "input"] ~docv:"PATH" ~doc)
109 |     in
110 | 
111 |     let print_mean_stdev =
112 |       let doc = "use the histogram to compute and print the mean and \
113 |                  standard deviation" in
114 |       Arg.(value & flag & info ["s"; "stats"] ~doc)
115 |     in
116 | 
117 |     let print_histogram =
118 |       let doc = "print details of the histogram, including its bins" in
119 |       Arg.(value & flag & info ["b"; "bins"] ~doc)
120 |     in
121 | 
122 |     let print_uniform =
123 |       let doc = "compute and print the value deliniating the interval which
124 |                  are equally spaced in terms of probability mass." in
125 |       Arg.(value & opt (some int) None & info ["u"; "uniform"] ~docv:"INT" ~doc)
126 |     in
127 | 
128 |     Cmd.v (Cmd.info "bt" ~doc)
129 |     Term.(const main
130 |           $ max_bins
131 |           $ path
132 |           $ print_histogram
133 |           $ print_mean_stdev
134 |           $ print_uniform
135 |          )
136 |   in
137 |   Cmd.eval ~catch:false command
138 | 


--------------------------------------------------------------------------------
/src/dune:
--------------------------------------------------------------------------------
1 | (executable
2 |  (name bt)
3 |  (public_name bt)
4 |  (libraries cmdliner bentov)
5 | )
6 | 


--------------------------------------------------------------------------------
/test/dune:
--------------------------------------------------------------------------------
1 | (test (name test) (libraries bentov))
2 | 


--------------------------------------------------------------------------------
/test/gaussian.ml:
--------------------------------------------------------------------------------
 1 | open Seq
 2 | 
 3 | (** Knuth algorithm for producing a sample from N(0,1) *)
 4 | let random_normal : float Seq.t =
 5 |   let rec gen phase () =
 6 |     let u1 = Random.float 1.0 in
 7 |     let u2 = Random.float 1.0 in
 8 |     let v1 = 2. *. u1 -. 1.0 in
 9 |     let v2 = 2. *. u2 -. 1.0 in
10 |     let s = v1 *. v1 +. v2 *. v2 in
11 |     if s >= 1.0 || s = 0.0 then
12 |       gen phase ()
13 |     else
14 |       let v = if phase then v1 else v2 in
15 |       let x = v *. sqrt ((-2. *. log s) /. s) in
16 |       Cons (x, gen (not phase))
17 |   in
18 |   gen true
19 | 
20 | (* [random_normal ~mu ~sigma] produces a sequence whose elements
21 |    are N([mean],[stddev]) *)
22 | let seq ~mu ~sigma =
23 |   if mu = 0. && sigma = 1.0 then
24 |     random_normal
25 |   else
26 |     Seq.map (fun x -> mu +. x *. sigma) random_normal
27 | 
28 | (*
29 | let mean_std x =
30 |   let n = List.length x in
31 |   let sum = List.fold_left ( +. ) 0. x in
32 |   let mean = sum /. (float n) in
33 |   let rss = List.fold_left (fun rss x -> rss +. (x -. mean)**2.) 0. x in
34 |   let std = sqrt (rss /. (float (n-1))) in
35 |   mean, std
36 | 
37 | 
38 | let _ =
39 |   let x = take_rev 100_000 (seq ~mu:0. ~stddev:1.) in
40 |   let qs = quantiles x 10 in
41 |   List.iter (fun (i, q) -> Printf.printf "%d %f\n" i q) qs
42 | *)
43 | 


--------------------------------------------------------------------------------
/test/test.ml:
--------------------------------------------------------------------------------
  1 | (* In this test, we measure the error between the true quantiles of
  2 |    data, and the quantiles as computed through the [Bentov] histogram
  3 |    approximation module. The data is drawn from a mixture of two
  4 |    Gaussians, N(2,1) and N(5,1), where the mixing coefficient is 1/2.
  5 |    We compute the approximate quantiles in two ways: In the frist, we
  6 |    simply add each sample into a [Bentov.histogram]. We call this
  7 |    histogram [mixed]. In the second, we add a datum to one of two
  8 |    [Bentov.histogram]'s, one associated with each of the Guassians. We
  9 |    then merge these two histograms using [Bentov.merge]. We call the
 10 |    result of merging the two sub-histograms [merged]. Finally, we
 11 |    compute and print the mean-square-error between the true quantiles
 12 |    and [mixed], and the true quantiles and [merged]. *)
 13 | 
 14 | (* [quantiles list num_intervals] returns the quantiles (true, not
 15 |    approximated) of list [list] at [num_intervals + 1] points, including
 16 |    the minimum and maximum values (which are the first and last values of
 17 |    the result, resp. *)
 18 | let quantiles =
 19 |   let rec loop i j span accu = function
 20 |     | x0 :: x1 :: rest ->
 21 |       let s = (float j) *. span in
 22 |       if i <= s && s < i +. 1. then
 23 |         let d = s -. i in
 24 |         let x_d = (x1 -. x0) *. d +. x0 in
 25 |         loop i (j + 1) span ((j, x_d) :: accu) (x0 :: x1 :: rest)
 26 |       else
 27 |         loop (i +. 1.) j span accu (x1 :: rest)
 28 | 
 29 |     | [x] ->
 30 |       let x_d = j, x in
 31 |       List.rev (x_d :: accu)
 32 | 
 33 |     | [] -> assert false
 34 |   in
 35 |   fun x m ->
 36 |     let x_sorted = List.sort Stdlib.compare x in
 37 |     let n = List.length x_sorted in
 38 |     let span = (float n) /. (float m) in
 39 |     loop 0. 0 span [] x_sorted
 40 | 
 41 | 
 42 | open Seq
 43 | 
 44 | (** [up_to n seq] returns a sequence that echos at most [n] values of
 45 |     sequence [seq] *)
 46 | let up_to =
 47 |   let rec loop i n seq () =
 48 |     if i = n then
 49 |       Nil
 50 |     else
 51 |       match seq () with
 52 |       | Nil -> Nil
 53 |       | Cons (v, seq) ->
 54 |         Cons (v, loop (i + 1) n seq)
 55 |   in
 56 |   fun n seq ->
 57 |     if n < 0 then
 58 |       raise (Invalid_argument "up_to")
 59 |     else
 60 |       loop 0 n seq
 61 | 
 62 | module IntMap = Map.Make(Int)
 63 | 
 64 | let map_of_assoc assoc =
 65 |   List.fold_left (
 66 |     fun k_to_v (k, v) ->
 67 |       IntMap.add k v k_to_v
 68 |   ) IntMap.empty assoc
 69 | 
 70 | let _ =
 71 |   (* the number of data to draw *)
 72 |   let n = 100_000 in
 73 | 
 74 |   (* the size of the approximate histograms *)
 75 |   let q = 20 in
 76 | 
 77 |   let rec gaussian_mixture normal_a normal_b () =
 78 |     if Random.bool () then
 79 |       match normal_a () with
 80 |       | Cons (x, normal_a) ->
 81 |         Cons ((`A, x), gaussian_mixture normal_a normal_b)
 82 |       | Nil -> Nil
 83 |     else
 84 |       match normal_b () with
 85 |       | Cons (x, normal_b) ->
 86 |         Cons ((`B, x), gaussian_mixture normal_a normal_b)
 87 |       | Nil -> Nil
 88 |   in
 89 | 
 90 |   let n1 = Gaussian.seq ~mu:2.0 ~sigma:1.0 in (* N(2,1) *)
 91 |   let n2 = Gaussian.seq ~mu:5.0 ~sigma:1.0 in (* N(5,1) *)
 92 | 
 93 |   let gmm = up_to n (gaussian_mixture n1 n2) in
 94 | 
 95 |   let open Bentov in
 96 | 
 97 |   let normal_a_h, normal_b_h, mixed_h, data = Seq.fold_left (
 98 |     fun (normal_a_h, normal_b_h, mixed_h, data) draw ->
 99 |       match draw with
100 |       | `A, x ->
101 |         let normal_a_h = Bentov.add x normal_a_h in
102 |         let mixed_h = Bentov.add x mixed_h in
103 |         normal_a_h, normal_b_h, mixed_h, x :: data
104 | 
105 |       | `B, x ->
106 |         let normal_b_h = Bentov.add x normal_b_h in
107 |         let mixed_h = Bentov.add x mixed_h in
108 |         normal_a_h, normal_b_h, mixed_h, x :: data
109 |   ) (
110 |     let normal_a_h = create q in
111 |     let normal_b_h = create q in
112 |     let mixed_h = create q in
113 |     normal_a_h, normal_b_h, mixed_h, []
114 |   ) gmm in
115 | 
116 |   (* merge the two sub-histograms *)
117 |   let merged_h = merge [normal_a_h; normal_b_h] q in
118 | 
119 |   assert (total_count mixed_h = n );
120 |   assert (total_count merged_h = n );
121 | 
122 |   (* measure the error between the true quantiles and approximations
123 |      on a grid half the size of our approximate histograms *)
124 |   let num_intervals = q/2 in
125 | 
126 |   let error i actual mixed merged =
127 |     match IntMap.find_opt i actual with
128 |     | None -> None, None
129 |     | Some actual ->
130 |       (match IntMap.find_opt i mixed with
131 |        | Some mixed -> Some (actual -. mixed)
132 |        | None -> None
133 |       ),
134 |       (match IntMap.find_opt i merged with
135 |        | Some merged -> Some (actual -. merged)
136 |        | None -> None
137 |       )
138 |   in
139 | 
140 |   (* compute sum of squared-errors *)
141 |   let rec stats i actual mixed merged mixed_stats merged_stats =
142 |     if i < num_intervals then
143 |       let mixed_err, merged_err = error i actual mixed merged in
144 |       let mixed_stats =
145 |         match mixed_err with
146 |         | Some err ->
147 |           let sum_se_mixed, n_mixed = mixed_stats in
148 |           sum_se_mixed +. err *. err, n_mixed + 1
149 |         | None -> mixed_stats
150 |       in
151 |       let merged_stats =
152 |         match merged_err with
153 |         | Some err ->
154 |           let sum_se_merged, n_merged = merged_stats in
155 |           sum_se_merged +. err *. err, n_merged + 1
156 |         | None -> merged_stats
157 |       in
158 |       stats (i+1) actual mixed merged mixed_stats merged_stats
159 |     else
160 |       mixed_stats, merged_stats
161 |   in
162 | 
163 |   let mixed_q  = map_of_assoc (uniform mixed_h num_intervals) in
164 |   let merged_q = map_of_assoc (uniform merged_h num_intervals) in
165 |   let actual_q = map_of_assoc (quantiles data num_intervals) in
166 | 
167 |   let (sum_se_mixed, n_mixed), (sum_se_merged, n_merged) =
168 |     stats 0 actual_q mixed_q merged_q (0., 0) (0., 0) in
169 | 
170 |   let err_mixed = sqrt ((sum_se_mixed) /. (float n_mixed)) in
171 |   let err_merged = sqrt ((sum_se_merged) /. (float n_merged)) in
172 |   Printf.printf "err_mixed=%e (n=%d)\nerr_merged=%e (n=%d)\n"
173 |     err_mixed n_mixed err_merged n_merged;
174 |   assert (err_mixed <= 5e-2 && err_merged <= 5e-2)
175 | 
176 | 


--------------------------------------------------------------------------------